rockbox/apps/codecs/demac/libdemac/vector_math16_mmx.h

/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

MMX vector math copyright (C) 2010 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

/* NOTE(review): presumably signals to the including filter code that this
   header provides the fused vector_sp_add()/vector_sp_sub() routines -
   confirm against the file that includes this header. */
#define FUSED_VECTOR_MATH

/* Two-level stringification: __S(x) lets the preprocessor expand x (e.g. an
   expression containing ORDER) before __E() turns the result into a string
   literal.  Used below to paste ORDER-derived repeat counts into the inline
   assembler text. */
#define __E(__e) #__e
#define __S(__e) __E(__e)

/*
 * Fused scalar product and vector add (MMX).
 *
 * Returns the sum of v1[i] * f2[i], computed from the values v1 holds on
 * entry, and replaces each v1[i] by v1[i] + s2[i] (paddw, i.e. 16-bit
 * wrap-around addition).  The products are accumulated with pmaddwd/paddd in
 * two 32-bit lanes which are added together at the end.
 *
 * ORDER <= 256: ORDER/4 quadwords (4 int16_t each) are processed, fully
 *               unrolled by the assembler via .rept.
 * ORDER >  256: ORDER>>8 loop passes of 64 quadwords (256 elements) each.
 * Presumably ORDER is always a multiple of 4 (resp. 256) - confirm with the
 * includer.
 *
 * No emms is executed here; code that uses x87 floating point afterwards
 * has to issue it itself.
 */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t *s2)
{
    int res, t;
#if ORDER > 256
    int cnt = ORDER>>8;
#endif

    asm volatile (
#if ORDER > 256
        "pxor    %%mm2, %%mm2        \n"
        ".set    ofs, 0              \n"
    "1:                              \n"
        ".rept   64                  \n"
#else
        /* First quadword initialises the accumulator mm2 directly. */
        "movq    (%[v1]), %%mm2      \n"
        "movq    %%mm2, %%mm0        \n"
        "pmaddwd (%[f2]), %%mm2      \n"
        "paddw   (%[s2]), %%mm0      \n"
        "movq    %%mm0, (%[v1])      \n"
        ".set    ofs, 8              \n"

        /* Parenthesised so the repeat count is (ORDER/4)-1 regardless of
           the assembler's relative precedence of >> and -. */
        ".rept  " __S((ORDER>>2) - 1) "\n"
#endif
        "movq    ofs(%[v1]), %%mm1   \n"
        "movq    %%mm1, %%mm0        \n"
        "pmaddwd ofs(%[f2]), %%mm1   \n"
        "paddw   ofs(%[s2]), %%mm0   \n"
        "movq    %%mm0, ofs(%[v1])   \n"
        "paddd   %%mm1, %%mm2        \n"
        ".set    ofs, ofs + 8        \n"
        ".endr                       \n"
#if ORDER > 256
        "add     $512, %[v1]         \n"
        "add     $512, %[s2]         \n"
        "add     $512, %[f2]         \n"
        "dec     %[cnt]              \n"
        "jne     1b                  \n"
#endif

        /* Horizontal add of the two 32-bit lanes of mm2. */
        "movd    %%mm2, %[t]         \n"
        "psrlq   $32, %%mm2          \n"
        "movd    %%mm2, %[res]       \n"
        "add     %[t], %[res]        \n"
        : /* outputs */
#if ORDER > 256
        [cnt]"+r"(cnt),
        [s2] "+r"(s2),
        [res]"=r"(res),
        [t]  "=r"(t)
        : /* inputs */
        /* v1 and f2 share registers with res and t (matching constraints);
           res and t are only written after the last memory access. */
        [v1]"2"(v1),
        [f2]"3"(f2)
#else
        [res]"=r"(res),
        [t]  "=r"(t)
        : /* inputs */
        [v1]"r"(v1),
        [f2]"r"(f2),
        [s2]"r"(s2)
#endif
        : /* clobbers */
        /* "memory": the asm reads f2[]/s2[] and rewrites v1[] through bare
           pointers that the operand list does not describe. */
        "mm0", "mm1", "mm2", "memory"
    );
    return res;
}

/*
 * Fused scalar product and vector subtract (MMX).
 *
 * Returns the sum of v1[i] * f2[i], computed from the values v1 holds on
 * entry, and replaces each v1[i] by v1[i] - s2[i] (psubw, i.e. 16-bit
 * wrap-around subtraction).  The products are accumulated with
 * pmaddwd/paddd in two 32-bit lanes which are added together at the end.
 *
 * ORDER <= 256: ORDER/4 quadwords (4 int16_t each) are processed, fully
 *               unrolled by the assembler via .rept.
 * ORDER >  256: ORDER>>8 loop passes of 64 quadwords (256 elements) each.
 * Presumably ORDER is always a multiple of 4 (resp. 256) - confirm with the
 * includer.
 *
 * No emms is executed here; code that uses x87 floating point afterwards
 * has to issue it itself.
 */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t *s2)
{
    int res, t;
#if ORDER > 256
    int cnt = ORDER>>8;
#endif

    asm volatile (
#if ORDER > 256
        "pxor    %%mm2, %%mm2        \n"
        ".set    ofs, 0              \n"
    "1:                              \n"
        ".rept   64                  \n"
#else
        /* First quadword initialises the accumulator mm2 directly. */
        "movq    (%[v1]), %%mm2      \n"
        "movq    %%mm2, %%mm0        \n"
        "pmaddwd (%[f2]), %%mm2      \n"
        "psubw   (%[s2]), %%mm0      \n"
        "movq    %%mm0, (%[v1])      \n"
        ".set    ofs, 8              \n"

        /* Parenthesised so the repeat count is (ORDER/4)-1 regardless of
           the assembler's relative precedence of >> and -. */
        ".rept  " __S((ORDER>>2) - 1) "\n"
#endif
        "movq    ofs(%[v1]), %%mm1   \n"
        "movq    %%mm1, %%mm0        \n"
        "pmaddwd ofs(%[f2]), %%mm1   \n"
        "psubw   ofs(%[s2]), %%mm0   \n"
        "movq    %%mm0, ofs(%[v1])   \n"
        "paddd   %%mm1, %%mm2        \n"
        ".set    ofs, ofs + 8        \n"
        ".endr                       \n"
#if ORDER > 256
        "add     $512, %[v1]         \n"
        "add     $512, %[s2]         \n"
        "add     $512, %[f2]         \n"
        "dec     %[cnt]              \n"
        "jne     1b                  \n"
#endif

        /* Horizontal add of the two 32-bit lanes of mm2. */
        "movd    %%mm2, %[t]         \n"
        "psrlq   $32, %%mm2          \n"
        "movd    %%mm2, %[res]       \n"
        "add     %[t], %[res]        \n"
        : /* outputs */
#if ORDER > 256
        [cnt]"+r"(cnt),
        [s2] "+r"(s2),
        [res]"=r"(res),
        [t]  "=r"(t)
        : /* inputs */
        /* v1 and f2 share registers with res and t (matching constraints);
           res and t are only written after the last memory access. */
        [v1]"2"(v1),
        [f2]"3"(f2)
#else
        [res]"=r"(res),
        [t]  "=r"(t)
        : /* inputs */
        [v1]"r"(v1),
        [f2]"r"(f2),
        [s2]"r"(s2)
#endif
        : /* clobbers */
        /* "memory": the asm reads f2[]/s2[] and rewrites v1[] through bare
           pointers that the operand list does not describe. */
        "mm0", "mm1", "mm2", "memory"
    );
    return res;
}

/*
 * Scalar product of two int16_t vectors (MMX).
 *
 * Returns the sum of v1[i] * v2[i].  The products are accumulated with
 * pmaddwd/paddd in two 32-bit lanes which are added together at the end.
 * Neither vector is written.
 *
 * ORDER <= 256: ORDER/4 quadwords (4 int16_t each) are processed, fully
 *               unrolled by the assembler via .rept.
 * ORDER >  256: ORDER>>8 loop passes of 64 quadwords (256 elements) each.
 * Presumably ORDER is always a multiple of 4 (resp. 256) - confirm with the
 * includer.
 *
 * No emms is executed here; code that uses x87 floating point afterwards
 * has to issue it itself.
 */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res, t;
#if ORDER > 256
    int cnt = ORDER>>8;
#endif

    asm volatile (
#if ORDER > 256
        "pxor    %%mm1, %%mm1        \n"
        ".set    ofs, 0              \n"
    "1:                              \n"
        ".rept   64                  \n"
#else
        /* First quadword initialises the accumulator mm1 directly. */
        "movq    (%[v1]), %%mm1      \n"
        "pmaddwd (%[v2]), %%mm1      \n"
        ".set    ofs, 8              \n"

        /* Parenthesised so the repeat count is (ORDER/4)-1 regardless of
           the assembler's relative precedence of >> and -. */
        ".rept  " __S((ORDER>>2) - 1) "\n"
#endif
        "movq    ofs(%[v1]), %%mm0   \n"
        "pmaddwd ofs(%[v2]), %%mm0   \n"
        "paddd   %%mm0, %%mm1        \n"
        ".set    ofs, ofs + 8        \n"
        ".endr                       \n"
#if ORDER > 256
        "add     $512, %[v1]         \n"
        "add     $512, %[v2]         \n"
        "dec     %[cnt]              \n"
        "jne     1b                  \n"
#endif

        /* Horizontal add of the two 32-bit lanes of mm1. */
        "movd    %%mm1, %[t]         \n"
        "psrlq   $32, %%mm1          \n"
        "movd    %%mm1, %[res]       \n"
        "add     %[t], %[res]        \n"
        : /* outputs */
#if ORDER > 256
        [cnt]"+r"(cnt),
        [res]"=r"(res),
        [t]  "=r"(t)
        : /* inputs */
        /* v1 and v2 share registers with res and t (matching constraints);
           res and t are only written after the last memory access. */
        [v1]"1"(v1),
        [v2]"2"(v2)
#else
        [res]"=r"(res),
        [t]  "=r"(t)
        : /* inputs */
        [v1]"r"(v1),
        [v2]"r"(v2)
#endif
        : /* clobbers */
        /* "memory": the asm reads v1[]/v2[] through bare pointers that the
           operand list does not describe. */
        "mm0", "mm1", "memory"
    );
    return res;
}
libdemac: Add x86/x86_64 MMX asm for the filters. Not relevant for target but speeds up decoding on x86/x86_64 sims. Average speedup ranges from 25% for -c2000 to 3 times for -c5000; on Intel Atom it's even 45% for -c2000 to 6 times for -c5000. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24663 a1c6a512-1295-4272-9138-f99709370657 2010-02-15 01:27:04 +00:00			`/*`

			`libdemac - A Monkey's Audio decoder`

			$Id$

			`Copyright (C) Dave Chapman 2007`

			`MMX vector math copyright (C) 2010 Jens Arnold`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program; if not, write to the Free Software`
			`Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA`

			`*/`

			`#define FUSED_VECTOR_MATH`

			`#define __E(__e) #__e`
			`#define __S(__e) __E(__e)`

			`static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t *s2)`
			`{`
			`int res, t;`
			`#if ORDER > 256`
			`int cnt = ORDER>>8;`
			`#endif`

			`asm volatile (`
			`#if ORDER > 256`
			`"pxor %%mm2, %%mm2 \n"`
			`".set ofs, 0 \n"`
			`"1: \n"`
			`".rept 64 \n"`
			`#else`
			`"movq (%[v1]), %%mm2 \n"`
			`"movq %%mm2, %%mm0 \n"`
			`"pmaddwd (%[f2]), %%mm2 \n"`
			`"paddw (%[s2]), %%mm0 \n"`
			`"movq %%mm0, (%[v1]) \n"`
			`".set ofs, 8 \n"`

			`".rept " __S(ORDER>>2 - 1) "\n"`
			`#endif`
			`"movq ofs(%[v1]), %%mm1 \n"`
			`"movq %%mm1, %%mm0 \n"`
			`"pmaddwd ofs(%[f2]), %%mm1 \n"`
			`"paddw ofs(%[s2]), %%mm0 \n"`
			`"movq %%mm0, ofs(%[v1]) \n"`
			`"paddd %%mm1, %%mm2 \n"`
			`".set ofs, ofs + 8 \n"`
			`".endr \n"`
			`#if ORDER > 256`
			`"add $512, %[v1] \n"`
			`"add $512, %[s2] \n"`
			`"add $512, %[f2] \n"`
			`"dec %[cnt] \n"`
			`"jne 1b \n"`
			`#endif`

			`"movd %%mm2, %[t] \n"`
			`"psrlq $32, %%mm2 \n"`
			`"movd %%mm2, %[res] \n"`
			`"add %[t], %[res] \n"`
			`: /* outputs */`
			`#if ORDER > 256`
			`[cnt]"+r"(cnt),`
			`[s2] "+r"(s2),`
			`[res]"=r"(res),`
			`[t] "=r"(t)`
			`: /* inputs */`
			`[v1]"2"(v1),`
			`[f2]"3"(f2)`
			`#else`
			`[res]"=r"(res),`
			`[t] "=r"(t)`
			`: /* inputs */`
			`[v1]"r"(v1),`
			`[f2]"r"(f2),`
			`[s2]"r"(s2)`
			`#endif`
			`: /* clobbers */`
			`"mm0", "mm1", "mm2"`
			`);`
			`return res;`
			`}`

			`static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t *s2)`
			`{`
			`int res, t;`
			`#if ORDER > 256`
			`int cnt = ORDER>>8;`
			`#endif`

			`asm volatile (`
			`#if ORDER > 256`
			`"pxor %%mm2, %%mm2 \n"`
			`".set ofs, 0 \n"`
			`"1: \n"`
			`".rept 64 \n"`
			`#else`
			`"movq (%[v1]), %%mm2 \n"`
			`"movq %%mm2, %%mm0 \n"`
			`"pmaddwd (%[f2]), %%mm2 \n"`
			`"psubw (%[s2]), %%mm0 \n"`
			`"movq %%mm0, (%[v1]) \n"`
			`".set ofs, 8 \n"`

			`".rept " __S(ORDER>>2 - 1) "\n"`
			`#endif`
			`"movq ofs(%[v1]), %%mm1 \n"`
			`"movq %%mm1, %%mm0 \n"`
			`"pmaddwd ofs(%[f2]), %%mm1 \n"`
			`"psubw ofs(%[s2]), %%mm0 \n"`
			`"movq %%mm0, ofs(%[v1]) \n"`
			`"paddd %%mm1, %%mm2 \n"`
			`".set ofs, ofs + 8 \n"`
			`".endr \n"`
			`#if ORDER > 256`
			`"add $512, %[v1] \n"`
			`"add $512, %[s2] \n"`
			`"add $512, %[f2] \n"`
			`"dec %[cnt] \n"`
			`"jne 1b \n"`
			`#endif`

			`"movd %%mm2, %[t] \n"`
			`"psrlq $32, %%mm2 \n"`
			`"movd %%mm2, %[res] \n"`
			`"add %[t], %[res] \n"`
			`: /* outputs */`
			`#if ORDER > 256`
			`[cnt]"+r"(cnt),`
			`[s2] "+r"(s2),`
			`[res]"=r"(res),`
			`[t] "=r"(t)`
			`: /* inputs */`
			`[v1]"2"(v1),`
			`[f2]"3"(f2)`
			`#else`
			`[res]"=r"(res),`
			`[t] "=r"(t)`
			`: /* inputs */`
			`[v1]"r"(v1),`
			`[f2]"r"(f2),`
			`[s2]"r"(s2)`
			`#endif`
			`: /* clobbers */`
			`"mm0", "mm1", "mm2"`
			`);`
			`return res;`
			`}`

			`static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)`
			`{`
			`int res, t;`
			`#if ORDER > 256`
			`int cnt = ORDER>>8;`
			`#endif`

			`asm volatile (`
			`#if ORDER > 256`
			`"pxor %%mm1, %%mm1 \n"`
			`".set ofs, 0 \n"`
			`"1: \n"`
			`".rept 64 \n"`
			`#else`
			`"movq (%[v1]), %%mm1 \n"`
			`"pmaddwd (%[v2]), %%mm1 \n"`
			`".set ofs, 8 \n"`

			`".rept " __S(ORDER>>2 - 1) "\n"`
			`#endif`
			`"movq ofs(%[v1]), %%mm0 \n"`
			`"pmaddwd ofs(%[v2]), %%mm0 \n"`
			`"paddd %%mm0, %%mm1 \n"`
			`".set ofs, ofs + 8 \n"`
			`".endr \n"`
			`#if ORDER > 256`
			`"add $512, %[v1] \n"`
			`"add $512, %[v2] \n"`
			`"dec %[cnt] \n"`
			`"jne 1b \n"`
			`#endif`

			`"movd %%mm1, %[t] \n"`
			`"psrlq $32, %%mm1 \n"`
			`"movd %%mm1, %[res] \n"`
			`"add %[t], %[res] \n"`
			`: /* outputs */`
			`#if ORDER > 256`
			`[cnt]"+r"(cnt),`
			`[res]"=r"(res),`
			`[t] "=r"(t)`
			`: /* inputs */`
			`[v1]"1"(v1),`
			`[v2]"2"(v2)`
			`#else`
			`[res]"=r"(res),`
			`[t] "=r"(t)`
			`: /* inputs */`
			`[v1]"r"(v1),`
			`[v2]"r"(v2)`
			`#endif`
			`: /* clobbers */`
			`"mm0", "mm1"`
			`);`
			`return res;`
			`}`