/* rockbox/apps/codecs/demac/libdemac/vector_math16_armv7.h */
/*
libdemac - A Monkey's Audio decoder
$Id$
Copyright (C) Dave Chapman 2007
ARMv7 neon vector math copyright (C) 2010 Jens Arnold
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
*/
/* Tell the including code that this implementation provides the fused
   scalarproduct + vector add/sub variants (vector_sp_add/vector_sp_sub). */
#define FUSED_VECTOR_MATH
/* One repetition of the asm body below handles 16 int16_t elements.
   REPEAT_BLOCK pads each pass out to ORDER elements (or to 64 elements
   per loop iteration when ORDER > 64):
     ORDER > 32 -> 3 extra repetitions (64 elements per pass)
     ORDER > 16 -> 1 extra repetition  (32 elements per pass)
     otherwise  -> no extra repetition (16 elements per pass) */
#if ORDER > 32
#define REPEAT_BLOCK(x) x x x
#elif ORDER > 16
#define REPEAT_BLOCK(x) x
#else
#define REPEAT_BLOCK(x)
#endif
/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
 *
 * Over the first ORDER int16_t elements this computes
 *     res    = sum(v1[i] * f2[i])   (products widened to 32 bits)
 *     v1[i] += s2[i]                (16-bit wrapping add, stored in place)
 * v1, f2 and s2 are "+r" operands: all three pointers are advanced past
 * the elements consumed.  ORDER must be a multiple of 16; for ORDER > 64
 * the asm loops ORDER/64 times over a 64-element body.
 *
 * Returns the scalar product truncated to its low 32 bits.
 *
 * Fix vs. previous revision: "cc" added to the clobber list — the `subs`
 * instruction (ORDER > 64 path) writes the condition flags, which GCC
 * must be told about or it may assume flags survive across the asm.
 */
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        /* q0 holds 4 partial 32-bit sums; zero it, then loop on cnt */
        "vmov.i16    q0, #0              \n"
        "1:                              \n"
        "subs        %[cnt], %[cnt], #1  \n"
#endif
        "vld1.16     {d6-d9}, [%[f2]]!   \n"
        "vld1.16     {d2-d5}, [%[v1]]    \n"
        "vld1.16     {d10-d13}, [%[s2]]! \n"
#if ORDER > 64
        "vmlal.s16   q0, d2, d6          \n"
#else
        /* first multiply initialises the accumulator, no vmov needed */
        "vmull.s16   q0, d2, d6          \n"
#endif
        "vmlal.s16   q0, d3, d7          \n"
        "vmlal.s16   q0, d4, d8          \n"
        "vmlal.s16   q0, d5, d9          \n"
        /* fused part: v1 += s2, stored back with post-increment of v1 */
        "vadd.i16    q1, q1, q5          \n"
        "vadd.i16    q2, q2, q6          \n"
        "vst1.16     {d2-d5}, [%[v1]]!   \n"
        REPEAT_BLOCK(
        "vld1.16     {d6-d9}, [%[f2]]!   \n"
        "vld1.16     {d2-d5}, [%[v1]]    \n"
        "vld1.16     {d10-d13}, [%[s2]]! \n"
        "vmlal.s16   q0, d2, d6          \n"
        "vmlal.s16   q0, d3, d7          \n"
        "vmlal.s16   q0, d4, d8          \n"
        "vmlal.s16   q0, d5, d9          \n"
        "vadd.i16    q1, q1, q5          \n"
        "vadd.i16    q2, q2, q6          \n"
        "vst1.16     {d2-d5}, [%[v1]]!   \n"
        )
#if ORDER > 64
        "bne         1b                  \n"
#endif
        /* horizontal reduction: 4x32 -> 2x32 -> 1x64; return the low word */
        "vpadd.i32   d0, d0, d1          \n"
        "vpaddl.s32  d0, d0              \n"
        "vmov.32     %[res], d0[0]       \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [f2] "+r"(f2),
        [s2] "+r"(s2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13",
        "cc", "memory"   /* "cc": subs sets the flags (ORDER > 64 path) */
    );
    return res;
}
/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
 *
 * Over the first ORDER int16_t elements this computes
 *     res    = sum(v1[i] * f2[i])   (products widened to 32 bits)
 *     v1[i] -= s2[i]                (16-bit wrapping subtract, in place)
 * v1, f2 and s2 are "+r" operands: all three pointers are advanced past
 * the elements consumed.  ORDER must be a multiple of 16; for ORDER > 64
 * the asm loops ORDER/64 times over a 64-element body.
 *
 * Returns the scalar product truncated to its low 32 bits.
 *
 * Fix vs. previous revision: "cc" added to the clobber list — the `subs`
 * instruction (ORDER > 64 path) writes the condition flags, which GCC
 * must be told about or it may assume flags survive across the asm.
 */
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        /* q0 holds 4 partial 32-bit sums; zero it, then loop on cnt */
        "vmov.i16    q0, #0              \n"
        "1:                              \n"
        "subs        %[cnt], %[cnt], #1  \n"
#endif
        "vld1.16     {d6-d9}, [%[f2]]!   \n"
        "vld1.16     {d2-d5}, [%[v1]]    \n"
        "vld1.16     {d10-d13}, [%[s2]]! \n"
#if ORDER > 64
        "vmlal.s16   q0, d2, d6          \n"
#else
        /* first multiply initialises the accumulator, no vmov needed */
        "vmull.s16   q0, d2, d6          \n"
#endif
        "vmlal.s16   q0, d3, d7          \n"
        "vmlal.s16   q0, d4, d8          \n"
        "vmlal.s16   q0, d5, d9          \n"
        /* fused part: v1 -= s2, stored back with post-increment of v1 */
        "vsub.i16    q1, q1, q5          \n"
        "vsub.i16    q2, q2, q6          \n"
        "vst1.16     {d2-d5}, [%[v1]]!   \n"
        REPEAT_BLOCK(
        "vld1.16     {d6-d9}, [%[f2]]!   \n"
        "vld1.16     {d2-d5}, [%[v1]]    \n"
        "vld1.16     {d10-d13}, [%[s2]]! \n"
        "vmlal.s16   q0, d2, d6          \n"
        "vmlal.s16   q0, d3, d7          \n"
        "vmlal.s16   q0, d4, d8          \n"
        "vmlal.s16   q0, d5, d9          \n"
        "vsub.i16    q1, q1, q5          \n"
        "vsub.i16    q2, q2, q6          \n"
        "vst1.16     {d2-d5}, [%[v1]]!   \n"
        )
#if ORDER > 64
        "bne         1b                  \n"
#endif
        /* horizontal reduction: 4x32 -> 2x32 -> 1x64; return the low word */
        "vpadd.i32   d0, d0, d1          \n"
        "vpaddl.s32  d0, d0              \n"
        "vmov.32     %[res], d0[0]       \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [f2] "+r"(f2),
        [s2] "+r"(s2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13",
        "cc", "memory"   /* "cc": subs sets the flags (ORDER > 64 path) */
    );
    return res;
}
/* Plain scalar product of the first ORDER int16_t elements of v1 and v2.
 *
 * Products are widened to 32 bits and accumulated in q0; both pointers
 * are "+r" operands and end up past the last element consumed.  ORDER
 * must be a multiple of 16; for ORDER > 64 the asm loops ORDER/64 times.
 *
 * Returns the scalar product truncated to its low 32 bits.
 *
 * Fixes vs. previous revision:
 *  - "cc" added to the clobber list — `subs` (ORDER > 64 path) writes
 *    the condition flags.
 *  - "memory" added — the asm reads *v1/*v2 through the pointers, so the
 *    compiler must flush any pending stores to those arrays before the
 *    asm runs (the two fused variants above already declare it).
 */
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 64
    int cnt = ORDER>>6;
#endif

    asm volatile (
#if ORDER > 64
        /* q0 holds 4 partial 32-bit sums; zero it, then loop on cnt */
        "vmov.i16    q0, #0              \n"
        "1:                              \n"
        "subs        %[cnt], %[cnt], #1  \n"
#endif
        "vld1.16     {d2-d5}, [%[v1]]!   \n"
        "vld1.16     {d6-d9}, [%[v2]]!   \n"
#if ORDER > 64
        "vmlal.s16   q0, d2, d6          \n"
#else
        /* first multiply initialises the accumulator, no vmov needed */
        "vmull.s16   q0, d2, d6          \n"
#endif
        "vmlal.s16   q0, d3, d7          \n"
        "vmlal.s16   q0, d4, d8          \n"
        "vmlal.s16   q0, d5, d9          \n"
        REPEAT_BLOCK(
        "vld1.16     {d2-d5}, [%[v1]]!   \n"
        "vld1.16     {d6-d9}, [%[v2]]!   \n"
        "vmlal.s16   q0, d2, d6          \n"
        "vmlal.s16   q0, d3, d7          \n"
        "vmlal.s16   q0, d4, d8          \n"
        "vmlal.s16   q0, d5, d9          \n"
        )
#if ORDER > 64
        "bne         1b                  \n"
#endif
        /* horizontal reduction: 4x32 -> 2x32 -> 1x64; return the low word */
        "vpadd.i32   d0, d0, d1          \n"
        "vpaddl.s32  d0, d0              \n"
        "vmov.32     %[res], d0[0]       \n"
        : /* outputs */
#if ORDER > 64
        [cnt]"+r"(cnt),
#endif
        [v1] "+r"(v1),
        [v2] "+r"(v2),
        [res]"=r"(res)
        : /* inputs */
        : /* clobbers */
        "d0", "d1", "d2", "d3", "d4",
        "d5", "d6", "d7", "d8", "d9",
        "cc", "memory"   /* "cc": subs sets the flags; "memory": asm reads the arrays */
    );
    return res;
}