/* libdemac - A Monkey's Audio decoder $Id$ Copyright (C) Dave Chapman 2007 ARMv7 neon vector math copyright (C) 2010 Jens Arnold This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA */ #define FUSED_VECTOR_MATH #if ORDER > 32 #define REPEAT_BLOCK(x) x x x #elif ORDER > 16 #define REPEAT_BLOCK(x) x #else #define REPEAT_BLOCK(x) #endif /* Calculate scalarproduct, then add a 2nd vector (fused for performance) */ static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2) { int res; #if ORDER > 64 int cnt = ORDER>>6; #endif asm volatile ( #if ORDER > 64 "vmov.i16 q0, #0 \n" "1: \n" "subs %[cnt], %[cnt], #1 \n" #endif "vld1.16 {d6-d9}, [%[f2]]! \n" "vld1.16 {d2-d5}, [%[v1]] \n" "vld1.16 {d10-d13}, [%[s2]]! \n" #if ORDER > 64 "vmlal.s16 q0, d2, d6 \n" #else "vmull.s16 q0, d2, d6 \n" #endif "vmlal.s16 q0, d3, d7 \n" "vmlal.s16 q0, d4, d8 \n" "vmlal.s16 q0, d5, d9 \n" "vadd.i16 q1, q1, q5 \n" "vadd.i16 q2, q2, q6 \n" "vst1.16 {d2-d5}, [%[v1]]! \n" REPEAT_BLOCK( "vld1.16 {d6-d9}, [%[f2]]! \n" "vld1.16 {d2-d5}, [%[v1]] \n" "vld1.16 {d10-d13}, [%[s2]]! \n" "vmlal.s16 q0, d2, d6 \n" "vmlal.s16 q0, d3, d7 \n" "vmlal.s16 q0, d4, d8 \n" "vmlal.s16 q0, d5, d9 \n" "vadd.i16 q1, q1, q5 \n" "vadd.i16 q2, q2, q6 \n" "vst1.16 {d2-d5}, [%[v1]]! \n" ) #if ORDER > 64 "bne 1b \n" #endif "vpadd.i32 d0, d0, d1 \n" "vpaddl.s32 d0, d0 \n" "vmov.32 %[res], d0[0] \n" : /* outputs */ #if ORDER > 64 [cnt]"+r"(cnt), #endif [v1] "+r"(v1), [f2] "+r"(f2), [s2] "+r"(s2), [res]"=r"(res) : /* inputs */ : /* clobbers */ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory" ); return res; } /* Calculate scalarproduct, then subtract a 2nd vector (fused for performance) */ static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2) { int res; #if ORDER > 64 int cnt = ORDER>>6; #endif asm volatile ( #if ORDER > 64 "vmov.i16 q0, #0 \n" "1: \n" "subs %[cnt], %[cnt], #1 \n" #endif "vld1.16 {d6-d9}, [%[f2]]! \n" "vld1.16 {d2-d5}, [%[v1]] \n" "vld1.16 {d10-d13}, [%[s2]]! \n" #if ORDER > 64 "vmlal.s16 q0, d2, d6 \n" #else "vmull.s16 q0, d2, d6 \n" #endif "vmlal.s16 q0, d3, d7 \n" "vmlal.s16 q0, d4, d8 \n" "vmlal.s16 q0, d5, d9 \n" "vsub.i16 q1, q1, q5 \n" "vsub.i16 q2, q2, q6 \n" "vst1.16 {d2-d5}, [%[v1]]! \n" REPEAT_BLOCK( "vld1.16 {d6-d9}, [%[f2]]! \n" "vld1.16 {d2-d5}, [%[v1]] \n" "vld1.16 {d10-d13}, [%[s2]]! \n" "vmlal.s16 q0, d2, d6 \n" "vmlal.s16 q0, d3, d7 \n" "vmlal.s16 q0, d4, d8 \n" "vmlal.s16 q0, d5, d9 \n" "vsub.i16 q1, q1, q5 \n" "vsub.i16 q2, q2, q6 \n" "vst1.16 {d2-d5}, [%[v1]]! \n" ) #if ORDER > 64 "bne 1b \n" #endif "vpadd.i32 d0, d0, d1 \n" "vpaddl.s32 d0, d0 \n" "vmov.32 %[res], d0[0] \n" : /* outputs */ #if ORDER > 64 [cnt]"+r"(cnt), #endif [v1] "+r"(v1), [f2] "+r"(f2), [s2] "+r"(s2), [res]"=r"(res) : /* inputs */ : /* clobbers */ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory" ); return res; } static inline int32_t scalarproduct(int16_t* v1, int16_t* v2) { int res; #if ORDER > 64 int cnt = ORDER>>6; #endif asm volatile ( #if ORDER > 64 "vmov.i16 q0, #0 \n" "1: \n" "subs %[cnt], %[cnt], #1 \n" #endif "vld1.16 {d2-d5}, [%[v1]]! \n" "vld1.16 {d6-d9}, [%[v2]]! \n" #if ORDER > 64 "vmlal.s16 q0, d2, d6 \n" #else "vmull.s16 q0, d2, d6 \n" #endif "vmlal.s16 q0, d3, d7 \n" "vmlal.s16 q0, d4, d8 \n" "vmlal.s16 q0, d5, d9 \n" REPEAT_BLOCK( "vld1.16 {d2-d5}, [%[v1]]! \n" "vld1.16 {d6-d9}, [%[v2]]! \n" "vmlal.s16 q0, d2, d6 \n" "vmlal.s16 q0, d3, d7 \n" "vmlal.s16 q0, d4, d8 \n" "vmlal.s16 q0, d5, d9 \n" ) #if ORDER > 64 "bne 1b \n" #endif "vpadd.i32 d0, d0, d1 \n" "vpaddl.s32 d0, d0 \n" "vmov.32 %[res], d0[0] \n" : /* outputs */ #if ORDER > 64 [cnt]"+r"(cnt), #endif [v1] "+r"(v1), [v2] "+r"(v2), [res]"=r"(res) : /* inputs */ : /* clobbers */ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9" ); return res; }