From dbdc0a8a8cbfa4e6b72e5f6fb643f5b0ef4afc27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nils=20Wallm=C3=A9nius?= Date: Fri, 5 Nov 2010 11:20:50 +0000 Subject: [PATCH] libmusepack: ARMv6 assembler for mpc_decoder_windowing_D, speeds up decoding of 128kbps sample file 2MHz, or 8%, on gigabeat S. The output difference to the c implementation and the other ARM implementation is +/-1 in less than 0.1% of the output samples. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28487 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libmusepack/synth_filter.c | 14 +- apps/codecs/libmusepack/synth_filter_arm.S | 196 ++++++++++++++++++++- 2 files changed, 205 insertions(+), 5 deletions(-) diff --git a/apps/codecs/libmusepack/synth_filter.c b/apps/codecs/libmusepack/synth_filter.c index 9a79328106..94c57eb213 100644 --- a/apps/codecs/libmusepack/synth_filter.c +++ b/apps/codecs/libmusepack/synth_filter.c @@ -46,10 +46,16 @@ #if defined(MPC_FIXED_POINT) #if defined(CPU_ARM) - // do not up-scale D-values to achieve higher speed in smull/mlal - // operations. saves ~14/8 = 1.75 cycles per multiplication - #define D(value) (value) - + #if ARM_ARCH >= 6 + // on ARMv6 we use 32*32=64>>32 multiplies (smmul/smmla) so we need to scale up the D coefficients + // the ARM11 multiplier doesn't have early termination so the magnitude of the multiplicands does not + // matter for speed. + #define D(value) (value << (14)) + #else + // do not up-scale D-values to achieve higher speed in smull/mlal + // operations.
saves ~14/8 = 1.75 cycles per multiplication + #define D(value) (value) + #endif // in this configuration a post-shift by >>16 is needed after synthesis #else // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S index b44e029a43..598f218e45 100644 --- a/apps/codecs/libmusepack/synth_filter_arm.S +++ b/apps/codecs/libmusepack/synth_filter_arm.S @@ -296,7 +296,7 @@ mpc_decoder_windowing_D: add r1, r1, #4 /* V++ */ ldmpc regs=r4-r11 -#else /* arm9 and above */ +#elif ARM_ARCH < 6 /* arm9 and above */ mpc_decoder_windowing_D: /* r0 = Data[] */ /* r1 = V[] */ @@ -500,6 +500,200 @@ mpc_decoder_windowing_D: str r8, [r0], #4 /* store Data */ add r1, r1, #4 /* V++ */ + ldmpc regs=r4-r11 +#else + mpc_decoder_windowing_D: + /* r0 = Data[] */ + /* r1 = V[] */ + /* r2 = D[] */ + /* lr = counter */ + /************************************************************************ + * Further speed up through making use of symmetries within D[]-window. + * The row V[00] can be extracted as it has symmetries within this single + * row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's. + * The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be + * saved at the cost of 15 x 4 + 1 add's. + * The row V[16] can be extracted as it has symmetries within this single + * row. 8 smull/mlal and 8 ldr's can be saved. + * On armv6 use smmulr/smmlar which are faster than smull/smlal and only + * accumulate the top 32 bits of the result so that frees up 2 + * registers so we can ldm larger blocks. Each accumulated sum is shifted left by 2 (lsl #2) before it is stored to Data[].
+ ***********************************************************************/ + stmfd sp!, {r4-r11, lr} + + /****************************************** + * row 0 with internal symmetry + *****************************************/ + add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ + ldmia r2!, { r3-r6 } /* load D[01..04] */ + ldr r7 , [r1, #96*4] /* 1 */ + ldr r10, [r1, #992*4] /* 15 */ + ldr r11, [r1, #128*4] /* 2 */ + rsb r10, r10, r7 /* V[01] - V[15] */ + ldr r12, [r1, #896*4] /* 14 */ + smmulr r9, r10, r3 + ldr r7 , [r1, #224*4] /* 3 */ + add r12, r12, r11 /* V[02] + V[14] */ + ldr r10, [r1, #864*4] /* 13 */ + smmlar r9, r12, r4, r9 + ldr r11, [r1, #256*4] /* 4 */ + rsb r10, r10, r7 /* V[03] - V[13] */ + ldr r12, [r1, #768*4] /* 12 */ + smmlar r9, r10, r5, r9 + ldr r7 , [r1, #352*4] /* 5 */ + add r12, r12, r11 /* V[04] + V[12] */ + ldr r10, [r1, #736*4] /* 11 */ + smmlar r9, r12, r6, r9 + ldmia r2!, { r3-r6 } /* load D[05..08] */ + ldr r11, [r1, #384*4] /* 6 */ + rsb r10, r10, r7 /* V[05] - V[11] */ + ldr r12, [r1, #640*4] /* 10 */ + smmlar r9, r10, r3, r9 + ldr r7 , [r1, #480*4] /* 7 */ + add r12, r12, r11 /* V[06] + V[10] */ + ldr r10, [r1, #608*4] /* 9 */ + smmlar r9, r12, r4, r9 + rsb r10, r10, r7 /* V[07] - V[09] */ + ldr r11, [r1, #512*4] /* 8 */ + smmlar r9, r10, r5, r9 + add r1, r1, #4 /* V+=1, r1 = V[01] */ + smmlar r9, r11, r6, r9 + add r2, r2, #7*4 /* D+=7, r2 = D[16] */ + mov r9, r9, lsl #2 + str r9, [r0], #4 /* store Data */ + + /****************************************** + * rows 01..15 are symmetric to rows 31..17 + * r9 = acc of 01..15 + * r1 = V[01..15] + * r11 = acc of 31..17 + * r12 = V[31..17] + *****************************************/ + mov lr, #15*8 + add r12, r1, #30*4 /* r12 = V[31] */ +.loop15: + ldmia r2!, { r3-r6 } /* load D[00..03] */ + ldr r7, [r12, #896*4] /* 14 */ + ldr r8, [r12, #992*4] /* 15 */ + smmulr r11, r7, r4 + ldr r7, [r1] /* 0 */ + smmlar r11, r8, r3, r11 + ldr r8, [r1, #96*4] /* 1 */ + smmulr r9, r7, r3 + ldr r7, [r12,
#768*4] /* 12 */ + smmlar r9, r8, r4, r9 + ldr r8, [r12, #864*4] /* 13 */ + smmlar r11, r7, r6, r11 + ldr r7, [r1, #128*4] /* 2 */ + smmlar r11, r8, r5, r11 + ldr r8, [r1, #224*4] /* 3 */ + smmlar r9, r7, r5, r9 + ldr r7, [r1, #256*4] /* 4 */ + smmlar r9, r8, r6, r9 + ldmia r2!, { r3-r6 } /* load D[04..07] */ + ldr r8, [r1, #352*4] /* 5 */ + smmlar r9, r7, r3, r9 + ldr r7, [r12, #640*4] /* 10 */ + smmlar r9, r8, r4, r9 + ldr r8, [r12, #736*4] /* 11 */ + smmlar r11, r7, r4, r11 + ldr r7, [r1, #384*4] /* 6 */ + smmlar r11, r8, r3, r11 + ldr r8, [r1, #480*4] /* 7 */ + smmlar r9, r7, r5, r9 + ldr r7, [r12, #512*4] /* 8 */ + smmlar r9, r8, r6, r9 + ldr r8, [r12, #608*4] /* 9 */ + smmlar r11, r7, r6, r11 + ldr r7, [r12, #384*4] /* 6 */ + smmlar r11, r8, r5, r11 + ldmia r2!, { r3-r6 } /* load D[08..11] */ + ldr r8, [r12, #480*4] /* 7 */ + smmlar r11, r7, r4, r11 + ldr r7, [r1, #512*4] /* 8 */ + smmlar r11, r8, r3, r11 + ldr r8, [r1, #608*4] /* 9 */ + smmlar r9, r7, r3, r9 + ldr r7, [r1, #640*4] /* 10 */ + smmlar r9, r8, r4, r9 + ldr r8, [r1, #736*4] /* 11 */ + smmlar r9, r7, r5, r9 + ldr r7, [r12, #256*4] /* 4 */ + smmlar r9, r8, r6, r9 + ldr r8, [r12, #352*4] /* 5 */ + smmlar r11, r7, r6, r11 + ldr r7, [r1, #768*4] /* 12 */ + smmlar r11, r8, r5, r11 + ldmia r2!, { r3-r6 } /* load D[12..15] */ + ldr r8, [r1, #864*4] /* 13 */ + smmlar r9, r7, r3, r9 + ldr r7, [r12, #128*4] /* 2 */ + smmlar r9, r8, r4, r9 + ldr r8, [r12, #224*4] /* 3 */ + smmlar r11, r7, r4, r11 + ldr r7, [r12] /* 0 */ + smmlar r11, r8, r3, r11 + ldr r8, [r12, #96*4] /* 1 */ + smmlar r11, r7, r6, r11 + ldr r7, [r1, #896*4] /* 14 */ + smmlar r11, r8, r5, r11 + ldr r8, [r1, #992*4] /* 15 */ + smmlar r9, r7, r5, r9 + sub r12, r12, #4 /* r12 = V-- correct addresses for next loop */ + smmlar r9, r8, r6, r9 + add r1, r1, #4 /* r1 = V++ correct addresses for next loop */ + rsb r11, r11, #0 /* r11 = -r11 */ + /* store Data[01..15] */ + mov r9, r9, lsl #2 + str r9, [r0] /* store Data */ + /* store Data[31..17] */ +
mov r11, r11, lsl #2 + str r11, [r0, lr] /* store Data */ + add r0, r0, #4 /* r0++ */ + /* next loop */ + subs lr, lr, #8 + bgt .loop15 + + /****************************************** + * V[16] with internal symmetry + *****************************************/ + ldmia r2!, { r3-r6 } /* load D[00..03] */ + ldr r7 , [r1] /* 0 */ + ldr r10, [r1, #992*4] /* 15 */ + ldr r11, [r1, #96*4] /* 1 */ + rsb r10, r10, r7 /* V[00] - V[15] */ + ldr r12, [r1, #896*4] /* 14 */ + smmulr r9, r10, r3 + ldr r7 , [r1, #128*4] /* 2 */ + rsb r12, r12, r11 /* V[01] - V[14] */ + ldr r10, [r1, #864*4] /* 13 */ + smmlar r9, r12, r4, r9 + ldr r11, [r1, #224*4] /* 3 */ + rsb r10, r10, r7 /* V[02] - V[13] */ + ldr r12, [r1, #768*4] /* 12 */ + smmlar r9, r10, r5, r9 + ldr r7 , [r1, #256*4] /* 4 */ + rsb r12, r12, r11 /* V[03] - V[12] */ + ldr r10, [r1, #736*4] /* 11 */ + smmlar r9, r12, r6, r9 + ldmia r2!, { r3-r6 } /* load D[04..07] */ + ldr r11, [r1, #352*4] /* 5 */ + rsb r10, r10, r7 /* V[04] - V[11] */ + ldr r12, [r1, #640*4] /* 10 */ + smmlar r9, r10, r3, r9 + ldr r7 , [r1, #384*4] /* 6 */ + rsb r12, r12, r11 /* V[05] - V[10] */ + ldr r10, [r1, #608*4] /* 9 */ + smmlar r9, r12, r4, r9 + ldr r11, [r1, #480*4] /* 7 */ + rsb r10, r10, r7 /* V[06] - V[09] */ + ldr r12, [r1, #512*4] /* 8 */ + smmlar r9, r10, r5, r9 + rsb r12, r12, r11 /* V[07] - V[08] */ + smmlar r9, r12, r6, r9 + mov r9, r9, lsl #2 + str r9, [r0], #4 /* store Data */ + ldmpc regs=r4-r11 #endif .mpc_dewindowing_end: