libmusepack: ARMv6 assembler for mpc_decoder_windowing_D, speeds up decoding of a 128kbps sample file by 2MHz, or 8%, on the Gigabeat S. The output differs from the C implementation and the other ARM implementation by +/-1 in less than 0.1% of the output samples.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28487 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
8404c53ee6
commit
dbdc0a8a8c
2 changed files with 205 additions and 5 deletions
|
@ -46,10 +46,16 @@
|
|||
|
||||
#if defined(MPC_FIXED_POINT)
|
||||
#if defined(CPU_ARM)
|
||||
// do not up-scale D-values to achieve higher speed in smull/mlal
|
||||
// operations. saves ~14/8 = 1.75 cycles per multiplication
|
||||
#define D(value) (value)
|
||||
|
||||
#if ARM_ARCH >= 6
|
||||
// on ARMv6 we use 32*32=64>>32 multiplies (smmul/smmla) so we need to scale up the D coefficients
|
||||
// the ARM11 multiplier doesn't have early termination so the magnitude of the multiplicands does not
|
||||
// matter for speed.
|
||||
// scale the coefficient up by 2^14; the argument is parenthesized so that a
// compound expression (e.g. a | b) is shifted as a whole
#define D(value) ((value) << 14)
|
||||
#else
|
||||
// do not up-scale D-values to achieve higher speed in smull/mlal
|
||||
// operations. saves ~14/8 = 1.75 cycles per multiplication
|
||||
#define D(value) (value)
|
||||
#endif
|
||||
// in this configuration a post-shift by >>16 is needed after synthesis
|
||||
#else
|
||||
// saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17
|
||||
|
|
|
@ -296,7 +296,7 @@ mpc_decoder_windowing_D:
|
|||
add r1, r1, #4 /* V++ */
|
||||
|
||||
ldmpc regs=r4-r11
|
||||
#else /* arm9 and above */
|
||||
#elif ARM_ARCH < 6 /* arm9 and above */
|
||||
mpc_decoder_windowing_D:
|
||||
/* r0 = Data[] */
|
||||
/* r1 = V[] */
|
||||
|
@ -500,6 +500,200 @@ mpc_decoder_windowing_D:
|
|||
str r8, [r0], #4 /* store Data */
|
||||
add r1, r1, #4 /* V++ */
|
||||
|
||||
ldmpc regs=r4-r11
|
||||
#else
|
||||
mpc_decoder_windowing_D:
|
||||
/* r0 = Data[] (output pointer, post-incremented by 4 on each in-order store) */
|
||||
/* r1 = V[] (input pointer; words at offsets 0, 96, 128, .., 992 are read) */
|
||||
/* r2 = D[] (window coefficients, consumed sequentially with ldmia) */
|
||||
/* lr = loop counter for rows 01..15, also the byte offset of the mirrored store */
|
||||
/************************************************************************
|
||||
* Further speed up through making use of symmetries within D[]-window.
|
||||
* The row V[00] can be extracted as it has symmetries within this single
|
||||
* row. 8 smull/mlal and 8 ldr's can be saved at the cost of 2 add's.
|
||||
* The rows V[01..15] are symmetric to V[31..17]. 15 x 16 ldr's can be
|
||||
* saved at the cost of 15 x 4 + 1 add's.
|
||||
* The row V[16] can be extracted as it has symmetries within this single
|
||||
* row. 8 smull/mlal and 8 ldr's can be saved.
|
||||
* On armv6 use smmulr/smmlar which are faster than smull/smlal and only
|
||||
* accumulate the top 32 bits of the result, which frees up 2
|
||||
* registers so that larger blocks can be loaded with ldm.
|
||||
***********************************************************************/
|
||||
stmfd sp!, {r4-r11, lr} /* save r4-r11 and the return address */
|
||||
|
||||
/******************************************
|
||||
* row 0 with internal symmetry
|
||||
*****************************************/
|
||||
add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */
|
||||
ldmia r2!, { r3-r6 } /* load D[01..04] */
|
||||
ldr r7 , [r1, #96*4] /* 1 */
|
||||
ldr r10, [r1, #992*4] /* 15 */
|
||||
ldr r11, [r1, #128*4] /* 2 */
|
||||
rsb r10, r10, r7 /* V[01] - V[15] */
|
||||
ldr r12, [r1, #896*4] /* 14 */
|
||||
smmulr r9, r10, r3 /* r9 = rounded high word of the product, starts the accumulator */
|
||||
ldr r7 , [r1, #224*4] /* 3 */
|
||||
add r12, r12, r11 /* V[02] + V[14] */
|
||||
ldr r10, [r1, #864*4] /* 13 */
|
||||
smmlar r9, r12, r4, r9
|
||||
ldr r11, [r1, #256*4] /* 4 */
|
||||
rsb r10, r10, r7 /* V[03] - V[13] */
|
||||
ldr r12, [r1, #768*4] /* 12 */
|
||||
smmlar r9, r10, r5, r9
|
||||
ldr r7 , [r1, #352*4] /* 5 */
|
||||
add r12, r12, r11 /* V[04] + V[12] */
|
||||
ldr r10, [r1, #736*4] /* 11 */
|
||||
smmlar r9, r12, r6, r9
|
||||
ldmia r2!, { r3-r6 } /* load D[05..08] */
|
||||
ldr r11, [r1, #384*4] /* 6 */
|
||||
rsb r10, r10, r7 /* V[05] - V[11] */
|
||||
ldr r12, [r1, #640*4] /* 10 */
|
||||
smmlar r9, r10, r3, r9
|
||||
ldr r7 , [r1, #480*4] /* 7 */
|
||||
add r12, r12, r11 /* V[06] + V[10] */
|
||||
ldr r10, [r1, #608*4] /* 9 */
|
||||
smmlar r9, r12, r4, r9
|
||||
rsb r10, r10, r7 /* V[07] - V[09] */
|
||||
ldr r11, [r1, #512*4] /* 8 */
|
||||
smmlar r9, r10, r5, r9
|
||||
add r1, r1, #4 /* V+=1, r1 = V[01] */
|
||||
smmlar r9, r11, r6, r9 /* V[08] has no partner in this row */
|
||||
add r2, r2, #7*4 /* D+=7, r2 = D[16] */
|
||||
mov r9, r9, lsl #2 /* scale the accumulator by 4; NOTE(review): presumably restores the output scale after the >>32 multiplies -- confirm against D() scaling */
|
||||
str r9, [r0], #4 /* store Data */
|
||||
|
||||
/******************************************
|
||||
* rows 01..15 are symmetric to rows 31..17
|
||||
* r9 = acc of 01..15
|
||||
* r1 = V[01..15]
|
||||
* r11 = acc of 31..17
|
||||
* r12 = V[31..17]
* lr = byte offset from r0 to the mirrored output word (15*8, 14*8, .., 8)
* Each group of four coefficients in r3-r6 is used in forward order for
* r9 and in reversed order for r11.
|
||||
*****************************************/
|
||||
mov lr, #15*8
|
||||
add r12, r1, #30*4 /* r12 = V[31] */
|
||||
.loop15:
|
||||
ldmia r2!, { r3-r6 } /* load D[00..03] */
|
||||
ldr r7, [r12, #896*4] /* 14 */
|
||||
ldr r8, [r12, #992*4] /* 15 */
|
||||
smmulr r11, r7, r4 /* start the mirrored-row accumulator r11 */
|
||||
ldr r7, [r1] /* 0 */
|
||||
smmlar r11, r8, r3, r11
|
||||
ldr r8, [r1, #96*4] /* 1 */
|
||||
smmulr r9, r7, r3 /* start the forward-row accumulator r9 */
|
||||
ldr r7, [r12, #768*4] /* 12 */
|
||||
smmlar r9, r8, r4, r9
|
||||
ldr r8, [r12, #864*4] /* 13 */
|
||||
smmlar r11, r7, r6, r11
|
||||
ldr r7, [r1, #128*4] /* 2 */
|
||||
smmlar r11, r8, r5, r11
|
||||
ldr r8, [r1, #224*4] /* 3 */
|
||||
smmlar r9, r7, r5, r9
|
||||
ldr r7, [r1, #256*4] /* 4 */
|
||||
smmlar r9, r8, r6, r9
|
||||
ldmia r2!, { r3-r6 } /* load D[04..07] */
|
||||
ldr r8, [r1, #352*4] /* 5 */
|
||||
smmlar r9, r7, r3, r9
|
||||
ldr r7, [r12, #640*4] /* 10 */
|
||||
smmlar r9, r8, r4, r9
|
||||
ldr r8, [r12, #736*4] /* 11 */
|
||||
smmlar r11, r7, r4, r11
|
||||
ldr r7, [r1, #384*4] /* 6 */
|
||||
smmlar r11, r8, r3, r11
|
||||
ldr r8, [r1, #480*4] /* 7 */
|
||||
smmlar r9, r7, r5, r9
|
||||
ldr r7, [r12, #512*4] /* 8 */
|
||||
smmlar r9, r8, r6, r9
|
||||
ldr r8, [r12, #608*4] /* 9 */
|
||||
smmlar r11, r7, r6, r11
|
||||
ldr r7, [r12, #384*4] /* 6 */
|
||||
smmlar r11, r8, r5, r11
|
||||
ldmia r2!, { r3-r6 } /* load D[08..11] */
|
||||
ldr r8, [r12, #480*4] /* 7 */
|
||||
smmlar r11, r7, r4, r11
|
||||
ldr r7, [r1, #512*4] /* 8 */
|
||||
smmlar r11, r8, r3, r11
|
||||
ldr r8, [r1, #608*4] /* 9 */
|
||||
smmlar r9, r7, r3, r9
|
||||
ldr r7, [r1, #640*4] /* 10 */
|
||||
smmlar r9, r8, r4, r9
|
||||
ldr r8, [r1, #736*4] /* 11 */
|
||||
smmlar r9, r7, r5, r9
|
||||
ldr r7, [r12, #256*4] /* 4 */
|
||||
smmlar r9, r8, r6, r9
|
||||
ldr r8, [r12, #352*4] /* 5 */
|
||||
smmlar r11, r7, r6, r11
|
||||
ldr r7, [r1, #768*4] /* 12 */
|
||||
smmlar r11, r8, r5, r11
|
||||
ldmia r2!, { r3-r6 } /* load D[12..15] */
|
||||
ldr r8, [r1, #864*4] /* 13 */
|
||||
smmlar r9, r7, r3, r9
|
||||
ldr r7, [r12, #128*4] /* 2 */
|
||||
smmlar r9, r8, r4, r9
|
||||
ldr r8, [r12, #224*4] /* 3 */
|
||||
smmlar r11, r7, r4, r11
|
||||
ldr r7, [r12] /* 0 */
|
||||
smmlar r11, r8, r3, r11
|
||||
ldr r8, [r12, #96*4] /* 1 */
|
||||
smmlar r11, r7, r6, r11
|
||||
ldr r7, [r1, #896*4] /* 14 */
|
||||
smmlar r11, r8, r5, r11
|
||||
ldr r8, [r1, #992*4] /* 15 */
|
||||
smmlar r9, r7, r5, r9
|
||||
sub r12, r12, #4 /* r12 = V-- correct addresses for next loop */
|
||||
smmlar r9, r8, r6, r9
|
||||
add r1, r1, #4 /* r1 = V++ correct addresses for next loop */
|
||||
rsb r11, r11, #0 /* r11 = -r11 (the mirrored row is stored with opposite sign) */
|
||||
/* store Data[01..15] */
|
||||
mov r9, r9, lsl #2 /* same x4 scaling as for row 0 */
|
||||
str r9, [r0] /* store Data */
|
||||
/* store Data[31..17] */
|
||||
mov r11, r11, lsl #2 /* same x4 scaling as for row 0 */
|
||||
str r11, [r0, lr] /* store Data at the mirrored position r0 + lr */
|
||||
add r0, r0, #4 /* r0++ */
|
||||
/* next loop */
|
||||
subs lr, lr, #8 /* r0 moved up 4 bytes, so -8 moves the mirrored target down 4 bytes */
|
||||
bgt .loop15 /* 15 iterations: lr = 120, 112, .., 8 */
|
||||
|
||||
/******************************************
|
||||
* V[16] with internal symmetry
* After the 15 loop iterations r1 has advanced from V[01] to V[16] and
* r0 points at the 17th output word.
|
||||
*****************************************/
|
||||
ldmia r2!, { r3-r6 } /* load D[00..03] */
|
||||
ldr r7 , [r1] /* 0 */
|
||||
ldr r10, [r1, #992*4] /* 15 */
|
||||
ldr r11, [r1, #96*4] /* 1 */
|
||||
rsb r10, r10, r7 /* V[00] - V[15] */
|
||||
ldr r12, [r1, #896*4] /* 14 */
|
||||
smmulr r9, r10, r3 /* start accumulator r9 */
|
||||
ldr r7 , [r1, #128*4] /* 2 */
|
||||
rsb r12, r12, r11 /* V[01] - V[14] */
|
||||
ldr r10, [r1, #864*4] /* 13 */
|
||||
smmlar r9, r12, r4, r9
|
||||
ldr r11, [r1, #224*4] /* 3 */
|
||||
rsb r10, r10, r7 /* V[02] - V[13] */
|
||||
ldr r12, [r1, #768*4] /* 12 */
|
||||
smmlar r9, r10, r5, r9
|
||||
ldr r7 , [r1, #256*4] /* 4 */
|
||||
rsb r12, r12, r11 /* V[03] - V[12] */
|
||||
ldr r10, [r1, #736*4] /* 11 */
|
||||
smmlar r9, r12, r6, r9
|
||||
ldmia r2!, { r3-r6 } /* load D[04..07] */
|
||||
ldr r11, [r1, #352*4] /* 5 */
|
||||
rsb r10, r10, r7 /* V[04] - V[11] */
|
||||
ldr r12, [r1, #640*4] /* 10 */
|
||||
smmlar r9, r10, r3, r9
|
||||
ldr r7 , [r1, #384*4] /* 6 */
|
||||
rsb r12, r12, r11 /* V[05] - V[10] */
|
||||
ldr r10, [r1, #608*4] /* 9 */
|
||||
smmlar r9, r12, r4, r9
|
||||
ldr r11, [r1, #480*4] /* 7 */
|
||||
rsb r10, r10, r7 /* V[06] - V[09] */
|
||||
ldr r12, [r1, #512*4] /* 8 */
|
||||
smmlar r9, r10, r5, r9
|
||||
rsb r12, r12, r11 /* V[07] - V[08] */
|
||||
smmlar r9, r12, r6, r9
|
||||
mov r9, r9, lsl #2 /* same x4 scaling as for the other rows */
|
||||
str r9, [r0], #4 /* store Data */
|
||||
|
||||
ldmpc regs=r4-r11 /* NOTE(review): ldmpc is a macro defined outside this view; presumably restores r4-r11 and returns by loading pc -- confirm */
|
||||
#endif
|
||||
.mpc_dewindowing_end:
|
||||
|
|
Loading…
Reference in a new issue