ARM9-optimized synth_full for libmad. Speeds up MP3 decoding by an even 2 MHz on all ARM9 and later devices. Note that this is only optimized for ARM9 (non-E), although it is also faster on later devices; an ARM9E/ARM11 version will be needed for optimal performance on newer devices.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28710 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Michael Giacomelli 2010-11-29 22:34:51 +00:00
parent 90d77fb77a
commit 9929512682

View file

@ -31,7 +31,12 @@
;; r1 = fo
;; r2 = fe
;; r3 = D0ptr
;; r4 = D1ptr
;; r4 = D1ptr
/*;; r5 = loop counter
;; r6,r7 accumulator1
;; r8,r9 accumulator2 */
synth_full_odd_sbsample:
stmdb sp!, {r4-r11, lr}
ldr r4, [sp, #36]
@ -40,88 +45,89 @@ synth_full_odd_sbsample:
mov r5, #15
add r2, r2, #32
.l:
/* ;; PROD_O and odd half of SB_SAMPLE*/
add r3, r3, #128
add r4, r4, #128
ldmia r1!, {r10, r11, r12, lr}
ldr r7, [r3, #4]
smull r6, r7, r10, r7
ldmia r1!, {r10, r11, r12, lr}
ldr r9, [r4, #120]
smull r6, r7, r10, r7
ldr sp, [r3, #60]
smull r8, r9, r10, r9
ldr r10, [r3, #60]
smlal r6, r7, r11, r10
ldr r10, [r3, #52]
smlal r6, r7, r11, sp
ldr sp, [r3, #44]
smlal r6, r7, r12, r10
ldr r10, [r3, #44]
smlal r6, r7, lr, r10
ldr r10, [r4, #64]
smlal r6, r7, lr, sp
ldr sp, [r4, #72]
smlal r8, r9, r11, r10
ldr r10, [r4, #72]
smlal r8, r9, r12, r10
ldr r10, [r4, #80]
smlal r8, r9, r12, sp
smlal r8, r9, lr, r10
ldr r10, [r3, #36]
ldmia r1!, {r11, r12, sp, lr}
ldr r10, [r3, #36]
smlal r6, r7, r11, r10
ldr r10, [r3, #28]
smlal r6, r7, r12, r10
ldr r10, [r3, #20]
smlal r6, r7, sp, r10
ldr r10, [r3, #12]
smlal r6, r7, lr, r10
ldr r10, [r4, #88]
ldr r10, [r4, #88] /*;;1 cycle stall on arm9, but we free up r11*/
smlal r8, r9, r11, r10
ldr r10, [r4, #96]
smlal r8, r9, r12, r10
ldr r10, [r3, #28]
ldr r11, [r3, #20]
smlal r6, r7, r12, r10
ldr r10, [r3, #12]
smlal r6, r7, sp, r11
ldr r11, [r4, #96]
smlal r6, r7, lr, r10
ldr r10, [r4, #104]
smlal r8, r9, r12, r11
ldr r11, [r4, #112]
smlal r8, r9, sp, r10
ldr r10, [r4, #112]
smlal r8, r9, lr, r10
smlal r8, r9, lr, r11
rsbs r6, r6, #0
rsc r7, r7, #0
ldmia r2!, {r11, r12, sp, lr}
/* ;; PROD_A and even half of SB_SAMPLE*/
ldr r10, [r3, #0]
smlal r6, r7, r11, r10
ldr r10, [r3, #56]
smlal r6, r7, r12, r10
ldr r10, [r3, #48]
smlal r6, r7, sp, r10
ldr r10, [r3, #40]
smlal r6, r7, lr, r10
ldr r10, [r4, #60]
smlal r8, r9, r11, r10
ldr r10, [r4, #68]
smlal r8, r9, r12, r10
ldr r10, [r4, #76]
smlal r8, r9, sp, r10
ldr r10, [r4, #84]
smlal r8, r9, lr, r10
ldmia r2!, {r11, r12, sp, lr}
ldr r10, [r3, #32]
smlal r6, r7, r11, r10
ldr r10, [r3, #24]
smlal r6, r7, r12, r10
ldr r10, [r3, #16]
ldr r10, [r4, #60] /*;;1 cycle stall on arm9, but we free up r11*/
smlal r8, r9, r11, r10
ldr r11, [r3, #56]
ldr r10, [r3, #48]
smlal r6, r7, r12, r11
ldr r11, [r3, #40]
smlal r6, r7, sp, r10
ldr r10, [r3, #8]
smlal r6, r7, lr, r10
ldr r10, [r4, #92]
smlal r8, r9, r11, r10
ldr r10, [r4, #100]
ldr r10, [r4, #68]
smlal r6, r7, lr, r11
ldr r11, [r4, #76]
smlal r8, r9, r12, r10
ldr r10, [r4, #108]
smlal r8, r9, sp, r10
ldr r10, [r4, #116]
ldr r10, [r4, #84]
smlal r8, r9, sp, r11
smlal r8, r9, lr, r10
ldr r10, [r3, #32]
ldmia r2!, {r11, r12, sp, lr}
smlal r6, r7, r11, r10
ldr r10, [r4, #92] /*;;1 cycle stall on arm9, but we free up r11*/
smlal r8, r9, r11, r10
ldr r10, [r3, #24]
ldr r11, [r3, #16]
smlal r6, r7, r12, r10
ldr r10, [r3, #8]
smlal r6, r7, sp, r11
ldr r11, [r4, #100]
smlal r6, r7, lr, r10
ldr r10, [r4, #108]
smlal r8, r9, r12, r11
ldr r11, [r4, #116]
smlal r8, r9, sp, r10
smlal r8, r9, lr, r11
movs r6, r6, lsr #16
adc r6, r6, r7, lsl #16
@ -146,88 +152,88 @@ synth_full_even_sbsample:
mov r5, #15
add r2, r2, #32
.l2:
/* ;; PROD_O and odd half of SB_SAMPLE*/
add r3, r3, #128
add r4, r4, #128
ldmia r1!, {r10, r11, r12, lr}
ldr r7, [r3, #0]
smull r6, r7, r10, r7
ldmia r1!, {r10, r11, r12, lr}
ldr r9, [r4, #60]
smull r6, r7, r10, r7
ldr sp, [r3, #56]
smull r8, r9, r10, r9
ldr r10, [r3, #56]
smlal r6, r7, r11, r10
ldr r10, [r3, #48]
smlal r6, r7, r11, sp
ldr sp, [r3, #40]
smlal r6, r7, r12, r10
ldr r10, [r3, #40]
smlal r6, r7, lr, r10
ldr r10, [r4, #68]
smlal r6, r7, lr, sp
ldr sp, [r4, #76]
smlal r8, r9, r11, r10
ldr r10, [r4, #76]
smlal r8, r9, r12, r10
ldr r10, [r4, #84]
smlal r8, r9, r12, sp
smlal r8, r9, lr, r10
ldmia r1!, {r11, r12, sp, lr}
ldr r10, [r3, #32]
smlal r6, r7, r11, r10
ldr r10, [r3, #24]
smlal r6, r7, r12, r10
ldr r10, [r3, #16]
smlal r6, r7, sp, r10
ldr r10, [r3, #8]
smlal r6, r7, lr, r10
ldmia r1!, {r11, r12, sp, lr}
ldr r10, [r4, #92]
smlal r6, r7, r11, r10
ldr r10, [r4, #92]
smlal r8, r9, r11, r10
ldr r10, [r4, #100]
smlal r8, r9, r12, r10
ldr r10, [r3, #24]
ldr r11, [r3, #16]
smlal r6, r7, r12, r10
ldr r10, [r3, #8]
smlal r6, r7, sp, r11
ldr r11, [r4, #100]
smlal r6, r7, lr, r10
ldr r10, [r4, #108]
smlal r8, r9, r12, r11
ldr r11, [r4, #116]
smlal r8, r9, sp, r10
ldr r10, [r4, #116]
smlal r8, r9, lr, r10
smlal r8, r9, lr, r11
rsbs r6, r6, #0
rsc r7, r7, #0
ldmia r2!, {r11, r12, sp, lr}
ldr r10, [r3, #4]
smlal r6, r7, r11, r10
ldr r10, [r3, #60]
smlal r6, r7, r12, r10
ldr r10, [r3, #52]
smlal r6, r7, sp, r10
ldr r10, [r3, #44]
smlal r6, r7, lr, r10
ldr r10, [r4, #120]
smlal r8, r9, r11, r10
ldr r10, [r4, #64]
smlal r8, r9, r12, r10
ldr r10, [r4, #72]
smlal r8, r9, sp, r10
ldr r10, [r4, #80]
smlal r8, r9, lr, r10
ldmia r2!, {r11, r12, sp, lr}
ldr r10, [r3, #36]
smlal r6, r7, r11, r10
ldr r10, [r3, #28]
ldr r10, [r4, #120] /*;;1 cycle stall on arm9, but we free up r11*/
smlal r8, r9, r11, r10
ldr r10, [r3, #60]
ldr r11, [r3, #52]
smlal r6, r7, r12, r10
ldr r10, [r3, #20]
smlal r6, r7, sp, r10
ldr r10, [r3, #12]
ldr r10, [r3, #44]
smlal r6, r7, sp, r11
ldr r11, [r4, #64]
smlal r6, r7, lr, r10
ldr r10, [r4, #88]
smlal r8, r9, r11, r10
ldr r10, [r4, #96]
smlal r8, r9, r12, r10
ldr r10, [r4, #104]
ldr r10, [r4, #72]
smlal r8, r9, r12, r11
ldr r11, [r4, #80]
smlal r8, r9, sp, r10
ldr r10, [r4, #112]
smlal r8, r9, lr, r10
smlal r8, r9, lr, r11
ldr r10, [r3, #36]
ldmia r2!, {r11, r12, sp, lr}
smlal r6, r7, r11, r10
ldr r10, [r4, #88] /*;;1 cycle stall on arm9, but we free up r11*/
smlal r8, r9, r11, r10
ldr r10, [r3, #28]
ldr r11, [r3, #20]
smlal r6, r7, r12, r10
ldr r10, [r3, #12]
smlal r6, r7, sp, r11
ldr r11, [r4, #96]
smlal r6, r7, lr, r10
ldr r10, [r4, #104]
smlal r8, r9, r12, r11
ldr r11, [r4, #112]
smlal r8, r9, sp, r10
smlal r8, r9, lr, r11
movs r6, r6, lsr #16
adc r6, r6, r7, lsl #16