rockbox/apps/codecs/libffmpegFLAC/arm.S
Thom Johansen 4ba14d9113 Free one register extra so we can block load a bit more.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@9069 a1c6a512-1295-4272-9138-f99709370657
2006-03-17 00:31:15 +00:00

263 lines
6.9 KiB
ArmAsm

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2006 by Thom Johansen
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
/* The following is an assembler optimised version of the LPC filtering
routines needed for FLAC decoding. It is optimised for use with ARM
processors.
All LPC filtering up to order 9 is done in specially optimised unrolled
loops, while every order above this is handled by a slower default routine.
*/
.section .icode,"ax",%progbits
.global lpc_decode_arm
lpc_decode_arm:
stmdb sp!, { r4-r11, lr }
ldr r4, [sp, #36]
/* r0 = blocksize, r1 = qlevel, r2 = pred_order
r3 = data, r4 = coeffs
*/
/* the data pointer always lags behind history pointer by 'pred_order'
samples. since we have one loop for each order, we can hard code this
and free a register by not saving data pointer.
*/
sub r3, r3, r2, lsl #2 @ r3 = history
cmp r0, #0 @ no samples to process
beq .exit
cmp r2, #9 @ check if order is too high for unrolled loops
addls pc, pc, r2, lsl #2 @ jump to our unrolled decode loop if it exists
@ jumptable:
b .default @ order too high, go to default routine
b .exit @ zero order filter isn't possible, exit function
b .order1
b .order2
b .order3
b .order4
b .order5
b .order6
b .order7
b .order8
@ last jump table entry coincides with target, so leave it out
.order9:
ldmia r4, { r5-r12, r14 } @ fetch coefs
.loop9:
ldr r4, [r3], #4 @ load first history sample
mul r2, r4, r14 @ multiply with last coef
ldr r4, [r3], #4 @ rinse and repeat while accumulating sum in r2
mla r2, r4, r12, r2
ldr r4, [r3], #4
mla r2, r4, r11, r2
ldr r4, [r3], #4
mla r2, r4, r10, r2
ldr r4, [r3], #4
mla r2, r4, r9, r2
ldr r4, [r3], #4
mla r2, r4, r8, r2
ldr r4, [r3], #4
mla r2, r4, r7, r2
ldr r4, [r3], #4
mla r2, r4, r6, r2
ldr r4, [r3], #4
mla r2, r4, r5, r2
ldr r4, [r3] @ r4 = residual
add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
str r2, [r3], #-8*4 @ save result and wrap history pointer back
subs r0, r0, #1 @ check if we're done
bne .loop9 @ nope, jump back
b .exit
.order8:
ldmia r4, { r5-r12 }
.loop8:
@ we have more registers to spare here, so start block reading
ldmia r3!, { r4, r14 }
mul r2, r4, r12
mla r2, r14, r11, r2
ldmia r3!, { r4, r14 }
mla r2, r4, r10, r2
mla r2, r14, r9, r2
ldmia r3!, { r4, r14 }
mla r2, r4, r8, r2
mla r2, r14, r7, r2
ldmia r3!, { r4, r14 }
mla r2, r4, r6, r2
mla r2, r14, r5, r2
ldr r4, [r3]
add r2, r4, r2, asr r1
str r2, [r3], #-7*4
subs r0, r0, #1
bne .loop8
b .exit
.order7:
ldmia r4, { r5-r11 }
.loop7:
ldmia r3!, { r4, r12, r14 }
mul r2, r4, r11
mla r2, r12, r10, r2
mla r2, r14, r9, r2
ldmia r3!, { r4, r12, r14 }
mla r2, r4, r8, r2
mla r2, r12, r7, r2
mla r2, r14, r6, r2
ldr r4, [r3], #4
mla r2, r4, r5, r2
ldr r4, [r3]
add r2, r4, r2, asr r1
str r2, [r3], #-6*4
subs r0, r0, #1
bne .loop7
b .exit
.order6:
ldmia r4, { r5-r10 }
.loop6:
ldmia r3!, { r4, r11-r12, r14 }
mul r2, r4, r10
mla r2, r11, r9, r2
mla r2, r12, r8, r2
mla r2, r14, r7, r2
ldmia r3!, { r4, r11 }
mla r2, r4, r6, r2
mla r2, r11, r5, r2
ldr r4, [r3]
add r2, r4, r2, asr r1
str r2, [r3], #-5*4
subs r0, r0, #1
bne .loop6
b .exit
.order5:
ldmia r4, { r5-r9 }
.loop5:
ldmia r3!, { r4, r10-r12, r14 }
mul r2, r4, r9
mla r2, r10, r8, r2
mla r2, r11, r7, r2
mla r2, r12, r6, r2
mla r2, r14, r5, r2
ldr r4, [r3]
add r2, r4, r2, asr r1
str r2, [r3], #-4*4
subs r0, r0, #1
bne .loop5
b .exit
.order4:
ldmia r4, { r5-r8 }
.loop4:
ldmia r3!, { r4, r11-r12, r14 }
mul r2, r4, r8
mla r2, r11, r7, r2
mla r2, r12, r6, r2
mla r2, r14, r5, r2
ldr r4, [r3]
add r2, r4, r2, asr r1
str r2, [r3], #-3*4
subs r0, r0, #1
bne .loop4
b .exit
.order3:
ldmia r4, { r5-r7 }
.loop3:
ldmia r3!, { r4, r12, r14 }
mul r2, r4, r7
mla r2, r12, r6, r2
mla r2, r14, r5, r2
ldr r4, [r3]
add r2, r4, r2, asr r1
str r2, [r3], #-2*4
subs r0, r0, #1
bne .loop3
b .exit
.order2:
ldmia r4, { r5-r6 }
.loop2:
ldmia r3!, { r4, r14 }
mul r2, r4, r6
mla r2, r14, r5, r2
ldr r4, [r3]
add r2, r4, r2, asr r1
str r2, [r3], #-1*4
subs r0, r0, #1
bne .loop2
b .exit
.order1:
ldr r5, [r4]
ldr r4, [r3], #4
.loop1:
mul r2, r4, r5
ldr r4, [r3]
add r2, r4, r2, asr r1
str r2, [r3], #4
subs r0, r0, #1
bne .loop1
b .exit
.default:
/* we do the filtering in an unrolled by 4 loop as far as we can, and then
do the rest by jump table. */
add r5, r4, r2, lsl #2 @ need to start in the other end of coefs
mov r7, r2, lsr #2 @ r7 = coefs/4
mov r14, #0 @ init accumulator
.dloop1:
ldmdb r5!, { r8-r11 }
ldmia r3!, { r6, r12 }
mla r14, r6, r11, r14
mla r14, r12, r10, r14
ldmia r3!, { r6, r12 }
mla r14, r6, r9, r14
mla r14, r12, r8, r14
subs r7, r7, #1
bne .dloop1
and r7, r2, #3 @ get remaining samples to be filtered
add pc, pc, r7, lsl #2 @ jump into accumulator chain
@ jumptable:
b .dsave @ padding
b .dsave
b .oneleft
b .twoleft
@ implicit .threeleft
ldr r12, [r5, #-4]!
ldr r8, [r3], #4
mla r14, r12, r8, r14
.twoleft:
ldr r12, [r5, #-4]!
ldr r8, [r3], #4
mla r14, r12, r8, r14
.oneleft:
ldr r12, [r5, #-4]!
ldr r8, [r3], #4
mla r14, r12, r8, r14
.dsave:
ldr r12, [r3] @ load residual
add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
str r14, [r3], #4 @ store result
sub r3, r3, r2, lsl #2 @ and wrap history pointer back to next first pos
subs r0, r0, #1 @ are we done?
bne .default @ no, prepare for next sample
.exit:
ldmia sp!, { r4-r11, pc }