rockbox/apps/codecs/libffmpegFLAC/arm.S
Daniel Stenberg 2acc0ac542 Updated our source code header to explicitly mention that we are GPL v2 or
later. We still need to hunt down snippets used that are not. 1324 modified
files...
http://www.rockbox.org/mail/archive/rockbox-dev-archive-2008-06/0060.shtml


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@17847 a1c6a512-1295-4272-9138-f99709370657
2008-06-28 18:10:04 +00:00

271 lines
7.4 KiB
ArmAsm

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2006 by Thom Johansen
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
/* The following is an assembler optimised version of the LPC filtering
routines needed for FLAC decoding. It is optimised for use with ARM
processors.
All LPC filtering up to order 9 is done in specially optimised unrolled
loops, while every order above this is handled by a slower default routine.
*/
#ifdef USE_IRAM
    .section .icode,"ax",%progbits
#else
    .text
#endif
    .global lpc_decode_arm

/* lpc_decode_arm: entry point and per-order dispatch of the LPC decoder.
 *
 * Inputs as used by the code:
 *   r0   = blocksize, number of samples to reconstruct (handled as signed)
 *   r1   = qlevel, right shift applied to every prediction sum
 *   r2   = pred_order, number of coefficients / history samples per output
 *   r3   = data, address of the first residual to be replaced
 *   [sp] = coeffs, address of the coefficient array (read from the stack)
 *
 * r4-r11 and lr are pushed here and popped again at .exit. r12 is used as
 * scratch by the decode loops and is not preserved.
 *
 * Every decode loop replaces the residual it reads with
 *   residual + ((sum of coefficient * history sample) >> qlevel)
 * and then moves on by one sample until r0 reaches zero.
 */
lpc_decode_arm:
    stmdb sp!, { r4-r11, lr }  @ 9 registers pushed = 36 bytes
    ldr r4, [sp, #36]          @ r4 = coeffs, located just above the saved registers

    /* The history pointer always trails the data pointer by 'pred_order'
       samples. Since there is one loop per order, that distance is hard coded
       in each loop and only the history pointer needs a register.
    */
    sub r3, r3, r2, lsl #2     @ r3 = history

    /* The loops count r0 down with subs/bne until it is exactly zero, so a
       negative count would keep running (and storing) until r0 wraps around.
       Reject it here together with the empty block. */
    cmp r0, #0
    ble .exit                  @ zero or negative count: nothing to process

    /* Orders 0..9 are dispatched through the branch table. Reading pc as an
       operand gives the address of the addls instruction plus 8, which is the
       'b .exit' entry, so adding pred_order * 4 selects entry 'pred_order'.
       For orders above 9 the addls is skipped and execution falls into
       'b .default'. */
    cmp r2, #9                 @ check if order is too high for unrolled loops
    addls pc, pc, r2, lsl #2   @ jump to our unrolled decode loop if it exists
@ jumptable:
    b .default                 @ order too high, go to default routine
    b .exit                    @ a zero order filter is not possible, exit function
    b .order1
    b .order2
    b .order3
    b .order4
    b .order5
    b .order6
    b .order7
    b .order8
@ last jump table entry coincides with target, so leave it out: index 9 lands
@ on the first instruction after the table and .order9 must follow directly
/* Order 9 decode loop. Jump table index 9 has no branch of its own and lands
   on the first instruction behind the table, so this label has to stay
   directly after it.
   r5-r12 and r14 receive coeffs[0]..coeffs[8] in that order. r4 (the coeffs
   pointer) is no longer needed afterwards and is reused for the samples;
   r2 accumulates the sum. */
.order9:
ldmia r4, { r5-r12, r14 } @ fetch coefs
/* One iteration reconstructs one sample: nine history words are read with
   post-increment, the word at the lowest address being multiplied with the
   last coefficient, which leaves r3 pointing at the residual. */
.loop9:
ldr r4, [r3], #4 @ load first history sample
mul r2, r4, r14 @ multiply with last coef
ldr r4, [r3], #4 @ rinse and repeat while accumulating sum in r2
mla r2, r4, r12, r2
ldr r4, [r3], #4
mla r2, r4, r11, r2
ldr r4, [r3], #4
mla r2, r4, r10, r2
ldr r4, [r3], #4
mla r2, r4, r9, r2
ldr r4, [r3], #4
mla r2, r4, r8, r2
ldr r4, [r3], #4
mla r2, r4, r7, r2
ldr r4, [r3], #4
mla r2, r4, r6, r2
ldr r4, [r3], #4
mla r2, r4, r5, r2
ldr r4, [r3] @ r4 = residual
add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual
/* The post-indexed store steps r3 back 8 words, i.e. to one word past the
   history start used for this sample. */
str r2, [r3], #-8*4 @ save result and wrap history pointer back
subs r0, r0, #1 @ check if we're done
bne .loop9 @ nope, jump back
b .exit
/* Order 8 decode loop.
   r5-r12 = coeffs[0]..coeffs[7]; r4 and r14 take the history words, two per
   ldmia with write-back; r2 accumulates the sum. */
.order8:
ldmia r4, { r5-r12 }
.loop8:
@ we have more registers to spare here, so start block reading
ldmia r3!, { r4, r14 }
mul r2, r4, r12 /* first history word * last coefficient starts the sum */
mla r2, r14, r11, r2
ldmia r3!, { r4, r14 }
mla r2, r4, r10, r2
mla r2, r14, r9, r2
ldmia r3!, { r4, r14 }
mla r2, r4, r8, r2
mla r2, r14, r7, r2
ldmia r3!, { r4, r14 }
mla r2, r4, r6, r2
mla r2, r14, r5, r2
ldr r4, [r3] /* r3 has advanced 8 words and now addresses the residual */
add r2, r4, r2, asr r1 /* residual + (sum >> qlevel) */
str r2, [r3], #-7*4 /* store result, then step back to one word past the old history start */
subs r0, r0, #1 /* one sample done */
bne .loop8
b .exit
/* Order 7 decode loop.
   r5-r11 = coeffs[0]..coeffs[6]; history is read as 3 + 3 + 1 words into
   r4, r12 and r14; r2 accumulates the sum. */
.order7:
ldmia r4, { r5-r11 }
.loop7:
ldmia r3!, { r4, r12, r14 }
mul r2, r4, r11 /* first history word * last coefficient starts the sum */
mla r2, r12, r10, r2
mla r2, r14, r9, r2
ldmia r3!, { r4, r12, r14 }
mla r2, r4, r8, r2
mla r2, r12, r7, r2
mla r2, r14, r6, r2
ldr r4, [r3], #4 /* seventh and last history word */
mla r2, r4, r5, r2
ldr r4, [r3] /* r3 now addresses the residual */
add r2, r4, r2, asr r1 /* residual + (sum >> qlevel) */
str r2, [r3], #-6*4 /* store result, then step back to one word past the old history start */
subs r0, r0, #1 /* one sample done */
bne .loop7
b .exit
/* Order 6 decode loop.
   r5-r10 = coeffs[0]..coeffs[5]; history is read as 4 + 2 words, using r4,
   r11, r12 and r14 as sample registers; r2 accumulates the sum. */
.order6:
ldmia r4, { r5-r10 }
.loop6:
ldmia r3!, { r4, r11-r12, r14 }
mul r2, r4, r10 /* first history word * last coefficient starts the sum */
mla r2, r11, r9, r2
mla r2, r12, r8, r2
mla r2, r14, r7, r2
ldmia r3!, { r4, r11 } /* remaining two history words */
mla r2, r4, r6, r2
mla r2, r11, r5, r2
ldr r4, [r3] /* r3 now addresses the residual */
add r2, r4, r2, asr r1 /* residual + (sum >> qlevel) */
str r2, [r3], #-5*4 /* store result, then step back to one word past the old history start */
subs r0, r0, #1 /* one sample done */
bne .loop6
b .exit
/* Order 5 decode loop.
   r5-r9 = coeffs[0]..coeffs[4]; all five history words are fetched with a
   single ldmia into r4, r10, r11, r12 and r14; r2 accumulates the sum. */
.order5:
ldmia r4, { r5-r9 }
.loop5:
ldmia r3!, { r4, r10-r12, r14 }
mul r2, r4, r9 /* first history word * last coefficient starts the sum */
mla r2, r10, r8, r2
mla r2, r11, r7, r2
mla r2, r12, r6, r2
mla r2, r14, r5, r2
ldr r4, [r3] /* write-back left r3 at the residual */
add r2, r4, r2, asr r1 /* residual + (sum >> qlevel) */
str r2, [r3], #-4*4 /* store result, then step back to one word past the old history start */
subs r0, r0, #1 /* one sample done */
bne .loop5
b .exit
/* Order 4 decode loop.
   r5-r8 = coeffs[0]..coeffs[3]; the four history words are fetched with one
   ldmia into r4, r11, r12 and r14; r2 accumulates the sum. */
.order4:
ldmia r4, { r5-r8 }
.loop4:
ldmia r3!, { r4, r11-r12, r14 }
mul r2, r4, r8 /* first history word * last coefficient starts the sum */
mla r2, r11, r7, r2
mla r2, r12, r6, r2
mla r2, r14, r5, r2
ldr r4, [r3] /* write-back left r3 at the residual */
add r2, r4, r2, asr r1 /* residual + (sum >> qlevel) */
str r2, [r3], #-3*4 /* store result, then step back to one word past the old history start */
subs r0, r0, #1 /* one sample done */
bne .loop4
b .exit
/* Order 3 decode loop.
   r5-r7 = coeffs[0]..coeffs[2]; the three history words are fetched with one
   ldmia into r4, r12 and r14; r2 accumulates the sum. */
.order3:
ldmia r4, { r5-r7 }
.loop3:
ldmia r3!, { r4, r12, r14 }
mul r2, r4, r7 /* first history word * last coefficient starts the sum */
mla r2, r12, r6, r2
mla r2, r14, r5, r2
ldr r4, [r3] /* write-back left r3 at the residual */
add r2, r4, r2, asr r1 /* residual + (sum >> qlevel) */
str r2, [r3], #-2*4 /* store result, then step back to one word past the old history start */
subs r0, r0, #1 /* one sample done */
bne .loop3
b .exit
/* Order 2 decode loop.
   r5 = coeffs[0], r6 = coeffs[1]; both history words are fetched with one
   ldmia into r4 and r14; r2 accumulates the sum. */
.order2:
ldmia r4, { r5-r6 }
.loop2:
ldmia r3!, { r4, r14 }
mul r2, r4, r6 /* first history word * last coefficient starts the sum */
mla r2, r14, r5, r2
ldr r4, [r3] /* write-back left r3 at the residual */
add r2, r4, r2, asr r1 /* residual + (sum >> qlevel) */
str r2, [r3], #-1*4 /* store result, then step back to one word past the old history start */
subs r0, r0, #1 /* one sample done */
bne .loop2
b .exit
/* Order 1 decode loop.
   Only one history sample is needed per output and it is always the value
   that was just stored, so it is kept in r4 across iterations. The single
   history load therefore happens once, before the loop, and r3 simply walks
   forward over the residuals without being stepped back. r5 holds the
   coefficient, r2 the product. */
.order1:
ldr r5, [r4] @ load the one coef we need
ldr r4, [r3], #4 @ load one history sample, r3 now points to residual
.loop1:
mul r2, r4, r5 @ multiply coef by history sample
ldr r4, [r3] @ load residual
add r4, r4, r2, asr r1 @ add result to residual
str r4, [r3], #4 @ place r3 at next residual, we already have
subs r0, r0, #1 @ the current sample in r4 for the next iteration
bne .loop1
b .exit
/* Generic decode path, taken through the first jump table entry when
   pred_order is above 9. The coefficients stay in memory and are re-read for
   every sample, so the whole setup below is repeated once per sample.
   Registers: r4 = coeffs, r2 = pred_order, r3 = history pointer,
   r5 = coefficient read pointer (walks downwards), r7 = loop counter / jump
   index, r14 = accumulator, r6/r12 and r8-r11 = samples and coefficients. */
.default:
/* we do the filtering in an unrolled by 4 loop as far as we can, and then
do the rest by jump table. */
add r5, r4, r2, lsl #2 @ need to start in the other end of coefs
mov r7, r2, lsr #2 @ r7 = coefs/4
mov r14, #0 @ init accumulator
/* Four taps per pass. ldmdb moves r5 down by four words and fills r8-r11
   from ascending addresses, so r11 is the highest remaining coefficient; it
   is paired with the lowest unread history word.
   The body runs once before r7 is tested, which is fine because this path is
   only reached with pred_order above 9 (r7 is at least 2). */
.dloop1:
ldmdb r5!, { r8-r11 }
ldmia r3!, { r6, r12 }
mla r14, r6, r11, r14
mla r14, r12, r10, r14
ldmia r3!, { r6, r12 }
mla r14, r6, r9, r14
mla r14, r12, r8, r14
subs r7, r7, #1
bne .dloop1
/* Handle the pred_order & 3 taps that are left. pc reads as the address of
   the add instruction plus 8, so index 0 selects the second table entry and
   the first one is never reached; it only fills the slot. Index 3 lands on
   the first instruction behind the table. */
and r7, r2, #3 @ get remaining samples to be filtered
add pc, pc, r7, lsl #2 @ jump into accumulator chain
@ jumptable:
b .dsave @ padding
b .dsave
b .oneleft
b .twoleft
@ implicit .threeleft
/* Each step below takes the next lower coefficient (pre-decrement with
   write-back on r5) and the next history word (post-increment on r3); the
   entry points fall through into each other. */
ldr r12, [r5, #-4]!
ldr r8, [r3], #4
mla r14, r12, r8, r14
.twoleft:
ldr r12, [r5, #-4]!
ldr r8, [r3], #4
mla r14, r12, r8, r14
.oneleft:
ldr r12, [r5, #-4]!
ldr r8, [r3], #4
mla r14, r12, r8, r14
/* r3 has consumed pred_order history words and addresses the residual. */
.dsave:
ldr r12, [r3] @ load residual
add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
str r14, [r3], #4 @ store result
sub r3, r3, r2, lsl #2 @ and wrap history pointer back to next first pos
subs r0, r0, #1 @ are we done?
bne .default @ no, prepare for next sample
/* When r0 reaches zero, execution falls through to the code that follows. */
/* Common exit for every path: pop the registers pushed on entry. The slot
   that was saved from lr is loaded into pc, which performs the return. */
.exit:
ldmia sp!, { r4-r11, pc }