Revert r25929. Tests have shown that the assembler code is more than 50% faster than the C code both on ARM7TDMI (tested on PP5002 and PP5022) and on ARM1136JF-S (tested on Gigabeat S). If it is slower on ARM9*, it should be disabled for ARM9 only.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25937 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
ed704825d2
commit
08d3c0be37
4 changed files with 287 additions and 0 deletions
|
@ -3,4 +3,6 @@ decoder.c
|
||||||
shndec.c
|
shndec.c
|
||||||
#if defined(CPU_COLDFIRE)
|
#if defined(CPU_COLDFIRE)
|
||||||
coldfire.S
|
coldfire.S
|
||||||
|
#elif defined(CPU_ARM)
|
||||||
|
arm.S
|
||||||
#endif
|
#endif
|
||||||
|
|
271
apps/codecs/libffmpegFLAC/arm.S
Normal file
271
apps/codecs/libffmpegFLAC/arm.S
Normal file
|
@ -0,0 +1,271 @@
|
||||||
|
/***************************************************************************
|
||||||
|
* __________ __ ___.
|
||||||
|
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
||||||
|
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
||||||
|
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
||||||
|
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
||||||
|
* \/ \/ \/ \/ \/
|
||||||
|
* $Id$
|
||||||
|
*
|
||||||
|
* Copyright (C) 2006 by Thom Johansen
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License
|
||||||
|
* as published by the Free Software Foundation; either version 2
|
||||||
|
* of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
||||||
|
* KIND, either express or implied.
|
||||||
|
*
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
|
/* The following is an assembler optimised version of the LPC filtering
|
||||||
|
routines needed for FLAC decoding. It is optimised for use with ARM
|
||||||
|
processors.
|
||||||
|
All LPC filtering up to order 9 is done in specially optimised unrolled
|
||||||
|
loops, while every order above this is handled by a slower default routine.
|
||||||
|
*/
|
||||||
|
#ifdef USE_IRAM
|
||||||
|
.section .icode,"ax",%progbits
|
||||||
|
#else
|
||||||
|
.text
|
||||||
|
#endif
|
||||||
|
/* void lpc_decode_arm(int blocksize, int qlevel, int pred_order,
                       int32_t* data, int* coeffs);

   In-place LPC synthesis. Each of the 'blocksize' words starting at 'data'
   holds a residual on entry and is replaced by

       residual + ((sum over j of coeffs[j] * data[i - 1 - j]) >> qlevel)

   where the sum runs over j = 0 .. pred_order-1, is accumulated in 32 bits
   and is shifted arithmetically. The 'pred_order' words directly in front
   of 'data' are read as the initial history.

   In:     r0 = blocksize (number of words to produce, nothing is done if
                it is <= 0)
           r1 = qlevel
           r2 = pred_order
           r3 = data
           [sp] = coeffs
   Out:    nothing, results are written through 'data'
   Saved:  r4-r11 (pushed together with lr and restored on exit)
   Clobb:  r0-r3, r12, flags
*/
    .global lpc_decode_arm
    .type   lpc_decode_arm, %function
lpc_decode_arm:
    stmdb   sp!, { r4-r11, lr }
    ldr     r4, [sp, #36]           @ 5th argument, sits above the 9 words just pushed
    /* r0 = blocksize, r1 = qlevel, r2 = pred_order
       r3 = data, r4 = coeffs
    */

    /* the data pointer always lags behind history pointer by 'pred_order'
       samples. since we have one loop for each order, we can hard code this
       and free a register by not saving data pointer.
    */
    sub     r3, r3, r2, lsl #2      @ r3 = history
    cmp     r0, #0                  @ no samples to process. the count is a signed
    ble     .exit                   @ int, so a negative one must not enter the
                                    @ count-down loops below either
    cmp     r2, #9                  @ check if order is too high for unrolled loops
    addls   pc, pc, r2, lsl #2      @ jump to our unrolled decode loop if it exists
@ jumptable: pc reads as the address of the addls plus 8, so order 0 selects the
@ second entry and the first entry is only reached by falling through
    b       .default                @ order too high, go to default routine
    b       .exit                   @ zero order filter isn't possible, exit function
    b       .order1
    b       .order2
    b       .order3
    b       .order4
    b       .order5
    b       .order6
    b       .order7
    b       .order8

@ last jump table entry coincides with target, so leave it out
.order9:
    ldmia   r4, { r5-r12, r14 }     @ fetch coefs, r5 = coeffs[0] ... r14 = coeffs[8]
.loop9:
    ldr     r4, [r3], #4            @ load first (oldest) history sample
    mul     r2, r4, r14             @ multiply with last coef
    ldr     r4, [r3], #4            @ rinse and repeat while accumulating sum in r2
    mla     r2, r4, r12, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r11, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r10, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r9, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r8, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r7, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r6, r2
    ldr     r4, [r3], #4
    mla     r2, r4, r5, r2
    ldr     r4, [r3]                @ r4 = residual
    add     r2, r4, r2, asr r1      @ shift sum by qlevel bits and add residual
    str     r2, [r3], #-8*4         @ save result and wrap history pointer back
                                    @ (one sample further than where we started)
    subs    r0, r0, #1              @ check if we're done
    bne     .loop9                  @ nope, jump back
    b       .exit

.order8:
    ldmia   r4, { r5-r12 }          @ r5 = coeffs[0] ... r12 = coeffs[7]
.loop8:
    @ we have more registers to spare here, so start block reading
    ldmia   r3!, { r4, r14 }
    mul     r2, r4, r12
    mla     r2, r14, r11, r2
    ldmia   r3!, { r4, r14 }
    mla     r2, r4, r10, r2
    mla     r2, r14, r9, r2
    ldmia   r3!, { r4, r14 }
    mla     r2, r4, r8, r2
    mla     r2, r14, r7, r2
    ldmia   r3!, { r4, r14 }
    mla     r2, r4, r6, r2
    mla     r2, r14, r5, r2
    ldr     r4, [r3]                @ r4 = residual
    add     r2, r4, r2, asr r1
    str     r2, [r3], #-7*4         @ store, then step back to the next history start
    subs    r0, r0, #1
    bne     .loop8
    b       .exit

.order7:
    ldmia   r4, { r5-r11 }          @ r5 = coeffs[0] ... r11 = coeffs[6]
.loop7:
    ldmia   r3!, { r4, r12, r14 }
    mul     r2, r4, r11
    mla     r2, r12, r10, r2
    mla     r2, r14, r9, r2
    ldmia   r3!, { r4, r12, r14 }
    mla     r2, r4, r8, r2
    mla     r2, r12, r7, r2
    mla     r2, r14, r6, r2
    ldr     r4, [r3], #4            @ seventh (newest) history sample
    mla     r2, r4, r5, r2
    ldr     r4, [r3]                @ r4 = residual
    add     r2, r4, r2, asr r1
    str     r2, [r3], #-6*4
    subs    r0, r0, #1
    bne     .loop7
    b       .exit

.order6:
    ldmia   r4, { r5-r10 }          @ r5 = coeffs[0] ... r10 = coeffs[5]
.loop6:
    ldmia   r3!, { r4, r11-r12, r14 }
    mul     r2, r4, r10
    mla     r2, r11, r9, r2
    mla     r2, r12, r8, r2
    mla     r2, r14, r7, r2
    ldmia   r3!, { r4, r11 }
    mla     r2, r4, r6, r2
    mla     r2, r11, r5, r2
    ldr     r4, [r3]                @ r4 = residual
    add     r2, r4, r2, asr r1
    str     r2, [r3], #-5*4
    subs    r0, r0, #1
    bne     .loop6
    b       .exit

.order5:
    ldmia   r4, { r5-r9 }           @ r5 = coeffs[0] ... r9 = coeffs[4]
.loop5:
    ldmia   r3!, { r4, r10-r12, r14 }
    mul     r2, r4, r9
    mla     r2, r10, r8, r2
    mla     r2, r11, r7, r2
    mla     r2, r12, r6, r2
    mla     r2, r14, r5, r2
    ldr     r4, [r3]                @ r4 = residual
    add     r2, r4, r2, asr r1
    str     r2, [r3], #-4*4
    subs    r0, r0, #1
    bne     .loop5
    b       .exit

.order4:
    ldmia   r4, { r5-r8 }           @ r5 = coeffs[0] ... r8 = coeffs[3]
.loop4:
    ldmia   r3!, { r4, r11-r12, r14 }
    mul     r2, r4, r8
    mla     r2, r11, r7, r2
    mla     r2, r12, r6, r2
    mla     r2, r14, r5, r2
    ldr     r4, [r3]                @ r4 = residual
    add     r2, r4, r2, asr r1
    str     r2, [r3], #-3*4
    subs    r0, r0, #1
    bne     .loop4
    b       .exit

.order3:
    ldmia   r4, { r5-r7 }           @ r5 = coeffs[0] ... r7 = coeffs[2]
.loop3:
    ldmia   r3!, { r4, r12, r14 }
    mul     r2, r4, r7
    mla     r2, r12, r6, r2
    mla     r2, r14, r5, r2
    ldr     r4, [r3]                @ r4 = residual
    add     r2, r4, r2, asr r1
    str     r2, [r3], #-2*4
    subs    r0, r0, #1
    bne     .loop3
    b       .exit

.order2:
    ldmia   r4, { r5-r6 }           @ r5 = coeffs[0], r6 = coeffs[1]
.loop2:
    ldmia   r3!, { r4, r14 }
    mul     r2, r4, r6
    mla     r2, r14, r5, r2
    ldr     r4, [r3]                @ r4 = residual
    add     r2, r4, r2, asr r1
    str     r2, [r3], #-1*4
    subs    r0, r0, #1
    bne     .loop2
    b       .exit

.order1:
    ldr     r5, [r4]                @ load the one coef we need
    ldr     r4, [r3], #4            @ load one history sample, r3 now points to residual
.loop1:
    mul     r2, r4, r5              @ multiply coef by history sample
    ldr     r4, [r3]                @ load residual
    add     r4, r4, r2, asr r1      @ add result to residual
    str     r4, [r3], #4            @ place r3 at next residual. the sample just
                                    @ stored stays in r4 and is the history for
                                    @ the next iteration, so it is not reloaded
    subs    r0, r0, #1
    bne     .loop1
    b       .exit

.default:
    /* we do the filtering in an unrolled by 4 loop as far as we can, and then
       do the rest by jump table. this whole section runs once per output
       sample. it is only reached for pred_order > 9, so r7 below is never
       zero when the count-down loop is entered. */
    add     r5, r4, r2, lsl #2      @ need to start in the other end of coefs
    mov     r7, r2, lsr #2          @ r7 = coefs/4
    mov     r14, #0                 @ init accumulator
.dloop1:
    ldmdb   r5!, { r8-r11 }         @ next four coefs, walking towards coeffs[0]
    ldmia   r3!, { r6, r12 }
    mla     r14, r6, r11, r14
    mla     r14, r12, r10, r14
    ldmia   r3!, { r6, r12 }
    mla     r14, r6, r9, r14
    mla     r14, r12, r8, r14
    subs    r7, r7, #1
    bne     .dloop1

    and     r7, r2, #3              @ get remaining samples to be filtered
    add     pc, pc, r7, lsl #2      @ jump into accumulator chain
@ jumptable: pc reads as the address of the add plus 8, hence the padding entry
    b       .dsave                  @ padding
    b       .dsave
    b       .oneleft
    b       .twoleft
@ implicit .threeleft
    ldr     r12, [r5, #-4]!
    ldr     r8, [r3], #4
    mla     r14, r12, r8, r14
.twoleft:
    ldr     r12, [r5, #-4]!
    ldr     r8, [r3], #4
    mla     r14, r12, r8, r14
.oneleft:
    ldr     r12, [r5, #-4]!
    ldr     r8, [r3], #4
    mla     r14, r12, r8, r14

.dsave:
    ldr     r12, [r3]               @ load residual
    add     r14, r12, r14, asr r1   @ shift sum by qlevel bits and add residual
    str     r14, [r3], #4           @ store result
    sub     r3, r3, r2, lsl #2      @ and wrap history pointer back to next first pos
    subs    r0, r0, #1              @ are we done?
    bne     .default                @ no, prepare for next sample

.exit:
    ldmia   sp!, { r4-r11, pc }     @ restore saved registers and return
    .size   lpc_decode_arm, .-lpc_decode_arm
|
||||||
|
|
8
apps/codecs/libffmpegFLAC/arm.h
Normal file
8
apps/codecs/libffmpegFLAC/arm.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef _FLAC_ARM_H
|
||||||
|
#define _FLAC_ARM_H
|
||||||
|
|
||||||
|
#include "bitstream.h"
|
||||||
|
|
||||||
|
/* ARM assembler LPC filter, implemented in arm.S.
 *
 * Works in place on the buffer at 'data': each of the 'blocksize' words
 * starting there is read as a residual and overwritten with
 * residual + (prediction >> qlevel), where the prediction is the sum of
 * coeffs[j] * data[i - 1 - j] for j = 0 .. pred_order-1.
 *
 * blocksize   number of words to produce
 * qlevel      right shift applied to the prediction sum
 * pred_order  number of coefficients; the same number of words directly
 *             before 'data' is read as history, so they must be valid
 * data        points at the first residual, not at the start of the history
 * coeffs      array of pred_order coefficients
 */
void lpc_decode_arm(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs);
|
||||||
|
|
||||||
|
#endif
|
|
@ -44,6 +44,8 @@
|
||||||
|
|
||||||
#if defined(CPU_COLDFIRE)
|
#if defined(CPU_COLDFIRE)
|
||||||
#include "coldfire.h"
|
#include "coldfire.h"
|
||||||
|
#elif defined(CPU_ARM)
|
||||||
|
#include "arm.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
|
#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
|
||||||
|
@ -262,6 +264,10 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order)
|
||||||
(void)sum;
|
(void)sum;
|
||||||
lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order,
|
lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order,
|
||||||
decoded + pred_order, coeffs);
|
decoded + pred_order, coeffs);
|
||||||
|
#elif defined(CPU_ARM)
|
||||||
|
(void)sum;
|
||||||
|
lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order,
|
||||||
|
decoded + pred_order, coeffs);
|
||||||
#else
|
#else
|
||||||
for (i = pred_order; i < s->blocksize; i++)
|
for (i = pred_order; i < s->blocksize; i++)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in a new issue