From 08d3c0be378a8e929379b398e0feb2eb164bc36a Mon Sep 17 00:00:00 2001
From: Jens Arnold
Date: Mon, 10 May 2010 19:05:25 +0000
Subject: [PATCH] Revert r25929. Tests have shown that the assembler code is
 more than 50% faster than the C code both on ARM7TDMI (tested on PP5002 and
 PP5022) and on ARM1136JF-S (tested on Gigabeat S). If it is slower on ARM9*,
 it should be disabled for ARM9 only.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25937 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libffmpegFLAC/SOURCES   |   2 +
 apps/codecs/libffmpegFLAC/arm.S     | 271 ++++++++++++++++++++++++++++
 apps/codecs/libffmpegFLAC/arm.h     |   8 +
 apps/codecs/libffmpegFLAC/decoder.c |   6 +
 4 files changed, 287 insertions(+)
 create mode 100644 apps/codecs/libffmpegFLAC/arm.S
 create mode 100644 apps/codecs/libffmpegFLAC/arm.h

diff --git a/apps/codecs/libffmpegFLAC/SOURCES b/apps/codecs/libffmpegFLAC/SOURCES
index 1bd92e8be7..deed19bcec 100644
--- a/apps/codecs/libffmpegFLAC/SOURCES
+++ b/apps/codecs/libffmpegFLAC/SOURCES
@@ -3,4 +3,6 @@ decoder.c
 shndec.c
 #if defined(CPU_COLDFIRE)
 coldfire.S
+#elif defined(CPU_ARM)
+arm.S
 #endif
diff --git a/apps/codecs/libffmpegFLAC/arm.S b/apps/codecs/libffmpegFLAC/arm.S
new file mode 100644
index 0000000000..2a2746eefa
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/arm.S
@@ -0,0 +1,271 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Thom Johansen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/* Assembler optimised LPC filtering for FLAC decoding on ARM processors.
+   void lpc_decode_arm(int blocksize, int qlevel, int pred_order,
+                       int32_t* data, int* coeffs); a blocksize <= 0 is a no-op.
+   Orders 1 to 9 are handled by specially optimised unrolled loops, while
+   every order above this is handled by a slower default routine.
+ */
+#ifdef USE_IRAM
+    .section .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .global lpc_decode_arm
+lpc_decode_arm:
+    stmdb sp!, { r4-r11, lr } @ save 9 registers (36 bytes)
+    ldr r4, [sp, #36]         @ fifth argument (coeffs) is above the saved regs
+    /* r0 = blocksize (number of samples to produce), r1 = qlevel,
+       r2 = pred_order, r3 = data, r4 = coeffs
+     */
+
+    /* the history pointer always trails the data pointer by 'pred_order'
+       samples. since we have one loop for each order, we can hard code this
+       and free a register by not saving data pointer.
+     */
+    sub r3, r3, r2, lsl #2    @ r3 = history
+    cmp r0, #0                @ nothing to do for a zero or negative count
+    ble .exit                 @ signed test, like the C loop i < blocksize
+    cmp r2, #9                @ check if order is too high for unrolled loops
+    addls pc, pc, r2, lsl #2  @ jump to our unrolled decode loop if it exists
+@ jumptable:
+    b .default                @ order too high, go to default routine
+    b .exit                   @ zero order filter isn't possible, exit function
+    b .order1
+    b .order2
+    b .order3
+    b .order4
+    b .order5
+    b .order6
+    b .order7
+    b .order8
+
+@ last jump table entry coincides with target, so leave it out
+.order9:
+    ldmia r4, { r5-r12, r14 } @ fetch coefs
+.loop9:
+    ldr r4, [r3], #4          @ load first history sample
+    mul r2, r4, r14           @ multiply with last coef
+    ldr r4, [r3], #4          @ rinse and repeat while accumulating sum in r2
+    mla r2, r4, r12, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r11, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r10, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r9, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r8, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r7, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r6, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-8*4       @ save result and wrap history pointer back
+    subs r0, r0, #1           @ check if we're done
+    bne .loop9                @ nope, jump back
+    b .exit
+
+.order8:
+    ldmia r4, { r5-r12 }      @ fetch the 8 coefs
+.loop8:
+    @ we have more registers to spare here, so start block reading
+    ldmia r3!, { r4, r14 }
+    mul r2, r4, r12           @ first history sample times last coef
+    mla r2, r14, r11, r2
+    ldmia r3!, { r4, r14 }
+    mla r2, r4, r10, r2
+    mla r2, r14, r9, r2
+    ldmia r3!, { r4, r14 }
+    mla r2, r4, r8, r2
+    mla r2, r14, r7, r2
+    ldmia r3!, { r4, r14 }
+    mla r2, r4, r6, r2
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-7*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop8
+    b .exit
+
+.order7:
+    ldmia r4, { r5-r11 }      @ fetch the 7 coefs
+.loop7:
+    ldmia r3!, { r4, r12, r14 }
+    mul r2, r4, r11
+    mla r2, r12, r10, r2
+    mla r2, r14, r9, r2
+    ldmia r3!, { r4, r12, r14 }
+    mla r2, r4, r8, r2
+    mla r2, r12, r7, r2
+    mla r2, r14, r6, r2
+    ldr r4, [r3], #4
+    mla r2, r4, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-6*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop7
+    b .exit
+
+.order6:
+    ldmia r4, { r5-r10 }      @ fetch the 6 coefs
+.loop6:
+    ldmia r3!, { r4, r11-r12, r14 }
+    mul r2, r4, r10
+    mla r2, r11, r9, r2
+    mla r2, r12, r8, r2
+    mla r2, r14, r7, r2
+    ldmia r3!, { r4, r11 }
+    mla r2, r4, r6, r2
+    mla r2, r11, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-5*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop6
+    b .exit
+
+.order5:
+    ldmia r4, { r5-r9 }       @ fetch the 5 coefs
+.loop5:
+    ldmia r3!, { r4, r10-r12, r14 }
+    mul r2, r4, r9
+    mla r2, r10, r8, r2
+    mla r2, r11, r7, r2
+    mla r2, r12, r6, r2
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-4*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop5
+    b .exit
+
+.order4:
+    ldmia r4, { r5-r8 }       @ fetch the 4 coefs
+.loop4:
+    ldmia r3!, { r4, r11-r12, r14 }
+    mul r2, r4, r8
+    mla r2, r11, r7, r2
+    mla r2, r12, r6, r2
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-3*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop4
+    b .exit
+
+.order3:
+    ldmia r4, { r5-r7 }       @ fetch the 3 coefs
+.loop3:
+    ldmia r3!, { r4, r12, r14 }
+    mul r2, r4, r7
+    mla r2, r12, r6, r2
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-2*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop3
+    b .exit
+
+.order2:
+    ldmia r4, { r5-r6 }       @ fetch the 2 coefs
+.loop2:
+    ldmia r3!, { r4, r14 }
+    mul r2, r4, r6
+    mla r2, r14, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    add r2, r4, r2, asr r1
+    str r2, [r3], #-1*4       @ save result and wrap history pointer back
+    subs r0, r0, #1
+    bne .loop2
+    b .exit
+
+.order1:
+    ldr r5, [r4]              @ load the one coef we need
+    ldr r4, [r3], #4          @ load one history sample, r3 now points to residual
+.loop1:
+    mul r2, r4, r5            @ multiply coef by history sample
+    ldr r4, [r3]              @ load residual
+    add r4, r4, r2, asr r1    @ add result to residual
+    str r4, [r3], #4          @ place r3 at next residual, we already have
+    subs r0, r0, #1           @ the current sample in r4 for the next iteration
+    bne .loop1
+    b .exit
+
+.default:
+    /* we do the filtering in an unrolled by 4 loop as far as we can, and then
+       do the rest by jump table. */
+    add r5, r4, r2, lsl #2    @ need to start in the other end of coefs
+    mov r7, r2, lsr #2        @ r7 = coefs/4
+    mov r14, #0               @ init accumulator
+.dloop1:
+    ldmdb r5!, { r8-r11 }     @ next four coefs, walking backwards
+    ldmia r3!, { r6, r12 }    @ next two history samples
+    mla r14, r6, r11, r14
+    mla r14, r12, r10, r14
+    ldmia r3!, { r6, r12 }    @ and two more
+    mla r14, r6, r9, r14
+    mla r14, r12, r8, r14
+    subs r7, r7, #1
+    bne .dloop1
+
+    and r7, r2, #3            @ get remaining samples to be filtered
+    add pc, pc, r7, lsl #2    @ jump into accumulator chain
+@ jumptable:
+    b .dsave                  @ padding
+    b .dsave
+    b .oneleft
+    b .twoleft
+@ implicit .threeleft
+    ldr r12, [r5, #-4]!       @ step back one coef
+    ldr r8, [r3], #4          @ next history sample
+    mla r14, r12, r8, r14
+.twoleft:
+    ldr r12, [r5, #-4]!
+    ldr r8, [r3], #4
+    mla r14, r12, r8, r14
+.oneleft:
+    ldr r12, [r5, #-4]!
+    ldr r8, [r3], #4
+    mla r14, r12, r8, r14
+
+.dsave:
+    ldr r12, [r3]             @ load residual
+    add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
+    str r14, [r3], #4         @ store result
+    sub r3, r3, r2, lsl #2    @ and wrap history pointer back to next first pos
+    subs r0, r0, #1           @ are we done?
+    bne .default              @ no, prepare for next sample
+
+.exit:
+    ldmia sp!, { r4-r11, pc } @ restore registers and return
+
diff --git a/apps/codecs/libffmpegFLAC/arm.h b/apps/codecs/libffmpegFLAC/arm.h
new file mode 100644
index 0000000000..39080d7f75
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/arm.h
@@ -0,0 +1,8 @@
+#ifndef _FLAC_ARM_H
+#define _FLAC_ARM_H
+
+#include "bitstream.h"
+
+void lpc_decode_arm(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs);
+
+#endif
diff --git a/apps/codecs/libffmpegFLAC/decoder.c b/apps/codecs/libffmpegFLAC/decoder.c
index e5c4b426d5..ed175548f2 100644
--- a/apps/codecs/libffmpegFLAC/decoder.c
+++ b/apps/codecs/libffmpegFLAC/decoder.c
@@ -44,6 +44,8 @@
 
 #if defined(CPU_COLDFIRE)
 #include "coldfire.h"
+#elif defined(CPU_ARM)
+#include "arm.h"
 #endif
 
 #define FFMAX(a,b) ((a) > (b) ? (a) : (b))
@@ -262,6 +264,10 @@ static int decode_subframe_lpc(FLACContext *s, int32_t* decoded, int pred_order)
             (void)sum;
             lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order,
                             decoded + pred_order, coeffs);
+        #elif defined(CPU_ARM)
+            (void)sum;
+            lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order,
+                           decoded + pred_order, coeffs);
         #else
             for (i = pred_order; i < s->blocksize; i++)
             {