rockbox/apps/codecs/libffmpegFLAC/arm.S

/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2006 by Thom Johansen 
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

#include "config.h"

/* The following is an assembler optimised version of the LPC filtering
   routines needed for FLAC decoding. It is optimised for use with ARM 
   processors.
   All LPC filtering up to order 9 is done in specially optimised unrolled
   loops, while every order above this is handled by a slower default routine.
 */
#ifdef USE_IRAM
    .section .icode,"ax",%progbits
#else
    .text
#endif
    .global lpc_decode_arm
/* lpc_decode_arm: in-place LPC prediction filter over 32-bit words.
 *
 * Arguments as read by the code below:
 *   r0         = blocksize  - number of output samples to produce
 *   r1         = qlevel     - arithmetic right shift applied to the sum
 *   r2         = pred_order - number of coefficients / history samples
 *   r3         = data       - first word to produce; on entry it holds a
 *                             residual, and the pred_order words in front
 *                             of it are read as history
 *   stack arg  = coeffs     - pointer to pred_order coefficient words
 *
 * For each of the blocksize positions, oldest history sample paired with
 * the highest-index coefficient:
 *   data[i] = data[i] + ((coeffs[0]*data[i-1] + ...
 *                         + coeffs[pred_order-1]*data[i-pred_order])
 *                        asr qlevel)
 * The sum is built with mul/mla, i.e. it is kept in a single 32-bit register.
 *
 * r4-r11 and lr are pushed on entry. r0, r2, r3, r12 and r14 are overwritten
 * along the way; r12 is used as scratch without being saved. r1 is only read.
 *
 * NOTE(review): only blocksize == 0 is rejected up front; every loop counts
 * r0 down until it reaches zero, so a negative blocksize would not stop
 * after a sensible number of samples - confirm callers never pass one.
 */
lpc_decode_arm:
    stmdb sp!, { r4-r11, lr }
    /* nine registers (36 bytes) were just pushed, so the first stack
       argument now sits at sp + 36 */
    ldr r4, [sp, #36]
    /* r0 = blocksize, r1 = qlevel, r2 = pred_order
       r3 = data, r4 = coeffs
     */
     
    /* the data pointer always lags behind history pointer by 'pred_order'
       samples. since we have one loop for each order, we can hard code this
       and free a register by not saving data pointer. 
     */ 
    sub r3, r3, r2, lsl #2    @ r3 = history
    cmp r0, #0                @ no samples to process
    beq .exit
    /* Computed branch. An ARM instruction that reads pc sees its own address
       plus 8, so adding pred_order * 4 selects the table slot two
       instructions past the add. The slot directly after the add is
       therefore only reached when the add is not executed, i.e. when
       pred_order is above 9. pred_order == 9 selects the slot just past the
       last branch, which is .order9 itself. Do not insert or remove
       instructions between the add and .order9. */
    cmp r2, #9                @ check if order is too high for unrolled loops
    addls pc, pc, r2, lsl #2  @ jump to our unrolled decode loop if it exists
@ jumptable:
    b .default                @ order too high, go to default routine
    b .exit                   @ zero order filter isn't possible, exit function
    b .order1
    b .order2
    b .order3
    b .order4
    b .order5
    b .order6
    b .order7
    b .order8

/* Unrolled loops for orders 9 down to 2. Common register use:
     r5 upward     = coefficients, loaded once, r5 holding coeffs[0]
     r3            = history pointer, oldest sample first
     r2            = prediction sum (pred_order is not needed any more)
     r4 and whichever of r10-r12, r14 hold no coefficient = sample scratch
   Once the sum is complete r3 points at the residual, which is overwritten
   with the decoded sample. The post-indexed store then steps r3 back by
   (order - 1) words, leaving it on the oldest sample needed for the next
   output. */
@ last jump table entry coincides with target, so leave it out
.order9:
    ldmia r4, { r5-r12, r14 } @ fetch coefs
.loop9:
    ldr r4, [r3], #4          @ load first history sample
    mul r2, r4, r14           @ multiply with last coef
    ldr r4, [r3], #4          @ rinse and repeat while accumulating sum in r2
    mla r2, r4, r12, r2
    ldr r4, [r3], #4
    mla r2, r4, r11, r2
    ldr r4, [r3], #4
    mla r2, r4, r10, r2
    ldr r4, [r3], #4
    mla r2, r4, r9, r2
    ldr r4, [r3], #4
    mla r2, r4, r8, r2
    ldr r4, [r3], #4
    mla r2, r4, r7, r2
    ldr r4, [r3], #4
    mla r2, r4, r6, r2
    ldr r4, [r3], #4
    mla r2, r4, r5, r2
    ldr r4, [r3]              @ r4 = residual
    add r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual 
    str r2, [r3], #-8*4       @ save result and wrap history pointer back
    subs r0, r0, #1           @ check if we're done
    bne .loop9                @ nope, jump back
    b .exit
    
/* order 8: coefficients in r5-r12, samples fetched two at a time */
.order8:
    ldmia r4, { r5-r12 }
.loop8:
    @ we have more registers to spare here, so start block reading
    ldmia r3!, { r4, r14 }
    mul r2, r4, r12
    mla r2, r14, r11, r2
    ldmia r3!, { r4, r14 }
    mla r2, r4, r10, r2
    mla r2, r14, r9, r2
    ldmia r3!, { r4, r14 }
    mla r2, r4, r8, r2
    mla r2, r14, r7, r2
    ldmia r3!, { r4, r14 }
    mla r2, r4, r6, r2
    mla r2, r14, r5, r2
    ldr r4, [r3]
    add r2, r4, r2, asr r1
    str r2, [r3], #-7*4
    subs r0, r0, #1
    bne .loop8
    b .exit

/* order 7: coefficients in r5-r11, samples fetched as 3 + 3 + 1 */
.order7:
    ldmia r4, { r5-r11 }
.loop7:
    ldmia r3!, { r4, r12, r14 }
    mul r2, r4, r11
    mla r2, r12, r10, r2
    mla r2, r14, r9, r2
    ldmia r3!, { r4, r12, r14 }
    mla r2, r4, r8, r2
    mla r2, r12, r7, r2
    mla r2, r14, r6, r2
    ldr r4, [r3], #4
    mla r2, r4, r5, r2
    ldr r4, [r3]
    add r2, r4, r2, asr r1
    str r2, [r3], #-6*4
    subs r0, r0, #1
    bne .loop7
    b .exit

/* order 6: coefficients in r5-r10, samples fetched as 4 + 2 */
.order6:
    ldmia r4, { r5-r10 }
.loop6:
    ldmia r3!, { r4, r11-r12, r14 }
    mul r2, r4, r10
    mla r2, r11, r9, r2
    mla r2, r12, r8, r2
    mla r2, r14, r7, r2
    ldmia r3!, { r4, r11 }
    mla r2, r4, r6, r2
    mla r2, r11, r5, r2
    ldr r4, [r3]
    add r2, r4, r2, asr r1
    str r2, [r3], #-5*4
    subs r0, r0, #1
    bne .loop6
    b .exit

/* order 5: coefficients in r5-r9, all five samples fetched in one ldmia */
.order5:
    ldmia r4, { r5-r9 }
.loop5:
    ldmia r3!, { r4, r10-r12, r14 }
    mul r2, r4, r9
    mla r2, r10, r8, r2
    mla r2, r11, r7, r2
    mla r2, r12, r6, r2
    mla r2, r14, r5, r2
    ldr r4, [r3]
    add r2, r4, r2, asr r1
    str r2, [r3], #-4*4
    subs r0, r0, #1
    bne .loop5
    b .exit

/* order 4: coefficients in r5-r8 */
.order4:
    ldmia r4, { r5-r8 }
.loop4:
    ldmia r3!, { r4, r11-r12, r14 }
    mul r2, r4, r8
    mla r2, r11, r7, r2
    mla r2, r12, r6, r2
    mla r2, r14, r5, r2
    ldr r4, [r3]
    add r2, r4, r2, asr r1
    str r2, [r3], #-3*4
    subs r0, r0, #1
    bne .loop4
    b .exit

/* order 3: coefficients in r5-r7 */
.order3:
    ldmia r4, { r5-r7 }
.loop3:
    ldmia r3!, { r4, r12, r14 }
    mul r2, r4, r7
    mla r2, r12, r6, r2
    mla r2, r14, r5, r2
    ldr r4, [r3]
    add r2, r4, r2, asr r1
    str r2, [r3], #-2*4
    subs r0, r0, #1
    bne .loop3
    b .exit

/* order 2: coefficients in r5-r6 */
.order2:
    ldmia r4, { r5-r6 }
.loop2:
    ldmia r3!, { r4, r14 }
    mul r2, r4, r6
    mla r2, r14, r5, r2
    ldr r4, [r3]
    add r2, r4, r2, asr r1
    str r2, [r3], #-1*4
    subs r0, r0, #1
    bne .loop2
    b .exit

/* Order 1 differs from the loops above: the sample just produced stays in
   r4 and is the only history the next iteration needs, so the loop does one
   load (the residual) per output and r3 only ever moves forward. */
.order1:
    ldr r5, [r4]            @ load the one coef we need
    ldr r4, [r3], #4        @ load one history sample, r3 now points to residual
.loop1:
    mul r2, r4, r5          @ multiply coef by history sample
    ldr r4, [r3]            @ load residual
    add r4, r4, r2, asr r1  @ add result to residual
    str r4, [r3], #4        @ place r3 at next residual, we already have 
    subs r0, r0, #1         @ the current sample in r4 for the next iteration
    bne .loop1
    b .exit

/* Generic path, reached only when pred_order is above 9. The whole section
   from .default onwards runs once per output sample:
     r4          = coeffs, left untouched so r5 can be rebuilt every pass
     r5          = coefficient pointer, walking down from coeffs + pred_order
     r7          = groups of four still to do, later pred_order & 3
     r14         = prediction sum
     r6, r8-r12  = coefficient and sample scratch
     r2          = pred_order, still needed to rewind r3 in .dsave
   pred_order is at least 10 here, so r7 starts at 2 or more and the
   decrement-then-test loop at .dloop1 never starts from zero. */
.default:
    /* we do the filtering in an unrolled by 4 loop as far as we can, and then
       do the rest by jump table. */
    add r5, r4, r2, lsl #2   @ need to start in the other end of coefs
    mov r7, r2, lsr #2       @ r7 = coefs/4
    mov r14, #0              @ init accumulator
.dloop1:
    ldmdb r5!, { r8-r11 }
    ldmia r3!, { r6, r12 }
    mla r14, r6, r11, r14
    mla r14, r12, r10, r14
    ldmia r3!, { r6, r12 }
    mla r14, r6, r9, r14
    mla r14, r12, r8, r14
    subs r7, r7, #1
    bne .dloop1

    /* Same pc + 8 dispatch as at function entry: the add can never select
       the first branch below, which only fills the skipped slot. A remainder
       of 3 lands just past the table, on code that falls through .twoleft
       and .oneleft. Do not insert or remove instructions between the add and
       the first ldr after the table. */
    and r7, r2, #3            @ get remaining samples to be filtered
    add pc, pc, r7, lsl #2    @ jump into accumulator chain
@ jumptable:
    b .dsave @ padding
    b .dsave
    b .oneleft
    b .twoleft
@ implicit .threeleft 
    ldr r12, [r5, #-4]!
    ldr r8, [r3], #4
    mla r14, r12, r8, r14  
.twoleft:
    ldr r12, [r5, #-4]!
    ldr r8, [r3], #4
    mla r14, r12, r8, r14  
.oneleft:
    ldr r12, [r5, #-4]!
    ldr r8, [r3], #4
    mla r14, r12, r8, r14  

.dsave:
    ldr r12, [r3]             @ load residual
    add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
    str r14, [r3], #4         @ store result
    sub r3, r3, r2, lsl #2    @ and wrap history pointer back to next first pos
    subs r0, r0, #1           @ are we done?
    bne .default              @ no, prepare for next sample

    /* Common exit for every path. ldmpc is a macro defined outside this file
       (NOTE(review): presumably made available through config.h - confirm).
       It is handed the register list that was pushed on entry, minus lr. */
.exit:
    ldmpc regs=r4-r11
Revert r25929. Test have shown that the assembler code is more than 50% faster than the C code both on ARM7TDMI (tested on PP5002 and PP5022) and on ARM1136JF-S (tested on Gigabeat S). If it is slower on ARM9*, it should be disabled for ARM9 only. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25937 a1c6a512-1295-4272-9138-f99709370657 2010-05-10 19:05:25 +00:00			`/***************************************************************************`
			`* __________ __ ___.`
			`* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___`
			`* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /`
			`* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <`
			`* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \`
			`* \/ \/ \/ \/ \/`
			`* $Id$`
			`*`
			`* Copyright (C) 2006 by Thom Johansen`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU General Public License`
			`* as published by the Free Software Foundation; either version 2`
			`* of the License, or (at your option) any later version.`
			`*`
			`* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY`
			`* KIND, either express or implied.`
			`*`
			`****************************************************************************/`

			`#include "config.h"`

			`/* The following is an assembler optimised version of the LPC filtering`
			`routines needed for FLAC decoding. It is optimised for use with ARM`
			`processors.`
			`All LPC filtering up to order 9 is done in specially optimised unrolled`
			`loops, while every order above this is handled by a slower default routine.`
			`*/`
			`#ifdef USE_IRAM`
			`.section .icode,"ax",%progbits`
			`#else`
			`.text`
			`#endif`
			`.global lpc_decode_arm`
			`lpc_decode_arm:`
			`stmdb sp!, { r4-r11, lr }`
			`ldr r4, [sp, #36]`
			`/* r0 = blocksize, r1 = qlevel, r2 = pred_order`
			`r3 = data, r4 = coeffs`
			`*/`

			`/* the data pointer always lags behind history pointer by 'pred_order'`
			`samples. since we have one loop for each order, we can hard code this`
			`and free a register by not saving data pointer.`
			`*/`
			`sub r3, r3, r2, lsl #2 @ r3 = history`
			`cmp r0, #0 @ no samples to process`
			`beq .exit`
			`cmp r2, #9 @ check if order is too high for unrolled loops`
			`addls pc, pc, r2, lsl #2 @ jump to our unrolled decode loop if it exists`
			`@ jumptable:`
			`b .default @ order too high, go to default routine`
			`b .exit @ zero order filter isn't possible, exit function`
			`b .order1`
			`b .order2`
			`b .order3`
			`b .order4`
			`b .order5`
			`b .order6`
			`b .order7`
			`b .order8`

			`@ last jump table entry coincides with target, so leave it out`
			`.order9:`
			`ldmia r4, { r5-r12, r14 } @ fetch coefs`
			`.loop9:`
			`ldr r4, [r3], #4 @ load first history sample`
			`mul r2, r4, r14 @ multiply with last coef`
			`ldr r4, [r3], #4 @ rinse and repeat while accumulating sum in r2`
			`mla r2, r4, r12, r2`
			`ldr r4, [r3], #4`
			`mla r2, r4, r11, r2`
			`ldr r4, [r3], #4`
			`mla r2, r4, r10, r2`
			`ldr r4, [r3], #4`
			`mla r2, r4, r9, r2`
			`ldr r4, [r3], #4`
			`mla r2, r4, r8, r2`
			`ldr r4, [r3], #4`
			`mla r2, r4, r7, r2`
			`ldr r4, [r3], #4`
			`mla r2, r4, r6, r2`
			`ldr r4, [r3], #4`
			`mla r2, r4, r5, r2`
			`ldr r4, [r3] @ r4 = residual`
			`add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual`
			`str r2, [r3], #-8*4 @ save result and wrap history pointer back`
			`subs r0, r0, #1 @ check if we're done`
			`bne .loop9 @ nope, jump back`
			`b .exit`

			`.order8:`
			`ldmia r4, { r5-r12 }`
			`.loop8:`
			`@ we have more registers to spare here, so start block reading`
			`ldmia r3!, { r4, r14 }`
			`mul r2, r4, r12`
			`mla r2, r14, r11, r2`
			`ldmia r3!, { r4, r14 }`
			`mla r2, r4, r10, r2`
			`mla r2, r14, r9, r2`
			`ldmia r3!, { r4, r14 }`
			`mla r2, r4, r8, r2`
			`mla r2, r14, r7, r2`
			`ldmia r3!, { r4, r14 }`
			`mla r2, r4, r6, r2`
			`mla r2, r14, r5, r2`
			`ldr r4, [r3]`
			`add r2, r4, r2, asr r1`
			`str r2, [r3], #-7*4`
			`subs r0, r0, #1`
			`bne .loop8`
			`b .exit`

			`.order7:`
			`ldmia r4, { r5-r11 }`
			`.loop7:`
			`ldmia r3!, { r4, r12, r14 }`
			`mul r2, r4, r11`
			`mla r2, r12, r10, r2`
			`mla r2, r14, r9, r2`
			`ldmia r3!, { r4, r12, r14 }`
			`mla r2, r4, r8, r2`
			`mla r2, r12, r7, r2`
			`mla r2, r14, r6, r2`
			`ldr r4, [r3], #4`
			`mla r2, r4, r5, r2`
			`ldr r4, [r3]`
			`add r2, r4, r2, asr r1`
			`str r2, [r3], #-6*4`
			`subs r0, r0, #1`
			`bne .loop7`
			`b .exit`

			`.order6:`
			`ldmia r4, { r5-r10 }`
			`.loop6:`
			`ldmia r3!, { r4, r11-r12, r14 }`
			`mul r2, r4, r10`
			`mla r2, r11, r9, r2`
			`mla r2, r12, r8, r2`
			`mla r2, r14, r7, r2`
			`ldmia r3!, { r4, r11 }`
			`mla r2, r4, r6, r2`
			`mla r2, r11, r5, r2`
			`ldr r4, [r3]`
			`add r2, r4, r2, asr r1`
			`str r2, [r3], #-5*4`
			`subs r0, r0, #1`
			`bne .loop6`
			`b .exit`

			`.order5:`
			`ldmia r4, { r5-r9 }`
			`.loop5:`
			`ldmia r3!, { r4, r10-r12, r14 }`
			`mul r2, r4, r9`
			`mla r2, r10, r8, r2`
			`mla r2, r11, r7, r2`
			`mla r2, r12, r6, r2`
			`mla r2, r14, r5, r2`
			`ldr r4, [r3]`
			`add r2, r4, r2, asr r1`
			`str r2, [r3], #-4*4`
			`subs r0, r0, #1`
			`bne .loop5`
			`b .exit`

			`.order4:`
			`ldmia r4, { r5-r8 }`
			`.loop4:`
			`ldmia r3!, { r4, r11-r12, r14 }`
			`mul r2, r4, r8`
			`mla r2, r11, r7, r2`
			`mla r2, r12, r6, r2`
			`mla r2, r14, r5, r2`
			`ldr r4, [r3]`
			`add r2, r4, r2, asr r1`
			`str r2, [r3], #-3*4`
			`subs r0, r0, #1`
			`bne .loop4`
			`b .exit`

			`.order3:`
			`ldmia r4, { r5-r7 }`
			`.loop3:`
			`ldmia r3!, { r4, r12, r14 }`
			`mul r2, r4, r7`
			`mla r2, r12, r6, r2`
			`mla r2, r14, r5, r2`
			`ldr r4, [r3]`
			`add r2, r4, r2, asr r1`
			`str r2, [r3], #-2*4`
			`subs r0, r0, #1`
			`bne .loop3`
			`b .exit`

			`.order2:`
			`ldmia r4, { r5-r6 }`
			`.loop2:`
			`ldmia r3!, { r4, r14 }`
			`mul r2, r4, r6`
			`mla r2, r14, r5, r2`
			`ldr r4, [r3]`
			`add r2, r4, r2, asr r1`
			`str r2, [r3], #-1*4`
			`subs r0, r0, #1`
			`bne .loop2`
			`b .exit`

			`.order1:`
			`ldr r5, [r4] @ load the one coef we need`
			`ldr r4, [r3], #4 @ load one history sample, r3 now points to residual`
			`.loop1:`
			`mul r2, r4, r5 @ multiply coef by history sample`
			`ldr r4, [r3] @ load residual`
			`add r4, r4, r2, asr r1 @ add result to residual`
			`str r4, [r3], #4 @ place r3 at next residual, we already have`
			`subs r0, r0, #1 @ the current sample in r4 for the next iteration`
			`bne .loop1`
			`b .exit`

			`.default:`
			`/* we do the filtering in an unrolled by 4 loop as far as we can, and then`
			`do the rest by jump table. */`
			`add r5, r4, r2, lsl #2 @ need to start in the other end of coefs`
			`mov r7, r2, lsr #2 @ r7 = coefs/4`
			`mov r14, #0 @ init accumulator`
			`.dloop1:`
			`ldmdb r5!, { r8-r11 }`
			`ldmia r3!, { r6, r12 }`
			`mla r14, r6, r11, r14`
			`mla r14, r12, r10, r14`
			`ldmia r3!, { r6, r12 }`
			`mla r14, r6, r9, r14`
			`mla r14, r12, r8, r14`
			`subs r7, r7, #1`
			`bne .dloop1`

			`and r7, r2, #3 @ get remaining samples to be filtered`
			`add pc, pc, r7, lsl #2 @ jump into accumulator chain`
			`@ jumptable:`
			`b .dsave @ padding`
			`b .dsave`
			`b .oneleft`
			`b .twoleft`
			`@ implicit .threeleft`
			`ldr r12, [r5, #-4]!`
			`ldr r8, [r3], #4`
			`mla r14, r12, r8, r14`
			`.twoleft:`
			`ldr r12, [r5, #-4]!`
			`ldr r8, [r3], #4`
			`mla r14, r12, r8, r14`
			`.oneleft:`
			`ldr r12, [r5, #-4]!`
			`ldr r8, [r3], #4`
			`mla r14, r12, r8, r14`

			`.dsave:`
			`ldr r12, [r3] @ load residual`
			`add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual`
			`str r14, [r3], #4 @ store result`
			`sub r3, r3, r2, lsl #2 @ and wrap history pointer back to next first pos`
			`subs r0, r0, #1 @ are we done?`
			`bne .default @ no, prepare for next sample`

			`.exit:`
FS#11335 by me: make ARM assembly functions thumb-friendly We can't pop into pc on ARMv4t when using thumb: the T bit won't be modified if we are returning to a thumb function Code running on ARMv4t should use the new ldrpc / ldmpc macros instead of ldr pc, [sp], #4 and ldm(cond) sp!, {regs, pc} No modification on pure ARM builds and ARMv5+ Note: USE_THUMB is currently never defined, no targets can currently be built with -mthumb, see FS#6734 git-svn-id: svn://svn.rockbox.org/rockbox/trunk@26756 a1c6a512-1295-4272-9138-f99709370657 2010-06-11 04:41:36 +00:00			`ldmpc regs=r4-r11`
Revert r25929. Test have shown that the assembler code is more than 50% faster than the C code both on ARM7TDMI (tested on PP5002 and PP5022) and on ARM1136JF-S (tested on Gigabeat S). If it is slower on ARM9*, it should be disabled for ARM9 only. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25937 a1c6a512-1295-4272-9138-f99709370657 2010-05-10 19:05:25 +00:00