rockbox/apps/codecs/lib/mdct_arm.S
Andrew Mahone f35db90efa Reorder some operands to increase frequency of multiply early termination on TDMI targets, reorder some operations to try to reduce stalls.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21834 a1c6a512-1295-4272-9138-f99709370657
2009-07-13 04:50:02 +00:00

429 lines
12 KiB
ArmAsm

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2007 by Tomasz Malesinski
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
/* Codecs should not normally do this, but we need to check a macro, and
* codecs.h would confuse the assembler. */
#define cPI3_8 (0x30fbc54d)
#define cPI2_8 (0x5a82799a)
#define cPI1_8 (0x7641af3d)
#ifdef USE_IRAM
.section .icode,"ax",%progbits
#else
.text
#endif
.align
.global mdct_butterfly_32
.global mdct_butterfly_generic_loop
mdct_butterfly_8:
add r9, r5, r1 @ x4 + x0
sub r5, r5, r1 @ x4 - x0
add r7, r6, r2 @ x5 + x1
sub r6, r6, r2 @ x5 - x1
add r8, r10, r3 @ x6 + x2
sub r10, r10, r3 @ x6 - x2
add r12, r11, r4 @ x7 + x3
sub r11, r11, r4 @ x7 - x3
add r1, r10, r6 @ y0 = (x6 - x2) + (x5 - x1)
sub r2, r11, r5 @ y1 = (x7 - x3) - (x4 - x0)
sub r3, r10, r6 @ y2 = (x6 - x2) - (x5 - x1)
add r4, r11, r5 @ y3 = (x7 - x3) + (x4 - x0)
sub r5, r8, r9 @ y4 = (x6 + x2) - (x4 + x0)
sub r6, r12, r7 @ y5 = (x7 + x3) - (x5 + x1)
add r10, r8, r9 @ y6 = (x6 + x2) + (x4 + x0)
add r11, r12, r7 @ y7 = (x7 + x3) + (x5 + x1)
stmia r0, {r1, r2, r3, r4, r5, r6, r10, r11}
mov pc, lr
mdct_butterfly_16:
str lr, [sp, #-4]!
add r1, r0, #8*4
ldmia r0, {r2, r3, r4, r5}
ldmia r1, {r6, r7, r8, r9}
add r6, r6, r2 @ y8 = x8 + x0
rsb r2, r6, r2, asl #1 @ x0 - x8
add r7, r7, r3 @ y9 = x9 + x1
rsb r3, r7, r3, asl #1 @ x1 - x9
add r8, r8, r4 @ y10 = x10 + x2
sub r11, r8, r4, asl #1 @ x10 - x2
add r9, r9, r5 @ y11 = x11 + x3
rsb r10, r9, r5, asl #1 @ x3 - x11
stmia r1!, {r6, r7, r8, r9}
add r2, r2, r3 @ (x0 - x8) + (x1 - x9)
rsb r3, r2, r3, asl #1 @ (x1 - x9) - (x0 - x8)
ldr r12, =cPI2_8
smull r8, r5, r12, r2
smull r8, r6, r12, r3
mov r5, r5, asl #1
mov r6, r6, asl #1
stmia r0!, {r5, r6, r10, r11}
ldmia r0, {r2, r3, r4, r5}
ldmia r1, {r6, r7, r8, r9}
add r6, r6, r2 @ y12 = x12 + x4
sub r2, r6, r2, asl #1 @ x12 - x4
add r7, r7, r3 @ y13 = x13 + x5
sub r3, r7, r3, asl #1 @ x13 - x5
add r8, r8, r4 @ y10 = x14 + x6
sub r10, r8, r4, asl #1 @ x14 - x6
add r9, r9, r5 @ y11 = x15 + x7
sub r11, r9, r5, asl #1 @ x15 - x7
stmia r1, {r6, r7, r8, r9}
sub r2, r2, r3 @ (x12 - x4) - (x13 - x5)
add r3, r2, r3, asl #1 @ (x12 - x4) + (x13 - x5)
smull r8, r5, r12, r2
smull r8, r6, r12, r3
mov r5, r5, asl #1
mov r6, r6, asl #1
@ no stmia here, r5, r6, r10, r11 are passed to mdct_butterfly_8
sub r0, r0, #4*4
ldmia r0, {r1, r2, r3, r4}
bl mdct_butterfly_8
add r0, r0, #8*4
ldmia r0, {r1, r2, r3, r4, r5, r6, r10, r11}
bl mdct_butterfly_8
ldr pc, [sp], #4
mdct_butterfly_32:
stmdb sp!, {r4-r11, lr}
add r1, r0, #16*4
ldmia r0, {r2, r3, r4, r5}
ldmia r1, {r6, r7, r8, r9}
add r6, r6, r2 @ y16 = x16 + x0
rsb r2, r6, r2, asl #1 @ x0 - x16
add r7, r7, r3 @ y17 = x17 + x1
rsb r3, r7, r3, asl #1 @ x1 - x17
add r8, r8, r4 @ y18 = x18 + x2
rsb r4, r8, r4, asl #1 @ x2 - x18
add r9, r9, r5 @ y19 = x19 + x3
rsb r5, r9, r5, asl #1 @ x3 - x19
stmia r1!, {r6, r7, r8, r9}
ldr r12, =cPI1_8
ldr lr, =cPI3_8
smull r10, r6, r12, r2
rsb r2, r2, #0
smlal r10, r6, lr, r3
smull r10, r7, r12, r3
smlal r10, r7, lr, r2
mov r6, r6, asl #1
mov r7, r7, asl #1
add r4, r4, r5 @ (x3 - x19) + (x2 - x18)
rsb r5, r4, r5, asl #1 @ (x3 - x19) - (x2 - x18)
ldr r11, =cPI2_8
smull r10, r8, r4, r11
smull r10, r9, r5, r11
mov r8, r8, asl #1
mov r9, r9, asl #1
stmia r0!, {r6, r7, r8, r9}
ldmia r0, {r2, r3, r4, r5}
ldmia r1, {r6, r7, r8, r9}
add r6, r6, r2 @ y20 = x20 + x4
rsb r2, r6, r2, asl #1 @ x4 - x20
add r7, r7, r3 @ y21 = x21 + x5
rsb r3, r7, r3, asl #1 @ x5 - x21
add r8, r8, r4 @ y22 = x22 + x6
sub r4, r8, r4, asl #1 @ x22 - x6
add r9, r9, r5 @ y23 = x23 + x7
rsb r5, r9, r5, asl #1 @ x7 - x23
stmia r1!, {r6, r7, r8, r9}
smull r10, r6, lr, r2
rsb r2, r2, #0
smlal r10, r6, r12, r3
smull r10, r7, lr, r3
smlal r10, r7, r12, r2
mov r6, r6, asl #1
mov r7, r7, asl #1
mov r8, r5
mov r9, r4
stmia r0!, {r6, r7, r8, r9}
ldmia r0, {r2, r3, r4, r5}
ldmia r1, {r6, r7, r8, r9}
add r6, r6, r2 @ y24 = x24 + x8
sub r2, r6, r2, asl #1 @ x24 - x8
add r7, r7, r3 @ y25 = x25 + x9
sub r3, r7, r3, asl #1 @ x25 - x9
add r8, r8, r4 @ y26 = x26 + x10
sub r4, r8, r4, asl #1 @ x26 - x10
add r9, r9, r5 @ y27 = x27 + x11
sub r5, r9, r5, asl #1 @ x27 - x11
stmia r1!, {r6, r7, r8, r9}
smull r10, r7, lr, r3
rsb r3, r3, #0
smlal r10, r7, r12, r2
smull r10, r6, r12, r3
smlal r10, r6, lr, r2
mov r6, r6, asl #1
mov r7, r7, asl #1
sub r4, r4, r5 @ (x26 - x10) - (x27 - x11)
add r5, r4, r5, asl #1 @ (x26 - x10) + (x27 - x11)
ldr r11, =cPI2_8
smull r10, r8, r11, r4
smull r10, r9, r11, r5
mov r8, r8, asl #1
mov r9, r9, asl #1
stmia r0!, {r6, r7, r8, r9}
ldmia r0, {r2, r3, r4, r5}
ldmia r1, {r6, r7, r8, r9}
add r6, r6, r2 @ y28 = x28 + x12
sub r2, r6, r2, asl #1 @ x28 - x12
add r7, r7, r3 @ y29 = x29 + x13
sub r3, r7, r3, asl #1 @ x29 - x13
add r8, r8, r4 @ y30 = x30 + x14
sub r4, r8, r4, asl #1 @ x30 - x14
add r9, r9, r5 @ y31 = x31 + x15
sub r5, r9, r5, asl #1 @ x31 - x15
stmia r1, {r6, r7, r8, r9}
smull r10, r7, r12, r3
rsb r3, r3, #0
smlal r10, r7, lr, r2
smull r10, r6, lr, r3
smlal r10, r6, r12, r2
mov r6, r6, asl #1
mov r7, r7, asl #1
mov r8, r4
mov r9, r5
stmia r0, {r6, r7, r8, r9}
sub r0, r0, #12*4
str r0, [sp, #-4]!
bl mdct_butterfly_16
ldr r0, [sp], #4
add r0, r0, #16*4
bl mdct_butterfly_16
ldmia sp!, {r4-r11, pc}
@ mdct_butterfly_generic_loop(x1, x2, T0, step, Ttop)
mdct_butterfly_generic_loop:
stmdb sp!, {r4-r11, lr}
str r2, [sp, #-4]
ldr r4, [sp, #36]
1:
ldmdb r0, {r6, r7, r8, r9}
ldmdb r1, {r10, r11, r12, r14}
add r6, r6, r10
sub r10, r6, r10, asl #1
add r7, r7, r11
rsb r11, r7, r11, asl #1
add r8, r8, r12
sub r12, r8, r12, asl #1
add r9, r9, r14
rsb r14, r9, r14, asl #1
stmdb r0!, {r6, r7, r8, r9}
ldmia r2, {r6, r7}
smull r5, r8, r6, r14
rsb r14, r14, #0
smlal r5, r8, r7, r12
smull r5, r9, r6, r12
smlal r5, r9, r7, r14
mov r8, r8, asl #1
mov r9, r9, asl #1
stmdb r1!, {r8, r9}
add r2, r2, r3, asl #2
ldmia r2, {r6, r7}
smull r5, r8, r6, r11
rsb r11, r11, #0
smlal r5, r8, r7, r10
smull r5, r9, r6, r10
smlal r5, r9, r7, r11
mov r8, r8, asl #1
mov r9, r9, asl #1
stmdb r1!, {r8, r9}
add r2, r2, r3, asl #2
cmp r2, r4
blo 1b
ldr r4, [sp, #-4]
1:
ldmdb r0, {r6, r7, r8, r9}
ldmdb r1, {r10, r11, r12, r14}
add r6, r6, r10
sub r10, r6, r10, asl #1
add r7, r7, r11
sub r11, r7, r11, asl #1
add r8, r8, r12
sub r12, r8, r12, asl #1
add r9, r9, r14
sub r14, r9, r14, asl #1
stmdb r0!, {r6, r7, r8, r9}
ldmia r2, {r6, r7}
smull r5, r9, r6, r14
rsb r14, r14, #0
smlal r5, r9, r7, r12
smull r5, r8, r6, r12
smlal r5, r8, r7, r14
mov r8, r8, asl #1
mov r9, r9, asl #1
stmdb r1!, {r8, r9}
sub r2, r2, r3, asl #2
ldmia r2, {r6, r7}
smull r5, r9, r6, r11
rsb r11, r11, #0
smlal r5, r9, r7, r10
smull r5, r8, r6, r10
smlal r5, r8, r7, r11
mov r8, r8, asl #1
mov r9, r9, asl #1
stmdb r1!, {r8, r9}
sub r2, r2, r3, asl #2
cmp r2, r4
bhi 1b
ldr r4, [sp, #36]
1:
ldmdb r0, {r6, r7, r8, r9}
ldmdb r1, {r10, r11, r12, r14}
add r6, r6, r10
rsb r10, r6, r10, asl #1
add r7, r7, r11
rsb r11, r7, r11, asl #1
add r8, r8, r12
rsb r12, r8, r12, asl #1
add r9, r9, r14
rsb r14, r9, r14, asl #1
stmdb r0!, {r6, r7, r8, r9}
ldmia r2, {r6, r7}
smull r5, r8, r6, r12
rsb r12, r12, #0
smlal r5, r8, r7, r14
smull r5, r9, r6, r14
smlal r5, r9, r7, r12
mov r8, r8, asl #1
mov r9, r9, asl #1
stmdb r1!, {r8, r9}
add r2, r2, r3, asl #2
ldmia r2, {r6, r7}
smull r5, r8, r6, r10
rsb r10, r10, #0
smlal r5, r8, r7, r11
smull r5, r9, r6, r11
smlal r5, r9, r7, r10
mov r8, r8, asl #1
mov r9, r9, asl #1
stmdb r1!, {r8, r9}
add r2, r2, r3, asl #2
cmp r2, r4
blo 1b
ldr r4, [sp, #-4]
1:
ldmdb r0, {r6, r7, r8, r9}
ldmdb r1, {r10, r11, r12, r14}
add r6, r6, r10
sub r10, r6, r10, asl #1
add r7, r7, r11
rsb r11, r7, r11, asl #1
add r8, r8, r12
sub r12, r8, r12, asl #1
add r9, r9, r14
rsb r14, r9, r14, asl #1
stmdb r0!, {r6, r7, r8, r9}
ldmia r2, {r6, r7}
smull r5, r9, r6, r12
smlal r5, r9, r7, r14
rsb r12, r12, #0
smull r5, r8, r6, r14
smlal r5, r8, r7, r12
mov r8, r8, asl #1
mov r9, r9, asl #1
stmdb r1!, {r8, r9}
sub r2, r2, r3, asl #2
ldmia r2, {r6, r7}
smull r5, r9, r6, r10
rsb r10, r10, #0
smlal r5, r9, r7, r11
smull r5, r8, r6, r11
smlal r5, r8, r7, r10
mov r8, r8, asl #1
mov r9, r9, asl #1
stmdb r1!, {r8, r9}
sub r2, r2, r3, asl #2
cmp r2, r4
bhi 1b
ldmia sp!, {r4-r11, pc}