f40bfc9267
Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97 Reviewed-on: http://gerrit.rockbox.org/137 Reviewed-by: Nils Wallménius <nils@rockbox.org> Tested-by: Nils Wallménius <nils@rockbox.org>
163 lines
7.9 KiB
ArmAsm
163 lines
7.9 KiB
ArmAsm
/***************************************************************************
|
|
* __________ __ ___.
|
|
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
|
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
|
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
|
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
|
* \/ \/ \/ \/ \/
|
|
* $Id:
|
|
*
|
|
* Copyright (C) 2010 by Michael Giacomelli
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version 2
|
|
* of the License, or (at your option) any later version.
|
|
*
|
|
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
|
* KIND, either express or implied.
|
|
*
|
|
****************************************************************************/
|
|
|
|
#include "config.h"
|
|
|
|
.section .text, "ax", %progbits

/****************************************************************************
 * atrac3_iqmf_dewindowing_armv5e(int32_t *out,
 *                                int32_t *in,
 *                                int32_t *win,
 *                                unsigned int nIn);
 *
 * Dewindowing step within iqmf of atrac3 synthesis using 16 bit filter
 * coefficients and armv5e packed multiply instructions. Uses 2.5 cycles
 * per filter coefficient (ideal). Benchmarked 3.54 per coefficient (Clip+).
 *
 * Arguments, as used by the code below:
 *   r0 = out  destination; two 32 bit words are stored per iteration
 *   r1 = in   32 bit input samples; 48 are read per iteration, then the
 *             pointer moves on by 2 samples for the next iteration
 *   r2 = win  48 filter coefficients of 16 bit each, packed two per 32 bit
 *             word (even index in the bottom half, odd index in the top
 *             half); read again from win[0] on every iteration
 *   r3 = nIn  number of iterations; 0 returns without touching memory
 *
 * Each product is formed by smulw<y>/smlaw<y>, i.e. 32 bit sample times
 * 16 bit coefficient, shifted right by 16.
 *
 * Reference implementation:
 *
 * for (j = nIn; j != 0; j--) {
 *     s1 = fixmul32(in[0], win[0]);
 *     s2 = fixmul32(in[1], win[1]);
 *     for (i = 2; i < 48; i += 2) {
 *         s1 += fixmul32(in[i  ], win[i  ]);
 *         s2 += fixmul32(in[i+1], win[i+1]);
 *     }
 *     out[0] = s2 << 1;
 *     out[1] = s1 << 1;
 *     in  += 2;
 *     out += 2;
 * }
 *
 * Registers: r4-r11 and lr are saved on the stack and restored on return.
 * r12 is a scratch register and can be used without restoring it. The
 * condition flags are modified.
 ****************************************************************************/

/* Amounts by which the pointers are rewound at the end of each iteration. */
.equ ATRAC3_IQMF_IN_REWIND,  184     /* 48 words read - 2 words step = 46 entries * 4 bytes */
.equ ATRAC3_IQMF_WIN_REWIND, 96      /* 48 entries * 2 bytes, back to win[0] */

/*
 * Accumulate 8 further filter taps: s1 (lr) collects the even indices and
 * s2 (r12) the odd ones. Loads 4 coefficient words through r2 and 8 samples
 * through r1, post-incrementing both pointers. Overwrites r4-r11.
 */
.macro atrac3_iqmf_mac8
    ldmia   r2!, {r4, r5, r8, r9}        /* next 8 packed coefficients */
    ldmia   r1!, {r6, r7, r10, r11}      /* load in[i...i+3] */
    smlawb  lr,  r6,  r4, lr             /* s1 += in[i  ] * win[i  ] >> 16 */
    smlawt  r12, r7,  r4, r12            /* s2 += in[i+1] * win[i+1] >> 16 */
    smlawb  lr,  r10, r5, lr             /* s1 += in[i  ] * win[i  ] >> 16 */
    smlawt  r12, r11, r5, r12            /* s2 += in[i+1] * win[i+1] >> 16 */

    ldmia   r1!, {r6, r7, r10, r11}      /* load in[i...i+3] */
    smlawb  lr,  r6,  r8, lr             /* s1 += in[i  ] * win[i  ] >> 16 */
    smlawt  r12, r7,  r8, r12            /* s2 += in[i+1] * win[i+1] >> 16 */
    smlawb  lr,  r10, r9, lr             /* s1 += in[i  ] * win[i  ] >> 16 */
    smlawt  r12, r11, r9, r12            /* s2 += in[i+1] * win[i+1] >> 16 */
.endm

.align  2
.global atrac3_iqmf_dewindowing_armv5e
.type   atrac3_iqmf_dewindowing_armv5e, %function

atrac3_iqmf_dewindowing_armv5e:
    /* r0 = dest */
    /* r1 = input samples */
    /* r2 = window coefficients */
    /* r3 = counter */
    cmp     r3, #0                       /* counter == 0: the reference loop */
    bxeq    lr                           /* does nothing, so return at once  */

    stmfd   sp!, {r4-r11, lr}            /* save non-scratch registers */

.iqmf_dewindow_outer_loop:               /* outer loop 0...counter-1 */
    /* 0..7: the first two products initialise s1/s2 instead of adding */
    ldmia   r2!, {r4, r5, r8, r9}        /* load win[0..7] */
    ldmia   r1!, {r6, r7, r10, r11}      /* load in[0..3] to avoid stall on arm11 */
    smulwb  lr,  r6,  r4                 /* s1  = in[0] * win[0] >> 16 */
    smulwt  r12, r7,  r4                 /* s2  = in[1] * win[1] >> 16 */
    smlawb  lr,  r10, r5, lr             /* s1 += in[2] * win[2] >> 16 */
    smlawt  r12, r11, r5, r12            /* s2 += in[3] * win[3] >> 16 */

    ldmia   r1!, {r6, r7, r10, r11}      /* load in[4..7] */
    smlawb  lr,  r6,  r8, lr             /* s1 += in[4] * win[4] >> 16 */
    smlawt  r12, r7,  r8, r12            /* s2 += in[5] * win[5] >> 16 */
    smlawb  lr,  r10, r9, lr             /* s1 += in[6] * win[6] >> 16 */
    smlawt  r12, r11, r9, r12            /* s2 += in[7] * win[7] >> 16 */

    /* 8..15, 16..23, 24..31, 32..39, 40..47 */
    .rept   5
    atrac3_iqmf_mac8
    .endr

    mov     lr,  lr,  lsl #1             /* s1 << 1 */
    mov     r12, r12, lsl #1             /* s2 << 1 */

    stmia   r0!, {r12, lr}               /* store result out[0]=s2, out[1]=s1 */
    sub     r1, r1, #ATRAC3_IQMF_IN_REWIND   /* net effect: in += 2 */
    sub     r2, r2, #ATRAC3_IQMF_WIN_REWIND  /* back to win[0] */

    subs    r3, r3, #1                   /* outer loop -= 1 */
    bgt     .iqmf_dewindow_outer_loop

    ldmpc   regs=r4-r11                  /* restore registers */

.atrac3_iqmf_dewindowing_armv5e_end:
    .size   atrac3_iqmf_dewindowing_armv5e,.atrac3_iqmf_dewindowing_armv5e_end-atrac3_iqmf_dewindowing_armv5e