rockbox/apps/codecs/libwavpack/arml.S
Daniel Stenberg 2acc0ac542 Updated our source code header to explicitly mention that we are GPL v2 or
later. We still need to hunt down snippets used that are not. 1324 modified
files...
http://www.rockbox.org/mail/archive/rockbox-dev-archive-2008-06/0060.shtml


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@17847 a1c6a512-1295-4272-9138-f99709370657
2008-06-28 18:10:04 +00:00

504 lines
18 KiB
ArmAsm

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2006 by David Bryant
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
/* This is an assembly optimized version of the following WavPack function:
*
* void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp,
* long *buffer, long sample_count);
*
* It performs a single pass of stereo decorrelation on the provided buffer.
* Note that this version of the function requires that the 8 previous stereo
* samples are visible and correct. In other words, it ignores the "samples_*"
* fields in the decorr_pass structure and gets the history data directly
* from the buffer. It does, however, return the appropriate history samples
* to the decorr_pass structure before returning.
*
* This is written to work on an ARM7TDMI processor. This version uses the
* 64-bit multiply-accumulate instruction and so can be used with all
* WavPack files. However, for optimum performance with 16-bit WavPack
* files, there is a faster version that only uses the 32-bit MLA
* instruction.
*/
.text
.align
.global decorr_stereo_pass_cont_arml
/*
 * Entry point and term dispatcher.
 *
 * on entry:
 *
 *   r0 = struct decorr_pass *dpp
 *   r1 = long *buffer
 *   r2 = long sample_count
 *
 * Halfword fields read from *dpp here: offset 0 = term, 2 = delta,
 * 4 = weight_A, 6 = weight_B.  The weights and delta are scaled by << 18
 * for the 64-bit multiply-accumulate and the code then branches to the
 * loop that handles dpp->term.
 *
 * A sample_count of zero returns immediately and leaves *dpp untouched.
 * That case must not leave through common_exit: common_exit shifts the
 * weights right by 18 before storing them, which would overwrite weights
 * that have not been scaled up yet with 0 (or -1 for negative weights).
 *
 * r11 is not initialised here; every term loop reloads it with 0x80000000
 * before each multiply-accumulate.
 */
decorr_stereo_pass_cont_arml:
        stmfd   sp!, {r4 - r8, r10, r11, lr}
        cmp     r2, #0                  @ no samples: return without touching *dpp
        beq     .Lno_samples

        mov     r5, r0                  @ r5 = dpp
        ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
        ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
        ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
        mov     r0, r0, asl #18         @ for 64-bit math we use weights << 18
        mov     r4, r4, asl #18
        mov     r6, r6, asl #18         @ delta is scaled the same way
        add     r7, r1, r2, asl #3      @ r7 = buffer ending position (8 bytes/stereo sample)
        ldrsh   r2, [r5, #0]            @ r2 = dpp->term
        cmp     r2, #0
        blt     minus_term

        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
        ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18
        ldr     r8, [r1, #-8]
        ldr     r3, [r1, #-4]

        cmp     r2, #18
        beq     term_18_loop
        mov     lr, lr, asl #4          @ terms 2 and 17 keep the older pair << 4
        mov     r10, r10, asl #4
        cmp     r2, #2
        beq     term_2_loop
        cmp     r2, #17
        beq     term_17_loop
        b       term_default_loop       @ everything else uses the generic loop

minus_term:
        mov     r10, #(1024 << 18)      @ r10 = -1024 << 18 for weight clipping
        rsb     r10, r10, #0            @  (only used for negative terms)
        cmn     r2, #1
        beq     term_minus_1
        cmn     r2, #2
        beq     term_minus_2
        cmn     r2, #3
        beq     term_minus_3
        b       common_exit             @ negative term other than -1/-2/-3: no loop is run

.Lno_samples:
        ldmfd   sp!, {r4 - r8, r10, r11, pc}
/*
******************************************************************************
* Loop to handle term = 17 condition
*
* r0 = dpp->weight_B            r8  = previous left sample
* r1 = bptr                     r9  =
* r2 = current sample           r10 = second previous right sample << 4
* r3 = previous right sample    r11 = lo accumulator (for rounding)
* r4 = dpp->weight_A            ip  = current decorrelation value
* r5 = dpp                      sp  =
* r6 = dpp->delta               lr  = second previous left sample << 4
* r7 = eptr                     pc  =
*
* The weights and delta arrive scaled by << 18 and the decorrelation value is
* built << 4, so the high word of their 64-bit product is
* (weight * value) >> 10. Seeding the low word with 0x80000000 rounds that
* result to nearest. The high word is preloaded with the sample, so one
* smlal yields sample + correction.
*
* Flag protocol used for each channel: the instruction that builds the
* decorrelation value sets Z when that value is zero. In that case the
* "ne" instructions are skipped and the beq skips the weight update.
* Otherwise cmpne sets Z when the sample read from the buffer is zero,
* which also skips the weight update.
*******************************************************************************
*/
term_17_loop:
rsbs ip, lr, r8, asl #5 @ decorr value = (2 * prev) - 2nd prev (all << 4)
mov lr, r8, asl #4 @ previous becomes 2nd previous
ldr r2, [r1], #4 @ get sample & update pointer
mov r11, #0x80000000 @ low word = 0.5 for rounding
mov r8, r2 @ high word starts out as the sample
smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr value
strne r8, [r1, #-4] @ if change possible, store sample back
cmpne r2, #0 @ Z set if the sample as read was zero
beq .L325 @ decorr value or sample zero: no weight update
teq ip, r2 @ update weight based on signs
submi r4, r4, r6 @ signs differ: weight_A -= delta
addpl r4, r4, r6 @ signs agree: weight_A += delta
.L325: rsbs ip, r10, r3, asl #5 @ do same thing for right channel
mov r10, r3, asl #4 @ previous right becomes 2nd previous right
ldr r2, [r1], #4
mov r11, #0x80000000
mov r3, r2
smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr value
strne r3, [r1, #-4]
cmpne r2, #0
beq .L329
teq ip, r2
submi r0, r0, r6
addpl r0, r0, r6
.L329: cmp r7, r1 @ loop back if more samples to do
bhi term_17_loop
mov lr, lr, asr #4 @ undo the << 4 before the history is stored
mov r10, r10, asr #4
b store_1718 @ common exit for terms 17 & 18
/*
******************************************************************************
* Loop to handle term = 18 condition
*
* r0 = dpp->weight_B            r8  = previous left sample
* r1 = bptr                     r9  =
* r2 = current sample           r10 = second previous right sample
* r3 = previous right sample    r11 = lo accumulator (for rounding)
* r4 = dpp->weight_A            ip  = decorrelation value
* r5 = dpp                      sp  =
* r6 = dpp->delta               lr  = second previous left sample
* r7 = eptr                     pc  =
*
* Unlike the term 17 loop, the history registers are kept unshifted here;
* only the finished decorrelation value is shifted << 4 (by the movs, which
* also sets Z when the value is zero so the "ne" instructions and the weight
* update are skipped).
*
* When the loop finishes, execution falls through to store_1718.
*******************************************************************************
*/
term_18_loop:
rsb ip, lr, r8 @ decorr value =
mov lr, r8 @ ((3 * prev) - 2nd prev) >> 1
add ip, lr, ip, asr #1 @ computed as prev + ((prev - 2nd prev) >> 1)
movs ip, ip, asl #4 @ scale << 4 and test for zero
ldr r2, [r1], #4 @ get sample & update pointer
mov r11, #0x80000000 @ low word = 0.5 for rounding
mov r8, r2 @ high word starts out as the sample
smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr value
strne r8, [r1, #-4] @ if change possible, store sample back
cmpne r2, #0 @ Z set if the sample as read was zero
beq .L337 @ decorr value or sample zero: no weight update
teq ip, r2 @ update weight based on signs
submi r4, r4, r6 @ signs differ: weight_A -= delta
addpl r4, r4, r6 @ signs agree: weight_A += delta
.L337: rsb ip, r10, r3 @ do same thing for right channel
mov r10, r3 @ previous right becomes 2nd previous right
add ip, r10, ip, asr #1
movs ip, ip, asl #4
ldr r2, [r1], #4
mov r11, #0x80000000
mov r3, r2
smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr value
strne r3, [r1, #-4]
cmpne r2, #0
beq .L341
teq ip, r2
submi r0, r0, r6
addpl r0, r0, r6
.L341: cmp r7, r1 @ loop back if more samples to do
bhi term_18_loop
/*
 * Shared tail for terms 17 & 18: write the two most recent samples of each
 * channel back into the decorr_pass structure, then leave via common_exit.
 *
 *   [r5, #8]  <- r8  (previous left)      [r5, #12] <- lr  (2nd previous left)
 *   [r5, #40] <- r3  (previous right)     [r5, #44] <- r10 (2nd previous right)
 *
 * stmia stores the lower-numbered register at the lower address, which gives
 * exactly the pairing above. ip is only used as a scratch pointer.
 */
store_1718:
        add     ip, r5, #8
        stmia   ip, {r8, lr}            @ left history  -> offsets 8, 12
        add     ip, r5, #40
        stmia   ip, {r3, r10}           @ right history -> offsets 40, 44
        b       common_exit             @ write the weights back and return
/*
******************************************************************************
* Loop to handle term = 2 condition
* (note that this case can be handled by the default term handler (1-8), but
* this special case is faster because it doesn't have to read memory twice)
*
* r0 = dpp->weight_B            r8  = previous left sample
* r1 = bptr                     r9  =
* r2 = current sample           r10 = second previous right sample << 4
* r3 = previous right sample    r11 = lo accumulator (for rounding)
* r4 = dpp->weight_A            ip  = decorrelation value
* r5 = dpp                      sp  =
* r6 = dpp->delta               lr  = second previous left sample << 4
* r7 = eptr                     pc  =
*
* The decorrelation value is simply the sample two positions back in the
* same channel, which is already held << 4 in lr / r10. The movs that copies
* it into ip also sets Z when it is zero, so the "ne" instructions and the
* weight update are skipped in that case.
*******************************************************************************
*/
term_2_loop:
movs ip, lr @ get decorrelation value & test
ldr r2, [r1], #4 @ get sample & update pointer
mov lr, r8, asl #4 @ previous becomes 2nd previous
mov r11, #0x80000000 @ low word = 0.5 for rounding
mov r8, r2 @ high word starts out as the sample
smlalne r11, r8, r4, ip @ r8:r11 += weight_A * decorr value
strne r8, [r1, #-4] @ if change possible, store sample back
cmpne r2, #0 @ Z set if the sample as read was zero
beq .L225 @ decorr value or sample zero: no weight update
teq ip, r2 @ update weight based on signs
submi r4, r4, r6 @ signs differ: weight_A -= delta
addpl r4, r4, r6 @ signs agree: weight_A += delta
.L225: movs ip, r10 @ do same thing for right channel
ldr r2, [r1], #4
mov r10, r3, asl #4 @ previous right becomes 2nd previous right
mov r11, #0x80000000
mov r3, r2
smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr value
strne r3, [r1, #-4]
cmpne r2, #0
beq .L229
teq ip, r2
submi r0, r0, r6
addpl r0, r0, r6
.L229: cmp r7, r1 @ loop back if more samples to do
bhi term_2_loop
b default_term_exit @ this exit updates all dpp->samples
/*
******************************************************************************
* Loop to handle default term condition
*
* r0 = dpp->weight_B            r8  = result accumulator
* r1 = bptr                     r9  =
* r2 = dpp->term                r10 =
* r3 = decorrelation value      r11 = lo accumulator (for rounding)
* r4 = dpp->weight_A            ip  = current sample
* r5 = dpp                      sp  =
* r6 = dpp->delta               lr  =
* r7 = eptr                     pc  =
*
* The decorrelation value is read from the buffer at bptr - (term << 3)
* bytes, i.e. `term` stereo samples back in the same channel, then shifted
* << 4. The movs doing the shift sets Z when the value is zero, so the "ne"
* instructions and the weight update are skipped in that case.
*
* When the loop finishes, execution falls through to default_term_exit.
*******************************************************************************
*/
term_default_loop:
ldr r3, [r1, -r2, asl #3] @ get decorrelation value based on term
ldr ip, [r1], #4 @ get original sample and bump ptr
movs r3, r3, asl #4 @ scale << 4 and test for zero
mov r11, #0x80000000 @ low word = 0.5 for rounding
mov r8, ip @ high word starts out as the sample
smlalne r11, r8, r4, r3 @ r8:r11 += weight_A * decorr value
strne r8, [r1, #-4] @ if possibly changed, store updated sample
cmpne ip, #0 @ Z set if the sample as read was zero
beq .L350 @ decorr value or sample zero: no weight update
teq ip, r3 @ update weight based on signs
submi r4, r4, r6 @ signs differ: weight_A -= delta
addpl r4, r4, r6 @ signs agree: weight_A += delta
.L350: ldr r3, [r1, -r2, asl #3] @ do the same thing for right channel
ldr ip, [r1], #4
movs r3, r3, asl #4
mov r11, #0x80000000
mov r8, ip
smlalne r11, r8, r0, r3 @ r8:r11 += weight_B * decorr value
strne r8, [r1, #-4]
cmpne ip, #0
beq .L354
teq ip, r3
submi r0, r0, r6
addpl r0, r0, r6
.L354: cmp r7, r1 @ loop back if more samples to do
bhi term_default_loop
/*
 * Exit path for terms 1-8: copy the last 8 stereo samples of the buffer into
 * the history arrays of the decorr structure (all 8 slots are written even
 * if the term needs fewer).
 *
 * Walking backwards from bptr, the newest sample pair goes to slot
 * (term - 1) & 7, the one before it to slot (term - 2) & 7, and so on.
 * For each slot the word at bptr[-1] is stored at offset 40 + 4 * slot and
 * the word at bptr[-2] at offset 8 + 4 * slot.
 *
 *   r1 = bptr (stepped back 8 bytes per pass)   ip = running slot index
 *   r2 = sample being copied                    lr = passes remaining
 *   r3 = dpp + 4 * slot
 */
default_term_exit:
        ldrsh   ip, [r5, #0]            @ ip = dpp->term
        mov     lr, #8                  @ eight slots to fill
.Lsave_history:
        sub     ip, ip, #1              @ step to the next older slot
        and     r3, ip, #7              @ wrap the slot index to 0..7
        add     r3, r5, r3, asl #2      @ r3 = dpp + 4 * slot
        ldr     r2, [r1, #-4]
        str     r2, [r3, #40]
        ldr     r2, [r1, #-8]!          @ read the other channel and step bptr back
        str     r2, [r3, #8]
        subs    lr, lr, #1
        bne     .Lsave_history
        b       common_exit
/*
******************************************************************************
* Loop to handle term = -1 condition
*
* r0 = dpp->weight_B            r8  =
* r1 = bptr                     r9  =
* r2 = intermediate result      r10 = -1024 << 18 (for clipping)
* r3 = previous right sample    r11 = lo accumulator (for rounding)
* r4 = dpp->weight_A            ip  = current sample
* r5 = dpp                      sp  =
* r6 = dpp->delta               lr  = updated left sample
* r7 = eptr                     pc  =
*
* bptr is advanced past both channels by the first load, so inside the loop
* the left sample lives at [r1, #-8] and the right sample at [r1, #-4].
* Each movs that shifts a decorrelation value << 4 also sets Z when it is
* zero, which skips the "ne" instructions and the weight update.
* After every weight update the weight is clamped to +/-(1024 << 18).
*******************************************************************************
*/
term_minus_1:
ldr r3, [r1, #-4] @ r3 = right sample preceding the buffer
term_minus_1_loop:
ldr ip, [r1], #8 @ for left channel the decorrelation value
movs r3, r3, asl #4 @ is the previous right sample (in r3)
mov r11, #0x80000000 @ low word = 0.5 for rounding
mov lr, ip @ high word starts out as the left sample
smlalne r11, lr, r4, r3 @ lr:r11 += weight_A * decorr value
strne lr, [r1, #-8] @ if change possible, store left sample back
cmpne ip, #0 @ Z set if the sample as read was zero
beq .L361 @ decorr value or sample zero: no weight update
teq ip, r3 @ update weight based on signs
submi r4, r4, r6 @ signs differ: weight_A -= delta
addpl r4, r4, r6 @ signs agree: weight_A += delta
cmp r4, #(1024 << 18) @ then clip weight to +/-1024
movgt r4, #(1024 << 18)
cmp r4, r10
movlt r4, r10
.L361: ldr r2, [r1, #-4] @ for right channel the decorrelation value
movs lr, lr, asl #4 @ is the just updated left sample (in lr)
mov r11, #0x80000000
mov r3, r2 @ r3 becomes the new "previous right sample"
smlalne r11, r3, r0, lr @ r3:r11 += weight_B * decorr value
strne r3, [r1, #-4]
cmpne r2, #0
beq .L369
teq r2, lr
submi r0, r0, r6
addpl r0, r0, r6
cmp r0, #(1024 << 18) @ then clip weight to +/-1024
movgt r0, #(1024 << 18)
cmp r0, r10
movlt r0, r10
.L369: cmp r7, r1 @ loop back if more samples to do
bhi term_minus_1_loop
str r3, [r5, #8] @ else store right sample and exit
b common_exit
/*
******************************************************************************
* Loop to handle term = -2 condition
* (note that the channels are processed in the reverse order here)
*
* r0 = dpp->weight_B            r8  =
* r1 = bptr                     r9  =
* r2 = intermediate result      r10 = -1024 << 18 (for clipping)
* r3 = previous left sample     r11 = lo accumulator (for rounding)
* r4 = dpp->weight_A            ip  = current sample
* r5 = dpp                      sp  =
* r6 = dpp->delta               lr  = updated right sample
* r7 = eptr                     pc  =
*
* The right sample at [r1, #4] is handled first without moving bptr; the
* load of the left sample then advances bptr past both channels, so the left
* sample is written back at [r1, #-8].
* Each movs that shifts a decorrelation value << 4 also sets Z when it is
* zero, which skips the "ne" instructions and the weight update.
* After every weight update the weight is clamped to +/-(1024 << 18).
*******************************************************************************
*/
term_minus_2:
ldr r3, [r1, #-8] @ r3 = left sample preceding the buffer
term_minus_2_loop:
ldr ip, [r1, #4] @ for right channel the decorrelation value
movs r3, r3, asl #4 @ is the previous left sample (in r3)
mov r11, #0x80000000 @ low word = 0.5 for rounding
mov lr, ip @ high word starts out as the right sample
smlalne r11, lr, r0, r3 @ lr:r11 += weight_B * decorr value
strne lr, [r1, #4] @ if change possible, store right sample back
cmpne ip, #0 @ Z set if the sample as read was zero
beq .L380 @ decorr value or sample zero: no weight update
teq ip, r3 @ update weight based on signs
submi r0, r0, r6 @ signs differ: weight_B -= delta
addpl r0, r0, r6 @ signs agree: weight_B += delta
cmp r0, #(1024 << 18) @ then clip weight to +/-1024
movgt r0, #(1024 << 18)
cmp r0, r10
movlt r0, r10
.L380: ldr r2, [r1], #8 @ for left channel the decorrelation value
movs lr, lr, asl #4 @ is the just updated right sample (in lr)
mov r11, #0x80000000
mov r3, r2 @ r3 becomes the new "previous left sample"
smlalne r11, r3, r4, lr @ r3:r11 += weight_A * decorr value
strne r3, [r1, #-8]
cmpne r2, #0
beq .L388
teq r2, lr
submi r4, r4, r6
addpl r4, r4, r6
cmp r4, #(1024 << 18) @ then clip weight to +/-1024
movgt r4, #(1024 << 18)
cmp r4, r10
movlt r4, r10
.L388: cmp r7, r1 @ loop back if more samples to do
bhi term_minus_2_loop
str r3, [r5, #40] @ else store left channel and exit
b common_exit
/*
******************************************************************************
* Loop to handle term = -3 condition
*
* r0 = dpp->weight_B            r8  = previous left sample
* r1 = bptr                     r9  =
* r2 = updated left sample,     r10 = -1024 << 18 (for clipping)
*      then current right
* r3 = previous right sample    r11 = lo accumulator (for rounding)
* r4 = dpp->weight_A            ip  = current left sample, then
*                                     previous left sample << 4
* r5 = dpp                      sp  =
* r6 = dpp->delta               lr  =
* r7 = eptr                     pc  =
*
* Each channel is predicted from the other channel's previous sample: the
* left sample uses the previous right sample (r3) and the right sample uses
* the previous left sample (r8).
* Each movs that shifts a decorrelation value << 4 also sets Z when it is
* zero, which skips the "ne" instructions and the weight update.
* After every weight update the weight is clamped to +/-(1024 << 18).
*
* When the loop finishes, execution falls through to common_exit.
*******************************************************************************
*/
term_minus_3:
ldr r3, [r1, #-4] @ load previous samples
ldr r8, [r1, #-8]
term_minus_3_loop:
ldr ip, [r1], #4 @ get left sample & update pointer
movs r3, r3, asl #4 @ decorr value = previous right sample << 4
mov r11, #0x80000000 @ low word = 0.5 for rounding
mov r2, ip @ high word starts out as the left sample
smlalne r11, r2, r4, r3 @ r2:r11 += weight_A * decorr value
strne r2, [r1, #-4] @ if change possible, store left sample back
cmpne ip, #0 @ Z set if the sample as read was zero
beq .L399 @ decorr value or sample zero: no weight update
teq ip, r3 @ update weight based on signs
submi r4, r4, r6 @ signs differ: weight_A -= delta
addpl r4, r4, r6 @ signs agree: weight_A += delta
cmp r4, #(1024 << 18) @ then clip weight to +/-1024
movgt r4, #(1024 << 18)
cmp r4, r10
movlt r4, r10
.L399: movs ip, r8, asl #4 @ ip = previous left we use now
mov r8, r2 @ r8 = current left we use next time
ldr r2, [r1], #4 @ get right sample & update pointer
mov r11, #0x80000000
mov r3, r2 @ r3 becomes the new "previous right sample"
smlalne r11, r3, r0, ip @ r3:r11 += weight_B * decorr value
strne r3, [r1, #-4]
cmpne r2, #0
beq .L407
teq ip, r2
submi r0, r0, r6
addpl r0, r0, r6
cmp r0, #(1024 << 18) @ then clip weight to +/-1024
movgt r0, #(1024 << 18)
cmp r0, r10
movlt r0, r10
.L407: cmp r7, r1 @ loop back if more samples to do
bhi term_minus_3_loop
str r3, [r5, #8] @ else store previous samples & exit
str r8, [r5, #40] @ (last right sample at offset 8, last left at offset 40)
/*
 * Shared function epilogue. Expects r5 = dpp and r4 / r0 = weight_A /
 * weight_B still scaled by << 18. The weights are scaled back down and
 * written to their halfword slots (offsets 4 and 6) so the next call picks
 * up where this one stopped, then the saved registers are restored and the
 * function returns by popping the saved lr into pc.
 */
common_exit:
        mov     r4, r4, asr #18         @ weight_A back to its real magnitude
        strh    r4, [r5, #4]
        mov     r0, r0, asr #18         @ weight_B back to its real magnitude
        strh    r0, [r5, #6]
        ldmfd   sp!, {r4 - r8, r10, r11, pc}