rockbox/apps/codecs/libwavpack/coldfire.S
Daniel Stenberg 2acc0ac542 Updated our source code header to explicitly mention that we are GPL v2 or
later. We still need to hunt down snippets used that are not. 1324 modified
files...
http://www.rockbox.org/mail/archive/rockbox-dev-archive-2008-06/0060.shtml


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@17847 a1c6a512-1295-4272-9138-f99709370657
2008-06-28 18:10:04 +00:00

537 lines
21 KiB
ArmAsm

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2005 by David Bryant
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
/* This is an assembly optimized version of the following WavPack function:
*
* void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp,
* long *buffer, long sample_count);
*
* It performs a single pass of stereo decorrelation on the provided buffer.
* Note that this version of the function requires that the 8 previous stereo
* samples are visible and correct. In other words, it ignores the "samples_*"
* fields in the decorr_pass structure and gets the history data directly
* from the buffer. It does, however, return the appropriate history samples
* to the decorr_pass structure before returning.
*
* This is written to work on a MCF5249 processor, or any processor based on
* the ColdFire V2 core with an EMAC unit. The EMAC is perfectly suited for
* the "apply_weight" function of WavPack decorrelation because it provides
* the required 40-bit product. The fractional rounding mode of the EMAC is not
* configurable and uses "round to even" while WavPack uses "round to larger",
* so the rounding has to be done manually.
*/
.text
.align 2
.global decorr_stereo_pass_cont_mcf5249

|------------------------------------------------------------------------------
| Entry point: save registers, load and pre-scale the pass parameters, prime
| the EMAC, then branch to the loop that matches dpp->term.
|
| Stack on entry:  4(sp) = dpp, 8(sp) = buffer, 12(sp) = sample_count
| Preserved:       d2-d7, a2-a6 (44 byte frame, released at return_only)
| Scratch:         d0, d1, a0, a1
| EMAC:            MACSR is set to 0x20 here and is not put back by this
|                  routine; acc1 is loaded with the rounding seed.
|
| State handed to every term loop:
|   a1 = bptr                  a2 = dpp
|   a3 = dpp->delta << 17      d3 = dpp->weight_A << 17
|   d4 = dpp->weight_B << 17   d5 = eptr = bptr + sample_count * 8
|   d6 = 1024 << 17            d7 = -1024 << 17
|------------------------------------------------------------------------------
decorr_stereo_pass_cont_mcf5249:
        lea     (-44, %sp), %sp                 | 11 registers * 4 bytes
        movem.l %d2-%d7/%a2-%a6, (%sp)
        move.l  44+4(%sp), %a2                  | a2 = dpp
        move.l  44+8(%sp), %a1                  | a1 = bptr
        move.w  4(%a2), %d3                     | d3 = dpp->weight_A
        ext.l   %d3                             |   sign extended to 32 bits
        move.w  6(%a2), %d4                     | d4 = dpp->weight_B
        ext.l   %d4                             |   sign extended to 32 bits
        move.w  2(%a2), %a3                     | a3 = dpp->delta
        move.l  44+12(%sp), %d0                 | d0 = sample_count
        jbeq    return_only                     | nothing to do for zero samples

        lsl.l   #3, %d0                         | 8 bytes per stereo sample
        move.l  %a1, %d5
        add.l   %d0, %d5                        | d5 = eptr

        moveq.l #17, %d0                        | common shift for delta and weights
        move.l  %a3, %d1                        | shift delta by way of a data register
        asl.l   %d0, %d1
        move.l  %d1, %a3                        | a3 = delta << 17
        asl.l   %d0, %d3                        | d3 = weight_A << 17
        asl.l   %d0, %d4                        | d4 = weight_B << 17

        moveq.l #0x20, %d6
        move.l  %d6, %macsr                     | set fractional mode for MAC
        | Writing accext01 only replaces the extension bytes, so the 32-bit
        | body of acc1 is zeroed first. Without this the rounding seed would
        | inherit whatever an earlier EMAC user left in acc1.
        moveq.l #0, %d1
        move.l  %d1, %acc1
        move.l  #0x800000, %accext01            | acc1 = 0x00 0000 80 (for rounding)

        move.l  #1024<<17, %d6                  | upper weight clip limit
        move.l  #-1024<<17, %d7                 | lower weight clip limit
                                                | (both only used by negative terms)

        move.w  (%a2), %d0                      | d0 = dpp->term
        ext.l   %d0
        cmp.l   #17, %d0
        jbeq    term_17                         | term = 17
        cmp.l   #18, %d0
        jbeq    term_18                         | term = 18
        addq.l  #1, %d0                         | term + 1 == 0 ?
        jbeq    term_minus_1                    | term = -1
        addq.l  #1, %d0                         | term + 2 == 0 ?
        jbeq    term_minus_2                    | term = -2
        addq.l  #1, %d0                         | term + 3 == 0 ?
        jbeq    term_minus_3                    | term = -3
        jbra    term_default                    | anything else: term = 1 - 8
|------------------------------------------------------------------------------
| term = 17 loop
|
| Each sample is predicted as (2 * s1) - s2, where s1 and s2 are the two
| previous samples of the same channel (8 and 16 bytes behind bptr).
|
|   a1 = bptr                   d0 = predictor, then scratch for the sign test
|   a2 = dpp                    d1 = sample as read from bptr [0]
|   a3 = dpp->delta << 17       d2 = sample as written back to bptr [0]
|   d3 = dpp->weight_A << 17    d4 = dpp->weight_B << 17
|   d5 = eptr                   acc1 = rounding seed, copied to acc0 per sample
|------------------------------------------------------------------------------
term_17:
        move.l  -8(%a1), %d0            | d0 = s1
        add.l   %d0, %d0                | d0 = 2 * s1
        sub.l   -16(%a1), %d0           | d0 = 2 * s1 - s2
        beq     .Lt17_skip_a            | zero predictor: sample is left alone
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = current sample
        beq     .Lt17_store_a           | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lt17_inc_a             | agree: weight_A += delta
        sub.l   %a3, %d3                | disagree: take off 2 * delta so the
        sub.l   %a3, %d3                |   shared add below nets to -delta
.Lt17_inc_a:
        add.l   %a3, %d3
.Lt17_store_a:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store, move on to the second channel

.Lt17_chan_b:
        move.l  -8(%a1), %d0            | d0 = s1
        add.l   %d0, %d0                | d0 = 2 * s1
        sub.l   -16(%a1), %d0           | d0 = 2 * s1 - s2
        beq     .Lt17_skip_b            | zero predictor: sample is left alone
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1              | d1 = current sample
        beq     .Lt17_store_b           | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lt17_inc_b             | agree: weight_B += delta
        sub.l   %a3, %d4                | disagree: take off 2 * delta so the
        sub.l   %a3, %d4                |   shared add below nets to -delta
.Lt17_inc_b:
        add.l   %a3, %d4
.Lt17_store_b:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store, move on to the next pair

.Lt17_next:
        cmp.l   %a1, %d5                | more to do while bptr < eptr
        jbhi    term_17
        bra     term_17_18_finish       | shared history write-back

.Lt17_skip_a:
        addq.l  #4, %a1                 | step over the untouched first sample
        bra     .Lt17_chan_b
.Lt17_skip_b:
        addq.l  #4, %a1                 | step over the untouched second sample
        bra     .Lt17_next
|------------------------------------------------------------------------------
| term = 18 loop
|
| Each sample is predicted as ((3 * s1) - s2) >> 1, where s1 and s2 are the
| two previous samples of the same channel (8 and 16 bytes behind bptr).
|
|   a0 = scratch for 3 * s1     d0 = predictor, then scratch for the sign test
|   a1 = bptr                   d1 = sample as read from bptr [0]
|   a2 = dpp                    d2 = sample as written back to bptr [0]
|   a3 = dpp->delta << 17       d3 = dpp->weight_A << 17
|   d4 = dpp->weight_B << 17    d5 = eptr
|   acc1 = rounding seed, copied to acc0 per sample
|------------------------------------------------------------------------------
term_18:
        move.l  -8(%a1), %a0            | a0 = s1
        lea     (%a0,%a0.l*2), %a0      | a0 = 3 * s1
        move.l  %a0, %d0
        sub.l   -16(%a1), %d0           | d0 = 3 * s1 - s2
        asr.l   #1, %d0                 | halve, keeping the sign
        beq     .Lt18_skip_a            | zero predictor: sample is left alone
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = current sample
        beq     .Lt18_store_a           | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lt18_inc_a             | agree: weight_A += delta
        sub.l   %a3, %d3                | disagree: take off 2 * delta so the
        sub.l   %a3, %d3                |   shared add below nets to -delta
.Lt18_inc_a:
        add.l   %a3, %d3
.Lt18_store_a:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store, move on to the second channel

.Lt18_chan_b:
        move.l  -8(%a1), %a0            | a0 = s1
        lea     (%a0,%a0.l*2), %a0      | a0 = 3 * s1
        move.l  %a0, %d0
        sub.l   -16(%a1), %d0           | d0 = 3 * s1 - s2
        asr.l   #1, %d0                 | halve, keeping the sign
        beq     .Lt18_skip_b            | zero predictor: sample is left alone
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1              | d1 = current sample
        beq     .Lt18_store_b           | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lt18_inc_b             | agree: weight_B += delta
        sub.l   %a3, %d4                | disagree: take off 2 * delta so the
        sub.l   %a3, %d4                |   shared add below nets to -delta
.Lt18_inc_b:
        add.l   %a3, %d4
.Lt18_store_b:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store, move on to the next pair

.Lt18_next:
        cmp.l   %a1, %d5                | more to do while bptr < eptr
        jbhi    term_18
        bra     term_17_18_finish       | shared history write-back

.Lt18_skip_a:
        addq.l  #4, %a1                 | step over the untouched first sample
        bra     .Lt18_chan_b
.Lt18_skip_b:
        addq.l  #4, %a1                 | step over the untouched second sample
        bra     .Lt18_next
| Common exit for terms 17 and 18: copy the last two stereo samples of the
| buffer (the four longs just below bptr) into the history slots of dpp.
| Offsets 8 and 40 are the bases of samples_A and samples_B.
term_17_18_finish:
        move.l  -4(%a1), 40(%a2)        | samples_B [0] = last long processed
        move.l  -8(%a1), 8(%a2)         | samples_A [0] = the long before it
        move.l  -12(%a1), 44(%a2)       | samples_B [1] = one pair further back
        move.l  -16(%a1), 12(%a2)       | samples_A [1]
        jbra    finish_up
|------------------------------------------------------------------------------
| Default loop, used for every term not dispatched elsewhere (i.e. 1 - 8)
|
| The predictor for each sample is the sample term pairs earlier in the
| buffer, read through a second pointer that trails bptr by term * 8 bytes.
|
|   a0 = tptr                   d0 = tptr [0], then scratch for the sign test
|   a1 = bptr                   d1 = sample as read from bptr [0]
|   a2 = dpp                    d2 = sample as written back to bptr [0]
|   a3 = dpp->delta << 17       d3 = dpp->weight_A << 17
|   d4 = dpp->weight_B << 17    d5 = eptr
|   acc1 = rounding seed, copied to acc0 per sample
|------------------------------------------------------------------------------
term_default:
        move.w  (%a2), %d0              | d0 = dpp->term
        ext.l   %d0
        lsl.l   #3, %d0                 | term * 8 bytes
        move.l  %a1, %a0
        sub.l   %d0, %a0                | a0 = tptr = bptr - term * 8

term_default_loop:
        move.l  (%a0)+, %d0             | d0 = predictor for the first channel
        beq     .Ldef_skip_a            | zero predictor: sample is left alone
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = current sample
        beq     .Ldef_store_a           | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Ldef_inc_a             | agree: weight_A += delta
        sub.l   %a3, %d3                | disagree: take off 2 * delta so the
        sub.l   %a3, %d3                |   shared add below nets to -delta
.Ldef_inc_a:
        add.l   %a3, %d3
.Ldef_store_a:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store, move on to the second channel

.Ldef_chan_b:
        move.l  (%a0)+, %d0             | d0 = predictor for the second channel
        beq     .Ldef_skip_b            | zero predictor: sample is left alone
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1              | d1 = current sample
        beq     .Ldef_store_b           | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Ldef_inc_b             | agree: weight_B += delta
        sub.l   %a3, %d4                | disagree: take off 2 * delta so the
        sub.l   %a3, %d4                |   shared add below nets to -delta
.Ldef_inc_b:
        add.l   %a3, %d4
.Ldef_store_b:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store, move on to the next pair

.Ldef_next:
        cmp.l   %a1, %d5                | more to do while bptr < eptr
        jbhi    term_default_loop

        | Write the last 8 stereo samples back into dpp, walking the buffer
        | backwards. The slot index starts at (term - 1) & 7 and steps down
        | modulo 8.
        move.w  (%a2), %d0              | low word of d0 = term (the mask below
                                        |   discards whatever the upper bits hold)
        moveq.l #8, %d1                 | d1 = pairs left to copy
.Ldef_save_hist:
        subq.l  #1, %d0                 | previous slot,
        and.l   #7, %d0                 |   wrapped into 0 - 7
        move.l  -(%a1), 40(%a2,%d0.l*4) | dpp->samples_B [d0]
        move.l  -(%a1), 8(%a2,%d0.l*4)  | dpp->samples_A [d0]
        subq.l  #1, %d1
        jbne    .Ldef_save_hist
        jbra    finish_up

.Ldef_skip_a:
        addq.l  #4, %a1                 | step over the untouched first sample
        bra     .Ldef_chan_b
.Ldef_skip_b:
        addq.l  #4, %a1                 | step over the untouched second sample
        bra     .Ldef_next
|------------------------------------------------------------------------------
| term = -1 loop
|
| The first sample of a pair is predicted from the long just before it; the
| second sample is predicted from the freshly updated first sample. Weights
| are clipped to the range held in d7 .. d6 after every adjustment.
|
|   a1 = bptr                   d0 = predictor / updated first sample
|   a2 = dpp                    d1 = sample as read from the buffer
|   a3 = dpp->delta << 17       d2 = updated second sample
|   d3 = dpp->weight_A << 17    d4 = dpp->weight_B << 17
|   d5 = eptr                   d6 = 1024 << 17       d7 = -1024 << 17
|   acc1 = rounding seed, copied to acc0 per sample
|------------------------------------------------------------------------------
term_minus_1:
        move.l  -4(%a1), %d0            | d0 = bptr [-1]
        beq     .Lm1_a_zero             | zero predictor: first sample unchanged
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = bptr [0]
        beq     .Lm1_store_a            | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lm1_inc_a
        sub.l   %a3, %d3                | signs differ: weight_A -= delta
        cmp.l   %d7, %d3
        bge     .Lm1_store_a            | still at or above the lower limit
        move.l  %d7, %d3                | clip to the lower limit
        bra     .Lm1_store_a
.Lm1_inc_a:
        add.l   %a3, %d3                | signs agree: weight_A += delta
        cmp.l   %d6, %d3
        ble     .Lm1_store_a            | still at or below the upper limit
        move.l  %d6, %d3                | clip to the upper limit
.Lm1_store_a:
        move.l  %acc0, %d0              | d0 = rounded product
        add.l   %d1, %d0                | plus the original sample
        move.l  %d0, (%a1)+             | store bptr [0]; flags reflect d0
        beq     .Lm1_skip_b             | result is zero: second sample unchanged

.Lm1_chan_b:                            | d0 = predictor for the second sample
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1              | d1 = bptr [1]
        beq     .Lm1_store_b            | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lm1_inc_b
        sub.l   %a3, %d4                | signs differ: weight_B -= delta
        cmp.l   %d7, %d4
        bge     .Lm1_store_b            | still at or above the lower limit
        move.l  %d7, %d4                | clip to the lower limit
        bra     .Lm1_store_b
.Lm1_inc_b:
        add.l   %a3, %d4                | signs agree: weight_B += delta
        cmp.l   %d6, %d4
        ble     .Lm1_store_b            | still at or below the upper limit
        move.l  %d6, %d4                | clip to the upper limit
.Lm1_store_b:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store bptr [1]

.Lm1_next:
        cmp.l   %a1, %d5                | more to do while bptr < eptr
        jbhi    term_minus_1
        move.l  -4(%a1), 8(%a2)         | dpp->samples_A [0] = bptr [-1]
        jbra    finish_up

.Lm1_a_zero:
        move.l  (%a1)+, %d0             | unchanged first sample is the predictor
        bne     .Lm1_chan_b             |   for the second one, unless it is zero
.Lm1_skip_b:
        addq.l  #4, %a1                 | step over the untouched second sample
        bra     .Lm1_next
|------------------------------------------------------------------------------
| term = -2 loop
|
| The second sample of a pair (bptr [1]) is handled first, predicted from the
| long two slots behind bptr. The first sample (bptr [0]) is then predicted
| from the updated bptr [1]. Weights are clipped to the range d7 .. d6 after
| every adjustment. bptr is only advanced once per pair, at the bottom.
|
|   a1 = bptr                   d0 = predictor / updated bptr [1]
|   a2 = dpp                    d1 = sample as read from the buffer
|   a3 = dpp->delta << 17       d2 = updated bptr [0]
|   d3 = dpp->weight_A << 17    d4 = dpp->weight_B << 17
|   d5 = eptr                   d6 = 1024 << 17       d7 = -1024 << 17
|   acc1 = rounding seed, copied to acc0 per sample
|------------------------------------------------------------------------------
term_minus_2:
        move.l  -8(%a1), %d0            | d0 = bptr [-2]
        beq     .Lm2_b_zero             | zero predictor: bptr [1] unchanged
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  4(%a1), %d1             | d1 = bptr [1]
        beq     .Lm2_store_b            | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lm2_inc_b
        sub.l   %a3, %d4                | signs differ: weight_B -= delta
        cmp.l   %d7, %d4
        bge     .Lm2_store_b            | still at or above the lower limit
        move.l  %d7, %d4                | clip to the lower limit
        bra     .Lm2_store_b
.Lm2_inc_b:
        add.l   %a3, %d4                | signs agree: weight_B += delta
        cmp.l   %d6, %d4
        ble     .Lm2_store_b            | still at or below the upper limit
        move.l  %d6, %d4                | clip to the upper limit
.Lm2_store_b:
        move.l  %acc0, %d0              | d0 = rounded product
        add.l   %d1, %d0                | plus the original sample
        move.l  %d0, 4(%a1)             | store bptr [1]; flags reflect d0
        beq     .Lm2_next               | result is zero: bptr [0] unchanged

.Lm2_chan_a:                            | d0 = predictor for bptr [0]
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = bptr [0]
        beq     .Lm2_store_a            | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lm2_inc_a
        sub.l   %a3, %d3                | signs differ: weight_A -= delta
        cmp.l   %d7, %d3
        bge     .Lm2_store_a            | still at or above the lower limit
        move.l  %d7, %d3                | clip to the lower limit
        bra     .Lm2_store_a
.Lm2_inc_a:
        add.l   %a3, %d3                | signs agree: weight_A += delta
        cmp.l   %d6, %d3
        ble     .Lm2_store_a            | still at or below the upper limit
        move.l  %d6, %d3                | clip to the upper limit
.Lm2_store_a:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)              | store bptr [0]

.Lm2_next:
        addq.l  #8, %a1                 | advance one whole pair
        cmp.l   %a1, %d5                | more to do while bptr < eptr
        jbhi    term_minus_2
        move.l  -8(%a1), 40(%a2)        | dpp->samples_B [0] = bptr [-2]
        jbra    finish_up

.Lm2_b_zero:
        move.l  4(%a1), %d0             | unchanged bptr [1] is the predictor
        beq     .Lm2_next               |   for bptr [0], unless it is zero
        bra     .Lm2_chan_a
|------------------------------------------------------------------------------
| term = -3 loop
|
| Both samples of a pair are predicted from the previous pair, crosswise: the
| first sample from the long just before it, the second sample from the long
| 12 bytes before it (i.e. the first sample of the previous pair). Weights
| are clipped to the range d7 .. d6 after every adjustment.
|
|   a1 = bptr                   d0 = predictor, then scratch for the sign test
|   a2 = dpp                    d1 = sample as read from the buffer
|   a3 = dpp->delta << 17       d2 = sample as written back
|   d3 = dpp->weight_A << 17    d4 = dpp->weight_B << 17
|   d5 = eptr                   d6 = 1024 << 17       d7 = -1024 << 17
|   acc1 = rounding seed, copied to acc0 per sample
|------------------------------------------------------------------------------
term_minus_3:
        move.l  -4(%a1), %d0            | d0 = bptr [-1]
        beq     .Lm3_skip_a             | zero predictor: sample is left alone
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = bptr [0]
        beq     .Lm3_store_a            | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lm3_inc_a
        sub.l   %a3, %d3                | signs differ: weight_A -= delta
        cmp.l   %d7, %d3
        bge     .Lm3_store_a            | still at or above the lower limit
        move.l  %d7, %d3                | clip to the lower limit
        bra     .Lm3_store_a
.Lm3_inc_a:
        add.l   %a3, %d3                | signs agree: weight_A += delta
        cmp.l   %d6, %d3
        ble     .Lm3_store_a            | still at or below the upper limit
        move.l  %d6, %d3                | clip to the upper limit
.Lm3_store_a:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store bptr [0], step to bptr [1]

.Lm3_chan_b:
        move.l  -12(%a1), %d0           | d0 = first sample of the previous pair
        beq     .Lm3_skip_b             | zero predictor: sample is left alone
        move.l  %acc1, %acc0            | start from the rounding seed
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1              | d1 = bptr [1]
        beq     .Lm3_store_b            | zero sample: weight is not adapted
        eor.l   %d1, %d0                | sign bit clear when the signs agree
        bge     .Lm3_inc_b
        sub.l   %a3, %d4                | signs differ: weight_B -= delta
        cmp.l   %d7, %d4
        bge     .Lm3_store_b            | still at or above the lower limit
        move.l  %d7, %d4                | clip to the lower limit
        bra     .Lm3_store_b
.Lm3_inc_b:
        add.l   %a3, %d4                | signs agree: weight_B += delta
        cmp.l   %d6, %d4
        ble     .Lm3_store_b            | still at or below the upper limit
        move.l  %d6, %d4                | clip to the upper limit
.Lm3_store_b:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2                | plus the original sample
        move.l  %d2, (%a1)+             | store bptr [1], step to the next pair

.Lm3_next:
        cmp.l   %a1, %d5                | more to do while bptr < eptr
        jbhi    term_minus_3
        move.l  -4(%a1), 8(%a2)         | dpp->samples_A [0] = bptr [-1]
        move.l  -8(%a1), 40(%a2)        | dpp->samples_B [0] = bptr [-2]
        jbra    finish_up

.Lm3_skip_a:
        addq.l  #4, %a1                 | step over the untouched first sample
        bra     .Lm3_chan_b
.Lm3_skip_b:
        addq.l  #4, %a1                 | step over the untouched second sample
        bra     .Lm3_next
| Common exit. finish_up converts the adapted weights back from their << 17
| working form, stores them in dpp and zeroes the two accumulators that were
| used. return_only (also reached directly when sample_count is zero) restores
| the saved registers and releases the 44 byte frame. MACSR is not written here.
finish_up:
        moveq.l #17, %d0                | undo the setup shift
        asr.l   %d0, %d3
        move.w  %d3, 4(%a2)             | dpp->weight_A = low word of d3
        asr.l   %d0, %d4
        move.w  %d4, 6(%a2)             | dpp->weight_B = low word of d4
        moveq.l #0, %d0                 | clear up EMAC
        move.l  %d0, %acc0
        move.l  %d0, %acc1
return_only:
        movem.l (%sp), %d2-%d7/%a2-%a6
        lea     (44,%sp), %sp
        rts