ARM assembler for resampling. Should provide some gains, though not huge ones.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12732 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
1b3fc39a65
commit
1b05ea8ffe
2 changed files with 129 additions and 6 deletions
130
apps/dsp_arm.S
130
apps/dsp_arm.S
|
@ -17,14 +17,14 @@
|
|||
*
|
||||
****************************************************************************/
|
||||
|
||||
/*
|
||||
/****************************************************************************
|
||||
* void apply_crossfeed(int count, int32_t* src[])
|
||||
*/
|
||||
.section .text
|
||||
.global apply_crossfeed
|
||||
apply_crossfeed:
|
||||
@ unfortunately, we ended up in a bit of a register squeeze here, and need
|
||||
@ to keep both the count and the delay line index on the stack :/
|
||||
@ to keep the count on the stack :/
|
||||
stmdb sp!, { r4-r11, lr } @ stack modified regs
|
||||
ldmia r1, { r2-r3 } @ r2 = src[0], r3 = src[1]
|
||||
|
||||
|
@ -74,7 +74,131 @@ apply_crossfeed:
|
|||
@ save data back to struct
|
||||
ldr r12, =crossfeed_data + 4*4
|
||||
stmia r12, { r8-r11 } @ save filter history
|
||||
str r0, [r12, #30*4] @ save delay line index
|
||||
str r0, [r12, #30*4] @ save delay line index
|
||||
add sp, sp, #8 @ remove temp variables from stack
|
||||
ldmia sp!, { r4-r11, pc }
|
||||
.cfend:
|
||||
.size apply_crossfeed,.cfend-apply_crossfeed
|
||||
|
||||
/****************************************************************************
|
||||
* int dsp_downsample(int count, struct dsp_data *data,
|
||||
* int32_t *src[], int32_t *dst[])
|
||||
*/
|
||||
.section .text
|
||||
.global dsp_downsample
|
||||
dsp_downsample:
    @ Linear-interpolating resampler for the case where pos can advance by
    @ more than one input sample per output sample.
    @
    @ In:   r0 = count, r1 = data, r2 = src (array of channel pointers),
    @       r3 = dst (array of channel pointers)
    @ Out:  r0 = number of words written to the channel 0 output buffer
    @ Uses: r1, r12 and flags as scratch; r4-r11 and lr are saved/restored.
    @
    @ Words read from *data: +4 = num_channels, +8 = resample_data.delta,
    @ +12 = resample_data.phase, +16 onwards = last_sample[] (one per channel).
    stmfd   sp!, { r4-r11, lr }         @ preserve everything we touch
    mov     r12, #0xff
    orr     r12, r12, #0xff00           @ r12 = 0xffff, mask selecting frac
    ldmib   r1, { r5-r6 }               @ r5 = num_channels, r6 = delta
    add     r4, r1, #12                 @ r4 -> resample_data.phase
    sub     r5, r5, #1                  @ r5 = ch, starting at the last channel

.ds_next_channel:
    ldr     r7, [r2, r5, lsl #2]        @ r7 = s = src[ch]
    ldr     r8, [r3, r5, lsl #2]        @ r8 = d = dst[ch]
    ldr     r1, [r4]                    @ r1 = phase; every channel restarts
                                        @ from the same stored value
    add     r9, r4, #4                  @ r9 -> last_sample[0]
    sub     r11, r0, #1                 @ r11 = count - 1
    ldr     r10, [r9, r5, lsl #2]       @ r10 = last_sample[ch] (previous frame)
    ldr     r14, [r7, r11, lsl #2]      @ r14 = s[count - 1] ...
    str     r14, [r9, r5, lsl #2]       @ ... becomes last_sample[ch] for the
                                        @ next call
    movs    r9, r1, lsr #16             @ r9 = pos = phase >> 16
    ldreq   r11, [r7]                   @ pos == 0: interpolate between the
    beq     .ds_interpolate             @ saved last_sample and s[0]
    cmp     r9, r0
    bge     .ds_channel_done            @ pos >= count: nothing to emit

    @ Inside the sample loop:
    @   r0 = count, r1 = phase, r4 -> phase in *data, r5 = ch, r6 = delta,
    @   r7 = s, r8 = d, r9 = pos (then scratch), r10 = s[pos - 1],
    @   r11 = s[pos], r12 = 0xffff, r14 = scratch
.ds_sample:
    add     r9, r7, r9, lsl #2          @ r9 = &s[pos]
    ldmda   r9, { r10, r11 }            @ r10 = s[pos - 1], r11 = s[pos]
.ds_interpolate:
    sub     r11, r11, r10               @ r11 = diff = s[pos] - s[pos - 1]
    and     r9, r1, r12                 @ r9 = frac = phase & 0xffff; frac is
                                        @ the small operand so the multiplier
                                        @ can terminate early
    smull   r9, r14, r11, r9            @ r14:r9 = diff * frac (64-bit)
    add     r1, r1, r6                  @ phase += delta
    add     r10, r10, r9, lsr #16       @ out = s[pos - 1] + ((diff*frac) >> 16),
    add     r10, r10, r14, lsl #16      @ assembled from both product halves
    str     r10, [r8], #4               @ *d++ = out
    mov     r9, r1, lsr #16             @ pos = phase >> 16
    cmp     r9, r0
    blt     .ds_sample                  @ keep going while pos < count

.ds_channel_done:
    subs    r5, r5, #1
    bpl     .ds_next_channel            @ --ch >= 0: process the next channel

    sub     r1, r1, r0, lsl #16         @ phase -= count << 16: rebase phase
    str     r1, [r4]                    @ onto the next frame and save it
    ldr     r1, [r3]                    @ r1 = dst[0]
    sub     r8, r8, r1                  @ bytes written for channel 0
    mov     r0, r8, lsr #2              @ bytes -> 32-bit samples
    ldmfd   sp!, { r4-r11, pc }         @ restore and return
.ds_end:
    .size   dsp_downsample, .ds_end - dsp_downsample
|
||||
|
||||
/****************************************************************************
|
||||
* int dsp_upsample(int count, struct dsp_data *dsp,
|
||||
* int32_t *src[], int32_t *dst[])
|
||||
*/
|
||||
.section .text
|
||||
.global dsp_upsample
|
||||
dsp_upsample:
    @ Linear-interpolating resampler for the case where several output
    @ samples may be produced per input sample.
    @
    @ In:   r0 = count, r1 = dsp, r2 = src (array of channel pointers),
    @       r3 = dst (array of channel pointers)
    @ Out:  r0 = number of words written to the channel 0 output buffer
    @ Uses: r1, r12 and flags as scratch; r4-r11 and lr are saved/restored.
    @       Two extra words (count, &resample_data.phase) are kept on the
    @       stack because r0 and r4 are reused inside the sample loop.
    @
    @ Words read from *dsp: +4 = num_channels, +8 = resample_data.delta,
    @ +12 = resample_data.phase, +16 onwards = last_sample[] (one per channel).
    stmdb   sp!, { r4-r11, lr }         @ stack modified regs
    ldmib   r1, { r5-r6 }               @ r5 = num_channels, r6 = resample_data.delta
    sub     r5, r5, #1                  @ r5 = ch, starting at the last channel
    add     r4, r1, #12                 @ r4 = &resample_data.phase
    stmdb   sp!, { r0, r4 }             @ stack count and &resample_data.phase

.uschannel_loop:
    ldr     r12, [r4]                   @ r12 = resample_data.phase
    @ Keep only frac, in the top halfword, so that adding delta << 16 sets
    @ the carry flag exactly when pos increments. The integer part is
    @ deliberately dropped here (lsl, not ror): it is consumed below through
    @ r14 and must not leak back into the phase stored for the next frame.
    mov     r1, r12, lsl #16            @ r1 = frac << 16
    ldr     r7, [r2, r5, lsl #2]        @ r7 = s = src[ch]
    ldr     r8, [r3, r5, lsl #2]        @ r8 = d = dst[ch]
    add     r9, r4, #4                  @ r9 = &last_sample[0]
    ldr     r10, [r9, r5, lsl #2]       @ r10 = last_sample[ch] (previous frame)
    sub     r11, r0, #1                 @ r11 = count - 1
    ldr     r14, [r7, r11, lsl #2]      @ load last sample in s[] ...
    str     r14, [r9, r5, lsl #2]       @ ... and write as next frame's last_sample
    add     r9, r7, r0, lsl #2          @ r9 = src_end = &s[count]
    movs    r14, r12, lsr #16           @ r14 = pos = resample_data.phase >> 16
    beq     .usstart_0                  @ pos == 0: start from last_sample and s[0]
    cmp     r14, r0                     @ if pos >= count, we're already done
    bge     .usloop_skip
    add     r7, r7, r14, lsl #2         @ r7 = &s[pos]
    ldr     r10, [r7, #-4]              @ r10 = s[pos - 1]
    b       .usstart_0

    @ Register usage in loop:
    @ r0 = diff = s[pos] - s[pos - 1], r1 = frac << 16, r4 = frac,
    @ r5 = cur_channel, r6 = delta, r7 = s (points past s[pos]), r8 = d,
    @ r9 = src_end, r10 = s[pos - 1], r11 = s[pos], r12/r14 = scratch
.usloop_1:
    mov     r10, r11                    @ r10 = previous sample
.usstart_0:
    ldr     r11, [r7], #4               @ r11 = next sample, advance s
    sub     r0, r11, r10                @ r0 = s[pos] - s[pos - 1]
.usloop_0:
    mov     r4, r1, lsr #16             @ r4 = frac
    smull   r12, r14, r4, r0            @ r14:r12 = frac * diff (64-bit)
    add     r14, r10, r14, lsl #16
    add     r14, r14, r12, lsr #16      @ r14 = out = s[pos - 1] + ((frac*diff) >> 16)
    str     r14, [r8], #4               @ *d++ = out
    adds    r1, r1, r6, lsl #16         @ frac += delta; carry out <=> pos incremented
    bcc     .usloop_0                   @ no carry: same input pair, next output
    cmp     r7, r9                      @ if s < src_end, do another sample
    blo     .usloop_1

.usloop_skip:
    subs    r5, r5, #1
    ldmia   sp, { r0, r4 }              @ reload count and &resample_data.phase
                                        @ (ldm leaves the flags from subs intact)
    bpl     .uschannel_loop             @ if (--ch) >= 0, do another channel

    mov     r1, r1, lsr #16             @ phase for next frame = frac only (pos = 0)
    str     r1, [r4]                    @ store back
    ldr     r1, [r3]                    @ r1 = dst[0]
    sub     r8, r8, r1                  @ bytes written for channel 0
    mov     r0, r8, lsr #2              @ convert bytes->samples
    add     sp, sp, #8                  @ drop the two temporaries
    ldmia   sp!, { r4-r11, pc }         @ ... and we're out
.usend:
    .size   dsp_upsample, .usend - dsp_upsample
|
||||
|
||||
|
|
|
@ -27,13 +27,12 @@
|
|||
#if defined(CPU_COLDFIRE) || defined(CPU_ARM)
|
||||
#define DSP_HAVE_ASM_CROSSFEED
|
||||
void apply_crossfeed(int count, int32_t *buf[]);
|
||||
#endif /* defined(CPU_COLDFIRE) || defined(CPU_ARM) */
|
||||
|
||||
/* Assembler resampling routines. dsp_arm.S provides dsp_downsample() and
 * dsp_upsample() for ARM, so the guard must admit CPU_ARM as well as
 * CPU_COLDFIRE; otherwise the ARM versions are never selected. */
#if defined(CPU_COLDFIRE) || defined(CPU_ARM)
#define DSP_HAVE_ASM_RESAMPLING
int dsp_downsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]);
int dsp_upsample(int count, struct dsp_data *data, int32_t *src[], int32_t *dst[]);
#endif /* defined(CPU_COLDFIRE) || defined(CPU_ARM) */
|
||||
|
||||
#if defined (CPU_COLDFIRE)
|
||||
#define DSP_HAVE_ASM_SOUND_CHAN_MONO
|
||||
void channels_process_sound_chan_mono(int count, int32_t *buf[]);
|
||||
#define DSP_HAVE_ASM_SOUND_CHAN_CUSTOM
|
||||
|
|
Loading…
Reference in a new issue