Assembler versions of signal_mul. Decent speedup for Coldfire and small speedup for ARM.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15502 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Thom Johansen 2007-11-07 00:50:37 +00:00
parent 71b40994e0
commit c7a8663c7b
3 changed files with 72 additions and 1 deletions

View file

@ -47,9 +47,11 @@
#include "filters_arm4.h"
#define OVERRIDE_IIR_MEM16
#define OVERRIDE_QMF_SYNTH
#define OVERRIDE_SIGNAL_MUL
#elif defined (COLDFIRE_ASM)
#define OVERRIDE_IIR_MEM16
#define OVERRIDE_QMF_SYNTH
#define OVERRIDE_SIGNAL_MUL
#elif defined (BFIN_ASM)
#include "filters_bfin.h"
#endif
@ -114,6 +116,7 @@ void highpass(const spx_word16_t *x, spx_word16_t *y, int len, int filtID, spx_m
#ifdef FIXED_POINT
#ifndef OVERRIDE_SIGNAL_MUL
/* FIXME: These functions are ugly and probably introduce too much error */
void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
{
@ -123,6 +126,7 @@ void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
y[i] = SHL32(MULT16_32_Q14(EXTRACT16(SHR32(x[i],7)),scale),7);
}
}
#endif
#ifndef SPEEX_DISABLE_ENCODER
void signal_div(const spx_word16_t *x, spx_word16_t *y, spx_word32_t scale, int len)

View file

@ -199,7 +199,7 @@ qmf_synth:
bne 0b
sub r0, r8, r5 @ r0 = &xx1[N2]
sub r1, r9, r5 @ r1 = %xx2[N2]
sub r1, r9, r5 @ r1 = &xx2[N2]
str r4, [sp, #-4] @ Stack N
mov r4, r5
str r4, [sp, #-8] @ Stack M
@ -300,3 +300,28 @@ qmf_synth:
bne 0b
ldmia sp!, { r4-r11, pc } @ Exit
/* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
 *
 * For each i in [0, len): y[i] = bits [45:14] of the signed 64-bit product
 * x[i] * scale, i.e. (x[i] * scale) >> 14 truncated to 32 bits.
 *
 * In:       r0 = x, r1 = y, r2 = scale, r3 = len
 * Clobbers: r0-r3, r12, flags (r4-r8 are saved and restored here)
 *
 * Samples are processed four at a time; any remainder (len % 4) is handled
 * one sample at a time, and len <= 0 stores nothing. Earlier revisions looped
 * with "bne" on a counter decremented by 4 and therefore only terminated when
 * len was a positive multiple of 4.
 */
    .global signal_mul
signal_mul:
    stmdb   sp!, { r4-r8, lr }
    subs    r3, r3, #4              @ At least four samples to process?
    blt     1f                      @ No: skip straight to the remainder
0:
    ldmia   r0!, { r5-r8 }          @ Load four input samples
    smull   r5, r12, r2, r5         @ r12:r5 = scale * x[i] (64 bit signed)
    mov     r12, r12, lsl #18       @ Recombine upper and lower parts
    orr     r5, r12, r5, lsr #14    @ r5 = product >> 14 (low 32 bits)
    smull   r6, r12, r2, r6
    mov     r12, r12, lsl #18
    orr     r6, r12, r6, lsr #14
    smull   r7, r12, r2, r7
    mov     r12, r12, lsl #18
    orr     r7, r12, r7, lsr #14
    smull   r8, r12, r2, r8
    mov     r12, r12, lsl #18
    orr     r8, r12, r8, lsr #14
    stmia   r1!, { r5-r8 }          @ Store four output samples
    subs    r3, r3, #4              @ Another full group of four left?
    bge     0b
1:
    add     r3, r3, #4              @ Undo the bias: r3 = samples left (< 4)
    cmp     r3, #0
    ble     3f                      @ Nothing left (or len <= 0 on entry)
2:
    ldr     r5, [r0], #4            @ Load one input sample
    smull   r5, r12, r2, r5
    mov     r12, r12, lsl #18
    orr     r5, r12, r5, lsr #14
    str     r5, [r1], #4            @ Store one output sample
    subs    r3, r3, #1
    bne     2b
3:
    ldmia   sp!, { r4-r8, pc }      @ Exit

View file

@ -312,3 +312,45 @@ qmf_synth:
lea.l (44, %sp), %sp
rts
/* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
 *
 * Scales len samples from x into y using the MAC unit in fractional mode.
 * Each input is shifted left by 9 and 'scale' by 3 before the multiply, and
 * the value read back from the accumulator is shifted left by 5.
 * NOTE(review): with the fractional product behaving as (a * b) >> 31 this
 * yields (x[i] * scale) >> 14 with the low 5 bits cleared - confirm against
 * the EMAC documentation.
 *
 * In:       4(%sp) = x, 8(%sp) = y, 12(%sp) = scale, 16(%sp) = len
 * Clobbers: %d0-%d1, %a0-%a1, %acc0-%acc3, flags (%d2-%d6 saved/restored)
 *
 * MACSR is set to 0x20 for the duration of the routine and is written with 0
 * on exit; the value it had on entry is not restored.
 * NOTE(review): mac.l accumulates, so this presumably relies on %acc0-%acc3
 * being zero on entry (they are left zero by movclr.l on exit) - verify
 * against callers.
 *
 * Samples are processed four at a time; any remainder (len % 4) is handled
 * one sample at a time, and len <= 0 stores nothing. Earlier revisions looped
 * with "jne" on a counter decremented by 4 and therefore only terminated when
 * len was a positive multiple of 4.
 */
    .global signal_mul
signal_mul:
    lea.l   (-20, %sp), %sp
    movem.l %d2-%d6, (%sp)
    movem.l (20+4, %sp), %a0-%a1    | a0 = x, a1 = y
    movem.l (20+12, %sp), %d0-%d1   | d0 = scale, d1 = len
    moveq.l #0x20, %d6
    move.l  %d6, %macsr             | Set MAC unit to fractional mode
    asl.l   #3, %d0                 | Pre-scale 'scale'
    moveq.l #9, %d6                 | d6 = input shift count
    subq.l  #4, %d1                 | At least four samples to process?
    jlt     1f                      | No: skip straight to the remainder
0:
    movem.l (%a0), %d2-%d5          | Fetch input
    asl.l   %d6, %d2                | Shift each value 9 to the left
    asl.l   %d6, %d3
    asl.l   %d6, %d4
    asl.l   %d6, %d5
    mac.l   %d2, %d0, %acc0         | Do multiplies
    mac.l   %d3, %d0, %acc1
    mac.l   %d4, %d0, %acc2
    mac.l   %d5, %d0, %acc3
    lea.l   (16, %a0), %a0
    movclr.l %acc0, %d2             | Fetch results and clear accumulators
    movclr.l %acc1, %d3
    movclr.l %acc2, %d4
    movclr.l %acc3, %d5
    asl.l   #5, %d2                 | Adjust to proper format
    asl.l   #5, %d3
    asl.l   #5, %d4
    asl.l   #5, %d5
    movem.l %d2-%d5, (%a1)          | Save output
    lea.l   (16, %a1), %a1
    subq.l  #4, %d1                 | Another full group of four left?
    jge     0b
1:
    addq.l  #4, %d1                 | Undo the bias: d1 = samples left (< 4)
    tst.l   %d1
    jle     3f                      | Nothing left (or len <= 0 on entry)
2:
    move.l  (%a0)+, %d2             | Fetch one input sample
    asl.l   %d6, %d2
    mac.l   %d2, %d0, %acc0
    movclr.l %acc0, %d2
    asl.l   #5, %d2
    move.l  %d2, (%a1)+             | Save one output sample
    subq.l  #1, %d1
    jne     2b
3:
    clr.l   %d0
    move.l  %d0, %macsr             | Set MAC unit back to integer mode
    movem.l (%sp), %d2-%d6
    lea.l   (20, %sp), %sp
    rts