Assembler versions of signal_mul. Decent speedup for Coldfire and small speedup for ARM.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15502 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
71b40994e0
commit
c7a8663c7b
3 changed files with 72 additions and 1 deletions
|
@ -47,9 +47,11 @@
|
|||
#include "filters_arm4.h"
|
||||
#define OVERRIDE_IIR_MEM16
|
||||
#define OVERRIDE_QMF_SYNTH
|
||||
#define OVERRIDE_SIGNAL_MUL
|
||||
#elif defined (COLDFIRE_ASM)
|
||||
#define OVERRIDE_IIR_MEM16
|
||||
#define OVERRIDE_QMF_SYNTH
|
||||
#define OVERRIDE_SIGNAL_MUL
|
||||
#elif defined (BFIN_ASM)
|
||||
#include "filters_bfin.h"
|
||||
#endif
|
||||
|
@ -114,6 +116,7 @@ void highpass(const spx_word16_t *x, spx_word16_t *y, int len, int filtID, spx_m
|
|||
|
||||
#ifdef FIXED_POINT
|
||||
|
||||
#ifndef OVERRIDE_SIGNAL_MUL
|
||||
/* FIXME: These functions are ugly and probably introduce too much error */
|
||||
void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
|
||||
{
|
||||
|
@ -123,6 +126,7 @@ void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
|
|||
y[i] = SHL32(MULT16_32_Q14(EXTRACT16(SHR32(x[i],7)),scale),7);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SPEEX_DISABLE_ENCODER
|
||||
void signal_div(const spx_word16_t *x, spx_word16_t *y, spx_word32_t scale, int len)
|
||||
|
|
|
@ -199,7 +199,7 @@ qmf_synth:
|
|||
bne 0b
|
||||
|
||||
sub r0, r8, r5 @ r0 = &xx1[N2]
|
||||
sub r1, r9, r5 @ r1 = %xx2[N2]
|
||||
sub r1, r9, r5 @ r1 = &xx2[N2]
|
||||
str r4, [sp, #-4] @ Stack N
|
||||
mov r4, r5
|
||||
str r4, [sp, #-8] @ Stack M
|
||||
|
@ -300,3 +300,28 @@ qmf_synth:
|
|||
bne 0b
|
||||
ldmia sp!, { r4-r11, pc } @ Exit
|
||||
|
||||
|
||||
/* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
 *
 * Multiplies every input sample by 'scale' and stores bits [45:14] of the
 * 64-bit signed product, i.e. y[i] = (x[i] * scale) >> 14.
 *
 * In:    r0 = x, r1 = y, r2 = scale, r3 = len
 * Uses:  r5-r8 (saved/restored, together with r4), r12 as scratch
 *
 * Samples are processed in groups of four. A len that is not a multiple of
 * four is rounded up to the next group, so x and y must have room for that.
 * Nothing is read or written when len <= 0.
 */
    .global signal_mul
signal_mul:
    stmdb   sp!, { r4-r8, lr }
    cmp     r3, #0                  @ Nothing to do for len <= 0
    ble     1f
0:
    ldmia   r0!, { r5-r8 }          @ Load four input samples
    smull   r5, r12, r2, r5         @ r12:r5 = scale * x[i]
    mov     r12, r12, lsl #18       @ Recombine upper and lower parts
    orr     r5, r12, r5, lsr #14    @ r5 = (r12:r5) >> 14
    smull   r6, r12, r2, r6
    mov     r12, r12, lsl #18
    orr     r6, r12, r6, lsr #14
    smull   r7, r12, r2, r7
    mov     r12, r12, lsl #18
    orr     r7, r12, r7, lsr #14
    smull   r8, r12, r2, r8
    mov     r12, r12, lsl #18
    orr     r8, r12, r8, lsr #14
    stmia   r1!, { r5-r8 }          @ Store four output samples
    subs    r3, r3, #4              @ Are we done?
    bgt     0b                      @ 'gt' (not 'ne') so the loop also ends
                                    @ when len is not a multiple of four
1:
    ldmia   sp!, { r4-r8, pc }      @ Exit
|
||||
|
||||
|
|
|
@ -312,3 +312,45 @@ qmf_synth:
|
|||
lea.l (44, %sp), %sp
|
||||
rts
|
||||
|
||||
|
||||
/* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len)
 *
 * Multiplies every input sample by 'scale' using the MAC unit in fractional
 * mode, four samples per iteration.
 *
 * Stack arguments (after the 20-byte register save area):
 *   (20+4, %sp)  = x       -> %a0
 *   (20+8, %sp)  = y       -> %a1
 *   (20+12, %sp) = scale   -> %d0
 *   (20+16, %sp) = len     -> %d1
 * Saves/restores %d2-%d6.
 *
 * Samples are processed in groups of four. A len that is not a multiple of
 * four is rounded up to the next group, so x and y must have room for that.
 * Nothing is read or written, and %macsr is left untouched, when len <= 0.
 *
 * NOTE(review): mac.l adds into %acc0-%acc3, so this presumably relies on
 * the accumulators being clear on entry (movclr.l leaves them clear again
 * on exit) - confirm against the callers.
 * NOTE(review): %macsr is set to 0 on exit rather than restored to its
 * previous value - confirm that integer mode is what callers expect.
 */
    .global signal_mul
signal_mul:
    lea.l   (-20, %sp), %sp
    movem.l %d2-%d6, (%sp)
    movem.l (20+4, %sp), %a0-%a1    | a0 = x, a1 = y
    movem.l (20+12, %sp), %d0-%d1   | d0 = scale, d1 = len
    tst.l   %d1                     | Nothing to do for len <= 0
    jle     1f
    moveq.l #0x20, %d6
    move.l  %d6, %macsr             | Set MAC unit to fractional mode
    asl.l   #3, %d0                 | Pre-scale 'scale'
    moveq.l #9, %d6                 | Shift count for the input samples
0:
    movem.l (%a0), %d2-%d5          | Fetch input
    asl.l   %d6, %d2                | Shift each value 9 to the left
    asl.l   %d6, %d3
    asl.l   %d6, %d4
    asl.l   %d6, %d5
    mac.l   %d2, %d0, %acc0         | Do multiplies
    mac.l   %d3, %d0, %acc1
    mac.l   %d4, %d0, %acc2
    mac.l   %d5, %d0, %acc3
    lea.l   (16, %a0), %a0          | Advance x by four samples
    movclr.l %acc0, %d2             | Read back results, clearing the accs
    movclr.l %acc1, %d3
    movclr.l %acc2, %d4
    movclr.l %acc3, %d5
    asl.l   #5, %d2                 | Adjust to proper format
    asl.l   #5, %d3
    asl.l   #5, %d4
    asl.l   #5, %d5
    movem.l %d2-%d5, (%a1)          | Save output
    lea.l   (16, %a1), %a1          | Advance y by four samples
    subq.l  #4, %d1
    jgt     0b                      | 'gt' (not 'ne') so the loop also ends
                                    | when len is not a multiple of four

    clr.l   %d0
    move.l  %d0, %macsr             | Set MAC unit back to integer mode
1:
    movem.l (%sp), %d2-%d6
    lea.l   (20, %sp), %sp
    rts
|
||||
|
||||
|
|
Loading…
Reference in a new issue