From 3c52395b570d5abc394b5a1320d642057e6f4174 Mon Sep 17 00:00:00 2001 From: Dave Hooper Date: Sun, 21 Feb 2010 21:14:40 +0000 Subject: [PATCH] Get a few more % speedup on ARM (measured on ipod video) - improve imdct full final symmetries using ldm/stm and simple register swapping. Also, add more comments (and improve/update some of the existing ones) regarding the layout of the imdct_half and the imdct_full git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24819 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/lib/SOURCES | 4 +- apps/codecs/lib/asm_arm.h | 13 ++-- apps/codecs/lib/codeclib.h | 2 +- apps/codecs/lib/mdct.c | 131 ++++++++++++++++++++++++++++++------- 4 files changed, 116 insertions(+), 34 deletions(-) diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES index da77f97d30..438cd1fc62 100644 --- a/apps/codecs/lib/SOURCES +++ b/apps/codecs/lib/SOURCES @@ -4,14 +4,14 @@ fixedpoint.c /* OLD MDCT */ /* (when all other codecs are remediated this can be remoed) */ -mdct2.c +/* mdct2.c */ mdct_lookup.c fft-ffmpeg.c mdct.c #ifdef CPU_ARM -mdct_arm.S +/*mdct_arm.S*/ setjmp_arm.S ../../../firmware/target/arm/support-arm.S #endif diff --git a/apps/codecs/lib/asm_arm.h b/apps/codecs/lib/asm_arm.h index 4f31f80c3e..9dcbcef755 100644 --- a/apps/codecs/lib/asm_arm.h +++ b/apps/codecs/lib/asm_arm.h @@ -226,14 +226,11 @@ void vect_mult_bw(int32_t *data, int32_t *window, int n) #define _V_CLIP_MATH static inline int32_t CLIP_TO_15(int32_t x) { - int tmp; - asm volatile("subs %1, %0, #32768\n\t" - "movpl %0, #0x7f00\n\t" - "orrpl %0, %0, #0xff\n" - "adds %1, %0, #32768\n\t" - "movmi %0, #0x8000" - : "+r"(x),"=r"(tmp) - : + const int32_t mask = 0xffff7fff; + asm volatile("teq %0,%0,asr #31\n\t" + "eorne %0,%1,%0,asr #31\n\t" + : "+r"(x) + : "r" (mask) : "cc"); return(x); } diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h index 817d86a6a3..32a4696b9d 100644 --- a/apps/codecs/lib/codeclib.h +++ b/apps/codecs/lib/codeclib.h @@ -65,7 +65,7 @@ 
void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con /*MDCT library functions*/ /* -1- Tremor mdct */ -extern void mdct_backward(int n, int32_t *in, int32_t *out); +/* extern void mdct_backward(int n, int32_t *in, int32_t *out); */ /* -2- ffmpeg fft-based mdct */ extern void ff_imdct_half(unsigned int nbits, int32_t *output, const int32_t *input); extern void ff_imdct_calc(unsigned int nbits, int32_t *output, const int32_t *input); diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c index aefd553f25..9747bd14d9 100644 --- a/apps/codecs/lib/mdct.c +++ b/apps/codecs/lib/mdct.c @@ -72,8 +72,9 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) For postrotation, the factors are sin,cos(2PI*(i+1/4)/N) Therefore, prerotation can immediately reuse the same twiddles as fft - (for postrotation it's still a bit complex, so this is still using - an mdct-local set of twiddles to do that part) + (for postrotation it's still a bit complex, we reuse the fft trig tables + where we can, or a special table for N=2048, or interpolate between + trig tables for N>2048) */ const int32_t *T = sincos_lookup0; const int step = 2<<(12-nbits); @@ -248,25 +249,49 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input) * <----input----> * <-----------output-----------> * + * The result of ff_imdct_half is to put the 'half' imdct here + * + * N/2 N-1 + * <--half imdct--> + * + * We want it here for the full imdct: + * N/4 3N/4-1 + * <--------------> + * + * In addition we need to apply two symmetries to get the full imdct: + * + * <AAAAAA> <DDDDDD> + * <BBBBBB><CCCCCC> + * + * D is a reflection of C + * A is a reflection of B (but with sign flipped) + * + * We process the symmetries at the same time as we 'move' the half imdct + * from [N/2,N-1] to [N/4,3N/4-1] + * + * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1] + * This would require being able to use revtab 'inplace' (since the input + * and output of
imdct_half would then overlap somewhat) */ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT; +#ifndef CPU_ARM void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) { const int n = (1<<nbits); const int n2 = (n>>1); const int n4 = (n>>2); + /* tell imdct_half to put the output in [N/2..3N/4-1] i.e. output+n2 */ ff_imdct_half(nbits,output+n2,input); - /* reflect the half imdct into the full N samples */ - /* TODO: this could easily be optimised more! */ fixed32 * in_r, * in_r2, * out_r, * out_r2; + /* Copy BBBB to AAAA, reflected and sign-flipped. + Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */ out_r = output; out_r2 = output+n2-8; in_r = output+n2+n4-8; while(out_r <- ^b ^c -> <- ^d - // - // #1: copy from ^c to ^a - // #2: copy from ^d to ^b - // #3: swap ^c and ^d in place - // - // #1 pt1 : load 4 words from ^c. + /* Copy and reflect CCCC to DDDD. Because CCCC is already where + we actually want to put DDDD this is a bit complicated. + * So simultaneously do the following things: + * 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1] + * 2. reflect range from [n2+n4 .. n-1] inplace + * + * [ | ] + * ^a -> <- ^b ^c -> <- ^d + * + * #1: copy from ^c to ^a + * #2: copy from ^d to ^b + * #3: swap ^c and ^d in place + */ + /* #1 pt1 : load 4 words from ^c.
*/ t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3]; - // #1 pt2 : write to ^a + /* #1 pt2 : write to ^a */ out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3; - // #2 pt1 : load 4 words from ^d + /* #2 pt1 : load 4 words from ^d */ s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3]; - // #2 pt2 : write to ^b + /* #2 pt2 : write to ^b */ out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3; - // #3 pt1 : write words from #2 to ^c + /* #3 pt1 : write words from #2 to ^c */ in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0; - // #3 pt2 : write words from #1 to ^d + /* #3 pt2 : write words from #1 to ^d */ in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0; in_r += 4; @@ -319,6 +345,65 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) out_r2 -= 4; } } +#else +/* Follows the same structure as the canonical version above */ +void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) +{ + const int n = (1<<nbits); + const int n2 = (n>>1); + const int n4 = (n>>2); + + ff_imdct_half(nbits,output+n2,input); + + fixed32 * in_r, * in_r2, * out_r, * out_r2; + + out_r = output; + out_r2 = output+n2; + in_r = output+n2+n4; + while(out_r