Add some asm for the MDCT on ColdFire; speeds up Vorbis decoding by about 0.3 MHz.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25984 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Nils Wallménius 2010-05-13 08:03:42 +00:00
parent 182d13e3ab
commit 88bd8e74c4

View file

@ -293,6 +293,32 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
in_r = output+n2+n4-8; in_r = output+n2+n4-8;
while(out_r<out_r2) while(out_r<out_r2)
{ {
#if defined CPU_COLDFIRE
asm volatile(
"movem.l (%[in_r]), %%d0-%%d7\n\t"
"movem.l %%d0-%%d7, (%[out_r2])\n\t"
"neg.l %%d7\n\t"
"move.l %%d7, (%[out_r])+\n\t"
"neg.l %%d6\n\t"
"move.l %%d6, (%[out_r])+\n\t"
"neg.l %%d5\n\t"
"move.l %%d5, (%[out_r])+\n\t"
"neg.l %%d4\n\t"
"move.l %%d4, (%[out_r])+\n\t"
"neg.l %%d3\n\t"
"move.l %%d3, (%[out_r])+\n\t"
"neg.l %%d2\n\t"
"move.l %%d2, (%[out_r])+\n\t"
"lea.l (-8*4, %[in_r]), %[in_r]\n\t"
"neg.l %%d1\n\t"
"move.l %%d1, (%[out_r])+\n\t"
"lea.l (-8*4, %[out_r2]), %[out_r2]\n\t"
"neg.l %%d0\n\t"
"move.l %%d0, (%[out_r])+\n\t"
: [in_r] "+a" (in_r), [out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
:
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory" );
#else
out_r[0] = -(out_r2[7] = in_r[7]); out_r[0] = -(out_r2[7] = in_r[7]);
out_r[1] = -(out_r2[6] = in_r[6]); out_r[1] = -(out_r2[6] = in_r[6]);
out_r[2] = -(out_r2[5] = in_r[5]); out_r[2] = -(out_r2[5] = in_r[5]);
@ -304,6 +330,7 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
in_r -= 8; in_r -= 8;
out_r += 8; out_r += 8;
out_r2 -= 8; out_r2 -= 8;
#endif
} }
in_r = output + n2+n4; in_r = output + n2+n4;
in_r2 = output + n-4; in_r2 = output + n-4;
@ -311,6 +338,29 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
out_r2 = output + n2 + n4 - 4; out_r2 = output + n2 + n4 - 4;
while(in_r<in_r2) while(in_r<in_r2)
{ {
#if defined CPU_COLDFIRE
asm volatile(
"movem.l (%[in_r]), %%d0-%%d3\n\t"
"movem.l %%d0-%%d3, (%[out_r])\n\t"
"movem.l (%[in_r2]), %%d4-%%d7\n\t"
"movem.l %%d4-%%d7, (%[out_r2])\n\t"
"move.l %%d0, %%a3\n\t"
"move.l %%d3, %%d0\n\t"
"move.l %%d1, %%d3\n\t"
"movem.l %%d0/%%d2-%%d3/%%a3, (%[in_r2])\n\t"
"move.l %%d7, %%d1\n\t"
"move.l %%d6, %%d2\n\t"
"move.l %%d5, %%d3\n\t"
"movem.l %%d1-%%d4, (%[in_r])\n\t"
"lea.l (4*4, %[in_r]), %[in_r]\n\t"
"lea.l (-4*4, %[in_r2]), %[in_r2]\n\t"
"lea.l (4*4, %[out_r]), %[out_r]\n\t"
"lea.l (-4*4, %[out_r2]), %[out_r2]\n\t"
: [in_r] "+a" (in_r), [in_r2] "+a" (in_r2),
[out_r] "+a" (out_r), [out_r2] "+a" (out_r2)
:
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a3", "memory", "cc" );
#else
register fixed32 t0,t1,t2,t3; register fixed32 t0,t1,t2,t3;
register fixed32 s0,s1,s2,s3; register fixed32 s0,s1,s2,s3;
@ -344,6 +394,7 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
in_r2 -= 4; in_r2 -= 4;
out_r += 4; out_r += 4;
out_r2 -= 4; out_r2 -= 4;
#endif
} }
} }
#else #else
@ -377,7 +428,7 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
"stmia %[out_r]!, {r0-r3,r5-r8}\n\t" "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
: [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2) : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
: :
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
} }
in_r = output + n2+n4; in_r = output + n2+n4;
in_r2 = output + n; in_r2 = output + n;
@ -401,7 +452,7 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
: :
[in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2) [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
: :
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "memory" );
} }
} }
#endif #endif