Vorbis opts: keep floor1 lookup table in IRAM.

Slightly faster 16-bit clipping function. Misc: changed tabs for spaces to conform with Rockbox coding standards. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6608 a1c6a512-1295-4272-9138-f99709370657
2005-06-08 13:09:30 +00:00 · 2005-06-08 13:09:30 +00:00 · 0a3f8e0924
commit 0a3f8e0924
parent ead61c1d18
11 changed files with 863 additions and 835 deletions
--- a/apps/codecs/Tremor/asm_mcf5249.h
+++ b/apps/codecs/Tremor/asm_mcf5249.h
@ -24,13 +24,12 @@
 #ifndef _V_WIDE_MATH
 #define _V_WIDE_MATH

-//#define MB() asm volatile ("" : : : "memory")
 #define MB()

 static inline void mcf5249_init_mac(void) {
  int r;
-  asm volatile ("move.l #0x20, %%macsr;"  // frac, truncate, no saturation
-		"movclr.l %%acc0, %[r];"  // clear accumulators
+  asm volatile ("move.l #0x20, %%macsr;"  /* frac, truncate, no saturation */
+                "movclr.l %%acc0, %[r];"  /* clear accumulators */
                "move.l %%acc0, %%acc1;"
                "move.l %%acc0, %%acc2;"
                "move.l %%acc0, %%acc3;"
@ -38,19 +37,18 @@ static inline void mcf5249_init_mac(void) {
 }

 static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
-  ogg_int32_t r;
-  asm volatile ("mac.l %[x], %[y], %%acc0;"  // multiply into acc
-		"movclr.l %%acc0, %[r];"     // move & clear acc
-		"asr.l #1, %[r];"            // no overflow test
-		: [r] "=d" (r)
-		: [x] "r" (x), [y] "r" (y)
+  asm volatile ("mac.l %[x], %[y], %%acc0;"    /* multiply & shift  */
+                "movclr.l %%acc0, %[x];"       /* move & clear acc */
+                "asr.l #1, %[x];"              /* no overflow test */
+                : [x] "+&d" (x)
+                : [y] "r" (y)
                : "cc");
-  return r;
+  return x;
 }

 static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
-  asm volatile ("mac.l %[x], %[y], %%acc0;" // multiply
-		"movclr.l %%acc0, %[x];"    // move and clear
+  asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply */
+                "movclr.l %%acc0, %[x];"    /* move and clear */
                : [x] "+&r" (x)
                : [y] "r" (y)
                : "cc");
@ -60,14 +58,14 @@ static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {

 static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
  ogg_int32_t r;
-  asm volatile ("mac.l %[x], %[y], %%acc0;"  // multiply
-		"movclr.l %%acc0, %[r];"     // get higher half
-		"mulu.l %[y], %[x];"         // get lower half
-		"asl.l #8, %[r];"            // hi << 17
+  asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
+                "movclr.l %%acc0, %[r];"     /* get higher half */
+                "mulu.l %[y], %[x];"         /* get lower half */
+                "asl.l #8, %[r];"            /* hi<<16, plus one free */
                "asl.l #8, %[r];"
-		"lsr.l #8, %[x];"            // (unsigned)lo >> 15
+                "lsr.l #8, %[x];"            /* (unsigned)lo >> 15 */
                "lsr.l #7, %[x];"
-		"or.l %[x], %[r];"           // or
+                "or.l %[x], %[r];"           /* logical-or results */
                : [r] "=&d" (r), [x] "+d" (x)
                : [y] "d" (y)
                : "cc");
@ -116,10 +114,28 @@ void XNPROD31(ogg_int32_t  a, ogg_int32_t  b,



-/* is there no better way of doing this using the MAC? */
+
+#if 1 /* Canonical definition */
 #define XPROD32(_a, _b, _t, _v, _x, _y)         \
  { (_x)=MULT32(_a,_t)+MULT32(_b,_v);           \
    (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
+#else
+/* Thom Johansen's suggestion; this could lose the lsb by overflow,
+   but does it matter in practice? */
+#define XPROD32(_a, _b, _t, _v, _x, _y)     \
+  asm volatile ("mac.l %[a], %[t], %%acc0;" \
+                "mac.l %[b], %[v], %%acc0;" \
+                "mac.l %[b], %[t], %%acc1;" \
+                "msac.l %[a], %[v], %%acc1;" \
+                "movclr.l %%acc0, %[x];" \
+                "asr.l #1, %[x];" \
+                "movclr.l %%acc1, %[y];" \
+                "asr.l #1, %[y];" \
+                : [x] "=&d" (_x), [y] "=&d" (_y) \
+                : [a] "r" (_a), [b] "r" (_b), \
+                  [t] "r" (_t), [v] "r" (_v) \
+                : "cc");
+#endif 


 /* asm versions of vector multiplication for window.c */
@ -127,41 +143,41 @@ void XNPROD31(ogg_int32_t  a, ogg_int32_t  b,
 static inline 
 void mcf5249_vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
 {
-  asm volatile ("movem.l (%[d]), %%d0-%%d3;"  // loop start
-		"movem.l (%[w]), %%a0-%%a3;"  // pre-fetch registers
+  asm volatile ("movem.l (%[d]), %%d0-%%d3;"  /* loop start */
+                "movem.l (%[w]), %%a0-%%a3;"  /* pre-fetch registers */
                "lea.l (4*4, %[w]), %[w];"
-		"bra 1f;"               // jump to loop condition
-		"0:" // loop body
-		// multiply and load next window values
+                "bra 1f;"               /* jump to loop condition */
+                "0:" /* loop body */
+                /* multiply and load next window values */
                "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
                "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
                "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
                "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"              
-		"movclr.l %%acc0, %%d0;"  // get the products
+                "movclr.l %%acc0, %%d0;"  /* get the products */
                "movclr.l %%acc1, %%d1;"
                "movclr.l %%acc2, %%d2;"
                "movclr.l %%acc3, %%d3;"
-		// store and advance
+                /* store and advance */
                "movem.l %%d0-%%d3, (%[d]);"  
                "lea.l (4*4, %[d]), %[d];"
                "movem.l (%[d]), %%d0-%%d3;"
-		"subq.l #4, %[n];"     // done 4 elements
+                "subq.l #4, %[n];"     /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
-		// multiply final elements
+                /* multiply final elements */
                "tst.l %[n];"
-		"beq 1f;"      // n=0
+                "beq 1f;"      /* n=0 */
                "mac.l %%d0, %%a0, %%acc0;"
                "movclr.l %%acc0, %%d0;"
                "move.l %%d0, (%[d])+;"
                "subq.l #1, %[n];"
-		"beq 1f;"     // n=1
+                "beq 1f;"     /* n=1 */
                "mac.l %%d1, %%a1, %%acc0;"
                "movclr.l %%acc0, %%d1;"
                "move.l %%d1, (%[d])+;"
                "subq.l #1, %[n];"
-		"beq 1f;"     // n=2
-		// otherwise n = 3
+                "beq 1f;"     /* n=2 */
+                /* otherwise n = 3 */
                "mac.l %%d2, %%a2, %%acc0;"
                "movclr.l %%acc0, %%d2;"
                "move.l %%d2, (%[d])+;"
@ -174,41 +190,41 @@ void mcf5249_vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
 static inline 
 void mcf5249_vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
 {
-  asm volatile ("lea.l (-3*4, %[w]), %[w];"     // loop start
-		"movem.l (%[d]), %%d0-%%d3;"    // pre-fetch registers
+  asm volatile ("lea.l (-3*4, %[w]), %[w];"     /* loop start */
+                "movem.l (%[d]), %%d0-%%d3;"    /* pre-fetch registers */
                "movem.l (%[w]), %%a0-%%a3;"
-		"bra 1f;"               // jump to loop condition
-		"0:" // loop body
-		// multiply and load next window value
+                "bra 1f;"               /* jump to loop condition */
+                "0:" /* loop body */
+                /* multiply and load next window value */
                "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
                "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
                "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
                "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"              
-		"movclr.l %%acc0, %%d0;"  // get the products
+                "movclr.l %%acc0, %%d0;"  /* get the products */
                "movclr.l %%acc1, %%d1;"
                "movclr.l %%acc2, %%d2;"
                "movclr.l %%acc3, %%d3;"
-		// store and advance
+                /* store and advance */
                "movem.l %%d0-%%d3, (%[d]);"  
                "lea.l (4*4, %[d]), %[d];"
                "movem.l (%[d]), %%d0-%%d3;"
-		"subq.l #4, %[n];"     // done 4 elements
+                "subq.l #4, %[n];"     /* done 4 elements */
                "1: cmpi.l #4, %[n];"
                "bge 0b;"
-		// multiply final elements
+                /* multiply final elements */
                "tst.l %[n];"
-		"beq 1f;"      // n=0
+                "beq 1f;"      /* n=0 */
                "mac.l %%d0, %%a3, %%acc0;"
                "movclr.l %%acc0, %%d0;"
                "move.l %%d0, (%[d])+;"
                "subq.l #1, %[n];"
-		"beq 1f;"     // n=1
+                "beq 1f;"     /* n=1 */
                "mac.l %%d1, %%a2, %%acc0;"
                "movclr.l %%acc0, %%d1;"
                "move.l %%d1, (%[d])+;"
                "subq.l #1, %[n];"
-		"beq 1f;"     // n=2
-		// otherwise n = 3
+                "beq 1f;"     /* n=2 */
+                /* otherwise n = 3 */
                "mac.l %%d2, %%a1, %%acc0;"
                "movclr.l %%acc0, %%d2;"
                "move.l %%d2, (%[d])+;"
@ -226,23 +242,23 @@ void mcf5249_vect_zero(ogg_int32_t *ptr, int n)
                "clr.l %%d1;"
                "clr.l %%d2;"
                "clr.l %%d3;"
-		// loop start
+                /* loop start */
                "tst.l %[n];"
                "bra 1f;"
                "0: movem.l %%d0-%%d3, (%[ptr]);"
                "lea (4*4, %[ptr]), %[ptr];"
                "subq.l #4, %[n];"
                "1: bgt 0b;"
-		// remaing elements
+                /* remaining elements */
                "tst.l %[n];"
-		"beq 1f;"      // n=0
+                "beq 1f;"      /* n=0 */
                "clr.l (%[ptr])+;"
                "subq.l #1, %[n];"
-		"beq 1f;"     // n=1
+                "beq 1f;"     /* n=1 */
                "clr.l (%[ptr])+;"
                "subq.l #1, %[n];"
-		"beq 1f;"     // n=2
-		// otherwise n = 3
+                "beq 1f;"     /* n=2 */
+                /* otherwise n = 3 */
                "clr.l (%[ptr])+;"
                "1:"
                : [n] "+d" (n), [ptr] "+a" (ptr)
@ -250,6 +266,16 @@ void mcf5249_vect_zero(ogg_int32_t *ptr, int n)
                : "%d0","%d1","%d2","%d3","cc","memory");
 }

+#endif
+
+#ifndef _V_CLIP_MATH
+#define _V_CLIP_MATH
+
+/* this is portable C and simple; why not use this as default? */
+static inline ogg_int32_t CLIP_TO_15(register ogg_int32_t x) {
+  register ogg_int32_t hi=32767, lo=-32768;
+  return (x>=hi ? hi : (x<=lo ? lo : x));
+}

 #endif
 #endif
--- a/apps/codecs/Tremor/floor1.c
+++ b/apps/codecs/Tremor/floor1.c
@ -216,7 +216,8 @@ static int render_point(int x0,int x1,int y0,int y1,int x){
 #  define XdB(n) (n)
 #endif

-static ogg_int32_t FLOOR_fromdB_LOOKUP[256] ={
+/* keep the floor lookup table in fast IRAM */
+static ogg_int32_t FLOOR_fromdB_LOOKUP[256] IDATA_ATTR = {
  XdB(0x000000e5), XdB(0x000000f4), XdB(0x00000103), XdB(0x00000114),
  XdB(0x00000126), XdB(0x00000139), XdB(0x0000014e), XdB(0x00000163),
  XdB(0x0000017a), XdB(0x00000193), XdB(0x000001ad), XdB(0x000001c9),
--- a/apps/codecs/Tremor/mdct.c
+++ b/apps/codecs/Tremor/mdct.c
@ -342,7 +342,7 @@ void mdct_backward(int n, DATA_TYPE *in, DATA_TYPE *out) {
  int step;

 #if CONFIG_CPU == MCF5249
-  mcf5249_init_mac();  /* should be redundant */
+  /* mcf5249_init_mac(); */  /* should be redundant */
 #endif

  for (shift=6;!(n&(1<<shift));shift++);
--- a/apps/codecs/Tremor/mdct_lookup.h
+++ b/apps/codecs/Tremor/mdct_lookup.h
@ -17,7 +17,8 @@
  
 #include "os_types.h"

-/* keep lookup tables in fast IRAM */
+/* we keep the most used sin cosine table in fast IRAM;
+   unfortunately, we don't have the space for both tables */

 /* {sin(2*i*PI/4096), cos(2*i*PI/4096)}, with i = 0 to 512 */
 static LOOKUP_T sincos_lookup0[1026] IDATA_ATTR = {
--- a/apps/codecs/Tremor/misc.h
+++ b/apps/codecs/Tremor/misc.h
@ -37,7 +37,7 @@ void* alloca(size_t size);
  
 #ifndef  _LOW_ACCURACY_
 /* 64 bit multiply */
-//#include <sys/types.h>
+/* #include <sys/types.h> */

 #if BYTE_ORDER==LITTLE_ENDIAN
 union magic {
--- a/apps/codecs/Tremor/synthesis.c
+++ b/apps/codecs/Tremor/synthesis.c
@ -26,8 +26,8 @@


 /* IRAM buffer keep the block pcm data; only for windows size upto 2048
-   for space restrictions. No real compromise, larger window sizes
-   are only used for very low quality settings (q<0?) */
+   for space restrictions. 
+   libVorbis 1.1 Oggenc doesn't use larger windows anyway. */
 /* max 2 channels on the ihp-1xx (stereo), 2048 samples (2*2048*4=16Kb)  */
 #define IRAM_PCM_END      2048    
 #define CHANNELS          2          
--- a/apps/codecs/Tremor/window.c
+++ b/apps/codecs/Tremor/window.c
@ -68,7 +68,7 @@ void _vorbis_apply_window(ogg_int32_t *d,const void *window_p[2],
  long rightend=rightbegin+rn/2;

 #if CONFIG_CPU == MCF5249
-  mcf5249_init_mac();   /* shouldn't be needed, but just in case */
+  /* mcf5249_init_mac(); */ /* shouldn't be needed, but just in case */
  mcf5249_vect_zero(&d[0], leftbegin);
  mcf5249_vect_mult_fw(&d[leftbegin], &window[lW][0], leftend-leftbegin);
  mcf5249_vect_mult_bw(&d[rightbegin], &window[nW][rn/2-1], rightend-rightbegin);
--- a/apps/codecs/Tremor/window_lookup.h
+++ b/apps/codecs/Tremor/window_lookup.h
@ -18,8 +18,10 @@

 #include "os_types.h"

-/* keep small window tables in fast IRAM */
-static LOOKUP_T vwin64[32] IDATA_ATTR = {
+/* Oggenc 1.1 seems to use window sizes 256 and 2048 exclusively, so
+   keep those most common sizes in fast IRAM; since the space is
+   available, sizes 128 and 512 are kept there as well */
+static LOOKUP_T vwin64[32]   = {
  X(0x001f0003), X(0x01168c98), X(0x030333c8), X(0x05dfe3a4),
  X(0x09a49562), X(0x0e45df18), X(0x13b47ef2), X(0x19dcf676),
  X(0x20a74d83), X(0x27f7137c), X(0x2fabb05a), X(0x37a1105a),
@ -151,7 +153,7 @@ static LOOKUP_T vwin512[256] IDATA_ATTR  = {
  X(0x7ffffdcd), X(0x7fffff6d), X(0x7fffffed), X(0x7fffffff),
 };

-static LOOKUP_T vwin1024[512] IDATA_ATTR = {
+static LOOKUP_T vwin1024[512]  = {
  X(0x00001f02), X(0x0001170e), X(0x00030724), X(0x0005ef40),
  X(0x0009cf59), X(0x000ea767), X(0x0014775e), X(0x001b3f2e),
  X(0x0022fec8), X(0x002bb618), X(0x00356508), X(0x00400b81),
@ -541,8 +543,6 @@ static LOOKUP_T vwin2048[1024] IDATA_ATTR = {
  X(0x7ffffffe), X(0x7fffffff), X(0x7fffffff), X(0x7fffffff),
 };

-/* The remaining large window sizes are used only for very low
-   quality Vorbis files so we don't bother to put them in IRAM */
 static LOOKUP_T vwin4096[2048] = {
  X(0x000001f0), X(0x00001171), X(0x00003072), X(0x00005ef5),
  X(0x00009cf8), X(0x0000ea7c), X(0x00014780), X(0x0001b405),