From 2640bdb262d07bf910a6ff614834d73713bdf4a4 Mon Sep 17 00:00:00 2001
From: Jens Arnold <amiconn@rockbox.org>
Date: Thu, 18 Oct 2007 22:37:33 +0000
Subject: [PATCH] APE codec: Assembler optimised vector math routines for
 coldfire. -c2000 is now usable at 130% realtime (was 107%), -c3000 is near
 realtime (93%, was 64%). -c1000 doesn't change.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15194 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/demac/libdemac/decoder.c          |  11 +-
 apps/codecs/demac/libdemac/filter.c           |  13 ++
 apps/codecs/demac/libdemac/vector_math16_cf.h | 180 ++++++++++++++++++
 3 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 apps/codecs/demac/libdemac/vector_math16_cf.h

diff --git a/apps/codecs/demac/libdemac/decoder.c b/apps/codecs/demac/libdemac/decoder.c
index 4f4a583d00..326e893ec4 100644
--- a/apps/codecs/demac/libdemac/decoder.c
+++ b/apps/codecs/demac/libdemac/decoder.c
@@ -32,12 +32,15 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 
 /* Statically allocate the filter buffers */
 
-static int16_t filterbuf32[(32*3 + HISTORY_SIZE) * 2] IBSS_ATTR;   /* 4480 bytes */
-static int16_t filterbuf256[(256*3 + HISTORY_SIZE) * 2] IBSS_ATTR;  /* 5120 bytes */
+static int16_t filterbuf32[(32*3 + HISTORY_SIZE) * 2]     /* 4480 bytes */
+               IBSS_ATTR __attribute__((aligned(16)));
+static int16_t filterbuf256[(256*3 + HISTORY_SIZE) * 2]   /* 5120 bytes */
+               IBSS_ATTR __attribute__((aligned(16)));
 
 /* This is only needed for "insane" files, and no Rockbox targets can
    hope to decode them in realtime anyway. */
-static int16_t filterbuf1280[(1280*3 + HISTORY_SIZE) * 2]; /* 17408 bytes */
+static int16_t filterbuf1280[(1280*3 + HISTORY_SIZE) * 2] /* 17408 bytes */
+               __attribute__((aligned(16)));
 
 void init_frame_decoder(struct ape_ctx_t* ape_ctx,
                         unsigned char* inbuffer, int* firstbyte,
@@ -163,7 +166,7 @@ int decode_chunk(struct ape_ctx_t* ape_ctx,
         }
 
         /* Now apply the predictor decoding */
-	predictor_decode_stereo(&ape_ctx->predictor,decoded0,decoded1,count);
+        predictor_decode_stereo(&ape_ctx->predictor,decoded0,decoded1,count);
 
         if (ape_ctx->bps == 8) {
             /* TODO: Handle 8-bit streams */
diff --git a/apps/codecs/demac/libdemac/filter.c b/apps/codecs/demac/libdemac/filter.c
index 131c152590..ac12959241 100644
--- a/apps/codecs/demac/libdemac/filter.c
+++ b/apps/codecs/demac/libdemac/filter.c
@@ -25,10 +25,15 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #include <string.h>
 #include <inttypes.h>
 
+#include "codecs.h" /* for Rockbox CPU definitions etc */
 #include "demac.h"
 #include "filter.h"
 
+#ifdef CPU_COLDFIRE
+#include "vector_math16_cf.h"
+#else
 #include "vector_math16.h"
+#endif
 
 struct filter_t {
     int16_t* coeffs; /* ORDER entries */
@@ -84,6 +89,10 @@ static inline void do_apply_filter_3980(struct filter_t* f, int32_t* data, int c
     int res;
     int absres;
 
+#ifdef PREPARE_SCALARPRODUCT
+    PREPARE_SCALARPRODUCT
+#endif
+
     while(count--)
     {
         res = FP_TO_INT(scalarproduct(f->delay - ORDER, f->coeffs));
@@ -135,6 +144,10 @@ static inline void do_apply_filter_3980(struct filter_t* f, int32_t* data, int c
 static inline void do_apply_filter_3970(struct filter_t* f, int32_t* data, int count)
 {
     int res;
+    
+#ifdef PREPARE_SCALARPRODUCT
+    PREPARE_SCALARPRODUCT
+#endif
 
     while(count--)
     {
diff --git a/apps/codecs/demac/libdemac/vector_math16_cf.h b/apps/codecs/demac/libdemac/vector_math16_cf.h
new file mode 100644
index 0000000000..85258c97a5
--- /dev/null
+++ b/apps/codecs/demac/libdemac/vector_math16_cf.h
@@ -0,0 +1,180 @@
+/*
+
+libdemac - A Monkey's Audio decoder
+
+$Id$
+
+Copyright (C) Dave Chapman 2007
+
+Coldfire vector math copyright (C) 2007 Jens Arnold
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
+
+*/
+
+/* Adds v2 to v1 in place: v1[i] += v2[i], each sum truncated to 16 bits.
+   One pass covers 16 elements; ORDER>>4 passes run when ORDER > 16. */
+static inline void vector_add(int16_t* v1, int16_t* v2)
+{
+#define ADDHALFREGS(s1, sum)            /* 's1' can be an A or D reg */    \
+        "move.l " #s1  ",   %%d4  \n"   /* 'sum' must be a D reg */        \
+        "add.l  " #sum ", " #s1  "\n"   /* 's1' and %%d4 are clobbered! */ \
+        "clr.w    %%d4            \n"   /* clr.w: old high half of 's1' only */ \
+        "add.l    %%d4  , " #sum "\n"   /* high halves added, no carry in */    \
+        "move.w " #s1  ", " #sum "\n"   /* low half from the 32-bit add */
+
+    asm volatile (
+#if ORDER > 16
+        "moveq.l %[cnt], %%d5        \n"
+    "1:                              \n"
+#endif
+        "movem.l (%[v1]), %%d0-%%d3  \n"
+        "movem.l (%[v2]), %%a0-%%a3  \n"
+        /* every register now holds two packed 16-bit elements */
+        ADDHALFREGS(%%a0, %%d0)
+        ADDHALFREGS(%%a1, %%d1)
+        ADDHALFREGS(%%a2, %%d2)
+        ADDHALFREGS(%%a3, %%d3)
+        "movem.l %%d0-%%d3, (%[v1])  \n"
+        "lea.l   (16, %[v1]), %[v1]  \n"
+        "movem.l (%[v1]), %%d0-%%d3  \n"
+        "lea.l   (16, %[v2]), %[v2]  \n"
+        "movem.l (%[v2]), %%a0-%%a3  \n"
+
+        ADDHALFREGS(%%a0, %%d0)
+        ADDHALFREGS(%%a1, %%d1)
+        ADDHALFREGS(%%a2, %%d2)
+        ADDHALFREGS(%%a3, %%d3)
+        "movem.l %%d0-%%d3, (%[v1])  \n"
+#if ORDER > 16
+        "lea.l   (16, %[v1]), %[v1]  \n"
+        "lea.l   (16, %[v2]), %[v2]  \n"
+        "subq.l  #1, %%d5            \n"
+        "bne.s   1b                  \n"
+#endif
+        : /* outputs */
+        [v1]"+a"(v1),
+        [v2]"+a"(v2)
+        : /* inputs */
+        [cnt]"n"(ORDER>>4)
+        : /* clobbers */
+        "d0", "d1", "d2", "d3", "d4", "d5",
+        "a0", "a1", "a2", "a3", "memory"
+    );
+}
+
+/* Subtracts v2 from v1 in place: v1[i] -= v2[i], each difference truncated to
+   16 bits.  One pass covers 16 elements; ORDER>>4 passes when ORDER > 16. */
+static inline void vector_sub(int16_t* v1, int16_t* v2)
+{
+#define SUBHALFREGS(min, sub, dif)      /* 'min' can be an A or D reg */     \
+        "move.l " #min ", " #dif "\n"   /* 'sub' and 'dif' must be D regs */ \
+        "sub.l  " #sub ", " #min "\n"   /* 'min' and 'sub' are clobbered! */ \
+        "clr.w  " #sub           "\n"   /* clr.w: high half of 'sub' only */    \
+        "sub.l  " #sub ", " #dif "\n"   /* high halves subtracted, no borrow */ \
+        "move.w " #min ", " #dif "\n"   /* low half from the 32-bit sub */
+
+    asm volatile (
+#if ORDER > 16
+        "moveq.l %[cnt], %%d5        \n"
+    "1:                              \n"
+#endif
+        "movem.l (%[v1]), %%a0-%%a3  \n"
+        "movem.l (%[v2]), %%d1-%%d4  \n"
+        /* d1-d3 are reused as 'dif' once spent as 'sub'; results end in d0-d3 */
+        SUBHALFREGS(%%a0, %%d1, %%d0)
+        SUBHALFREGS(%%a1, %%d2, %%d1)
+        SUBHALFREGS(%%a2, %%d3, %%d2)
+        SUBHALFREGS(%%a3, %%d4, %%d3)
+        "movem.l %%d0-%%d3, (%[v1])  \n"
+        "lea.l   (16, %[v1]), %[v1]  \n"
+        "movem.l (%[v1]), %%a0-%%a3  \n"
+        "lea.l   (16, %[v2]), %[v2]  \n"
+        "movem.l (%[v2]), %%d1-%%d4  \n"
+
+        SUBHALFREGS(%%a0, %%d1, %%d0)
+        SUBHALFREGS(%%a1, %%d2, %%d1)
+        SUBHALFREGS(%%a2, %%d3, %%d2)
+        SUBHALFREGS(%%a3, %%d4, %%d3)
+        "movem.l %%d0-%%d3, (%[v1])  \n"
+#if ORDER > 16
+        "lea.l   (16, %[v1]), %[v1]  \n"
+        "lea.l   (16, %[v2]), %[v2]  \n"
+        "subq.l  #1, %%d5            \n"
+        "bne.s   1b                  \n"
+#endif
+        : /* outputs */
+        [v1]"+a"(v1),
+        [v2]"+a"(v2)
+        : /* inputs */
+        [cnt]"n"(ORDER>>4)
+        : /* clobbers */
+        "d0", "d1", "d2", "d3", "d4", "d5",
+        "a0", "a1", "a2", "a3", "memory"
+    );
+}
+
+#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
+
+/* Needs EMAC in signed integer mode! Returns sum(v1[i] * v2[i]) from %acc0:
+   16 terms if ORDER <= 16, else 32 per pass (ORDER>>5 passes if ORDER > 32) */
+static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
+{
+    int res = 0;
+
+#define MACBLOCK4                                        \
+        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n" \
+        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n" \
+        "mac.w   %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n" \
+        "mac.w   %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
+
+    asm volatile (
+#if ORDER > 32
+        "moveq.l %[cnt], %[res]                      \n" /* pass counter */
+#endif
+        "move.l  (%[v1])+, %%d0                      \n"
+        "move.l  (%[v2])+, %%d1                      \n"
+    "1:                                              \n"
+#if ORDER > 16
+        MACBLOCK4
+        MACBLOCK4
+        MACBLOCK4
+        MACBLOCK4
+#endif
+        MACBLOCK4
+        MACBLOCK4
+        MACBLOCK4
+        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
+        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d3, %%acc0\n"
+#if ORDER > 32 /* also preloads the next pass; last pass reads past ORDER */
+        "mac.w   %%d2u, %%d3u, (%[v1])+, %%d0, %%acc0\n"
+        "mac.w   %%d2l, %%d3l, (%[v2])+, %%d1, %%acc0\n"
+        "subq.l  #1, %[res]                          \n"
+        "bne.w   1b                                  \n"
+#else
+        "mac.w   %%d2u, %%d3u, %%acc0                \n"
+        "mac.w   %%d2l, %%d3l, %%acc0                \n"
+#endif
+        "movclr.l %%acc0, %[res]                     \n"
+        : /* outputs */
+        [v1]"+a"(v1), [v2]"+a"(v2),
+        [res]"=&d"(res)
+        : /* inputs */
+        [cnt]"n"(ORDER>>5)
+        : /* clobbers; "memory" as the asm reads v1[] and v2[] on its own */
+        "d0", "d1", "d2", "d3", "memory"
+    );
+    return res;
+}