rockbox/apps/codecs/Tremor/asm_mcf5249.h
Daniel Stenberg 2acc0ac542 Updated our source code header to explicitly mention that we are GPL v2 or
later. We still need to hunt down snippets used that are not. 1324 modified
files...
http://www.rockbox.org/mail/archive/rockbox-dev-archive-2008-06/0060.shtml


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@17847 a1c6a512-1295-4272-9138-f99709370657
2008-06-28 18:10:04 +00:00

327 lines
11 KiB
C

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
*
* Copyright (C) 2005 by Pedro Vasconcelos
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
/* asm routines for wide math on the MCF5249 */
#include "os_types.h"
#if defined(CPU_COLDFIRE)
/* attribute for 16-byte alignment */
#define LINE_ATTR __attribute__ ((aligned (16)))
#ifndef _V_WIDE_MATH
#define _V_WIDE_MATH
#define MB()
static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply & shift */
"movclr.l %%acc0, %[x];" /* move & clear acc */
"asr.l #1, %[x];" /* no overflow test */
: [x] "+&d" (x)
: [y] "r" (y)
: "cc");
return x;
}
static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply */
"movclr.l %%acc0, %[x];" /* move and clear */
: [x] "+&r" (x)
: [y] "r" (y)
: "cc");
return x;
}
static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
ogg_int32_t r;
asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply */
"mulu.l %[y], %[x];" /* get lower half, avoid emac stall */
"movclr.l %%acc0, %[r];" /* get higher half */
"asl.l #8, %[r];" /* hi<<16, plus one free */
"asl.l #8, %[r];"
"lsr.l #8, %[x];" /* (unsigned)lo >> 15 */
"lsr.l #7, %[x];"
"or.l %[x], %[r];" /* logical-or results */
: [r] "=&d" (r), [x] "+d" (x)
: [y] "d" (y)
: "cc");
return r;
}
static inline
void XPROD31(ogg_int32_t a, ogg_int32_t b,
ogg_int32_t t, ogg_int32_t v,
ogg_int32_t *x, ogg_int32_t *y)
{
asm volatile ("mac.l %[a], %[t], %%acc0;"
"mac.l %[b], %[v], %%acc0;"
"mac.l %[b], %[t], %%acc1;"
"msac.l %[a], %[v], %%acc1;"
"movclr.l %%acc0, %[a];"
"move.l %[a], (%[x]);"
"movclr.l %%acc1, %[a];"
"move.l %[a], (%[y]);"
: [a] "+&r" (a)
: [x] "a" (x), [y] "a" (y),
[b] "r" (b), [t] "r" (t), [v] "r" (v)
: "cc", "memory");
}
static inline
void XNPROD31(ogg_int32_t a, ogg_int32_t b,
ogg_int32_t t, ogg_int32_t v,
ogg_int32_t *x, ogg_int32_t *y)
{
asm volatile ("mac.l %[a], %[t], %%acc0;"
"msac.l %[b], %[v], %%acc0;"
"mac.l %[b], %[t], %%acc1;"
"mac.l %[a], %[v], %%acc1;"
"movclr.l %%acc0, %[a];"
"move.l %[a], (%[x]);"
"movclr.l %%acc1, %[a];"
"move.l %[a], (%[y]);"
: [a] "+&r" (a)
: [x] "a" (x), [y] "a" (y),
[b] "r" (b), [t] "r" (t), [v] "r" (v)
: "cc", "memory");
}
#if 0 /* canonical Tremor definition */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
{ (_x)=MULT32(_a,_t)+MULT32(_b,_v); \
(_y)=MULT32(_b,_t)-MULT32(_a,_v); }
#endif
/* this could lose the LSB by overflow, but i don't think it'll ever happen.
if anyone think they can hear a bug caused by this, please try the above
version. */
#define XPROD32(_a, _b, _t, _v, _x, _y) \
asm volatile ("mac.l %[a], %[t], %%acc0;" \
"mac.l %[b], %[v], %%acc0;" \
"mac.l %[b], %[t], %%acc1;" \
"msac.l %[a], %[v], %%acc1;" \
"movclr.l %%acc0, %[x];" \
"asr.l #1, %[x];" \
"movclr.l %%acc1, %[y];" \
"asr.l #1, %[y];" \
: [x] "=&d" (_x), [y] "=&d" (_y) \
: [a] "r" (_a), [b] "r" (_b), \
[t] "r" (_t), [v] "r" (_v) \
: "cc");
#ifndef _V_VECT_OPS
#define _V_VECT_OPS
/* asm versions of vector operations for block.c, window.c */
/* assumes MAC is initialized & accumulators cleared */
static inline
void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n)
{
/* align to 16 bytes */
while(n>0 && (int)x&16) {
*x++ += *y++;
n--;
}
asm volatile ("bra 1f;"
"0:" /* loop start */
"movem.l (%[x]), %%d0-%%d3;" /* fetch values */
"movem.l (%[y]), %%a0-%%a3;"
/* add */
"add.l %%a0, %%d0;"
"add.l %%a1, %%d1;"
"add.l %%a2, %%d2;"
"add.l %%a3, %%d3;"
/* store and advance */
"movem.l %%d0-%%d3, (%[x]);"
"lea.l (4*4, %[x]), %[x];"
"lea.l (4*4, %[y]), %[y];"
"subq.l #4, %[n];" /* done 4 elements */
"1: cmpi.l #4, %[n];"
"bge 0b;"
: [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
: : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
"cc", "memory");
/* add final elements */
while (n>0) {
*x++ += *y++;
n--;
}
}
static inline
void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n)
{
/* align to 16 bytes */
while(n>0 && (int)x&16) {
*x++ = *y++;
n--;
}
asm volatile ("bra 1f;"
"0:" /* loop start */
"movem.l (%[y]), %%d0-%%d3;" /* fetch values */
"movem.l %%d0-%%d3, (%[x]);" /* store */
"lea.l (4*4, %[x]), %[x];" /* advance */
"lea.l (4*4, %[y]), %[y];"
"subq.l #4, %[n];" /* done 4 elements */
"1: cmpi.l #4, %[n];"
"bge 0b;"
: [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
: : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
/* copy final elements */
while (n>0) {
*x++ = *y++;
n--;
}
}
static inline
void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
{
/* ensure data is aligned to 16-bytes */
while(n>0 && (int)data%16) {
*data = MULT31(*data, *window);
data++;
window++;
n--;
}
asm volatile ("movem.l (%[d]), %%d0-%%d3;" /* loop start */
"movem.l (%[w]), %%a0-%%a3;" /* pre-fetch registers */
"lea.l (4*4, %[w]), %[w];"
"bra 1f;" /* jump to loop condition */
"0:" /* loop body */
/* multiply and load next window values */
"mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
"mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
"mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
"mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"
"movclr.l %%acc0, %%d0;" /* get the products */
"movclr.l %%acc1, %%d1;"
"movclr.l %%acc2, %%d2;"
"movclr.l %%acc3, %%d3;"
/* store and advance */
"movem.l %%d0-%%d3, (%[d]);"
"lea.l (4*4, %[d]), %[d];"
"movem.l (%[d]), %%d0-%%d3;"
"subq.l #4, %[n];" /* done 4 elements */
"1: cmpi.l #4, %[n];"
"bge 0b;"
/* multiply final elements */
"tst.l %[n];"
"beq 1f;" /* n=0 */
"mac.l %%d0, %%a0, %%acc0;"
"movclr.l %%acc0, %%d0;"
"move.l %%d0, (%[d])+;"
"subq.l #1, %[n];"
"beq 1f;" /* n=1 */
"mac.l %%d1, %%a1, %%acc0;"
"movclr.l %%acc0, %%d1;"
"move.l %%d1, (%[d])+;"
"subq.l #1, %[n];"
"beq 1f;" /* n=2 */
/* otherwise n = 3 */
"mac.l %%d2, %%a2, %%acc0;"
"movclr.l %%acc0, %%d2;"
"move.l %%d2, (%[d])+;"
"1:"
: [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
: : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
"cc", "memory");
}
static inline
void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
{
/* ensure at least data is aligned to 16-bytes */
while(n>0 && (int)data%16) {
*data = MULT31(*data, *window);
data++;
window--;
n--;
}
asm volatile ("lea.l (-3*4, %[w]), %[w];" /* loop start */
"movem.l (%[d]), %%d0-%%d3;" /* pre-fetch registers */
"movem.l (%[w]), %%a0-%%a3;"
"bra 1f;" /* jump to loop condition */
"0:" /* loop body */
/* multiply and load next window value */
"mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
"mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
"mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
"mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"
"movclr.l %%acc0, %%d0;" /* get the products */
"movclr.l %%acc1, %%d1;"
"movclr.l %%acc2, %%d2;"
"movclr.l %%acc3, %%d3;"
/* store and advance */
"movem.l %%d0-%%d3, (%[d]);"
"lea.l (4*4, %[d]), %[d];"
"movem.l (%[d]), %%d0-%%d3;"
"subq.l #4, %[n];" /* done 4 elements */
"1: cmpi.l #4, %[n];"
"bge 0b;"
/* multiply final elements */
"tst.l %[n];"
"beq 1f;" /* n=0 */
"mac.l %%d0, %%a3, %%acc0;"
"movclr.l %%acc0, %%d0;"
"move.l %%d0, (%[d])+;"
"subq.l #1, %[n];"
"beq 1f;" /* n=1 */
"mac.l %%d1, %%a2, %%acc0;"
"movclr.l %%acc0, %%d1;"
"move.l %%d1, (%[d])+;"
"subq.l #1, %[n];"
"beq 1f;" /* n=2 */
/* otherwise n = 3 */
"mac.l %%d2, %%a1, %%acc0;"
"movclr.l %%acc0, %%d2;"
"move.l %%d2, (%[d])+;"
"1:"
: [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
: : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
"cc", "memory");
}
#endif
#endif
#ifndef _V_CLIP_MATH
#define _V_CLIP_MATH
/* this is portable C and simple; why not use this as default? */
static inline ogg_int32_t CLIP_TO_15(register ogg_int32_t x) {
register ogg_int32_t hi=32767, lo=-32768;
return (x>=hi ? hi : (x<=lo ? lo : x));
}
#endif
#else
#define LINE_ATTR
#endif