rockbox/apps/codecs/libFLAC/coldfire.S
Thom Johansen 340d824542 Added new coldfire assembly LPC decoder routine to libFLAC.
Added clear accumulator policy.


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6108 a1c6a512-1295-4272-9138-f99709370657
2005-03-03 12:17:45 +00:00

245 lines
7.7 KiB
ArmAsm

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2005 by Thom Johansen
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
/* The following is a first attempt at an assembler-optimized version of
FLAC__lpc_restore_signal programmed for the MCF5249 or any other similar
ColdFire V2 core with the EMAC unit.
*/
.section .icode,"ax",@progbits

/*-------------------------------------------------------------------------
 * void FLAC__lpc_restore_signal_mcf5249(
 *         const FLAC__int32 residual[],      loaded into %a0
 *         unsigned          data_len,        loaded into %d0
 *         const FLAC__int32 qlp_coeff[],     loaded into %a1
 *         unsigned          order,           loaded into %d2
 *         int               lp_quantization, loaded into %d1
 *         FLAC__int32       data[]);         loaded into %a2
 *
 * Reconstructs data_len samples of the original signal from the LPC
 * residual:
 *   data[i] = residual[i]
 *           + ((sum for j = 0..order-1 of qlp_coeff[j] * data[i-1-j])
 *              >> lp_quantization)
 *
 * Orders 1..8 each get a fully unrolled EMAC loop, selected through a
 * jump table; orders above 8 fall back to the generic loop at .Ldefault.
 * Saves and restores the callee-saved set %d2-%d7/%a2-%a6 on the stack;
 * uses %acc0 for the dot product and puts %macsr into integer mode.
 * Requires a ColdFire V2 core with the EMAC unit (e.g. MCF5249).
 *------------------------------------------------------------------------*/
.global FLAC__lpc_restore_signal_mcf5249
.align 2
FLAC__lpc_restore_signal_mcf5249:
lea.l (-44, %sp), %sp /* room for the 11 saved longword registers */
movem.l %d2-%d7/%a2-%a6, (%sp)
move.l (44+4, %sp), %a0 /* residual */
move.l (44+8, %sp), %d0 /* data_len */
move.l (44+12, %sp), %a1 /* qlp_coef */
move.l (44+16, %sp), %d2 /* order */
move.l (44+20, %sp), %d1 /* lp_quantization */
move.l (44+24, %sp), %a2 /* data */
/* the history pointer always lags behind the data pointer by 'order'
   samples (history = data - order). since we have one loop for each
   order, the store-back offset can be hard coded in each loop, which
   frees a register by not keeping a separate data pointer.
*/
move.l %d2, %d3
neg.l %d3 /* %d3 = -order */
lea.l (%a2, %d3.l*4), %a2 /* history = data - order */
clr.l %d3
move.l %d3, %macsr /* we'll need integer mode for this */
tst.l %d0
jeq .Lexit /* zero samples to process */
movq.l #8, %d3
cmp.l %d3, %d2 /* coldfire v2 only has long cmp version */
jgt .Ldefault /* order is over 8, jump to default case */
lea.l .Ljumptable, %a4
move.l (%a4, %d2.l*4), %a4 /* pick the unrolled loop for this order */
jmp (%a4)
.align 4 /* avoid unaligned fetch */
.Ljumptable:
.long .Lexit /* order 0: nothing to restore */
.long .Lorder1
.long .Lorder2
.long .Lorder3
.long .Lorder4
.long .Lorder5
.long .Lorder6
.long .Lorder7
.long .Lorder8
/* order 8: movem loads coefs ascending, so %d3 = qlp_coeff[0] ...
   %a5 = qlp_coeff[7]. history samples are walked in ascending order,
   oldest first, so each sample is multiplied by the matching coef:
   acc = coef[7]*data[i-8] + coef[6]*data[i-7] + ... + coef[0]*data[i-1].
   every mac.l also post-loads the next history sample into %a6; the
   last one reloads (1*4,%a2), the first sample of the NEXT iteration. */
.Lorder8:
movem.l (%a1), %d3-%d7/%a3-%a5 /* load lpc coefs */
movea.l (%a2), %a6 /* load first history sample */
.Lloop8:
mac.l %a6, %a5, (1*4, %a2), %a6, %acc0
mac.l %a6, %a4, (2*4, %a2), %a6, %acc0
mac.l %a6, %a3, (3*4, %a2), %a6, %acc0
mac.l %a6, %d7, (4*4, %a2), %a6, %acc0
mac.l %a6, %d6, (5*4, %a2), %a6, %acc0
mac.l %a6, %d5, (6*4, %a2), %a6, %acc0
mac.l %a6, %d4, (7*4, %a2), %a6, %acc0
mac.l %a6, %d3, (1*4, %a2), %a6, %acc0 /* load for the next iteration */
addq.l #4, %a2 /* increment history pointer */
movclr.l %acc0, %d2 /* get sum, clear %acc0 for next sample */
asr.l %d1, %d2 /* shift sum by lp_quantization bits */
add.l (%a0)+, %d2 /* add residual and increment residual pointer */
move.l %d2, (28, %a2) /* data[i] = history + order*4, minus the 4 just added */
subq.l #1, %d0 /* decrement counter */
jne .Lloop8 /* are we done? */
jra .Lexit
/* orders 7..2 below follow exactly the order-8 pattern, just with fewer
   coefficients and a correspondingly smaller store-back offset. */
.Lorder7:
movem.l (%a1), %d3-%d7/%a3-%a4 /* %d3 = coef[0] ... %a4 = coef[6] */
movea.l (%a2), %a6
.Lloop7:
mac.l %a6, %a4, (1*4, %a2), %a6, %acc0
mac.l %a6, %a3, (2*4, %a2), %a6, %acc0
mac.l %a6, %d7, (3*4, %a2), %a6, %acc0
mac.l %a6, %d6, (4*4, %a2), %a6, %acc0
mac.l %a6, %d5, (5*4, %a2), %a6, %acc0
mac.l %a6, %d4, (6*4, %a2), %a6, %acc0
mac.l %a6, %d3, (1*4, %a2), %a6, %acc0 /* preload next iteration */
addq.l #4, %a2
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l (%a0)+, %d2
move.l %d2, (24, %a2) /* data[i] */
subq.l #1, %d0
jne .Lloop7
jra .Lexit
.Lorder6:
movem.l (%a1), %d3-%d7/%a3 /* %d3 = coef[0] ... %a3 = coef[5] */
movea.l (%a2), %a6
.Lloop6:
mac.l %a6, %a3, (1*4, %a2), %a6, %acc0
mac.l %a6, %d7, (2*4, %a2), %a6, %acc0
mac.l %a6, %d6, (3*4, %a2), %a6, %acc0
mac.l %a6, %d5, (4*4, %a2), %a6, %acc0
mac.l %a6, %d4, (5*4, %a2), %a6, %acc0
mac.l %a6, %d3, (1*4, %a2), %a6, %acc0 /* preload next iteration */
addq.l #4, %a2
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l (%a0)+, %d2
move.l %d2, (20, %a2) /* data[i] */
subq.l #1, %d0
jne .Lloop6
jra .Lexit
.Lorder5:
movem.l (%a1), %d3-%d7 /* %d3 = coef[0] ... %d7 = coef[4] */
movea.l (%a2), %a6
.Lloop5:
mac.l %a6, %d7, (1*4, %a2), %a6, %acc0
mac.l %a6, %d6, (2*4, %a2), %a6, %acc0
mac.l %a6, %d5, (3*4, %a2), %a6, %acc0
mac.l %a6, %d4, (4*4, %a2), %a6, %acc0
mac.l %a6, %d3, (1*4, %a2), %a6, %acc0 /* preload next iteration */
addq.l #4, %a2
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l (%a0)+, %d2
move.l %d2, (16, %a2) /* data[i] */
subq.l #1, %d0
jne .Lloop5
jra .Lexit
.Lorder4:
movem.l (%a1), %d3-%d6 /* %d3 = coef[0] ... %d6 = coef[3] */
movea.l (%a2), %a6
.Lloop4:
mac.l %a6, %d6, (1*4, %a2), %a6, %acc0
mac.l %a6, %d5, (2*4, %a2), %a6, %acc0
mac.l %a6, %d4, (3*4, %a2), %a6, %acc0
mac.l %a6, %d3, (1*4, %a2), %a6, %acc0 /* preload next iteration */
addq.l #4, %a2
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l (%a0)+, %d2
move.l %d2, (12, %a2) /* data[i] */
subq.l #1, %d0
jne .Lloop4
jra .Lexit
.Lorder3:
movem.l (%a1), %d3-%d5 /* %d3 = coef[0] ... %d5 = coef[2] */
movea.l (%a2), %a6
.Lloop3:
mac.l %a6, %d5, (1*4, %a2), %a6, %acc0
mac.l %a6, %d4, (2*4, %a2), %a6, %acc0
mac.l %a6, %d3, (1*4, %a2), %a6, %acc0 /* preload next iteration */
addq.l #4, %a2
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l (%a0)+, %d2
move.l %d2, (8, %a2) /* data[i] */
subq.l #1, %d0
jne .Lloop3
jra .Lexit
.Lorder2:
movem.l (%a1), %d3-%d4 /* %d3 = coef[0], %d4 = coef[1] */
movea.l (%a2), %a6
.Lloop2:
mac.l %a6, %d4, (1*4, %a2), %a6, %acc0
mac.l %a6, %d3, %acc0 /* data for next iteration is already loaded */
addq.l #4, %a2
movclr.l %acc0, %d2
asr.l %d1, %d2
add.l (%a0)+, %d2
move.l %d2, (4, %a2) /* data[i] */
subq.l #1, %d0
jne .Lloop2
jra .Lexit
.Lorder1:
/* no point in using mac here, a plain muls does one product per sample */
move.l (%a1), %d3 /* the single coefficient */
.Lloop1:
move.l %d3, %d2
muls.l (%a2)+, %d2 /* coef * data[i-1]; %a2 now points at data[i] */
asr.l %d1, %d2
add.l (%a0)+, %d2
move.l %d2, (%a2) /* data[i] */
subq.l #1, %d0
jne .Lloop1
jra .Lexit
/* generic case, order > 8. re-entered once per output sample: the coef
   and history working pointers are rebuilt each time because both are
   consumed by the inner loops below. */
.Ldefault:
/* we do the filtering in an unrolled-by-4 loop as far as we can, and then
   do the rest in an ordinary one-by-one sample loop.
*/
lea.l (%a1, %d2.l*4), %a3 /* need to start in the other end of coefs */
movea.l %a2, %a4 /* working copy of history pointer */
move.l %d2, %d3
lsr.l #2, %d3 /* coefs/4, number of iterations needed in next loop */
movea.l (%a4)+, %a6 /* preload first (oldest) history sample */
.Ldloop1:
lea.l (-16, %a3), %a3 /* move lpc coef pointer four samples backwards */
movem.l (%a3), %d4-%d7 /* load four coefs */
mac.l %a6, %d7, (%a4)+, %a6, %acc0 /* highest coef pairs with oldest sample */
mac.l %a6, %d6, (%a4)+, %a6, %acc0
mac.l %a6, %d5, (%a4)+, %a6, %acc0
mac.l %a6, %d4, (%a4)+, %a6, %acc0
subq.l #1, %d3 /* any more unrolled loop operations left? */
jne .Ldloop1
move.l %d2, %d3
movq.l #3, %d4 /* mask 0x00000003 */
and.l %d4, %d3 /* get the remaining samples to be filtered */
jeq .Ldsave /* no remaining samples */
.Ldloop2:
move.l -(%a3), %d4 /* get lpc coef */
mac.l %a6, %d4, (%a4)+, %a6, %acc0 /* final post-load overshoots by one; harmless */
subq.l #1, %d3 /* any more iterations left? */
jne .Ldloop2
.Ldsave:
movclr.l %acc0, %d3 /* get result, clear %acc0 for next sample */
asr.l %d1, %d3 /* shift lp_quantization bits right */
add.l (%a0)+, %d3 /* add residual */
move.l %d3, (-4, %a4) /* %a4 ended one sample past data[i] */
addq.l #4, %a2 /* increment history pointer */
subq.l #1, %d0 /* decrement data_len */
jne .Ldefault /* are we done? */
/* if so, fall through to exit */
.Lexit:
movem.l (%sp), %d2-%d7/%a2-%a6 /* restore callee-saved registers */
lea.l (44, %sp), %sp
rts