/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2006 by David Bryant
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

/* This is an assembly optimized version of the following WavPack function:
 *
 * void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp,
 *                                   long *buffer, long sample_count);
 *
 * It performs a single pass of stereo decorrelation on the provided buffer.
 * Note that this version of the function requires that the 8 previous stereo
 * samples are visible and correct. In other words, it ignores the "samples_*"
 * fields in the decorr_pass structure and gets the history data directly
 * from the buffer. It does, however, return the appropriate history samples
 * to the decorr_pass structure before returning.
 *
 * This is written to work on an ARM7TDMI processor. This version only uses the
 * 32-bit multiply-accumulate instruction and so will overflow with 24-bit
 * WavPack files.
 */

#include "config.h"

        .text
        .align
        .global         decorr_stereo_pass_cont_arm
        @ give the exported symbol a function type in the object file so that
        @ tools working on the symbol table treat it as code rather than data
        .type           decorr_stereo_pass_cont_arm, %function

/*
 * on entry:
 *
 * r0 = struct decorr_pass *dpp
 * r1 = long *buffer
 * r2 = long sample_count
 *
 * The code below saves the work registers, caches delta and both weights from
 * dpp in registers, computes the end-of-buffer pointer and then branches to
 * the loop that matches dpp->term. All paths finish at common_exit. A
 * sample_count of zero, or a negative term other than -1, -2 or -3, goes
 * straight to common_exit without touching the buffer.
 */

decorr_stereo_pass_cont_arm:

        stmfd   sp!, {r4 - r8, r10, r11, lr}    @ save work registers and lr
        mov     r5, r0                  @ r5 = dpp
        mov     r11, #512               @ r11 = 512 for rounding
        ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
        ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
        ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
        cmp     r2, #0                  @ exit if no samples to process
        beq     common_exit

        add     r7, r1, r2, asl #3      @ r7 = buffer + sample_count * 8 (end)
        ldrsh   r2, [r5, #0]            @ r2 = dpp->term
        cmp     r2, #0
        bmi     minus_term              @ negative terms are dispatched below

        ldr     lr, [r1, #-16]          @ lr = left sample two back
        ldr     r10, [r1, #-12]         @ r10 = right sample two back
        ldr     r8, [r1, #-8]           @ r8 = previous left sample
        ldr     r3, [r1, #-4]           @ r3 = previous right sample
        cmp     r2, #17                 @ (history is used by terms 2, 17, 18)
        beq     term_17_loop
        cmp     r2, #18
        beq     term_18_loop
        cmp     r2, #2
        beq     term_2_loop
        b       term_default_loop       @ else handle default (1-8, except 2)

minus_term:
        mov     r10, #1024              @ r10 = -1024 for weight clipping
        rsb     r10, r10, #0            @  (only used for negative terms)
        cmn     r2, #1                  @ cmn adds, so this tests term == -1
        beq     term_minus_1
        cmn     r2, #2
        beq     term_minus_2
        cmn     r2, #3
        beq     term_minus_3
        b       common_exit             @ any other negative term: no processing

/*
 ******************************************************************************
 * Loop to handle term = 17 condition
 *
 * Each pass handles one stereo sample, left word first and then right word.
 * The decorrelation value for a channel is (2 * previous) - second previous.
 * Flags set by the rsbs are still live at the strne / cmpne that follow, so
 * the instruction order in each half must not be changed.
 *
 * r0 = dpp->weight_B           r8 = previous left sample
 * r1 = bptr                    r9 = (unused)
 * r2 = current sample          r10 = second previous right sample
 * r3 = previous right sample   r11 = 512 (for rounding)
 * r4 = dpp->weight_A           ip = current decorrelation value
 * r5 = dpp                     sp =
 * r6 = dpp->delta              lr = second previous left sample
 * r7 = eptr                    pc =
 *******************************************************************************
 */

term_17_loop:
        rsbs    ip, lr, r8, asl #1      @ decorr value = (2 * prev) - 2nd prev
        mov     lr, r8                  @ previous becomes 2nd previous
        ldr     r2, [r1], #4            @ get sample & update pointer
        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
        add     r8, r2, r8, asr #10     @  shift, and add to new sample
        strne   r8, [r1, #-4]           @ if change possible, store sample back
        cmpne   r2, #0                  @ only tested when decorr value != 0
        beq     .L325                   @ skip update if either value is zero
        teq     ip, r2                  @ update weight based on signs
        submi   r4, r4, r6              @ signs differ: weight_A -= delta
        addpl   r4, r4, r6              @ signs match: weight_A += delta

.L325:  rsbs    ip, r10, r3, asl #1     @ do same thing for right channel
        mov     r10, r3                 @ previous right becomes 2nd previous
        ldr     r2, [r1], #4            @ get right sample & update pointer
        mla     r3, ip, r0, r11         @ decorr value * weight_B + 512
        add     r3, r2, r3, asr #10     @ r3 = new right sample
        strne   r3, [r1, #-4]           @ store back only if decorr value != 0
        cmpne   r2, #0
        beq     .L329                   @ skip update if either value is zero
        teq     ip, r2
        submi   r0, r0, r6              @ signs differ: weight_B -= delta
        addpl   r0, r0, r6              @ signs match: weight_B += delta

.L329:  cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_17_loop
        b       store_1718              @ common exit for terms 17 & 18

/*
 ******************************************************************************
 * Loop to handle term = 18 condition
 *
 * Each pass handles one stereo sample, left word first and then right word.
 * The decorrelation value is computed as prev + ((prev - 2nd prev) >> 1).
 * Flags set by the adds are still live at the strne / cmpne that follow, so
 * the instruction order in each half must not be changed. When the loop ends
 * execution falls through into store_1718.
 *
 * r0 = dpp->weight_B           r8 = previous left sample
 * r1 = bptr                    r9 = (unused)
 * r2 = current sample          r10 = second previous right sample
 * r3 = previous right sample   r11 = 512 (for rounding)
 * r4 = dpp->weight_A           ip = decorrelation value
 * r5 = dpp                     sp =
 * r6 = dpp->delta              lr = second previous left sample
 * r7 = eptr                    pc =
 *******************************************************************************
 */

term_18_loop:
        sub     ip, r8, lr              @ decorr value =
        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
        adds    ip, r8, ip, asr #1      @ sets flags tested by strne and cmpne
        ldr     r2, [r1], #4            @ get sample & update pointer
        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
        add     r8, r2, r8, asr #10     @  shift, and add to new sample
        strne   r8, [r1, #-4]           @ if change possible, store sample back
        cmpne   r2, #0                  @ only tested when decorr value != 0
        beq     .L337                   @ skip update if either value is zero
        teq     ip, r2                  @ update weight based on signs
        submi   r4, r4, r6              @ signs differ: weight_A -= delta
        addpl   r4, r4, r6              @ signs match: weight_A += delta

.L337:  sub     ip, r3, r10             @ do same thing for right channel
        mov     r10, r3                 @ previous right becomes 2nd previous
        adds    ip, r3, ip, asr #1
        ldr     r2, [r1], #4            @ get right sample & update pointer
        mla     r3, ip, r0, r11         @ decorr value * weight_B + 512
        add     r3, r2, r3, asr #10     @ r3 = new right sample
        strne   r3, [r1, #-4]           @ store back only if decorr value != 0
        cmpne   r2, #0
        beq     .L341                   @ skip update if either value is zero
        teq     ip, r2
        submi   r0, r0, r6              @ signs differ: weight_B -= delta
        addpl   r0, r0, r6              @ signs match: weight_B += delta

.L341:  cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_18_loop

/*
 * Shared tail for terms 17 & 18: write the two most recent samples of each
 * channel into the decorr_pass structure, then leave through common_exit.
 * NOTE(review): offsets 8/12 and 40/44 are presumably samples_A[0..1] and
 * samples_B[0..1]; confirm against the struct decorr_pass declaration.
 */

store_1718:
        str     r8, [r5, #8]            @ previous left sample
        str     lr, [r5, #12]           @ second previous left sample
        str     r3, [r5, #40]           @ previous right sample
        str     r10, [r5, #44]          @ second previous right sample
        b       common_exit             @ weights are written back there

/*
 ******************************************************************************
 * Loop to handle term = 2 condition
 * (note that this case can be handled by the default term handler (1-8), but
 * this special case is faster because it doesn't have to read memory twice)
 *
 * The decorrelation value for each channel is that channel's sample from two
 * stereo samples back, which is kept in a register instead of being reloaded.
 * Flags set by the movs are still live at the strne / cmpne that follow, so
 * the instruction order in each half must not be changed.
 *
 * r0 = dpp->weight_B           r8 = previous left sample
 * r1 = bptr                    r9 = (unused)
 * r2 = current sample          r10 = second previous right sample
 * r3 = previous right sample   r11 = 512 (for rounding)
 * r4 = dpp->weight_A           ip = decorrelation value
 * r5 = dpp                     sp =
 * r6 = dpp->delta              lr = second previous left sample
 * r7 = eptr                    pc =
 *******************************************************************************
 */

term_2_loop:
        movs    ip, lr                  @ get decorrelation value & test
        mov     lr, r8                  @ previous becomes 2nd previous
        ldr     r2, [r1], #4            @ get sample & update pointer
        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
        add     r8, r2, r8, asr #10     @  shift, and add to new sample
        strne   r8, [r1, #-4]           @ if change possible, store sample back
        cmpne   r2, #0                  @ only tested when decorr value != 0
        beq     .L225                   @ skip update if either value is zero
        teq     ip, r2                  @ update weight based on signs
        submi   r4, r4, r6              @ signs differ: weight_A -= delta
        addpl   r4, r4, r6              @ signs match: weight_A += delta

.L225:  movs    ip, r10                 @ do same thing for right channel
        mov     r10, r3                 @ previous right becomes 2nd previous
        ldr     r2, [r1], #4            @ get right sample & update pointer
        mla     r3, ip, r0, r11         @ decorr value * weight_B + 512
        add     r3, r2, r3, asr #10     @ r3 = new right sample
        strne   r3, [r1, #-4]           @ store back only if decorr value != 0
        cmpne   r2, #0
        beq     .L229                   @ skip update if either value is zero
        teq     ip, r2
        submi   r0, r0, r6              @ signs differ: weight_B -= delta
        addpl   r0, r0, r6              @ signs match: weight_B += delta

.L229:  cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_2_loop
        b       default_term_exit       @ this exit updates all dpp->samples

/*
 ******************************************************************************
 * Loop to handle default term condition
 *
 * The decorrelation value for each word is read from the buffer at
 * (term * 8) bytes before the word being processed, i.e. the same channel
 * "term" stereo samples earlier. Unlike the 2/17/18 loops the updated sample
 * is always stored. When the loop ends execution falls through into
 * default_term_exit.
 *
 * r0 = dpp->weight_B           r8 = result accumulator
 * r1 = bptr                    r9 = (unused)
 * r2 = dpp->term               r10 =
 * r3 = decorrelation value     r11 = 512 (for rounding)
 * r4 = dpp->weight_A           ip = current sample
 * r5 = dpp                     sp =
 * r6 = dpp->delta              lr =
 * r7 = eptr                    pc =
 *******************************************************************************
 */

term_default_loop:
        ldr     ip, [r1]                @ get original sample
        ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
        mla     r8, r3, r4, r11         @ mult decorr value by weight, round,
        add     r8, ip, r8, asr #10     @  shift and add to new sample
        str     r8, [r1], #4            @ store updated sample & advance pointer
        cmp     r3, #0                  @ skip weight update if decorr value
        cmpne   ip, #0                  @  or original sample is zero
        beq     .L350
        teq     ip, r3                  @ update weight based on signs
        submi   r4, r4, r6              @ signs differ: weight_A -= delta
        addpl   r4, r4, r6              @ signs match: weight_A += delta

.L350:  ldr     ip, [r1]                @ do the same thing for right channel
        ldr     r3, [r1, -r2, asl #3]   @ right sample from term samples back
        mla     r8, r3, r0, r11         @ decorr value * weight_B + 512
        add     r8, ip, r8, asr #10     @ r8 = new right sample
        str     r8, [r1], #4            @ store it & advance pointer
        cmp     r3, #0
        cmpne   ip, #0
        beq     .L354                   @ skip update if either value is zero
        teq     ip, r3
        submi   r0, r0, r6              @ signs differ: weight_B -= delta
        addpl   r0, r0, r6              @ signs match: weight_B += delta

.L354:  cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_default_loop

/*
 * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
 * structure (even if they are not all used for the given term)
 *
 * On entry r1 points just past the last processed word and r5 = dpp. The loop
 * walks r1 backwards one stereo sample at a time. The newest sample goes to
 * slot (term - 1) & 7 and each older sample to the next lower slot, wrapping
 * modulo 8. Right words are stored at offset 40 + slot * 4 and left words at
 * offset 8 + slot * 4 within dpp.
 * NOTE(review): these offsets are presumably the samples_B and samples_A
 * arrays; confirm against the struct decorr_pass declaration.
 *
 * r1 = bptr (moving backwards)  ip = slot index before wrapping
 * r2 = sample being copied      lr = remaining iteration count
 * r3 = term, then slot address
 */

default_term_exit:
        ldrsh   r3, [r5, #0]            @ r3 = dpp->term
        sub     ip, r3, #1              @ newest sample goes to slot term - 1
        mov     lr, #8                  @ eight stereo samples to copy

.L358:  and     r3, ip, #7              @ wrap slot index into 0..7
        add     r3, r5, r3, asl #2      @ r3 = dpp + slot * 4
        ldr     r2, [r1, #-4]           @ right word of the sample below r1
        str     r2, [r3, #40]
        ldr     r2, [r1, #-8]!          @ step r1 back one sample, get left word
        str     r2, [r3, #8]
        sub     ip, ip, #1              @ next older sample uses next lower slot
        subs    lr, lr, #1              @ count down, setting flags directly
        bne     .L358                   @ repeat until all eight are stored
        b       common_exit

/*
 ******************************************************************************
 * Loop to handle term = -1 condition
 *
 * The left sample is predicted from the previous right sample, and the right
 * sample from the just updated left sample. After each update the weight is
 * clipped to the range -1024..1024.
 *
 * r0 = dpp->weight_B           r8 =
 * r1 = bptr                    r9 = (unused)
 * r2 = intermediate result     r10 = -1024 (for clipping)
 * r3 = previous right sample   r11 = 512 (for rounding)
 * r4 = dpp->weight_A           ip = current sample
 * r5 = dpp                     sp =
 * r6 = dpp->delta              lr = updated left sample
 * r7 = eptr                    pc =
 *******************************************************************************
 */

term_minus_1:
        ldr     r3, [r1, #-4]           @ r3 = right word just before the buffer

term_minus_1_loop:
        ldr     ip, [r1]                @ for left channel the decorrelation value
        mla     r2, r3, r4, r11         @  is the previous right sample (in r3)
        add     lr, ip, r2, asr #10     @ lr = updated left sample
        str     lr, [r1], #8            @ store it & advance a whole stereo sample
        cmp     r3, #0                  @ skip weight update if decorr value
        cmpne   ip, #0                  @  or original sample is zero
        beq     .L361
        teq     ip, r3                  @ update weight based on signs
        submi   r4, r4, r6              @ signs differ: weight_A -= delta
        addpl   r4, r4, r6              @ signs match: weight_A += delta
        cmp     r4, #1024               @ then clip weight to +/-1024
        movgt   r4, #1024
        cmp     r4, r10
        movlt   r4, r10

.L361:  ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
        mla     r3, lr, r0, r11         @  is the just updated left sample (in lr)
        add     r3, r2, r3, asr #10     @ r3 = updated right (next previous right)
        str     r3, [r1, #-4]
        cmp     lr, #0                  @ skip weight update if decorr value
        cmpne   r2, #0                  @  or original sample is zero
        beq     .L369
        teq     r2, lr
        submi   r0, r0, r6              @ signs differ: weight_B -= delta
        addpl   r0, r0, r6              @ signs match: weight_B += delta
        cmp     r0, #1024               @ then clip weight to +/-1024
        movgt   r0, #1024
        cmp     r0, r10
        movlt   r0, r10

.L369:  cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_minus_1_loop

        str     r3, [r5, #8]            @ else store right sample and exit
        b       common_exit

/*
 ******************************************************************************
 * Loop to handle term = -2 condition
 * (note that the channels are processed in the reverse order here)
 *
 * The right sample is predicted from the previous left sample, and the left
 * sample from the just updated right sample. After each update the weight is
 * clipped to the range -1024..1024.
 *
 * r0 = dpp->weight_B           r8 =
 * r1 = bptr                    r9 = (unused)
 * r2 = intermediate result     r10 = -1024 (for clipping)
 * r3 = previous left sample    r11 = 512 (for rounding)
 * r4 = dpp->weight_A           ip = current sample
 * r5 = dpp                     sp =
 * r6 = dpp->delta              lr = updated right sample
 * r7 = eptr                    pc =
 *******************************************************************************
 */

term_minus_2:
        ldr     r3, [r1, #-8]           @ r3 = left word of the preceding sample

term_minus_2_loop:
        ldr     ip, [r1, #4]            @ for right channel the decorrelation value
        mla     r2, r3, r0, r11         @  is the previous left sample (in r3)
        add     lr, ip, r2, asr #10     @ lr = updated right sample
        str     lr, [r1, #4]
        cmp     r3, #0                  @ skip weight update if decorr value
        cmpne   ip, #0                  @  or original sample is zero
        beq     .L380
        teq     ip, r3                  @ update weight based on signs
        submi   r0, r0, r6              @ signs differ: weight_B -= delta
        addpl   r0, r0, r6              @ signs match: weight_B += delta
        cmp     r0, #1024               @ then clip weight to +/-1024
        movgt   r0, #1024
        cmp     r0, r10
        movlt   r0, r10

.L380:  ldr     r2, [r1, #0]            @ for left channel the decorrelation value
        mla     r3, lr, r4, r11         @  is the just updated right sample (in lr)
        add     r3, r2, r3, asr #10     @ r3 = updated left (next previous left)
        str     r3, [r1], #8            @ store it & advance a whole stereo sample
        cmp     lr, #0                  @ skip weight update if decorr value
        cmpne   r2, #0                  @  or original sample is zero
        beq     .L388
        teq     r2, lr
        submi   r4, r4, r6              @ signs differ: weight_A -= delta
        addpl   r4, r4, r6              @ signs match: weight_A += delta
        cmp     r4, #1024               @ then clip weight to +/-1024
        movgt   r4, #1024
        cmp     r4, r10
        movlt   r4, r10

.L388:  cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_minus_2_loop

        str     r3, [r5, #40]           @ else store left channel and exit
        b       common_exit

/*
 ******************************************************************************
 * Loop to handle term = -3 condition
 *
 * The left sample is predicted from the previous right sample and the right
 * sample from the previous left sample. After each update the weight is
 * clipped to the range -1024..1024. In the right channel half the flags set
 * by the movs are still live at the strne / cmpne that follow. When the loop
 * ends execution falls through into common_exit.
 *
 * r0 = dpp->weight_B           r8 = previous left sample
 * r1 = bptr                    r9 = (unused)
 * r2 = updated left sample,    r10 = -1024 (for clipping)
 *      then original right
 * r3 = previous right sample   r11 = 512 (for rounding)
 * r4 = dpp->weight_A           ip = original left sample,
 *                                   then previous left sample
 * r5 = dpp                     sp =
 * r6 = dpp->delta              lr =
 * r7 = eptr                    pc =
 *******************************************************************************
 */

term_minus_3:
        ldr     r3, [r1, #-4]           @ load previous samples
        ldr     r8, [r1, #-8]           @  (r3 = right word, r8 = left word)

term_minus_3_loop:
        ldr     ip, [r1]                @ ip = original left sample
        mla     r2, r3, r4, r11         @ previous right * weight_A + 512
        add     r2, ip, r2, asr #10     @ r2 = updated left sample
        str     r2, [r1], #4            @ store it & advance pointer
        cmp     r3, #0                  @ skip weight update if decorr value
        cmpne   ip, #0                  @  or original sample is zero
        beq     .L399
        teq     ip, r3                  @ update weight based on signs
        submi   r4, r4, r6              @ signs differ: weight_A -= delta
        addpl   r4, r4, r6              @ signs match: weight_A += delta
        cmp     r4, #1024               @ then clip weight to +/-1024
        movgt   r4, #1024
        cmp     r4, r10
        movlt   r4, r10

.L399:  movs    ip, r8                  @ ip = previous left we use now
        mov     r8, r2                  @ r8 = current left we use next time
        ldr     r2, [r1], #4            @ r2 = original right sample
        mla     r3, ip, r0, r11         @ previous left * weight_B + 512
        add     r3, r2, r3, asr #10     @ r3 = updated right sample
        strne   r3, [r1, #-4]           @ store back only if decorr value != 0
        cmpne   r2, #0
        beq     .L407                   @ skip update if either value is zero
        teq     ip, r2
        submi   r0, r0, r6              @ signs differ: weight_B -= delta
        addpl   r0, r0, r6              @ signs match: weight_B += delta
        cmp     r0, #1024               @ then clip weight to +/-1024
        movgt   r0, #1024
        cmp     r0, r10
        movlt   r0, r10

.L407:  cmp     r7, r1                  @ loop back if more samples to do
        bhi     term_minus_3_loop

        str     r3, [r5, #8]            @ else store previous samples & exit
        str     r8, [r5, #40]

/*
 * Every path through the function ends here. The two adapted weights held in
 * r4 and r0 are written back to the decorr_pass structure (r5) as halfwords
 * so the next call starts from them, then the function returns.
 * NOTE(review): ldmpc is not defined in this file; presumably it is a macro
 * supplied through config.h that restores the listed registers and returns.
 */

common_exit:
        strh    r0, [r5, #6]            @ dpp->weight_B = r0
        strh    r4, [r5, #4]            @ dpp->weight_A = r4
        ldmpc   regs="r4-r8, r10-r11"