SWCODEC: Tighten up coldfire assembly a little bit more. Cleanup to make differing parameters between ARM and Coldfire halfway clean. Hopefully those differences can be reconciled soon. A tiny bit of C optimizing for karaoke channel mode.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12505 a1c6a512-1295-4272-9138-f99709370657
2007-02-27 14:25:36 +00:00 · 2007-02-27 14:25:36 +00:00 · 6fbdb912b0
commit 6fbdb912b0
parent 8ca99d3288
3 changed files with 159 additions and 153 deletions
--- a/apps/dsp.c
+++ b/apps/dsp.c
@ -112,7 +112,7 @@ struct crossfeed_data
    int32_t coefs[3];       /* 04h - Coefficients for the shelving filter */
    int32_t history[4];     /* 10h - Format is x[n - 1], y[n - 1] for both channels */
    int32_t delay[13][2];   /* 20h */
-    int index;              /* 88h - Current index into the delay line */
+    int     index;          /* 88h - Current index/pointer into the delay line */
                            /* 8ch */
 };

@ -129,13 +129,21 @@ struct eq_state

 /* Include header with defines which functions are implemented in assembly
   code for the target */
-#ifndef SIMULATOR
 #include <dsp_asm.h>
-#endif

-#ifndef DSP_HAVE_ASM_CROSSFEED
-static void apply_crossfeed(int32_t *buf[], int count);
-#endif
+/* Typedefs keep things much neater in this case */
+typedef int (*sample_input_fn_type)(int count, const char *src[],
+                                    int32_t *dst[]);    
+typedef int (*resample_fn_type)(int count, struct dsp_data *data,
+                                int32_t *src[], int32_t *dst[]);
+typedef void (*sample_output_fn_type)(int count, struct dsp_data *data,
+                                      int32_t *src[], int16_t *dst);
+/* If ACF_SWITCHPARAM is no longer needed, make apply_crossfeed of type
+   channels_process_fn_type since it is really just that */
+typedef void (*apply_crossfeed_fn_type)(ACF_SWITCHPARAM(int count,
+                                                        int32_t *buf[]));
+typedef void (*channels_process_fn_type)(int count, int32_t *buf[]);
+
 /*
 ***************************************************************************/

@ -151,15 +159,13 @@ struct dsp_config
    long gain;          /* Note that this is in S8.23 format. */
    /* Functions that change depending upon settings - NULL if stage is
       disabled */
-    int (*input_samples)(int count, const char *src[], int32_t *dst[]);
-    int (*resample)(int count, struct dsp_data *data,
-                    int32_t *src[], int32_t *dst[]);
-    void (*output_samples)(int count, struct dsp_data *data,
-                           int32_t *src[], int16_t *dst);
+    sample_input_fn_type        input_samples;
+    resample_fn_type            resample;
+    sample_output_fn_type       output_samples;
    /* These will be NULL for the voice codec, which is more economical that
       way */
-    void (*apply_crossfeed)(int32_t *src[], int count);
-    void (*channels_process)(int count, int32_t *buf[]);
+    apply_crossfeed_fn_type     apply_crossfeed;
+    channels_process_fn_type    channels_process;
 };

 /* General DSP config */
@ -169,7 +175,14 @@ static struct dither_data dither_data[2] IBSS_ATTR; /* 0=left, 1=right */
 static long   dither_mask IBSS_ATTR;
 static long   dither_bias IBSS_ATTR;
 /* Crossfeed */
-struct crossfeed_data crossfeed_data IBSS_ATTR;     /* A */
+struct crossfeed_data crossfeed_data IDATA_ATTR =    /* A */
+{
+#ifdef DSP_CROSSFEED_DELAY_PTR
+    .index = (intptr_t)crossfeed_data.delay
+#else
+    .index = 0
+#endif
+};     
 /* Equalizer */
 static struct eq_state eq_data;                     /* A/V */
 #ifdef HAVE_SW_TONE_CONTROLS
@ -401,8 +414,7 @@ static int sample_input_gt_native_ni_stereo(
 */
 static void sample_input_new_format(void)
 {
-    static int (* const sample_input_functions[])(
-        int count, const char* src[], int32_t *dst[]) =
+    static const sample_input_fn_type sample_input_functions[] =
    {
        [SAMPLE_INPUT_LE_NATIVE_MONO]      = sample_input_lte_native_mono,
        [SAMPLE_INPUT_LE_NATIVE_I_STEREO]  = sample_input_lte_native_i_stereo,
@ -539,9 +551,7 @@ static void sample_output_dithered(int count, struct dsp_data *data,
 */
 static void sample_output_new_format(void)
 {
-    static void (* const sample_output_functions[])(
-        int count, struct dsp_data *data,
-        int32_t *src[], int16_t *dst) =
+    static const sample_output_fn_type sample_output_functions[] =
    {
        sample_output_mono,
        sample_output_stereo,
@ -695,42 +705,13 @@ void dsp_dither_enable(bool enable)
    switch_dsp(old_dsp);    
 }

-/**
- * dsp_set_crossfeed(bool enable)
- *
- * !DSPPARAMSYNC
- * needs syncing with changes to the following dsp parameters:
- *  * dsp->stereo_mode (A)
- */
-void dsp_set_crossfeed(bool enable)
-{
-    crossfeed_enabled = enable;
-    audio_dsp->apply_crossfeed =
-        (enable && audio_dsp->data.num_channels > 1)
-            ? apply_crossfeed : NULL;
-}
-
-void dsp_set_crossfeed_direct_gain(int gain)
-{
-    crossfeed_data.gain = get_replaygain_int(gain * -10) << 7;
-}
-
-void dsp_set_crossfeed_cross_params(long lf_gain, long hf_gain, long cutoff)
-{
-    long g1 = get_replaygain_int(lf_gain * -10) << 3;
-    long g2 = get_replaygain_int(hf_gain * -10) << 3;
-
-    filter_shelf_coefs(0xffffffff/NATIVE_FREQUENCY*cutoff, g1, g2,
-                       crossfeed_data.coefs);
-}
-
 /* Applies crossfeed to the stereo signal in buf.
 * Crossfeed is a process where listening over speakers is simulated. This
 * is good for old hard panned stereo records, which might be quite fatiguing
 * to listen to on headphones with no crossfeed.
 */
 #ifndef DSP_HAVE_ASM_CROSSFEED
-static void apply_crossfeed(int32_t *buf[], int count)
+static void apply_crossfeed(int count, int32_t *buf[])
 {
    int32_t *hist_l = &crossfeed_data.history[0];
    int32_t *hist_r = &crossfeed_data.history[2];
@ -775,7 +756,36 @@ static void apply_crossfeed(int32_t *buf[], int count)
    /* Write back local copies of data we've modified */
    crossfeed_data.index = di;
 }
-#endif
+#endif /* DSP_HAVE_ASM_CROSSFEED */
+
+/**
+ * dsp_set_crossfeed(bool enable)
+ *
+ * !DSPPARAMSYNC
+ * needs syncing with changes to the following dsp parameters:
+ *  * dsp->stereo_mode (A)
+ */
+void dsp_set_crossfeed(bool enable)
+{
+    crossfeed_enabled = enable;
+    audio_dsp->apply_crossfeed =
+        (enable && audio_dsp->data.num_channels > 1)
+            ? apply_crossfeed : NULL;
+}
+
+void dsp_set_crossfeed_direct_gain(int gain)
+{
+    crossfeed_data.gain = get_replaygain_int(gain * -10) << 7;
+}
+
+void dsp_set_crossfeed_cross_params(long lf_gain, long hf_gain, long cutoff)
+{
+    long g1 = get_replaygain_int(lf_gain * -10) << 3;
+    long g2 = get_replaygain_int(hf_gain * -10) << 3;
+
+    filter_shelf_coefs(0xffffffff/NATIVE_FREQUENCY*cutoff, g1, g2,
+                       crossfeed_data.coefs);
+}

 /* Combine all gains to a global gain. */
 static void set_gain(struct dsp_config *dsp)
@ -1056,10 +1066,9 @@ static void channels_process_sound_chan_karaoke(int count, int32_t *buf[])

    do
    {
-        int32_t l = *sl/2;
-        int32_t r = *sr/2;
-        *sl++ = l - r;
-        *sr++ = r - l;
+        int32_t ch = *sl/2 - *sr/2;
+        *sl++ = ch;
+        *sr++ = -ch;
    }
    while (--count > 0);
 }
@ -1067,8 +1076,7 @@ static void channels_process_sound_chan_karaoke(int count, int32_t *buf[])

 void channels_set(int value)
 {
-    static void (* const channels_process_functions[])(
-        int count, int32_t *buf[]) =
+    static const channels_process_fn_type channels_process_functions[] =
    {
        /* SOUND_CHAN_STEREO = All-purpose index for no channel processing */
        [SOUND_CHAN_STEREO]     = NULL,
@ -1118,7 +1126,7 @@ int dsp_process(char *dst, const char *src[], int count)
        if ((samples = resample(samples, tmp)) <= 0)
            break; /* I'm pretty sure we're downsampling here */
        if (dsp->apply_crossfeed)
-            dsp->apply_crossfeed(tmp, samples);
+            dsp->apply_crossfeed(ACF_SWITCHPARAM(samples, tmp));
        /* TODO: EQ and tone controls need separate structs for audio and voice
         * DSP processing thanks to filter history. isn't really audible now, but
         * might be the day we start handling voice more delicately.
--- a/apps/dsp_asm.h
+++ b/apps/dsp_asm.h
@ -22,10 +22,22 @@
 #ifndef _DSP_ASM_H
 #define _DSP_ASM_H

+#define ACF_SWITCHPARAM(count, buf)     count, buf
+
+#ifndef SIMULATOR
+
 #if defined(CPU_COLDFIRE) || defined(CPU_ARM)
 #define DSP_HAVE_ASM_CROSSFEED
-void apply_crossfeed(int32_t *src[], int count);
+#if defined(CPU_COLDFIRE)
+/* ACF_SWITCHPARAM can be stripped out once all targets share one parameter
+   order; DSP_CROSSFEED_DELAY_PTR once all use a pointer instead of an index */
+#define DSP_CROSSFEED_DELAY_PTR
+#else
+#undef ACF_SWITCHPARAM
+#define ACF_SWITCHPARAM(count, buf)     buf, count
 #endif
+void apply_crossfeed(ACF_SWITCHPARAM(int count, int32_t *buf[]));
+#endif /* defined(CPU_COLDFIRE) || defined(CPU_ARM) */

 #if defined (CPU_COLDFIRE)
 #define DSP_HAVE_ASM_RESAMPLING
@ -45,5 +57,8 @@ void sample_output_mono(int count, struct dsp_data *data,
 #define DSP_HAVE_ASM_SAMPLE_OUTPUT_STEREO
 void sample_output_stereo(int count, struct dsp_data *data,
                          int32_t *src[], int16_t *dst);
-#endif
+#endif /* CPU_COLDFIRE */
+
+#endif /* SIMULATOR */
+
 #endif /* _DSP_ASM_H */
--- a/apps/dsp_cf.S
+++ b/apps/dsp_cf.S
@ -8,6 +8,7 @@
 * $Id$
 *
 * Copyright (C) 2006 Thom Johansen
+ * Portions Copyright (C) 2007 Michael Sevakis
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
@ -18,75 +19,63 @@
 ****************************************************************************/

 /****************************************************************************
- * void apply_crossfeed(int32_t *src[], int count)
+ * void apply_crossfeed(int count, int32_t *src[])
 */
    .section    .text
    .global     apply_crossfeed 
 apply_crossfeed:
-    lea.l (-44, %sp), %sp
-    movem.l %d2-%d7/%a2-%a6, (%sp)      | save all regs
-    move.l (44+4, %sp), %a4
-    movem.l (%a4), %a4-%a5              | a4 = src[0], a5 = src[1]
-    move.l (44+8, %sp), %d7             | d7 = count
-
-    lea.l crossfeed_data, %a1
-    lea.l (8*4, %a1), %a0               | a0 = &delay[0][0]
-    move.l (%a1)+, %a6                  | a6 = direct gain
-    movem.l (3*4, %a1), %d0-%d3         | fetch filter history samples
-    move.l (33*4, %a1), %d4             | fetch delay line index
-    movem.l (%a1), %a1-%a3              | load filter coefs
-    move.l %d4, %d5
-    lsl.l #3, %d5
-    add.l %d5, %a0                      | point a0 to current delay position
-|    lea.l (%d4*4, %a0), %a0
-|    lea.l (%d4*4, %a0), %a0             | point a0 to current delay position
+    lea.l       -44(%sp), %sp
+    movem.l     %d2-%d7/%a2-%a6, (%sp)  | save all regs
+    movem.l     48(%sp), %d7/%a4        | %d7 = count, %a4 = src
+    movem.l     (%a4), %a4-%a5          | %a4 = src[0], %a5 = src[1]
+    lea.l       crossfeed_data, %a1
+    move.l      (%a1)+, %a6             | a6 = direct gain
+    movem.l     12(%a1), %d0-%d3        | fetch filter history samples
+    move.l      132(%a1), %a0           | fetch delay line address
+    movem.l     (%a1), %a1-%a3          | load filter coefs
    /* Register usage in loop:
-     * a0 = &delay[index][0], a1..a3 = b0, b1, a1 (filter coefs),
-     * a4 = src[0], a5 = src[1], a6 = direct gain,
-     * d0..d3 = history
-     * d4 = delay line index,
-     * d5,d6 = temp.
-     * d7 = count
+     * %a0 = delay_p, %a1..%a3 = b0, b1, a1 (filter coefs),
+     * %a4 = src[0], %a5 = src[1], %a6 = direct gain,
+     * %d0..%d3 = history
+     * %d4..%d6 = temp.
+     * %d7 = count
     */
 .cfloop:
-    mac.l %a2, %d0, (4, %a0), %d0, %acc0 | acc = b1*dr[n - 1] d0 = dr[n]
-    mac.l %a1, %d0, %acc0               | acc += b0*dr[n]
-    mac.l %a3, %d1, (%a4), %d5, %acc0   | acc += a1*y_l[n - 1], load left input
-    move.l %acc0, %d1                   | get filtered delayed sample
-    mac.l %a6, %d5, %acc0               | acc += gain*x_l[n]
-    movclr.l %acc0, %d6
-    move.l %d6, (%a4)+                  | write result
+    mac.l       %a2, %d0, 4(%a0), %d0, %acc0 | acc  = b1*dr[n - 1] d0 = dr[n]
+    mac.l       %a1, %d0             , %acc0 | acc += b0*dr[n]
+    mac.l       %a3, %d1,  (%a4), %d4, %acc0 | acc += a1*y_l[n - 1], load L
+    move.l      %acc0, %d1              | get filtered delayed sample
+    mac.l       %a6, %d4, %acc0         | acc += gain*x_l[n]
+    movclr.l    %acc0, %d6              |
+    move.l      %d6, (%a4)+             | write result

-    mac.l %a2, %d2, (%a0), %d2, %acc0   | acc = b1*dl[n - 1], d2 = dl[n]
-    move.l %d5, (%a0)+                  | save left input to delay line
-    mac.l %a1, %d2, %acc0               | acc += b0*dl[n]
-    mac.l %a3, %d3, (%a5), %d5, %acc0   | acc += a1*y_r[n - 1], load right input
-    move.l %acc0, %d3                   | get filtered delayed sample
-    mac.l %a6, %d5, %acc0               | acc += gain*x_r[n]
-    move.l %d5, (%a0)+                  | save right input to delay line
-    movclr.l %acc0, %d6
-    move.l %d6, (%a5)+                  | write result
+    mac.l       %a2, %d2, (%a0), %d2, %acc0 | acc  = b1*dl[n - 1], d2 = dl[n]
+    mac.l       %a1, %d2            , %acc0 | acc += b0*dl[n]
+    mac.l       %a3, %d3, (%a5), %d5, %acc0 | acc += a1*y_r[n - 1], load R
+    movem.l     %d4-%d5, (%a0)          | save left & right inputs to delay line
+    move.l      %acc0, %d3              | get filtered delayed sample
+    mac.l       %a6, %d5, %acc0         | acc += gain*x_r[n]
+    lea.l       8(%a0), %a0             | increment delay pointer
+    movclr.l    %acc0, %d6              |
+    move.l      %d6, (%a5)+             | write result

-    addq.l #1, %d4                      | index++
-    moveq.l #13, %d6
-    cmp.l %d6, %d4                      | wrap index to 0 if it overflows
-    jlt .cfnowrap
-    moveq.l #13*8, %d4
-    sub.l %d4, %a0                      | wrap back delay line ptr as well
-    clr.l %d4
-.cfnowrap:
-    subq.l #1, %d7
-    jne .cfloop
-    | save data back to struct
-    lea.l crossfeed_data + 4*4, %a1
-    movem.l %d0-%d3, (%a1)
-    move.l %d4, (30*4, %a1)
-    movem.l (%sp), %d2-%d7/%a2-%a6
-    lea.l (44, %sp), %sp
+    cmpa.l      #crossfeed_data+136, %a0| wrap a0 if passed end
+    bge.b       .cfwrap                 |
+    .word       0x51fb                  | tpf.l - no-op that swallows the 4-byte lea below (skips wrap)
+.cfwrap:
+    lea.l       -104(%a0), %a0          | wrap
+    subq.l      #1, %d7                 | --count > 0 ?
+    bgt.b       .cfloop                 |
+    lea.l       crossfeed_data+16, %a1  | save data back to struct
+    movem.l     %d0-%d3, (%a1)          | ...history
+    move.l      %a0, 120(%a1)           | ...delay_p
+    movem.l     (%sp), %d2-%d7/%a2-%a6  | restore all regs
+    lea.l       44(%sp), %sp
    rts
 .cfend:
    .size       apply_crossfeed,.cfend-apply_crossfeed

+
 /****************************************************************************
 * int dsp_downsample(int count, struct dsp_data *data,
 *                    int32_t *src[], int32_t *dst[])
@ -128,10 +117,10 @@ dsp_downsample:
    lsl.l       %d7, %d0                |
    lsr.l       #1, %d0                 |
    mac.l       %d0, %d1, %acc0         | %acc0 += frac * diff
-    move.l      %acc0, %d0              |
    add.l       %d4, %d5                | phase += delta
    move.l      %d5, %d6                | pos = phase >> 16
    lsr.l       %d7, %d6                |
+    movclr.l    %acc0, %d0              |
    move.l      %d0, (%a4)+             | *d++ = %d0
    cmp.l       %d2, %d6                | pos < count?
    blt.b       .dsloop                 | yes? continue resampling
@ -145,7 +134,6 @@ dsp_downsample:
    sub.l       (%a2), %d0              |
    asr.l       #2, %d0                 | convert bytes->samples
    movem.l     (%sp), %d2-%d7/%a2-%a5  | restore non-clobberables
-    move.l      %acc1, %acc0            | clear %acc0
    lea.l       40(%sp), %sp            | cleanup stack
    rts                                 | buh-bye
 .dsend:
@ -196,8 +184,8 @@ dsp_upsample:
 .usloop_0:
    lsr.l       #1, %d5                 | make phase into frac
    mac.l       %d1, %d5, %acc0         | %acc0 = diff * frac
-    movclr.l    %acc0, %d7              | %d7 = product
    lsl.l       #1, %d5                 | restore frac to phase
+    movclr.l    %acc0, %d7              | %d7 = product
    add.l       %d0, %d7                | %d7 = last + product
    move.l      %d7, (%a4)+             | *d++ = %d7
    add.l       %d4, %d5                | phase += delta
@ -272,10 +260,10 @@ channels_process_sound_chan_custom:
    move.l      dsp_sw_cross, %d4       | load cross (side) gain
 1:
    move.l      (%a0), %d1              |
-    mac.l       %d1, %d3 , (%a1), %d2, %acc0 |  L = l*gain + r*cross
-    mac.l       %d1, %d4 , %acc1        |  R = r*gain + l*cross
-    mac.l       %d2, %d4 , %acc0        |
-    mac.l       %d2, %d3 , %acc1        |
+    mac.l       %d1, %d3, (%a1), %d2, %acc0 |  L = l*gain + r*cross
+    mac.l       %d1, %d4            , %acc1 |  R = r*gain + l*cross
+    mac.l       %d2, %d4            , %acc0 |
+    mac.l       %d2, %d3            , %acc1 |
    movclr.l    %acc0, %d1              |
    movclr.l    %acc1, %d2              |
    move.l      %d1, (%a0)+             |
@ -306,15 +294,12 @@ channels_process_sound_chan_karaoke:
    move.l      #0x40000000, %d4        | %d4 = 0.5
 1:
    move.l     (%a0), %d1               |
-    mac.l      %d1, %d4, (%a1), %d2, %acc0 | L = l/2 - r/2
-    mac.l      %d2, %d4, %acc1          | R = r/2 - l/2
+    msac.l     %d1, %d4, (%a1), %d2, %acc0 | R = r/2 - l/2
+    mac.l      %d2, %d4            , %acc0 |
    movclr.l   %acc0, %d1               |
-    movclr.l   %acc1, %d2               |
-    move.l     %d1, %d3                 |
-    sub.l      %d2, %d1                 |
-    sub.l      %d3, %d2                 |
+    move.l     %d1, (%a1)+              |
+    neg.l      %d1                      | L = -R = -(r/2 - l/2) = l/2 - r/2
    move.l     %d1, (%a0)+              |
-    move.l     %d2, (%a1)+              |
    subq.l     #1, %d0                  |
    bgt.s      1b                       |
    movem.l    (%sp), %d1-%d4           | restore registers
@ -323,7 +308,6 @@ channels_process_sound_chan_karaoke:
    rts
 .cpkaraoke_end:
    .size       channels_process_sound_chan_karaoke, .cpkaraoke_end-channels_process_sound_chan_karaoke
-
 /****************************************************************************
 * void sample_output_stereo(int count, struct dsp_data *data,
 *                               int32_t *src[], int16_t *dst)
@ -382,34 +366,33 @@ sample_output_stereo:
 .sos_lineloop_start:
    lea.l       -12(%a0), %a5             | %a5 = at or just before last line bound
 .sos_lineloop:
-    move.l      (%a2)+, %d0               | get next 4 L samples and scale
-    mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | with saturation
-    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 |
-    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
-    mac.l       %d3, %a1, %acc3           |
-    movclr.l    %acc0, %d0                | obtain results
-    movclr.l    %acc1, %d1                |
-    movclr.l    %acc2, %d2                |
-    movclr.l    %acc3, %d3                |
    move.l      (%a3)+, %d4               | get next 4 R samples and scale
-    mac.l       %d4, %a1, (%a3)+, %d5,  %acc0 | with saturation
-    mac.l       %d5, %a1, (%a3)+, %d6,  %acc1 |
-    mac.l       %d6, %a1, (%a3)+, %d7,  %acc2 |
-    mac.l       %d7, %a1, %acc3           |
-    movclr.l    %acc0, %d4                | obtain results
+    mac.l       %d4, %a1, (%a3)+, %d5, %acc0 | with saturation
+    mac.l       %d5, %a1, (%a3)+, %d6, %acc1 |
+    mac.l       %d6, %a1, (%a3)+, %d7, %acc2 |
+    mac.l       %d7, %a1, (%a2)+, %d0, %acc3 |
+    lea.l       16(%a4), %a4              | increment dest here, mitigate stalls
+    movclr.l    %acc0, %d4                | obtain R results
    movclr.l    %acc1, %d5                |
    movclr.l    %acc2, %d6                |
    movclr.l    %acc3, %d7                |
-    swap        %d4                       | interleave most significant
-    move.w      %d4, %d0                  | 16 bits of L and R
+    mac.l       %d0, %a1, (%a2)+, %d1, %acc0 | get next 4 L samples and scale
+    mac.l       %d1, %a1, (%a2)+, %d2, %acc1 | with saturation
+    mac.l       %d2, %a1, (%a2)+, %d3, %acc2 |
+    mac.l       %d3, %a1             , %acc3 |
+    swap        %d4                       | a) interleave most significant...
    swap        %d5                       |
-    move.w      %d5, %d1                  |
    swap        %d6                       |
-    move.w      %d6, %d2                  |
    swap        %d7                       |
+    movclr.l    %acc0, %d0                | obtain L results
+    movclr.l    %acc1, %d1                |
+    movclr.l    %acc2, %d2                |
+    movclr.l    %acc3, %d3                |
+    move.w      %d4, %d0                  | a) ... 16 bits of L and R
+    move.w      %d5, %d1                  |
+    move.w      %d6, %d2                  |
    move.w      %d7, %d3                  |
-    movem.l     %d0-%d3, (%a4)            | write four stereo samples
-    lea.l       16(%a4), %a4              |
+    movem.l     %d0-%d3, -16(%a4)         | write four stereo samples
    cmp.l       %a4, %a5                  |
    bhi.b       .sos_lineloop             |
 .sos_longloop_1_start:
@ -480,7 +463,8 @@ sample_output_mono:
    mac.l       %d0, %d5, (%a2)+, %d1, %acc0 | with saturation
    mac.l       %d1, %d5, (%a2)+, %d2, %acc1 |
    mac.l       %d2, %d5, (%a2)+, %d3, %acc2 |
-    mac.l       %d3, %d5, %acc3           |
+    mac.l       %d3, %d5             , %acc3 |
+    lea.l       16(%a3), %a3              | increment dest here, mitigate stalls
    movclr.l    %acc0, %d0                | obtain results
    movclr.l    %acc1, %d1                |
    movclr.l    %acc2, %d2                |
@ -497,8 +481,7 @@ sample_output_mono:
    move.l      %d3, %d4                  |
    swap        %d4                       |
    move.w      %d4, %d3                  |
-    movem.l     %d0-%d3, (%a3)            | write four stereo samples
-    lea.l       16(%a3), %a3              |
+    movem.l     %d0-%d3, -16(%a3)         | write four stereo samples
    cmp.l       %a3, %a1                  |
    bhi.b       .som_lineloop             |
 .som_longloop_1_start: