rockbox/lib/rbcodec/dsp/tdspeed.c

/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2006 by Nicolas Pitre <nico@cam.org>
 * Copyright (C) 2006-2007 by Stéphane Doyon <s.doyon@videotron.ca>
 * Copyright (C) 2012 Michael Sevakis
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
#include "rbcodecconfig.h"
#include "platform.h"
#include "sound.h"
#include "core_alloc.h"
#include "dsp-util.h"
#include "dsp_proc_entry.h"
#include "tdspeed.h"

/* If none of the headers above supplied assert(), compile assertions out */
#ifndef assert
#define assert(cond)
#endif

/* Setting id handled by tdspeed_configure() to change the stretch factor */
#define TIMESTRETCH_SET_FACTOR (DSP_PROC_SETTING+DSP_PROC_TIMESTRETCH)

/* Sample rates outside [MIN_RATE, MAX_RATE] are rejected by tdspeed_update() */
#define MIN_RATE 8000
#define MAX_RATE 48000 /* double buffer for double rate */
/* tdspeed_update() starts from a destination step of samplerate / MINFREQ */
#define MINFREQ 100

/* Buffer dimensions, all counted in int32_t samples per channel */
#define MAX_INPUTCOUNT       512 /* Max input count so dst doesn't overflow */
#define FIXED_BUFCOUNT      3072 /* 48KHz factor 3.0 */
#define FIXED_OUTBUFCOUNT   4096

/* Selects how tdspeed_apply() treats input left after the last whole frame */
enum tdspeed_ops
{
    TDSOP_PROCESS,  /* keep the remainder in the overlap buffer for next call */
    TDSOP_LAST,     /* only remember the frame shift (overlap buffer pass) */
    TDSOP_PURGE,    /* copy the remainder unmodified to the output */
};

/* State of the one timestretch instance handled by this file */
static struct tdspeed_state_s
{
    /* Set on DSP_PROC_INIT and reset to NULL on DSP_PROC_CLOSE; the public
       accessors use it as the "timestretch is enabled" flag */
    struct dsp_proc_entry *this; /* this stage */
    int channels;           /* number of audio channels */
    int32_t samplerate;     /* current samplerate of input data */
    int32_t factor;         /* stretch factor (perdecimille) */
    int32_t shift_max;      /* maximum displacement on a frame */
    int32_t src_step;       /* source window pace */
    int32_t dst_step;       /* destination window pace */
    /* On CPU_COLDFIRE tdspeed_update() stores 30 minus the power of two here */
    int32_t dst_order;      /* power of two for dst_step */
    /* ovl_shift carries next_frame - prev_frame from one tdspeed_apply() call
       to the next, so it may be negative */
    int32_t ovl_shift;      /* overlap buffer frame shift */
    int32_t ovl_size;       /* overlap buffer used size */
    int32_t *ovl_buff[2];   /* overlap buffer (L+R) */
} tdspeed_state;

/* Working buffers, filled in by tdspeed_alloc_buffers() on DSP_PROC_INIT and
   kept up to date by tdspeed_move().
   [0],[1]: overlap (input) buffers, one per channel
   [2],[3]: output buffers, one per channel */
static int32_t *buffers[TDSPEED_NBUFFERS] = { NULL, NULL, NULL, NULL };

/* Size in bytes of each entry of buffers[], in the same order */
static const int buffer_sizes[TDSPEED_NBUFFERS] =
{
    FIXED_BUFCOUNT * sizeof(int32_t),
    FIXED_BUFCOUNT * sizeof(int32_t),
    FIXED_OUTBUFCOUNT * sizeof(int32_t),
    FIXED_OUTBUFCOUNT * sizeof(int32_t)
};

/* Views into buffers[]: index with [0] or [1] to pick the channel */
#define overlap_buffer  (&buffers[0])
#define outbuf          (&buffers[2])
/* Capacity of one output channel buffer in samples (used by the asserts) */
#define out_size        FIXED_OUTBUFCOUNT

/* Processed buffer passed out to later stages */
static struct dsp_buffer dsp_outbuf;

/* Crossfade one sample of the current frame with the matching sample of the
 * previous frame.
 *
 * curr, prev: the two samples to mix
 * i, j:       weights for curr and prev; tdspeed_apply() passes values whose
 *             sum is the blend window length (dst_step)
 * order:      scale factor as prepared by tdspeed_update() (dst_order)
 *
 * Returns the weighted mix scaled back to sample range.
 */
#if defined(CPU_COLDFIRE)
static inline int32_t blend_frame_samples(int32_t curr, int32_t prev,
                                          int i, int j, int order)
{
    /* The asm comments below describe 'order' before tdspeed_update()
       replaced it with (30 - order) for this target. */
    int32_t a0, a1;
    asm (
        "mac.l     %2, %3, %%acc0 \n" /* acc = curr*(i<<(30-order)) >> 23 */
        "mac.l     %4, %5, %%acc0 \n" /* acc += prev*(j<<(30-order)) >> 23 */
        "moveq.l   #1, %0         \n" /* Prepare mask */
        "move.l    %%accext01, %1 \n" /* Get extension bits */
        "lsr.l     #7, %1         \n" /* Get bit 7 of LSb extension ... */
        "and.l     %0, %1         \n" /* ... into bit 0 */
        "movclr.l  %%acc0, %0     \n" /* Get result >> 8 */
        "asl.l     #1, %0         \n" /* Everything x2 */
        "or.l      %1, %0         \n" /* Insert proper LSb from extension */
        : "=d"(a0), "=d"(a1)
        : "r"(curr), "r"(i << order),
          "r"(prev), "r"(j << order));

    return a0;
}
#else
/* Portable C version */
static inline int32_t blend_frame_samples(int32_t curr, int32_t prev,
                                          int i, int j, int order)
{
    /* Widen to 64 bits first so the weighted products cannot overflow */
    int64_t const fade_in  = (int64_t)curr * i;
    int64_t const fade_out = (int64_t)prev * j;

    return (fade_in + fade_out) >> order;
}
#endif /* CPU_* */

/* Discard all data */
static void tdspeed_flush(void)
{
    struct tdspeed_state_s *st = &tdspeed_state;
    st->ovl_size = 0;
    st->ovl_shift = 0;
    dsp_outbuf.remcount = 0; /* Dump remaining output */
}

/* Recompute the stretch parameters for a new samplerate and/or factor.
 *
 * samplerate: input sample rate, accepted when within [MIN_RATE, MAX_RATE]
 * factor:     stretch factor, accepted when within [STRETCH_MIN, STRETCH_MAX]
 *             and different from PITCH_SPEED_100
 *
 * Both values are always recorded and the overlap buffer is always emptied,
 * even when the parameters are then rejected.
 *
 * Returns true when timestretch has work to do with these parameters, false
 * when they are out of range or the factor is exactly 100%.
 */
static bool tdspeed_update(int32_t samplerate, int32_t factor)
{
    struct tdspeed_state_s *st = &tdspeed_state;

    /* Remember what was asked for; needed later if the format changes */
    st->factor     = factor;
    st->samplerate = samplerate;

    /* Whatever input was still pending is thrown away */
    st->ovl_shift = 0;
    st->ovl_size = 0;

    /* Validate: 100% needs no processing, the rest are range checks */
    if (factor == PITCH_SPEED_100 ||
        samplerate < MIN_RATE || samplerate > MAX_RATE ||
        factor < STRETCH_MIN || factor > STRETCH_MAX)
    {
        return false;
    }

    /* Nominal destination window, shortened when speeding up */
    int32_t window = samplerate / MINFREQ;

    if (factor > PITCH_SPEED_100)
        window = window * PITCH_SPEED_100 / factor;

    /* Round the window to a power of two: order starts at 1 and grows by one
       for every halving that leaves a non-zero value */
    int32_t order = 1;

    for (window >>= 1; window != 0; window >>= 1)
        order++;

    st->dst_step = (1 << order);
#ifdef CPU_COLDFIRE
    /* blend_frame_samples works in s0.31 mode. Also must shift by
       one less bit before mac in order not to overflow. */
    st->dst_order = 30 - order;
#else
    st->dst_order = order;
#endif

    st->src_step = st->dst_step * factor / PITCH_SPEED_100;

    /* The search range is the larger of the two steps */
    st->shift_max = st->src_step;

    if (st->dst_step > st->src_step)
        st->shift_max = st->dst_step;

    /* Point the state at the allocated overlap buffers */
    st->ovl_buff[1] = overlap_buffer[1]; /* ignored if mono */
    st->ovl_buff[0] = overlap_buffer[0];

    return true;
}

/* Time-stretch one chunk of input using overlapping, cross-faded frames.
 *
 * buf_out   per-channel output pointers; samples are written from index 0
 * buf_in    per-channel input pointers
 * data_len  number of samples available per channel in buf_in
 * op        what to do with input left after the last complete frame:
 *           TDSOP_PROCESS saves it in the overlap buffer for the next call,
 *           TDSOP_LAST only records the frame shift, TDSOP_PURGE copies it
 *           unmodified to the output
 * consumed  may be NULL; otherwise updated with the count of samples taken
 *           from buf_in (assigned on the overlap path and for TDSOP_PROCESS,
 *           incremented for TDSOP_PURGE)
 *
 * Returns the number of samples written per channel.
 */
static int tdspeed_apply(int32_t *buf_out[2], int32_t *buf_in[2],
                         int data_len, enum tdspeed_ops op, int *consumed)
/* data_len in samples */
{
    struct tdspeed_state_s *const st = &tdspeed_state;
    /* input needed to handle one frame: the shift search range plus one
       destination window */
    int32_t src_frame_sz = st->shift_max + st->dst_step;

    /* widen by the difference when the destination step exceeds the
       source step */
    if (st->dst_step > st->src_step)
        src_frame_sz += st->dst_step - st->src_step;

    int32_t *dest[2];
    /* sample offsets into buf_in: start of the shift search for the frame
       being faded in, and start of the frame being faded out */
    int32_t next_frame, prev_frame;

    /* deal with overlap data first, if any */
    if (st->ovl_size)
    {
        int32_t have = st->ovl_size;

        if (st->ovl_shift > 0)
            have -= st->ovl_shift;

        /* append just enough data to have all of the overlap buffer consumed */
        int32_t steps = (have - 1) / st->src_step;
        int32_t copy = steps * st->src_step + src_frame_sz - have;

        if (copy < src_frame_sz - st->dst_step)
            copy += st->src_step;  /* one more step to allow for pregap data */

        if (copy > data_len)
            copy = data_len;

        assert(st->ovl_size + copy <= FIXED_BUFCOUNT);

        /* new input goes right behind what the overlap buffer already holds */
        for (int ch = 0; ch < st->channels; ch++)
        {
            memcpy(st->ovl_buff[ch] + st->ovl_size, buf_in[ch],
                   copy * sizeof(int32_t));
        }

        if (consumed)
            *consumed = copy;

        if (op == TDSOP_PROCESS && have + copy < src_frame_sz)
        {
            /* still not enough to process at least one frame */
            st->ovl_size += copy;
            return 0;
        }

        /* recursively call ourselves to process the overlap buffer */
        have = st->ovl_size;
        /* with ovl_size cleared the nested call takes the else branch below */
        st->ovl_size = 0;

        assert(have + copy <= FIXED_BUFCOUNT);

        if (copy == data_len)
        {
            /* all of the input went into the overlap buffer, so the nested
               call finishes the whole job using the caller's op */
            return tdspeed_apply(buf_out, st->ovl_buff, have + copy,
                                 op, NULL);
        }

        /* only part of the input was appended: run the overlap buffer with
           TDSOP_LAST, then carry on with the rest of buf_in below */
        int i = tdspeed_apply(buf_out, st->ovl_buff, have + copy,
                              TDSOP_LAST, NULL);

        /* continue writing after what the nested call produced */
        dest[0] = buf_out[0] + i;
        dest[1] = buf_out[1] + i;

        /* readjust pointers to account for data already consumed */
        next_frame = copy - src_frame_sz + st->src_step;
        prev_frame = next_frame - st->ovl_shift;
    }
    else
    {
        dest[0] = buf_out[0];
        dest[1] = buf_out[1];

        next_frame = prev_frame = 0;

        /* ovl_shift is next_frame - prev_frame as left by the previous call;
           restore it with the smaller of the two offsets at zero */
        if (st->ovl_shift > 0)
            next_frame = st->ovl_shift;
        else
            prev_frame = -st->ovl_shift;
    }

    st->ovl_shift = 0;

    /* process all complete frames */
    while (data_len - next_frame >= src_frame_sz)
    {
        /* find frame overlap by autocorrelation */
        /* coarse search: try a shift every INC1 samples and compare only
           every INC2-th sample of the window */
        int const INC1 = 8;
        int const INC2 = 32;

        int64_t min_delta = INT64_MAX;  /* most positive */
        int shift = 0;

        assert(next_frame + st->shift_max - 1 + st->dst_step <= data_len);
        assert(prev_frame + st->dst_step <= data_len);

        for (int i = 0; i < st->shift_max; i += INC1)
        {
            int64_t delta = 0;

            for (int ch = 0; ch < st->channels; ch++)
            {
                int32_t *curr = buf_in[ch] + next_frame + i;
                int32_t *prev = buf_in[ch] + prev_frame;

                for (int j = 0; j < st->dst_step;
                     j += INC2, curr += INC2, prev += INC2)
                {
                    /* ad_s32() is defined elsewhere; presumably the absolute
                       difference of its two arguments - confirm */
                    delta += ad_s32(*curr, *prev);

                    /* already no better than the best candidate: give up on
                       this shift without recording it */
                    if (delta >= min_delta)
                        goto skip;
                }
            }

            /* reached only when this shift beat the best one so far */
            min_delta = delta;
            shift = i;
skip:;
        }

        /* overlap fading-out previous frame with fading-in current frame */
        for (int ch = 0; ch < st->channels; ch++)
        {
            int32_t *curr = buf_in[ch] + next_frame + shift;
            int32_t *prev = buf_in[ch] + prev_frame;
            int32_t *d = dest[ch];

            assert(next_frame + shift + st->dst_step <= data_len);
            assert(prev_frame + st->dst_step <= data_len);
            assert(dest[ch] - buf_out[ch] + st->dst_step <= out_size);

            /* i is the weight of curr and counts up from 0, j is the weight
               of prev and counts down from dst_step */
            for (int i = 0, j = st->dst_step; j; i++, j--)
            {
                assert(d < buf_out[ch] + out_size);
                *d++ = blend_frame_samples(*curr++, *prev++, i, j,
                                           st->dst_order);
            }

            dest[ch] = d;
        }

        /* adjust pointers for next frame */
        prev_frame = next_frame + shift + st->dst_step;
        next_frame += st->src_step;

        /* here next_frame - prev_frame = src_step - dst_step - shift */
        assert(next_frame - prev_frame == st->src_step - st->dst_step - shift);
    } /* while */

    /* now deal with remaining partial frames */
    switch (op)
    {
    case TDSOP_PROCESS:
    {
        /* preserve remaining data + needed overlap data for next call */
        st->ovl_shift = next_frame - prev_frame;
        /* keep everything from the earlier of the two offsets onwards */
        int i = (st->ovl_shift < 0) ? next_frame : prev_frame;
        st->ovl_size = data_len - i;
        assert(st->ovl_size <= FIXED_BUFCOUNT);

        /* memmove because buf_in may be the overlap buffer itself (nested
           call above) */
        for (int ch = 0; ch < st->channels; ch++)
        {
            memmove(st->ovl_buff[ch], buf_in[ch] + i,
                    st->ovl_size * sizeof(int32_t));
        }

        if (consumed)
            *consumed = data_len;

        break;
        } /* TDSOP_PROCESS: */

    case TDSOP_LAST:
    {
        /* special overlap buffer processing: remember frame shift only */
        st->ovl_shift = next_frame - prev_frame;
        break;
        } /* TDSOP_LAST: */

    case TDSOP_PURGE:
    {
        /* last call: purge all remaining data to output buffer */
        int i = data_len - prev_frame;

        for (int ch = 0; ch < st->channels; ch++)
        {
            assert(dest[ch] + i <= buf_out[ch] + out_size);
            memcpy(dest[ch], buf_in[ch] + prev_frame, i * sizeof(int32_t));
            dest[ch] += i;
        }

        /* added to, not assigned: keeps any count stored earlier in this
           call */
        if (consumed)
            *consumed += i;

        break;
        } /* TDSOP_PURGE: */
    } /* switch */

    /* samples written per channel */
    return dest[0] - buf_out[0];
}


/** DSP interface **/

/* Make the timestretch stage available (true) or unavailable (false).
   Does nothing when the stage is already in the requested state. */
void dsp_timestretch_enable(bool enabled)
{
    /* tdspeed_state.this is non-NULL exactly while the stage is initialized */
    bool const currently_enabled = (tdspeed_state.this != NULL);

    if (enabled == currently_enabled)
        return; /* No change */

    dsp_proc_enable(dsp_get_config(CODEC_IDX_AUDIO), DSP_PROC_TIMESTRETCH,
                    enabled);
}

/* Request a new timestretch ratio.
 *
 * percent: new stretch factor; zero or negative values select
 *          PITCH_SPEED_100
 *
 * Ignored when timestretch is not enabled or the factor is unchanged.
 */
void dsp_set_timestretch(int32_t percent)
{
    struct tdspeed_state_s *st = &tdspeed_state;

    if (st->this == NULL)
        return; /* not enabled */

    int32_t const wanted = (percent > 0) ? percent : PITCH_SPEED_100;

    if (wanted != st->factor)
    {
        /* the factor itself is stored by the TIMESTRETCH_SET_FACTOR handler */
        dsp_configure(dsp_get_config(CODEC_IDX_AUDIO),
                      TIMESTRETCH_SET_FACTOR, wanted);
    }
}

/* Return the timestretch ratio */
int32_t dsp_get_timestretch(void)
{
    return tdspeed_state.factor;
}

/* Tell whether timestretch is enabled and initialized, i.e. whether the
   stage pointer has been set by DSP_PROC_INIT */
bool dsp_timestretch_available(void)
{
    return tdspeed_state.this != NULL;
}

/* Stage process hook: stretch input from *buf_p into our own output buffer
 * and make *buf_p point at that buffer.
 *
 * While output from an earlier call is still unread, nothing new is
 * processed and the input is left untouched. Otherwise at most
 * MAX_INPUTCOUNT input samples are fed to tdspeed_apply() and the source
 * buffer is advanced by the amount it reports as consumed.
 */
static void tdspeed_process(struct dsp_proc_entry *this,
                            struct dsp_buffer **buf_p)
{
    (void)this;

    struct dsp_buffer *in = *buf_p;
    struct dsp_buffer *out = &dsp_outbuf;

    *buf_p = out; /* later stages read from our buffer */

    if (out->remcount > 0)
        return; /* earlier output has not been taken yet */

    /* stays as it is when there is no input to process */
    int produced = out->remcount;

    /* in mono both channel pointers refer to the first output buffer */
    out->p32[0] = outbuf[0];
    out->p32[1] = outbuf[in->format.num_channels - 1];

    if (in->remcount > 0)
    {
        int const chunk = MIN(in->remcount, MAX_INPUTCOUNT);

        /* bufcount doubles as the "samples consumed" out-parameter */
        out->bufcount = 0;
        produced = tdspeed_apply(out->p32, in->p32, chunk,
                                 TDSOP_PROCESS, &out->bufcount);

        if (out->bufcount > 0)
            dsp_advance_buffer32(in, out->bufcount);
    }
    /* else purged dsp_outbuf */

    out->remcount = produced;

    /* inherit in-place processed mask from source buffer */
    out->proc_mask = in->proc_mask;
}

/* Process format changes and settings changes.
 *
 * this:   this stage (unused)
 * dsp:    DSP configuration the stage belongs to
 * format: the new input sample format
 *
 * Returns PROC_NEW_FORMAT_TRANSITION while earlier output is still pending,
 * PROC_NEW_FORMAT_OK when the stage stays active for the new format and
 * PROC_NEW_FORMAT_DEACTIVATED when it has nothing to do.
 */
static intptr_t tdspeed_new_format(struct dsp_proc_entry *this,
                                   struct dsp_config *dsp,
                                   struct sample_format *format)
{
    (void)this;

    struct dsp_buffer *dst = &dsp_outbuf;

    /* output produced under the old format has to be taken first */
    if (dst->remcount > 0)
        return PROC_NEW_FORMAT_TRANSITION;

    DSP_PRINT_FORMAT(DSP_PROC_TIMESTRETCH, *format);

    bool active = dsp_proc_active(dsp, DSP_PROC_TIMESTRETCH);
    struct tdspeed_state_s *st = &tdspeed_state;
    int channels = format->num_channels;

    /* TIMESTRETCH_SET_FACTOR zeroes st->samplerate, so a factor change also
       ends up in this branch */
    if (format->codec_frequency != st->samplerate)
    {
        /* relevant parameters are changing - all overlap will be discarded */
        st->channels = channels;

        DEBUGF("  DSP_PROC_TIMESTRETCH- new settings: "
               "ch:%u chz: %u, %d.%02d%%\n",
               (unsigned)channels,
               (unsigned)format->codec_frequency,
               (int)(st->factor / 100), (int)(st->factor % 100));
        active = tdspeed_update(format->codec_frequency, st->factor);
        dsp_proc_activate(dsp, DSP_PROC_TIMESTRETCH, active);
    }
    else if (active && channels != st->channels)
    {
        /* channel count transition - have to make old data in overlap
           buffer compatible with new format */
        DEBUGF("  DSP_PROC_TIMESTRETCH- new ch count: %u=>%u\n",
               (unsigned)st->channels, (unsigned)channels);

        st->channels = channels;

        if (channels > 1)
        {
            /* mono->stereo: Process the old mono as stereo now */
            memcpy(st->ovl_buff[1], st->ovl_buff[0],
                   st->ovl_size * sizeof (int32_t));
        }
        else
        {
            /* stereo->mono: Process the old stereo as mono now */
            for (int i = 0; i < st->ovl_size; i++)
            {
                /* halve each side before adding so the sum cannot overflow */
                st->ovl_buff[0][i] = st->ovl_buff[0][i] / 2 +
                                     st->ovl_buff[1][i] / 2;
            }
        }
    }

    dst->format = *format;

    if (active)
        return PROC_NEW_FORMAT_OK;

    /* Nothing to do */
    DEBUGF("  DSP_PROC_TIMESTRETCH- deactivated\n");
    return PROC_NEW_FORMAT_DEACTIVATED;
}

/* Handle DSP_INIT: reset the stretch factor for the audio DSP.
 *
 * st:     timestretch state to initialize
 * dsp_id: id of the DSP being initialized; only CODEC_IDX_AUDIO is acted on
 */
static void tdspeed_dsp_init(struct tdspeed_state_s *st, unsigned int dsp_id)
{
    if (dsp_id != CODEC_IDX_AUDIO)
        return;

    /* everything is at 100% until dsp_set_timestretch is called with
       some other value and timestretch is enabled at the time */
    st->factor = PITCH_SPEED_100;
}

/* DSP message hook: dispatches the settings this stage reacts to.
 *
 * this:    this stage's entry
 * dsp:     DSP configuration the message is for
 * setting: message id
 * value:   message argument, meaning depends on setting
 *
 * Returns -1 when DSP_PROC_INIT cannot get its buffers, the result of
 * tdspeed_new_format() for DSP_PROC_NEW_FORMAT and 0 in every other case,
 * including settings that are not handled here.
 */
static intptr_t tdspeed_configure(struct dsp_proc_entry *this,
                                  struct dsp_config *dsp,
                                  unsigned int setting,
                                  intptr_t value)
{
    struct tdspeed_state_s *st = &tdspeed_state;

    switch (setting)
    {
    case DSP_PROC_NEW_FORMAT:
        return tdspeed_new_format(this, dsp, (struct sample_format *)value);

    case TIMESTRETCH_SET_FACTOR:
        st->factor = (int32_t)value;
        /* a zero samplerate makes the next format update recompute the
           stretch parameters */
        st->samplerate = 0;
        dsp_proc_want_format_update(dsp, DSP_PROC_TIMESTRETCH);
        return 0;

    case DSP_PROC_INIT:
        if (!tdspeed_alloc_buffers(buffers, buffer_sizes, TDSPEED_NBUFFERS))
            return -1; /* fail the init */

        st->this = this;
        /* output goes to our own buffer, not back into the input */
        dsp_proc_set_in_place(dsp, DSP_PROC_TIMESTRETCH, false);
        this->process = tdspeed_process;
        return 0;

    case DSP_PROC_CLOSE:
        dsp_outbuf.remcount = 0;
        st->factor = PITCH_SPEED_100;
        st->this = NULL;
        tdspeed_free_buffers(buffers, TDSPEED_NBUFFERS);
        return 0;

    case DSP_INIT:
        tdspeed_dsp_init(st, value);
        return 0;

    case DSP_FLUSH:
        tdspeed_flush();
        return 0;

    default:
        break; /* not a message for this stage */
    }

    return 0;
}

/* Update every pointer into buffer i after it has been relocated.
 *
 * i:       index into buffers[] of the buffer that moved
 * current: old address of the buffer
 * new:     new address of the buffer
 *
 * NOTE(review): presumably called by the buffer allocation code when it
 * moves one of our buffers - the caller is not in this file.
 */
void tdspeed_move(int i, void* current, void* new)
{
    int32_t **out_ptrs = dsp_outbuf.p32;
    ptrdiff_t const delta = (int32_t *)new - (int32_t *)current;

    if (i == 0 || i == 1)
    {
        /* moving overlap (input) buffers */
        tdspeed_state.ovl_buff[i] = new;
    }
    else if (i == 2)
    {
        /* moving outbuf left channel and dsp_outbuf.p32[0]; in mono mode
           p32[1] refers to the same buffer and has to follow */
        bool const mono = (out_ptrs[0] == out_ptrs[1]);

        out_ptrs[0] += delta;

        if (mono)
            out_ptrs[1] += delta;
    }
    else if (i == 3)
    {
        /* moving outbuf right channel and dsp_outbuf.p32[1] */
        out_ptrs[1] += delta;
    }

    buffers[i] = new;
}

/* Database entry: names tdspeed_configure as the configure hook of the
   TIMESTRETCH stage. NOTE(review): DSP_PROC_DB_ENTRY is defined elsewhere;
   presumably it emits the stage's DSP database record - confirm. */
DSP_PROC_DB_ENTRY(TIMESTRETCH,
                  tdspeed_configure);