/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2008 by Jens Arnold
 * Copyright (C) 2009 by Andrew Mahone
 *
 * Optimised unsigned integer division for ARMv4
 *
 * Based on: libgcc routines for ARM cpu.
 * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
 * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
 * Free Software Foundation, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

#include "config.h"
/* Codecs should not normally do this, but we need to check a macro, and
 * codecs.h would confuse the assembler. */

/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
   for dividing a 30-bit value by a 15-bit value, with two operations per
   iteration by storing quotient and remainder together and adding the previous
   quotient bit during trial subtraction. Modified to work with any dividend
   below 1 << 31 (bit 31 clear) and any nonzero divisor, and to skip
   unnecessary trials by first calculating the number of bits in the
   quotient.
*/
.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient
    /* Unsigned division for a dividend whose bit 31 is clear.
     *
     * Register arguments:
     *   dividend  in:  numerator. Overwritten with the packed value shifted
     *                  right by "bits" (its remainder half), and then by the
     *                  quotient if the same register is passed for both.
     *   divisor   in:  denominator, must be nonzero. Clobbered: left holding
     *                  the negated, left-aligned divisor.
     *   result    scratch: remainder in the high part, quotient in the low
     *                  "bits" bits.
     *   bits      scratch: number of quotient bits to produce.
     *   curbit    scratch: number of unrolled iterations to skip.
     *   quotient  out: dividend / divisor.
     * The flags are clobbered.
     */

    mov     \bits, #1
    /* Binary search (16, 8, 4, 2, 1) for the largest left shift that keeps
       divisor <= dividend; every accepted shift adds as many quotient bits.
       Nothing is shifted when the divisor is already above the dividend. */
    cmp     \divisor, \dividend, lsr #16
    movls   \divisor, \divisor, lsl #16
    addls   \bits, \bits, #16
    cmp     \divisor, \dividend, lsr #8
    movls   \divisor, \divisor, lsl #8
    addls   \bits, \bits, #8
    cmp     \divisor, \dividend, lsr #4
    movls   \divisor, \divisor, lsl #4
    addls   \bits, \bits, #4
    cmp     \divisor, \dividend, lsr #2
    movls   \divisor, \divisor, lsl #2
    addls   \bits, \bits, #2
    cmp     \divisor, \dividend, lsr #1
    movls   \divisor, \divisor, lsl #1
    addls   \bits, \bits, #1
    /* Negate the aligned divisor so that every trial subtraction becomes an
       add whose carry-out is the quotient bit. */
    rsb     \divisor, \divisor, #0
    /* First trial: carry set means the divisor fits. subcc does not touch
       the flags, so it can undo a failed subtraction while the carry keeps
       the quotient bit for the next instruction. */
    adds    \result, \dividend, \divisor
    subcc   \result, \result, \divisor
    /* Skip the 31 - bits iterations that are not needed. pc reads as the
       address of the add plus 8, which is the first iteration after the nop,
       and each iteration is two instructions (8 bytes), hence lsl #3. */
    rsb     \curbit, \bits, #31
    add     pc, pc, \curbit, lsl #3
    nop
    .rept   30
    /* Shift the packed value left, bringing in the previous quotient bit
       through the carry, and trial-subtract the divisor in the same step. */
    adcs    \result, \divisor, \result, lsl #1
    /* Trial failed (no carry): put the divisor back. */
    subcc   \result, \result, \divisor
    .endr
    /* shift remainder/quotient left one, add final quotient bit */
    adc     \result, \result, \result
    /* Unpack: the high part is the remainder, and clearing it out of the
       packed value leaves the quotient in the low "bits" bits. */
    mov     \dividend, \result, lsr \bits
    eor     \quotient, \result, \dividend, lsl \bits
.endm

/* Unsigned division by unrolled compare / set bit / subtract steps, three
 * instructions per quotient bit. Works for any 32-bit dividend.
 *
 * Register arguments:
 *   dividend  in:  numerator. Clobbered: reduced by every successful trial
 *                  except the last, so it does not end up as the remainder.
 *   divisor   in:  denominator; only read, never written.
 *   result    out: quotient (used as scratch while sizing the quotient).
 *   curbit    scratch: number of instructions to skip in the unrolled table.
 * The flags are clobbered. The assembler symbol "shift" is redefined.
 */
.macro ARM_DIV_32_BODY dividend, divisor, result, curbit

    mov     \result, \dividend
    mov     \curbit, #90          @ 3 * 30, (calculating branch dest)
    @ Estimate how many quotient bits are needed: each time the divisor still
    @ fits after dropping 16/8/4/2 low bits of the dividend, that many more
    @ steps (three instructions each) must run, so fewer are skipped.
    cmp     \divisor, \result, lsr #16
    movls   \result,\result, lsr #16
    subls   \curbit, \curbit, #48
    cmp     \divisor, \result, lsr #8
    movls   \result,\result, lsr #8
    subls   \curbit, \curbit, #24
    cmp     \divisor, \result, lsr #4
    movls   \result,\result, lsr #4
    subls   \curbit, \curbit, #12
    cmp     \divisor, \result, lsr #2
    subls   \curbit, \curbit, #6
    @ Calculation is only done down to shift=2, because the shift=1 step
    @ would need 3 more cycles, but would only gain 1.5 cycles on average.
    mov     \result, #0
    @ curbit counts instructions, hence lsl #2 (4 bytes each). pc reads as
    @ the address of the add plus 8, which is the first step after the nop.
    add     pc, pc, \curbit, lsl #2
    nop
    .set    shift, 32
    .rept   31
    .set    shift, shift - 1
    @ One step per shift from 31 down to 1: if the divisor fits at this
    @ position, set the quotient bit and subtract the shifted divisor.
    cmp     \divisor, \dividend, lsr #shift
    orrls   \result, \result, #(1 << shift)
    subls   \dividend, \dividend, \divisor, lsl #shift
    .endr   @ shift==0 in the .rept would cause a warning  for lsr #0
    @ Final step (shift 0), written out without a shift operand.
    cmp     \divisor, \dividend
    orrls   \result, \result, #1
    @subls  \dividend, \dividend, \divisor  @ correct remainder not needed
.endm

/*
 * udiv32_arm: unsigned 32-bit division (ARM state).
 *
 * In:   r0 = dividend, r1 = divisor
 * Out:  r0 = dividend / divisor, or 0 when divisor == 0
 * Clobbers r1, r2, r3, ip and the flags; no stack is used.
 * Presumably called from C as
 *   unsigned udiv32_arm(unsigned dividend, unsigned divisor)
 * -- confirm against the header that declares it.
 */
#ifdef USE_IRAM
    .section    .icode,"ax",%progbits
#else
    .text
#endif
    .align
    .global udiv32_arm
    .type   udiv32_arm,%function

udiv32_arm:
    cmp     r1, #0
    beq     20f                     @ division by zero
    tst     r0, r0
    /* A dividend with bit 31 set needs the general ARM_DIV_32_BODY (three
       instructions per quotient bit). Every other dividend takes the shorter
       ARM_DIV_31_BODY (two instructions per bit), which also yields 0 when
       the divisor is larger than the dividend, so no separate range check
       is needed here.
    */
    bmi     10f
    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0
    bx      lr

10:
    ARM_DIV_32_BODY r0, r1, r2, r3
    mov     r0, r2                  @ quotient was built in r2
    bx      lr

20:
    /* Division by zero: return 0. This label is only reached through the
       beq above, so Z is always set here and the move must be unconditional
       (a "ne" condition would never execute and the dividend would be
       returned unchanged).
    */
    mov     r0, #0
    bx      lr
    .size   udiv32_arm, . - udiv32_arm