From bff5a35c3c51ebe1fe72ee20147b16ede847971d Mon Sep 17 00:00:00 2001
From: Andrew Mahone
Date: Mon, 1 Feb 2010 01:36:46 +0000
Subject: [PATCH] FS#10943, optimized division and clz routines to replace
 libgcc routines for ARM.

Replaces libgcc support functions for unsigned and signed 32-bit division on
ARMv4 and up, and leading-zero count on ARMv4.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24432 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/lib/SOURCES           |   1 +
 apps/codecs/lib/codeclib.h        |   2 +-
 apps/plugins/lib/SOURCES          |   1 +
 firmware/SOURCES                  |   1 +
 firmware/target/arm/support-arm.S | 699 ++++++++++++++++++++++++++++++
 5 files changed, 703 insertions(+), 1 deletion(-)
 create mode 100644 firmware/target/arm/support-arm.S

diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES
index ffbe1af92e..a8c3feb1aa 100644
--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@@ -8,6 +8,7 @@ mdct_lookup.c
 mdct_arm.S
 setjmp_arm.S
 udiv32_arm.S
+../../../firmware/target/arm/support-arm.S
 #endif
 
 #ifdef CPU_COLDFIRE
diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h
index 926035f05e..2d34523de5 100644
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@@ -65,7 +65,7 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, const void *));
 
 extern void mdct_backward(int n, int32_t *in, int32_t *out);
 
-#ifdef CPU_ARM
+#if defined(CPU_ARM) && (ARM_ARCH < 5 || defined(USE_IRAM))
 /* optimised unsigned integer division for ARMv4, in IRAM */
 unsigned udiv32_arm(unsigned a, unsigned b);
 #define UDIV32(a, b) udiv32_arm(a, b)
diff --git a/apps/plugins/lib/SOURCES b/apps/plugins/lib/SOURCES
index 00d3ac7c56..82807d15ee 100644
--- a/apps/plugins/lib/SOURCES
+++ b/apps/plugins/lib/SOURCES
@@ -30,6 +30,7 @@ profile_plugin.c
 #endif
 #ifdef HAVE_LCD_BITMAP
 #ifdef CPU_ARM
+../../../firmware/target/arm/support-arm.S
 pluginlib_jpeg_idct_arm.S
 #endif
 pluginlib_jpeg_mem.c
diff --git a/firmware/SOURCES b/firmware/SOURCES
index 56ab680417..e60bf63419 100644
--- a/firmware/SOURCES
+++ b/firmware/SOURCES
@@ -375,6 +375,7 @@ target/coldfire/i2c-coldfire.c
 
 #elif defined(CPU_PP) || defined(CPU_ARM)
 /* CPU_PP => CPU_ARM, CPU_ARM !=> CPU_PP */
+target/arm/support-arm.S
 target/arm/memcpy-arm.S
 target/arm/memmove-arm.S
 common/strlen.c
diff --git a/firmware/target/arm/support-arm.S b/firmware/target/arm/support-arm.S
new file mode 100644
index 0000000000..8703dd5b0a
--- /dev/null
+++ b/firmware/target/arm/support-arm.S
@@ -0,0 +1,699 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ * Copyright (C) 2009 by Andrew Mahone
+ *
+ * Optimised replacements for libgcc functions
+ *
+ * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
+ * Developer's Guide
+ * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+ * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+ * Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+.macro ARM_SDIV32_PRE numerator, divisor, sign
+    /* sign[31] = divisor sign */
+    ands    \sign, \divisor, #1<<31
+    rsbeq   \divisor, \divisor, #0
+    /* sign[31] = result sign, sign[0:30], C = numerator sign */
+    eors    \sign, \sign, \numerator, asr #32
+    rsbcs   \numerator, \numerator, #0
+.endm
+
+.macro ARM_SDIV32_POST quotient, remainder, sign
+    movs    \sign, \sign, lsl #1
+.ifnc "", "\quotient"
+    rsbcs   \quotient, \quotient, #0
+.endif
+.ifnc "", "\remainder"
+    rsbmi   \remainder, \remainder, #0
+.endif
+.endm
+
+#if ARM_ARCH < 5
+.macro ARMV4_UDIV32_BODY numerator, divisor, quotient, remainder, tmp, bits, div0label, return
+.ifnc "", "\div0label"
+    rsbs    \divisor, \divisor, #0
+    beq     \div0label
+.else
+    rsb     \divisor, \divisor, #0
+.endif
+    /* This SWAR divider requires a numerator less than 1<<31, because it must
+       be able to shift the remainder left at each step without shifting out
+       the topmost bit. Since a shift might be needed for the aligned
+       remainder to exceed the divisor, the topmost bit must be unset at the
+       start to avoid this overflow case. The original numerator is saved so
+       that the result can be corrected after the reduced division
+       completes. */
+    cmn     \numerator, \divisor
+.ifc "", "\quotient"
+.ifc "\numerator", "\remainder"
+.if \return
+    bxcc    lr
+.else
+    bcc     99f
+.endif
+.else
+    bcc     20f
+.endif
+.else
+    bcc     20f
+.endif
+    movs    \tmp, \numerator
+    movmi   \numerator, \numerator, lsr #1
+    mov     \bits, #30
+.set shift, 16
+.rept 5
+    cmn     \divisor, \numerator, lsr #shift
+    subcs   \bits, \bits, #shift
+    movcs   \divisor, \divisor, lsl #shift
+.set shift, shift >> 1
+.endr
+    adds    \numerator, \numerator, \divisor
+    subcc   \numerator, \numerator, \divisor
+    add     pc, pc, \bits, lsl #3
+    nop
+.rept 30
+    adcs    \numerator, \divisor, \numerator, lsl #1
+    subcc   \numerator, \numerator, \divisor
+.endr
+    adc     \numerator, \numerator, \numerator
+    movs    \tmp, \tmp, asr #1
+    rsb     \bits, \bits, #31
+    bmi     10f
+.ifc "", "\quotient"
+    mov     \remainder, \numerator, lsr \bits
+.else
+.ifc "", "\remainder"
+    mov     \divisor, \numerator, lsr \bits
+    eor     \quotient, \numerator, \divisor, lsl \bits
+.else
+    mov     \remainder, \numerator, lsr \bits
+    eor     \quotient, \numerator, \remainder, lsl \bits
+.endif
+.endif
+.ifne \return
+    bx      lr
+.else
+    b       99f
+.endif
+10:
+    mov     \tmp, \numerator, lsr \bits
+    eor     \numerator, \numerator, \tmp, lsl \bits
+    sub     \bits, \bits, #1
+    adc     \tmp, \tmp, \tmp
+    adds    \tmp, \tmp, \divisor, asr \bits
+.ifnc "", "\quotient"
+    adc     \quotient, \numerator, \numerator
+.endif
+.ifnc "", "\remainder"
+    subcc   \remainder, \tmp, \divisor, asr \bits
+    movcs   \remainder, \tmp
+.endif
+.ifne \return
+    bx      lr
+.else
+    b       99f
+.endif
+20:
+.ifnc "", "\remainder"
+.ifnc "\remainder", "\numerator"
+    mov     \remainder, \numerator
+.endif
+.endif
+.ifnc "", "\quotient"
+    mov     \quotient, #0
+.endif
+.ifne \return
+    bx      lr
+.else
+99:
+.endif
+.endm
+
+.macro ARMV4_SDIV32_BODY numerator, divisor, quotient, remainder, bits, sign, div0label, return
+    /* When this is wrapped for signed division, the wrapper code will handle
+       inverting the divisor, and also the zero divisor test. */
+    ARM_SDIV32_PRE \numerator, \divisor, \sign
+.ifnc "", "\div0label"
+    tst     \divisor, \divisor
+    beq     \div0label
+.endif
+    /* This SWAR divider requires a numerator less than 1<<31, because it must
+       be able to shift the remainder left at each step without shifting out
+       the topmost bit. With signed inputs, whose absolute value may not
+       exceed 1<<31, this may be accomplished simply by subtracting the
+       divisor before beginning division, and adding 1 to the quotient. */
+    adds    \numerator, \numerator, \divisor
+    bcc     20f
+    mov     \bits, #30
+.set shift, 16
+.rept 5
+    cmn     \divisor, \numerator, lsr #shift
+    subcs   \bits, \bits, #shift
+    movcs   \divisor, \divisor, lsl #shift
+.set shift, shift >> 1
+.endr
+    adds    \numerator, \numerator, \divisor
+    subcc   \numerator, \numerator, \divisor
+    add     pc, pc, \bits, lsl #3
+    nop
+.rept 30
+    adcs    \numerator, \divisor, \numerator, lsl #1
+    subcc   \numerator, \numerator, \divisor
+.endr
+    rsb     \bits, \bits, #31
+    adc     \numerator, \numerator, \numerator
+.ifc "", "\quotient"
+    mov     \remainder, \numerator, lsr \bits
+.else
+.ifc "", "\remainder"
+    mov     \divisor, \numerator, lsr \bits
+    add     \numerator, \numerator, #1
+    sub     \quotient, \numerator, \divisor, lsl \bits
+.else
+    mov     \remainder, \numerator, lsr \bits
+    add     \numerator, \numerator, #1
+    sub     \quotient, \numerator, \remainder, lsl \bits
+.endif
+.endif
+.ifne \return
+    ARM_SDIV32_POST \quotient, \remainder, \sign
+    bx      lr
+.else
+    b       99f
+.endif
+20:
+.ifnc "", "\remainder"
+    sub     \remainder, \numerator, \divisor
+.endif
+.ifnc "", "\quotient"
+    mov     \quotient, #0
+.endif
+.ifne \return
+    ARM_SDIV32_POST "", \remainder, \sign
+    bx      lr
+.else
+99:
+    ARM_SDIV32_POST \quotient, \remainder, \sign
+.endif
+.endm
+
+#else
+.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, remainder, bits, inv, neg, div0label, return
+    cmp     \numerator, \divisor
+    clz     \bits, \divisor
+    bcc     30f
+    mov     \inv, \divisor, lsl \bits
+    add     \neg, pc, \inv, lsr #25
+    /* Test whether divisor is 2^N */
+    cmp     \inv, #1<<31
+    /* Load approximate reciprocal */
+    ldrhib  \inv, [\neg, #.L_udiv_est_table-.-64]
+    bls     20f
+    subs    \bits, \bits, #7
+    rsb     \neg, \divisor, #0
+    /* Scale approximate reciprocal, or else branch to large-divisor path */
+    movpl   \divisor, \inv, lsl \bits
+    bmi     10f
+    /* Newton-Raphson iteration to improve reciprocal accuracy */
+    mul     \inv, \divisor, \neg
+    smlawt  \divisor, \divisor, \inv, \divisor
+    mul     \inv, \divisor, \neg
+    /* Complete the N-R math and produce an approximate quotient. Use
+       smmla/smmul on ARMv6. */
+#if ARM_ARCH >= 6
+    tst     \numerator, \numerator
+    smmla   \divisor, \divisor, \inv, \divisor
+    /* Branch to the large-numerator handler, or else use smmul if the sign
+       bit is not set. */
+    bmi     40f
+    smmul   \inv, \numerator, \divisor
+#else
+    /* ARMv5e lacks smmul, so umull is always used. */
+    mov     \bits, #0
+    smlal   \bits, \divisor, \inv, \divisor
+    umull   \bits, \inv, \numerator, \divisor
+#endif
+    /* Calculate remainder and correct the result. */
+    add     \numerator, \numerator, \neg
+.ifnc "", "\remainder"
+    mla     \remainder, \inv, \neg, \numerator
+.ifnc "", "\quotient"
+    mov     \quotient, \inv
+    cmn     \remainder, \neg
+    subcs   \remainder, \remainder, \neg
+    addpl   \remainder, \remainder, \neg, lsl #1
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+.else
+    cmn     \remainder, \neg
+    subcs   \remainder, \remainder, \neg
+    addpl   \remainder, \remainder, \neg, lsl #1
+.endif
+.else
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv
+    cmn     \divisor, \neg
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+.endif
+.if \return
+    bx      lr
+.else
+    b       99f
+.endif
+10:
+    /* Very large divisors can be handled without further improving the
+       reciprocal. First the reciprocal must be reduced to ensure that it
+       underestimates the correct value. */
+    rsb     \bits, \bits, #0
+    sub     \inv, \inv, #4
+    mov     \divisor, \inv, lsr \bits
+    /* Calculate approximate quotient and remainder */
+    umull   \bits, \inv, \numerator, \divisor
+    /* Correct quotient and remainder */
+.ifnc "", "\remainder"
+    mla     \remainder, \inv, \neg, \numerator
+.ifnc "", "\quotient"
+    mov     \quotient, \inv
+    cmn     \neg, \remainder, lsr #1
+    addcs   \remainder, \remainder, \neg, lsl #1
+    addcs   \quotient, \quotient, #2
+    cmn     \neg, \remainder
+    addcs   \remainder, \remainder, \neg
+    addcs   \quotient, \quotient, #1
+.else
+    cmn     \neg, \remainder, lsr #1
+    addcs   \remainder, \remainder, \neg, lsl #1
+    cmn     \neg, \remainder
+    addcs   \remainder, \remainder, \neg
+.endif
+.else
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv
+    cmn     \neg, \divisor, lsr #1
+    addcs   \divisor, \divisor, \neg, lsl #1
+    addcs   \quotient, \quotient, #2
+    cmn     \neg, \divisor
+    addcs   \quotient, \quotient, #1
+.endif
+.if \return
+    bx      lr
+.else
+    b       99f
+.endif
+20:
+    /* Handle division by powers of two by shifting right. Mod is handled by
+       using divisor-1 as a bitmask. */
+.ifnc "", "\remainder"
+.ifnc "", "\div0label"
+    bne     \div0label
+.endif
+.ifnc "", "\quotient"
+    sub     \divisor, \divisor, #1
+    rsb     \bits, \bits, #31
+    and     \remainder, \numerator, \divisor
+    mov     \quotient, \numerator, lsr \bits
+.else
+    sub     \divisor, \divisor, #1
+    and     \remainder, \numerator, \divisor
+.endif
+.else
+    rsb     \bits, \bits, #31
+.ifnc "", "\div0label"
+    bne     \div0label
+.endif
+    mov     \quotient, \numerator, lsr \bits
+.endif
+.if \return
+    bx      lr
+.else
+    b       99f
+.endif
+30:
+    /* Handle numerator < divisor: quotient is zero, remainder is the
+       numerator. */
+.ifnc "", "\remainder"
+    mov     \remainder, \numerator
+.endif
+.ifnc "", "\quotient"
+    mov     \quotient, #0
+.endif
+.if \return
+    bx      lr
+.endif
+#if ARM_ARCH >= 6
+40:
+    /* Handle large (sign bit set) numerators. Works exactly like the ARMv5e
+       code above the 10: label. */
+    umull   \bits, \inv, \numerator, \divisor
+    add     \numerator, \numerator, \neg
+.ifnc "", "\remainder"
+    mla     \remainder, \inv, \neg, \numerator
+.ifnc "", "\quotient"
+    mov     \quotient, \inv
+    cmn     \remainder, \neg
+    subcs   \remainder, \remainder, \neg
+    addpl   \remainder, \remainder, \neg, lsl #1
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+.else
+    cmn     \remainder, \neg
+    subcs   \remainder, \remainder, \neg
+    addpl   \remainder, \remainder, \neg, lsl #1
+.endif
+.else
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv
+    cmn     \divisor, \neg
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+.endif
+.if \return
+    bx      lr
+.else
+    b       99f
+.endif
+#endif
+99:
+.endm
+
+.macro ARMV5_SDIV32_BODY numerator, divisor, quotient, remainder, bits, inv, neg, sign, div0label, return
+    /* sign[31] = divisor sign */
+    ands    \sign, \divisor, #1<<31
+    rsbne   \divisor, \divisor, #0
+    /* sign[31] = result sign, sign[0:30], C = numerator sign */
+    eors    \sign, \sign, \numerator, asr #32
+    clz     \bits, \divisor
+    rsbcs   \numerator, \numerator, #0
+    /* On ARMv6, subtract divisor before performing division, which ensures
+       numerator sign bit is clear and smmul may be used in place of umull.
+       The fixup for the results can be fit entirely into existing delay
+       slots on the main division paths. It costs 1c in the num < divisor
+       path, and 1c in the power-of-2-divisor path. */
+#if ARM_ARCH >= 6
+    subs    \numerator, \numerator, \divisor
+#else
+    cmp     \numerator, \divisor
+#endif
+    movcs   \inv, \divisor, lsl \bits
+    bcc     30f
+    /* Test whether divisor is 2^N */
+    cmp     \inv, #1<<31
+    add     \inv, pc, \inv, lsr #25
+    bls     20f
+    /* Load approximate reciprocal */
+    ldrb    \inv, [\inv, #.L_udiv_est_table-.-64]
+    subs    \bits, \bits, #7
+    rsb     \neg, \divisor, #0
+    /* Scale approximate reciprocal, or else branch to large-divisor path */
+    movpl   \divisor, \inv, lsl \bits
+    bmi     10f
+    /* Newton-Raphson iteration to improve reciprocal accuracy */
+    mul     \inv, \divisor, \neg
+    smlawt  \divisor, \divisor, \inv, \divisor
+    mul     \inv, \divisor, \neg
+    /* Complete the N-R math and produce an approximate quotient. Use
+       smmla/smmul on ARMv6. */
+#if ARM_ARCH >= 6
+    smmla   \divisor, \divisor, \inv, \divisor
+    smmul   \inv, \numerator, \divisor
+#else
+    mov     \bits, #0
+    smlal   \bits, \divisor, \inv, \divisor
+    umull   \bits, \inv, \numerator, \divisor
+#endif
+    /* Calculate remainder and correct the quotient. */
+    add     \numerator, \numerator, \neg
+.ifnc "", "\remainder"
+    mla     \remainder, \inv, \neg, \numerator
+.ifnc "", "\quotient"
+#if ARM_ARCH >= 6
+    add     \quotient, \inv, #1
+#else
+    mov     \quotient, \inv
+#endif
+    cmn     \remainder, \neg
+    subcs   \remainder, \remainder, \neg
+    addpl   \remainder, \remainder, \neg, lsl #1
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+.else
+    cmn     \remainder, \neg
+    subcs   \remainder, \remainder, \neg
+    addpl   \remainder, \remainder, \neg, lsl #1
+.endif
+.else
+    mla     \divisor, \inv, \neg, \numerator
+#if ARM_ARCH >= 6
+    add     \quotient, \inv, #1
+#else
+    mov     \quotient, \inv
+#endif
+    cmn     \divisor, \neg
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+.endif
+    ARM_SDIV32_POST \quotient, \remainder, \sign
+.ifnc "", "\return"
+    \return
+.else
+    b       99f
+.endif
+10:
+    /* Very large divisors can be handled without further improving the
+       reciprocal. First the reciprocal must be reduced to ensure that it
+       underestimates the correct value. */
+    rsb     \bits, \bits, #0
+    sub     \inv, \inv, #4
+    mov     \divisor, \inv, lsr \bits
+    /* Calculate approximate quotient and remainder */
+#if ARM_ARCH >= 6
+    smmul   \inv, \numerator, \divisor
+#else
+    umull   \bits, \inv, \numerator, \divisor
+#endif
+    /* Correct quotient and remainder */
+.ifnc "", "\remainder"
+    mla     \remainder, \inv, \neg, \numerator
+.ifnc "", "\quotient"
+#if ARM_ARCH >= 6
+    add     \quotient, \inv, #1
+#else
+    mov     \quotient, \inv
+#endif
+    cmn     \neg, \remainder, lsr #1
+    addcs   \remainder, \remainder, \neg, lsl #1
+    addcs   \quotient, \quotient, #2
+    cmn     \neg, \remainder
+    addcs   \remainder, \remainder, \neg
+    addcs   \quotient, \quotient, #1
+.else
+    cmn     \neg, \remainder, lsr #1
+    addcs   \remainder, \remainder, \neg, lsl #1
+    cmn     \neg, \remainder
+    addcs   \remainder, \remainder, \neg
+.endif
+.else
+    mla     \divisor, \inv, \neg, \numerator
+#if ARM_ARCH >= 6
+    add     \quotient, \inv, #1
+#else
+    mov     \quotient, \inv
+#endif
+    cmn     \neg, \divisor, lsr #1
+    addcs   \divisor, \divisor, \neg, lsl #1
+    addcs   \quotient, \quotient, #2
+    cmn     \neg, \divisor
+    addcs   \quotient, \quotient, #1
+.endif
+    ARM_SDIV32_POST \quotient, \remainder, \sign
+.ifnc "", "\return"
+    \return
+.else
+    b       99f
+.endif
+20:
+    /* Handle division by powers of two by shifting right. Mod is handled by
+       using divisor-1 as a bitmask. */
+.ifnc "", "\div0label"
+    bne     \div0label
+.endif
+.ifnc "", "\remainder"
+.ifnc "", "\quotient"
+    rsb     \bits, \bits, #31
+#if ARM_ARCH >= 6
+    add     \numerator, \numerator, \divisor
+#endif
+    sub     \divisor, \divisor, #1
+    and     \remainder, \numerator, \divisor
+    mov     \quotient, \numerator, lsr \bits
+.else
+    sub     \divisor, \divisor, #1
+    and     \remainder, \numerator, \divisor
+.endif
+.else
+    rsb     \bits, \bits, #31
+#if ARM_ARCH >= 6
+    add     \numerator, \numerator, \divisor
+#endif
+    mov     \quotient, \numerator, lsr \bits
+.endif
+    ARM_SDIV32_POST \quotient, \remainder, \sign
+.ifnc "", "\return"
+    \return
+.else
+    b       99f
+.endif
+30:
+    /* Handle numerator < divisor - quotient is zero, remainder is numerator,
+       which must be restored to its original value on ARMv6. */
+.ifnc "", "\remainder"
+#if ARM_ARCH >= 6
+    add     \remainder, \numerator, \divisor
+#else
+.ifnc "\remainder", "\numerator"
+    mov     \remainder, \numerator
+.endif
+#endif
+.endif
+.ifnc "", "\quotient"
+    mov     \quotient, #0
+.endif
+    ARM_SDIV32_POST "", \remainder, \sign
+.ifnc "", "\return"
+    \return
+.endif
+99:
+.endm
+#endif
+
+    .section .text
+
+__div0_wrap_s:
+    sub     sp, sp, #4
+    mov     r0, #0
+    b       __div0
+    .size   __div0_wrap_s, . - __div0_wrap_s
+
+__div0_wrap:
+    str     lr, [sp, #-4]!
+    mov     r0, #0
+    b       __div0
+    .size   __div0_wrap, . - __div0_wrap
+
+#ifndef __ARM_EABI__
+    .global __divsi3
+    .type   __divsi3,%function
+    .global __udivsi3
+    .type   __udivsi3,%function
+#else
+/* The div+mod averages a fraction of a cycle worse for signed values, and
+   slightly better for unsigned, so just alias div to divmod. */
+    .global __aeabi_uidivmod
+    .type   __aeabi_uidivmod,%function
+    .global __aeabi_uidiv
+    .type   __aeabi_uidiv,%function
+    .set    __aeabi_uidiv,__aeabi_uidivmod
+    .global __aeabi_idivmod
+    .type   __aeabi_idivmod,%function
+    .global __aeabi_idiv
+    .type   __aeabi_idiv,%function
+    .set    __aeabi_idiv,__aeabi_idivmod
+#endif
+
+#if ARM_ARCH < 5
+    .global __clzsi2
+    .type   __clzsi2, %function
+
+__clzsi2:
+    orr     r0, r0, r0, lsr #8
+    orr     r0, r0, r0, lsr #4
+    orr     r0, r0, r0, lsr #2
+    orr     r0, r0, r0, lsr #1
+    bic     r0, r0, r0, lsr #16
+    rsb     r0, r0, r0, lsl #14
+    rsb     r0, r0, r0, lsl #11
+    rsb     r0, r0, r0, lsl #9
+    ldrb    r0, [pc, r0, lsr #26]
+    bx      lr
+    .byte   32, 20, 19,  0,  0, 18,  0,  7, 10, 17,  0,  0, 14,  0,  6,  0
+    .byte    0,  9,  0, 16,  0,  0,  1, 26,  0, 13,  0,  0, 24,  5,  0,  0
+    .byte    0, 21,  0,  8, 11,  0, 15,  0,  0,  0,  0,  2, 27,  0, 25,  0
+    .byte   22,  0, 12,  0,  0,  3, 28,  0, 23,  0,  4, 29,  0,  0, 30, 31
+    .size   __clzsi2, . - __clzsi2
+
+#ifndef __ARM_EABI__
+__udivsi3:
+    ARMV4_UDIV32_BODY r0, r1, r0, "", r2, r3, __div0_wrap, 1
+    .size   __udivsi3, . - __udivsi3
+
+__divsi3:
+    ARMV4_SDIV32_BODY r0, r1, r0, "", r2, r3, __div0_wrap, 1
+    .size   __divsi3, . - __divsi3
+
+#else
+__aeabi_uidivmod:
+    ARMV4_UDIV32_BODY r0, r1, r0, r1, r2, r3, __div0_wrap, 1
+    .size   __aeabi_uidivmod, . - __aeabi_uidivmod
+
+__aeabi_idivmod:
+    ARMV4_SDIV32_BODY r0, r1, r0, r1, r2, r3, __div0_wrap, 1
+    .size   __aeabi_idivmod, . - __aeabi_idivmod
+#endif
+
+#else
+#ifndef __ARM_EABI__
+__udivsi3:
+    ARMV5_UDIV32_BODY r0, r1, r0, "", r2, r3, ip, __div0_wrap, 1
+    .size   __udivsi3, . - __udivsi3
+
+__divsi3:
+    str     lr, [sp, #-4]
+    ARMV5_SDIV32_BODY r0, r1, r0, "", r2, lr, ip, r3, __div0_wrap_s, "ldr pc, [sp, #-4]"
+    .size   __divsi3, . - __divsi3
+
+#else
+__aeabi_uidivmod:
+    ARMV5_UDIV32_BODY r0, r1, r0, r1, r2, r3, ip, __div0_wrap, 1
+    .size   __aeabi_uidivmod, . - __aeabi_uidivmod
+
+__aeabi_idivmod:
+    str     lr, [sp, #-4]
+    ARMV5_SDIV32_BODY r0, r1, r0, r1, r2, lr, ip, r3, __div0_wrap_s, "ldr pc, [sp, #-4]"
+    .size   __aeabi_idivmod, . - __aeabi_idivmod
+#endif
+
+.L_udiv_est_table:
+    .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
+    .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
+    .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
+    .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
+    .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
+    .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
+    .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
+    .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
+#endif
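
---

Notes on the algorithms, with rough C models. None of the C below is part of
the patch; function names are illustrative.

The ARMv4 bodies are a shift-and-subtract divider: a binary search over
shifts (the unrolled .rept 5 ladder) aligns the divisor just below the
numerator, then one subtract/shift step runs per quotient bit. A C model of
that control flow, ignoring the SWAR register packing and the
reduced-numerator fixup path at 10::

    #include <stdint.h>

    static uint32_t udiv32_model(uint32_t n, uint32_t d, uint32_t *rem)
    {
        uint32_t q = 0;
        int bits = 0;

        if (d == 0 || n < d) {      /* the real code raises __div0 for d == 0 */
            if (rem) *rem = n;
            return 0;
        }
        /* Align d below n: greedy binary search over shifts 16, 8, 4, 2, 1,
           mirroring the cmn/subcs/movcs ladder. */
        for (int s = 16; s > 0; s >>= 1)
            if (d <= (n >> s)) { d <<= s; bits += s; }
        /* One restoring subtract/shift step per quotient bit. */
        for (int i = 0; i <= bits; i++) {
            q <<= 1;
            if (n >= d) { n -= d; q |= 1; }
            d >>= 1;
        }
        if (rem) *rem = n;          /* n has shrunk to n % d */
        return q;
    }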
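
ARM_SDIV32_PRE/ARM_SDIV32_POST reduce signed division to the unsigned case:
the quotient is negated when exactly one operand is negative, and the
remainder takes the numerator's sign. The same rule in C, reusing
udiv32_model() from the sketch above:

    #include <stdint.h>

    static int32_t sdiv32_model(int32_t n, int32_t d, int32_t *rem)
    {
        /* 0u - x avoids overflow when negating INT32_MIN; the overflowing
           quotient cases (e.g. INT32_MIN / -1) are undefined, as in C. */
        uint32_t un = (n < 0) ? 0u - (uint32_t)n : (uint32_t)n;
        uint32_t ud = (d < 0) ? 0u - (uint32_t)d : (uint32_t)d;
        uint32_t ur, uq = udiv32_model(un, ud, &ur);

        if (rem) *rem = (n < 0) ? -(int32_t)ur : (int32_t)ur;
        return ((n < 0) != (d < 0)) ? -(int32_t)uq : (int32_t)uq;
    }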
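
The ARMv5 bodies replace the bit loop with a multiply by an approximate
reciprocal: clz normalizes the divisor, the top bits of the normalized value
index .L_udiv_est_table, and two Newton-Raphson steps square the estimate's
accuracy before one long multiply produces the quotient. A C model of the
main path only, for divisors below 1<<25 that are not powers of two (the
assembly diverts other inputs to the 10:, 20:, 30: and 40: paths); 64-bit
multiplies stand in for umull/smlawt/smlal, so the rounding details differ,
and __builtin_clz (the GCC builtin) stands in for the clz instruction:

    #include <stdint.h>

    /* Transcribed from .L_udiv_est_table: for dn normalized to
       [1<<31, 1<<32), entry (dn >> 25) - 64 approximates 2^39 / dn. */
    static const uint8_t udiv_est_table[64] = {
        0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6,
        0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf,
        0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc,
        0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac,
        0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f,
        0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93,
        0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89,
        0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81,
    };

    static uint32_t udiv32_recip_model(uint32_t n, uint32_t d, uint32_t *rem)
    {
        int bits = __builtin_clz(d) - 7;         /* >= 0 since d < 1<<25 */
        uint32_t dn = d << (bits + 7);           /* normalize: [1<<31, 1<<32) */
        uint32_t x = (uint32_t)udiv_est_table[(dn >> 25) - 64] << bits;

        /* x ~ 2^32/d. Each step computes the scaled error e = 2^32 - d*x
           (wrapped arithmetic) and adds back x*e/2^32, squaring the
           relative error. */
        for (int i = 0; i < 2; i++) {
            int32_t e = -(int32_t)(d * x);
            x += (uint32_t)(((int64_t)x * e) >> 32);
        }
        uint32_t q = (uint32_t)(((uint64_t)n * x) >> 32);  /* never too high */
        uint32_t r = n - q * d;
        while (r >= d) { r -= d; q++; }   /* assembly bounds this at +3 */
        if (rem) *rem = r;
        return q;
    }

For example, n = 100, d = 3 seeds x = 0x55000000 from the table, refines it
to 0x55550000 and then 0x55555555, and yields q = 33, r = 1.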
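
Both ARMv5 bodies catch powers of two before the reciprocal math: the
normalized divisor equals 1<<31 exactly when d is 2^k, so the 20: labels
reduce the quotient to a right shift and the remainder to a mask. The whole
path in C:

    #include <stdint.h>

    static uint32_t udiv32_pow2(uint32_t n, uint32_t d, uint32_t *rem)
    {
        /* caller guarantees d == 1u << k */
        if (rem) *rem = n & (d - 1);
        return n >> (31 - __builtin_clz(d));
    }

For example, udiv32_pow2(100, 8, &r) returns 12 with r == 4.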
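
The ARMv4 __clzsi2 is branch-free: four orr instructions smear the leading
set bit into the 15 bits below it, bic keeps only that top block of the
smear, and three rsb instructions multiply by (2^14-1)(2^11-1)(2^9-1) so the
top six bits of the product form a perfect hash into the 64-byte table. A C
model, with the table transcribed from the patch (clz(0) is defined as 32,
matching entry 0):

    #include <stdint.h>

    static const uint8_t clz_table[64] = {
        32, 20, 19,  0,  0, 18,  0,  7, 10, 17,  0,  0, 14,  0,  6,  0,
         0,  9,  0, 16,  0,  0,  1, 26,  0, 13,  0,  0, 24,  5,  0,  0,
         0, 21,  0,  8, 11,  0, 15,  0,  0,  0,  0,  2, 27,  0, 25,  0,
        22,  0, 12,  0,  0,  3, 28,  0, 23,  0,  4, 29,  0,  0, 30, 31,
    };

    static int clzsi2_model(uint32_t x)
    {
        /* Smear the leading bit into the 15 bits below it (the orr chain). */
        x |= x >> 8; x |= x >> 4; x |= x >> 2; x |= x >> 1;
        /* Clear everything 16 or more bits below the leading bit, so only
           the top smear block survives (the bic). */
        x &= ~(x >> 16);
        /* Multiply by (2^14-1)(2^11-1)(2^9-1), as the three rsb
           instructions do, and index the table with the top six bits. */
        x = (x << 14) - x;
        x = (x << 11) - x;
        x = (x << 9) - x;
        return clz_table[x >> 26];
    }

Spot checks: 0 -> 32, 1 -> 31, 0x10000 -> 15, 0x80000000 -> 0.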