rockbox/apps/codecs/lib/udiv32_armv4.S
Andrew Mahone 934514558b Remove special cases from udiv32_armv4.S, except for zero divisor and large numerator. Improvement of 1.23MHz on e200 with ape normal.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24142 a1c6a512-1295-4272-9138-f99709370657
2010-01-02 15:15:21 +00:00

134 lines
No EOL
4.6 KiB
ArmAsm

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2008 by Jens Arnold
* Copyright (C) 2009 by Andrew Mahone
*
* Optimised unsigned integer division for ARMv4
*
* Based on: libgcc routines for ARM cpu.
* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
* Free Software Foundation, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
/* Codecs should not normally do this, but we need to check a macro, and
* codecs.h would confuse the assembler. */
/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
for dividing a 30-bit value by a 15-bit value, which needs only two
operations per iteration because it stores quotient and remainder together
and adds the previous quotient bit during the trial subtraction. Modified to
work with any dividend and divisor both less than 1 << 30, and to skip
unneeded trials by first calculating the number of bits in the output.
*/
/* ARM_DIV_31_BODY: unsigned division by shift-and-subtract. Quotient bits and
 * the running remainder are accumulated together in \result and separated
 * at the end.
 *
 * Parameters (all are registers):
 *   \dividend  in:  numerator. Overwritten at the end with \result >> \bits
 *                   (the remainder part of the combined value).
 *   \divisor   in:  denominator. Overwritten: shifted left, then negated.
 *   \result    scratch: combined remainder/quotient accumulator.
 *   \bits      scratch: 1 + total left shift applied to \divisor.
 *   \curbit    scratch: number of unrolled steps to skip (31 - \bits).
 *   \quotient  out: the low \bits bits of \result. May be the same register
 *                   as \dividend (the caller in this file passes r0 for both).
 * Flags are clobbered.
 * NOTE(review): the only caller in this file uses this macro when bit 31 of
 * the dividend is clear; presumably that is the required precondition -
 * confirm before reusing the macro elsewhere.
 */
.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient
/* Binary search over shifts of 16, 8, 4, 2, 1: whenever the divisor is lower
 * than or equal to the dividend shifted right by k, shift the divisor left by
 * k and add k to the bit count (which starts at 1). */
mov \bits, #1
cmp \divisor, \dividend, lsr #16
movls \divisor, \divisor, lsl #16
addls \bits, \bits, #16
cmp \divisor, \dividend, lsr #8
movls \divisor, \divisor, lsl #8
addls \bits, \bits, #8
cmp \divisor, \dividend, lsr #4
movls \divisor, \divisor, lsl #4
addls \bits, \bits, #4
cmp \divisor, \dividend, lsr #2
movls \divisor, \divisor, lsl #2
addls \bits, \bits, #2
cmp \divisor, \dividend, lsr #1
movls \divisor, \divisor, lsl #1
addls \bits, \bits, #1
/* Negate the shifted divisor so that each trial subtraction is an addition
 * whose carry-out is set exactly when the subtraction did not borrow. */
rsb \divisor, \divisor, #0
/* First trial: carry set means the shifted divisor fit; otherwise undo it
 * (subtracting the negated divisor adds the divisor back). */
adds \result, \dividend, \divisor
subcc \result, \result, \divisor
/* Skip the steps that are not needed. Neither rsb nor add updates the flags,
 * so the carry from the first trial is still live at the jump target. In ARM
 * state pc reads as this instruction's address + 8, i.e. the first step after
 * the nop; each step is two instructions (8 bytes), hence lsl #3. */
rsb \curbit, \bits, #31
add pc, pc, \curbit, lsl #3
/* Padding so that a skip count of 0 lands on the first unrolled step. */
nop
.rept 30
/* One step: shift the accumulator left, bring in the previous quotient bit
 * through the carry and try to subtract the divisor; undo on borrow. */
adcs \result, \divisor, \result, lsl #1
subcc \result, \result, \divisor
.endr
/* shift remainder/quotient left one, add final quotient bit */
adc \result, \result, \result
/* Split the accumulator: the bits above the low \bits bits go to \dividend,
 * and XORing them back out of \result leaves only the low \bits bits. */
mov \dividend, \result, lsr \bits
eor \quotient, \result, \dividend, lsl \bits
.endm
/* ARM_DIV_32_BODY: unsigned division using unrolled compare/subtract steps,
 * producing one quotient bit per step from the highest needed bit down to
 * bit 0.
 *
 * Parameters (all are registers):
 *   \dividend  in:  numerator. Reduced by every successful subtraction; the
 *                   final step does not update it, so it does not hold a
 *                   valid remainder on exit.
 *   \divisor   in:  denominator. Not modified.
 *   \result    out: quotient (used as scratch while the skip count is
 *                   being calculated).
 *   \curbit    scratch: number of instructions to skip in the unrolled code.
 * Flags are clobbered.
 */
.macro ARM_DIV_32_BODY dividend, divisor, result, curbit
/* Work on a copy of the dividend while estimating how many quotient bits are
 * needed. Start by assuming that 30 of the unrolled steps (3 instructions
 * each) can be skipped, then give back 16, 8, 4 and 2 steps whenever the
 * divisor is lower than or equal to the copy shifted right by that amount. */
mov \result, \dividend
mov \curbit, #90 @ 3 * 30, (calculating branch dest)
cmp \divisor, \result, lsr #16
movls \result,\result, lsr #16
subls \curbit, \curbit, #48
cmp \divisor, \result, lsr #8
movls \result,\result, lsr #8
subls \curbit, \curbit, #24
cmp \divisor, \result, lsr #4
movls \result,\result, lsr #4
subls \curbit, \curbit, #12
/* Last test: only the skip count is adjusted; the copy is not shifted
 * because no further test follows. */
cmp \divisor, \result, lsr #2
subls \curbit, \curbit, #6
@ Calculation is only done down to shift=2, because the shift=1 step
@ would need 3 more cycles, but would only gain 1.5 cycles on average.
/* The copy is no longer needed: \result becomes the quotient accumulator. */
mov \result, #0
/* Computed jump into the unrolled steps. \curbit counts instructions, hence
 * lsl #2 (4 bytes each). In ARM state pc reads as this instruction's address
 * + 8, i.e. the first instruction after the nop. */
add pc, pc, \curbit, lsl #2
/* Padding so that a skip count of 0 lands on the shift=31 step. */
nop
.set shift, 32
.rept 31
.set shift, shift - 1
/* If (divisor << shift) fits into the remaining dividend, set quotient bit
 * `shift` and subtract it. */
cmp \divisor, \dividend, lsr #shift
orrls \result, \result, #(1 << shift)
subls \dividend, \dividend, \divisor, lsl #shift
.endr @ shift==0 in the .rept would cause a warning for lsr #0
/* Final step (shift 0), written out without a shift operand. */
cmp \divisor, \dividend
orrls \result, \result, #1
@subls \dividend, \dividend, \divisor @ correct remainder not needed
.endm
#ifdef USE_IRAM
    .section    .icode,"ax",%progbits
#else
    .text
#endif
    .align
    .global     udiv32_arm
    .type       udiv32_arm,%function

/* udiv32_arm: unsigned 32-bit integer division.
 *
 *   In:   r0 = dividend, r1 = divisor
 *   Out:  r0 = quotient; 0 if the divisor is 0
 *   May clobber: r1, r2, r3, ip and the flags
 *
 * Returns with "bx lr"; no stack is used.
 */
udiv32_arm:
    cmp     r1, #0
    beq     20f                     /* zero divisor: return 0 */
    tst     r0, r0                  /* N = bit 31 of the dividend */
    /* ARM_DIV_31_BODY is only used when bit 31 of the dividend is clear;
       a dividend with bit 31 set goes through ARM_DIV_32_BODY instead.
       No dividend >= divisor test is made here: a divisor larger than the
       dividend simply runs through the 31-bit body as well.
    */
    bmi     10f
    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0
    bx      lr
10:
    ARM_DIV_32_BODY r0, r1, r2, r3
    mov     r0, r2                  /* quotient was accumulated in r2 */
    bx      lr
20:
    /* Only reached through the beq above, so Z is always set here. The
       former "movne" could therefore never execute and the dividend was
       returned unchanged; the move has to be unconditional to return 0. */
    mov     r0, #0
    bx      lr