rockbox/apps/codecs/lib/udiv32_armv4.S

/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2008 by Jens Arnold
 * Copyright (C) 2009 by Andrew Mahone
 *
 * Optimised unsigned integer division for ARMv4
 *
 * Based on: libgcc routines for ARM cpu.
 * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
 * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
 * Free Software Foundation, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

#include "config.h"
/* Codecs should not normally do this, but we need to check a macro, and
 * codecs.h would confuse the assembler. */

/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
   for dividing a 30-bit value by a 15-bit value, with two operations per
   iteration by storing quotient and remainder together and adding the previous
   quotient bit during trial subtraction. Modified to work with any dividend
   and divisor both less than 1 << 30, and skipping trials by calculating bits
   in output.
*/
/*
 * ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient
 *
 * Unsigned division by unrolled trial subtraction, using two instructions
 * per quotient bit. All six arguments are registers:
 *   dividend  in:  numerator
 *             out: result >> bits, i.e. the field above the quotient bits
 *   divisor   in:  denominator; clobbered (shifted left, then negated)
 *   result    scratch: combined remainder/quotient working register
 *   bits      scratch: number of trial subtractions performed, which is
 *             1 + the total left shift applied to the divisor
 *   curbit    scratch: number of unrolled steps to skip
 *   quotient  out: the low "bits" bits of result
 * The condition flags are clobbered. quotient may name the same register as
 * dividend (the caller in this file does so); the final eor reads dividend
 * before quotient is written.
 * The caller in this file enters only with a non-zero divisor and with bit 31
 * of the dividend clear.
 */
.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient

    mov     \bits, #1             @ one trial subtraction is always performed
    /* Normalise the divisor: for k = 16, 8, 4, 2, 1, if the divisor is still
       <= (dividend >> k), shift it left by k and add k to the bit count. */
    cmp     \divisor, \dividend, lsr #16
    movls   \divisor, \divisor, lsl #16
    addls   \bits, \bits, #16
    cmp     \divisor, \dividend, lsr #8
    movls   \divisor, \divisor, lsl #8
    addls   \bits, \bits, #8
    cmp     \divisor, \dividend, lsr #4
    movls   \divisor, \divisor, lsl #4
    addls   \bits, \bits, #4
    cmp     \divisor, \dividend, lsr #2
    movls   \divisor, \divisor, lsl #2
    addls   \bits, \bits, #2
    cmp     \divisor, \dividend, lsr #1
    movls   \divisor, \divisor, lsl #1
    addls   \bits, \bits, #1
    /* Negate the divisor so that adding it performs the subtraction and the
       carry flag reports whether the subtraction succeeded. */
    rsb     \divisor, \divisor, #0
    /* First trial: carry is set when dividend >= normalised divisor. */
    adds    \result, \dividend, \divisor
    /* Trial failed: undo it. subcc does not update the flags, so the carry
       stays clear and is picked up as a 0 quotient bit by the next adc/adcs. */
    subcc   \result, \result, \divisor
    rsb     \curbit, \bits, #31   @ unrolled steps that are not needed
    /* Computed jump into the unrolled steps below. Each step is two 4-byte
       instructions, hence the lsl #3. Reading pc in ARM state yields the
       address two instructions ahead, so the nop fills the slot between this
       add and the first step. Neither rsb nor add changes the carry flag. */
    add     pc, pc, \curbit, lsl #3
    nop
    .rept   30
    /* result = 2 * result + previous quotient bit - divisor; the carry out
       is the next quotient bit. */
    adcs    \result, \divisor, \result, lsl #1
    subcc   \result, \result, \divisor  @ trial failed: restore
    .endr
    /* shift remainder/quotient left one, add final quotient bit */
    adc     \result, \result, \result
    /* Split the combined register: dividend receives the field above the
       quotient bits, and xor-ing that field back out of result leaves only
       the quotient. */
    mov     \dividend, \result, lsr \bits
    eor     \quotient, \result, \dividend, lsl \bits
.endm

/*
 * ARM_DIV_32_BODY dividend, divisor, result, curbit
 *
 * Unsigned division by unrolled compare-and-subtract, using three
 * instructions per quotient bit. All four arguments are registers:
 *   dividend  in: numerator; clobbered. It is reduced as quotient bits are
 *             found, but the last subtraction is omitted, so it is not a
 *             valid remainder afterwards.
 *   divisor   in: denominator; not written
 *   result    out: quotient (also used as a temporary while sizing)
 *   curbit    scratch: number of instructions to skip in the unrolled code
 * The condition flags are clobbered and the assembler symbol "shift" is
 * redefined.
 */
.macro ARM_DIV_32_BODY dividend, divisor, result, curbit

    /* Sizing phase: result holds a working copy of the dividend and curbit
       starts at the maximum skip count. */
    mov     \result, \dividend
    mov     \curbit, #90          @ 3 * 30, (calculating branch dest)
    /* For k = 16, 8, 4: if divisor <= (result >> k), the quotient needs k more
       bits, so keep only the shifted-down copy for the following tests and
       skip 3 * k fewer instructions. */
    cmp     \divisor, \result, lsr #16
    movls   \result,\result, lsr #16
    subls   \curbit, \curbit, #48
    cmp     \divisor, \result, lsr #8
    movls   \result,\result, lsr #8
    subls   \curbit, \curbit, #24
    cmp     \divisor, \result, lsr #4
    movls   \result,\result, lsr #4
    subls   \curbit, \curbit, #12
    /* k = 2: only the skip count is adjusted, since result is cleared next. */
    cmp     \divisor, \result, lsr #2
    subls   \curbit, \curbit, #6
    @ Calculation is only done down to shift=2, because the shift=1 step
    @ would need 3 more cycles, but would only gain 1.5 cycles on average.
    mov     \result, #0           @ from here on result collects the quotient
    /* Computed jump: skip curbit 4-byte instructions of the unrolled code.
       Reading pc in ARM state yields the address two instructions ahead, so
       the nop fills the slot between this add and the first step. */
    add     pc, pc, \curbit, lsl #2
    nop
    .set    shift, 32
    .rept   31
    .set    shift, shift - 1
    /* One step per bit, for shift = 31 down to 1. The test is made on
       (dividend >> shift), so (divisor << shift) is only formed when it is
       known to be <= dividend. On success, set the quotient bit and subtract. */
    cmp     \divisor, \dividend, lsr #shift
    orrls   \result, \result, #(1 << shift)
    subls   \dividend, \dividend, \divisor, lsl #shift
    .endr   @ shift==0 in the .rept would cause a warning  for lsr #0
    /* Final step for bit 0, written out without a shift. */
    cmp     \divisor, \dividend
    orrls   \result, \result, #1
    @subls  \dividend, \dividend, \divisor  @ correct remainder not needed
.endm

/* Section selection: when USE_IRAM is defined the routine is emitted into the
   allocatable, executable section .icode; otherwise it goes into .text.
   NOTE(review): USE_IRAM is presumably provided through config.h, and the
   placement of .icode is decided by the linker script - confirm both. */
#ifdef USE_IRAM
    .section    .icode,"ax",%progbits
#else
    .text
#endif
    .align
    /* Export udiv32_arm and mark the symbol as a function. */
    .global udiv32_arm
    .type   udiv32_arm,%function

/*
 * udiv32_arm - unsigned 32-bit integer division
 *
 * In:    r0 = dividend, r1 = divisor
 * Out:   r0 = quotient; 0 when the divisor is 0
 * Clobb: r1, r2, r3, ip and the condition flags
 *
 * Dividends with bit 31 clear use ARM_DIV_31_BODY (two instructions per
 * quotient bit); all others use ARM_DIV_32_BODY (three per bit).
 */
udiv32_arm:
    cmp     r1, #0
    beq     20f                   @ division by zero
    tst     r0, r0
    /* Bit 31 of the dividend must be clear for ARM_DIV_31_BODY; otherwise
       fall back to ARM_DIV_32_BODY. No ordering between dividend and divisor
       is assumed here: when the divisor is the larger one (including a
       divisor with bit 31 set), the 31-bit body applies no normalisation
       shift, its single trial subtraction fails and the quotient is 0.
    */
    bmi     10f
    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0
    bx      lr

10:
    ARM_DIV_32_BODY r0, r1, r2, r3
    mov     r0, r2
    bx      lr

20:
    /* Reached only through the beq above, i.e. always with Z set. A move
       conditional on NE could never execute here and would hand the dividend
       back unchanged, so the zero result is written unconditionally. */
    mov     r0, #0
    bx      lr
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00			`/***************************************************************************`
			`* __________ __ ___.`
			`* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___`
			`* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /`
			`* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <`
			`* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \`
			`* \/ \/ \/ \/ \/`
			`* $Id$`
			`*`
			`* Copyright (C) 2008 by Jens Arnold`
Add 31/31-bit unsigned division in apps/codecs/lib/udiv_arm.S, with 2 cycles / iteration, falling back to previous 32-bit, 3 cycle / iteration code when needed (well under 1% of divisions in sample file). APE normal sample is now 96.90% realtime, approx 1.3% improved vs svn. TODO: unify divisor normalization for both trial subtraction routines, possibly use divisor bits to select 31- vs 32-bit division. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24130 a1c6a512-1295-4272-9138-f99709370657 2009-12-31 08:32:15 +00:00			`* Copyright (C) 2009 by Andrew Mahone`
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00			`*`
			`* Optimised unsigned integer division for ARMv4`
			`*`
			`* Based on: libgcc routines for ARM cpu.`
			`* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)`
			`* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005`
			`* Free Software Foundation, Inc.`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU General Public License`
			`* as published by the Free Software Foundation; either version 2`
			`* of the License, or (at your option) any later version.`
			`*`
			`* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY`
			`* KIND, either express or implied.`
			`*`
			`****************************************************************************/`

			`#include "config.h"`
			`/* Codecs should not normally do this, but we need to check a macro, and`
			`* codecs.h would confuse the assembler. */`

Add 31/31-bit unsigned division in apps/codecs/lib/udiv_arm.S, with 2 cycles / iteration, falling back to previous 32-bit, 3 cycle / iteration code when needed (well under 1% of divisions in sample file). APE normal sample is now 96.90% realtime, approx 1.3% improved vs svn. TODO: unify divisor normalization for both trial subtraction routines, possibly use divisor bits to select 31- vs 32-bit division. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24130 a1c6a512-1295-4272-9138-f99709370657 2009-12-31 08:32:15 +00:00			`/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)`
			`for dividing a 30-bit value by a 15-bit value, with two operations per`
			`iteration by storing quotient and remainder together and adding the previous`
			`quotient bit during trial subtraction. Modified to work with any dividend`
			`and divisor both less than 1 << 30, and skipping trials by calculating bits`
			`in output.`
			`*/`
			`.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient`

			`mov \bits, #1`
			`cmp \divisor, \dividend, lsr #16`
			`movls \divisor, \divisor, lsl #16`
			`addls \bits, \bits, #16`
			`cmp \divisor, \dividend, lsr #8`
			`movls \divisor, \divisor, lsl #8`
			`addls \bits, \bits, #8`
			`cmp \divisor, \dividend, lsr #4`
			`movls \divisor, \divisor, lsl #4`
			`addls \bits, \bits, #4`
			`cmp \divisor, \dividend, lsr #2`
			`movls \divisor, \divisor, lsl #2`
			`addls \bits, \bits, #2`
			`cmp \divisor, \dividend, lsr #1`
			`movls \divisor, \divisor, lsl #1`
			`addls \bits, \bits, #1`
			`rsb \divisor, \divisor, #0`
			`adds \result, \dividend, \divisor`
			`subcc \result, \result, \divisor`
			`rsb \curbit, \bits, #31`
			`add pc, pc, \curbit, lsl #3`
			`nop`
			`.rept 30`
			`adcs \result, \divisor, \result, lsl #1`
			`subcc \result, \result, \divisor`
			`.endr`
			`/* shift remainder/quotient left one, add final quotient bit */`
			`adc \result, \result, \result`
			`mov \dividend, \result, lsr \bits`
			`eor \quotient, \result, \dividend, lsl \bits`
			`.endm`

			`.macro ARM_DIV_32_BODY dividend, divisor, result, curbit`
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00
			`mov \result, \dividend`
ARMv4 unsigned integer division: Using an overflow-safe comparison method in the main calculation allows to put back the 1.5 cyle (average) optimisation. Shaved off another instruction, as we don't need the remainder. * Use the very efficient ffs algorithm from ffs-arm.S for dividing by a power of 2. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19032 a1c6a512-1295-4272-9138-f99709370657 2008-11-06 21:21:33 +00:00			`mov \curbit, #90 @ 3 * 30, (calculating branch dest)`
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00			`cmp \divisor, \result, lsr #16`
			`movls \result,\result, lsr #16`
			`subls \curbit, \curbit, #48`
			`cmp \divisor, \result, lsr #8`
			`movls \result,\result, lsr #8`
			`subls \curbit, \curbit, #24`
			`cmp \divisor, \result, lsr #4`
			`movls \result,\result, lsr #4`
			`subls \curbit, \curbit, #12`
			`cmp \divisor, \result, lsr #2`
			`subls \curbit, \curbit, #6`
ARMv4 unsigned integer division: Using an overflow-safe comparison method in the main calculation allows to put back the 1.5 cyle (average) optimisation. Shaved off another instruction, as we don't need the remainder. * Use the very efficient ffs algorithm from ffs-arm.S for dividing by a power of 2. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19032 a1c6a512-1295-4272-9138-f99709370657 2008-11-06 21:21:33 +00:00			`@ Calculation is only done down to shift=2, because the shift=1 step`
			`@ would need 3 more cycles, but would only gain 1.5 cycles on average.`
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00			`mov \result, #0`
			`add pc, pc, \curbit, lsl #2`
			`nop`
			`.set shift, 32`
ARMv4 unsigned integer division: Using an overflow-safe comparison method in the main calculation allows to put back the 1.5 cyle (average) optimisation. Shaved off another instruction, as we don't need the remainder. * Use the very efficient ffs algorithm from ffs-arm.S for dividing by a power of 2. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19032 a1c6a512-1295-4272-9138-f99709370657 2008-11-06 21:21:33 +00:00			`.rept 31`
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00			`.set shift, shift - 1`
ARMv4 unsigned integer division: Using an overflow-safe comparison method in the main calculation allows to put back the 1.5 cyle (average) optimisation. Shaved off another instruction, as we don't need the remainder. * Use the very efficient ffs algorithm from ffs-arm.S for dividing by a power of 2. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19032 a1c6a512-1295-4272-9138-f99709370657 2008-11-06 21:21:33 +00:00			`cmp \divisor, \dividend, lsr #shift`
			`orrls \result, \result, #(1 << shift)`
			`subls \dividend, \dividend, \divisor, lsl #shift`
			`.endr @ shift==0 in the .rept would cause a warning for lsr #0`
			`cmp \divisor, \dividend`
			`orrls \result, \result, #1`
			`@subls \dividend, \dividend, \divisor @ correct remainder not needed`
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00			`.endm`

			`#ifdef USE_IRAM`
			`.section .icode,"ax",%progbits`
			`#else`
			`.text`
			`#endif`
			`.align`
			`.global udiv32_arm`
			`.type udiv32_arm,%function`

			`udiv32_arm:`
Remove special cases from udiv32_armv4.S, except for zero divisor and large numerator. Improvement of 1.23MHz on e200 with ape normal. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24142 a1c6a512-1295-4272-9138-f99709370657 2010-01-02 15:15:21 +00:00			`cmp r1, #0`
			`beq 20f`
Add 31/31-bit unsigned division in apps/codecs/lib/udiv_arm.S, with 2 cycles / iteration, falling back to previous 32-bit, 3 cycle / iteration code when needed (well under 1% of divisions in sample file). APE normal sample is now 96.90% realtime, approx 1.3% improved vs svn. TODO: unify divisor normalization for both trial subtraction routines, possibly use divisor bits to select 31- vs 32-bit division. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24130 a1c6a512-1295-4272-9138-f99709370657 2009-12-31 08:32:15 +00:00			`tst r0, r0`
			`/* High bit must be unset, otherwise use ARM_DIV_32_BODY. High bit of`
			`divisor is also unset dividend has been tested to be >= divisor.`
			`*/`
Remove special cases from udiv32_armv4.S, except for zero divisor and large numerator. Improvement of 1.23MHz on e200 with ape normal. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24142 a1c6a512-1295-4272-9138-f99709370657 2010-01-02 15:15:21 +00:00			`bmi 10f`
Add 31/31-bit unsigned division in apps/codecs/lib/udiv_arm.S, with 2 cycles / iteration, falling back to previous 32-bit, 3 cycle / iteration code when needed (well under 1% of divisions in sample file). APE normal sample is now 96.90% realtime, approx 1.3% improved vs svn. TODO: unify divisor normalization for both trial subtraction routines, possibly use divisor bits to select 31- vs 32-bit division. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24130 a1c6a512-1295-4272-9138-f99709370657 2009-12-31 08:32:15 +00:00			`ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0`
			`bx lr`
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00
Remove special cases from udiv32_armv4.S, except for zero divisor and large numerator. Improvement of 1.23MHz on e200 with ape normal. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24142 a1c6a512-1295-4272-9138-f99709370657 2010-01-02 15:15:21 +00:00			`10:`
Add 31/31-bit unsigned division in apps/codecs/lib/udiv_arm.S, with 2 cycles / iteration, falling back to previous 32-bit, 3 cycle / iteration code when needed (well under 1% of divisions in sample file). APE normal sample is now 96.90% realtime, approx 1.3% improved vs svn. TODO: unify divisor normalization for both trial subtraction routines, possibly use divisor bits to select 31- vs 32-bit division. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24130 a1c6a512-1295-4272-9138-f99709370657 2009-12-31 08:32:15 +00:00			`ARM_DIV_32_BODY r0, r1, r2, r3`
Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657 2008-11-05 00:10:05 +00:00			`mov r0, r2`
			`bx lr`

			`20:`
			`movne r0, #0`
Remove special cases from udiv32_armv4.S, except for zero divisor and large numerator. Improvement of 1.23MHz on e200 with ape normal. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24142 a1c6a512-1295-4272-9138-f99709370657 2010-01-02 15:15:21 +00:00			`bx lr`