/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 *
 * $Id$
 *
 * Copyright (C) 2004 by Jens Arnold
 *
 * All files in this archive are subject to the GNU General Public License.
 * See the file COPYING in the source tree root for full license agreement.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
#include "config.h"

#ifdef CPU_ARM
    .section    .icode,"ax",%progbits
#else
    .section    .icode,"ax",@progbits
#endif

    .align      2
#if CONFIG_CPU == SH7034
    .global     _memset
    .type       _memset,@function

/* Fills a memory region with specified byte value
 * This version is optimized for speed
 *
 * arguments:
 *  r4 - start address
 *  r5 - data
 *  r6 - length
 *
 * return value:
 *  r0 - start address (like ANSI version)
 *
 * register usage:
 *  r0 - temporary
 *  r1 - start address + 11 for main loop
 *  r4 - start address
 *  r5 - data (spread to all 4 bytes when using long stores)
 *  r6 - current address (runs down from end to start)
 *
 * The instruction order below is devised in a way to utilize the pipelining
 * of the SH1 to the max. The routine fills memory from end to start in
 * order to utilize the auto-decrementing store instructions.
 */
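
/* Illustrative C equivalent of the strategy above (a hedged sketch only;
 * memset_sketch is a hypothetical name, not part of this file). The fill
 * runs downwards: bytes until the end is long aligned, aligned longword
 * stores down to the first long bound, then the leftover bytes at the
 * start:
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   void *memset_sketch(void *dst, int c, size_t len)
 *   {
 *       unsigned char *start = dst, *p = start + len;
 *       if (len >= 4 + ((-(uintptr_t)dst) & 3)) {   // >= 1 aligned long?
 *           uint32_t pattern = (unsigned char)c * 0x01010101u;
 *           while ((uintptr_t)p & 3)                // byte loop at the end
 *               *--p = (unsigned char)c;
 *           while ((size_t)(p - start) >= 4)        // aligned long stores
 *               memcpy(p -= 4, &pattern, 4);
 *       }
 *       while (p > start)                           // byte loop at the start
 *           *--p = (unsigned char)c;
 *       return dst;
 *   }
 */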

_memset:
    neg     r4,r0
    and     #3,r0       /* r0 = (4 - align_offset) % 4 */
    add     #4,r0
    cmp/hs  r0,r6       /* at least one aligned longword to fill? */
    add     r4,r6       /* r6 = end_address */
    bf      .no_longs   /* no, jump directly to byte loop */

    extu.b  r5,r5       /* start: spread data to all 4 bytes */
    swap.b  r5,r0
    or      r0,r5       /* data now in 2 lower bytes of r5 */
    swap.w  r5,r0
    or      r0,r5       /* data now in all 4 bytes of r5 */
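    /* value trace (illustrative): r5 = 0x000000ab -> 0x0000abab
       -> 0xabababab */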

    mov     r6,r0
    tst     #3,r0       /* r0 already long aligned? */
    bt      .end_b1     /* yes: skip loop */

    /* leading byte loop: sets 0..3 bytes */
.loop_b1:
    mov.b   r5,@-r0     /* store byte */
    tst     #3,r0       /* r0 long aligned? */
    bf      .loop_b1    /* runs r0 down until long aligned */

    mov     r0,r6       /* r6 = last long bound */
    nop                 /* keep alignment */

.end_b1:
    mov     r4,r1       /* r1 = start_address... */
    add     #11,r1      /* ... + 11, combined for rounding and offset */
    xor     r1,r0
    tst     #4,r0       /* bit 2 tells whether an even or odd number of */
    bf      .loop_odd   /* longwords to set */
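    /* Explanatory note (not in the original comments): bit 2 of r4+11
     * matches bit 2 of the long-aligned start address (the low bits cannot
     * carry that far), and r0 holds the long-aligned end, so bit 2 of the
     * XOR is the parity of the longword count. If it is set (odd count),
     * T = 0 and the bf enters the unrolled loop at .loop_odd, storing one
     * longword on the first pass instead of two. */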

    /* main loop: set 2 longs per pass */
.loop_2l:
    mov.l   r5,@-r6     /* store first long */
.loop_odd:
    cmp/hi  r1,r6       /* runs r6 down to first long bound */
    mov.l   r5,@-r6     /* store second long */
    bt      .loop_2l

.no_longs:
    cmp/hi  r4,r6       /* any bytes left? */
    bf      .end_b2     /* no: skip loop */

    /* trailing byte loop */
.loop_b2:
    mov.b   r5,@-r6     /* store byte */
    cmp/hi  r4,r6       /* runs r6 down to the start address */
    bt      .loop_b2

.end_b2:
    rts
    mov     r4,r0       /* return start address */

.end:
    .size   _memset,.end-_memset

#elif defined(CPU_COLDFIRE)
    .global     memset
    .type       memset,@function

/* Fills a memory region with specified byte value
 * This version is optimized for speed
 *
 * arguments:
 *  (4,%sp)  - start address
 *  (8,%sp)  - data
 *  (12,%sp) - length
 *
 * return value:
 *  %d0 - start address (like ANSI version)
 *
 * register usage:
 *  %d0 - data (spread to all 4 bytes when using long stores)
 *  %d1 - temporary / data (for burst transfer)
 *  %d2 - data (for burst transfer)
 *  %d3 - data (for burst transfer)
 *  %a0 - start address
 *  %a1 - current address (runs down from end to start)
 *
 * For maximum speed this routine uses both long stores and burst mode,
 * storing whole lines with movem.l. The routine fills memory from end
 * to start in order to ease returning the start address.
 */
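
/* The line stage in illustrative C terms (a hedged sketch; fill_lines is a
 * hypothetical name, and <stdint.h>/<string.h> are assumed). Four registers
 * are preloaded with the pattern so a single movem.l writes one full
 * 16-byte line per pass; byte and longword stages mop up on either side:
 *
 *   static unsigned char *fill_lines(unsigned char *p, unsigned char *stop,
 *                                    uint32_t pattern)
 *   {
 *       uint32_t line[4] = { pattern, pattern, pattern, pattern };
 *       while ((size_t)(p - stop) >= 16)    // whole lines left?
 *           memcpy(p -= 16, line, 16);      // movem.l %d0-%d3,(%a1)
 *       return p;                           // remainder handled elsewhere
 *   }
 */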
memset:
    move.l  (4,%sp),%a0     /* start address */
    move.l  (8,%sp),%d0     /* data */
    move.l  (12,%sp),%a1    /* length */
    add.l   %a0,%a1         /* %a1 = end address */

    move.l  %a0,%d1
    addq.l  #7,%d1
    and.l   #0xFFFFFFFC,%d1 /* %d1 = first long bound + 4 */
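    /* Explanatory note: (start + 7) & ~3 is the first long bound at or
       above start, plus 4; if the end address reaches it, at least one
       fully aligned longword fits inside the region. */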
    cmp.l   %d1,%a1         /* at least one aligned longword to fill? */
    blo.b   .no_longs       /* no, jump directly to byte loop */

    and.l   #0xFF,%d0       /* start: spread data to all 4 bytes */
    move.l  %d0,%d1
    lsl.l   #8,%d1
    or.l    %d1,%d0         /* data now in 2 lower bytes of %d0 */
    move.l  %d0,%d1
    swap    %d0
    or.l    %d1,%d0         /* data now in all 4 bytes of %d0 */

    move.l  %a1,%d1
    and.l   #0xFFFFFFFC,%d1 /* %d1 = last long bound */
    cmp.l   %d1,%a1         /* any bytes to set? */
    bls.b   .end_b1         /* no: skip byte loop */

    /* leading byte loop: sets 0..3 bytes */
.loop_b1:
    move.b  %d0,-(%a1)      /* store byte */
    cmp.l   %d1,%a1         /* runs %a1 down to last long bound */
    bhi.b   .loop_b1

.end_b1:
    moveq.l #31,%d1
    add.l   %a0,%d1
    and.l   #0xFFFFFFF0,%d1 /* %d1 = first line bound + 16 */
    cmp.l   %d1,%a1         /* at least one full line to fill? */
    blo.b   .no_lines       /* no, jump to longword loop */

    move.l  %a1,%d1
    and.l   #0xFFFFFFF0,%d1 /* %d1 = last line bound */
    cmp.l   %d1,%a1         /* any longwords to set? */
    bls.b   .end_l1         /* no: skip longword loop */

    /* leading longword loop: sets 0..3 longwords */
.loop_l1:
    move.l  %d0,-(%a1)      /* store longword */
    cmp.l   %d1,%a1         /* runs %a1 down to last line bound */
    bhi.b   .loop_l1

.end_l1:
    move.l  %d2,-(%sp)      /* free some registers */
    move.l  %d3,-(%sp)

    move.l  %d0,%d1         /* spread data to 4 data registers */
    move.l  %d0,%d2
    move.l  %d0,%d3
    lea.l   (15,%a0),%a0    /* start address += 15, acct. for trl. data */
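    /* Explanatory note: with %a0 biased by +15, the bhi.b below falls
       through as soon as fewer than 16 bytes remain above the true start,
       i.e. when no whole line fits any more; the bias is removed again
       right after the loop. */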

    /* main loop: set whole lines utilising burst mode */
.loop_line:
    lea.l   (-16,%a1),%a1   /* pre-decrement */
    movem.l %d0-%d3,(%a1)   /* store line */
    cmp.l   %a0,%a1         /* runs %a1 down to first line bound */
    bhi.b   .loop_line

    lea.l   (-15,%a0),%a0   /* correct start address */
    move.l  (%sp)+,%d3      /* restore registers */
    move.l  (%sp)+,%d2

    move.l  %a0,%d1         /* %d1 = start address ... */
    addq.l  #3,%d1          /* ... +3, account for possible trailing bytes */
    cmp.l   %d1,%a1         /* any longwords left? */
    bhi.b   .loop_l2        /* yes: jump to longword loop */
    bra.b   .no_longs       /* no: skip loop */

.no_lines:
    move.l  %a0,%d1         /* %d1 = start address ... */
    addq.l  #3,%d1          /* ... +3, account for possible trailing bytes */

    /* trailing longword loop */
.loop_l2:
    move.l  %d0,-(%a1)      /* store longword */
    cmp.l   %d1,%a1         /* runs %a1 down to first long bound */
    bhi.b   .loop_l2

.no_longs:
    cmp.l   %a0,%a1         /* any bytes left? */
    bls.b   .end_b2         /* no: skip loop */

    /* trailing byte loop */
.loop_b2:
    move.b  %d0,-(%a1)      /* store byte */
    cmp.l   %a0,%a1         /* runs %a1 down to start address */
    bhi.b   .loop_b2

.end_b2:
    move.l  %a0,%d0         /* return start address */
    rts

.end:
    .size   memset,.end-memset

#elif defined(CPU_ARM)

/* The following code is taken from the Linux kernel version 2.6.15.3
 * linux/arch/arm/lib/memset.S
 *
 * Copyright (C) 1995-2000 Russell King
 */
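
/* Overview (explanatory note, not part of the kernel source): the entry
 * point first word-aligns r0 via the fixup at 1: below, spreads the fill
 * byte across r1 (and, for the big loop, copies it into r3, ip and lr),
 * then stores 64 bytes per pass with four stmia instructions. The
 * remainder is handled by testing individual bits of the count: bit 5
 * stores 32 bytes, bit 4 stores 16, and so on down to single bytes, so
 * no loop counter update is needed. */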

@ .word 0
1:  subs    r2, r2, #4      @ 1 do we have enough
    blt     5f              @ 1 bytes to align with?
    cmp     r3, #2          @ 1
    strltb  r1, [r0], #1    @ 1
    strleb  r1, [r0], #1    @ 1
    strb    r1, [r0], #1    @ 1
    add     r2, r2, r3      @ 1 (r2 = r2 - (4 - r3))
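/*
 * Explanatory note: r3 holds the misalignment (1..3). The three
 * conditional strb instructions above store exactly 4 - r3 bytes:
 * r3 == 1 executes all three (lt, le and the unconditional one),
 * r3 == 2 the last two, r3 == 3 only the last. Together with the
 * subs/add pair, the count is reduced by exactly the bytes stored.
 */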
/*
 * The pointer is now aligned and the length is adjusted. Try doing the
 * memzero again.
 */

    .global memset
    .type   memset,%function
memset:
    ands    r3, r0, #3      @ 1 unaligned?
    bne     1b              @ 1
/*
 * we know that the pointer in r0 is aligned to a word boundary.
 */
    orr     r1, r1, r1, lsl #8
    orr     r1, r1, r1, lsl #16
    mov     r3, r1
    cmp     r2, #16
    blt     4f
/*
 * We need an extra register for this loop - save the return address and
 * use the LR
 */
    str     lr, [sp, #-4]!
    mov     ip, r1
    mov     lr, r1

2:  subs    r2, r2, #64
    stmgeia r0!, {r1, r3, ip, lr}   @ 64 bytes at a time.
    stmgeia r0!, {r1, r3, ip, lr}
    stmgeia r0!, {r1, r3, ip, lr}
    stmgeia r0!, {r1, r3, ip, lr}
    bgt     2b
    ldmeqfd sp!, {pc}               @ Now <64 bytes to go.
/*
 * No need to correct the count; we're only testing bits from now on
 */
    tst     r2, #32
    stmneia r0!, {r1, r3, ip, lr}
    stmneia r0!, {r1, r3, ip, lr}
    tst     r2, #16
    stmneia r0!, {r1, r3, ip, lr}
    ldr     lr, [sp], #4

4:  tst     r2, #8
    stmneia r0!, {r1, r3}
    tst     r2, #4
    strne   r1, [r0], #4
/*
 * When we get here, we've got less than 4 bytes to zero. We
 * may have an unaligned pointer as well.
 */
5:  tst     r2, #2
    strneb  r1, [r0], #1
    strneb  r1, [r0], #1
    tst     r2, #1
    strneb  r1, [r0], #1
    mov     pc, lr

.end:
    .size   memset,.end-memset
#endif