From 5a0cb68e43eacc98d275baef08296f154f3ae6dd Mon Sep 17 00:00:00 2001 From: Linus Nielsen Feltzing Date: Tue, 25 Jul 2006 11:16:03 +0000 Subject: [PATCH] Moved some assembly optimizations to the target tree git-svn-id: svn://svn.rockbox.org/rockbox/trunk@10318 a1c6a512-1295-4272-9138-f99709370657 --- firmware/SOURCES | 22 +- firmware/target/arm/memset-arm.S | 96 ++++++++ .../coldfire/memcpy-coldfire.S} | 198 ---------------- .../coldfire/memmove-coldfire.S} | 201 ---------------- .../coldfire/memset-coldfire.S} | 167 ------------- .../coldfire/memset16-coldfire.S} | 2 - firmware/target/sh/memcpy-sh.S | 217 +++++++++++++++++ firmware/target/sh/memmove-sh.S | 220 ++++++++++++++++++ firmware/target/sh/memset-sh.S | 107 +++++++++ .../strlen_a.S => target/sh/strlen-sh.S} | 0 10 files changed, 652 insertions(+), 578 deletions(-) create mode 100755 firmware/target/arm/memset-arm.S rename firmware/{common/memcpy_a.S => target/coldfire/memcpy-coldfire.S} (77%) mode change 100644 => 100755 rename firmware/{common/memmove_a.S => target/coldfire/memmove-coldfire.S} (78%) rename firmware/{common/memset_a.S => target/coldfire/memset-coldfire.S} (51%) mode change 100644 => 100755 rename firmware/{common/memset16_a.S => target/coldfire/memset16-coldfire.S} (99%) create mode 100755 firmware/target/sh/memcpy-sh.S create mode 100755 firmware/target/sh/memmove-sh.S create mode 100755 firmware/target/sh/memset-sh.S rename firmware/{common/strlen_a.S => target/sh/strlen-sh.S} (100%) diff --git a/firmware/SOURCES b/firmware/SOURCES index 7d93edaca1..2979e33ccc 100644 --- a/firmware/SOURCES +++ b/firmware/SOURCES @@ -30,7 +30,7 @@ common/strchr.c common/strcmp.c common/strcpy.c #if (CONFIG_CPU == SH7034) && !defined(SIMULATOR) -common/strlen_a.S +target/sh/strlen-sh.S #else common/strlen.c #endif @@ -39,22 +39,24 @@ common/strncpy.c common/strrchr.c common/strtok.c common/timefuncs.c -#if (CONFIG_CPU == SH7034) || defined(CPU_COLDFIRE) -common/memcpy_a.S -common/memmove_a.S -common/memset_a.S + +#ifdef CPU_COLDFIRE +target/coldfire/memcpy-coldfire.S +target/coldfire/memmove-coldfire.S +target/coldfire/memset-coldfire.S +target/coldfire/memset16-coldfire.S +#elif (CONFIG_CPU == SH7034) +target/sh/memcpy-sh.S +target/sh/memmove-sh.S +target/sh/memset-sh.S #elif defined(CPU_ARM) common/memcpy.c common/memmove.c -common/memset_a.S +target/arm/memset-arm.S #else common/memcpy.c common/memmove.c common/memset.c -#endif -#ifdef CPU_COLDFIRE -common/memset16_a.S -#else common/memset16.c #endif #ifdef HAVE_LCD_CHARCELLS diff --git a/firmware/target/arm/memset-arm.S b/firmware/target/arm/memset-arm.S new file mode 100755 index 0000000000..b3faafcb37 --- /dev/null +++ b/firmware/target/arm/memset-arm.S @@ -0,0 +1,96 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2004 by Jens Arnold + * + * All files in this archive are subject to the GNU General Public License. + * See the file COPYING in the source tree root for full license agreement. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. 
+ * + ****************************************************************************/ +#include "config.h" + + .section .icode,"ax",%progbits + + .align 2 + +/* The following code is based on code found in Linux kernel version 2.6.15.3 + * linux/arch/arm/lib/memset.S + * + * Copyright (C) 1995-2000 Russell King + */ + +/* This code will align a pointer for memset, if needed */ +1: cmp r2, #4 @ 1 do we have enough + blt 5f @ 1 bytes to align with? + cmp r3, #2 @ 1 + strgtb r1, [r0, #-1]! @ 1 + strgeb r1, [r0, #-1]! @ 1 + strb r1, [r0, #-1]! @ 1 + sub r2, r2, r3 @ 1 r2 = r2 - r3 + b 2f + + .global memset + .type memset,%function +memset: + add r0, r0, r2 @ we'll write backwards in memory + ands r3, r0, #3 @ 1 unaligned? + bne 1b @ 1 +2: +/* + * we know that the pointer in r0 is aligned to a word boundary. + */ + orr r1, r1, r1, lsl #8 + orr r1, r1, r1, lsl #16 + mov r3, r1 + cmp r2, #16 + blt 5f +/* + * We need an extra register for this loop - save the return address and + * use the LR + */ + str lr, [sp, #-4]! + mov ip, r1 + mov lr, r1 + +3: subs r2, r2, #64 + stmgedb r0!, {r1, r3, ip, lr} @ 64 bytes at a time. + stmgedb r0!, {r1, r3, ip, lr} + stmgedb r0!, {r1, r3, ip, lr} + stmgedb r0!, {r1, r3, ip, lr} + bgt 3b + ldmeqfd sp!, {pc} @ Now <64 bytes to go. +/* + * No need to correct the count; we're only testing bits from now on + */ + tst r2, #32 + stmnedb r0!, {r1, r3, ip, lr} + stmnedb r0!, {r1, r3, ip, lr} + tst r2, #16 + stmnedb r0!, {r1, r3, ip, lr} + ldr lr, [sp], #4 + +5: tst r2, #8 + stmnedb r0!, {r1, r3} + tst r2, #4 + strne r1, [r0, #-4]! +/* + * When we get here, we've got less than 4 bytes to zero. We + * may have an unaligned pointer as well. + */ +6: tst r2, #2 + strneb r1, [r0, #-1]! + strneb r1, [r0, #-1]! + tst r2, #1 + strneb r1, [r0, #-1]! + mov pc, lr +end: + .size memset,.end-memset diff --git a/firmware/common/memcpy_a.S b/firmware/target/coldfire/memcpy-coldfire.S old mode 100644 new mode 100755 similarity index 77% rename from firmware/common/memcpy_a.S rename to firmware/target/coldfire/memcpy-coldfire.S index 9f6c813be3..523e1f5ed9 --- a/firmware/common/memcpy_a.S +++ b/firmware/target/coldfire/memcpy-coldfire.S @@ -20,203 +20,6 @@ .section .icode,"ax",@progbits -#if CONFIG_CPU == SH7034 - .align 2 - .global _memcpy - .global ___memcpy_fwd_entry - .type _memcpy,@function - -/* Copies bytes of data in memory from to - * This version is optimized for speed - * - * arguments: - * r4 - destination address - * r5 - source address - * r6 - length - * - * return value: - * r0 - destination address (like ANSI version) - * - * register usage: - * r0 - data / scratch - * r1 - 2nd data / scratch - * r2 - scratch - * r3 - first long bound / adjusted end address (only if >= 11 bytes) - * r4 - current dest address - * r5 - current source address - * r6 - source end address - * r7 - stored dest start address - * - * The instruction order is devised in a way to utilize the pipelining - * of the SH1 to the max. The routine also tries to utilize fast page mode. - */ - -_memcpy: - mov r4,r7 /* store dest for returning */ -___memcpy_fwd_entry: - add #-8,r4 /* offset for early increment (max. 2 longs) */ - mov #11,r0 - cmp/hs r0,r6 /* at least 11 bytes to copy? (ensures 2 aligned longs) */ - add r5,r6 /* r6 = source_end */ - bf .start_b2 /* no: jump directly to byte loop */ - - mov #3,r0 - neg r5,r3 - and r0,r3 /* r3 = (4 - align_offset) % 4 */ - tst r3,r3 /* already aligned? 
*/ - bt .end_b1 /* yes: skip leading byte loop */ - - add r5,r3 /* r3 = first source long bound */ - - /* leading byte loop: copies 0..3 bytes */ -.loop_b1: - mov.b @r5+,r0 /* load byte & increment source addr */ - add #1,r4 /* increment dest addr */ - mov.b r0,@(7,r4) /* store byte */ - cmp/hi r5,r3 /* runs r5 up to first long bound */ - bt .loop_b1 - /* now r5 is always at a long boundary */ - /* -> memory reading is done in longs for all dest alignments */ - - /* selector for main copy loop */ -.end_b1: - mov #3,r1 - and r4,r1 /* r1 = dest alignment offset */ - mova .jmptab,r0 - mov.b @(r0,r1),r1 /* select appropriate main loop */ - add r0,r1 - mov r6,r3 /* move end address to r3 */ - jmp @r1 /* and jump to it */ - add #-7,r3 /* adjust end addr for main loops doing 2 longs/pass */ - - /** main loops, copying 2 longs per pass to profit from fast page mode **/ - - /* long aligned destination (fastest) */ - .align 2 -.loop_do0: - mov.l @r5+,r1 /* load first long & increment source addr */ - add #16,r4 /* increment dest addr & account for decrementing stores */ - mov.l @r5+,r0 /* load second long & increment source addr */ - cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ - mov.l r0,@-r4 /* store second long */ - mov.l r1,@-r4 /* store first long; NOT ALIGNED - no speed loss here! */ - bt .loop_do0 - - add #4,r3 /* readjust end address */ - cmp/hi r5,r3 /* one long left? */ - bf .start_b2 /* no, jump to trailing byte loop */ - - mov.l @r5+,r0 /* load last long & increment source addr */ - add #4,r4 /* increment dest addr */ - bra .start_b2 /* jump to trailing byte loop */ - mov.l r0,@(4,r4) /* store last long */ - - /* word aligned destination (long + 2) */ - .align 2 -.loop_do2: - mov.l @r5+,r1 /* load first long & increment source addr */ - add #16,r4 /* increment dest addr */ - mov.l @r5+,r0 /* load second long & increment source addr */ - cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ - mov.w r0,@-r4 /* store low word of second long */ - xtrct r1,r0 /* extract low word of first long & high word of second long */ - mov.l r0,@-r4 /* and store as long */ - swap.w r1,r0 /* get high word of first long */ - mov.w r0,@-r4 /* and store it */ - bt .loop_do2 - - add #4,r3 /* readjust end address */ - cmp/hi r5,r3 /* one long left? */ - bf .start_b2 /* no, jump to trailing byte loop */ - - mov.l @r5+,r0 /* load last long & increment source addr */ - add #4,r4 /* increment dest addr */ - mov.w r0,@(6,r4) /* store low word */ - shlr16 r0 /* get high word */ - bra .start_b2 /* jump to trailing byte loop */ - mov.w r0,@(4,r4) /* and store it */ - - /* jumptable for loop selector */ - .align 2 -.jmptab: - .byte .loop_do0 - .jmptab /* placed in the middle because the SH1 */ - .byte .loop_do1 - .jmptab /* loads bytes sign-extended. Otherwise */ - .byte .loop_do2 - .jmptab /* the last loop would be out of reach */ - .byte .loop_do3 - .jmptab /* of the offset range. */ - - /* byte aligned destination (long + 1) */ - .align 2 -.loop_do1: - mov.l @r5+,r1 /* load first long & increment source addr */ - add #16,r4 /* increment dest addr */ - mov.l @r5+,r0 /* load second long & increment source addr */ - cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ - mov.b r0,@-r4 /* store low byte of second long */ - shlr8 r0 /* get upper 3 bytes */ - mov r1,r2 /* copy first long */ - shll16 r2 /* move low byte of first long all the way up, .. */ - shll8 r2 - or r2,r0 /* ..combine with the 3 bytes of second long.. 
*/ - mov.l r0,@-r4 /* ..and store as long */ - shlr8 r1 /* get middle 2 bytes */ - mov.w r1,@-r4 /* store as word */ - shlr16 r1 /* get upper byte */ - mov.b r1,@-r4 /* and store */ - bt .loop_do1 - - add #4,r3 /* readjust end address */ -.last_do13: - cmp/hi r5,r3 /* one long left? */ - bf .start_b2 /* no, jump to trailing byte loop */ - - mov.l @r5+,r0 /* load last long & increment source addr */ - add #12,r4 /* increment dest addr */ - mov.b r0,@-r4 /* store low byte */ - shlr8 r0 /* get middle 2 bytes */ - mov.w r0,@-r4 /* store as word */ - shlr16 r0 /* get upper byte */ - mov.b r0,@-r4 /* and store */ - bra .start_b2 /* jump to trailing byte loop */ - add #-4,r4 /* readjust destination */ - - /* byte aligned destination (long + 3) */ - .align 2 -.loop_do3: - mov.l @r5+,r1 /* load first long & increment source addr */ - add #16,r4 /* increment dest addr */ - mov.l @r5+,r0 /* load second long & increment source addr */ - mov r1,r2 /* copy first long */ - mov.b r0,@-r4 /* store low byte of second long */ - shlr8 r0 /* get middle 2 bytes */ - mov.w r0,@-r4 /* store as word */ - shlr16 r0 /* get upper byte */ - shll8 r2 /* move lower 3 bytes of first long one up.. */ - or r2,r0 /* ..combine with the 1 byte of second long.. */ - mov.l r0,@-r4 /* ..and store as long */ - shlr16 r1 /* get upper byte of first long.. */ - shlr8 r1 - cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ - mov.b r1,@-r4 /* ..and store */ - bt .loop_do3 - - bra .last_do13 /* handle last longword: reuse routine for (long + 1) */ - add #4,r3 /* readjust end address */ - - /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */ - .align 2 -.loop_b2: - mov.b @r5+,r0 /* load byte & increment source addr */ - add #1,r4 /* increment dest addr */ - mov.b r0,@(7,r4) /* store byte */ -.start_b2: - cmp/hi r5,r6 /* runs r5 up to end address */ - bt .loop_b2 - - rts - mov r7,r0 /* return dest start address */ -.end: - .size _memcpy,.end-_memcpy -#elif defined(CPU_COLDFIRE) #define FULLSPEED /* use burst writing for word aligned destinations */ .align 2 .global memcpy @@ -875,4 +678,3 @@ __memcpy_fwd_entry: .end: .size memcpy,.end-memcpy -#endif diff --git a/firmware/common/memmove_a.S b/firmware/target/coldfire/memmove-coldfire.S similarity index 78% rename from firmware/common/memmove_a.S rename to firmware/target/coldfire/memmove-coldfire.S index d7421333df..bdd2e2e206 100755 --- a/firmware/common/memmove_a.S +++ b/firmware/target/coldfire/memmove-coldfire.S @@ -20,206 +20,6 @@ .section .icode,"ax",@progbits -#if CONFIG_CPU == SH7034 - .align 2 - .global _memmove - .type _memmove,@function - -/* Moves bytes of data in memory from to - * Regions may overlap. - * This version is optimized for speed, and needs the corresponding memcpy - * implementation for the forward copy branch. - * - * arguments: - * r4 - destination address - * r5 - source address - * r6 - length - * - * return value: - * r0 - destination address (like ANSI version) - * - * register usage: - * r0 - data / scratch - * r1 - 2nd data / scratch - * r2 - scratch - * r3 - last long bound / adjusted start address (only if >= 11 bytes) - * r4 - current dest address - * r5 - source start address - * r6 - current source address - * - * The instruction order is devised in a way to utilize the pipelining - * of the SH1 to the max. The routine also tries to utilize fast page mode. 
- */ - -_memmove: - cmp/hi r4,r5 /* source > destination */ - bf .backward /* no: backward copy */ - mov.l .memcpy_fwd,r0 - jmp @r0 - mov r4,r7 /* store dest for returning */ - - .align 2 -.memcpy_fwd: - .long ___memcpy_fwd_entry - -.backward: - add r6,r4 /* r4 = destination end */ - mov #11,r0 - cmp/hs r0,r6 /* at least 11 bytes to copy? (ensures 2 aligned longs) */ - add #-8,r5 /* adjust for late decrement (max. 2 longs) */ - add r5,r6 /* r6 = source end - 8 */ - bf .start_b2r /* no: jump directly to byte loop */ - - mov #-4,r3 /* r3 = 0xfffffffc */ - and r6,r3 /* r3 = last source long bound */ - cmp/hi r3,r6 /* already aligned? */ - bf .end_b1r /* yes: skip leading byte loop */ - -.loop_b1r: - mov.b @(7,r6),r0 /* load byte */ - add #-1,r6 /* decrement source addr */ - mov.b r0,@-r4 /* store byte */ - cmp/hi r3,r6 /* runs r6 down to last long bound */ - bt .loop_b1r - -.end_b1r: - mov #3,r1 - and r4,r1 /* r1 = dest alignment offset */ - mova .jmptab_r,r0 - mov.b @(r0,r1),r1 /* select appropriate main loop.. */ - add r0,r1 - mov r5,r3 /* copy start adress to r3 */ - jmp @r1 /* ..and jump to it */ - add #7,r3 /* adjust end addr for main loops doing 2 longs/pass */ - - /** main loops, copying 2 longs per pass to profit from fast page mode **/ - - /* long aligned destination (fastest) */ - .align 2 -.loop_do0r: - mov.l @r6,r1 /* load first long */ - add #-8,r6 /* decrement source addr */ - mov.l @(12,r6),r0 /* load second long */ - cmp/hi r3,r6 /* runs r6 down to first or second long bound */ - mov.l r0,@-r4 /* store second long */ - mov.l r1,@-r4 /* store first long; NOT ALIGNED - no speed loss here! */ - bt .loop_do0r - - add #-4,r3 /* readjust end address */ - cmp/hi r3,r6 /* first long left? */ - bf .start_b2r /* no, jump to trailing byte loop */ - - mov.l @(4,r6),r0 /* load first long */ - add #-4,r6 /* decrement source addr */ - bra .start_b2r /* jump to trailing byte loop */ - mov.l r0,@-r4 /* store first long */ - - /* word aligned destination (long + 2) */ - .align 2 -.loop_do2r: - mov.l @r6,r1 /* load first long */ - add #-8,r6 /* decrement source addr */ - mov.l @(12,r6),r0 /* load second long */ - cmp/hi r3,r6 /* runs r6 down to first or second long bound */ - mov.w r0,@-r4 /* store low word of second long */ - xtrct r1,r0 /* extract low word of first long & high word of second long */ - mov.l r0,@-r4 /* and store as long */ - shlr16 r1 /* get high word of first long */ - mov.w r1,@-r4 /* and store it */ - bt .loop_do2r - - add #-4,r3 /* readjust end address */ - cmp/hi r3,r6 /* first long left? */ - bf .start_b2r /* no, jump to trailing byte loop */ - - mov.l @(4,r6),r0 /* load first long & decrement source addr */ - add #-4,r6 /* decrement source addr */ - mov.w r0,@-r4 /* store low word */ - shlr16 r0 /* get high word */ - bra .start_b2r /* jump to trailing byte loop */ - mov.w r0,@-r4 /* and store it */ - - /* jumptable for loop selector */ - .align 2 -.jmptab_r: - .byte .loop_do0r - .jmptab_r /* placed in the middle because the SH1 */ - .byte .loop_do1r - .jmptab_r /* loads bytes sign-extended. Otherwise */ - .byte .loop_do2r - .jmptab_r /* the last loop would be out of reach */ - .byte .loop_do3r - .jmptab_r /* of the offset range. 
*/ - - /* byte aligned destination (long + 1) */ - .align 2 -.loop_do1r: - mov.l @r6,r1 /* load first long */ - add #-8,r6 /* decrement source addr */ - mov.l @(12,r6),r0 /* load second long */ - cmp/hi r3,r6 /* runs r6 down to first or second long bound */ - mov.b r0,@-r4 /* store low byte of second long */ - shlr8 r0 /* get upper 3 bytes */ - mov r1,r2 /* copy first long */ - shll16 r2 /* move low byte of first long all the way up, .. */ - shll8 r2 - or r2,r0 /* ..combine with the 3 bytes of second long.. */ - mov.l r0,@-r4 /* ..and store as long */ - shlr8 r1 /* get middle 2 bytes */ - mov.w r1,@-r4 /* store as word */ - shlr16 r1 /* get upper byte */ - mov.b r1,@-r4 /* and store */ - bt .loop_do1r - - add #-4,r3 /* readjust end address */ -.last_do13r: - cmp/hi r3,r6 /* first long left? */ - bf .start_b2r /* no, jump to trailing byte loop */ - - nop /* alignment */ - mov.l @(4,r6),r0 /* load first long */ - add #-4,r6 /* decrement source addr */ - mov.b r0,@-r4 /* store low byte */ - shlr8 r0 /* get middle 2 bytes */ - mov.w r0,@-r4 /* store as word */ - shlr16 r0 /* get upper byte */ - bra .start_b2r /* jump to trailing byte loop */ - mov.b r0,@-r4 /* and store */ - - /* byte aligned destination (long + 3) */ - .align 2 -.loop_do3r: - mov.l @r6,r1 /* load first long */ - add #-8,r6 /* decrement source addr */ - mov.l @(12,r6),r0 /* load second long */ - mov r1,r2 /* copy first long */ - mov.b r0,@-r4 /* store low byte of second long */ - shlr8 r0 /* get middle 2 bytes */ - mov.w r0,@-r4 /* store as word */ - shlr16 r0 /* get upper byte */ - shll8 r2 /* move lower 3 bytes of first long one up.. */ - or r2,r0 /* ..combine with the 1 byte of second long.. */ - mov.l r0,@-r4 /* ..and store as long */ - shlr16 r1 /* get upper byte of first long */ - shlr8 r1 - cmp/hi r3,r6 /* runs r6 down to first or second long bound */ - mov.b r1,@-r4 /* ..and store */ - bt .loop_do3r - - bra .last_do13r /* handle first longword: reuse routine for (long + 1) */ - add #-4,r3 /* readjust end address */ - - /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */ - .align 2 -.loop_b2r: - mov.b @(7,r6),r0 /* load byte */ - add #-1,r6 /* decrement source addr */ - mov.b r0,@-r4 /* store byte */ -.start_b2r: - cmp/hi r5,r6 /* runs r6 down to start address */ - bt .loop_b2r - - rts - mov r4,r0 /* return dest start address */ -.end: - .size _memmove,.end-_memmove -#elif defined(CPU_COLDFIRE) #define FULLSPEED /* use burst writing for word aligned destinations */ .align 2 .global memmove @@ -866,4 +666,3 @@ memmove: .end: .size memmove,.end-memmove -#endif diff --git a/firmware/common/memset_a.S b/firmware/target/coldfire/memset-coldfire.S old mode 100644 new mode 100755 similarity index 51% rename from firmware/common/memset_a.S rename to firmware/target/coldfire/memset-coldfire.S index 6dbdab9595..7c9fe88463 --- a/firmware/common/memset_a.S +++ b/firmware/target/coldfire/memset-coldfire.S @@ -18,99 +18,9 @@ ****************************************************************************/ #include "config.h" -#ifdef CPU_ARM - .section .icode,"ax",%progbits -#else .section .icode,"ax",@progbits -#endif .align 2 -#if CONFIG_CPU == SH7034 - .global _memset - .type _memset,@function - -/* Fills a memory region with specified byte value - * This version is optimized for speed - * - * arguments: - * r4 - start address - * r5 - data - * r6 - length - * - * return value: - * r0 - start address (like ANSI version) - * - * register usage: - * r0 - temporary - * r1 - start address +11 for main loop - * r4 - start 
address - * r5 - data (spread to all 4 bytes when using long stores) - * r6 - current address (runs down from end to start) - * - * The instruction order below is devised in a way to utilize the pipelining - * of the SH1 to the max. The routine fills memory from end to start in - * order to utilize the auto-decrementing store instructions. - */ - -_memset: - neg r4,r0 - and #3,r0 /* r0 = (4 - align_offset) % 4 */ - add #4,r0 - cmp/hs r0,r6 /* at least one aligned longword to fill? */ - add r4,r6 /* r6 = end_address */ - bf .no_longs /* no, jump directly to byte loop */ - - extu.b r5,r5 /* start: spread data to all 4 bytes */ - swap.b r5,r0 - or r0,r5 /* data now in 2 lower bytes of r5 */ - swap.w r5,r0 - or r0,r5 /* data now in all 4 bytes of r5 */ - - mov r6,r0 - tst #3,r0 /* r0 already long aligned? */ - bt .end_b1 /* yes: skip loop */ - - /* leading byte loop: sets 0..3 bytes */ -.loop_b1: - mov.b r5,@-r0 /* store byte */ - tst #3,r0 /* r0 long aligned? */ - bf .loop_b1 /* runs r0 down until long aligned */ - - mov r0,r6 /* r6 = last long bound */ - nop /* keep alignment */ - -.end_b1: - mov r4,r1 /* r1 = start_address... */ - add #11,r1 /* ... + 11, combined for rounding and offset */ - xor r1,r0 - tst #4,r0 /* bit 2 tells whether an even or odd number of */ - bf .loop_odd /* longwords to set */ - - /* main loop: set 2 longs per pass */ -.loop_2l: - mov.l r5,@-r6 /* store first long */ -.loop_odd: - cmp/hi r1,r6 /* runs r6 down to first long bound */ - mov.l r5,@-r6 /* store second long */ - bt .loop_2l - -.no_longs: - cmp/hi r4,r6 /* any bytes left? */ - bf .end_b2 /* no: skip loop */ - - /* trailing byte loop */ -.loop_b2: - mov.b r5,@-r6 /* store byte */ - cmp/hi r4,r6 /* runs r6 down to the start address */ - bt .loop_b2 - -.end_b2: - rts - mov r4,r0 /* return start address */ - -.end: - .size _memset,.end-_memset -#elif defined(CPU_COLDFIRE) .global memset .type memset,@function @@ -238,80 +148,3 @@ memset: .end: .size memset,.end-memset - -#elif defined(CPU_ARM) - -/* The following code is based on code found in Linux kernel version 2.6.15.3 - * linux/arch/arm/lib/memset.S - * - * Copyright (C) 1995-2000 Russell King - */ - -/* This code will align a pointer for memset, if needed */ -1: cmp r2, #4 @ 1 do we have enough - blt 5f @ 1 bytes to align with? - cmp r3, #2 @ 1 - strgtb r1, [r0, #-1]! @ 1 - strgeb r1, [r0, #-1]! @ 1 - strb r1, [r0, #-1]! @ 1 - sub r2, r2, r3 @ 1 r2 = r2 - r3 - b 2f - - .global memset - .type memset,%function -memset: - add r0, r0, r2 @ we'll write backwards in memory - ands r3, r0, #3 @ 1 unaligned? - bne 1b @ 1 -2: -/* - * we know that the pointer in r0 is aligned to a word boundary. - */ - orr r1, r1, r1, lsl #8 - orr r1, r1, r1, lsl #16 - mov r3, r1 - cmp r2, #16 - blt 5f -/* - * We need an extra register for this loop - save the return address and - * use the LR - */ - str lr, [sp, #-4]! - mov ip, r1 - mov lr, r1 - -3: subs r2, r2, #64 - stmgedb r0!, {r1, r3, ip, lr} @ 64 bytes at a time. - stmgedb r0!, {r1, r3, ip, lr} - stmgedb r0!, {r1, r3, ip, lr} - stmgedb r0!, {r1, r3, ip, lr} - bgt 3b - ldmeqfd sp!, {pc} @ Now <64 bytes to go. -/* - * No need to correct the count; we're only testing bits from now on - */ - tst r2, #32 - stmnedb r0!, {r1, r3, ip, lr} - stmnedb r0!, {r1, r3, ip, lr} - tst r2, #16 - stmnedb r0!, {r1, r3, ip, lr} - ldr lr, [sp], #4 - -5: tst r2, #8 - stmnedb r0!, {r1, r3} - tst r2, #4 - strne r1, [r0, #-4]! -/* - * When we get here, we've got less than 4 bytes to zero. We - * may have an unaligned pointer as well. 
- */
-6: tst r2, #2
- strneb r1, [r0, #-1]!
- strneb r1, [r0, #-1]!
- tst r2, #1
- strneb r1, [r0, #-1]!
- mov pc, lr
-end:
- .size memset,.end-memset
-#endif
-
diff --git a/firmware/common/memset16_a.S b/firmware/target/coldfire/memset16-coldfire.S
similarity index 99%
rename from firmware/common/memset16_a.S
rename to firmware/target/coldfire/memset16-coldfire.S
index 9ab1bdcb5b..d9f72f683f 100755
--- a/firmware/common/memset16_a.S
+++ b/firmware/target/coldfire/memset16-coldfire.S
@@ -20,7 +20,6 @@
 
 .section .icode,"ax",@progbits
 
-#ifdef CPU_COLDFIRE
 .global memset16
 .type memset16,@function
 
@@ -143,4 +142,3 @@ memset16:
 
 .end:
 .size memset16,.end-memset16
-#endif
diff --git a/firmware/target/sh/memcpy-sh.S b/firmware/target/sh/memcpy-sh.S
new file mode 100755
index 0000000000..0b5e086be9
--- /dev/null
+++ b/firmware/target/sh/memcpy-sh.S
@@ -0,0 +1,217 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2004-2005 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+ .section .icode,"ax",@progbits
+
+ .align 2
+ .global _memcpy
+ .global ___memcpy_fwd_entry
+ .type _memcpy,@function
+
+/* Copies <length> bytes of data in memory from <source> to <dest>
+ * This version is optimized for speed
+ *
+ * arguments:
+ * r4 - destination address
+ * r5 - source address
+ * r6 - length
+ *
+ * return value:
+ * r0 - destination address (like ANSI version)
+ *
+ * register usage:
+ * r0 - data / scratch
+ * r1 - 2nd data / scratch
+ * r2 - scratch
+ * r3 - first long bound / adjusted end address (only if >= 11 bytes)
+ * r4 - current dest address
+ * r5 - current source address
+ * r6 - source end address
+ * r7 - stored dest start address
+ *
+ * The instruction order is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine also tries to utilize fast page mode.
+ */
+
+_memcpy:
+ mov r4,r7 /* store dest for returning */
+___memcpy_fwd_entry:
+ add #-8,r4 /* offset for early increment (max. 2 longs) */
+ mov #11,r0
+ cmp/hs r0,r6 /* at least 11 bytes to copy? (ensures 2 aligned longs) */
+ add r5,r6 /* r6 = source_end */
+ bf .start_b2 /* no: jump directly to byte loop */
+
+ mov #3,r0
+ neg r5,r3
+ and r0,r3 /* r3 = (4 - align_offset) % 4 */
+ tst r3,r3 /* already aligned?
*/ + bt .end_b1 /* yes: skip leading byte loop */ + + add r5,r3 /* r3 = first source long bound */ + + /* leading byte loop: copies 0..3 bytes */ +.loop_b1: + mov.b @r5+,r0 /* load byte & increment source addr */ + add #1,r4 /* increment dest addr */ + mov.b r0,@(7,r4) /* store byte */ + cmp/hi r5,r3 /* runs r5 up to first long bound */ + bt .loop_b1 + /* now r5 is always at a long boundary */ + /* -> memory reading is done in longs for all dest alignments */ + + /* selector for main copy loop */ +.end_b1: + mov #3,r1 + and r4,r1 /* r1 = dest alignment offset */ + mova .jmptab,r0 + mov.b @(r0,r1),r1 /* select appropriate main loop */ + add r0,r1 + mov r6,r3 /* move end address to r3 */ + jmp @r1 /* and jump to it */ + add #-7,r3 /* adjust end addr for main loops doing 2 longs/pass */ + + /** main loops, copying 2 longs per pass to profit from fast page mode **/ + + /* long aligned destination (fastest) */ + .align 2 +.loop_do0: + mov.l @r5+,r1 /* load first long & increment source addr */ + add #16,r4 /* increment dest addr & account for decrementing stores */ + mov.l @r5+,r0 /* load second long & increment source addr */ + cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ + mov.l r0,@-r4 /* store second long */ + mov.l r1,@-r4 /* store first long; NOT ALIGNED - no speed loss here! */ + bt .loop_do0 + + add #4,r3 /* readjust end address */ + cmp/hi r5,r3 /* one long left? */ + bf .start_b2 /* no, jump to trailing byte loop */ + + mov.l @r5+,r0 /* load last long & increment source addr */ + add #4,r4 /* increment dest addr */ + bra .start_b2 /* jump to trailing byte loop */ + mov.l r0,@(4,r4) /* store last long */ + + /* word aligned destination (long + 2) */ + .align 2 +.loop_do2: + mov.l @r5+,r1 /* load first long & increment source addr */ + add #16,r4 /* increment dest addr */ + mov.l @r5+,r0 /* load second long & increment source addr */ + cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ + mov.w r0,@-r4 /* store low word of second long */ + xtrct r1,r0 /* extract low word of first long & high word of second long */ + mov.l r0,@-r4 /* and store as long */ + swap.w r1,r0 /* get high word of first long */ + mov.w r0,@-r4 /* and store it */ + bt .loop_do2 + + add #4,r3 /* readjust end address */ + cmp/hi r5,r3 /* one long left? */ + bf .start_b2 /* no, jump to trailing byte loop */ + + mov.l @r5+,r0 /* load last long & increment source addr */ + add #4,r4 /* increment dest addr */ + mov.w r0,@(6,r4) /* store low word */ + shlr16 r0 /* get high word */ + bra .start_b2 /* jump to trailing byte loop */ + mov.w r0,@(4,r4) /* and store it */ + + /* jumptable for loop selector */ + .align 2 +.jmptab: + .byte .loop_do0 - .jmptab /* placed in the middle because the SH1 */ + .byte .loop_do1 - .jmptab /* loads bytes sign-extended. Otherwise */ + .byte .loop_do2 - .jmptab /* the last loop would be out of reach */ + .byte .loop_do3 - .jmptab /* of the offset range. */ + + /* byte aligned destination (long + 1) */ + .align 2 +.loop_do1: + mov.l @r5+,r1 /* load first long & increment source addr */ + add #16,r4 /* increment dest addr */ + mov.l @r5+,r0 /* load second long & increment source addr */ + cmp/hi r5,r3 /* runs r5 up to last or second last long bound */ + mov.b r0,@-r4 /* store low byte of second long */ + shlr8 r0 /* get upper 3 bytes */ + mov r1,r2 /* copy first long */ + shll16 r2 /* move low byte of first long all the way up, .. */ + shll8 r2 + or r2,r0 /* ..combine with the 3 bytes of second long.. 
*/
+ mov.l r0,@-r4 /* ..and store as long */
+ shlr8 r1 /* get middle 2 bytes */
+ mov.w r1,@-r4 /* store as word */
+ shlr16 r1 /* get upper byte */
+ mov.b r1,@-r4 /* and store */
+ bt .loop_do1
+
+ add #4,r3 /* readjust end address */
+.last_do13:
+ cmp/hi r5,r3 /* one long left? */
+ bf .start_b2 /* no, jump to trailing byte loop */
+
+ mov.l @r5+,r0 /* load last long & increment source addr */
+ add #12,r4 /* increment dest addr */
+ mov.b r0,@-r4 /* store low byte */
+ shlr8 r0 /* get middle 2 bytes */
+ mov.w r0,@-r4 /* store as word */
+ shlr16 r0 /* get upper byte */
+ mov.b r0,@-r4 /* and store */
+ bra .start_b2 /* jump to trailing byte loop */
+ add #-4,r4 /* readjust destination */
+
+ /* byte aligned destination (long + 3) */
+ .align 2
+.loop_do3:
+ mov.l @r5+,r1 /* load first long & increment source addr */
+ add #16,r4 /* increment dest addr */
+ mov.l @r5+,r0 /* load second long & increment source addr */
+ mov r1,r2 /* copy first long */
+ mov.b r0,@-r4 /* store low byte of second long */
+ shlr8 r0 /* get middle 2 bytes */
+ mov.w r0,@-r4 /* store as word */
+ shlr16 r0 /* get upper byte */
+ shll8 r2 /* move lower 3 bytes of first long one up.. */
+ or r2,r0 /* ..combine with the 1 byte of second long.. */
+ mov.l r0,@-r4 /* ..and store as long */
+ shlr16 r1 /* get upper byte of first long.. */
+ shlr8 r1
+ cmp/hi r5,r3 /* runs r5 up to last or second last long bound */
+ mov.b r1,@-r4 /* ..and store */
+ bt .loop_do3
+
+ bra .last_do13 /* handle last longword: reuse routine for (long + 1) */
+ add #4,r3 /* readjust end address */
+
+ /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */
+ .align 2
+.loop_b2:
+ mov.b @r5+,r0 /* load byte & increment source addr */
+ add #1,r4 /* increment dest addr */
+ mov.b r0,@(7,r4) /* store byte */
+.start_b2:
+ cmp/hi r5,r6 /* runs r5 up to end address */
+ bt .loop_b2
+
+ rts
+ mov r7,r0 /* return dest start address */
+.end:
+ .size _memcpy,.end-_memcpy
diff --git a/firmware/target/sh/memmove-sh.S b/firmware/target/sh/memmove-sh.S
new file mode 100755
index 0000000000..9ae9ae5fa2
--- /dev/null
+++ b/firmware/target/sh/memmove-sh.S
@@ -0,0 +1,220 @@
+/***************************************************************************
+ * __________ __ ___.
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___
+ * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+ * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+ * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+ * \/ \/ \/ \/ \/
+ * $Id$
+ *
+ * Copyright (C) 2006 by Jens Arnold
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+
+ .section .icode,"ax",@progbits
+
+ .align 2
+ .global _memmove
+ .type _memmove,@function
+
+/* Moves <length> bytes of data in memory from <source> to <dest>
+ * Regions may overlap.
+ * This version is optimized for speed, and needs the corresponding memcpy
+ * implementation for the forward copy branch.
+ *
+ * arguments:
+ * r4 - destination address
+ * r5 - source address
+ * r6 - length
+ *
+ * return value:
+ * r0 - destination address (like ANSI version)
+ *
+ * register usage:
+ * r0 - data / scratch
+ * r1 - 2nd data / scratch
+ * r2 - scratch
+ * r3 - last long bound / adjusted start address (only if >= 11 bytes)
+ * r4 - current dest address
+ * r5 - source start address
+ * r6 - current source address
+ *
+ * The instruction order is devised in a way to utilize the pipelining
+ * of the SH1 to the max. The routine also tries to utilize fast page mode.
+ */
+
+_memmove:
+ cmp/hi r4,r5 /* source > destination */
+ bf .backward /* no: backward copy */
+ mov.l .memcpy_fwd,r0
+ jmp @r0
+ mov r4,r7 /* store dest for returning */
+
+ .align 2
+.memcpy_fwd:
+ .long ___memcpy_fwd_entry
+
+.backward:
+ add r6,r4 /* r4 = destination end */
+ mov #11,r0
+ cmp/hs r0,r6 /* at least 11 bytes to copy? (ensures 2 aligned longs) */
+ add #-8,r5 /* adjust for late decrement (max. 2 longs) */
+ add r5,r6 /* r6 = source end - 8 */
+ bf .start_b2r /* no: jump directly to byte loop */
+
+ mov #-4,r3 /* r3 = 0xfffffffc */
+ and r6,r3 /* r3 = last source long bound */
+ cmp/hi r3,r6 /* already aligned? */
+ bf .end_b1r /* yes: skip leading byte loop */
+
+.loop_b1r:
+ mov.b @(7,r6),r0 /* load byte */
+ add #-1,r6 /* decrement source addr */
+ mov.b r0,@-r4 /* store byte */
+ cmp/hi r3,r6 /* runs r6 down to last long bound */
+ bt .loop_b1r
+
+.end_b1r:
+ mov #3,r1
+ and r4,r1 /* r1 = dest alignment offset */
+ mova .jmptab_r,r0
+ mov.b @(r0,r1),r1 /* select appropriate main loop.. */
+ add r0,r1
+ mov r5,r3 /* copy start address to r3 */
+ jmp @r1 /* ..and jump to it */
+ add #7,r3 /* adjust end addr for main loops doing 2 longs/pass */
+
+ /** main loops, copying 2 longs per pass to profit from fast page mode **/
+
+ /* long aligned destination (fastest) */
+ .align 2
+.loop_do0r:
+ mov.l @r6,r1 /* load first long */
+ add #-8,r6 /* decrement source addr */
+ mov.l @(12,r6),r0 /* load second long */
+ cmp/hi r3,r6 /* runs r6 down to first or second long bound */
+ mov.l r0,@-r4 /* store second long */
+ mov.l r1,@-r4 /* store first long; NOT ALIGNED - no speed loss here! */
+ bt .loop_do0r
+
+ add #-4,r3 /* readjust end address */
+ cmp/hi r3,r6 /* first long left? */
+ bf .start_b2r /* no, jump to trailing byte loop */
+
+ mov.l @(4,r6),r0 /* load first long */
+ add #-4,r6 /* decrement source addr */
+ bra .start_b2r /* jump to trailing byte loop */
+ mov.l r0,@-r4 /* store first long */
+
+ /* word aligned destination (long + 2) */
+ .align 2
+.loop_do2r:
+ mov.l @r6,r1 /* load first long */
+ add #-8,r6 /* decrement source addr */
+ mov.l @(12,r6),r0 /* load second long */
+ cmp/hi r3,r6 /* runs r6 down to first or second long bound */
+ mov.w r0,@-r4 /* store low word of second long */
+ xtrct r1,r0 /* extract low word of first long & high word of second long */
+ mov.l r0,@-r4 /* and store as long */
+ shlr16 r1 /* get high word of first long */
+ mov.w r1,@-r4 /* and store it */
+ bt .loop_do2r
+
+ add #-4,r3 /* readjust end address */
+ cmp/hi r3,r6 /* first long left?
*/ + bf .start_b2r /* no, jump to trailing byte loop */ + + mov.l @(4,r6),r0 /* load first long & decrement source addr */ + add #-4,r6 /* decrement source addr */ + mov.w r0,@-r4 /* store low word */ + shlr16 r0 /* get high word */ + bra .start_b2r /* jump to trailing byte loop */ + mov.w r0,@-r4 /* and store it */ + + /* jumptable for loop selector */ + .align 2 +.jmptab_r: + .byte .loop_do0r - .jmptab_r /* placed in the middle because the SH1 */ + .byte .loop_do1r - .jmptab_r /* loads bytes sign-extended. Otherwise */ + .byte .loop_do2r - .jmptab_r /* the last loop would be out of reach */ + .byte .loop_do3r - .jmptab_r /* of the offset range. */ + + /* byte aligned destination (long + 1) */ + .align 2 +.loop_do1r: + mov.l @r6,r1 /* load first long */ + add #-8,r6 /* decrement source addr */ + mov.l @(12,r6),r0 /* load second long */ + cmp/hi r3,r6 /* runs r6 down to first or second long bound */ + mov.b r0,@-r4 /* store low byte of second long */ + shlr8 r0 /* get upper 3 bytes */ + mov r1,r2 /* copy first long */ + shll16 r2 /* move low byte of first long all the way up, .. */ + shll8 r2 + or r2,r0 /* ..combine with the 3 bytes of second long.. */ + mov.l r0,@-r4 /* ..and store as long */ + shlr8 r1 /* get middle 2 bytes */ + mov.w r1,@-r4 /* store as word */ + shlr16 r1 /* get upper byte */ + mov.b r1,@-r4 /* and store */ + bt .loop_do1r + + add #-4,r3 /* readjust end address */ +.last_do13r: + cmp/hi r3,r6 /* first long left? */ + bf .start_b2r /* no, jump to trailing byte loop */ + + nop /* alignment */ + mov.l @(4,r6),r0 /* load first long */ + add #-4,r6 /* decrement source addr */ + mov.b r0,@-r4 /* store low byte */ + shlr8 r0 /* get middle 2 bytes */ + mov.w r0,@-r4 /* store as word */ + shlr16 r0 /* get upper byte */ + bra .start_b2r /* jump to trailing byte loop */ + mov.b r0,@-r4 /* and store */ + + /* byte aligned destination (long + 3) */ + .align 2 +.loop_do3r: + mov.l @r6,r1 /* load first long */ + add #-8,r6 /* decrement source addr */ + mov.l @(12,r6),r0 /* load second long */ + mov r1,r2 /* copy first long */ + mov.b r0,@-r4 /* store low byte of second long */ + shlr8 r0 /* get middle 2 bytes */ + mov.w r0,@-r4 /* store as word */ + shlr16 r0 /* get upper byte */ + shll8 r2 /* move lower 3 bytes of first long one up.. */ + or r2,r0 /* ..combine with the 1 byte of second long.. */ + mov.l r0,@-r4 /* ..and store as long */ + shlr16 r1 /* get upper byte of first long */ + shlr8 r1 + cmp/hi r3,r6 /* runs r6 down to first or second long bound */ + mov.b r1,@-r4 /* ..and store */ + bt .loop_do3r + + bra .last_do13r /* handle first longword: reuse routine for (long + 1) */ + add #-4,r3 /* readjust end address */ + + /* trailing byte loop: copies 0..3 bytes (or all for < 11 in total) */ + .align 2 +.loop_b2r: + mov.b @(7,r6),r0 /* load byte */ + add #-1,r6 /* decrement source addr */ + mov.b r0,@-r4 /* store byte */ +.start_b2r: + cmp/hi r5,r6 /* runs r6 down to start address */ + bt .loop_b2r + + rts + mov r4,r0 /* return dest start address */ +.end: + .size _memmove,.end-_memmove diff --git a/firmware/target/sh/memset-sh.S b/firmware/target/sh/memset-sh.S new file mode 100755 index 0000000000..9b96b93f27 --- /dev/null +++ b/firmware/target/sh/memset-sh.S @@ -0,0 +1,107 @@ +/*************************************************************************** + * __________ __ ___. 
+ * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2004 by Jens Arnold + * + * All files in this archive are subject to the GNU General Public License. + * See the file COPYING in the source tree root for full license agreement. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ****************************************************************************/ +#include "config.h" + + .section .icode,"ax",@progbits + + .align 2 + .global _memset + .type _memset,@function + +/* Fills a memory region with specified byte value + * This version is optimized for speed + * + * arguments: + * r4 - start address + * r5 - data + * r6 - length + * + * return value: + * r0 - start address (like ANSI version) + * + * register usage: + * r0 - temporary + * r1 - start address +11 for main loop + * r4 - start address + * r5 - data (spread to all 4 bytes when using long stores) + * r6 - current address (runs down from end to start) + * + * The instruction order below is devised in a way to utilize the pipelining + * of the SH1 to the max. The routine fills memory from end to start in + * order to utilize the auto-decrementing store instructions. + */ + +_memset: + neg r4,r0 + and #3,r0 /* r0 = (4 - align_offset) % 4 */ + add #4,r0 + cmp/hs r0,r6 /* at least one aligned longword to fill? */ + add r4,r6 /* r6 = end_address */ + bf .no_longs /* no, jump directly to byte loop */ + + extu.b r5,r5 /* start: spread data to all 4 bytes */ + swap.b r5,r0 + or r0,r5 /* data now in 2 lower bytes of r5 */ + swap.w r5,r0 + or r0,r5 /* data now in all 4 bytes of r5 */ + + mov r6,r0 + tst #3,r0 /* r0 already long aligned? */ + bt .end_b1 /* yes: skip loop */ + + /* leading byte loop: sets 0..3 bytes */ +.loop_b1: + mov.b r5,@-r0 /* store byte */ + tst #3,r0 /* r0 long aligned? */ + bf .loop_b1 /* runs r0 down until long aligned */ + + mov r0,r6 /* r6 = last long bound */ + nop /* keep alignment */ + +.end_b1: + mov r4,r1 /* r1 = start_address... */ + add #11,r1 /* ... + 11, combined for rounding and offset */ + xor r1,r0 + tst #4,r0 /* bit 2 tells whether an even or odd number of */ + bf .loop_odd /* longwords to set */ + + /* main loop: set 2 longs per pass */ +.loop_2l: + mov.l r5,@-r6 /* store first long */ +.loop_odd: + cmp/hi r1,r6 /* runs r6 down to first long bound */ + mov.l r5,@-r6 /* store second long */ + bt .loop_2l + +.no_longs: + cmp/hi r4,r6 /* any bytes left? */ + bf .end_b2 /* no: skip loop */ + + /* trailing byte loop */ +.loop_b2: + mov.b r5,@-r6 /* store byte */ + cmp/hi r4,r6 /* runs r6 down to the start address */ + bt .loop_b2 + +.end_b2: + rts + mov r4,r0 /* return start address */ + +.end: + .size _memset,.end-_memset diff --git a/firmware/common/strlen_a.S b/firmware/target/sh/strlen-sh.S similarity index 100% rename from firmware/common/strlen_a.S rename to firmware/target/sh/strlen-sh.S
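
Note: the two memset implementations added here (SH1 and ARM) share one
strategy: spread the fill byte across a full machine word, then fill
backwards from the end of the region with aligned word stores, using plain
byte stores only for the unaligned head and tail. A rough C model of that
strategy follows; it is an illustrative sketch, not code from this patch
(memset_model is a made-up name, and the shipped routines are the assembly
files above):

#include <stddef.h>
#include <stdint.h>

/* Illustrative model of the backward-fill strategy in memset-sh.S and
 * memset-arm.S. Not part of the patch. */
void *memset_model(void *dst, int c, size_t len)
{
    unsigned char *start = dst;
    unsigned char *p = start + len;        /* fill runs from end to start */
    uint32_t pat = (uint8_t)c;

    pat |= pat << 8;                       /* byte -> both low bytes */
    pat |= pat << 16;                      /* -> all four bytes */

    /* use word stores only if at least one aligned word fits,
     * mirroring the "at least one aligned longword" check */
    if (len >= 4 + ((4 - ((uintptr_t)start & 3)) & 3)) {
        while ((uintptr_t)p & 3)           /* align the end pointer down */
            *--p = (uint8_t)c;
        while ((size_t)(p - start) >= 4) { /* aligned word stores */
            p -= 4;
            *(uint32_t *)p = pat;
        }
    }
    while (p > start)                      /* remaining head bytes */
        *--p = (uint8_t)c;
    return start;
}

Filling from the end is what lets both routines lean on auto-decrementing
store addressing (@-Rn on SH1, [r0, #-1]! writeback on ARM), which is why
each one first adds the length to the base pointer.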
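The memmove dispatch deserves the same treatment: overlap safety comes
entirely from choosing the copy direction from the pointer order, and the
forward case is delegated to memcpy through the shared ___memcpy_fwd_entry
label. A minimal C sketch of that decision, with hypothetical names and
simple byte loops standing in for the optimized assembly paths:

#include <stddef.h>

/* Illustrative model of the direction dispatch in memmove-sh.S.
 * Not part of the patch. */
void *memmove_model(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    if (s > d) {            /* forward copy cannot clobber unread bytes */
        while (n--)
            *d++ = *s++;    /* the assembly jumps into memcpy here */
    } else if (s < d) {     /* backward copy, starting from the end */
        while (n--)
            d[n] = s[n];
    }                       /* s == d: nothing to do */
    return dst;
}

The cmp/hi r4,r5 at the top of _memmove is exactly the s > d test above: an
unsigned compare of source against destination, branching to the backward
path when the source address is not higher.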