memset() on ColdFire now exploits burst mode whenever possible, giving a further speed increase of up to 2.4 times for large blocks. Added a slight optimisation for small blocks as well.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6790 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
ac0bc83777
commit
8caf175c7f
1 changed files with 60 additions and 9 deletions
|
@ -123,21 +123,27 @@ _memset:
|
|||
*
|
||||
* register usage:
|
||||
* %d0 - data (spread to all 4 bytes when using long stores)
|
||||
* %d1 - temporary
|
||||
* %d1 - temporary / data (for burst transfer)
|
||||
* %d2 - data (for burst transfer)
|
||||
* %d3 - data (for burst transfer)
|
||||
* %a0 - start address
|
||||
* %a1 - current address (runs down from end to start)
|
||||
* %a2 - lower bound for burst transfer (start address + 15)
|
||||
*
|
||||
* For maximum speed this routine uses both long stores and burst mode,
|
||||
* storing whole lines with movem.l. The routine fills memory from end
|
||||
* to start in order to ease returning the start address.
|
||||
*/
|
||||
memset:
|
||||
move.l (4,%sp),%a0 /* start address */
|
||||
move.l (8,%sp),%d0 /* data */
|
||||
move.l (12,%sp),%a1 /* length */
|
||||
add.l %a0,%a1 /* %a1 = end address */
|
||||
|
||||
move.l %a0,%d1
|
||||
neg.l %d1
|
||||
and.l #3,%d1 /* %d1 = (4 - align_offset) % 4 */
|
||||
addq.l #4,%d1
|
||||
addq.l #7,%d1
|
||||
and.l #0xFFFFFFFC,%d1 /* %d1 = first long bound + 4 */
|
||||
cmp.l %d1,%a1 /* at least one aligned longword to fill? */
|
||||
add.l %a0,%a1 /* %a1 = end address; doesn't change flags */
|
||||
blo.b .no_longs /* no, jump directly to byte loop */
|
||||
|
||||
and.l #0xFF,%d0 /* start: spread data to all 4 bytes */
|
||||
|
@ -148,7 +154,7 @@ memset:
|
|||
swap %d0
|
||||
or.l %d1,%d0 /* data now in all 4 bytes of %d0 */
|
||||
|
||||
mov.l %a1,%d1
|
||||
move.l %a1,%d1
|
||||
and.l #0xFFFFFFFC,%d1 /* %d1 = last long bound */
|
||||
cmp.l %d1,%a1 /* any bytes to set? */
|
||||
bls.b .end_b1 /* no: skip byte loop */
|
||||
|
@ -160,14 +166,59 @@ memset:
|
|||
bhi.b .loop_b1
|
||||
|
||||
.end_b1:
|
||||
move.l %a0,%d1
|
||||
add.l #31,%d1
|
||||
and.l #0xFFFFFFF0,%d1 /* %d1 = first line bound + 16 */
|
||||
cmp.l %d1,%a1 /* at least one full line to fill? */
|
||||
blo.b .no_lines /* no, jump to longword loop */
|
||||
|
||||
mov.l %a1,%d1
|
||||
and.l #0xFFFFFFF0,%d1 /* %d1 = last line bound */
|
||||
cmp.l %d1,%a1 /* any longwords to set? */
|
||||
bls.b .end_l1 /* no: skip longword loop */
|
||||
|
||||
/* leading longword loop: sets 0..3 longwords */
|
||||
.loop_l1:
|
||||
move.l %d0,-(%a1) /* store longword */
|
||||
cmp.l %d1,%a1 /* runs %a1 down to last line bound */
|
||||
bhi.b .loop_l1
|
||||
|
||||
.end_l1:
|
||||
move.l %d2,-(%sp) /* free some registers */
|
||||
move.l %d3,-(%sp)
|
||||
move.l %a2,-(%sp)
|
||||
|
||||
move.l %d0,%d1 /* spread data to 4 data registers */
|
||||
move.l %d0,%d2
|
||||
move.l %d0,%d3
|
||||
lea.l (15,%a0),%a2 /* %a2 = start address + 15, acct. for trl. data */
|
||||
|
||||
/* main loop: set whole lines utilising burst mode */
|
||||
.loop_line:
|
||||
lea.l (-16,%a1),%a1 /* pre-decrement */
|
||||
movem.l %d0-%d3,(%a1) /* store line */
|
||||
cmp.l %a2,%a1 /* runs %a1 down to first line bound */
|
||||
bhi.b .loop_line
|
||||
|
||||
move.l (%sp)+,%a2 /* restore registers */
|
||||
move.l (%sp)+,%d3
|
||||
move.l (%sp)+,%d2
|
||||
|
||||
move.l %a0,%d1 /* %d1 = start address ... */
|
||||
addq.l #3,%d1 /* ... +3, account for possible trailing bytes */
|
||||
bra.b .start_l2 /* there might be no longwords left when coming
|
||||
* out of the main loop */
|
||||
|
||||
.no_lines:
|
||||
move.l %a0,%d1 /* %d1 = start address ... */
|
||||
addq.l #3,%d1 /* ... +3, account for possible trailing bytes */
|
||||
|
||||
/* main loop: set longs */
|
||||
.loop_l:
|
||||
/* trailing longword loop */
|
||||
.loop_l2:
|
||||
move.l %d0,-(%a1) /* store longword */
|
||||
.start_l2:
|
||||
cmp.l %d1,%a1 /* runs %a1 down to first long bound */
|
||||
bhi.b .loop_l
|
||||
bhi.b .loop_l2
|
||||
|
||||
.no_longs:
|
||||
cmp.l %a0,%a1 /* any bytes left? */
|
||||
|
|
Loading…
Reference in a new issue