diff --git a/firmware/common/memset_a.S b/firmware/common/memset_a.S index a35fcb10a3..c3da66874b 100644 --- a/firmware/common/memset_a.S +++ b/firmware/common/memset_a.S @@ -123,21 +123,27 @@ _memset: * * register usage: * %d0 - data (spread to all 4 bytes when using long stores) - * %d1 - temporary + * %d1 - temporary / data (for burst transfer) + * %d2 - data (for burst transfer) + * %d3 - data (for burst transfer) * %a0 - start address * %a1 - current address (runs down from end to start) + * %a2 - lower bound of the line loop, start address + 15 (for burst transfer) + * + * For maximum speed this routine uses both long stores and burst mode, + * storing whole lines with movem.l. The routine fills memory from end + * to start in order to ease returning the start address. */ memset: move.l (4,%sp),%a0 /* start address */ move.l (8,%sp),%d0 /* data */ move.l (12,%sp),%a1 /* length */ + add.l %a0,%a1 /* %a1 = end address */ move.l %a0,%d1 - neg.l %d1 - and.l #3,%d1 /* %d1 = (4 - align_offset) % 4 */ - addq.l #4,%d1 + addq.l #7,%d1 + and.l #0xFFFFFFFC,%d1 /* %d1 = first long bound + 4 */ cmp.l %d1,%a1 /* at least one aligned longword to fill? */ - add.l %a0,%a1 /* %a1 = end address; doesn't change flags */ blo.b .no_longs /* no, jump directly to byte loop */ and.l #0xFF,%d0 /* start: spread data to all 4 bytes */ @@ -148,7 +154,7 @@ memset: swap %d0 or.l %d1,%d0 /* data now in all 4 bytes of %d0 */ - mov.l %a1,%d1 + move.l %a1,%d1 and.l #0xFFFFFFFC,%d1 /* %d1 = last long bound */ cmp.l %d1,%a1 /* any bytes to set? */ bls.b .end_b1 /* no: skip byte loop */ @@ -160,14 +166,59 @@ memset: bhi.b .loop_b1 .end_b1: + move.l %a0,%d1 + add.l #31,%d1 + and.l #0xFFFFFFF0,%d1 /* %d1 = first line bound + 16 */ + cmp.l %d1,%a1 /* at least one full line to fill? */ + blo.b .no_lines /* no, jump to longword loop */ + + move.l %a1,%d1 + and.l #0xFFFFFFF0,%d1 /* %d1 = last line bound */ + cmp.l %d1,%a1 /* any longwords to set?
*/ + bls.b .end_l1 /* no: skip longword loop */ + + /* leading longword loop: sets 0..3 longwords */ +.loop_l1: + move.l %d0,-(%a1) /* store longword */ + cmp.l %d1,%a1 /* runs %a1 down to last line bound */ + bhi.b .loop_l1 + +.end_l1: + move.l %d2,-(%sp) /* free some registers */ + move.l %d3,-(%sp) + move.l %a2,-(%sp) + + move.l %d0,%d1 /* spread data to 4 data registers */ + move.l %d0,%d2 + move.l %d0,%d3 + lea.l (15,%a0),%a2 /* %a2 = start address + 15, accounts for trailing data */ + + /* main loop: set whole lines utilising burst mode */ +.loop_line: + lea.l (-16,%a1),%a1 /* pre-decrement */ + movem.l %d0-%d3,(%a1) /* store line */ + cmp.l %a2,%a1 /* runs %a1 down to first line bound */ + bhi.b .loop_line + + move.l (%sp)+,%a2 /* restore registers */ + move.l (%sp)+,%d3 + move.l (%sp)+,%d2 + + move.l %a0,%d1 /* %d1 = start address ... */ + addq.l #3,%d1 /* ... +3, account for possible trailing bytes */ + bra.b .start_l2 /* there might be no longwords left when coming + * out of the main loop */ + +.no_lines: move.l %a0,%d1 /* %d1 = start address ... */ addq.l #3,%d1 /* ... +3, account for possible trailing bytes */ - /* main loop: set longs */ -.loop_l: + /* trailing longword loop */ +.loop_l2: move.l %d0,-(%a1) /* store longword */ +.start_l2: cmp.l %d1,%a1 /* runs %a1 down to first long bound */ - bhi.b .loop_l + bhi.b .loop_l2 .no_longs: cmp.l %a0,%a1 /* any bytes left? */