From b646d4f278017b4b3797ab0976239bd64df8b43e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Hohensohn?= Date: Mon, 9 Feb 2004 08:24:25 +0000 Subject: [PATCH] Assembly code for copy_read_sectors() reworked: I've spread the ATA reads apart by filling independent instructions in between; this is also slightly faster because there is no pipeline stall. Hopefully this fixes the problem Kargatron had with it. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@4302 a1c6a512-1295-4272-9138-f99709370657 --- firmware/drivers/ata.c | 59 ++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/firmware/drivers/ata.c b/firmware/drivers/ata.c index e114718bb4..284c599ef8 100644 --- a/firmware/drivers/ata.c +++ b/firmware/drivers/ata.c @@ -181,16 +181,23 @@ static void copy_read_sectors(unsigned char* buf, int wordcount) *buf++ = tmp >> 8; /* and don't use the SWAB16 macro */ } while (buf < bufend); /* tail loop is faster */ #else - asm ( /* I can bring it down to 7 instructions/loop */ - "mov #1, r0 \n" + /* I can bring it down to 7 instructions/loop, and exploit pipeline */ + asm ( + "mov #1, r0 \n" /* r0 = 1; */ + /* correct for the "early increment" below */ + "add #-2,%2 \n" /* buf -= 2; */ + "add #-2,%3 \n" /* bufend -= 2; */ "loop_b: \n" - "mov.w @%1,%0 \n" - "mov.b %0,@%2 \n" - "shlr8 %0 \n" - "mov.b %0,@(r0,%2) \n" - "add #0x02,%2 \n" - "cmp/hs %3,%2 \n" - "bf loop_b \n" + "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */ + /* Now we're reading from the bus, I do something independent we + need later, to avoid pipeline stall */ + "add #0x02,%2 \n" /* buf += 2; */ + "cmp/hs %3,%2 \n" /* if (buf < bufend) */ + /* now use the read result */ + "mov.b %0,@%2 \n" /* buf[0] = lowbyte(tmp); */ + "shlr8 %0 \n" /* tmp >>= 8; */ + "mov.b %0,@(r0,%2) \n" /* buf[r0] = lowbyte(tmp); */ + "bf loop_b \n" /* goto loop_b; */ : /* outputs */ : /* inputs */ /* %0 */ "r"(tmp), @@ -212,18 +219,30 @@ static void copy_read_sectors(unsigned char* buf, int 
wordcount) *wbuf = SWAB16(ATA_DATA); } while (++wbuf < wbufend); /* tail loop is faster */ #else - asm ( /* I can bring it down to 9 instructions for 2 loops */ - "mov #2, r0 \n" + /* I can bring it down to 9 instructions for 2 loops, and pipeline */ + asm ( + "mov #2, r0 \n" /* r0 = 2 */ + /* correct for the "early increment" below */ + "add #-4,%2 \n" /* wbuf -= 4; */ + "bra enter_loop \n" /* goto enter_loop, after next instr. */ + "add #-4,%3 \n" /* wbufend -= 4; */ "loop_w: \n" - "mov.w @%1,%0 \n" - "swap.b %0,%0 \n" - "mov.w %0,@%2 \n" - "mov.w @%1,%0 \n" /* unrolled, do one more */ - "swap.b %0,%0 \n" - "mov.w %0,@(r0,%2) \n" - "add #0x04,%2 \n" - "cmp/hs %3,%2 \n" - "bf loop_w \n" + /* use read result and store, from last round */ + "swap.b %0,%0 \n" /* endian_swap(tmp); */ + "mov.w %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */ + "enter_loop: \n" + "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */ + /* keep the pipeline busy with 2 independent instructions */ + "add #0x04,%2 \n" /* wbuf += 4; */ + "cmp/hs %3,%2 \n" /* if (wbuf < wbufend) */ + "swap.b %0,%0 \n" /* endian_swap(tmp); */ + "mov.w %0,@%2 \n" /* wbuf[0] = tmp; */ + /* unrolled, do one more */ + "mov.w @%1,%0 \n" /* tmp = ATA_DATA; */ + /* use and store later, to keep pipeline busy */ + "bf loop_w \n" /* goto loop_w; */ + "swap.b %0,%0 \n" /* endian_swap(tmp); */ + "mov.w %0,@(r0,%2) \n" /* wbuf[r0] = tmp; */ : /* outputs */ : /* inputs */ /* %0 */ "r"(tmp),