Coldfire: Assembler optimised disk writing, speed increase ~36% on average.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@9117 a1c6a512-1295-4272-9138-f99709370657
Jens Arnold 2006-03-19 12:37:23 +00:00
parent 958d6ac278
commit bf3dba8628


@@ -38,8 +38,7 @@
#if (CONFIG_CPU == MCF5249) || (CONFIG_CPU == MCF5250)
/* asm reading, C writing */
#define PREFER_C_WRITING
/* asm reading + writing */
#define ATA_IOBASE 0x20000000
#define ATA_DATA (*((volatile unsigned short*)(ATA_IOBASE + 0x20)))
@@ -505,7 +504,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
"btst.l #1,%%d0 \n" /* longword aligned? */
"beq.b .end_u_w1 \n" /* yes, skip leading word handling */
"swap %%d2 \n" /* move initila word up */
"swap %%d2 \n" /* move initial word up */
"move.w (%[ata]),%%d2 \n" /* combine with second word */
"move.l %%d2,%%d3 \n"
"lsr.l #8,%%d3 \n"
@@ -533,7 +532,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
".end_u_l1: \n"
"lea.l (-14,%[wcnt]),%[wcnt] \n" /* adjust end addr. to 16 bytes/pass */
".loop_u_line: \n"
"move.w (%[ata]),%%d3 \n" /* load 1st word */
"swap %%d3 \n" /* move to upper 16 bit */
@@ -675,7 +674,7 @@ static void copy_read_sectors(unsigned char* buf, int wordcount)
"d0", "d1", "d2", "d3", "d4", "d5", "d6"
);
#else
/* turbo-charged assembler version */
/* SH1 turbo-charged assembler reading */
/* this assumes wordcount to be a multiple of 4 */
asm (
"add %1,%1 \n" /* wordcount -> bytecount */
@@ -981,7 +980,209 @@ static void copy_write_sectors(const unsigned char* buf, int wordcount)
} while (++wbuf < wbufend); /* tail loop is faster */
}
#else
/* optimized assembler version */
#ifdef CPU_COLDFIRE
/* coldfire asm writing, utilising line bursts */
asm (
"add.l %[wcnt],%[wcnt] \n" /* wordcount -> bytecount */
"add.l %[buf],%[wcnt] \n" /* bytecount -> bufend */
"move.l %[buf],%%d0 \n"
"btst.l #0,%%d0 \n" /* 16-bit aligned? */
"jeq .w_aligned \n" /* yes, do word copy */
/* not 16-bit aligned */
"subq.l #1,%[wcnt] \n" /* last byte is done unconditionally */
"moveq.l #24,%%d1 \n" /* preload shift count */
"move.b (%[buf])+,%%d2 \n"
"btst.l #1,%%d0 \n" /* longword aligned? */
"beq.b .w_end_u_w1 \n" /* yes, skip leading word handling */
"swap %%d2 \n"
"move.w (%[buf])+,%%d2 \n"
"move.l %%d2,%%d3 \n"
"lsr.l #8,%%d3 \n"
"move.w %%d3,(%[ata]) \n"
".w_end_u_w1: \n"
"moveq.l #12,%%d0 \n"
"add.l %[buf],%%d0 \n"
"and.l #0xFFFFFFF0,%%d0\n" /* d0 == first line bound */
"cmp.l %[buf],%%d0 \n" /* any leading longwords? */
"bls.b .w_end_u_l1 \n" /* no: skip loop */
".w_loop_u_l1: \n"
"move.l (%[buf])+,%%d3 \n"
"move.l %%d3,%%d4 \n"
"lsl.l %%d1,%%d2 \n"
"lsr.l #8,%%d3 \n"
"or.l %%d3,%%d2 \n"
"swap %%d2 \n"
"move.w %%d2,(%[ata]) \n"
"swap %%d2 \n"
"move.w %%d2,(%[ata]) \n"
"move.l %%d4,%%d2 \n"
"cmp.l %[buf],%%d0 \n" /* run up to first line bound */
"bhi.b .w_loop_u_l1 \n"
".w_end_u_l1: \n"
"lea.l (-14,%[wcnt]),%[wcnt] \n" /* adjust end addr. to 16 bytes/pass */
".w_loop_u_line: \n"
"movem.l (%[buf]),%%d3-%%d6 \n"
"lea.l (16,%[buf]),%[buf] \n"
"move.l %%d3,%%d0 \n"
"lsl.l %%d1,%%d2 \n"
"lsr.l #8,%%d0 \n"
"or.l %%d0,%%d2 \n"
"swap %%d2 \n"
"move.w %%d2,(%[ata]) \n"
"swap %%d2 \n"
"move.w %%d2,(%[ata]) \n"
"move.l %%d4,%%d0 \n"
"lsl.l %%d1,%%d3 \n"
"lsr.l #8,%%d0 \n"
"or.l %%d0,%%d3 \n"
"swap %%d3 \n"
"move.w %%d3,(%[ata]) \n"
"swap %%d3 \n"
"move.w %%d3,(%[ata]) \n"
"move.l %%d5,%%d0 \n"
"lsl.l %%d1,%%d4 \n"
"lsr.l #8,%%d0 \n"
"or.l %%d0,%%d4 \n"
"swap %%d4 \n"
"move.w %%d4,(%[ata]) \n"
"swap %%d4 \n"
"move.w %%d4,(%[ata]) \n"
"move.l %%d6,%%d0 \n"
"lsl.l %%d1,%%d5 \n"
"lsr.l #8,%%d0 \n"
"or.l %%d0,%%d5 \n"
"swap %%d5 \n"
"move.w %%d5,(%[ata]) \n"
"swap %%d5 \n"
"move.w %%d5,(%[ata]) \n"
"move.l %%d6,%%d2 \n"
"cmp.l %[buf],%[wcnt] \n" /* run up to last line bound */
"bhi.b .w_loop_u_line \n"
"lea.l (12,%[wcnt]),%[wcnt]\n" /* readjust for longword loop */
"cmp.l %[buf],%[wcnt] \n" /* any trailing longwords? */
"bls.b .w_end_u_l2 \n" /* no: skip loop */
".w_loop_u_l2: \n"
"move.l (%[buf])+,%%d3 \n"
"move.l %%d3,%%d4 \n"
"lsl.l %%d1,%%d2 \n"
"lsr.l #8,%%d3 \n"
"or.l %%d3,%%d2 \n"
"swap %%d2 \n"
"move.w %%d2,(%[ata]) \n"
"swap %%d2 \n"
"move.w %%d2,(%[ata]) \n"
"move.l %%d4,%%d2 \n"
"cmp.l %[buf],%[wcnt] \n" /* run up to first line bound */
"bhi.b .w_loop_u_l2 \n"
".w_end_u_l2: \n"
"addq.l #2,%[wcnt] \n" /* back to final end address */
"cmp.l %[buf],%[wcnt] \n" /* one word left? */
"bls.b .w_end_u_w2 \n"
"swap %%d2 \n"
"move.w (%[buf])+,%%d2 \n"
"move.l %%d2,%%d3 \n"
"lsr.l #8,%%d3 \n"
"move.w %%d3,(%[ata]) \n"
".w_end_u_w2: \n"
"lsl.l #8,%%d2 \n"
"move.b (%[buf])+,%%d2 \n"
"move.w %%d2,(%[ata]) \n"
"bra.b .w_exit \n"
/* 16-bit aligned */
".w_aligned: \n"
"btst.l #1,%%d0 \n"
"beq.b .w_end_a_w1 \n"
"move.w (%[buf])+,(%[ata]) \n" /* copy initial word */
".w_end_a_w1: \n"
"moveq.l #12,%%d0 \n"
"add.l %[buf],%%d0 \n"
"and.l #0xFFFFFFF0,%%d0\n" /* d0 == first line bound */
"cmp.l %[buf],%%d0 \n" /* any leading longwords? */
"bls.b .w_end_a_l1 \n" /* no: skip loop */
".w_loop_a_l1: \n"
"move.l (%[buf])+,%%d1 \n"
"swap %%d1 \n"
"move.w %%d1,(%[ata]) \n"
"swap %%d1 \n"
"move.w %%d1,(%[ata]) \n"
"cmp.l %[buf],%%d0 \n" /* run up to first line bound */
"bhi.b .w_loop_a_l1 \n"
".w_end_a_l1: \n"
"lea.l (-14,%[wcnt]),%[wcnt] \n" /* adjust end addr. to 16 bytes/pass */
".w_loop_a_line: \n"
"movem.l (%[buf]),%%d0-%%d3 \n"
"lea.l (16,%[buf]),%[buf] \n"
"swap %%d0 \n"
"move.w %%d0,(%[ata]) \n"
"swap %%d0 \n"
"move.w %%d0,(%[ata]) \n"
"swap %%d1 \n"
"move.w %%d1,(%[ata]) \n"
"swap %%d1 \n"
"move.w %%d1,(%[ata]) \n"
"swap %%d2 \n"
"move.w %%d2,(%[ata]) \n"
"swap %%d2 \n"
"move.w %%d2,(%[ata]) \n"
"swap %%d3 \n"
"move.w %%d3,(%[ata]) \n"
"swap %%d3 \n"
"move.w %%d3,(%[ata]) \n"
"cmp.l %[buf],%[wcnt] \n" /* run up to last line bound */
"bhi.b .w_loop_a_line \n"
"lea.l (12,%[wcnt]),%[wcnt]\n" /* readjust for longword loop */
"cmp.l %[buf],%[wcnt] \n" /* any trailing longwords? */
"bls.b .w_end_a_l2 \n" /* no: skip loop */
".w_loop_a_l2: \n"
"move.l (%[buf])+,%%d1 \n"
"swap %%d1 \n"
"move.w %%d1,(%[ata]) \n"
"swap %%d1 \n"
"move.w %%d1,(%[ata]) \n"
"cmp.l %[buf],%[wcnt] \n" /* run up to first line bound */
"bhi.b .w_loop_a_l2 \n"
".w_end_a_l2: \n"
"addq.l #2,%[wcnt] \n" /* back to final end address */
"cmp.l %[buf],%[wcnt] \n" /* one word left? */
"bls.b .w_end_a_w2 \n"
"move.w (%[buf])+,(%[ata]) \n" /* copy final word */
".w_end_a_w2: \n"
".w_exit: \n"
: /* outputs */
: /* inputs */
[buf] "a"(buf),
[wcnt]"a"(wordcount),
[ata] "a"(&ATA_DATA)
: /*trashed */
"d0", "d1", "d2", "d3", "d4", "d5", "d6"
);
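The Coldfire writer above moves one 16-byte line per pass: movem.l bursts four longwords out of the buffer, and each longword is pushed to the ATA data register as two 16-bit writes, upper half first (big-endian order). The odd-address path carries the leftover byte across iterations in d2 and merges it in with shift/or before each write. As a rough illustration only (not part of the commit), a minimal C sketch of the 16-bit-aligned fast path, reusing the ATA_DATA macro from the first hunk; the function name is hypothetical and the line-alignment preamble of the real routine is omitted:

/* Illustration only: rough C equivalent of the 16-bit-aligned fast path.
 * Assumes buf is at least 16-bit aligned and ATA_DATA is the halfword
 * register macro defined earlier in this file. */
static void coldfire_write_sketch(const unsigned char* buf, int wordcount)
{
    const unsigned char* end = buf + 2 * wordcount;

    if ((unsigned long)buf & 2) {            /* leading word -> longword aligned */
        ATA_DATA = *(const unsigned short*)buf;
        buf += 2;
    }
    while (end - buf >= 16) {                /* one 16-byte line per pass */
        const unsigned long* line = (const unsigned long*)buf;
        unsigned long d0 = line[0], d1 = line[1], d2 = line[2], d3 = line[3];
        ATA_DATA = d0 >> 16; ATA_DATA = d0;  /* upper halfword first */
        ATA_DATA = d1 >> 16; ATA_DATA = d1;
        ATA_DATA = d2 >> 16; ATA_DATA = d2;
        ATA_DATA = d3 >> 16; ATA_DATA = d3;
        buf += 16;
    }
    while (buf < end) {                      /* trailing words */
        ATA_DATA = *(const unsigned short*)buf;
        buf += 2;
    }
}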
#else
/* SH1 optimized assembler version */
/* this assumes wordcount to be a multiple of 2 */
/* writing is not unrolled as much as reading, for several reasons:
@@ -1065,6 +1266,7 @@ static void copy_write_sectors(const unsigned char* buf, int wordcount)
"r0","r1","r2","r3","r6"
);
#endif
#endif
}
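For context (the call site is outside this hunk), copy_write_sectors() is presumably invoked once per sector from the loop in ata_write_sectors() below with half a sector's worth of words, e.g. something like the following, assuming Rockbox's usual SECTOR_SIZE of 512 bytes:

    copy_write_sectors(buf, SECTOR_SIZE / 2);  /* 256 words per sector */

A count of 256 satisfies both the "multiple of 2" assumption noted for the SH1 writer and the "multiple of 4" assumption noted for the reader.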
int ata_write_sectors(IF_MV2(int drive,)