From 0a291fff12c27ba6b46521ecaf126bdb4726c24e Mon Sep 17 00:00:00 2001
From: Jens Arnold
Date: Wed, 10 Feb 2010 23:23:17 +0000
Subject: [PATCH] APE: Fused vector math for the filters on ARMv5te.

Speedup on Cowon D2 is ~4% for -c2000..-c4000 (less for -c5000).
Thanks to Frank Gevaerts for testing.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24590 a1c6a512-1295-4272-9138-f99709370657
---
 .../demac/libdemac/vector_math16_armv5te.h | 441 +++++++++++-------
 1 file changed, 266 insertions(+), 175 deletions(-)

diff --git a/apps/codecs/demac/libdemac/vector_math16_armv5te.h b/apps/codecs/demac/libdemac/vector_math16_armv5te.h
index 4f2c203f5e..2940585a42 100644
--- a/apps/codecs/demac/libdemac/vector_math16_armv5te.h
+++ b/apps/codecs/demac/libdemac/vector_math16_armv5te.h
@@ -24,180 +24,288 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA */
 
-/* This version fetches data as 32 bit words, and *requires* v1 to be
- * 32 bit aligned, otherwise it will result either in a data abort, or
- * incorrect results (if ARM aligncheck is disabled). */
-static inline void vector_add(int16_t* v1, int16_t* v2)
+#define FUSED_VECTOR_MATH
+
+/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. If either condition isn't met, it will either
+ * result in a data abort or incorrect results. */
+static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
 {
+    int res;
 #if ORDER > 16
     int cnt = ORDER>>4;
 #endif
 
-#define ADDHALFREGS(sum, s1)                          /* Adds register */ \
-    "mov " #s1 ", " #s1 ", ror #16 \n"                /* halves straight. */ \
-    "add r8 , " #s1 ", " #sum ", lsl #16 \n"          /* Clobbers 's1' */ \
-    "add " #sum ", " #s1 ", " #sum ", lsr #16 \n"     /* and r8. */ \
-    "mov " #sum ", " #sum ", lsl #16 \n" \
-    "orr " #sum ", " #sum ", r8 , lsr #16 \n"
+#define ADDHALFREGS(sum, s1, s2)                      /* Adds register */ \
+    "mov " #s1 ", " #s1 ", ror #16 \n"                /* halves straight */ \
+    "add " #sum ", " #s1 ", " #s2 ", lsl #16 \n"      /* Clobbers 's1' */ \
+    "add " #s1 ", " #s1 ", " #s2 ", lsr #16 \n" \
+    "mov " #s1 ", " #s1 ", lsl #16 \n" \
+    "orr " #sum ", " #s1 ", " #sum ", lsr #16 \n"
 
 #define ADDHALFXREGS(sum, s1, s2)                     /* Adds register */ \
     "add " #s1 ", " #s1 ", " #sum ", lsl #16 \n"      /* halves across. */ \
     "add " #sum ", " #s2 ", " #sum ", lsr #16 \n"     /* Clobbers 's1'. */ \
     "mov " #sum ", " #sum ", lsl #16 \n" \
     "orr " #sum ", " #sum ", " #s1 ", lsr #16 \n"
 
     asm volatile (
-    "tst %[v2], #2 \n"
-    "beq 20f \n"
-
-    "10: \n"
-    "ldrh r4, [%[v2]], #2 \n"
-    "mov r4, r4, lsl #16 \n"
-    "1: \n"
-    "ldmia %[v1], {r0-r3} \n"
-    "ldmia %[v2]!, {r5-r8} \n"
-    ADDHALFXREGS(r0, r4, r5)
-    ADDHALFXREGS(r1, r5, r6)
-    ADDHALFXREGS(r2, r6, r7)
-    ADDHALFXREGS(r3, r7, r8)
-    "stmia %[v1]!, {r0-r3} \n"
-    "mov r4, r8 \n"
-    "ldmia %[v1], {r0-r3} \n"
-    "ldmia %[v2]!, {r5-r8} \n"
-    ADDHALFXREGS(r0, r4, r5)
-    ADDHALFXREGS(r1, r5, r6)
-    ADDHALFXREGS(r2, r6, r7)
-    ADDHALFXREGS(r3, r7, r8)
-    "stmia %[v1]!, {r0-r3} \n"
 #if ORDER > 16
-    "mov r4, r8 \n"
-    "subs %[cnt], %[cnt], #1 \n"
-    "bne 1b \n"
+    "mov %[res], #0 \n"
 #endif
-    "b 99f \n"
+    "tst %[f2], #2 \n"
+    "beq 20f \n"
 
-    "20: \n"
-    "1: \n"
-    "ldmia %[v1], {r0-r3} \n"
-    "ldmia %[v2]!, {r4-r7} \n"
-    ADDHALFREGS(r0, r4)
-    ADDHALFREGS(r1, r5)
-    ADDHALFREGS(r2, r6)
-    ADDHALFREGS(r3, r7)
-    "stmia %[v1]!, {r0-r3} \n"
-    "ldmia %[v1], {r0-r3} \n"
-    "ldmia %[v2]!, {r4-r7} \n"
-    ADDHALFREGS(r0, r4)
-    ADDHALFREGS(r1, r5)
-    ADDHALFREGS(r2, r6)
-    ADDHALFREGS(r3, r7)
-    "stmia %[v1]!, {r0-r3} \n"
+    "10: \n"
+    "ldrh r4, [%[s2]], #2 \n"
+    "mov r4, r4, lsl #16 \n"
+    "ldrh r3, [%[f2]], #2 \n"
 #if ORDER > 16
-    "subs %[cnt], %[cnt], #1 \n"
-    "bne 1b \n"
+    "mov r3, r3, lsl #16 \n"
+    "1: \n"
+    "ldmia %[v1], {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+#else
+    "ldmia %[v1], {r0,r1} \n"
+    "smulbb %[res], r0, r3 \n"
+#endif
+    "ldmia %[f2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+    "ldmia %[s2]!, {r2,r5} \n"
+    ADDHALFXREGS(r0, r4, r2)
+    ADDHALFXREGS(r1, r2, r5)
+    "stmia %[v1]!, {r0,r1} \n"
+    "ldmia %[v1], {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+    "ldmia %[f2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+    "ldmia %[s2]!, {r2,r4} \n"
+    ADDHALFXREGS(r0, r5, r2)
+    ADDHALFXREGS(r1, r2, r4)
+    "stmia %[v1]!, {r0,r1} \n"
+
+    "ldmia %[v1], {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+    "ldmia %[f2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+    "ldmia %[s2]!, {r2,r5} \n"
+    ADDHALFXREGS(r0, r4, r2)
+    ADDHALFXREGS(r1, r2, r5)
+    "stmia %[v1]!, {r0,r1} \n"
+    "ldmia %[v1], {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+    "ldmia %[f2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+    "ldmia %[s2]!, {r2,r4} \n"
+    ADDHALFXREGS(r0, r5, r2)
+    ADDHALFXREGS(r1, r2, r4)
+    "stmia %[v1]!, {r0,r1} \n"
+#if ORDER > 16
+    "subs %[cnt], %[cnt], #1 \n"
+    "bne 1b \n"
+#endif
+    "b 99f \n"
+
+    "20: \n"
+    "1: \n"
+    "ldmia %[v1], {r1,r2} \n"
+    "ldmia %[f2]!, {r3,r4} \n"
+#if ORDER > 16
+    "smlabb %[res], r1, r3, %[res] \n"
+#else
+    "smulbb %[res], r1, r3 \n"
+#endif
+    "smlatt %[res], r1, r3, %[res] \n"
+    "smlabb %[res], r2, r4, %[res] \n"
+    "smlatt %[res], r2, r4, %[res] \n"
+    "ldmia %[s2]!, {r3,r4} \n"
+    ADDHALFREGS(r0, r1, r3)
+    ADDHALFREGS(r1, r2, r4)
+    "stmia %[v1]!, {r0,r1} \n"
+
+    ".rept 3 \n"
+    "ldmia %[v1], {r1,r2} \n"
+    "ldmia %[f2]!, {r3,r4} \n"
+    "smlabb %[res], r1, r3, %[res] \n"
+    "smlatt %[res], r1, r3, %[res] \n"
+    "smlabb %[res], r2, r4, %[res] \n"
+    "smlatt %[res], r2, r4, %[res] \n"
+    "ldmia %[s2]!, {r3,r4} \n"
+    ADDHALFREGS(r0, r1, r3)
+    ADDHALFREGS(r1, r2, r4)
+    "stmia %[v1]!, {r0,r1} \n"
+    ".endr \n"
+#if ORDER > 16
+    "subs %[cnt], %[cnt], #1 \n"
+    "bne 1b \n"
 #endif
 
-    "99: \n"
+    "99: \n"
     : /* outputs */
 #if ORDER > 16
     [cnt]"+r"(cnt),
 #endif
     [v1] "+r"(v1),
-    [v2] "+r"(v2)
+    [f2] "+r"(f2),
+    [s2] "+r"(s2),
+    [res]"=r"(res)
     : /* inputs */
     : /* clobbers */
-    "r0", "r1", "r2", "r3", "r4",
-    "r5", "r6", "r7", "r8", "memory"
+    "r0", "r1", "r2", "r3", "r4", "r5", "memory"
     );
+    return res;
 }
 
-/* This version fetches data as 32 bit words, and *requires* v1 to be
- * 32 bit aligned, otherwise it will result either in a data abort, or
- * incorrect results (if ARM aligncheck is disabled). */
-static inline void vector_sub(int16_t* v1, int16_t* v2)
+/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
+ * This version fetches data as 32 bit words, and *requires* v1 to be
+ * 32 bit aligned. It also requires that f2 and s2 are either both 32 bit
+ * aligned or both unaligned. If either condition isn't met, it will either
+ * result in a data abort or incorrect results. */
+static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
 {
+    int res;
 #if ORDER > 16
     int cnt = ORDER>>4;
 #endif
 
-#define SUBHALFREGS(dif, s1)                          /* Subtracts register */ \
-    "sub r8 , " #dif ", " #s1 "\n"                    /* halves straight. */ \
-    "and r8 , r8 , r9 \n"                             /* Needs r9 = 0x0000ffff, */ \
-    "mov " #dif ", " #dif ", lsr #16 \n"              /* clobbers r8. */ \
-    "sub " #dif ", " #dif ", " #s1 ", lsr #16 \n" \
-    "orr " #dif ", r8 , " #dif ", lsl #16 \n"
+#define SUBHALFREGS(dif, s1, s2)                      /* Subtracts reg. */ \
+    "mov " #s1 ", " #s1 ", ror #16 \n"                /* halves straight */ \
+    "sub " #dif ", " #s1 ", " #s2 ", lsl #16 \n"      /* Clobbers 's1' */ \
+    "sub " #s1 ", " #s1 ", " #s2 ", lsr #16 \n" \
+    "mov " #s1 ", " #s1 ", lsl #16 \n" \
+    "orr " #dif ", " #s1 ", " #dif ", lsr #16 \n"
+
+#define SUBHALFXREGS(dif, s1, s2, msk)                /* Subtracts reg. */ \
+    "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n"      /* halves across. */ \
+    "and " #s1 ", " #s1 ", " #msk " \n"               /* Needs msk = */ \
+    "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n"     /* 0x0000ffff, */ \
+    "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n"     /* clobbers 's1'. */
 
-#define SUBHALFXREGS(dif, s1, s2)                     /* Subtracts register */ \
-    "sub " #s1 ", " #dif ", " #s1 ", lsr #16 \n"      /* halves across. */ \
-    "and " #s1 ", " #s1 ", r9 \n"                     /* Needs r9 = 0x0000ffff, */ \
-    "rsb " #dif ", " #s2 ", " #dif ", lsr #16 \n"     /* clobbers 's1'. */ \
-    "orr " #dif ", " #s1 ", " #dif ", lsl #16 \n"
 
     asm volatile (
-    "mov r9, #0xff \n"
-    "orr r9, r9, #0xff00 \n"
-    "tst %[v2], #2 \n"
-    "beq 20f \n"
-
-    "10: \n"
-    "ldrh r4, [%[v2]], #2 \n"
-    "mov r4, r4, lsl #16 \n"
-    "1: \n"
-    "ldmia %[v1], {r0-r3} \n"
-    "ldmia %[v2]!, {r5-r8} \n"
-    SUBHALFXREGS(r0, r4, r5)
-    SUBHALFXREGS(r1, r5, r6)
-    SUBHALFXREGS(r2, r6, r7)
-    SUBHALFXREGS(r3, r7, r8)
-    "stmia %[v1]!, {r0-r3} \n"
-    "mov r4, r8 \n"
-    "ldmia %[v1], {r0-r3} \n"
-    "ldmia %[v2]!, {r5-r8} \n"
-    SUBHALFXREGS(r0, r4, r5)
-    SUBHALFXREGS(r1, r5, r6)
-    SUBHALFXREGS(r2, r6, r7)
-    SUBHALFXREGS(r3, r7, r8)
-    "stmia %[v1]!, {r0-r3} \n"
 #if ORDER > 16
-    "mov r4, r8 \n"
-    "subs %[cnt], %[cnt], #1 \n"
-    "bne 1b \n"
+    "mov %[res], #0 \n"
 #endif
-    "b 99f \n"
+    "tst %[f2], #2 \n"
+    "beq 20f \n"
 
-    "20: \n"
-    "1: \n"
-    "ldmia %[v1], {r0-r3} \n"
-    "ldmia %[v2]!, {r4-r7} \n"
-    SUBHALFREGS(r0, r4)
-    SUBHALFREGS(r1, r5)
-    SUBHALFREGS(r2, r6)
-    SUBHALFREGS(r3, r7)
-    "stmia %[v1]!, {r0-r3} \n"
-    "ldmia %[v1], {r0-r3} \n"
-    "ldmia %[v2]!, {r4-r7} \n"
-    SUBHALFREGS(r0, r4)
-    SUBHALFREGS(r1, r5)
-    SUBHALFREGS(r2, r6)
-    SUBHALFREGS(r3, r7)
-    "stmia %[v1]!, {r0-r3} \n"
+    "10: \n"
+    "mov r6, #0xff \n"
+    "orr r6, r6, #0xff00 \n"
+    "ldrh r4, [%[s2]], #2 \n"
+    "mov r4, r4, lsl #16 \n"
+    "ldrh r3, [%[f2]], #2 \n"
 #if ORDER > 16
-    "subs %[cnt], %[cnt], #1 \n"
-    "bne 1b \n"
+    "mov r3, r3, lsl #16 \n"
+    "1: \n"
+    "ldmia %[v1], {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+#else
+    "ldmia %[v1], {r0,r1} \n"
+    "smulbb %[res], r0, r3 \n"
+#endif
+    "ldmia %[f2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+    "ldmia %[s2]!, {r2,r5} \n"
+    SUBHALFXREGS(r0, r4, r2, r6)
+    SUBHALFXREGS(r1, r2, r5, r6)
+    "stmia %[v1]!, {r0,r1} \n"
+    "ldmia %[v1], {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+    "ldmia %[f2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+    "ldmia %[s2]!, {r2,r4} \n"
+    SUBHALFXREGS(r0, r5, r2, r6)
+    SUBHALFXREGS(r1, r2, r4, r6)
+    "stmia %[v1]!, {r0,r1} \n"
+
+    "ldmia %[v1], {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+    "ldmia %[f2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+    "ldmia %[s2]!, {r2,r5} \n"
+    SUBHALFXREGS(r0, r4, r2, r6)
+    SUBHALFXREGS(r1, r2, r5, r6)
+    "stmia %[v1]!, {r0,r1} \n"
+    "ldmia %[v1], {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+    "ldmia %[f2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+    "ldmia %[s2]!, {r2,r4} \n"
+    SUBHALFXREGS(r0, r5, r2, r6)
+    SUBHALFXREGS(r1, r2, r4, r6)
+    "stmia %[v1]!, {r0,r1} \n"
+#if ORDER > 16
+    "subs %[cnt], %[cnt], #1 \n"
+    "bne 1b \n"
+#endif
+    "b 99f \n"
+
+    "20: \n"
+    "1: \n"
+    "ldmia %[v1], {r1,r2} \n"
+    "ldmia %[f2]!, {r3,r4} \n"
+#if ORDER > 16
+    "smlabb %[res], r1, r3, %[res] \n"
+#else
+    "smulbb %[res], r1, r3 \n"
+#endif
+    "smlatt %[res], r1, r3, %[res] \n"
+    "smlabb %[res], r2, r4, %[res] \n"
+    "smlatt %[res], r2, r4, %[res] \n"
+    "ldmia %[s2]!, {r3,r4} \n"
+    SUBHALFREGS(r0, r1, r3)
+    SUBHALFREGS(r1, r2, r4)
+    "stmia %[v1]!, {r0,r1} \n"
+
+    ".rept 3 \n"
+    "ldmia %[v1], {r1,r2} \n"
+    "ldmia %[f2]!, {r3,r4} \n"
+    "smlabb %[res], r1, r3, %[res] \n"
+    "smlatt %[res], r1, r3, %[res] \n"
+    "smlabb %[res], r2, r4, %[res] \n"
+    "smlatt %[res], r2, r4, %[res] \n"
+    "ldmia %[s2]!, {r3,r4} \n"
+    SUBHALFREGS(r0, r1, r3)
+    SUBHALFREGS(r1, r2, r4)
+    "stmia %[v1]!, {r0,r1} \n"
+    ".endr \n"
+#if ORDER > 16
+    "subs %[cnt], %[cnt], #1 \n"
+    "bne 1b \n"
 #endif
 
-    "99: \n"
+    "99: \n"
     : /* outputs */
 #if ORDER > 16
     [cnt]"+r"(cnt),
 #endif
     [v1] "+r"(v1),
-    [v2] "+r"(v2)
+    [f2] "+r"(f2),
+    [s2] "+r"(s2),
+    [res]"=r"(res)
     : /* inputs */
     : /* clobbers */
-    "r0", "r1", "r2", "r3", "r4", "r5",
-    "r6", "r7", "r8", "r9", "memory"
+    "r0", "r1", "r2", "r3", "r4", "r5", "r6", "memory"
    );
+    return res;
 }
 
 /* This version fetches data as 32 bit words, and *requires* v1 to be
@@ -211,9 +319,9 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 #endif
 
 #if ORDER > 16
-#define MLA_BLOCKS "3"
+#define MLA_BLOCKS "7"
 #else
-#define MLA_BLOCKS "1"
+#define MLA_BLOCKS "3"
 #endif
 
     asm volatile (
@@ -224,36 +332,28 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
     "beq 20f \n"
 
    "10: \n"
-    "ldrh r7, [%[v2]], #2 \n"
+    "ldrh r3, [%[v2]], #2 \n"
 #if ORDER > 32
-    "mov r7, r7, lsl #16 \n"
+    "mov r3, r3, lsl #16 \n"
     "1: \n"
-    "ldmia %[v1]!, {r0-r3} \n"
-    "smlabt %[res], r0, r7, %[res] \n"
+    "ldmia %[v1]!, {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
 #else
-    "ldmia %[v1]!, {r0-r3} \n"
-    "smulbb %[res], r0, r7 \n"
+    "ldmia %[v1]!, {r0,r1} \n"
+    "smulbb %[res], r0, r3 \n"
 #endif
-    "ldmia %[v2]!, {r4-r7} \n"
-    "smlatb %[res], r0, r4, %[res] \n"
-    "smlabt %[res], r1, r4, %[res] \n"
-    "smlatb %[res], r1, r5, %[res] \n"
-    "smlabt %[res], r2, r5, %[res] \n"
-    "smlatb %[res], r2, r6, %[res] \n"
-    "smlabt %[res], r3, r6, %[res] \n"
-    "smlatb %[res], r3, r7, %[res] \n"
-
+    "ldmia %[v2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
+
     ".rept " MLA_BLOCKS "\n"
-    "ldmia %[v1]!, {r0-r3} \n"
-    "smlabt %[res], r0, r7, %[res] \n"
-    "ldmia %[v2]!, {r4-r7} \n"
-    "smlatb %[res], r0, r4, %[res] \n"
-    "smlabt %[res], r1, r4, %[res] \n"
-    "smlatb %[res], r1, r5, %[res] \n"
-    "smlabt %[res], r2, r5, %[res] \n"
-    "smlatb %[res], r2, r6, %[res] \n"
-    "smlabt %[res], r3, r6, %[res] \n"
-    "smlatb %[res], r3, r7, %[res] \n"
+    "ldmia %[v1]!, {r0,r1} \n"
+    "smlabt %[res], r0, r3, %[res] \n"
+    "ldmia %[v2]!, {r2,r3} \n"
+    "smlatb %[res], r0, r2, %[res] \n"
+    "smlabt %[res], r1, r2, %[res] \n"
+    "smlatb %[res], r1, r3, %[res] \n"
     ".endr \n"
 #if ORDER > 32
     "subs %[cnt], %[cnt], #1 \n"
@@ -263,32 +363,24 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 
     "20: \n"
     "1: \n"
-    "ldmia %[v1]!, {r0-r3} \n"
-    "ldmia %[v2]!, {r4-r7} \n"
+    "ldmia %[v1]!, {r0,r1} \n"
+    "ldmia %[v2]!, {r2,r3} \n"
 #if ORDER > 32
-    "smlabb %[res], r0, r4, %[res] \n"
+    "smlabb %[res], r0, r2, %[res] \n"
 #else
-    "smulbb %[res], r0, r4 \n"
+    "smulbb %[res], r0, r2 \n"
 #endif
-    "smlatt %[res], r0, r4, %[res] \n"
-    "smlabb %[res], r1, r5, %[res] \n"
-    "smlatt %[res], r1, r5, %[res] \n"
-    "smlabb %[res], r2, r6, %[res] \n"
-    "smlatt %[res], r2, r6, %[res] \n"
-    "smlabb %[res], r3, r7, %[res] \n"
-    "smlatt %[res], r3, r7, %[res] \n"
+    "smlatt %[res], r0, r2, %[res] \n"
+    "smlabb %[res], r1, r3, %[res] \n"
+    "smlatt %[res], r1, r3, %[res] \n"
 
     ".rept " MLA_BLOCKS "\n"
-    "ldmia %[v1]!, {r0-r3} \n"
-    "ldmia %[v2]!, {r4-r7} \n"
-    "smlabb %[res], r0, r4, %[res] \n"
-    "smlatt %[res], r0, r4, %[res] \n"
-    "smlabb %[res], r1, r5, %[res] \n"
-    "smlatt %[res], r1, r5, %[res] \n"
-    "smlabb %[res], r2, r6, %[res] \n"
-    "smlatt %[res], r2, r6, %[res] \n"
-    "smlabb %[res], r3, r7, %[res] \n"
-    "smlatt %[res], r3, r7, %[res] \n"
+    "ldmia %[v1]!, {r0,r1} \n"
+    "ldmia %[v2]!, {r2,r3} \n"
+    "smlabb %[res], r0, r2, %[res] \n"
+    "smlatt %[res], r0, r2, %[res] \n"
+    "smlabb %[res], r1, r3, %[res] \n"
+    "smlatt %[res], r1, r3, %[res] \n"
     ".endr \n"
 #if ORDER > 32
     "subs %[cnt], %[cnt], #1 \n"
@@ -305,8 +397,7 @@ static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
     [res]"=r"(res)
     : /* inputs */
     : /* clobbers */
-    "r0", "r1", "r2", "r3",
-    "r4", "r5", "r6", "r7"
+    "r0", "r1", "r2", "r3"
     );
     return res;
 }
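
For reference, the following plain C sketch shows the semantics that the fused
routines above implement (a sketch only, assuming ORDER is the filter order as
used elsewhere in libdemac; the *_ref names are illustrative and not part of
the patch). Fusing the scalar product with the vector update lets the assembly
load each v1 word once and use it for both operations, instead of walking v1
twice:

    static inline int32_t vector_sp_add_ref(int16_t* v1, int16_t* f2, int16_t* s2)
    {
        int32_t res = 0;
        int i;
        for (i = 0; i < ORDER; i++) {
            res += (int32_t)v1[i] * f2[i]; /* scalar product of v1 and f2 */
            v1[i] += s2[i];                /* fused: add s2 into v1       */
        }
        return res;
    }

    static inline int32_t vector_sp_sub_ref(int16_t* v1, int16_t* f2, int16_t* s2)
    {
        int32_t res = 0;
        int i;
        for (i = 0; i < ORDER; i++) {
            res += (int32_t)v1[i] * f2[i]; /* scalar product of v1 and f2 */
            v1[i] -= s2[i];                /* fused: subtract s2 from v1  */
        }
        return res;
    }

Note that the product is taken against the old v1 values, before the second
vector is applied; this matches the assembly, which multiplies the loaded v1
words with f2 before combining them with s2 via ADDHALF*/SUBHALF* and storing
the result back to v1.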