Use a hand-written constant table on ARMv5+ for the JPEG IDCT, and load four 16-bit constants at a time with ldrd. This is not useful on ARMv4, since one load per constant would still be needed, and the limited range of ldrsh would force multiple copies of the table.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21535 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andrew Mahone 2009-06-28 02:32:43 +00:00
parent 99ae7bcc43
commit 815dcfdd35

View file

@ -113,7 +113,11 @@ jpeg_idct2h:
results can not be stored merged.
*/
stmdb sp!, { r4-r5, lr }
#if ARM_ARCH < 5
ldr r14, =4112
#else
ldrsh r14, .Lpool4+2
#endif
1:
ldrsh r12, [r0]
ldrsh r4, [r0, #2]
@ -140,7 +144,7 @@ jpeg_idct2h:
ldmia sp!, { r4-r5, pc }
#else
stmdb sp!, { r4, lr }
ldr r14, =4112
ldrsh r14, .Lpool4+2
1:
ldr r12, [r0]
sadd16 r12, r12, r14
@ -198,27 +202,26 @@ jpeg_idct4v:
ldmia sp!, { r4-r7, pc }
#elif ARM_ARCH < 6
stmdb sp!, { r4-r8, lr }
ldr r8, =1024
ldr r14, =4433
ldr r12, =3302955134
mov r8, #1024
ldrd r4, .Lpool4
1:
ldrsh r5, [r0, #48]
ldrsh r14, [r0, #48]
ldrsh r3, [r0, #16]
ldrsh r4, [r0, #32]
ldrsh r12, [r0, #32]
ldrsh r2, [r0]
add r6, r3, r5 /* r6 = z1 = d1 + d3 */
add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
smlabb r6, r14, r6, r8 /* z1 *= 4433 */
sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */
smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
add r6, r3, r14 /* r6 = z1 = d1 + d3 */
add r7, r2, r12 /* r7 = tmp10 >> 2 = d0 + d2 */
smlabb r6, r5, r6, r8 /* z1 *= 4433 */
sub r2, r2, r12 /* r2 = tmp12 >> 2= d0 - d2 */
smlatb r3, r5, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
smlabb r14, r4, r14, r6 /* r14 = tmp0 = z1 - z3 * 15137 */
mov r7, r7, lsl #2
mov r2, r2, lsl #2
add r4, r7, r3, asr #11 /* r4 = o0 */
sub r7, r7, r3, asr #11 /* r7 = o3 */
add r3, r2, r5, asr #11 /* r3 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */
strh r4, [r0]
add r12, r7, r3, asr #11 /* r12 = o0 */
sub r7, r7, r3, asr #11 /* r7 = o3 */
add r3, r2, r14, asr #11 /* r3 = o1 */
sub r2, r2, r14, asr #11 /* r2 = o2 */
strh r12, [r0]
strh r7, [r0, #48]
strh r3, [r0, #16]
strh r2, [r0, #32]
@ -228,9 +231,8 @@ jpeg_idct4v:
ldmia sp!, { r4-r8, pc }
#else
stmdb sp!, { r4-r10, lr }
ldr r2, =1024
ldr r3, =4433
ldr r12, =3302955134
ldrd r2, .Lpool4
mov r12, #1024
1:
ldr r6, [r0, #32]
ldr r4, [r0]
@ -247,12 +249,12 @@ jpeg_idct4v:
/* multiplication expands values beyond 16 bits, so this part needs to be
split. the values will be merged below so that the rest of the addition
can be done in parallel */
smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 * d3) * 4433 + 1024 */
smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 * d3) * 4433 + 1024 */
smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
smlabb r9, r3, r6, r12 /* r9 = z1[0] = (d1 * d3) * 4433 + 1024 */
smlabt r6, r3, r6, r12 /* r6 = z1[1] = (d1 * d3) * 4433 + 1024 */
smlatb r10, r3, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
smlabb r14, r2, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
smlatt r5, r3, r5, r6 /* r5 = tmp2[1] */
smlabt r6, r2, r7, r6 /* r6 = tmp0[1] */
mov r8, r8, lsl #2 /* complete the parallel shift started */
mov r4, r4, lsl #2 /* with the earlier bic instructions */
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
@ -276,6 +278,17 @@ jpeg_idct4v:
#endif
.size jpeg_idct4v, .-jpeg_idct4v
#if ARM_ARCH > 4
/* Shared table of signed 16-bit IDCT constants, only assembled when
   ARM_ARCH > 4. The code in this file fetches it either as a pair of words
   with "ldrd rN, .Lpool4" or as a single halfword with
   "ldrsh r14, .Lpool4+2".

   The order of the entries is significant. After an ldrd the users expect:
     rN   bottom half = -15137, top half = 4112
     rN+1 bottom half =  4433,  top half = 6270
   and select the wanted half with the b/t variants of smul/smla or with
   "lsr #16". Do not reorder or insert entries without updating every user. */
/* NOTE(review): on ARM gas, .align takes a power-of-two exponent, so this
   should request a 16-byte boundary; ldrd needs the table to be at least
   doubleword aligned - confirm against the assembler in use. */
.align 4
.Lpool4:
.short -15137 /* +0: multiplier in "tmp0 = z1 - z3 * 15137" */
.short 4112 /* +2: bias added to d0 in jpeg_idct4h; read alone via .Lpool4+2 */
.short 4433 /* +4: multiplier in "z1 *= 4433" */
.short 6270 /* +6: multiplier in "tmp2 = z1 + z2 * 6270" */
/* Presumably keeps the code that follows the table word aligned. */
.align 2
#endif
jpeg_idct4h:
#if ARM_ARCH < 5
stmdb sp!, { r4-r10, lr }
@ -328,88 +341,85 @@ jpeg_idct4h:
cmp r0, r2
bcc 1b
ldmia sp!, { r4-r10, pc }
#elif ARM_ARCH < 6
stmdb sp!, { r4-r10, lr }
ldr r10, =4433
ldr r14, =4112
ldr r12, =3302955134
#elif ARM_ARCH < 6 || 1
stmdb sp!, { r4-r9, lr }
ldrd r4, .Lpool4
1:
ldrsh r7, [r0, #6]
ldrsh r5, [r0, #2]
ldrsh r4, [r0]
ldrsh r14, [r0, #2]
ldrsh r12, [r0]
ldrsh r6, [r0, #4]
add r8, r5, r7 /* r8 = z1 = d1 + d3 */
add r4, r4, r14
smulbb r8, r10, r8 /* z1 *= 4433 */
add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */
smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
add r6, r5, r9, lsl #13 /* r6 = o0 */
rsb r9, r5, r9, lsl #13 /* r9 = o3 */
add r5, r7, r4, lsl #13 /* r5 = o1 */
rsb r4, r7, r4, lsl #13 /* r4 = o2 */
add r8, r14, r7 /* r8 = z1 = d1 + d3 */
add r12, r12, r4, lsr #16
smulbb r8, r5, r8 /* z1 *= 4433 */
add r9, r12, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
smlatb r14, r5, r14, r8 /* r14= tmp2 = z1 + z2 * 6270 */
smlabb r7, r4, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
sub r12, r12, r6 /* r12= tmp12 >> 13 = d0 - d2 */
add r6, r14, r9, lsl #13 /* r6 = o0 */
rsb r9, r14, r9, lsl #13 /* r9 = o3 */
add r14, r7, r12, lsl #13 /* r14= o1 */
rsb r12, r7, r12, lsl #13 /* r12= o2 */
mov r6, r6, asr #18
mov r5, r5, asr #18
mov r4, r4, asr #18
mov r14, r14, asr #18
mov r12, r12, asr #18
mov r9, r9, asr #18
cmp r6, #255
mvnhi r6, r6, asr #31
cmp r5, #255
mvnhi r5, r5, asr #31
cmp r4, #255
mvnhi r4, r4, asr #31
cmp r14, #255
mvnhi r14, r14, asr #31
cmp r12, #255
mvnhi r12, r12, asr #31
cmp r9, #255
mvnhi r9, r9, asr #31
#ifdef HAVE_LCD_COLOR
strb r6, [r1]
strb r5, [r1, #4]
strb r4, [r1, #8]
strb r14, [r1, #4]
strb r12, [r1, #8]
strb r9, [r1, #12]
#else
strb r6, [r1]
strb r5, [r1, #1]
strb r4, [r1, #2]
strb r14, [r1, #1]
strb r12, [r1, #2]
strb r9, [r1, #3]
#endif
add r0, r0, #16
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmia sp!, { r4-r10, pc }
ldmia sp!, { r4-r9, pc }
#else
stmdb sp!, { r4-r9, lr }
ldr r9, =4433
ldr r14, =4112
ldr r12, =3302955134
ldrd r4, .Lpool4
mov r9, r4, lsr #16
1:
ldmia r0, { r4-r5 }
sadd16 r4, r4, r14
sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */
ssub16 r7, r4, r5 /* r7lo = d0 - d2 */
smulbt r8, r9, r6
ldmia r0, { r12, r14 }
sadd16 r12, r12, r9
sadd16 r6, r12, r14 /* r6lo = d0 + d2, r6hi = d1 + d3 */
ssub16 r7, r12, r14 /* r7lo = d0 - d2 */
smulbt r8, r5, r6
sxth r6, r6
smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */
smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */
smlatt r12, r5, r12, r8 /* r12= tmp2 = z1 + z2 * 6270 */
smlabt r14, r4, r14, r8 /* r14= tmp0 = z1 - z3 * 15137 */
sxth r7, r7
add r8, r4, r6, lsl #13 /* r8 = o0 */
rsb r6, r4, r6, lsl #13 /* r6 = o3 */
add r4, r5, r7, lsl #13 /* r4 = o1 */
rsb r5, r5, r7, lsl #13 /* r5 = o2 */
add r8, r12, r6, lsl #13 /* r8 = o0 */
rsb r6, r12, r6, lsl #13 /* r6 = o3 */
add r12, r14, r7, lsl #13 /* r12= o1 */
rsb r14, r14, r7, lsl #13 /* r14= o2 */
usat r8, #8, r8, asr #18
usat r6, #8, r6, asr #18
usat r4, #8, r4, asr #18
usat r5, #8, r5, asr #18
usat r12, #8, r12, asr #18
usat r14, #8, r14, asr #18
#ifdef HAVE_LCD_COLOR
strb r8, [r1]
strb r6, [r1, #12]
strb r4, [r1, #4]
strb r5, [r1, #8]
strb r12, [r1, #4]
strb r14, [r1, #8]
#else
strb r8, [r1]
strb r6, [r1, #3]
strb r4, [r1, #1]
strb r5, [r1, #2]
strb r12, [r1, #1]
strb r14, [r1, #2]
#endif
add r0, r0, #16
add r1, r1, r3
@ -450,7 +460,7 @@ jpeg_idct8v:
mov r11, r11, asr #16 /* r11 = z3 = d6 */
add r8, r8, #8192
add r9, r10, r11
mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */
mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */
mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
ldr r14, =6270
mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */