Use a hand-written constant table on ARMv5+ for the JPEG IDCT, and load four 16-bit constants at a time with ldrd. This is not useful on ARMv4, since one load per constant would still be needed, and the limited range of ldrsh would force multiple copies of the table.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21535 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
99ae7bcc43
commit
815dcfdd35
1 changed files with 86 additions and 76 deletions
|
@ -113,7 +113,11 @@ jpeg_idct2h:
|
|||
results can not be stored merged.
|
||||
*/
|
||||
stmdb sp!, { r4-r5, lr }
|
||||
#if ARM_ARCH < 5
|
||||
ldr r14, =4112
|
||||
#else
|
||||
ldrsh r14, .Lpool4+2
|
||||
#endif
|
||||
1:
|
||||
ldrsh r12, [r0]
|
||||
ldrsh r4, [r0, #2]
|
||||
|
@ -140,7 +144,7 @@ jpeg_idct2h:
|
|||
ldmia sp!, { r4-r5, pc }
|
||||
#else
|
||||
stmdb sp!, { r4, lr }
|
||||
ldr r14, =4112
|
||||
ldrsh r14, .Lpool4+2
|
||||
1:
|
||||
ldr r12, [r0]
|
||||
sadd16 r12, r12, r14
|
||||
|
@ -198,27 +202,26 @@ jpeg_idct4v:
|
|||
ldmia sp!, { r4-r7, pc }
|
||||
#elif ARM_ARCH < 6
|
||||
stmdb sp!, { r4-r8, lr }
|
||||
ldr r8, =1024
|
||||
ldr r14, =4433
|
||||
ldr r12, =3302955134
|
||||
mov r8, #1024
|
||||
ldrd r4, .Lpool4
|
||||
1:
|
||||
ldrsh r5, [r0, #48]
|
||||
ldrsh r14, [r0, #48]
|
||||
ldrsh r3, [r0, #16]
|
||||
ldrsh r4, [r0, #32]
|
||||
ldrsh r12, [r0, #32]
|
||||
ldrsh r2, [r0]
|
||||
add r6, r3, r5 /* r6 = z1 = d1 + d3 */
|
||||
add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
|
||||
smlabb r6, r14, r6, r8 /* z1 *= 4433 */
|
||||
sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */
|
||||
smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
|
||||
smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
|
||||
add r6, r3, r14 /* r6 = z1 = d1 + d3 */
|
||||
add r7, r2, r12 /* r7 = tmp10 >> 2 = d0 + d2 */
|
||||
smlabb r6, r5, r6, r8 /* z1 *= 4433 */
|
||||
sub r2, r2, r12 /* r2 = tmp12 >> 2= d0 - d2 */
|
||||
smlatb r3, r5, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
|
||||
smlabb r14, r4, r14, r6 /* r14 = tmp0 = z1 - z3 * 15137 */
|
||||
mov r7, r7, lsl #2
|
||||
mov r2, r2, lsl #2
|
||||
add r4, r7, r3, asr #11 /* r4 = o0 */
|
||||
sub r7, r7, r3, asr #11 /* r7 = o3 */
|
||||
add r3, r2, r5, asr #11 /* r3 = o1 */
|
||||
sub r2, r2, r5, asr #11 /* r2 = o2 */
|
||||
strh r4, [r0]
|
||||
add r12, r7, r3, asr #11 /* r12 = o0 */
|
||||
sub r7, r7, r3, asr #11 /* r7 = o3 */
|
||||
add r3, r2, r14, asr #11 /* r3 = o1 */
|
||||
sub r2, r2, r14, asr #11 /* r2 = o2 */
|
||||
strh r12, [r0]
|
||||
strh r7, [r0, #48]
|
||||
strh r3, [r0, #16]
|
||||
strh r2, [r0, #32]
|
||||
|
@ -228,9 +231,8 @@ jpeg_idct4v:
|
|||
ldmia sp!, { r4-r8, pc }
|
||||
#else
|
||||
stmdb sp!, { r4-r10, lr }
|
||||
ldr r2, =1024
|
||||
ldr r3, =4433
|
||||
ldr r12, =3302955134
|
||||
ldrd r2, .Lpool4
|
||||
mov r12, #1024
|
||||
1:
|
||||
ldr r6, [r0, #32]
|
||||
ldr r4, [r0]
|
||||
|
@ -247,12 +249,12 @@ jpeg_idct4v:
|
|||
/* multiplication expands values beyond 16 bits, so this part needs to be
|
||||
split. the values will be merged below so that the rest of the addition
|
||||
can be done in parallel */
|
||||
smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 * d3) * 4433 + 1024 */
|
||||
smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 * d3) * 4433 + 1024 */
|
||||
smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
|
||||
smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
|
||||
smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
|
||||
smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
|
||||
smlabb r9, r3, r6, r12 /* r9 = z1[0] = (d1 * d3) * 4433 + 1024 */
|
||||
smlabt r6, r3, r6, r12 /* r6 = z1[1] = (d1 * d3) * 4433 + 1024 */
|
||||
smlatb r10, r3, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
|
||||
smlabb r14, r2, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
|
||||
smlatt r5, r3, r5, r6 /* r5 = tmp2[1] */
|
||||
smlabt r6, r2, r7, r6 /* r6 = tmp0[1] */
|
||||
mov r8, r8, lsl #2 /* complete the parallel shift started */
|
||||
mov r4, r4, lsl #2 /* with the earlier bic instructions */
|
||||
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
|
||||
|
@ -276,6 +278,17 @@ jpeg_idct4v:
|
|||
#endif
|
||||
.size jpeg_idct4v, .-jpeg_idct4v
|
||||
|
||||
#if ARM_ARCH > 4
/* Shared table of 16-bit IDCT constants for the ARMv5+ code paths.
   The users visible in this file fetch it either as a register pair with
   "ldrd rN, .Lpool4" (rN = first two entries, rN+1 = last two entries) or
   as a single halfword with "ldrsh r14, .Lpool4+2".  The entry order is
   therefore part of the contract with those loads and must not change. */
|
||||
.align 4 /* NOTE(review): GNU as on ARM takes .align as a power of two, so this should give 16-byte alignment (ldrd needs a doubleword-aligned address) - confirm */
|
||||
.Lpool4:
|
||||
.short -15137 /* offset 0: z3 multiplier (tmp0 = z1 - z3 * 15137); read from the bottom half of the first ldrd register */
|
||||
.short 4112 /* offset 2: bias added to d0 by the horizontal passes; read via "lsr #16" of the first register or "ldrsh .Lpool4+2"; presumably 128 * 32 + 16 (level shift plus rounding) - TODO confirm */
|
||||
.short 4433 /* offset 4: z1 multiplier (z1 = (d1 + d3) * 4433); bottom half of the second ldrd register */
|
||||
.short 6270 /* offset 6: z2 multiplier (tmp2 = z1 + z2 * 6270); top half of the second ldrd register */
|
||||
|
||||
.align 2 /* NOTE(review): restores instruction alignment for the code that follows (4 bytes if .align is a power of two here) - confirm */
|
||||
#endif
|
||||
|
||||
jpeg_idct4h:
|
||||
#if ARM_ARCH < 5
|
||||
stmdb sp!, { r4-r10, lr }
|
||||
|
@ -328,88 +341,85 @@ jpeg_idct4h:
|
|||
cmp r0, r2
|
||||
bcc 1b
|
||||
ldmia sp!, { r4-r10, pc }
|
||||
#elif ARM_ARCH < 6
|
||||
stmdb sp!, { r4-r10, lr }
|
||||
ldr r10, =4433
|
||||
ldr r14, =4112
|
||||
ldr r12, =3302955134
|
||||
#elif ARM_ARCH < 6 || 1
|
||||
stmdb sp!, { r4-r9, lr }
|
||||
ldrd r4, .Lpool4
|
||||
1:
|
||||
ldrsh r7, [r0, #6]
|
||||
ldrsh r5, [r0, #2]
|
||||
ldrsh r4, [r0]
|
||||
ldrsh r14, [r0, #2]
|
||||
ldrsh r12, [r0]
|
||||
ldrsh r6, [r0, #4]
|
||||
add r8, r5, r7 /* r8 = z1 = d1 + d3 */
|
||||
add r4, r4, r14
|
||||
smulbb r8, r10, r8 /* z1 *= 4433 */
|
||||
add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
|
||||
smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */
|
||||
smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
|
||||
sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
|
||||
add r6, r5, r9, lsl #13 /* r6 = o0 */
|
||||
rsb r9, r5, r9, lsl #13 /* r9 = o3 */
|
||||
add r5, r7, r4, lsl #13 /* r5 = o1 */
|
||||
rsb r4, r7, r4, lsl #13 /* r4 = o2 */
|
||||
add r8, r14, r7 /* r8 = z1 = d1 + d3 */
|
||||
add r12, r12, r4, lsr #16
|
||||
smulbb r8, r5, r8 /* z1 *= 4433 */
|
||||
add r9, r12, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
|
||||
smlatb r14, r5, r14, r8 /* r14= tmp2 = z1 + z2 * 6270 */
|
||||
smlabb r7, r4, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
|
||||
sub r12, r12, r6 /* r12= tmp12 >> 13 = d0 - d2 */
|
||||
add r6, r14, r9, lsl #13 /* r6 = o0 */
|
||||
rsb r9, r14, r9, lsl #13 /* r9 = o3 */
|
||||
add r14, r7, r12, lsl #13 /* r14= o1 */
|
||||
rsb r12, r7, r12, lsl #13 /* r12= o2 */
|
||||
mov r6, r6, asr #18
|
||||
mov r5, r5, asr #18
|
||||
mov r4, r4, asr #18
|
||||
mov r14, r14, asr #18
|
||||
mov r12, r12, asr #18
|
||||
mov r9, r9, asr #18
|
||||
cmp r6, #255
|
||||
mvnhi r6, r6, asr #31
|
||||
cmp r5, #255
|
||||
mvnhi r5, r5, asr #31
|
||||
cmp r4, #255
|
||||
mvnhi r4, r4, asr #31
|
||||
cmp r14, #255
|
||||
mvnhi r14, r14, asr #31
|
||||
cmp r12, #255
|
||||
mvnhi r12, r12, asr #31
|
||||
cmp r9, #255
|
||||
mvnhi r9, r9, asr #31
|
||||
#ifdef HAVE_LCD_COLOR
|
||||
strb r6, [r1]
|
||||
strb r5, [r1, #4]
|
||||
strb r4, [r1, #8]
|
||||
strb r14, [r1, #4]
|
||||
strb r12, [r1, #8]
|
||||
strb r9, [r1, #12]
|
||||
#else
|
||||
strb r6, [r1]
|
||||
strb r5, [r1, #1]
|
||||
strb r4, [r1, #2]
|
||||
strb r14, [r1, #1]
|
||||
strb r12, [r1, #2]
|
||||
strb r9, [r1, #3]
|
||||
#endif
|
||||
add r0, r0, #16
|
||||
add r1, r1, r3
|
||||
cmp r0, r2
|
||||
bcc 1b
|
||||
ldmia sp!, { r4-r10, pc }
|
||||
ldmia sp!, { r4-r9, pc }
|
||||
#else
|
||||
stmdb sp!, { r4-r9, lr }
|
||||
ldr r9, =4433
|
||||
ldr r14, =4112
|
||||
ldr r12, =3302955134
|
||||
ldrd r4, .Lpool4
|
||||
mov r9, r4, lsr #16
|
||||
1:
|
||||
ldmia r0, { r4-r5 }
|
||||
sadd16 r4, r4, r14
|
||||
sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */
|
||||
ssub16 r7, r4, r5 /* r7lo = d0 - d2 */
|
||||
smulbt r8, r9, r6
|
||||
ldmia r0, { r12, r14 }
|
||||
sadd16 r12, r12, r9
|
||||
sadd16 r6, r12, r14 /* r6lo = d0 + d2, r6hi = d1 + d3 */
|
||||
ssub16 r7, r12, r14 /* r7lo = d0 - d2 */
|
||||
smulbt r8, r5, r6
|
||||
sxth r6, r6
|
||||
smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */
|
||||
smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */
|
||||
smlatt r12, r5, r12, r8 /* r12= tmp2 = z1 + z2 * 6270 */
|
||||
smlabt r14, r4, r14, r8 /* r14= tmp0 = z1 - z3 * 15137 */
|
||||
sxth r7, r7
|
||||
add r8, r4, r6, lsl #13 /* r8 = o0 */
|
||||
rsb r6, r4, r6, lsl #13 /* r6 = o3 */
|
||||
add r4, r5, r7, lsl #13 /* r4 = o1 */
|
||||
rsb r5, r5, r7, lsl #13 /* r5 = o2 */
|
||||
add r8, r12, r6, lsl #13 /* r8 = o0 */
|
||||
rsb r6, r12, r6, lsl #13 /* r6 = o3 */
|
||||
add r12, r14, r7, lsl #13 /* r12= o1 */
|
||||
rsb r14, r14, r7, lsl #13 /* r14= o2 */
|
||||
usat r8, #8, r8, asr #18
|
||||
usat r6, #8, r6, asr #18
|
||||
usat r4, #8, r4, asr #18
|
||||
usat r5, #8, r5, asr #18
|
||||
usat r12, #8, r12, asr #18
|
||||
usat r14, #8, r14, asr #18
|
||||
#ifdef HAVE_LCD_COLOR
|
||||
strb r8, [r1]
|
||||
strb r6, [r1, #12]
|
||||
strb r4, [r1, #4]
|
||||
strb r5, [r1, #8]
|
||||
strb r12, [r1, #4]
|
||||
strb r14, [r1, #8]
|
||||
#else
|
||||
strb r8, [r1]
|
||||
strb r6, [r1, #3]
|
||||
strb r4, [r1, #1]
|
||||
strb r5, [r1, #2]
|
||||
strb r12, [r1, #1]
|
||||
strb r14, [r1, #2]
|
||||
#endif
|
||||
add r0, r0, #16
|
||||
add r1, r1, r3
|
||||
|
@ -450,7 +460,7 @@ jpeg_idct8v:
|
|||
mov r11, r11, asr #16 /* r11 = z3 = d6 */
|
||||
add r8, r8, #8192
|
||||
add r9, r10, r11
|
||||
mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */
|
||||
mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */
|
||||
mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
|
||||
ldr r14, =6270
|
||||
mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
|
||||
|
|
Loading…
Reference in a new issue