/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * JPEG assembly IDCT
 *
 * Copyright (C) 2009 Andrew Mahone
 *
 * asm versions of the C IDCT algorithms used in jpeg_load.c
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
#include "config.h"
#include "apps/core_asmdefs.h"

.section .text
.align 2
.global jpeg_idct1h
.type jpeg_idct1h, %function
.global jpeg_idct2v
.type jpeg_idct2v, %function
.global jpeg_idct2h
.type jpeg_idct2h, %function
.global jpeg_idct4v
.type jpeg_idct4v, %function
.global jpeg_idct4h
.type jpeg_idct4h, %function
.global jpeg_idct8v
.type jpeg_idct8v, %function
.global jpeg_idct8h
.type jpeg_idct8h, %function
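
/* Register usage, as inferred from the loops below: the vertical passes
   (jpeg_idct*v) take r0 = pointer into the coefficient block and r1 = end
   pointer for the loop, storing 16-bit intermediate results back into the
   block (jpeg_idct8v writes its output 128 bytes past its input); the
   horizontal passes (jpeg_idct*h) take r0 = coefficient pointer, r1 = output
   pixel pointer, r2 = end of the coefficient data and r3 = output row stride,
   storing pixels clamped to [0, 255], pix8_size bytes apart within a row. */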

jpeg_idct1h:
/* In the common case of one pass through the loop, the extra add should be
cheaper than saving registers to stack and loading the value 4112. */
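/* 4096 + 16 = (128 << 5) + (1 << 4): the +128 level shift pre-scaled for the
final asr #5, plus the rounding bias for that shift. */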
1:
ldrsh r12, [r0]
add r12, r12, #4096
add r12, r12, #16
#if ARM_ARCH < 6
mov r12, r12, asr #5
cmp r12, #255
mvnhi r12, r12, asr #31
#else
usat r12, #8, r12, asr #5
#endif
strb r12, [r1]
add r0, r0, #16
add r1, r1, r3
cmp r0, r2
bcc 1b
bx lr
.size jpeg_idct1h, .-jpeg_idct1h

jpeg_idct2v:
#if ARM_ARCH < 6
/* Use SWAR tricks to fake partitioned add and subtract. This is slightly faster
than loading two values in each register and using shifts and strh, and
requires fewer fixup operations than splitting the values, calculating, and
merging.
*/
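/* Sketch of the identity relied on below: for a 32-bit word holding two 16-bit
lanes, a lane-wise sum is
    sum = ((x & ~0x8000) + (y & ~0x8000)) ^ ((x ^ y) & 0x8000)
Bit 15 of the low lane is cleared first so its carry cannot spill into the high
lane, then the true sign bit is patched back with eor; the lane-wise difference
pre-sets bit 15 of the minuend instead, so the low lane cannot borrow from the
high lane. */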
stmdb sp!, { r4, lr }
1:
ldr r2, [r0]
ldr r3, [r0, #16]
eor r12, r2, r3
and r12, r12, #0x8000
bic r3, r3, #0x8000
bic r4, r2, #0x8000
add r4, r4, r3
eor r4, r4, r12
orr r2, r2, #0x8000
sub r2, r2, r3
eor r2, r2, r12
eor r2, r2, #0x8000
str r4, [r0]
str r2, [r0, #16]
add r0, r0, #4
cmp r0, r1
bcc 1b
ldmpc regs=r4
#else
/* ARMv6 offers partitioned adds and subtracts, used here to unroll the loop
to two columns.
*/
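/* Each 32-bit word holds one row of two adjacent columns, so sadd16/ssub16
compute both columns at once; per column the 2-point IDCT is simply
out0 = d0 + d1, out1 = d0 - d1. */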
1:
ldr r2, [r0]
ldr r3, [r0, #16]
sadd16 r12, r2, r3
ssub16 r2, r2, r3
str r12, [r0]
str r2, [r0, #16]
add r0, r0, #4
cmp r0, r1
bcc 1b
bx lr
#endif
.size jpeg_idct2v, .-jpeg_idct2v

jpeg_idct2h:
#if ARM_ARCH < 6
/* Using LDR and shifts here would cost two more ops, and is no faster as
results cannot be stored merged.
*/
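/* Per output pair (sketch): o0 = (d0 + 4112 + d1) >> 5 and
o1 = (d0 + 4112 - d1) >> 5, each clamped to [0, 255]; 4112 combines the +128
level shift (128 << 5) with the rounding bias for the shift. */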
stmdb sp!, { r4-r5, lr }
#if ARM_ARCH < 5
ldr r14, =4112
#else
ldrsh r14, .Lpool4+2
#endif
1:
ldrsh r12, [r0]
ldrsh r4, [r0, #2]
add r12, r12, r14
add r5, r12, r4
sub r4, r12, r4
mov r5, r5, asr #5
mov r4, r4, asr #5
cmp r5, #255
mvnhi r5, r5, asr #31
cmp r4, #255
mvnhi r4, r4, asr #31
strb r5, [r1]
strb r4, [r1, #pix8_size]
add r0, r0, #16
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmpc regs=r4-r5
#else
stmdb sp!, { r4, lr }
ldrsh r14, .Lpool4+2
1:
ldr r12, [r0]
sadd16 r12, r12, r14
saddsubx r12, r12, r12
usat r4, #8, r12, asr #21
sxth r12, r12
usat r12, #8, r12, asr #5
strb r4, [r1]
strb r12, [r1, #pix8_size]
add r0, r0, #16
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmia sp!, { r4, pc }
#endif
.size jpeg_idct2h, .-jpeg_idct2h

jpeg_idct4v:
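/* Per column (sketch of the math implemented below): even part
tmp10 = d0 + d2, tmp12 = d0 - d2; odd part z1 = (d1 + d3) * 4433,
tmp2 = z1 + d1 * 6270, tmp0 = z1 - d3 * 15137; outputs
o0/o3 = (tmp10 << 2) +/- (tmp2 >> 11), o1/o2 = (tmp12 << 2) +/- (tmp0 >> 11),
with a +1024 rounding bias folded into z1. */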
#if ARM_ARCH < 5
stmdb sp!, { r4-r7, lr }
ldr r14, =-15137
ldr r12, =6270
1:
ldrsh r4, [r0, #32]
ldrsh r2, [r0]
ldrsh r5, [r0, #48]
ldrsh r3, [r0, #16]
add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
add r4, r3, r5 /* r4 = z1 = d1 + d3 */
add r7, r4, r4, lsl #3
rsb r4, r4, r7, lsl #4
rsb r4, r4, r4, lsl #5 /* z1 *= 4433 */
add r4, r4, #1024
mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + z2 * 6270 */
mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - z3 * 15137 */
mov r6, r6, lsl #2 /* r6 <<= 2 */
mov r2, r2, lsl #2 /* r2 <<= 2 */
add r7, r6, r3, asr #11 /* r7 = o0 */
sub r3, r6, r3, asr #11 /* r3 = o3 */
add r6, r2, r5, asr #11 /* r6 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */
strh r7, [r0]
strh r3, [r0, #48]
strh r6, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2
cmp r0, r1
bcc 1b
ldmpc regs=r4-r7
#elif ARM_ARCH < 6
stmdb sp!, { r4-r8, lr }
mov r8, #1024
ldr r4, .Lpool4
ldr r5, .Lpool4+4
1:
ldrsh r14, [r0, #48]
ldrsh r3, [r0, #16]
ldrsh r12, [r0, #32]
ldrsh r2, [r0]
add r6, r3, r14 /* r6 = z1 = d1 + d3 */
add r7, r2, r12 /* r7 = tmp10 >> 2 = d0 + d2 */
smlabb r6, r5, r6, r8 /* z1 *= 4433 */
sub r2, r2, r12 /* r2 = tmp12 >> 2 = d0 - d2 */
smlatb r3, r5, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
smlabb r14, r4, r14, r6 /* r14 = tmp0 = z1 - z3 * 15137 */
mov r7, r7, lsl #2
mov r2, r2, lsl #2
add r12, r7, r3, asr #11 /* r12 = o0 */
sub r7, r7, r3, asr #11 /* r7 = o3 */
add r3, r2, r14, asr #11 /* r3 = o1 */
sub r2, r2, r14, asr #11 /* r2 = o2 */
strh r12, [r0]
strh r7, [r0, #48]
strh r3, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r8, pc }
#else /* ARMv6+ */
stmdb sp!, { r4-r10, lr }
ldrd r2, .Lpool4
mov r12, #1024
1:
ldr r6, [r0, #32]
ldr r4, [r0]
ldr r7, [r0, #48]
ldr r5, [r0, #16]
/* this part is being done in parallel on two columns */
sadd16 r8, r4, r6 /* r8 = d0 + d2 */
ssub16 r4, r4, r6 /* r4 = d0 - d2 */
sadd16 r6, r5, r7 /* r6 = d1 + d3 */
/* there is no parallel shift operation, but we can fake it with bic
and lsl */
bic r8, r8, #0xc000
bic r4, r4, #0xc000
/* multiplication expands values beyond 16 bits, so this part needs to be
split. the values will be merged below so that the rest of the addition
can be done in parallel */
smlabb r9, r3, r6, r12 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
smlabt r6, r3, r6, r12 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
smlatb r10, r3, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
smlabb r14, r2, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
smlatt r5, r3, r5, r6 /* r5 = tmp2[1] */
smlabt r6, r2, r7, r6 /* r6 = tmp0[1] */
mov r8, r8, lsl #2 /* complete the parallel shift started */
mov r4, r4, lsl #2 /* with the earlier bic instructions */
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
/* tmp10, tmp12 are in r8, r4 */
mov r10, r10, asr #11
mov r14, r14, asr #11
pkhbt r5, r10, r5, lsl #5 /* parallel tmp2 */
pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
sadd16 r10, r8, r5 /* d0 */
ssub16 r5, r8, r5 /* d3 */
sadd16 r14, r4, r6 /* d1 */
ssub16 r6, r4, r6 /* d2 */
str r10, [r0]
str r5, [r0, #48]
str r14, [r0, #16]
str r6, [r0, #32]
add r0, r0, #4
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r10, pc }
#endif
.size jpeg_idct4v, .-jpeg_idct4v

#if ARM_ARCH > 4
.align 4
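/* Constant pool for the 4-point IDCT on ARMv5+. The multipliers appear to be
the usual libjpeg fixed-point factors scaled by 2^13 (4433 ~ 0.5412 * 8192,
6270 ~ 0.7654 * 8192, 15137 ~ 1.8478 * 8192); 4112 is the (128 << 5) + 16
output bias used by the horizontal passes. */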
.Lpool4:
.short -15137
.short 4112
.short 4433
.short 6270

.align 2
#endif

jpeg_idct4h:
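/* Same 4-point transform as jpeg_idct4v, applied along rows: the 4112 bias is
added to d0, and the results are descaled with asr #18, clamped to [0, 255]
and stored as bytes pix8_size apart. */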
#if ARM_ARCH < 5
stmdb sp!, { r4-r10, lr }
ldr r10, =-15137
ldr r14, =4112
ldr r12, =6270
1:
ldrsh r4, [r0]
ldrsh r6, [r0, #4]
ldrsh r7, [r0, #6]
ldrsh r5, [r0, #2]
add r4, r4, r14
add r8, r4, r6 /* r8 = tmp10 >> 13 = d0 + d2 */
sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
add r6, r5, r7 /* r6 = z1 = d1 + d3 */
add r9, r6, r6, lsl #3
rsb r6, r6, r9, lsl #4
rsb r6, r6, r6, lsl #5 /* z1 *= 4433 */
mla r7, r10, r7, r6 /* r7 = tmp0 = z1 - z3 * 15137 */
mla r5, r12, r5, r6 /* r5 = tmp2 = z1 + z2 * 6270 */
add r9, r5, r8, lsl #13 /* r9 = o0 */
rsb r5, r5, r8, lsl #13 /* r5 = o3 */
add r8, r7, r4, lsl #13 /* r8 = o1 */
rsb r4, r7, r4, lsl #13 /* r4 = o2 */
mov r9, r9, asr #18
mov r8, r8, asr #18
mov r4, r4, asr #18
mov r5, r5, asr #18
cmp r9, #255
mvnhi r9, r9, asr #31
cmp r8, #255
mvnhi r8, r8, asr #31
cmp r4, #255
mvnhi r4, r4, asr #31
cmp r5, #255
mvnhi r5, r5, asr #31
strb r9, [r1]
strb r8, [r1, #pix8_size]
strb r4, [r1, #2*pix8_size]
strb r5, [r1, #3*pix8_size]
add r0, r0, #16
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmpc regs=r4-r10
#elif ARM_ARCH < 6 /* ARMv5 */
stmdb sp!, { r4-r9, lr }
ldr r4, .Lpool4
ldr r5, .Lpool4+4
1:
ldrsh r7, [r0, #6]
ldrsh r14, [r0, #2]
ldrsh r12, [r0]
ldrsh r6, [r0, #4]
add r8, r14, r7 /* r8 = z1 = d1 + d3 */
add r12, r12, r4, lsr #16
smulbb r8, r5, r8 /* z1 *= 4433 */
add r9, r12, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
smlatb r14, r5, r14, r8 /* r14= tmp2 = z1 + z2 * 6270 */
smlabb r7, r4, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
sub r12, r12, r6 /* r12= tmp12 >> 13 = d0 - d2 */
add r6, r14, r9, lsl #13 /* r6 = o0 */
rsb r9, r14, r9, lsl #13 /* r9 = o3 */
add r14, r7, r12, lsl #13 /* r14= o1 */
rsb r12, r7, r12, lsl #13 /* r12= o2 */
mov r6, r6, asr #18
mov r14, r14, asr #18
mov r12, r12, asr #18
mov r9, r9, asr #18
cmp r6, #255
mvnhi r6, r6, asr #31
cmp r14, #255
mvnhi r14, r14, asr #31
cmp r12, #255
mvnhi r12, r12, asr #31
cmp r9, #255
mvnhi r9, r9, asr #31
strb r6, [r1]
strb r14, [r1, #pix8_size]
strb r12, [r1, #2*pix8_size]
strb r9, [r1, #3*pix8_size]
add r0, r0, #16
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmia sp!, { r4-r9, pc }
#else /* ARMv6+ */
stmdb sp!, { r4-r9, lr }
ldrd r4, .Lpool4
mov r9, r4, lsr #16
1:
ldmia r0, { r12, r14 }
sadd16 r12, r12, r9
sadd16 r6, r12, r14 /* r6lo = d0 + d2, r6hi = d1 + d3 */
ssub16 r7, r12, r14 /* r7lo = d0 - d2 */
smulbt r8, r5, r6
sxth r6, r6
smlatt r12, r5, r12, r8 /* r12= tmp2 = z1 + z2 * 6270 */
smlabt r14, r4, r14, r8 /* r14= tmp0 = z1 - z3 * 15137 */
sxth r7, r7
add r8, r12, r6, lsl #13 /* r8 = o0 */
rsb r6, r12, r6, lsl #13 /* r6 = o3 */
add r12, r14, r7, lsl #13 /* r12= o1 */
rsb r14, r14, r7, lsl #13 /* r14= o2 */
usat r8, #8, r8, asr #18
usat r6, #8, r6, asr #18
usat r12, #8, r12, asr #18
usat r14, #8, r14, asr #18
strb r8, [r1]
strb r6, [r1, #3*pix8_size]
strb r12, [r1, #pix8_size]
strb r14, [r1, #2*pix8_size]
add r0, r0, #16
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmia sp!, { r4-r9, pc }
#endif
.size jpeg_idct4h, .-jpeg_idct4h

#if ARM_ARCH < 6
jpeg_idct8v:
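/* 8-point column pass. Each iteration loads eight consecutive 16-bit
coefficients and stores the transformed values with a 16-byte stride,
128 bytes past the input block. If the seven AC coefficients are all zero,
the DC value (d0 << 2) is simply replicated down the output column. */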
stmdb sp!, { r4-r11, lr }
add r2, r0, #128
1:
ldmia r0!, { r4-r7 }
#if ARM_ARCH < 5
mov r8, r4, lsl #16
orrs r9, r6, r7
orreqs r9, r5, r4, lsr #16
bne 2f
mov r8, r8, asr #14
strh r8, [r2]
strh r8, [r2, #16]
strh r8, [r2, #32]
strh r8, [r2, #48]
strh r8, [r2, #64]
strh r8, [r2, #80]
strh r8, [r2, #96]
strh r8, [r2, #112]
cmp r0, r1
add r2, r2, #2
bcc 1b
ldmpc regs=r4-r11
2:
ldr r14, =4433
ldr r12, =-15137
mov r10, r5, lsl #16
mov r11, r7, lsl #16
mov r10, r10, asr #16 /* r10 = z2 = d2 */
mov r11, r11, asr #16 /* r11 = z3 = d6 */
add r8, r8, #8192
add r9, r10, r11
mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */
mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
ldr r14, =6270
mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */
mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */
add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */
sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */
add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */
sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */
add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */
sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */
stmdb sp, { r8-r11 } /* tmp10 tmp13 tmp11 tmp12 */
mov r4, r4, asr #16 /* r4 = tmp3 = d1 */
mov r5, r5, asr #16 /* r5 = tmp2 = d3 */
mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
ldr r10, =9633
ldr r11, =-16069
add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */
add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */
add r9, r12, r14 /* r9 = z3 + z4 */
mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */
ldr r10, =-3196
mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */
ldr r11, =-7373
mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */
ldr r10, =2446
add r9, r4, r7 /* r9 = tmp0 + tmp3 */
mla r8, r11, r9, r12 /* r8 = z1 + z3 */
mla r9, r11, r9, r14 /* r9 = z1 + z4 */
ldr r11, =12299
mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */
ldr r10, =-20995
mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */
ldr r11, =25172
add r9, r5, r6 /* r9 = tmp1 + tmp2 */
mla r12, r10, r9, r12 /* r12 = z2 + z3 */
mla r14, r10, r9, r14 /* r14 = z2 + z4 */
ldr r10, =16819
mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */
mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */
ldmdb sp, { r8-r11 }
add r12, r8, r4 /* o0 */
sub r14, r8, r4 /* o7 */
add r8, r9, r7 /* o3 */
sub r9, r9, r7 /* o4 */
add r4, r10, r5 /* o1 */
sub r5, r10, r5 /* o6 */
add r10, r11, r6 /* o2 */
sub r11, r11, r6 /* o5 */
/* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
mov r12, r12, asr #11
mov r4, r4, asr #11
mov r10, r10, asr #11
mov r8, r8, asr #11
mov r9, r9, asr #11
mov r11, r11, asr #11
mov r5, r5, asr #11
mov r14, r14, asr #11
strh r12, [r2]
strh r4, [r2, #16]
strh r10, [r2, #32]
strh r8, [r2, #48]
strh r9, [r2, #64]
strh r11, [r2, #80]
strh r5, [r2, #96]
strh r14, [r2, #112]
#else /* ARMv5+ */
mov r12, r4, lsl #16
orrs r9, r6, r7
orreqs r9, r5, r4, lsr #16
bne 2f
mov r12, r12, asr #14
strh r12, [r2]
strh r12, [r2, #16]
strh r12, [r2, #32]
strh r12, [r2, #48]
strh r12, [r2, #64]
strh r12, [r2, #80]
strh r12, [r2, #96]
strh r12, [r2, #112]
add r2, r2, #2
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r11, pc }
2:
ldr r8, .Lpool8
ldr r9, .Lpool8+4
add r12, r12, #8192
add r10, r5, r7 /* r10[15:0] = d2 + d6 */
sub r14, r12, r6, lsl #16 /* r14 = tmp1 << 3 = (d0 - d4) << 16 */
smulbb r10, r8, r10 /* r10 = z1 = (d2 + d6) * 4433 */
add r12, r12, r6, lsl #16 /* r12 = tmp0 << 3 = (d0 + d4) << 16 */
smlatb r11, r8, r7, r10 /* r11 = tmp2 = z1 - d6 * 15137 */
smlabb r10, r9, r5, r10 /* r10 = tmp3 = z1 + d2 * 6270 */
add r8, r11, r14, asr #3 /* r8 = tmp11 */
rsb r11, r11, r14, asr #3 /* r11 = tmp12 */
add r14, r10, r12, asr #3 /* r14 = tmp10 */
rsb r12, r10, r12, asr #3 /* r12 = tmp13 */
stmdb sp, { r8, r11, r12, r14 } /* tmp11 tmp12 tmp13 tmp10 */
mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
add r12, r6, r4, asr #16 /* r12 = z4 = tmp1 + tmp3 */
add r14, r7, r5, asr #16 /* r14 = z3 = tmp0 + tmp2 */
add r8, r12, r14 /* r8 = z3 + z4 */
ldr r10, .Lpool8+8
ldr r11, .Lpool8+12
smultb r8, r9, r8 /* r8 = z5 = (z3 + z4) * 9633 */
add r9, r7, r4, asr #16 /* r9 = z1 = tmp0 + tmp3 */
smlabb r14, r10, r14, r8 /* r14 = z3 = z5 - z3 * 16069 */
smlatb r12, r10, r12, r8 /* r12 = z4 = z5 - z4 * 3196 */
smlabb r8, r11, r9, r14 /* r8 = z3 - z1 * 7373 */
smlabb r9, r11, r9, r12 /* r9 = z4 - z1 * 7373 */
add r10, r6, r5, asr #16 /* r10 = z2 = tmp1 + tmp2 */
smlatb r12, r11, r10, r12 /* r12 = z4 - z2 * 20995 */
smlatb r14, r11, r10, r14 /* r14 = z3 - z2 * 20995 */
ldr r10, .Lpool8+16
ldr r11, .Lpool8+20
smlabb r7, r10, r7, r8 /* r7 = tmp0 */
smlatt r4, r10, r4, r9 /* r4 = tmp3 */
smlabb r6, r11, r6, r12 /* r6 = tmp1 */
smlatt r5, r11, r5, r14 /* r5 = tmp2 */
ldmdb sp, { r8-r11 } /* tmp11 tmp12 tmp13 tmp10 */
add r12, r8, r5 /* o1 */
sub r14, r8, r5 /* o6 */
add r8, r9, r6 /* o2 */
sub r9, r9, r6 /* o5 */
add r6, r10, r7 /* o3 */
sub r7, r10, r7 /* o4 */
add r10, r11, r4 /* o0 */
sub r11, r11, r4 /* o7 */
mov r12, r12, asr #11
mov r14, r14, asr #11
mov r8, r8, asr #11
mov r9, r9, asr #11
mov r6, r6, asr #11
mov r7, r7, asr #11
mov r10, r10, asr #11
mov r11, r11, asr #11
strh r10, [r2]
strh r12, [r2, #16]
strh r8, [r2, #32]
strh r6, [r2, #48]
strh r7, [r2, #64]
strh r9, [r2, #80]
strh r14, [r2, #96]
strh r11, [r2, #112]
#endif
cmp r0, r1
add r2, r2, #2
bcc 1b
ldmpc regs=r4-r11
.size jpeg_idct8v, .-jpeg_idct8v

#if ARM_ARCH > 4
.align 4
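/* Constant pool for the 8-point IDCT, paired for halfword multiplies; these
appear to be the standard libjpeg fixed-point multipliers scaled by 2^13
(e.g. 4433 ~ 0.5412 * 8192, 15137 ~ 1.8478 * 8192, 9633 ~ 1.1759 * 8192). */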
.Lpool8:
.short 4433
.short -15137
.short 6270
.short 9633
.short -16069
.short -3196
.short -7373
.short -20995
.short 2446
.short 12299
.short 16819
.short 25172
.align 2
#endif

jpeg_idct8h:
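/* 8-point row pass: same butterfly as jpeg_idct8v, but with the 4112 bias
(128 << 5 plus rounding) folded into d0; results are descaled with asr #18
(asr #21 on the DC-only short path), clamped to [0, 255] and stored as bytes
pix8_size apart. */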
stmdb sp!, { r4-r11, lr }
1:
ldmia r0!, { r4-r7 }
ldr r14, =(4112<<16)
#if ARM_ARCH < 5
add r8, r14, r4, lsl #16
orrs r9, r6, r7
orreqs r9, r5, r4, lsr #16
bne 2f
mov r8, r8, asr #21
cmp r8, #255
mvnhi r8, r8, asr #31
strb r8, [r1]
strb r8, [r1, #pix8_size]
strb r8, [r1, #2*pix8_size]
strb r8, [r1, #3*pix8_size]
strb r8, [r1, #4*pix8_size]
strb r8, [r1, #5*pix8_size]
strb r8, [r1, #6*pix8_size]
strb r8, [r1, #7*pix8_size]
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmpc regs=r4-r11
2:
ldr r14, =4433
ldr r12, =-15137
mov r10, r5, lsl #16
mov r11, r7, lsl #16
mov r10, r10, asr #16 /* r10 = z2 = d2 */
mov r11, r11, asr #16 /* r11 = z3 = d6 */
add r9, r10, r11
mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */
mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
ldr r14, =6270
mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */
mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */
add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */
sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */
add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */
sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */
add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */
sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */
stmdb sp, { r8-r11 } /* tmp10 tmp13 tmp11 tmp12 */
mov r4, r4, asr #16 /* r4 = tmp3 = d1 */
mov r5, r5, asr #16 /* r5 = tmp2 = d3 */
mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
ldr r10, =9633
ldr r11, =-16069
add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */
add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */
add r9, r12, r14 /* r9 = z3 + z4 */
mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */
ldr r10, =-3196
mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */
ldr r11, =-7373
mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */
ldr r10, =2446
add r9, r4, r7 /* r9 = tmp0 + tmp3 */
mla r8, r11, r9, r12 /* r8 = z1 + z3 */
mla r9, r11, r9, r14 /* r9 = z1 + z4 */
ldr r11, =12299
mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */
ldr r10, =-20995
mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */
ldr r11, =25172
add r9, r5, r6 /* r9 = tmp1 + tmp2 */
mla r12, r10, r9, r12 /* r12 = z2 + z3 */
mla r14, r10, r9, r14 /* r14 = z2 + z4 */
ldr r10, =16819
mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */
mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */
ldmdb sp, { r8-r11 }
add r12, r8, r4 /* o0 */
sub r14, r8, r4 /* o7 */
add r8, r9, r7 /* o3 */
sub r9, r9, r7 /* o4 */
add r4, r10, r5 /* o1 */
sub r5, r10, r5 /* o6 */
add r10, r11, r6 /* o2 */
sub r11, r11, r6 /* o5 */
/* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
mov r12, r12, asr #18
cmp r12, #255
mvnhi r12, r12, asr #31
mov r4, r4, asr #18
cmp r4, #255
mvnhi r4, r4, asr #31
mov r10, r10, asr #18
cmp r10, #255
mvnhi r10, r10, asr #31
mov r8, r8, asr #18
cmp r8, #255
mvnhi r8, r8, asr #31
mov r9, r9, asr #18
cmp r9, #255
mvnhi r9, r9, asr #31
mov r11, r11, asr #18
cmp r11, #255
mvnhi r11, r11, asr #31
mov r5, r5, asr #18
cmp r5, #255
mvnhi r5, r5, asr #31
mov r14, r14, asr #18
cmp r14, #255
mvnhi r14, r14, asr #31
strb r12, [r1]
strb r4, [r1, #pix8_size]
strb r10, [r1, #2*pix8_size]
strb r8, [r1, #3*pix8_size]
strb r9, [r1, #4*pix8_size]
strb r11, [r1, #5*pix8_size]
strb r5, [r1, #6*pix8_size]
strb r14, [r1, #7*pix8_size]
#else /* ARMv5+ */
add r12, r14, r4, lsl #16
orrs r9, r6, r7
orreqs r9, r5, r4, lsr #16
bne 2f
mov r12, r12, asr #21
cmp r12, #255
mvnhi r12, r12, asr #31
strb r12, [r1]
strb r12, [r1, #pix8_size]
strb r12, [r1, #2*pix8_size]
strb r12, [r1, #3*pix8_size]
strb r12, [r1, #4*pix8_size]
strb r12, [r1, #5*pix8_size]
strb r12, [r1, #6*pix8_size]
strb r12, [r1, #7*pix8_size]
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmia sp!, { r4-r11, pc }
2:
ldr r8, .Lpool8
ldr r9, .Lpool8+4
add r10, r5, r7 /* r10[15:0] = d2 + d6 */
sub r14, r12, r6, lsl #16 /* r14 = tmp1 << 3 = (d0 - d4) << 16 */
smulbb r10, r8, r10 /* r10 = z1 = (d2 + d6) * 4433 */
add r12, r12, r6, lsl #16 /* r12 = tmp0 << 3 = (d0 + d4) << 16 */
smlatb r11, r8, r7, r10 /* r11 = tmp2 = z1 - d6 * 15137 */
smlabb r10, r9, r5, r10 /* r10 = tmp3 = z1 + d2 * 6270 */
add r8, r11, r14, asr #3 /* r8 = tmp11 */
rsb r11, r11, r14, asr #3 /* r11 = tmp12 */
add r14, r10, r12, asr #3 /* r14 = tmp10 */
rsb r12, r10, r12, asr #3 /* r12 = tmp13 */
stmdb sp, { r8, r11, r12, r14 } /* tmp11 tmp12 tmp13 tmp10 */
mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
add r12, r6, r4, asr #16 /* r12 = z4 = tmp1 + tmp3 */
add r14, r7, r5, asr #16 /* r14 = z3 = tmp0 + tmp2 */
add r8, r12, r14 /* r8 = z3 + z4 */
ldr r10, .Lpool8+8
ldr r11, .Lpool8+12
smultb r8, r9, r8 /* r8 = z5 = (z3 + z4) * 9633 */
add r9, r7, r4, asr #16 /* r9 = z1 = tmp0 + tmp3 */
smlabb r14, r10, r14, r8 /* r14 = z3 = z5 - z3 * 16069 */
smlatb r12, r10, r12, r8 /* r12 = z4 = z5 - z4 * 3196 */
smlabb r8, r11, r9, r14 /* r8 = z3 - z1 * 7373 */
smlabb r9, r11, r9, r12 /* r9 = z4 - z1 * 7373 */
add r10, r6, r5, asr #16 /* r10 = z2 = tmp1 + tmp2 */
smlatb r12, r11, r10, r12 /* r12 = z4 - z2 * 20995 */
smlatb r14, r11, r10, r14 /* r14 = z3 - z2 * 20995 */
ldr r10, .Lpool8+16
ldr r11, .Lpool8+20
smlabb r7, r10, r7, r8 /* r7 = tmp0 */
smlatt r4, r10, r4, r9 /* r4 = tmp3 */
smlabb r6, r11, r6, r12 /* r6 = tmp1 */
smlatt r5, r11, r5, r14 /* r5 = tmp2 */
ldmdb sp, { r8-r11 } /* tmp11 tmp12 tmp13 tmp10 */
add r12, r8, r5 /* o1 */
sub r14, r8, r5 /* o6 */
add r8, r9, r6 /* o2 */
sub r9, r9, r6 /* o5 */
add r6, r10, r7 /* o3 */
sub r7, r10, r7 /* o4 */
add r10, r11, r4 /* o0 */
sub r11, r11, r4 /* o7 */
/* output in order: r10 r12 r8 r6 r7 r9 r14 r11 */
mov r10, r10, asr #18
cmp r10, #255
mvnhi r10, r10, asr #31
mov r12, r12, asr #18
cmp r12, #255
mvnhi r12, r12, asr #31
mov r8, r8, asr #18
cmp r8, #255
mvnhi r8, r8, asr #31
mov r6, r6, asr #18
cmp r6, #255
mvnhi r6, r6, asr #31
mov r7, r7, asr #18
cmp r7, #255
mvnhi r7, r7, asr #31
mov r9, r9, asr #18
cmp r9, #255
mvnhi r9, r9, asr #31
mov r14, r14, asr #18
cmp r14, #255
mvnhi r14, r14, asr #31
mov r11, r11, asr #18
cmp r11, #255
mvnhi r11, r11, asr #31
strb r10, [r1]
strb r12, [r1, #pix8_size]
strb r8, [r1, #2*pix8_size]
strb r6, [r1, #3*pix8_size]
strb r7, [r1, #4*pix8_size]
strb r9, [r1, #5*pix8_size]
strb r14, [r1, #6*pix8_size]
strb r11, [r1, #7*pix8_size]
#endif
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmpc regs=r4-r11
.size jpeg_idct8h, .-jpeg_idct8h
#else /* ARMv6+ */
jpeg_idct8v:
stmdb sp!, { r4-r11, lr }
add r2, r0, #128
1:
ldmia r0!, { r4-r7 }
orrs r9, r6, r7
orreqs r9, r5, r4, lsr #16
bne 2f
mov r4, r4, lsl #2
strh r4, [r2]
strh r4, [r2, #16]
strh r4, [r2, #32]
strh r4, [r2, #48]
strh r4, [r2, #64]
strh r4, [r2, #80]
strh r4, [r2, #96]
strh r4, [r2, #112]
cmp r0, r1
add r2, r2, #2
bcc 1b
ldmia sp!, { r4-r11, pc }
2:
ldrd r8, .Lpool8
mov r12, r4, lsl #16
add r10, r5, r7 /* r10 = d2 + d6 */
add r12, r12, #8192
add r3, r12, r6, lsl #16 /* tmp0 */
sub r12, r12, r6, lsl #16 /* tmp1 */
pkhtb r4, r5, r4, asr #16 /* r4 = (tmp3[o], tmp2[o]) = (d1, d3) */
smulbb r14, r8, r10 /* r14 = z1[e] = (d2 + d6) * 4433 */
pkhtb r6, r6, r7, asr #16 /* r6 = (tmp0[o], tmp1[o]) = (d7, d5) */
smlatb r7, r8, r7, r14 /* r7 = tmp2[e] = z1 - d6 * 15137 */
smlabb r5, r9, r5, r14 /* r5 = tmp3[e] = z1 + d2 * 6270 */
pkhtb r9, r9, r9, asr #16 /* r9 = (9633, 9633) */
add r10, r5, r3, asr #3 /* r10 = tmp10 */
rsb r11, r5, r3, asr #3 /* r11 = tmp13 */
mov r3, r4, ror #16
rsb r14, r7, r12, asr #3 /* r14 = tmp12 */
add r12, r7, r12, asr #3 /* r12 = tmp11 */
sadd16 r8, r3, r6 /* z3, z4 */
stmdb sp, { r10-r12, r14 } /* tmp10 tmp13 tmp11 tmp12 */
smuad r5, r9, r8 /* r5 = z5 = (z3[o] + z4[o]) * 9633 */
ldrd r10, .Lpool8+8
sadd16 r7, r4, r6 /* r7 = (z1, z2) */
smlatt r9, r10, r8, r5 /* r9 = z4 = z5 - z4 * 3196 */
smlabb r8, r10, r8, r5 /* r8 = z3 = z5 - z3 * 16069 */
smlabb r14, r11, r7, r9 /* r14 = z1 + z4 */
smlabb r12, r11, r7, r8 /* r12 = z1 + z3 */
smlatt r5, r11, r7, r9 /* r5 = z2 + z4 */
smlatt r7, r11, r7, r8 /* r7 = z2 + z3 */
ldrd r8, .Lpool8+16
smlabt r7, r9, r4, r7 /* r7 = tmp2 */
smlatb r14, r9, r4, r14 /* r14 = tmp3 */
ldmdb sp, { r4, r9-r11 } /* tmp10 tmp13 tmp11 tmp12 */
smlabb r12, r8, r6, r12 /* r12 = tmp0 */
smlatt r5, r8, r6, r5 /* r5 = tmp1 */
/* used: r4, r5, r7, r9-r12, r14 */
add r6, r4, r14 /* o0 */
sub r8, r4, r14 /* o7 */
add r14, r9, r12 /* o3 */
sub r12, r9, r12 /* o4 */
add r4, r10, r7 /* o1 */
sub r7, r10, r7 /* o6 */
add r9, r11, r5 /* o2 */
sub r10, r11, r5 /* o5 */
mov r6, r6, asr #11
mov r4, r4, asr #11
mov r9, r9, asr #11
mov r14, r14, asr #11
mov r12, r12, asr #11
mov r10, r10, asr #11
mov r7, r7, asr #11
mov r8, r8, asr #11
strh r6, [r2]
strh r4, [r2, #16]
strh r9, [r2, #32]
strh r14, [r2, #48]
strh r12, [r2, #64]
strh r10, [r2, #80]
strh r7, [r2, #96]
strh r8, [r2, #112]
cmp r0, r1
add r2, r2, #2
bcc 1b
ldmia sp!, { r4-r11, pc }
.size jpeg_idct8v, .-jpeg_idct8v

.align 4
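/* ARMv6 variant of the pool: same multipliers, but the last three entries are
ordered 16819, 25172, 12299 to match the operand pairing of the ldrd/smla
sequences used here. */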
.Lpool8:
.short 4433
.short -15137
.short 6270
.short 9633
.short -16069
.short -3196
.short -7373
.short -20995
.short 2446
.short 16819
.short 25172
.short 12299

.align 2
jpeg_idct8h:
stmdb sp!, { r4-r11, lr }
1:
ldr r14, =4112
ldmia r0!, { r4-r7 }
sadd16 r4, r4, r14
orrs r9, r6, r7
orreqs r9, r5, r4, lsr #16
bne 2f
sxth r4, r4
usat r4, #8, r4, asr #5
strb r4, [r1]
strb r4, [r1, #pix8_size]
strb r4, [r1, #2*pix8_size]
strb r4, [r1, #3*pix8_size]
strb r4, [r1, #4*pix8_size]
strb r4, [r1, #5*pix8_size]
strb r4, [r1, #6*pix8_size]
strb r4, [r1, #7*pix8_size]
add r1, r1, r3
cmp r0, r2
bcc 1b
ldmia sp!, { r4-r11, pc }
2:
ldrd r8, .Lpool8
sadd16 r10, r5, r7 /* r10 = (d2 + d6, d3 + d7) */
ssub16 r12, r4, r6 /* r12 = (d0 - d4, d1 - d5) */
sadd16 r11, r4, r6 /* r11 = (d0 + d4, d1 + d5) */
pkhtb r4, r5, r4, asr #16 /* r4 = (tmp3[o], tmp2[o]) = (d1, d3) */
smulbb r14, r8, r10 /* r14 = z1[e] = (d2 + d6) * 4433 */
pkhtb r6, r6, r7, asr #16 /* r6 = (tmp0[o], tmp1[o]) = (d7, d5) */
smlatb r7, r8, r7, r14 /* r7 = tmp2[e] = z1 - d6 * 15137 */
smlabb r5, r9, r5, r14 /* r5 = tmp3[e] = z1 + d2 * 6270 */
sxth r12, r12 /* r12 = tmp1[e] = d0 - d4 */
pkhtb r8, r11, r10, asr #16 /* r8 = (z3[o], z4[o]) */
sxth r14, r11 /* r14 = tmp0[e] */
pkhtb r9, r9, r9, asr #16 /* r9 = (9633, 9633) */
add r10, r5, r14, lsl #13 /* r10 = tmp10 */
rsb r11, r5, r14, lsl #13 /* r11 = tmp13 */
rsb r14, r7, r12, lsl #13 /* r14 = tmp12 */
add r12, r7, r12, lsl #13 /* r12 = tmp11 */
stmdb sp, { r10-r12, r14 } /* tmp10 tmp13 tmp11 tmp12 */
smuad r5, r9, r8 /* r5 = z5 = (z3[o] + z4[o]) * 9633 */
ldrd r10, .Lpool8+8
sadd16 r7, r4, r6 /* r7 = (z1, z2) */
smlatt r9, r10, r8, r5 /* r9 = z4 = z5 - z4 * 3196 */
smlabb r8, r10, r8, r5 /* r8 = z3 = z5 - z3 * 16069 */
smlabb r14, r11, r7, r9 /* r14 = z1 + z4 */
smlabb r12, r11, r7, r8 /* r12 = z1 + z3 */
smlatt r5, r11, r7, r9 /* r5 = z2 + z4 */
smlatt r7, r11, r7, r8 /* r7 = z2 + z3 */
ldrd r8, .Lpool8+16
smlabt r7, r9, r4, r7 /* r7 = tmp2 */
smlatb r14, r9, r4, r14 /* r14 = tmp3 */
ldmdb sp, { r4, r9-r11 } /* tmp10 tmp13 tmp11 tmp12 */
smlabb r12, r8, r6, r12 /* r12 = tmp0 */
smlatt r5, r8, r6, r5 /* r5 = tmp1 */
/* used: r4, r5, r7, r9-r12, r14 */
add r6, r4, r14 /* o0 */
sub r8, r4, r14 /* o7 */
add r14, r9, r12 /* o3 */
sub r12, r9, r12 /* o4 */
add r4, r10, r7 /* o1 */
sub r7, r10, r7 /* o6 */
add r9, r11, r5 /* o2 */
sub r10, r11, r5 /* o5 */
usat r6, #8, r6, asr #18
usat r4, #8, r4, asr #18
usat r9, #8, r9, asr #18
usat r14, #8, r14, asr #18
usat r12, #8, r12, asr #18
usat r10, #8, r10, asr #18
usat r7, #8, r7, asr #18
usat r8, #8, r8, asr #18
strb r6, [r1]
strb r4, [r1, #pix8_size]
strb r9, [r1, #2*pix8_size]
strb r14, [r1, #3*pix8_size]
strb r12, [r1, #4*pix8_size]
strb r10, [r1, #5*pix8_size]
strb r7, [r1, #6*pix8_size]
strb r8, [r1, #7*pix8_size]
cmp r0, r2
add r1, r1, r3
bcc 1b
ldmia sp!, { r4-r11, pc }
.size jpeg_idct8h, .-jpeg_idct8h
#endif