/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/  \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2009 by Jens Arnold
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/

    .global mpeg2_idct_copy
    .type   mpeg2_idct_copy, %function
    .global mpeg2_idct_add
    .type   mpeg2_idct_add, %function

/* Custom calling convention:
 * r0 contains the block pointer and is non-volatile.
 * All non-volatile C context is saved and restored on its behalf.
 */
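
/* .idct: two-pass 16-bit fixed-point 8x8 IDCT on the block.
 * Pass 1 (.row_loop) transforms one row per iteration, scales the results
 * by asr #12 and writes them transposed into a temp buffer at block + 128
 * bytes.  Pass 2 (.col_loop) runs the same arithmetic on that buffer,
 * scales by asr #17 and stores back into the block in natural order.
 * As the fetch comments below indicate, each group of eight coefficients
 * is expected in even/odd interleaved order (f0, f2, f4, f6, f1, f3, f5, f7),
 * which the caller is assumed to arrange via its scan/dequant tables.
 */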
.idct:
    str      lr, [sp, #-4]!        @ lr is used
    add      r1, r0, #128          @ secondary, transposed temp buffer
    mov      r14, #8               @ loop counter

.row_loop:
    ldmia    r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
    ldrd     r4, L_W1357           @ load W1, W3, W5, W7

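    /* Odd part: with (W1, W3) in r4 and (W5, W7) in r5, the four sums
     * computed below work out to:
     *   b0 = W1*f1 + W3*f3 + W5*f5 + W7*f7
     *   b1 = W3*f1 - W7*f3 - W1*f5 - W5*f7
     *   b2 = W5*f1 - W1*f3 + W7*f5 + W3*f7
     *   b3 = W7*f1 - W5*f3 + W3*f5 - W1*f7
     */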
    smuad    r6, r4, r10           @ b0 = W1 * f1 + W3 * f3
    smlad    r6, r5, r11, r6       @    + W5 * f5 + W7 * f7

    smultt   r7, r5, r10           @ b1 = -W7 * f3
    smlabb   r7, r4, r11, r7       @    + -W1 * f5
    smlabt   r7, r5, r11, r7       @    + -W5 * f7
    rsb      r7, r7, #0
    smlatb   r7, r4, r10, r7       @    + W3 * f1

    smulbt   r8, r4, r10           @ b2 = -W1 * f3
    rsb      r8, r8, #0
    smlabb   r8, r5, r10, r8       @    + W5 * f1
    smlatb   r8, r5, r11, r8       @    + W7 * f5
    smlatt   r8, r4, r11, r8       @    + W3 * f7

    smusdx   r9, r10, r5           @ b3 = f1 * W7 - f3 * W5
    smlsdx   r9, r11, r4, r9       @    + f5 * W3 - f7 * W1

    ldrd     r4, L_W0246           @ load W0, W2, W4, W6
    add      r2, r2, #1            @ f0 += 1

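    /* Even part: with (W0, W2) in r4 and (W4, W6) in r5, the code below
     * effectively computes (f0 already carries the rounding bias):
     *   a0 = W0*f0 + W4*f4 + W2*f2 + W6*f6
     *   a1 = W0*f0 - W4*f4 + W6*f2 - W2*f6
     *   a2 = W0*f0 - W4*f4 - W6*f2 + W2*f6
     *   a3 = W0*f0 + W4*f4 - W2*f2 - W6*f6
     */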
    smulbb   r10, r4, r2           @ a0' = W0 * f0
    smlabb   r10, r5, r3, r10      @     + W4 * f4
    smultt   r12, r4, r2           @ a3' = W2 * f2
    smlatt   r12, r5, r3, r12      @     + W6 * f6
    add      r10, r10, r12         @ a0  = a0' + a3'
    sub      r12, r10, r12, lsl #1 @ a3  = a0  - 2 * a3'

    smulbb   r11, r5, r3           @ a1' = -W4 * f4
    rsb      r11, r11, #0
    smlabb   r11, r4, r2, r11      @     + W0 * f0
    smultt   r3, r4, r3            @ a2' = -W2 * f6
    rsb      r3, r3, #0
    smlatt   r3, r5, r2, r3        @     + W6 * f2
    add      r11, r11, r3          @ a1  = a1' + a2'
    sub      r3, r11, r3, lsl #1   @ a2  = a1  - 2 * a2'

    sub      r2, r10, r6           @ block[7] = (a0 - b0)
    mov      r2, r2, asr #12       @           >> 12
    strh     r2, [r1, #7*16]
    sub      r2, r11, r7           @ block[6] = (a1 - b1)
    mov      r2, r2, asr #12       @           >> 12
    strh     r2, [r1, #6*16]
    sub      r2, r3, r8            @ block[5] = (a2 - b2)
    mov      r2, r2, asr #12       @           >> 12
    strh     r2, [r1, #5*16]
    sub      r2, r12, r9           @ block[4] = (a3 - b3)
    mov      r2, r2, asr #12       @           >> 12
    strh     r2, [r1, #4*16]
    add      r2, r12, r9           @ block[3] = (a3 + b3)
    mov      r2, r2, asr #12       @           >> 12
    strh     r2, [r1, #3*16]
    add      r2, r3, r8            @ block[2] = (a2 + b2)
    mov      r2, r2, asr #12       @           >> 12
    strh     r2, [r1, #2*16]
    add      r2, r11, r7           @ block[1] = (a1 + b1)
    mov      r2, r2, asr #12       @           >> 12
    strh     r2, [r1, #1*16]
    add      r2, r10, r6           @ block[0] = (a0 + b0)
    mov      r2, r2, asr #12       @           >> 12
    strh     r2, [r1], #2          @ advance to next temp column

    subs     r14, r14, #1
    bne      .row_loop
    b        .col_start

@ placed here because of ldrd's offset limit
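@ These appear to be the usual 2048-scaled IDCT cosine constants,
@ Wk = round(2048 * sqrt(2) * cos(k*pi/16)) for k = 1..7:
@ W1 = 2841, W2 = 2676, W3 = 2408, W5 = 1609, W6 = 1108, W7 = 565,
@ while W0 = W4 = 2048 is simply the 1 << 11 scale factor.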
L_W1357:
    .short 2841
    .short 2408
    .short 1609
    .short 565

L_W0246:
    .short 2048
    .short 2676
    .short 2048
    .short 1108

.col_start:
    @ r0 now points to the temp buffer, where we need it.
    sub      r1, r1, #128+16       @ point r1 back to the input block
    mov      r14, #8               @ loop counter

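@ Column pass: identical arithmetic to the row pass, but f0 gets a rounding
@ bias of 32, the final shift is asr #17, and the results are stored back
@ into the original block in natural order.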
.col_loop:
    ldmia    r0!, {r2, r3, r10, r11} @ fetch f0, f2, f4, f6, f1, f3, f5, f7
    ldrd     r4, L_W1357           @ load W1, W3, W5, W7

    smuad    r6, r4, r10           @ b0 = W1 * f1 + W3 * f3
    smlad    r6, r5, r11, r6       @    + W5 * f5 + W7 * f7

    smultt   r7, r5, r10           @ b1 = -W7 * f3
    smlabb   r7, r4, r11, r7       @    + -W1 * f5
    smlabt   r7, r5, r11, r7       @    + -W5 * f7
    rsb      r7, r7, #0
    smlatb   r7, r4, r10, r7       @    + W3 * f1

    smulbt   r8, r4, r10           @ b2 = -W1 * f3
    rsb      r8, r8, #0
    smlabb   r8, r5, r10, r8       @    + W5 * f1
    smlatb   r8, r5, r11, r8       @    + W7 * f5
    smlatt   r8, r4, r11, r8       @    + W3 * f7

    smusdx   r9, r10, r5           @ b3 = f1 * W7 - f3 * W5
    smlsdx   r9, r11, r4, r9       @    + f5 * W3 - f7 * W1

    ldrd     r4, L_W0246           @ load W0, W2, W4, W6
    add      r2, r2, #32           @ f0 += 32: rounding offset of 0.5 after the >> 17

    smulbb   r10, r4, r2           @ a0' = W0 * f0
    smlabb   r10, r5, r3, r10      @     + W4 * f4
    smultt   r12, r4, r2           @ a3' = W2 * f2
    smlatt   r12, r5, r3, r12      @     + W6 * f6
    add      r10, r10, r12         @ a0  = a0' + a3'
    sub      r12, r10, r12, lsl #1 @ a3  = a0  - 2 * a3'

    smulbb   r11, r5, r3           @ a1' = -W4 * f4
    rsb      r11, r11, #0
    smlabb   r11, r4, r2, r11      @     + W0 * f0
    smultt   r3, r4, r3            @ a2' = -W2 * f6
    rsb      r3, r3, #0
    smlatt   r3, r5, r2, r3        @     + W6 * f2
    add      r11, r11, r3          @ a1  = a1' + a2'
    sub      r3, r11, r3, lsl #1   @ a2  = a1  - 2 * a2'

    sub      r2, r10, r6           @ block[7] = (a0 - b0)
    mov      r2, r2, asr #17       @           >> 17
    strh     r2, [r1, #7*16]
    sub      r2, r11, r7           @ block[6] = (a1 - b1)
    mov      r2, r2, asr #17       @           >> 17
    strh     r2, [r1, #6*16]
    sub      r2, r3, r8            @ block[5] = (a2 - b2)
    mov      r2, r2, asr #17       @           >> 17
    strh     r2, [r1, #5*16]
    sub      r2, r12, r9           @ block[4] = (a3 - b3)
    mov      r2, r2, asr #17       @           >> 17
    strh     r2, [r1, #4*16]
    add      r2, r12, r9           @ block[3] = (a3 + b3)
    mov      r2, r2, asr #17       @           >> 17
    strh     r2, [r1, #3*16]
    add      r2, r3, r8            @ block[2] = (a2 + b2)
    mov      r2, r2, asr #17       @           >> 17
    strh     r2, [r1, #2*16]
    add      r2, r11, r7           @ block[1] = (a1 + b1)
    mov      r2, r2, asr #17       @           >> 17
    strh     r2, [r1, #1*16]
    add      r2, r10, r6           @ block[0] = (a0 + b0)
    mov      r2, r2, asr #17       @           >> 17
    strh     r2, [r1], #2          @ advance to next column

    subs     r14, r14, #1
    bne      .col_loop

    sub      r0, r0, #256          @ point r0 back to the input block
    ldr      pc, [sp], #4

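/* mpeg2_idct_copy
 * Presumably the standard libmpeg2 entry point,
 *   void mpeg2_idct_copy(int16_t *block, uint8_t *dest, int stride);
 * with r0 = block, r1 = dest, r2 = stride.  It runs the IDCT on the block,
 * clamps each result to 0..255, writes it to dest one row per stride bytes,
 * and clears the block to zero for the next macroblock.
 */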
mpeg2_idct_copy:
    stmfd    sp!, {r1-r2, r4-r12, lr}
    bl       .idct
    ldmfd    sp!, {r1-r2}

    add      r12, r0, #128
    ldrd     r4, [r0]
    mov      r8, #0
    mov      r9, #0
    mov      r10, #0
    mov      r11, #0
1:
    ldrd     r6, [r0, #8]
    usat16   r4, #8, r4
    strb     r4, [r1, #0]
    mov      r4, r4, lsr #16
    strb     r4, [r1, #1]
    usat16   r5, #8, r5
    strb     r5, [r1, #2]
    mov      r5, r5, lsr #16
    strb     r5, [r1, #3]
    ldrd     r4, [r0, #16]
    usat16   r6, #8, r6
    strb     r6, [r1, #4]
    mov      r6, r6, lsr #16
    strb     r6, [r1, #5]
    usat16   r7, #8, r7
    strb     r7, [r1, #6]
    mov      r7, r7, lsr #16
    strb     r7, [r1, #7]
    stmia    r0!, {r8-r11}
    add      r1, r1, r2
    cmp      r0, r12
    blo      1b

    ldmfd    sp!, {r4-r12, pc}

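/* mpeg2_idct_add
 * Presumably the standard libmpeg2 entry point,
 *   void mpeg2_idct_add(int last, int16_t *block, uint8_t *dest, int stride);
 * with r0 = last, r1 = block, r2 = dest, r3 = stride.  Normally it runs the
 * full IDCT, adds the result to dest with unsigned saturation and clears the
 * block.  When last == 129 and (block[0] & 0x70) != 0x40, it takes the
 * DC-only shortcut at label 3 instead.
 */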
mpeg2_idct_add:
    cmp      r0, #129
    mov      r0, r1
    ldreqsh  r1, [r0, #0]
    bne      1f
    and      r1, r1, #0x70
    cmp      r1, #0x40
    bne      3f
1:
    stmfd    sp!, {r2-r12, lr}
    bl       .idct
    ldmfd    sp!, {r1-r2}
    mov      r11, #0
    add      r12, r0, #128
2:
    ldmia    r0, {r3-r6}
    ldrb     r7, [r1, #0]
    ldrb     r8, [r1, #1]
    ldrb     r9, [r1, #2]
    ldrb     r10, [r1, #3]
    str      r11, [r0], #4
    orr      r7, r7, r8, lsl #16
    sadd16   r3, r3, r7
    usat16   r3, #8, r3
    strb     r3, [r1, #0]
    mov      r3, r3, lsr #16
    strb     r3, [r1, #1]
    str      r11, [r0], #4
    orr      r9, r9, r10, lsl #16
    sadd16   r4, r4, r9
    usat16   r4, #8, r4
    strb     r4, [r1, #2]
    mov      r4, r4, lsr #16
    strb     r4, [r1, #3]
    ldrb     r7, [r1, #4]
    ldrb     r8, [r1, #5]
    ldrb     r9, [r1, #6]
    ldrb     r10, [r1, #7]
    str      r11, [r0], #4
    orr      r7, r7, r8, lsl #16
    sadd16   r5, r5, r7
    usat16   r5, #8, r5
    strb     r5, [r1, #4]
    mov      r5, r5, lsr #16
    strb     r5, [r1, #5]
    str      r11, [r0], #4
    orr      r9, r9, r10, lsl #16
    sadd16   r6, r6, r9
    usat16   r6, #8, r6
    strb     r6, [r1, #6]
    mov      r6, r6, lsr #16
    strb     r6, [r1, #7]
    add      r1, r1, r2
    cmp      r0, r12
    blo      2b
    ldmfd    sp!, {r4-r12, pc}

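/* DC-only case: only block[0] matters.  Add (block[0] + 64) >> 7 to every
 * pixel of the 8x8 destination area (r2 = dest, r3 = stride) with unsigned
 * saturation, and clear block[0] and block[63].
 */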
3:
    stmfd    sp!, {r4-r7}
    ldrsh    r1, [r0, #0]          /* r1 = block[0] */
    mov      r11, #0
    strh     r11, [r0, #0]         /* block[0] = 0 */
    strh     r11, [r0, #126]       /* block[63] = 0 */
    add      r1, r1, #64           /* r1 = DC << 7 */
    add      r0, r2, r3, asl #3
4:
    ldrb     r4, [r2, #0]
    ldrb     r5, [r2, #1]
    ldrb     r6, [r2, #2]
    ldrb     r7, [r2, #3]
    add      r4, r4, r1, asr #7
    usat     r4, #8, r4
    strb     r4, [r2, #0]
    add      r5, r5, r1, asr #7
    usat     r5, #8, r5
    strb     r5, [r2, #1]
    add      r6, r6, r1, asr #7
    usat     r6, #8, r6
    strb     r6, [r2, #2]
    add      r7, r7, r1, asr #7
    usat     r7, #8, r7
    strb     r7, [r2, #3]
    ldrb     r4, [r2, #4]
    ldrb     r5, [r2, #5]
    ldrb     r6, [r2, #6]
    ldrb     r7, [r2, #7]
    add      r4, r4, r1, asr #7
    usat     r4, #8, r4
    strb     r4, [r2, #4]
    add      r5, r5, r1, asr #7
    usat     r5, #8, r5
    strb     r5, [r2, #5]
    add      r6, r6, r1, asr #7
    usat     r6, #8, r6
    strb     r6, [r2, #6]
    add      r7, r7, r1, asr #7
    usat     r7, #8, r7
    strb     r7, [r2, #7]
    add      r2, r2, r3
    cmp      r2, r0
    blo      4b
    ldmfd    sp!, {r4-r7}
    bx       lr