Core JPEG decoder improvements:
For >8-point vertical IDCT, transpose the coefficients while decoding them, so that the vertical IDCT can read in rows rather than columns. This improves speed a bit for this size even using the C IDCT. Remove inline ARM asm, replacing it with an external file containing pure asm IDCT functions. Add jpeg_ prefix to JPEG IDCT functions since some of them will now be visible globally. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21345 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
parent
4c58ad26ba
commit
6a0d931f38
5 changed files with 465 additions and 219 deletions
|
@ -104,6 +104,9 @@ recorder/resize.c
|
|||
#endif
|
||||
#ifdef HAVE_JPEG
|
||||
recorder/jpeg_load.c
|
||||
#ifdef CPU_ARM
|
||||
recorder/jpeg_idct_arm.S
|
||||
#endif
|
||||
#endif
|
||||
#ifdef HAVE_ALBUMART
|
||||
recorder/albumart.c
|
||||
|
|
|
@ -27,6 +27,9 @@ playergfx.c
|
|||
profile_plugin.c
|
||||
#endif
|
||||
#ifdef HAVE_LCD_BITMAP
|
||||
#ifdef CPU_ARM
|
||||
pluginlib_jpeg_idct_arm.S
|
||||
#endif
|
||||
pluginlib_jpeg_mem.c
|
||||
pluginlib_resize.c
|
||||
#ifndef HAVE_JPEG
|
||||
|
|
24
apps/plugins/lib/pluginlib_jpeg_idct_arm.S
Normal file
24
apps/plugins/lib/pluginlib_jpeg_idct_arm.S
Normal file
|
@ -0,0 +1,24 @@
|
|||
/***************************************************************************
|
||||
* __________ __ ___.
|
||||
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
||||
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
||||
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
||||
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
||||
* \/ \/ \/ \/ \/
|
||||
* $Id$
|
||||
*
|
||||
* Copyright (C) 2009 by Andrew Mahone
|
||||
*
|
||||
* This is a wrapper for the core jpeg_idct_arm.S
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*
|
||||
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
||||
* KIND, either express or implied.
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
#include "recorder/jpeg_idct_arm.S"
|
287
apps/recorder/jpeg_idct_arm.S
Normal file
287
apps/recorder/jpeg_idct_arm.S
Normal file
|
@ -0,0 +1,287 @@
|
|||
/***************************************************************************
|
||||
* __________ __ ___.
|
||||
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
||||
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
||||
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
||||
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
||||
* \/ \/ \/ \/ \/
|
||||
* $Id$
|
||||
*
|
||||
* JPEG assembly IDCT
|
||||
*
|
||||
* Copyright (C) 2009 Andrew Mahone
|
||||
* ARM asm versions of the C IDCT algorithms used with jpeg_load.c
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*
|
||||
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
||||
* KIND, either express or implied.
|
||||
*
|
||||
****************************************************************************/
|
||||
#include "config.h"
|
||||
|
||||
.section .text
|
||||
.align 2
|
||||
.global jpeg_idct4v
|
||||
.type jpeg_idct4v, %function
|
||||
.global jpeg_idct4h
|
||||
.type jpeg_idct4h, %function
|
||||
|
||||
/*
 * jpeg_idct4v: vertical-pass 4-point IDCT, performed in place.
 *
 * C prototype (declared extern in jpeg_load.c):
 *     void jpeg_idct4v(int16_t *ws, int16_t *end);
 * so r0 = ws (current column) and r1 = end (loop stops when r0 reaches it).
 *
 * For each column the four inputs d0..d3 are the halfwords at byte offsets
 * 0, 16, 32 and 48 from r0 (i.e. rows of 8 int16_t); the four outputs o0..o3
 * are written back to the same locations.
 *
 * Constants: 4433, 6270 and 15137 are the multipliers the C/inline-asm code
 * names FIX_0_541196100, FIX_0_765366865 and FIX_1_847759065; 1024 is the
 * rounding term for the final "asr #11". 3302955134 is 0xC4DF187E, which
 * packs -15137 in the top halfword and 6270 in the bottom halfword for use
 * with the SMLAxy halfword multiplies.
 *
 * Three implementations are selected on ARM_ARCH: ARMv4 (plain mla),
 * ARMv5E (halfword multiply-accumulate) and ARMv6 (two columns at a time
 * using the parallel add/subtract instructions).
 */
jpeg_idct4v:
|
||||
#if ARM_ARCH < 5
|
||||
/* ARMv4: one column per iteration, multiply by 4433 built from shifts */
stmdb sp!, { r4-r7, lr }
|
||||
ldr r14, =-15137
|
||||
ldr r12, =6270
|
||||
1:
|
||||
ldrsh r4, [r0, #32]
|
||||
ldrsh r2, [r0]
|
||||
ldrsh r5, [r0, #48]
|
||||
ldrsh r3, [r0, #16]
|
||||
add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
|
||||
sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
|
||||
add r4, r3, r5 /* r4 = z1 = d1 + d3 */
|
||||
add r7, r4, r4, lsl #3 /* r7 = z1 * 9 */
|
||||
rsb r4, r4, r7, lsl #4 /* r4 = z1 * 143 */
|
||||
rsb r4, r4, r4, lsl #5 /* z1 *= 4433 (143 * 31) */
|
||||
add r4, r4, #1024 /* rounding for the asr #11 below */
|
||||
mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + z2 * 6270 */
|
||||
mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - z3 * 15137 */
|
||||
mov r6, r6, lsl #2 /* r6 <<= 2 */
|
||||
mov r2, r2, lsl #2 /* r2 <<= 2 */
|
||||
add r7, r6, r3, asr #11 /* r7 = o0 */
|
||||
sub r3, r6, r3, asr #11 /* r3 = o3 */
|
||||
add r6, r2, r5, asr #11 /* r6 = o1 */
|
||||
sub r2, r2, r5, asr #11 /* r2 = o2 */
|
||||
strh r7, [r0]
|
||||
strh r3, [r0, #48]
|
||||
strh r6, [r0, #16]
|
||||
strh r2, [r0, #32]
|
||||
add r0, r0, #2 /* next column */
|
||||
teq r0, r1
|
||||
bne 1b
|
||||
ldmia sp!, { r4-r7, pc }
|
||||
#elif ARM_ARCH < 6
|
||||
/* ARMv5E: one column per iteration, halfword multiply-accumulate */
stmdb sp!, { r4-r8, lr }
|
||||
ldr r8, =1024
|
||||
ldr r14, =4433
|
||||
ldr r12, =3302955134 /* (-15137 << 16) | 6270 */
|
||||
1:
|
||||
ldrsh r5, [r0, #48]
|
||||
ldrsh r3, [r0, #16]
|
||||
ldrsh r4, [r0, #32]
|
||||
ldrsh r2, [r0]
|
||||
add r6, r3, r5 /* r6 = z1 = d1 + d3 */
|
||||
add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
|
||||
smlabb r6, r14, r6, r8 /* z1 = z1 * 4433 + 1024 (rounding) */
|
||||
sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
|
||||
smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
|
||||
smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
|
||||
mov r7, r7, lsl #2
|
||||
mov r2, r2, lsl #2
|
||||
add r4, r7, r3, asr #11 /* r4 = o0 */
|
||||
sub r7, r7, r3, asr #11 /* r7 = o3 */
|
||||
add r3, r2, r5, asr #11 /* r3 = o1 */
|
||||
sub r2, r2, r5, asr #11 /* r2 = o2 */
|
||||
strh r4, [r0]
|
||||
strh r7, [r0, #48]
|
||||
strh r3, [r0, #16]
|
||||
strh r2, [r0, #32]
|
||||
add r0, r0, #2 /* next column */
|
||||
teq r0, r1
|
||||
bne 1b
|
||||
ldmia sp!, { r4-r8, pc }
|
||||
#else
|
||||
/* ARMv6: two adjacent columns per iteration, packed as halfword pairs.
   The word loads/stores assume ws is 4-byte aligned - TODO confirm against
   the caller's buffer declaration. Because the loop steps by two columns
   and exits on r0 >= r1, an odd column count processes one extra column. */
stmdb sp!, { r4-r10, lr }
|
||||
ldr r2, =1024
|
||||
ldr r3, =4433
|
||||
ldr r12, =3302955134 /* (-15137 << 16) | 6270 */
|
||||
1:
|
||||
ldr r6, [r0, #32]
|
||||
ldr r4, [r0]
|
||||
ldr r7, [r0, #48]
|
||||
ldr r5, [r0, #16]
|
||||
/* this part is being done in parallel on two columns */
|
||||
sadd16 r8, r4, r6 /* r8 = d0 + d2 */
|
||||
ssub16 r4, r4, r6 /* r4 = d0 - d2 */
|
||||
sadd16 r6, r5, r7 /* r6 = d1 + d3 */
|
||||
/* there is no parallel shift operation, but we can fake it with bic
|
||||
and lsl */
|
||||
bic r8, r8, #0xc000
|
||||
bic r4, r4, #0xc000
|
||||
/* multiplication expands values beyond 16 bits, so this part needs to be
|
||||
split. the values will be merged below so that the rest of the addition
|
||||
can be done in parallel */
|
||||
smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
|
||||
smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
|
||||
smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
|
||||
smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
|
||||
smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
|
||||
smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
|
||||
mov r8, r8, lsl #2 /* complete the parallel shift started */
|
||||
mov r4, r4, lsl #2 /* with the earlier bic instructions */
|
||||
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
|
||||
/* tmp10, tmp12 are in r8, r4 */
|
||||
mov r10, r10, asr #11
|
||||
mov r14, r14, asr #11
|
||||
/* "lsl #5" places bits 11..26 of the column-1 value in the top halfword,
   i.e. the same 16 bits an "asr #11" would leave in the bottom halfword */
pkhbt r5, r10, r5, lsl #5 /* parallel tmp2 */
|
||||
pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
|
||||
sadd16 r10, r8, r5 /* r10 = o0 (both columns) */
|
||||
ssub16 r5, r8, r5 /* r5 = o3 */
|
||||
sadd16 r14, r4, r6 /* r14 = o1 */
|
||||
ssub16 r6, r4, r6 /* r6 = o2 */
|
||||
str r10, [r0]
|
||||
str r5, [r0, #48]
|
||||
str r14, [r0, #16]
|
||||
str r6, [r0, #32]
|
||||
add r0, r0, #4 /* advance two columns */
|
||||
cmp r0, r1
|
||||
bcc 1b
|
||||
ldmia sp!, { r4-r10, pc }
|
||||
#endif
|
||||
.size jpeg_idct4v, .-jpeg_idct4v
|
||||
|
||||
/*
 * jpeg_idct4h: horizontal-pass 4-point IDCT with descale, level shift and
 * range limiting to 0..255.
 *
 * C prototype (declared extern in jpeg_load.c):
 *     void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end,
 *                      int rowstep);
 * so r0 = ws, r1 = out, r2 = end, r3 = rowstep.
 *
 * Each iteration consumes one workspace row: the inputs d0..d3 are the four
 * halfwords at byte offsets 0, 2, 4 and 6 from r0. r0 then advances by 16
 * bytes (one row of 8 int16_t), r1 advances by rowstep, and the loop ends
 * when r0 == r2.
 *
 * The four result bytes o0..o3 are stored 4 bytes apart when HAVE_LCD_COLOR
 * is defined and 1 byte apart otherwise.
 * NOTE(review): this hard-codes the pixel size that the C code expresses as
 * JPEG_PIX_SZ - confirm sizeof(jpeg_pix_t) == 4 on colour ARM targets.
 *
 * Constants:
 *   4112       = 4096 + 16, added to d0: 4096 is the pre-added output offset
 *                (becomes +128 after the final shift) and 16 is the rounding
 *                term for the final "asr #18".
 *   4433, 6270, 15137
 *              = the multipliers the C code names FIX_0_541196100,
 *                FIX_0_765366865 and FIX_1_847759065.
 *   3302955134 = 0xC4DF187E = (-15137 << 16) | 6270, packed for the SMLAxy
 *                halfword multiplies.
 */
jpeg_idct4h:
#if ARM_ARCH < 5
    /* ARMv4: multiply by 4433 is built from shifts, clamp done by hand */
    stmdb  sp!, { r4-r10, lr }
    ldr    r10, =-15137
    ldr    r14, =4112
    ldr    r12, =6270
1:
    ldrsh  r4,  [r0]
    ldrsh  r6,  [r0, #4]
    ldrsh  r7,  [r0, #6]
    ldrsh  r5,  [r0, #2]
    add    r4,  r4,  r14         /* d0 += offset + rounding */
    add    r8,  r4,  r6          /* r8 = tmp10 >> 13 = d0 + d2 */
    sub    r4,  r4,  r6          /* r4 = tmp12 >> 13 = d0 - d2 */
    add    r6,  r5,  r7          /* r6 = z1 = d1 + d3 */
    add    r9,  r6,  r6, lsl #3  /* r9 = z1 * 9 */
    rsb    r6,  r6,  r9, lsl #4  /* r6 = z1 * 143 */
    rsb    r6,  r6,  r6, lsl #5  /* z1 *= 4433 (143 * 31) */
    mla    r7,  r10, r7,  r6     /* r7 = tmp0 = z1 - d3 * 15137 */
    mla    r5,  r12, r5,  r6     /* r5 = tmp2 = z1 + d1 * 6270 */
    add    r9,  r5,  r8, lsl #13 /* r9 = o0 = tmp10 + tmp2 */
    rsb    r5,  r5,  r8, lsl #13 /* r5 = o3 = tmp10 - tmp2 */
    add    r8,  r7,  r4, lsl #13 /* r8 = o1 = tmp12 + tmp0 */
    rsb    r4,  r7,  r4, lsl #13 /* r4 = o2 = tmp12 - tmp0 */
    mov    r9,  r9,  asr #18     /* descale */
    mov    r8,  r8,  asr #18
    mov    r4,  r4,  asr #18
    mov    r5,  r5,  asr #18
    /* range limit: an unsigned compare catches both < 0 and > 255; the
       replacement value is 0 for negative inputs and ~0 (stored as 0xff)
       for positive ones */
    cmp    r9,  #255
    mvnhi  r9,  r9,  asr #31
    cmp    r8,  #255
    mvnhi  r8,  r8,  asr #31
    cmp    r4,  #255
    mvnhi  r4,  r4,  asr #31
    cmp    r5,  #255
    mvnhi  r5,  r5,  asr #31
#ifdef HAVE_LCD_COLOR
    strb   r9,  [r1]
    strb   r8,  [r1, #4]
    strb   r4,  [r1, #8]
    strb   r5,  [r1, #12]
#else
    strb   r9,  [r1]
    strb   r8,  [r1, #1]
    strb   r4,  [r1, #2]
    strb   r5,  [r1, #3]
#endif
    add    r0,  r0,  #16         /* next workspace row */
    add    r1,  r1,  r3          /* next output row */
    teq    r0,  r2
    bne    1b
    ldmia  sp!, { r4-r10, pc }
#elif ARM_ARCH < 6
    /* ARMv5E: halfword multiply-accumulate, clamp done by hand */
    stmdb  sp!, { r4-r10, lr }
    ldr    r10, =4433
    ldr    r14, =4112
    ldr    r12, =3302955134      /* (-15137 << 16) | 6270 */
1:
    ldrsh  r7,  [r0, #6]
    ldrsh  r5,  [r0, #2]
    ldrsh  r4,  [r0]
    ldrsh  r6,  [r0, #4]
    add    r8,  r5,  r7          /* r8 = z1 = d1 + d3 */
    add    r4,  r4,  r14         /* d0 += offset + rounding */
    smulbb r8,  r10, r8          /* z1 *= 4433 */
    add    r9,  r4,  r6          /* r9 = tmp10 >> 13 = d0 + d2 */
    smlabb r5,  r12, r5,  r8     /* r5 = tmp2 = z1 + d1 * 6270 */
    smlatb r7,  r12, r7,  r8     /* r7 = tmp0 = z1 - d3 * 15137 */
    /* d0 lives in r4; r5 was overwritten with tmp2 two instructions ago,
       so it must not be used as the minuend here */
    sub    r4,  r4,  r6          /* r4 = tmp12 >> 13 = d0 - d2 */
    add    r6,  r5,  r9, lsl #13 /* r6 = o0 = tmp10 + tmp2 */
    rsb    r9,  r5,  r9, lsl #13 /* r9 = o3 = tmp10 - tmp2 */
    add    r5,  r7,  r4, lsl #13 /* r5 = o1 = tmp12 + tmp0 */
    rsb    r4,  r7,  r4, lsl #13 /* r4 = o2 = tmp12 - tmp0 */
    mov    r6,  r6,  asr #18     /* descale */
    mov    r5,  r5,  asr #18
    mov    r4,  r4,  asr #18
    mov    r9,  r9,  asr #18
    /* range limit to 0..255 (see the ARMv4 variant for how this works) */
    cmp    r6,  #255
    mvnhi  r6,  r6,  asr #31
    cmp    r5,  #255
    mvnhi  r5,  r5,  asr #31
    cmp    r4,  #255
    mvnhi  r4,  r4,  asr #31
    cmp    r9,  #255
    mvnhi  r9,  r9,  asr #31
#ifdef HAVE_LCD_COLOR
    strb   r6,  [r1]
    strb   r5,  [r1, #4]
    strb   r4,  [r1, #8]
    strb   r9,  [r1, #12]
#else
    strb   r6,  [r1]
    strb   r5,  [r1, #1]
    strb   r4,  [r1, #2]
    strb   r9,  [r1, #3]
#endif
    add    r0,  r0,  #16         /* next workspace row */
    add    r1,  r1,  r3          /* next output row */
    teq    r0,  r2
    bne    1b
    ldmia  sp!, { r4-r10, pc }
#else
    /* ARMv6: the row is loaded as two packed words (r4 = d1:d0, r5 = d3:d2)
       and the even/odd sums are formed with one parallel add; usat does the
       descale and range limit in a single instruction */
    stmdb  sp!, { r4-r9, lr }
    ldr    r9,  =4433
    ldr    r14, =4112
    ldr    r12, =3302955134      /* (-15137 << 16) | 6270 */
1:
    ldmia  r0,  { r4-r5 }
    sadd16 r4,  r4,  r14         /* d0 += offset + rounding (top half += 0) */
    sadd16 r6,  r4,  r5          /* r6lo = d0 + d2, r6hi = d1 + d3 */
    ssub16 r7,  r4,  r5          /* r7lo = d0 - d2 */
    smulbt r8,  r9,  r6          /* r8 = z1 = (d1 + d3) * 4433 */
    sxth   r6,  r6               /* r6 = tmp10 >> 13 */
    smlabt r4,  r12, r4,  r8     /* r4 = tmp2 = z1 + d1 * 6270 */
    smlatt r5,  r12, r5,  r8     /* r5 = tmp0 = z1 - d3 * 15137 */
    sxth   r7,  r7               /* r7 = tmp12 >> 13 */
    add    r8,  r4,  r6, lsl #13 /* r8 = o0 = tmp10 + tmp2 */
    rsb    r6,  r4,  r6, lsl #13 /* r6 = o3 = tmp10 - tmp2 */
    add    r4,  r5,  r7, lsl #13 /* r4 = o1 = tmp12 + tmp0 */
    rsb    r5,  r5,  r7, lsl #13 /* r5 = o2 = tmp12 - tmp0 */
    usat   r8,  #8,  r8, asr #18 /* descale and clamp to 0..255 */
    usat   r6,  #8,  r6, asr #18
    usat   r4,  #8,  r4, asr #18
    usat   r5,  #8,  r5, asr #18
#ifdef HAVE_LCD_COLOR
    strb   r8,  [r1]
    strb   r6,  [r1, #12]
    strb   r4,  [r1, #4]
    strb   r5,  [r1, #8]
#else
    strb   r8,  [r1]
    strb   r6,  [r1, #3]
    strb   r4,  [r1, #1]
    strb   r5,  [r1, #2]
#endif
    add    r0,  r0,  #16         /* next workspace row */
    add    r1,  r1,  r3          /* next output row */
    teq    r0,  r2
    bne    1b
    ldmia  sp!, { r4-r9, pc }
#endif
    .size jpeg_idct4h, .-jpeg_idct4h
|
|
@ -31,6 +31,7 @@
|
|||
#include "debug.h"
|
||||
#include "jpeg_load.h"
|
||||
/*#define JPEG_BS_DEBUG*/
|
||||
#define ROCKBOX_DEBUG_JPEG
|
||||
/* for portability of below JPEG code */
|
||||
#define MEMSET(p,v,c) memset(p,v,c)
|
||||
#define MEMCPY(d,s,c) memcpy(d,s,c)
|
||||
|
@ -49,7 +50,23 @@ typedef struct uint8_rgb jpeg_pix_t;
|
|||
#else
|
||||
typedef uint8_t jpeg_pix_t;
|
||||
#endif
|
||||
#define JPEG_IDCT_TRANSPOSE
|
||||
#define JPEG_PIX_SZ (sizeof(jpeg_pix_t))
|
||||
#ifdef HAVE_LCD_COLOR
|
||||
#define COLOR_EXTRA_IDCT_WS 64
|
||||
#else
|
||||
#define COLOR_EXTRA_IDCT_WS 0
|
||||
#endif
|
||||
#ifdef JPEG_IDCT_TRANSPOSE
|
||||
#define V_OUT(n) ws2[8*n]
|
||||
#define V_IN_ST 1
|
||||
#define TRANSPOSE_EXTRA_IDCT_WS 64
|
||||
#else
|
||||
#define V_OUT(n) ws[8*n]
|
||||
#define V_IN_ST 8
|
||||
#define TRANSPOSE_EXTRA_IDCT_WS 0
|
||||
#endif
|
||||
#define IDCT_WS_SIZE (64 + TRANSPOSE_EXTRA_IDCT_WS + COLOR_EXTRA_IDCT_WS)
|
||||
|
||||
/* This can't be in jpeg_load.h because plugin.h includes it, and it conflicts
|
||||
* with the definition in jpeg_decoder.h
|
||||
|
@ -259,7 +276,7 @@ INLINE unsigned range_limit(int value)
|
|||
*/
|
||||
|
||||
/* horizontal-pass 1-point IDCT */
|
||||
static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
{
|
||||
for (; ws < end; ws += 8)
|
||||
{
|
||||
|
@ -269,19 +286,19 @@ static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
|||
}
|
||||
|
||||
/* vertical-pass 2-point IDCT */
|
||||
static void idct2v(int16_t *ws, int16_t *end)
|
||||
static void jpeg_idct2v(int16_t *ws, int16_t *end)
|
||||
{
|
||||
for (; ws < end; ws++)
|
||||
{
|
||||
int tmp1 = ws[0];
|
||||
int tmp2 = ws[8];
|
||||
ws[0] = tmp1 + tmp2;
|
||||
ws[8] = tmp1 - tmp2;
|
||||
int tmp1 = ws[0*8];
|
||||
int tmp2 = ws[1*8];
|
||||
ws[0*8] = tmp1 + tmp2;
|
||||
ws[1*8] = tmp1 - tmp2;
|
||||
}
|
||||
}
|
||||
|
||||
/* horizontal-pass 2-point IDCT */
|
||||
static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
static void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
{
|
||||
for (; ws < end; ws += 8, out += rowstep)
|
||||
{
|
||||
|
@ -295,69 +312,12 @@ static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef CPU_ARM
|
||||
/* vertical-pass 4-point IDCT */
|
||||
static void idct4v(int16_t *ws, int16_t *end)
|
||||
static void jpeg_idct4v(int16_t *ws, int16_t *end)
|
||||
{
|
||||
for (; ws < end; ws++)
|
||||
{
|
||||
#if defined(CPU_ARM)
|
||||
int t0, t1, t2, t3, t4;
|
||||
#if ARM_ARCH <= 4
|
||||
int t5;
|
||||
#endif
|
||||
asm volatile(
|
||||
"ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[8*0]) */
|
||||
"ldrsh %[t1], [%[ws], #32]\n\t" /* t1 = tmp2 (ws[8*2]) */
|
||||
"ldrsh %[t2], [%[ws], #16]\n\t" /* t2 = z2 (ws[8*1]) */
|
||||
"add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 2
|
||||
(tmp0 + tmp2) */
|
||||
"sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 2
|
||||
(tmp0 - tmp2) */
|
||||
"ldrsh %[t3], [%[ws], #48]\n\t" /* t3 = z3 (ws[8*3] */
|
||||
"add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
|
||||
#if ARM_ARCH > 4
|
||||
"smulbb %[t4], %[c1], %[t4]\n\t"
|
||||
"add %[t4], %[t4], #1024\n\t" /* t4 = z1 */
|
||||
"smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
|
||||
"smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
|
||||
"mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
|
||||
"mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
|
||||
#else
|
||||
"add %[t5], %[t4], %[t4], lsl #3\n\t"
|
||||
"rsb %[t4], %[t4], %[t5], lsl #4\n\t"
|
||||
"rsb %[t4], %[t4], %[t4], lsl #5\n\t"
|
||||
"add %[t4], %[t4], #1024\n\t" /*z1*/
|
||||
"mla %[t3], %[c2], %[t3], %[t4]\n\t"
|
||||
"mla %[t2], %[c3], %[t2], %[t4]\n\t"
|
||||
"mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
|
||||
"mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
|
||||
#endif
|
||||
"add %[t4], %[t2], %[t0], lsl #2\n\t" /* t4 = tmp10 + tmp2 */
|
||||
"rsb %[t0], %[t2], %[t0], lsl #2\n\t" /* t0 = tmp10 - tmp2 */
|
||||
"add %[t2], %[t3], %[t1], lsl #2\n\t" /* t2 = tmp12 + tmp0 */
|
||||
"rsb %[t3], %[t3], %[t1], lsl #2\n\t" /* t3 = tmp12 - tmp0 */
|
||||
"strh %[t4], [%[ws]]\n\t"
|
||||
"strh %[t0], [%[ws], #48]\n\t"
|
||||
"strh %[t2], [%[ws], #16]\n\t"
|
||||
"strh %[t3], [%[ws], #32]\n\t"
|
||||
: [t0] "=&r" (t0),
|
||||
[t1] "=&r" (t1),
|
||||
[t2] "=&r" (t2),
|
||||
[t3] "=&r" (t3),
|
||||
[t4] "=&r" (t4)
|
||||
#if ARM_ARCH <= 4
|
||||
,[t5] "=&r" (t5)
|
||||
#endif
|
||||
: [ws] "r" (ws),
|
||||
#if ARM_ARCH > 4
|
||||
[c1] "r" (FIX_0_541196100),
|
||||
[c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
|
||||
#else
|
||||
[c2] "r" (-FIX_1_847759065),
|
||||
[c3] "r" (FIX_0_765366865)
|
||||
#endif
|
||||
);
|
||||
#else
|
||||
int tmp0, tmp2, tmp10, tmp12;
|
||||
int z1, z2, z3;
|
||||
/* Even part */
|
||||
|
@ -382,93 +342,18 @@ static void idct4v(int16_t *ws, int16_t *end)
|
|||
CONST_BITS-PASS1_BITS);
|
||||
|
||||
/* Final output stage */
|
||||
|
||||
ws[8*0] = (int) (tmp10 + tmp2);
|
||||
ws[8*3] = (int) (tmp10 - tmp2);
|
||||
ws[8*1] = (int) (tmp12 + tmp0);
|
||||
ws[8*2] = (int) (tmp12 - tmp0);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
/* horizontal-pass 4-point IDCT */
|
||||
static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
{
|
||||
for (; ws < end; out += rowstep, ws += 8)
|
||||
{
|
||||
#if defined(CPU_ARM)
|
||||
int t0, t1, t2, t3, t4;
|
||||
#if ARM_ARCH <= 4
|
||||
int t5;
|
||||
#endif
|
||||
asm volatile(
|
||||
"ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[0]) */
|
||||
"ldrsh %[t1], [%[ws], #4]\n\t" /* t1 = tmp2 (ws[2]) */
|
||||
"add %[t4], %[t4], #16\n\t" /* add rounding to DC */
|
||||
"add %[t4], %[t4], #4096\n\t" /* pre-add offset */
|
||||
"ldrsh %[t2], [%[ws], #2]\n\t" /* t2 = z2 (ws[1]) */
|
||||
"add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 13
|
||||
(tmp0 + tmp2) */
|
||||
"sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 13
|
||||
(tmp0 - tmp2) */
|
||||
"ldrsh %[t3], [%[ws], #6]\n\t" /* t3 = z3 (ws[3] */
|
||||
"add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
|
||||
#if ARM_ARCH > 4
|
||||
"smulbb %[t4], %[c1], %[t4]\n\t"
|
||||
"smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
|
||||
"smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
|
||||
#else
|
||||
"add %[t5], %[t4], %[t4], lsl #3\n\t"
|
||||
"rsb %[t4], %[t4], %[t5], lsl #4\n\t"
|
||||
"rsb %[t4], %[t4], %[t4], lsl #5\n\t" /* t4 = z1 */
|
||||
"mla %[t3], %[c2], %[t3], %[t4]\n\t"
|
||||
"mla %[t2], %[c3], %[t2], %[t4]\n\t"
|
||||
#endif
|
||||
"add %[t4], %[t2], %[t0], lsl #13\n\t" /* t4 = tmp10 + tmp2 */
|
||||
"rsb %[t0], %[t2], %[t0], lsl #13\n\t" /* t0 = tmp10 - tmp2 */
|
||||
"add %[t2], %[t3], %[t1], lsl #13\n\t" /* t2 = tmp12 + tmp0 */
|
||||
"rsb %[t3], %[t3], %[t1], lsl #13\n\t" /* t3 = tmp12 - tmp0 */
|
||||
"mov %[t4], %[t4], asr #18\n\t" /* descale results */
|
||||
"mov %[t0], %[t0], asr #18\n\t"
|
||||
"mov %[t2], %[t2], asr #18\n\t"
|
||||
"mov %[t3], %[t3], asr #18\n\t"
|
||||
"cmp %[t4], #255\n\t" /* range limit results */
|
||||
"mvnhi %[t4], %[t4], asr #31\n\t"
|
||||
"cmp %[t0], #255\n\t"
|
||||
"mvnhi %[t0], %[t0], asr #31\n\t"
|
||||
"cmp %[t2], #255\n\t"
|
||||
"mvnhi %[t2], %[t2], asr #31\n\t"
|
||||
"cmp %[t3], #255\n\t"
|
||||
"mvnhi %[t3], %[t3], asr #31\n\t"
|
||||
"cmp %[t4], #255\n\t"
|
||||
"mvnhi %[t4], %[t4], asr #31\n\t"
|
||||
"strb %[t4], [%[out]]\n\t"
|
||||
"strb %[t0], [%[out], %[o3]]\n\t"
|
||||
"strb %[t2], [%[out], %[o1]]\n\t"
|
||||
"strb %[t3], [%[out], %[o2]]\n\t"
|
||||
: [t0] "=&r" (t0),
|
||||
[t1] "=&r" (t1),
|
||||
[t2] "=&r" (t2),
|
||||
[t3] "=&r" (t3),
|
||||
[t4] "=&r" (t4)
|
||||
#if ARM_ARCH <= 4
|
||||
|
||||
,[t5] "=&r" (t5)
|
||||
#endif
|
||||
: [ws] "r" (ws),
|
||||
[out] "r" (out),
|
||||
[o1] "i" (JPEG_PIX_SZ),
|
||||
[o2] "i" (JPEG_PIX_SZ*2),
|
||||
[o3] "i" (JPEG_PIX_SZ*3),
|
||||
#if ARM_ARCH > 4
|
||||
[c1] "r" (FIX_0_541196100),
|
||||
[c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
|
||||
#else
|
||||
[c2] "r" (-FIX_1_847759065),
|
||||
[c3] "r" (FIX_0_765366865)
|
||||
#endif
|
||||
);
|
||||
#else
|
||||
int tmp0, tmp2, tmp10, tmp12;
|
||||
int z1, z2, z3;
|
||||
/* Even part */
|
||||
|
@ -500,18 +385,27 @@ static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
|||
DS_OUT));
|
||||
out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0,
|
||||
DS_OUT));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#else
|
||||
extern void jpeg_idct4v(int16_t *ws, int16_t *end);
|
||||
extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
|
||||
#endif
|
||||
|
||||
/* vertical-pass 8-point IDCT */
|
||||
static void idct8v(int16_t *ws, int16_t *end)
|
||||
static void jpeg_idct8v(int16_t *ws, int16_t *end)
|
||||
{
|
||||
long tmp0, tmp1, tmp2, tmp3;
|
||||
long tmp10, tmp11, tmp12, tmp13;
|
||||
long z1, z2, z3, z4, z5;
|
||||
#ifdef JPEG_IDCT_TRANSPOSE
|
||||
int16_t *ws2 = ws + 64;
|
||||
for (; ws < end; ws += 8, ws2++)
|
||||
{
|
||||
#else
|
||||
for (; ws < end; ws++)
|
||||
{
|
||||
#endif
|
||||
/* Due to quantization, we will usually find that many of the input
|
||||
* coefficients are zero, especially the AC terms. We can exploit this
|
||||
* by short-circuiting the IDCT calculation for any column in which all
|
||||
|
@ -520,30 +414,30 @@ static void idct8v(int16_t *ws, int16_t *end)
|
|||
* With typical images and quantization tables, half or more of the
|
||||
* column DCT calculations can be simplified this way.
|
||||
*/
|
||||
if ((ws[8*1] | ws[8*2] | ws[8*3]
|
||||
| ws[8*4] | ws[8*5] | ws[8*6] | ws[8*7]) == 0)
|
||||
if ((ws[V_IN_ST*1] | ws[V_IN_ST*2] | ws[V_IN_ST*3]
|
||||
| ws[V_IN_ST*4] | ws[V_IN_ST*5] | ws[V_IN_ST*6] | ws[V_IN_ST*7]) == 0)
|
||||
{
|
||||
/* AC terms all zero */
|
||||
int dcval = ws[8*0] << PASS1_BITS;
|
||||
int dcval = ws[V_IN_ST*0] << PASS1_BITS;
|
||||
|
||||
ws[8*0] = ws[8*1] = ws[8*2] = ws[8*3] = ws[8*4]
|
||||
= ws[8*5] = ws[8*6] = ws[8*7] = dcval;
|
||||
V_OUT(0) = V_OUT(1) = V_OUT(2) = V_OUT(3) = V_OUT(4) = V_OUT(5) =
|
||||
V_OUT(6) = V_OUT(7) = dcval;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Even part: reverse the even part of the forward DCT. */
|
||||
/* The rotator is sqrt(2)*c(-6). */
|
||||
|
||||
z2 = ws[8*2];
|
||||
z3 = ws[8*6];
|
||||
z2 = ws[V_IN_ST*2];
|
||||
z3 = ws[V_IN_ST*6];
|
||||
|
||||
z1 = MULTIPLY16(z2 + z3, FIX_0_541196100);
|
||||
tmp2 = z1 + MULTIPLY16(z3, - FIX_1_847759065);
|
||||
tmp3 = z1 + MULTIPLY16(z2, FIX_0_765366865);
|
||||
|
||||
z2 = ws[8*0] << CONST_BITS;
|
||||
z2 = ws[V_IN_ST*0] << CONST_BITS;
|
||||
z2 += ONE << (CONST_BITS - PASS1_BITS - 1);
|
||||
z3 = ws[8*4] << CONST_BITS;
|
||||
z3 = ws[V_IN_ST*4] << CONST_BITS;
|
||||
|
||||
tmp0 = (z2 + z3);
|
||||
tmp1 = (z2 - z3);
|
||||
|
@ -556,10 +450,10 @@ static void idct8v(int16_t *ws, int16_t *end)
|
|||
/* Odd part per figure 8; the matrix is unitary and hence its
|
||||
transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */
|
||||
|
||||
tmp0 = ws[8*7];
|
||||
tmp1 = ws[8*5];
|
||||
tmp2 = ws[8*3];
|
||||
tmp3 = ws[8*1];
|
||||
tmp0 = ws[V_IN_ST*7];
|
||||
tmp1 = ws[V_IN_ST*5];
|
||||
tmp2 = ws[V_IN_ST*3];
|
||||
tmp3 = ws[V_IN_ST*1];
|
||||
|
||||
z1 = tmp0 + tmp3;
|
||||
z2 = tmp1 + tmp2;
|
||||
|
@ -586,19 +480,19 @@ static void idct8v(int16_t *ws, int16_t *end)
|
|||
|
||||
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
|
||||
|
||||
ws[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
|
||||
ws[8*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
|
||||
ws[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
|
||||
ws[8*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
|
||||
ws[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
|
||||
ws[8*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
|
||||
ws[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
|
||||
ws[8*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(0) = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(7) = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(1) = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(6) = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(2) = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(5) = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(3) = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(4) = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
|
||||
}
|
||||
}
|
||||
|
||||
/* horizontal-pass 8-point IDCT */
|
||||
static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
{
|
||||
long tmp0, tmp1, tmp2, tmp3;
|
||||
long tmp10, tmp11, tmp12, tmp13;
|
||||
|
@ -709,20 +603,26 @@ static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
|||
|
||||
#ifdef HAVE_LCD_COLOR
|
||||
/* vertical-pass 16-point IDCT */
|
||||
static void idct16v(int16_t *ws, int16_t *end)
|
||||
static void jpeg_idct16v(int16_t *ws, int16_t *end)
|
||||
{
|
||||
long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
|
||||
long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
|
||||
long z1, z2, z3, z4;
|
||||
#ifdef JPEG_IDCT_TRANSPOSE
|
||||
int16_t *ws2 = ws + 64;
|
||||
for (; ws < end; ws += 8, ws2++)
|
||||
{
|
||||
#else
|
||||
for (; ws < end; ws++)
|
||||
{
|
||||
#endif
|
||||
/* Even part */
|
||||
|
||||
tmp0 = ws[8*0] << CONST_BITS;
|
||||
tmp0 = ws[V_IN_ST*0] << CONST_BITS;
|
||||
/* Add fudge factor here for final descale. */
|
||||
tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
|
||||
|
||||
z1 = ws[8*4];
|
||||
z1 = ws[V_IN_ST*4];
|
||||
tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
|
||||
tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
|
||||
|
||||
|
@ -731,8 +631,8 @@ static void idct16v(int16_t *ws, int16_t *end)
|
|||
tmp12 = tmp0 + tmp2;
|
||||
tmp13 = tmp0 - tmp2;
|
||||
|
||||
z1 = ws[8*2];
|
||||
z2 = ws[8*6];
|
||||
z1 = ws[V_IN_ST*2];
|
||||
z2 = ws[V_IN_ST*6];
|
||||
z3 = z1 - z2;
|
||||
z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
|
||||
z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
|
||||
|
@ -757,10 +657,10 @@ static void idct16v(int16_t *ws, int16_t *end)
|
|||
|
||||
/* Odd part */
|
||||
|
||||
z1 = ws[8*1];
|
||||
z2 = ws[8*3];
|
||||
z3 = ws[8*5];
|
||||
z4 = ws[8*7];
|
||||
z1 = ws[V_IN_ST*1];
|
||||
z2 = ws[V_IN_ST*3];
|
||||
z3 = ws[V_IN_ST*5];
|
||||
z4 = ws[V_IN_ST*7];
|
||||
|
||||
tmp11 = z1 + z3;
|
||||
|
||||
|
@ -795,27 +695,27 @@ static void idct16v(int16_t *ws, int16_t *end)
|
|||
tmp11 += z2;
|
||||
|
||||
/* Final output stage */
|
||||
ws[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
|
||||
ws[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
|
||||
ws[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
|
||||
ws[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
|
||||
ws[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
|
||||
ws[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
|
||||
ws[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
|
||||
ws[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
|
||||
ws[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
|
||||
ws[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
|
||||
ws[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
|
||||
ws[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
|
||||
ws[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
|
||||
ws[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
|
||||
ws[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
|
||||
ws[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(0) = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(15) = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(1) = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(14) = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(2) = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(13) = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(3) = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(12) = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(4) = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(11) = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(5) = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(10) = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(6) = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(9) = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(7) = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
|
||||
V_OUT(8) = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
|
||||
}
|
||||
}
|
||||
|
||||
/* horizontal-pass 16-point IDCT */
|
||||
static void idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
static void jpeg_idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
|
||||
{
|
||||
long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
|
||||
long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
|
||||
|
@ -946,12 +846,12 @@ struct idct_entry {
|
|||
};
|
||||
|
||||
struct idct_entry idct_tbl[] = {
|
||||
{ PASS1_BITS, NULL, idct1h },
|
||||
{ PASS1_BITS, idct2v, idct2h },
|
||||
{ 0, idct4v, idct4h },
|
||||
{ 0, idct8v, idct8h },
|
||||
{ PASS1_BITS, NULL, jpeg_idct1h },
|
||||
{ PASS1_BITS, jpeg_idct2v, jpeg_idct2h },
|
||||
{ 0, jpeg_idct4v, jpeg_idct4h },
|
||||
{ 0, jpeg_idct8v, jpeg_idct8h },
|
||||
#ifdef HAVE_LCD_COLOR
|
||||
{ 0, idct16v, idct16h },
|
||||
{ 0, jpeg_idct16v, jpeg_idct16h },
|
||||
#endif
|
||||
};
|
||||
|
||||
|
@ -1468,21 +1368,27 @@ static void fix_huff_tbl(int* htbl, struct derived_tbl* dtbl)
|
|||
}
|
||||
|
||||
|
||||
/* zag[i] is the natural-order position of the i'th element of zigzag order.
|
||||
* If the incoming data is corrupted, decode_mcu could attempt to
|
||||
* reference values beyond the end of the array. To avoid a wild store,
|
||||
* we put some extra zeroes after the real entries.
|
||||
*/
|
||||
/* zag[i] is the natural-order position of the i'th element of zigzag order. */
|
||||
static const unsigned char zag[] =
|
||||
{
|
||||
0, 1, 8, 16, 9, 2, 3, 10,
|
||||
17, 24, 32, 25, 18, 11, 4, 5,
|
||||
12, 19, 26, 33, 40, 48, 41, 34,
|
||||
27, 20, 13, 6, 7, 14, 21, 28,
|
||||
35, 42, 49, 56, 57, 50, 43, 36,
|
||||
29, 22, 15, 23, 30, 37, 44, 51,
|
||||
58, 59, 52, 45, 38, 31, 39, 46,
|
||||
53, 60, 61, 54, 47, 55, 62, 63,
|
||||
#ifdef JPEG_IDCT_TRANSPOSE
|
||||
0, 8, 1, 2, 9, 16, 24, 17,
|
||||
10, 3, 4, 11, 18, 25, 32, 40,
|
||||
33, 26, 19, 12, 5, 6, 13, 20,
|
||||
27, 34, 41, 48, 56, 49, 42, 35,
|
||||
28, 21, 14, 7, 15, 22, 29, 36,
|
||||
43, 50, 57, 58, 51, 44, 37, 30,
|
||||
23, 31, 38, 45, 52, 59, 60, 53,
|
||||
46, 39, 47, 54, 61, 62, 55, 63,
|
||||
#endif
|
||||
0, 1, 8, 16, 9, 2, 3, 10,
|
||||
17, 24, 32, 25, 18, 11, 4, 5,
|
||||
12, 19, 26, 33, 40, 48, 41, 34,
|
||||
27, 20, 13, 6, 7, 14, 21, 28,
|
||||
35, 42, 49, 56, 57, 50, 43, 36,
|
||||
29, 22, 15, 23, 30, 37, 44, 51,
|
||||
58, 59, 52, 45, 38, 31, 39, 46,
|
||||
53, 60, 61, 54, 47, 55, 62, 63,
|
||||
};
|
||||
|
||||
/* zig[i] is the zig-zag order position of the i'th element of natural
|
||||
|
@ -1898,17 +1804,20 @@ static struct img_part *store_row_jpeg(void *jpeg_args)
|
|||
store_offs[p_jpeg->store_pos[1]] = JPEG_PIX_SZ << p_jpeg->h_scale[0];
|
||||
store_offs[p_jpeg->store_pos[2]] = b_width << p_jpeg->v_scale[0];
|
||||
store_offs[p_jpeg->store_pos[3]] = store_offs[1] + store_offs[2];
|
||||
|
||||
int16_t block[128]; /* decoded DCT coefficients */
|
||||
/* decoded DCT coefficients */
|
||||
int16_t block[IDCT_WS_SIZE] __attribute__((aligned(8)));
|
||||
for (x = 0; x < p_jpeg->x_mbl; x++)
|
||||
{
|
||||
int blkn;
|
||||
for (blkn = 0; blkn < p_jpeg->blocks; blkn++)
|
||||
{
|
||||
int k = 1; /* coefficient index */
|
||||
int s, r; /* huffman values */
|
||||
int ci = p_jpeg->mcu_membership[blkn]; /* component index */
|
||||
int ti = p_jpeg->tab_membership[blkn]; /* table index */
|
||||
#ifdef JPEG_IDCT_TRANSPOSE
|
||||
bool transpose = p_jpeg->v_scale[!!ci] > 2;
|
||||
#endif
|
||||
int k = 1; /* coefficient index */
|
||||
int s, r; /* huffman values */
|
||||
struct derived_tbl* dctbl = &p_jpeg->dc_derived_tbls[ti];
|
||||
struct derived_tbl* actbl = &p_jpeg->ac_derived_tbls[ti];
|
||||
|
||||
|
@ -1948,7 +1857,11 @@ static struct img_part *store_row_jpeg(void *jpeg_args)
|
|||
r = get_bits(p_jpeg, s);
|
||||
r = HUFF_EXTEND(r, s);
|
||||
r = MULTIPLY16(r, p_jpeg->quanttable[!!ci][k]);
|
||||
#ifdef JPEG_IDCT_TRANSPOSE
|
||||
block[zag[transpose ? k : k + 64]] = r ;
|
||||
#else
|
||||
block[zag[k]] = r ;
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1988,10 +1901,19 @@ block_end:
|
|||
int idct_rows = BIT_N(p_jpeg->v_scale[!!ci]);
|
||||
unsigned char *b_out = out + (ci ? ci : store_offs[blkn]);
|
||||
if (idct_tbl[p_jpeg->v_scale[!!ci]].v_idct)
|
||||
#ifdef JPEG_IDCT_TRANSPOSE
|
||||
idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
|
||||
transpose ? block + 8 * idct_cols
|
||||
: block + idct_cols);
|
||||
uint16_t * h_block = transpose ? block + 64 : block;
|
||||
idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(h_block, b_out,
|
||||
h_block + idct_rows * 8, b_width);
|
||||
#else
|
||||
idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
|
||||
block + idct_cols);
|
||||
idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(block, b_out,
|
||||
block + idct_rows * 8, b_width);
|
||||
#endif
|
||||
}
|
||||
} /* for blkn */
|
||||
/* don't starve other threads while an MCU row decodes */
|
||||
|
@ -2048,7 +1970,6 @@ int read_jpeg_file(const char* filename,
|
|||
{
|
||||
int fd, ret;
|
||||
fd = open(filename, O_RDONLY);
|
||||
|
||||
JDEBUGF("read_jpeg_file: filename: %s buffer len: %d cformat: %p\n",
|
||||
filename, maxsize, cformat);
|
||||
/* Exit if file opening failed */
|
||||
|
@ -2181,14 +2102,22 @@ int read_jpeg_fd(int fd,
|
|||
int decode_h = BIT_N(p_jpeg->v_scale[0]) - 1;
|
||||
src_dim.width = (p_jpeg->x_size << p_jpeg->h_scale[0]) >> 3;
|
||||
src_dim.height = (p_jpeg->y_size << p_jpeg->v_scale[0]) >> 3;
|
||||
p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
|
||||
p_jpeg->k_need[0] = zig[p_jpeg->zero_need[0]];
|
||||
#ifdef JPEG_IDCT_TRANSPOSE
|
||||
if (p_jpeg->v_scale[0] > 2)
|
||||
p_jpeg->zero_need[0] = (decode_w << 3) + decode_h;
|
||||
else
|
||||
#endif
|
||||
p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
|
||||
p_jpeg->k_need[0] = zig[(decode_h << 3) + decode_w];
|
||||
JDEBUGF("need luma components to %d\n", p_jpeg->k_need[0]);
|
||||
#ifdef HAVE_LCD_COLOR
|
||||
decode_w = BIT_N(MIN(p_jpeg->h_scale[1],3)) - 1;
|
||||
decode_h = BIT_N(MIN(p_jpeg->v_scale[1],3)) - 1;
|
||||
p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
|
||||
p_jpeg->k_need[1] = zig[p_jpeg->zero_need[1]];
|
||||
if (p_jpeg->v_scale[1] > 2)
|
||||
p_jpeg->zero_need[1] = (decode_w << 3) + decode_h;
|
||||
else
|
||||
p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
|
||||
p_jpeg->k_need[1] = zig[(decode_h << 3) + decode_w];
|
||||
JDEBUGF("need chroma components to %d\n", p_jpeg->k_need[1]);
|
||||
#endif
|
||||
if (cformat)
|
||||
|
|
Loading…
Reference in a new issue