Core JPEG decoder improvements:

For >8-point vertical IDCT, transpose the coefficients while decoding them, so that the vertical IDCT can read in rows rather than columns. This improves speed a bit for this size even using the C IDCT.
Remove inline ARM asm, replacing it with an external file containing pure asm IDCT functions.
Add jpeg_ prefix to JPEG IDCT functions since some of them will now be visible globally.


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21345 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andrew Mahone 2009-06-19 02:56:00 +00:00
parent 4c58ad26ba
commit 6a0d931f38
5 changed files with 465 additions and 219 deletions

View file

@ -104,6 +104,9 @@ recorder/resize.c
#endif
#ifdef HAVE_JPEG
recorder/jpeg_load.c
#ifdef CPU_ARM
recorder/jpeg_idct_arm.S
#endif
#endif
#ifdef HAVE_ALBUMART
recorder/albumart.c

View file

@ -27,6 +27,9 @@ playergfx.c
profile_plugin.c
#endif
#ifdef HAVE_LCD_BITMAP
#ifdef CPU_ARM
pluginlib_jpeg_idct_arm.S
#endif
pluginlib_jpeg_mem.c
pluginlib_resize.c
#ifndef HAVE_JPEG

View file

@ -0,0 +1,24 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2009 by Andrew Mahone
*
* This is a wrapper for the core jpeg_idct_arm.S
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "recorder/jpeg_idct_arm.S"

View file

@ -0,0 +1,287 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* JPEG assembly IDCT
*
* Copyright (C) 2009 Andrew Mahone asm versions of the C IDCT algorithms used
* jpeg_load.c with
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
.section .text
.align 2
.global jpeg_idct4v
.type jpeg_idct4v, %function
.global jpeg_idct4h
.type jpeg_idct4h, %function
jpeg_idct4v:
#if ARM_ARCH < 5
stmdb sp!, { r4-r7, lr }
ldr r14, =-15137
ldr r12, =6270
1:
ldrsh r4, [r0, #32]
ldrsh r2, [r0]
ldrsh r5, [r0, #48]
ldrsh r3, [r0, #16]
add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */
add r4, r3, r5 /* r4 = z1 = d1 + d3 */
add r7, r4, r4, lsl #3
rsb r4, r4, r7, lsl #4
rsb r4, r4, r4, lsl #5 /* z1 *= 4433 */
add r4, r4, #1024
mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + z2 * 6270 */
mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - z3 * 15137 */
mov r6, r6, lsl #2 /* r6 <<= 2 */
mov r2, r2, lsl #2 /* r2 <<= 2 */
add r7, r6, r3, asr #11 /* r7 = o0 */
sub r3, r6, r3, asr #11 /* r3 = o3 */
add r6, r2, r5, asr #11 /* r6 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */
strh r7, [r0]
strh r3, [r0, #48]
strh r6, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2
teq r0, r1
bne 1b
ldmia sp!, { r4-r7, pc }
#elif ARM_ARCH < 6
stmdb sp!, { r4-r8, lr }
ldr r8, =1024
ldr r14, =4433
ldr r12, =3302955134
1:
ldrsh r5, [r0, #48]
ldrsh r3, [r0, #16]
ldrsh r4, [r0, #32]
ldrsh r2, [r0]
add r6, r3, r5 /* r6 = z1 = d1 + d3 */
add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
smlabb r6, r14, r6, r8 /* z1 *= 4433 */
sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */
smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
mov r7, r7, lsl #2
mov r2, r2, lsl #2
add r4, r7, r3, asr #11 /* r4 = o0 */
sub r7, r7, r3, asr #11 /* r7 = o3 */
add r3, r2, r5, asr #11 /* r3 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */
strh r4, [r0]
strh r7, [r0, #48]
strh r3, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2
teq r0, r1
bne 1b
ldmia sp!, { r4-r8, pc }
#else
stmdb sp!, { r4-r10, lr }
ldr r2, =1024
ldr r3, =4433
ldr r12, =3302955134
1:
ldr r6, [r0, #32]
ldr r4, [r0]
ldr r7, [r0, #48]
ldr r5, [r0, #16]
/* this part is being done in parallel on two columns */
sadd16 r8, r4, r6 /* r8 = d0 + d2 */
ssub16 r4, r4, r6 /* r4 = d0 - d2 */
sadd16 r6, r5, r7 /* r6 = d1 + d3 */
/* there is no parallel shift operation, but we can fake it with bic
and lsl */
bic r8, r8, #0xc000
bic r4, r4, #0xc000
/* multiplication expands values beyond 16 bits, so this part needs to be
split. the values will be merged below so that the rest of the addition
can be done in parallel */
smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 * d3) * 4433 + 1024 */
smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 * d3) * 4433 + 1024 */
smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
mov r8, r8, lsl #2 /* complete the parallel shift started */
mov r4, r4, lsl #2 /* with the earlier bic instructions */
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
/* tmp10, tmp12 are in r4, r8 */
mov r10, r10, asr #11
mov r14, r14, asr #11
pkhbt r5, r10, r5, lsl #5 /* parallel tmp2 */
pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
sadd16 r10, r8, r5 /* d0 */
ssub16 r5, r8, r5 /* d3 */
sadd16 r14, r4, r6 /* d1 */
ssub16 r6, r4, r6 /* d2 */
str r10, [r0]
str r5, [r0, #48]
str r14, [r0, #16]
str r6, [r0, #32]
add r0, r0, #4
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r10, pc }
#endif
.size jpeg_idct4v, .-jpeg_idct4v
jpeg_idct4h:
#if ARM_ARCH < 5
stmdb sp!, { r4-r10, lr }
ldr r10, =-15137
ldr r14, =4112
ldr r12, =6270
1:
ldrsh r4, [r0]
ldrsh r6, [r0, #4]
ldrsh r7, [r0, #6]
ldrsh r5, [r0, #2]
add r4, r4, r14
add r8, r4, r6 /* r8 = tmp10 >> 2 = d0 + d2 */
sub r4, r4, r6 /* r4 = tmp12 >> 2= d0 - d2 */
add r6, r5, r7 /* r6 = z1 = d1 + d3 */
add r9, r6, r6, lsl #3
rsb r6, r6, r9, lsl #4
rsb r6, r6, r6, lsl #5 /* z1 *= 4433 */
mla r7, r10, r7, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
mla r5, r12, r5, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
add r9, r5, r8, lsl #13 /* r7 = o0 */
rsb r5, r5, r8, lsl #13 /* r3 = o3 */
add r8, r7, r4, lsl #13 /* r6 = o1 */
rsb r4, r7, r4, lsl #13 /* r2 = o2 */
mov r9, r9, asr #18
mov r8, r8, asr #18
mov r4, r4, asr #18
mov r5, r5, asr #18
cmp r9, #255
mvnhi r9, r9, asr #31
cmp r8, #255
mvnhi r8, r8, asr #31
cmp r4, #255
mvnhi r4, r4, asr #31
cmp r5, #255
mvnhi r5, r5, asr #31
#ifdef HAVE_LCD_COLOR
strb r9, [r1]
strb r8, [r1, #4]
strb r4, [r1, #8]
strb r5, [r1, #12]
#else
strb r9, [r1]
strb r8, [r1, #1]
strb r4, [r1, #2]
strb r5, [r1, #3]
#endif
add r0, r0, #16
add r1, r1, r3
teq r0, r2
bne 1b
ldmia sp!, { r4-r10, pc }
#elif ARM_ARCH < 6
stmdb sp!, { r4-r10, lr }
ldr r10, =4433
ldr r14, =4112
ldr r12, =3302955134
1:
ldrsh r7, [r0, #6]
ldrsh r5, [r0, #2]
ldrsh r4, [r0]
ldrsh r6, [r0, #4]
add r8, r5, r7 /* r8 = z1 = d1 + d3 */
add r4, r4, r14
smulbb r8, r10, r8 /* z1 *= 4433 */
add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */
smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
sub r4, r5, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
add r6, r5, r9, lsl #13 /* r6 = o0 */
rsb r9, r5, r9, lsl #13 /* r9 = o3 */
add r5, r7, r4, lsl #13 /* r5 = o1 */
rsb r4, r7, r4, lsl #13 /* r4 = o2 */
mov r6, r6, asr #18
mov r5, r5, asr #18
mov r4, r4, asr #18
mov r9, r9, asr #18
cmp r6, #255
mvnhi r6, r6, asr #31
cmp r5, #255
mvnhi r5, r5, asr #31
cmp r4, #255
mvnhi r4, r4, asr #31
cmp r9, #255
mvnhi r9, r9, asr #31
#ifdef HAVE_LCD_COLOR
strb r6, [r1]
strb r5, [r1, #4]
strb r4, [r1, #8]
strb r9, [r1, #12]
#else
strb r6, [r1]
strb r5, [r1, #1]
strb r4, [r1, #2]
strb r9, [r1, #3]
#endif
add r0, r0, #16
add r1, r1, r3
teq r0, r2
bne 1b
ldmia sp!, { r4-r10, pc }
#else
stmdb sp!, { r4-r9, lr }
ldr r9, =4433
ldr r14, =4112
ldr r12, =3302955134
1:
ldmia r0, { r4-r5 }
sadd16 r4, r4, r14
sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */
ssub16 r7, r4, r5 /* r7lo = d0 - d2 */
smulbt r8, r9, r6
sxth r6, r6
smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */
smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */
sxth r7, r7
add r8, r4, r6, lsl #13 /* r8 = o0 */
rsb r6, r4, r6, lsl #13 /* r6 = o3 */
add r4, r5, r7, lsl #13 /* r4 = o1 */
rsb r5, r5, r7, lsl #13 /* r5 = o2 */
usat r8, #8, r8, asr #18
usat r6, #8, r6, asr #18
usat r4, #8, r4, asr #18
usat r5, #8, r5, asr #18
#ifdef HAVE_LCD_COLOR
strb r8, [r1]
strb r6, [r1, #12]
strb r4, [r1, #4]
strb r5, [r1, #8]
#else
strb r8, [r1]
strb r6, [r1, #3]
strb r4, [r1, #1]
strb r5, [r1, #2]
#endif
add r0, r0, #16
add r1, r1, r3
teq r0, r2
bne 1b
ldmia sp!, { r4-r9, pc }
#endif
.size jpeg_idct4h, .-jpeg_idct4h

View file

@ -31,6 +31,7 @@
#include "debug.h"
#include "jpeg_load.h"
/*#define JPEG_BS_DEBUG*/
#define ROCKBOX_DEBUG_JPEG
/* for portability of below JPEG code */
#define MEMSET(p,v,c) memset(p,v,c)
#define MEMCPY(d,s,c) memcpy(d,s,c)
@ -49,7 +50,23 @@ typedef struct uint8_rgb jpeg_pix_t;
#else
typedef uint8_t jpeg_pix_t;
#endif
#define JPEG_IDCT_TRANSPOSE
#define JPEG_PIX_SZ (sizeof(jpeg_pix_t))
#ifdef HAVE_LCD_COLOR
#define COLOR_EXTRA_IDCT_WS 64
#else
#define COLOR_EXTRA_IDCT_WS 0
#endif
#ifdef JPEG_IDCT_TRANSPOSE
#define V_OUT(n) ws2[8*n]
#define V_IN_ST 1
#define TRANSPOSE_EXTRA_IDCT_WS 64
#else
#define V_OUT(n) ws[8*n]
#define V_IN_ST 8
#define TRANSPOSE_EXTRA_IDCT_WS 0
#endif
#define IDCT_WS_SIZE (64 + TRANSPOSE_EXTRA_IDCT_WS + COLOR_EXTRA_IDCT_WS)
/* This can't be in jpeg_load.h because plugin.h includes it, and it conflicts
* with the definition in jpeg_decoder.h
@ -259,7 +276,7 @@ INLINE unsigned range_limit(int value)
*/
/* horizontal-pass 1-point IDCT */
static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
for (; ws < end; ws += 8)
{
@ -269,19 +286,19 @@ static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
}
/* vertical-pass 2-point IDCT */
static void idct2v(int16_t *ws, int16_t *end)
static void jpeg_idct2v(int16_t *ws, int16_t *end)
{
for (; ws < end; ws++)
{
int tmp1 = ws[0];
int tmp2 = ws[8];
ws[0] = tmp1 + tmp2;
ws[8] = tmp1 - tmp2;
int tmp1 = ws[0*8];
int tmp2 = ws[1*8];
ws[0*8] = tmp1 + tmp2;
ws[1*8] = tmp1 - tmp2;
}
}
/* horizontal-pass 2-point IDCT */
static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
static void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
for (; ws < end; ws += 8, out += rowstep)
{
@ -295,69 +312,12 @@ static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
}
}
#ifndef CPU_ARM
/* vertical-pass 4-point IDCT */
static void idct4v(int16_t *ws, int16_t *end)
static void jpeg_idct4v(int16_t *ws, int16_t *end)
{
for (; ws < end; ws++)
{
#if defined(CPU_ARM)
int t0, t1, t2, t3, t4;
#if ARM_ARCH <= 4
int t5;
#endif
asm volatile(
"ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[8*0]) */
"ldrsh %[t1], [%[ws], #32]\n\t" /* t1 = tmp2 (ws[8*2]) */
"ldrsh %[t2], [%[ws], #16]\n\t" /* t2 = z2 (ws[8*1]) */
"add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 2
(tmp0 + tmp2) */
"sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 2
(tmp0 - tmp2) */
"ldrsh %[t3], [%[ws], #48]\n\t" /* t3 = z3 (ws[8*3] */
"add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
#if ARM_ARCH > 4
"smulbb %[t4], %[c1], %[t4]\n\t"
"add %[t4], %[t4], #1024\n\t" /* t4 = z1 */
"smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
"smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
"mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
"mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
#else
"add %[t5], %[t4], %[t4], lsl #3\n\t"
"rsb %[t4], %[t4], %[t5], lsl #4\n\t"
"rsb %[t4], %[t4], %[t4], lsl #5\n\t"
"add %[t4], %[t4], #1024\n\t" /*z1*/
"mla %[t3], %[c2], %[t3], %[t4]\n\t"
"mla %[t2], %[c3], %[t2], %[t4]\n\t"
"mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
"mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
#endif
"add %[t4], %[t2], %[t0], lsl #2\n\t" /* t4 = tmp10 + tmp2 */
"rsb %[t0], %[t2], %[t0], lsl #2\n\t" /* t0 = tmp10 - tmp2 */
"add %[t2], %[t3], %[t1], lsl #2\n\t" /* t2 = tmp12 + tmp0 */
"rsb %[t3], %[t3], %[t1], lsl #2\n\t" /* t3 = tmp12 - tmp0 */
"strh %[t4], [%[ws]]\n\t"
"strh %[t0], [%[ws], #48]\n\t"
"strh %[t2], [%[ws], #16]\n\t"
"strh %[t3], [%[ws], #32]\n\t"
: [t0] "=&r" (t0),
[t1] "=&r" (t1),
[t2] "=&r" (t2),
[t3] "=&r" (t3),
[t4] "=&r" (t4)
#if ARM_ARCH <= 4
,[t5] "=&r" (t5)
#endif
: [ws] "r" (ws),
#if ARM_ARCH > 4
[c1] "r" (FIX_0_541196100),
[c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
#else
[c2] "r" (-FIX_1_847759065),
[c3] "r" (FIX_0_765366865)
#endif
);
#else
int tmp0, tmp2, tmp10, tmp12;
int z1, z2, z3;
/* Even part */
@ -382,93 +342,18 @@ static void idct4v(int16_t *ws, int16_t *end)
CONST_BITS-PASS1_BITS);
/* Final output stage */
ws[8*0] = (int) (tmp10 + tmp2);
ws[8*3] = (int) (tmp10 - tmp2);
ws[8*1] = (int) (tmp12 + tmp0);
ws[8*2] = (int) (tmp12 - tmp0);
#endif
}
}
/* horizontal-pass 4-point IDCT */
static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
for (; ws < end; out += rowstep, ws += 8)
{
#if defined(CPU_ARM)
int t0, t1, t2, t3, t4;
#if ARM_ARCH <= 4
int t5;
#endif
asm volatile(
"ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[0]) */
"ldrsh %[t1], [%[ws], #4]\n\t" /* t1 = tmp2 (ws[2]) */
"add %[t4], %[t4], #16\n\t" /* add rounding to DC */
"add %[t4], %[t4], #4096\n\t" /* pre-add offset */
"ldrsh %[t2], [%[ws], #2]\n\t" /* t2 = z2 (ws[1]) */
"add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 13
(tmp0 + tmp2) */
"sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 13
(tmp0 - tmp2) */
"ldrsh %[t3], [%[ws], #6]\n\t" /* t3 = z3 (ws[3] */
"add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
#if ARM_ARCH > 4
"smulbb %[t4], %[c1], %[t4]\n\t"
"smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
"smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
#else
"add %[t5], %[t4], %[t4], lsl #3\n\t"
"rsb %[t4], %[t4], %[t5], lsl #4\n\t"
"rsb %[t4], %[t4], %[t4], lsl #5\n\t" /* t4 = z1 */
"mla %[t3], %[c2], %[t3], %[t4]\n\t"
"mla %[t2], %[c3], %[t2], %[t4]\n\t"
#endif
"add %[t4], %[t2], %[t0], lsl #13\n\t" /* t4 = tmp10 + tmp2 */
"rsb %[t0], %[t2], %[t0], lsl #13\n\t" /* t0 = tmp10 - tmp2 */
"add %[t2], %[t3], %[t1], lsl #13\n\t" /* t2 = tmp12 + tmp0 */
"rsb %[t3], %[t3], %[t1], lsl #13\n\t" /* t3 = tmp12 - tmp0 */
"mov %[t4], %[t4], asr #18\n\t" /* descale results */
"mov %[t0], %[t0], asr #18\n\t"
"mov %[t2], %[t2], asr #18\n\t"
"mov %[t3], %[t3], asr #18\n\t"
"cmp %[t4], #255\n\t" /* range limit results */
"mvnhi %[t4], %[t4], asr #31\n\t"
"cmp %[t0], #255\n\t"
"mvnhi %[t0], %[t0], asr #31\n\t"
"cmp %[t2], #255\n\t"
"mvnhi %[t2], %[t2], asr #31\n\t"
"cmp %[t3], #255\n\t"
"mvnhi %[t3], %[t3], asr #31\n\t"
"cmp %[t4], #255\n\t"
"mvnhi %[t4], %[t4], asr #31\n\t"
"strb %[t4], [%[out]]\n\t"
"strb %[t0], [%[out], %[o3]]\n\t"
"strb %[t2], [%[out], %[o1]]\n\t"
"strb %[t3], [%[out], %[o2]]\n\t"
: [t0] "=&r" (t0),
[t1] "=&r" (t1),
[t2] "=&r" (t2),
[t3] "=&r" (t3),
[t4] "=&r" (t4)
#if ARM_ARCH <= 4
,[t5] "=&r" (t5)
#endif
: [ws] "r" (ws),
[out] "r" (out),
[o1] "i" (JPEG_PIX_SZ),
[o2] "i" (JPEG_PIX_SZ*2),
[o3] "i" (JPEG_PIX_SZ*3),
#if ARM_ARCH > 4
[c1] "r" (FIX_0_541196100),
[c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
#else
[c2] "r" (-FIX_1_847759065),
[c3] "r" (FIX_0_765366865)
#endif
);
#else
int tmp0, tmp2, tmp10, tmp12;
int z1, z2, z3;
/* Even part */
@ -500,18 +385,27 @@ static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
DS_OUT));
out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0,
DS_OUT));
#endif
}
}
#else
extern void jpeg_idct4v(int16_t *ws, int16_t *end);
extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
#endif
/* vertical-pass 8-point IDCT */
static void idct8v(int16_t *ws, int16_t *end)
static void jpeg_idct8v(int16_t *ws, int16_t *end)
{
long tmp0, tmp1, tmp2, tmp3;
long tmp10, tmp11, tmp12, tmp13;
long z1, z2, z3, z4, z5;
#ifdef JPEG_IDCT_TRANSPOSE
int16_t *ws2 = ws + 64;
for (; ws < end; ws += 8, ws2++)
{
#else
for (; ws < end; ws++)
{
#endif
/* Due to quantization, we will usually find that many of the input
* coefficients are zero, especially the AC terms. We can exploit this
* by short-circuiting the IDCT calculation for any column in which all
@ -520,30 +414,30 @@ static void idct8v(int16_t *ws, int16_t *end)
* With typical images and quantization tables, half or more of the
* column DCT calculations can be simplified this way.
*/
if ((ws[8*1] | ws[8*2] | ws[8*3]
| ws[8*4] | ws[8*5] | ws[8*6] | ws[8*7]) == 0)
if ((ws[V_IN_ST*1] | ws[V_IN_ST*2] | ws[V_IN_ST*3]
| ws[V_IN_ST*4] | ws[V_IN_ST*5] | ws[V_IN_ST*6] | ws[V_IN_ST*7]) == 0)
{
/* AC terms all zero */
int dcval = ws[8*0] << PASS1_BITS;
int dcval = ws[V_IN_ST*0] << PASS1_BITS;
ws[8*0] = ws[8*1] = ws[8*2] = ws[8*3] = ws[8*4]
= ws[8*5] = ws[8*6] = ws[8*7] = dcval;
V_OUT(0) = V_OUT(1) = V_OUT(2) = V_OUT(3) = V_OUT(4) = V_OUT(5) =
V_OUT(6) = V_OUT(7) = dcval;
continue;
}
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
z2 = ws[8*2];
z3 = ws[8*6];
z2 = ws[V_IN_ST*2];
z3 = ws[V_IN_ST*6];
z1 = MULTIPLY16(z2 + z3, FIX_0_541196100);
tmp2 = z1 + MULTIPLY16(z3, - FIX_1_847759065);
tmp3 = z1 + MULTIPLY16(z2, FIX_0_765366865);
z2 = ws[8*0] << CONST_BITS;
z2 = ws[V_IN_ST*0] << CONST_BITS;
z2 += ONE << (CONST_BITS - PASS1_BITS - 1);
z3 = ws[8*4] << CONST_BITS;
z3 = ws[V_IN_ST*4] << CONST_BITS;
tmp0 = (z2 + z3);
tmp1 = (z2 - z3);
@ -556,10 +450,10 @@ static void idct8v(int16_t *ws, int16_t *end)
/* Odd part per figure 8; the matrix is unitary and hence its
transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */
tmp0 = ws[8*7];
tmp1 = ws[8*5];
tmp2 = ws[8*3];
tmp3 = ws[8*1];
tmp0 = ws[V_IN_ST*7];
tmp1 = ws[V_IN_ST*5];
tmp2 = ws[V_IN_ST*3];
tmp3 = ws[V_IN_ST*1];
z1 = tmp0 + tmp3;
z2 = tmp1 + tmp2;
@ -586,19 +480,19 @@ static void idct8v(int16_t *ws, int16_t *end)
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
ws[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
ws[8*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
ws[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
ws[8*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
ws[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
ws[8*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
ws[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
ws[8*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
V_OUT(0) = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
V_OUT(7) = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
V_OUT(1) = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
V_OUT(6) = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
V_OUT(2) = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
V_OUT(5) = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
V_OUT(3) = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
V_OUT(4) = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
}
}
/* horizontal-pass 8-point IDCT */
static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
long tmp0, tmp1, tmp2, tmp3;
long tmp10, tmp11, tmp12, tmp13;
@ -709,20 +603,26 @@ static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
#ifdef HAVE_LCD_COLOR
/* vertical-pass 16-point IDCT */
static void idct16v(int16_t *ws, int16_t *end)
static void jpeg_idct16v(int16_t *ws, int16_t *end)
{
long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
long z1, z2, z3, z4;
#ifdef JPEG_IDCT_TRANSPOSE
int16_t *ws2 = ws + 64;
for (; ws < end; ws += 8, ws2++)
{
#else
for (; ws < end; ws++)
{
#endif
/* Even part */
tmp0 = ws[8*0] << CONST_BITS;
tmp0 = ws[V_IN_ST*0] << CONST_BITS;
/* Add fudge factor here for final descale. */
tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
z1 = ws[8*4];
z1 = ws[V_IN_ST*4];
tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
@ -731,8 +631,8 @@ static void idct16v(int16_t *ws, int16_t *end)
tmp12 = tmp0 + tmp2;
tmp13 = tmp0 - tmp2;
z1 = ws[8*2];
z2 = ws[8*6];
z1 = ws[V_IN_ST*2];
z2 = ws[V_IN_ST*6];
z3 = z1 - z2;
z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
@ -757,10 +657,10 @@ static void idct16v(int16_t *ws, int16_t *end)
/* Odd part */
z1 = ws[8*1];
z2 = ws[8*3];
z3 = ws[8*5];
z4 = ws[8*7];
z1 = ws[V_IN_ST*1];
z2 = ws[V_IN_ST*3];
z3 = ws[V_IN_ST*5];
z4 = ws[V_IN_ST*7];
tmp11 = z1 + z3;
@ -795,27 +695,27 @@ static void idct16v(int16_t *ws, int16_t *end)
tmp11 += z2;
/* Final output stage */
ws[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
ws[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
ws[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
ws[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
ws[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
ws[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
ws[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
ws[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
ws[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
ws[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
ws[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
ws[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
ws[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
ws[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
ws[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
ws[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
V_OUT(0) = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
V_OUT(15) = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
V_OUT(1) = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
V_OUT(14) = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
V_OUT(2) = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
V_OUT(13) = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
V_OUT(3) = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
V_OUT(12) = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
V_OUT(4) = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
V_OUT(11) = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
V_OUT(5) = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
V_OUT(10) = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
V_OUT(6) = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
V_OUT(9) = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
V_OUT(7) = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
V_OUT(8) = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
}
}
/* horizontal-pass 16-point IDCT */
static void idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
static void jpeg_idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@ -946,12 +846,12 @@ struct idct_entry {
};
struct idct_entry idct_tbl[] = {
{ PASS1_BITS, NULL, idct1h },
{ PASS1_BITS, idct2v, idct2h },
{ 0, idct4v, idct4h },
{ 0, idct8v, idct8h },
{ PASS1_BITS, NULL, jpeg_idct1h },
{ PASS1_BITS, jpeg_idct2v, jpeg_idct2h },
{ 0, jpeg_idct4v, jpeg_idct4h },
{ 0, jpeg_idct8v, jpeg_idct8h },
#ifdef HAVE_LCD_COLOR
{ 0, idct16v, idct16h },
{ 0, jpeg_idct16v, jpeg_idct16h },
#endif
};
@ -1468,21 +1368,27 @@ static void fix_huff_tbl(int* htbl, struct derived_tbl* dtbl)
}
/* zag[i] is the natural-order position of the i'th element of zigzag order.
* If the incoming data is corrupted, decode_mcu could attempt to
* reference values beyond the end of the array. To avoid a wild store,
* we put some extra zeroes after the real entries.
*/
/* zag[i] is the natural-order position of the i'th element of zigzag order. */
static const unsigned char zag[] =
{
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
27, 20, 13, 6, 7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63,
#ifdef JPEG_IDCT_TRANSPOSE
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
27, 34, 41, 48, 56, 49, 42, 35,
28, 21, 14, 7, 15, 22, 29, 36,
43, 50, 57, 58, 51, 44, 37, 30,
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63,
#endif
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
27, 20, 13, 6, 7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63,
};
/* zig[i] is the the zig-zag order position of the i'th element of natural
@ -1898,17 +1804,20 @@ static struct img_part *store_row_jpeg(void *jpeg_args)
store_offs[p_jpeg->store_pos[1]] = JPEG_PIX_SZ << p_jpeg->h_scale[0];
store_offs[p_jpeg->store_pos[2]] = b_width << p_jpeg->v_scale[0];
store_offs[p_jpeg->store_pos[3]] = store_offs[1] + store_offs[2];
int16_t block[128]; /* decoded DCT coefficients */
/* decoded DCT coefficients */
int16_t block[IDCT_WS_SIZE] __attribute__((aligned(8)));
for (x = 0; x < p_jpeg->x_mbl; x++)
{
int blkn;
for (blkn = 0; blkn < p_jpeg->blocks; blkn++)
{
int k = 1; /* coefficient index */
int s, r; /* huffman values */
int ci = p_jpeg->mcu_membership[blkn]; /* component index */
int ti = p_jpeg->tab_membership[blkn]; /* table index */
#ifdef JPEG_IDCT_TRANSPOSE
bool transpose = p_jpeg->v_scale[!!ci] > 2;
#endif
int k = 1; /* coefficient index */
int s, r; /* huffman values */
struct derived_tbl* dctbl = &p_jpeg->dc_derived_tbls[ti];
struct derived_tbl* actbl = &p_jpeg->ac_derived_tbls[ti];
@ -1948,7 +1857,11 @@ static struct img_part *store_row_jpeg(void *jpeg_args)
r = get_bits(p_jpeg, s);
r = HUFF_EXTEND(r, s);
r = MULTIPLY16(r, p_jpeg->quanttable[!!ci][k]);
#ifdef JPEG_IDCT_TRANSPOSE
block[zag[transpose ? k : k + 64]] = r ;
#else
block[zag[k]] = r ;
#endif
}
else
{
@ -1988,10 +1901,19 @@ block_end:
int idct_rows = BIT_N(p_jpeg->v_scale[!!ci]);
unsigned char *b_out = out + (ci ? ci : store_offs[blkn]);
if (idct_tbl[p_jpeg->v_scale[!!ci]].v_idct)
#ifdef JPEG_IDCT_TRANSPOSE
idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
transpose ? block + 8 * idct_cols
: block + idct_cols);
uint16_t * h_block = transpose ? block + 64 : block;
idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(h_block, b_out,
h_block + idct_rows * 8, b_width);
#else
idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
block + idct_cols);
idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(block, b_out,
block + idct_rows * 8, b_width);
#endif
}
} /* for blkn */
/* don't starve other threads while an MCU row decodes */
@ -2048,7 +1970,6 @@ int read_jpeg_file(const char* filename,
{
int fd, ret;
fd = open(filename, O_RDONLY);
JDEBUGF("read_jpeg_file: filename: %s buffer len: %d cformat: %p\n",
filename, maxsize, cformat);
/* Exit if file opening failed */
@ -2181,14 +2102,22 @@ int read_jpeg_fd(int fd,
int decode_h = BIT_N(p_jpeg->v_scale[0]) - 1;
src_dim.width = (p_jpeg->x_size << p_jpeg->h_scale[0]) >> 3;
src_dim.height = (p_jpeg->y_size << p_jpeg->v_scale[0]) >> 3;
p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
p_jpeg->k_need[0] = zig[p_jpeg->zero_need[0]];
#ifdef JPEG_IDCT_TRANSPOSE
if (p_jpeg->v_scale[0] > 2)
p_jpeg->zero_need[0] = (decode_w << 3) + decode_h;
else
#endif
p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
p_jpeg->k_need[0] = zig[(decode_h << 3) + decode_w];
JDEBUGF("need luma components to %d\n", p_jpeg->k_need[0]);
#ifdef HAVE_LCD_COLOR
decode_w = BIT_N(MIN(p_jpeg->h_scale[1],3)) - 1;
decode_h = BIT_N(MIN(p_jpeg->v_scale[1],3)) - 1;
p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
p_jpeg->k_need[1] = zig[p_jpeg->zero_need[1]];
if (p_jpeg->v_scale[1] > 2)
p_jpeg->zero_need[1] = (decode_w << 3) + decode_h;
else
p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
p_jpeg->k_need[1] = zig[(decode_h << 3) + decode_w];
JDEBUGF("need chroma components to %d\n", p_jpeg->k_need[1]);
#endif
if (cformat)