Assembler optimised lcd_yuv_blit() for iPod Video (Fs #7951 by Andree Buschmann, adapted and separated into an .S file by me).

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15257 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Jens Arnold 2007-10-22 00:37:50 +00:00
parent 86a999c478
commit 3c1b9d9d70
3 changed files with 249 additions and 105 deletions

View file

@ -730,6 +730,7 @@ target/arm/ipod/backlight-nano_video.c
target/arm/ipod/button-clickwheel.c
target/arm/ipod/power-ipod.c
target/arm/ipod/powermgmt-ipod-pcf.c
target/arm/ipod/video/lcd-as-video.S
target/arm/ipod/video/lcd-video.c
target/arm/usb-fw-pp502x.c
#endif /* SIMULATOR */

View file

@ -0,0 +1,226 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (C) 2007 by Andree Buschmann
*
* All files in this archive are subject to the GNU General Public License.
* See the file COPYING in the source tree root for full license agreement.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
/****************************************************************************
 * void lcd_write_yuv420_lines(unsigned char const * const src[3],
 *                             int width,
 *                             int stride);
 *
 * Converts two vertically adjacent lines of planar YUV 4:2:0 data to RGB565
 * and writes the pixels, line by line, to the LCD data port (0x30000000).
 * Both lines share one row of chroma samples, so the chroma contributions
 * computed while writing the first line are cached in a buffer on the stack
 * and re-read while writing the second line.
 *
 *   |R|   |1.000000 -0.000001  1.402000|   |Y'|
 *   |G| = |1.000000 -0.334136 -0.714136|   |Pb|
 *   |B|   |1.000000  1.772000  0.000000|   |Pr|
 * Scaled, normalized, rounded and tweaked to yield RGB 565:
 *   |R|   |74   0 101|   |Y' -  16| >> 9
 *   |G| = |74 -24 -51|   |Cb - 128| >> 8
 *   |B|   |74 128   0|   |Cr - 128| >> 9
 *
 * NOTE(review): JFIF gives -0.344136 for the Cb term of G; the -0.334136
 * above looks like a typo - confirm before re-deriving the integer factors.
 *
 * In:    r0 = src    (src[0] = Y' plane, src[1] = Cb plane, src[2] = Cr plane)
 *        r1 = width  (pixels per line; consumed two at a time, and each loop
 *                     body runs at least once, so callers pass an even
 *                     value >= 2)
 *        r2 = stride (byte distance between the two luma lines)
 * Out:   nothing
 * Clobb: r0-r3, flags (r4-r12 are saved and restored)
 * Stack: 36 bytes for saved registers plus a scratch frame:
 *          [sp, #0]   width
 *          [sp, #4]   &Y'[stride] (start of the second luma line)
 *          [sp, #8]   size of the scratch frame in bytes
 *          [sp, #12]  chroma buffer, 3 bytes (rv, guv, bu) per pixel pair
 */

/* Convert the luma sample at [r10] to RGB565 using the current chroma terms
 * and write the pixel to the LCD data port.
 *
 * In:    r0 = bu, r1 = rv, r3 = guv, r9 = LCD data port, r10 = luma pointer
 * Out:   r10 advanced by one byte
 * Clobb: r2, r4, r5, r6, flags
 */
    .macro  yuv_write_pixel
    ldrb    r5, [r10], #1               /* r5 = *Y'_p++ */
    sub     r5, r5, #16                 /* r5 = (Y'-16) * 37, i.e. half of */
    add     r2, r5, r5, asl #2          /* (Y'-16) * 74; the shifts below */
    add     r5, r2, r5, asl #5          /* are one bit shorter to match */
    add     r6, r1, r5, asr #8          /* r6 = r = (Y >> 9) + rv */
    add     r2, r3, r5, asr #7          /* r2 = g = (Y >> 8) + guv */
    add     r4, r0, r5, asr #8          /* r4 = b = (Y >> 9) + bu */
    orr     r5, r6, r4                  /* r | b | (g >> 1) exceeds 31 (as */
    orr     r5, r5, r2, asr #1          /* unsigned) only if a component is */
    cmp     r5, #31                     /* negative or above its maximum */
    bls     15f                         /* -> no clamp */
    /* Unsigned compares treat negative values as huge, so "hi" catches both
     * underflow and overflow. ~(x >> 31) is 0 for negative x and all ones
     * otherwise, which the AND trims to the component maximum. */
    cmp     r6, #31                     /* clamp r */
    mvnhi   r6, r6, asr #31
    andhi   r6, r6, #31
    cmp     r2, #63                     /* clamp g */
    mvnhi   r2, r2, asr #31
    andhi   r2, r2, #63
    cmp     r4, #31                     /* clamp b */
    mvnhi   r4, r4, asr #31
    andhi   r4, r4, #31
15:                                     /* no clamp */
    orr     r4, r4, r2, lsl #5          /* pixel = r<<11 | g<<5 | b */
    orr     r4, r4, r6, lsl #11
    strh    r4, [r9]                    /* write pixel */
    .endm

    .section .icode, "ax", %progbits
    .align  2
    .global lcd_write_yuv420_lines
    .type   lcd_write_yuv420_lines, %function
lcd_write_yuv420_lines:
                                        /* r0 = yuv_src */
                                        /* r1 = width */
                                        /* r2 = stride */
    stmfd   sp!, { r4-r12 }             /* save non-scratch */
    ldmia   r0, { r10, r11, r12 }       /* r10 = yuv_src[0] = Y'_p */
                                        /* r11 = yuv_src[1] = Cb_p */
                                        /* r12 = yuv_src[2] = Cr_p */
    add     r2, r10, r2                 /* r2 = &ysrc[stride] */

    /* The first loop stores 3 bytes per pixel pair and runs
     * max(1, ceil(width/2)) times. width + width/2 + 2 covers that for odd
     * widths as well; add 12 for the three header words and 3 to round up
     * to a whole word. */
    add     r3, r1, r1, asr #1          /* width * 1.5 (rounded down) */
    add     r3, r3, #17                 /* + 2 + 12 + 3, see above */
    bic     r3, r3, #3                  /* rounded to multiples of 4 byte */
    sub     sp, sp, r3                  /* and allocate on stack */
    stmia   sp, { r1, r2, r3 }          /* width, &ysrc[stride], stack_alloc */

    mov     r7, r1                      /* r7 = loop count */
    add     r8, sp, #12                 /* chroma buffer */
    mov     r9, #0x30000000             /* LCD data port */

    /* 1st line: compute the chroma terms, cache them, write two pixels */
10:                                     /* loop start */
    ldrb    r0, [r11], #1               /* r0 = *usrc++ = *Cb_p++ */
    ldrb    r1, [r12], #1               /* r1 = *vsrc++ = *Cr_p++ */
    sub     r0, r0, #128                /* r0 = Cb-128 */
    sub     r1, r1, #128                /* r1 = Cr-128 */

    add     r3, r1, r1, asl #1          /* r3 = Cr*51 + Cb*24 */
    add     r3, r3, r3, asl #4
    add     r3, r3, r0, asl #3
    add     r3, r3, r0, asl #4

    add     r4, r1, r1, asl #2          /* r1 = Cr*101 */
    add     r4, r4, r1, asl #5
    add     r1, r4, r1, asl #6

    add     r1, r1, #256                /* r1 = rv = (r1 + 256) >> 9 */
    mov     r1, r1, asr #9
    strb    r1, [r8], #1                /* store r1 to chroma_buf */
    rsb     r3, r3, #128                /* r3 = guv = (-r3 + 128) >> 8 */
    mov     r3, r3, asr #8
    strb    r3, [r8], #1                /* store r3 to chroma_buf */
    add     r0, r0, #2                  /* r0 = bu = (Cb*128 + 256) >> 9 */
    mov     r0, r0, asr #2
    strb    r0, [r8], #1                /* store r0 to chroma_buf */

    yuv_write_pixel                     /* 1st line, first pixel */
    yuv_write_pixel                     /* 1st line, second pixel */

    subs    r7, r7, #2                  /* check for loop end */
    bgt     10b                         /* back to beginning */
    /* 1st loop end */

    add     r8, sp, #12                 /* rewind to start of chroma buffer */
    ldmia   sp, { r7, r10 }             /* r7 = loop count */
                                        /* r10 = &ysrc[stride] */

    /* 2nd line: reuse the cached chroma terms */
20:                                     /* loop start */
    /* restore r1, r3 and r0 from chroma buffer (signed bytes) */
    ldrsb   r1, [r8], #1                /* r1 = rv */
    ldrsb   r3, [r8], #1                /* r3 = guv */
    ldrsb   r0, [r8], #1                /* r0 = bu */

    yuv_write_pixel                     /* 2nd line, first pixel */
    yuv_write_pixel                     /* 2nd line, second pixel */

    subs    r7, r7, #2                  /* check for loop end */
    bgt     20b                         /* back to beginning */
    /* 2nd loop end */

    ldr     r3, [sp, #8]                /* r3 = stack_alloc */
    add     sp, sp, r3                  /* deallocate buffer */
    ldmfd   sp!, { r4-r12 }             /* restore registers */
    bx      lr
    .ltorg
    .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines

View file

@ -207,43 +207,26 @@ void lcd_update(void)
lcd_update_rect(0, 0, LCD_WIDTH, LCD_HEIGHT);
}
/* YUV -> RGB565 conversion
* |R| |1.000000 -0.000001 1.402000| |Y'|
* |G| = |1.000000 -0.334136 -0.714136| |Pb|
* |B| |1.000000 1.772000 0.000000| |Pr|
* Scaled, normalized, rounded and tweaked to yield RGB 565:
* |R| |74 0 101| |Y' - 16| >> 9
* |G| = |74 -24 -51| |Cb - 128| >> 8
* |B| |74 128 0| |Cr - 128| >> 9
*/
#define RGBYFAC 74 /* 1.0 */
#define RVFAC 101 /* 1.402 */
#define GVFAC (-51) /* -0.714136 */
#define GUFAC (-24) /* -0.334136 */
#define BUFAC 128 /* 1.772 */
/* ROUNDOFFS contain constant for correct round-offs as well as
constant parts of the conversion matrix (e.g. (Y'-16)*RGBYFAC
-> constant part = -16*RGBYFAC). Through extraction of these
constant parts we save at least 4 subtractions in the conversion
loop */
#define ROUNDOFFSR (256 - 16*RGBYFAC - 128*RVFAC)
#define ROUNDOFFSG (128 - 16*RGBYFAC - 128*GVFAC - 128*GUFAC)
#define ROUNDOFFSB (256 - 16*RGBYFAC - 128*BUFAC)
#define MAX_5BIT 0x1f
#define MAX_6BIT 0x3f
/* Line write helper function for lcd_yuv_blit. Write two lines of yuv420. */
extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
int width,
int stride);
/* Performance function to blit a YUV bitmap directly to the LCD */
void lcd_yuv_blit(unsigned char * const src[3],
int src_x, int src_y, int stride,
int x, int y, int width, int height) ICODE_ATTR;
void lcd_yuv_blit(unsigned char * const src[3],
int src_x, int src_y, int stride,
int x, int y, int width, int height)
{
width = (width + 1) & ~1;
unsigned char const * yuv_src[3];
off_t z;
/* Sorry, but width and height must be >= 2 or else */
width &= ~1;
z = stride * src_y;
yuv_src[0] = src[0] + z + src_x;
yuv_src[1] = src[1] + (z >> 2) + (src_x >> 1);
yuv_src[2] = src[2] + (yuv_src[1] - src[1]);
if (finishup_needed)
{
@ -279,82 +262,16 @@ void lcd_yuv_blit(unsigned char * const src[3],
/* wait for it to be write ready */
while ((inw(0x30030000) & 0x2) == 0);
const int ymax = y + height - 1;
const int stride_div_sub_x = stride >> 1;
unsigned char *ysrc = 0;
unsigned char *usrc = 0;
unsigned char *vsrc = 0;
unsigned char *row_end = 0;
int uvoffset;
int yp, up, vp, rc, gc, bc; /* temporary variables */
int red1, green1, blue1; /* contain RGB of 1st pixel */
int red2, green2, blue2; /* contain RGB of 2nd pixel */
for (; y <= ymax ; y++)
{
/* upsampling, YUV->RGB conversion and reduction to RGB565 in one go */
uvoffset = stride_div_sub_x*(src_y >> 1) + (src_x >> 1);
ysrc = src[0] + stride * src_y + src_x;
usrc = src[1] + uvoffset;
vsrc = src[2] + uvoffset;
row_end = ysrc + width;
height >>= 1;
do
{
up = *usrc++;
vp = *vsrc++;
rc = RVFAC * vp + ROUNDOFFSR;
gc = GVFAC * vp + GUFAC * up + ROUNDOFFSG;
bc = BUFAC * up + ROUNDOFFSB;
lcd_write_yuv420_lines(yuv_src, width, stride);
/* Pixel 1 -> RGB565 */
yp = *ysrc++ * RGBYFAC;
red1 = (yp + rc) >> 9;
green1 = (yp + gc) >> 8;
blue1 = (yp + bc) >> 9;
/* Pixel 2 -> RGB565 */
yp = *ysrc++ * RGBYFAC;
red2 = (yp + rc) >> 9;
green2 = (yp + gc) >> 8;
blue2 = (yp + bc) >> 9;
/* Since out of bounds errors are relatively rare, we check two
pixels at once to see if any components are out of bounds, and
then fix whichever is broken. This works due to high values and
negative values both being !=0 when bitmasking them.
We first check for red and blue components (5bit range). */
if ((red1 | blue1 | red2 | blue2) & ~MAX_5BIT)
{
if (red1 & ~MAX_5BIT)
red1 = (red1 >> 31) ? 0 : MAX_5BIT;
if (blue1 & ~MAX_5BIT)
blue1 = (blue1 >> 31) ? 0 : MAX_5BIT;
if (red2 & ~MAX_5BIT)
red2 = (red2 >> 31) ? 0 : MAX_5BIT;
if (blue2 & ~MAX_5BIT)
blue2 = (blue2 >> 31) ? 0 : MAX_5BIT;
}
/* We second check for green component (6bit range) */
if ((green1 | green2) & ~MAX_6BIT)
{
if (green1 & ~MAX_6BIT)
green1 = (green1 >> 31) ? 0 : MAX_6BIT;
if (green2 & ~MAX_6BIT)
green2 = (green2 >> 31) ? 0 : MAX_6BIT;
}
/* pixel1 */
outw((red1 << 11) | (green1 << 5) | blue1, 0x30000000);
/* pixel2 */
outw((red2 << 11) | (green2 << 5) | blue2, 0x30000000);
}
while (ysrc < row_end);
src_y++;
yuv_src[0] += stride << 1; /* Skip down two luma lines */
yuv_src[1] += stride >> 1; /* Skip down one chroma line */
yuv_src[2] += stride >> 1;
}
while (--height > 0);
/* Top-half of original lcd_bcm_finishup() function */
outw(0x31, 0x30030000);