FS#11807 - Major speedup of iPod nano 2G. Part 4: Introduce asm for yuv blitting. Overall speedup of part1-4 is +50% for RGB and +93% for YUV.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@28813 a1c6a512-1295-4272-9138-f99709370657
This commit is contained in:
Andree Buschmann 2010-12-12 15:23:20 +00:00
parent eadff40863
commit 395d72f71a
3 changed files with 276 additions and 110 deletions

View file

@ -1504,6 +1504,7 @@ target/arm/s5l8700/kernel-s5l8700.c
target/arm/s5l8700/dma-s5l8700.c
target/arm/s5l8700/ipodnano2g/backlight-nano2g.c
target/arm/s5l8700/ipodnano2g/lcd-nano2g.c
target/arm/s5l8700/ipodnano2g/lcd-asm-nano2g.S
target/arm/s5l8700/ipodnano2g/powermgmt-nano2g.c
target/arm/s5l8700/ipodnano2g/power-nano2g.c
target/arm/s5l8700/ipodnano2g/ftl-nano2g.c

View file

@ -0,0 +1,254 @@
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id: lcd-as-video.S 26756 2010-06-11 04:41:36Z funman $
*
* Copyright (C) 2010 by Andree Buschmann
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
.section .icode, "ax", %progbits
/****************************************************************************
 * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
 *                                    unsigned LCD_BASE,
 *                                    int width,
 *                                    int stride);
 *
 * Conversion from Motion JPEG and MPEG Y'PbPr to RGB is:
 * |R|   |1.164  0.000  1.596| |Y' -  16|
 * |G| = |1.164 -0.391 -0.813| |Pb - 128|
 * |B|   |1.164  2.018  0.000| |Pr - 128|
 *
 * Scaled, normalized, rounded and tweaked to yield RGB 565:
 * |R|   |74   0 101| |Y' -  16| >> 9
 * |G| = |74 -24 -51| |Cb - 128| >> 8
 * |B|   |74 128   0| |Cr - 128| >> 9
 *
 * Converts two lines from YUV to RGB565 and writes to LCD at once. First loop
 * loads Cb/Cr, calculates the chroma offsets and saves them to a buffer on
 * the stack. Within the second loop these chroma offsets are reloaded from
 * that buffer. Within each loop iteration two pixels are calculated and
 * written to the LCD.
 *
 * In:
 *   r0 = src       three pointers: src[0] = Y', src[1] = Cb, src[2] = Cr
 *   r1 = LCD_BASE  status word is read at +0x1C, pixels are written to +0x40
 *   r2 = width     pixels per line; two pixels are consumed per iteration
 *   r3 = stride    byte offset from the first to the second Y' line
 *
 * Register roles inside the loops:
 *   r0 = bu, r1 = rv, r2 = guv     chroma offsets of the current pixel pair
 *   r3-r6                          scratch / r, g, b / packed pixels
 *   r7  = remaining pixel count
 *   r8  = chroma buffer pointer
 *   r9  = Y' pointer, r10 = Cb pointer, r12 = Cr pointer
 *   lr  = LCD_BASE
 *
 * Stack frame (below the saved r4-r10, lr):
 *   [sp,  #0]  LCD_BASE
 *   [sp,  #4]  width
 *   [sp,  #8]  &ysrc[stride] (start of the second Y' line)
 *   [sp, #12]  size of this allocation in bytes
 *   [sp, #16]  chroma buffer, 3 words (bu, rv, guv) per pixel pair
 *
 * Both loops are bottom-tested, so each of them runs at least once.
 */
    .align  2
    .global lcd_write_yuv420_lines
    .type   lcd_write_yuv420_lines, %function
lcd_write_yuv420_lines:
                                    /* r0 = src = yuv_src */
                                    /* r1 = dst = LCD_BASE */
                                    /* r2 = width */
                                    /* r3 = stride */
    stmfd   sp!, { r4-r10, lr }     /* save non-scratch */
    ldmia   r0, { r9, r10, r12 }    /* r9  = yuv_src[0] = Y'_p */
                                    /* r10 = yuv_src[1] = Cb_p */
                                    /* r12 = yuv_src[2] = Cr_p */
    add     r3, r9, r3              /* r3 = &ysrc[stride] */
    add     r4, r2, r2, asr #1      /* chroma buffer length = width/2 * 3 */
    mov     r4, r4, asl #2          /* use words for str/ldm possibility */
    add     r4, r4, #19             /* plus room for 4 additional words, */
    bic     r4, r4, #3              /* rounded up to multiples of 4 byte */
    sub     sp, sp, r4              /* and allocate on stack */
    stmia   sp, {r1-r4}             /* LCD_BASE, width, &ysrc[stride], stack_alloc */

    mov     r7, r2                  /* r7 = loop count */
    add     r8, sp, #16             /* chroma buffer */
    mov     lr, r1                  /* LCD data port = LCD_BASE */

    /* 1st loop start */
10:                                 /* loop start */

    ldrb    r0, [r10], #1           /* r0 = *usrc++ = *Cb_p++ */
    ldrb    r1, [r12], #1           /* r1 = *vsrc++ = *Cr_p++ */

    sub     r0, r0, #128            /* r0 = Cb-128 */
    sub     r1, r1, #128            /* r1 = Cr-128 */

    add     r2, r1, r1, asl #1      /* r2 = Cr*51 + Cb*24 */
    add     r2, r2, r2, asl #4
    add     r2, r2, r0, asl #3
    add     r2, r2, r0, asl #4

    add     r4, r1, r1, asl #2      /* r1 = Cr*101 */
    add     r4, r4, r1, asl #5
    add     r1, r4, r1, asl #6

    add     r1, r1, #256            /* r1 = rv = (r1 + 256) >> 9 */
    mov     r1, r1, asr #9
    rsb     r2, r2, #128            /* r2 = guv = (-r2 + 128) >> 8 */
    mov     r2, r2, asr #8
    add     r0, r0, #2              /* r0 = bu = (Cb*128 + 256) >> 9 */
    mov     r0, r0, asr #2          /*         = (Cb + 2) >> 2       */
    stmia   r8!, {r0-r2}            /* store r0, r1 and r2 to chroma buffer */

    /* 1st loop, first pixel */
    ldrb    r5, [r9], #1            /* r5 = *ysrc++ = *Y'_p++ */
    sub     r5, r5, #16             /* r5 = (Y'-16) * 37, i.e. half of */
    add     r3, r5, r5, asl #2      /* (Y'-16) * 74; the missing factor 2 */
    add     r5, r3, r5, asl #5      /* is folded into the shifts below */

    add     r6, r1, r5, asr #8      /* r6 = r = (Y >> 9) + rv */
    add     r3, r2, r5, asr #7      /* r3 = g = (Y >> 8) + guv */
    add     r4, r0, r5, asr #8      /* r4 = b = (Y >> 9) + bu */

    orr     r5, r6, r4              /* check if clamping is needed... */
    orr     r5, r5, r3, asr #1      /* ...at all */
    cmp     r5, #31
    bls     15f                     /* -> no clamp */
    /* Unsigned "hi" catches both negative and too large values. For a
     * negative value (asr #31 = -1) mvn yields 0, otherwise all ones, which
     * the and reduces to the channel maximum. */
    cmp     r6, #31                 /* clamp r */
    mvnhi   r6, r6, asr #31
    andhi   r6, r6, #31
    cmp     r3, #63                 /* clamp g */
    mvnhi   r3, r3, asr #31
    andhi   r3, r3, #63
    cmp     r4, #31                 /* clamp b */
    mvnhi   r4, r4, asr #31
    andhi   r4, r4, #31
15:                                 /* no clamp */

    /* calculate pixel_1 and save to r5 for later pixel packing */
    orr     r4, r4, r3, lsl #5      /* pixel_1 = r<<11 | g<<5 | b */
    orr     r5, r4, r6, lsl #11     /* r5 = pixel_1 */

    /* 1st loop, second pixel */
    ldrb    r4, [r9], #1            /* r4 = *ysrc++ = *Y'_p++ */
    sub     r4, r4, #16             /* r4 = (Y'-16) * 37 (see first pixel) */
    add     r3, r4, r4, asl #2
    add     r4, r3, r4, asl #5

    add     r6, r1, r4, asr #8      /* r6 = r = (Y >> 9) + rv */
    add     r3, r2, r4, asr #7      /* r3 = g = (Y >> 8) + guv */
    add     r4, r0, r4, asr #8      /* r4 = b = (Y >> 9) + bu */

    orr     r0, r6, r4              /* check if clamping is needed... */
    orr     r0, r0, r3, asr #1      /* ...at all (bu in r0 is dead now) */
    cmp     r0, #31
    bls     15f                     /* -> no clamp */
    cmp     r6, #31                 /* clamp r */
    mvnhi   r6, r6, asr #31
    andhi   r6, r6, #31
    cmp     r3, #63                 /* clamp g */
    mvnhi   r3, r3, asr #31
    andhi   r3, r3, #63
    cmp     r4, #31                 /* clamp b */
    mvnhi   r4, r4, asr #31
    andhi   r4, r4, #31
15:                                 /* no clamp */

    /* calculate pixel_2 and pack with pixel_1 before writing */
    orr     r4, r4, r3, lsl #5      /* pixel_2 = r<<11 | g<<5 | b */
    orr     r4, r4, r6, lsl #11     /* r4 = pixel_2 */

    /* wait while status bit 3 is set (FIFO half full) */
.fifo_wait1:
    ldr     r3, [lr, #0x1C]         /* while (LCD_STATUS & 0x08); */
    tst     r3, #0x8
    bne     .fifo_wait1             /* tst only yields a valid Z here: V is
                                     * left over from the last cmp, so a
                                     * signed condition must not be used */

    str     r5, [lr, #0x40]         /* write pixel_1 */
    str     r4, [lr, #0x40]         /* write pixel_2 */

    subs    r7, r7, #2              /* check for loop end */
    bgt     10b                     /* back to beginning */
    /* 1st loop end */

    /* Reload several registers for pointer rewinding for next loop */
    add     r8, sp, #16             /* chroma buffer */
    ldmia   sp, { r1, r7, r9}       /* r1 = LCD_BASE */
                                    /* r7 = loop count */
                                    /* r9 = &ysrc[stride] */

    /* 2nd loop start */
20:                                 /* loop start */
    /* restore r0 (bu), r1 (rv) and r2 (guv) from chroma buffer */
    ldmia   r8!, {r0-r2}

    /* 2nd loop, first pixel */
    ldrb    r5, [r9], #1            /* r5 = *ysrc++ = *Y'_p++ */
    sub     r5, r5, #16             /* r5 = (Y'-16) * 37 (see 1st loop) */
    add     r3, r5, r5, asl #2
    add     r5, r3, r5, asl #5

    add     r6, r1, r5, asr #8      /* r6 = r = (Y >> 9) + rv */
    add     r3, r2, r5, asr #7      /* r3 = g = (Y >> 8) + guv */
    add     r4, r0, r5, asr #8      /* r4 = b = (Y >> 9) + bu */

    orr     r5, r6, r4              /* check if clamping is needed... */
    orr     r5, r5, r3, asr #1      /* ...at all */
    cmp     r5, #31
    bls     15f                     /* -> no clamp */
    cmp     r6, #31                 /* clamp r */
    mvnhi   r6, r6, asr #31
    andhi   r6, r6, #31
    cmp     r3, #63                 /* clamp g */
    mvnhi   r3, r3, asr #31
    andhi   r3, r3, #63
    cmp     r4, #31                 /* clamp b */
    mvnhi   r4, r4, asr #31
    andhi   r4, r4, #31
15:                                 /* no clamp */

    /* calculate pixel_1 and save to r5 for later pixel packing */
    orr     r4, r4, r3, lsl #5      /* pixel_1 = r<<11 | g<<5 | b */
    orr     r5, r4, r6, lsl #11     /* r5 = pixel_1 */

    /* 2nd loop, second pixel */
    ldrb    r4, [r9], #1            /* r4 = *ysrc++ = *Y'_p++ */
    sub     r4, r4, #16             /* r4 = (Y'-16) * 37 (see 1st loop) */
    add     r3, r4, r4, asl #2
    add     r4, r3, r4, asl #5

    add     r6, r1, r4, asr #8      /* r6 = r = (Y >> 9) + rv */
    add     r3, r2, r4, asr #7      /* r3 = g = (Y >> 8) + guv */
    add     r4, r0, r4, asr #8      /* r4 = b = (Y >> 9) + bu */

    orr     r0, r6, r4              /* check if clamping is needed... */
    orr     r0, r0, r3, asr #1      /* ...at all (bu in r0 is dead now) */
    cmp     r0, #31
    bls     15f                     /* -> no clamp */
    cmp     r6, #31                 /* clamp r */
    mvnhi   r6, r6, asr #31
    andhi   r6, r6, #31
    cmp     r3, #63                 /* clamp g */
    mvnhi   r3, r3, asr #31
    andhi   r3, r3, #63
    cmp     r4, #31                 /* clamp b */
    mvnhi   r4, r4, asr #31
    andhi   r4, r4, #31
15:                                 /* no clamp */

    /* calculate pixel_2 and pack with pixel_1 before writing */
    orr     r4, r4, r3, lsl #5      /* pixel_2 = r<<11 | g<<5 | b */
    orr     r4, r4, r6, lsl #11     /* r4 = pixel_2 */

    /* wait while status bit 3 is set (FIFO half full) */
.fifo_wait2:
    ldr     r3, [lr, #0x1C]         /* while (LCD_STATUS & 0x08); */
    tst     r3, #0x8
    bne     .fifo_wait2             /* Z is the only flag tst defines here */

    str     r5, [lr, #0x40]         /* write pixel_1 */
    str     r4, [lr, #0x40]         /* write pixel_2 */

    subs    r7, r7, #2              /* check for loop end */
    bgt     20b                     /* back to beginning */
    /* 2nd loop end */

    ldr     r3, [sp, #12]           /* r3 = stack_alloc saved at entry */
    add     sp, sp, r3              /* deallocate buffer */
    ldmpc   regs=r4-r10             /* restore registers */
    .ltorg
    .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines

View file

@ -400,48 +400,21 @@ void lcd_update_rect(int x, int y, int width, int height)
}
}
/*** update functions ***/
/* Line write helper function for lcd_blit_yuv. Writes two lines of yuv420.
 *
 * src[0], src[1] and src[2] point to the Y', Cb and Cr data of the first of
 * the two lines; the second Y' line is read from src[0] + stride, and both
 * lines share the same Cb/Cr row.
 * lcd_baseadress is the base address of the LCD controller registers; the
 * helper polls the status word at offset 0x1C and writes pixels to offset
 * 0x40.
 * width is the number of pixels per line; pixels are processed in pairs.
 * NOTE(review): the implementation lives in assembly, presumably in
 * lcd-asm-nano2g.S - confirm against the build's SOURCES list. */
extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
unsigned lcd_baseadress,
int width,
int stride);
#define CSUB_X 2
#define CSUB_Y 2
/* YUV -> RGB565 conversion
* |R| |1.000000 -0.000001 1.402000| |Y'|
* |G| = |1.000000 -0.334136 -0.714136| |Pb|
* |B| |1.000000 1.772000 0.000000| |Pr|
* Scaled, normalized, rounded and tweaked to yield RGB 565:
* |R| |74 0 101| |Y' - 16| >> 9
* |G| = |74 -24 -51| |Cb - 128| >> 8
* |B| |74 128 0| |Cr - 128| >> 9
*/
#define RGBYFAC 74 /* 1.0 */
#define RVFAC 101 /* 1.402 */
#define GVFAC (-51) /* -0.714136 */
#define GUFAC (-24) /* -0.334136 */
#define BUFAC 128 /* 1.772 */
/* The ROUNDOFFS constants contain the offsets for correct rounding as well
as the constant parts of the conversion matrix (e.g. (Y'-16)*RGBYFAC
-> constant part = -16*RGBYFAC). By extracting these constant parts
we save at least 4 subtractions in the conversion
loop */
#define ROUNDOFFSR (256 - 16*RGBYFAC - 128*RVFAC)
#define ROUNDOFFSG (128 - 16*RGBYFAC - 128*GVFAC - 128*GUFAC)
#define ROUNDOFFSB (256 - 16*RGBYFAC - 128*BUFAC)
#define MAX_5BIT 0x1f
#define MAX_6BIT 0x3f
/* Performance function to blit a YUV bitmap directly to the LCD */
/* Blit a YUV bitmap directly to the LCD */
void lcd_blit_yuv(unsigned char * const src[3],
int src_x, int src_y, int stride,
int x, int y, int width, int height)
{
int h;
int y0, x0, y1, x1;
unsigned int z, y0, x0, y1, x1;;
unsigned char const * yuv_src[3];
width = (width + 1) & ~1;
width = (width + 1) & ~1; /* ensure width is even */
x0 = x; /* start horiz */
y0 = y; /* start vert */
@ -471,79 +444,17 @@ void lcd_blit_yuv(unsigned char * const src[3],
s5l_lcd_write_cmd(R_MEMORY_WRITE);
}
const int stride_div_csub_x = stride/CSUB_X;
z = stride * src_y;
yuv_src[0] = src[0] + z + src_x;
yuv_src[1] = src[1] + (z >> 2) + (src_x >> 1);
yuv_src[2] = src[2] + (yuv_src[1] - src[1]);
h = height;
while (h > 0) {
/* upsampling, YUV->RGB conversion and reduction to RGB565 in one go */
const unsigned char *ysrc = src[0] + stride * src_y + src_x;
height >>= 1;
const int uvoffset = stride_div_csub_x * (src_y/CSUB_Y) +
(src_x/CSUB_X);
const unsigned char *usrc = src[1] + uvoffset;
const unsigned char *vsrc = src[2] + uvoffset;
const unsigned char *row_end = ysrc + width;
int yp, up, vp;
int red1, green1, blue1;
int red2, green2, blue2;
int rc, gc, bc;
do
{
up = *usrc++;
vp = *vsrc++;
rc = RVFAC * vp + ROUNDOFFSR;
gc = GVFAC * vp + GUFAC * up + ROUNDOFFSG;
bc = BUFAC * up + ROUNDOFFSB;
/* Pixel 1 -> RGB565 */
yp = *ysrc++ * RGBYFAC;
red1 = (yp + rc) >> 9;
green1 = (yp + gc) >> 8;
blue1 = (yp + bc) >> 9;
/* Pixel 2 -> RGB565 */
yp = *ysrc++ * RGBYFAC;
red2 = (yp + rc) >> 9;
green2 = (yp + gc) >> 8;
blue2 = (yp + bc) >> 9;
/* Since out of bounds errors are relatively rare, we check two
pixels at once to see if any components are out of bounds, and
then fix whichever is broken. This works due to high values and
negative values both being !=0 when bitmasking them.
We first check for red and blue components (5bit range). */
if ((red1 | blue1 | red2 | blue2) & ~MAX_5BIT)
{
if (red1 & ~MAX_5BIT)
red1 = (red1 >> 31) ? 0 : MAX_5BIT;
if (blue1 & ~MAX_5BIT)
blue1 = (blue1 >> 31) ? 0 : MAX_5BIT;
if (red2 & ~MAX_5BIT)
red2 = (red2 >> 31) ? 0 : MAX_5BIT;
if (blue2 & ~MAX_5BIT)
blue2 = (blue2 >> 31) ? 0 : MAX_5BIT;
}
/* We second check for green component (6bit range) */
if ((green1 | green2) & ~MAX_6BIT)
{
if (green1 & ~MAX_6BIT)
green1 = (green1 >> 31) ? 0 : MAX_6BIT;
if (green2 & ~MAX_6BIT)
green2 = (green2 >> 31) ? 0 : MAX_6BIT;
}
/* output 2 pixels */
while (LCD_STATUS & 0x08); /* wait while FIFO is half full */
lcd_write_pixel((red1 << 11) | (green1 << 5) | blue1);
lcd_write_pixel((red2 << 11) | (green2 << 5) | blue2);
}
while (ysrc < row_end);
src_y++;
h--;
}
do {
lcd_write_yuv420_lines(yuv_src, LCD_BASE, width, stride);
yuv_src[0] += stride << 1;
yuv_src[1] += stride >> 1; /* Skip down one chroma line */
yuv_src[2] += stride >> 1;
} while (--height > 0);
}