From 92785b8f2f20b0fc16de7e771e5eb55fd8497ff8 Mon Sep 17 00:00:00 2001 From: Andrew Mahone Date: Tue, 26 May 2009 20:00:47 +0000 Subject: [PATCH] Use pre-multiplication in scaler to save one multiply per color component on ARM and Coldfire, at the cost of an extra add/shift in the horizontal scaler to reduce values to a workable range. SH-1 retains the same basic math, as the use of 16x16->32 hardware multiplication in the earlier scaler stages saves more than removing the 32x32->40 multiply to descale output. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21091 a1c6a512-1295-4272-9138-f99709370657 --- apps/plugins/bench_scaler.c | 4 +- apps/plugins/lib/grey_draw.c | 2 +- apps/plugins/pictureflow/pictureflow.c | 49 +-- apps/recorder/resize.c | 413 +++++++++++++++++-------- apps/recorder/resize.h | 100 +++--- 5 files changed, 356 insertions(+), 212 deletions(-) diff --git a/apps/plugins/bench_scaler.c b/apps/plugins/bench_scaler.c index c24807dad6..246271dcc8 100644 --- a/apps/plugins/bench_scaler.c +++ b/apps/plugins/bench_scaler.c @@ -49,8 +49,8 @@ static void output_row_null(uint32_t row, void * row_in, #else uint32_t *lim = in + ctx->bm->width; #endif - for (; in < lim; in++) - output = SC_MUL(*in + ctx->round, ctx->divisor); + while (in < lim) + output = SC_OUT(*in++, ctx); return; } diff --git a/apps/plugins/lib/grey_draw.c b/apps/plugins/lib/grey_draw.c index 6315ad9b1a..c1e6376cfe 100644 --- a/apps/plugins/lib/grey_draw.c +++ b/apps/plugins/lib/grey_draw.c @@ -733,7 +733,7 @@ static void output_row_grey_32(uint32_t row, void * row_in, uint32_t *qp = (uint32_t*)row_in; uint8_t *dest = (uint8_t*)ctx->bm->data + ctx->bm->width * row; for (col = 0; col < ctx->bm->width; col++) - *dest++ = SC_MUL((*qp++) + ctx->round,ctx->divisor); + *dest++ = SC_OUT(*qp++, ctx); } static unsigned int get_size_grey(struct bitmap *bm) diff --git a/apps/plugins/pictureflow/pictureflow.c b/apps/plugins/pictureflow/pictureflow.c index a1ad3d2776..bbe2541681 100644 --- 
a/apps/plugins/pictureflow/pictureflow.c +++ b/apps/plugins/pictureflow/pictureflow.c @@ -592,25 +592,12 @@ static inline PFreal fcos(int iangle) return fsin(iangle + (IANGLE_MAX >> 2)); } -static inline uint32_t div255(uint32_t val) +static inline unsigned scale_val(unsigned val, unsigned bits) { - return ((((val >> 8) + val) >> 8) + val) >> 8; + val = val * ((1 << bits) - 1); + return ((val >> 8) + val + 128) >> 8; } -#define SCALE_VAL(val,out) div255((val) * (out) + 127) -#define SCALE_VAL32(val, out) \ -({ \ - uint32_t val__ = (val) * (out); \ - val__ = ((((val__ >> 8) + val__) >> 8) + val__ + 128) >> 8; \ - val__; \ -}) -#define SCALE_VAL8(val, out) \ -({ \ - unsigned val__ = (val) * (out); \ - val__ = ((val__ >> 8) + val__ + 128) >> 8; \ - val__; \ -}) - static void output_row_8_transposed(uint32_t row, void * row_in, struct scaler_context *ctx) { @@ -625,9 +612,9 @@ static void output_row_8_transposed(uint32_t row, void * row_in, unsigned r, g, b; for (; dest < end; dest += ctx->bm->height) { - r = SCALE_VAL8(qp->red, 31); - g = SCALE_VAL8(qp->green, 63); - b = SCALE_VAL8((qp++)->blue, 31); + r = scale_val(qp->red, 5); + g = scale_val(qp->green, 6); + b = scale_val((qp++)->blue, 5); *dest = LCD_RGBPACK_LCD(r,g,b); } #endif @@ -641,19 +628,15 @@ static void output_row_32_transposed(uint32_t row, void * row_in, #ifdef USEGSLIB uint32_t *qp = (uint32_t*)row_in; for (; dest < end; dest += ctx->bm->height) - *dest = SC_MUL((*qp++) + ctx->round, ctx->divisor); + *dest = SC_OUT(*qp++, ctx); #else struct uint32_rgb *qp = (struct uint32_rgb*)row_in; - uint32_t rb_mul = SCALE_VAL32(ctx->divisor, 31), - rb_rnd = SCALE_VAL32(ctx->round, 31), - g_mul = SCALE_VAL32(ctx->divisor, 63), - g_rnd = SCALE_VAL32(ctx->round, 63); int r, g, b; for (; dest < end; dest += ctx->bm->height) { - r = SC_MUL(qp->r + rb_rnd, rb_mul); - g = SC_MUL(qp->g + g_rnd, g_mul); - b = SC_MUL(qp->b + rb_rnd, rb_mul); + r = scale_val(SC_OUT(qp->r, ctx), 5); + g = scale_val(SC_OUT(qp->g, ctx), 6); + b 
= scale_val(SC_OUT(qp->b, ctx), 5); qp++; *dest = LCD_RGBPACK_LCD(r,g,b); } @@ -670,14 +653,14 @@ static void output_row_32_transposed_fromyuv(uint32_t row, void * row_in, for (; dest < end; dest += ctx->bm->height) { unsigned r, g, b, y, u, v; - y = SC_MUL(qp->b + ctx->round, ctx->divisor); - u = SC_MUL(qp->g + ctx->round, ctx->divisor); - v = SC_MUL(qp->r + ctx->round, ctx->divisor); + y = SC_OUT(qp->b, ctx); + u = SC_OUT(qp->g, ctx); + v = SC_OUT(qp->r, ctx); qp++; yuv_to_rgb(y, u, v, &r, &g, &b); - r = (31 * r + (r >> 3) + 127) >> 8; - g = (63 * g + (g >> 2) + 127) >> 8; - b = (31 * b + (b >> 3) + 127) >> 8; + r = scale_val(r, 5); + g = scale_val(g, 6); + b = scale_val(b, 5); *dest = LCD_RGBPACK_LCD(r, g, b); } } diff --git a/apps/recorder/resize.c b/apps/recorder/resize.c index 1e9210e819..3a0ad8d75b 100644 --- a/apps/recorder/resize.c +++ b/apps/recorder/resize.c @@ -131,20 +131,45 @@ int recalc_dimension(struct dim *dst, struct dim *src) return false; \ } -/* Set up rounding and scale factors for horizontal area scaler */ -static inline void scale_h_area_setup(struct scaler_context *ctx) +#if defined(CPU_COLDFIRE) +#define MAC(op1, op2, num) \ + asm volatile( \ + "mac.l %0, %1, %%acc" #num \ + : \ + : "%d" (op1), "d" (op2)\ + ) +#define MAC_OUT(dest, num) \ + asm volatile( \ + "movclr.l %%acc" #num ", %0" \ + : "=d" (dest) \ + ) +#elif defined(CPU_SH) +/* calculate the 32-bit product of signed 16-bit op1 and op2 */ +static inline int32_t mul_s16_s16(int16_t op1, int16_t op2) { -/* sum is output value * src->width */ - SDEBUGF("scale_h_area_setup\n"); - ctx->divisor = ctx->src->width; + return (int32_t)(op1 * op2); } +/* calculate the 32-bit product of unsigned 16-bit op1 and op2 */ +static inline uint32_t mul_u16_u16(uint16_t op1, uint16_t op2) +{ + return (uint32_t)(op1 * op2); +} +#endif + /* horizontal area average scaler */ static bool scale_h_area(void *out_line_ptr, struct scaler_context *ctx, bool accum) { SDEBUGF("scale_h_area\n"); unsigned int ix, 
ox, oxe, mul; +#if defined(CPU_SH) || defined (TEST_SH_MATH) + const uint32_t h_i_val = ctx->src->width, + h_o_val = ctx->bm->width; +#else + const uint32_t h_i_val = ctx->h_i_val, + h_o_val = ctx->h_o_val; +#endif #ifdef HAVE_LCD_COLOR struct uint32_rgb rgbvalacc = { 0, 0, 0 }, rgbvaltmp = { 0, 0, 0 }, @@ -161,31 +186,57 @@ static bool scale_h_area(void *out_line_ptr, yield(); for (ix = 0; ix < (unsigned int)ctx->src->width; ix++) { - oxe += ctx->bm->width; + oxe += h_o_val; /* end of current area has been reached */ /* fill buffer if needed */ FILL_BUF(part,ctx->store_part,ctx->args); #ifdef HAVE_LCD_COLOR - if (oxe >= (unsigned int)ctx->src->width) + if (oxe >= h_i_val) { /* "reset" error, which now represents partial coverage of next pixel by the next area */ - oxe -= ctx->src->width; + oxe -= h_i_val; +#if defined(CPU_COLDFIRE) +/* Coldfire EMAC math */ /* add saved partial pixel from start of area */ - rgbvalacc.r = rgbvalacc.r * ctx->bm->width + rgbvaltmp.r * mul; - rgbvalacc.g = rgbvalacc.g * ctx->bm->width + rgbvaltmp.g * mul; - rgbvalacc.b = rgbvalacc.b * ctx->bm->width + rgbvaltmp.b * mul; + MAC(rgbvalacc.r, h_o_val, 0); + MAC(rgbvalacc.g, h_o_val, 1); + MAC(rgbvalacc.b, h_o_val, 2); + MAC(rgbvaltmp.r, mul, 0); + MAC(rgbvaltmp.g, mul, 1); + MAC(rgbvaltmp.b, mul, 2); + /* get new pixel , then add its partial coverage to this area */ + mul = h_o_val - oxe; + rgbvaltmp.r = part->buf->red; + rgbvaltmp.g = part->buf->green; + rgbvaltmp.b = part->buf->blue; + MAC(rgbvaltmp.r, mul, 0); + MAC(rgbvaltmp.g, mul, 1); + MAC(rgbvaltmp.b, mul, 2); + MAC_OUT(rgbvalacc.r, 0); + MAC_OUT(rgbvalacc.g, 1); + MAC_OUT(rgbvalacc.b, 2); +#else +/* generic C math */ + /* add saved partial pixel from start of area */ + rgbvalacc.r = rgbvalacc.r * h_o_val + rgbvaltmp.r * mul; + rgbvalacc.g = rgbvalacc.g * h_o_val + rgbvaltmp.g * mul; + rgbvalacc.b = rgbvalacc.b * h_o_val + rgbvaltmp.b * mul; /* get new pixel , then add its partial coverage to this area */ rgbvaltmp.r = 
part->buf->red; rgbvaltmp.g = part->buf->green; rgbvaltmp.b = part->buf->blue; - mul = ctx->bm->width - oxe; + mul = h_o_val - oxe; rgbvalacc.r += rgbvaltmp.r * mul; rgbvalacc.g += rgbvaltmp.g * mul; rgbvalacc.b += rgbvaltmp.b * mul; +#endif /* CPU */ + rgbvalacc.r = (rgbvalacc.r + (1 << 21)) >> 22; + rgbvalacc.g = (rgbvalacc.g + (1 << 21)) >> 22; + rgbvalacc.b = (rgbvalacc.b + (1 << 21)) >> 22; /* store or accumulate to output row */ if (accum) { @@ -200,7 +251,7 @@ static bool scale_h_area(void *out_line_ptr, rgbvalacc.r = 0; rgbvalacc.g = 0; rgbvalacc.b = 0; - mul = ctx->bm->width - mul; + mul = oxe; ox += 1; /* inside an area */ } else { @@ -210,21 +261,45 @@ static bool scale_h_area(void *out_line_ptr, rgbvalacc.b += part->buf->blue; } #else - if (oxe >= (unsigned int)ctx->src->width) + if (oxe >= h_i_val) { /* "reset" error, which now represents partial coverage of next pixel by the next area */ - oxe -= ctx->src->width; - + oxe -= h_i_val; +#if defined(CPU_COLDFIRE) +/* Coldfire EMAC math */ /* add saved partial pixel from start of area */ - acc = MULUQ(acc, ctx->bm->width) + MULUQ(tmp, mul); + MAC(acc, h_o_val, 0); + MAC(tmp, mul, 0); + /* get new pixel , then add its partial coverage to this area */ + tmp = *(part->buf); + mul = h_o_val - oxe; + MAC(tmp, mul, 0); + MAC_OUT(acc, 0); +#elif defined(CPU_SH) +/* SH-1 16x16->32 math */ + /* add saved partial pixel from start of area */ + acc = mul_u16_u16(acc, h_o_val) + mul_u16_u16(tmp, mul); /* get new pixel , then add its partial coverage to this area */ tmp = *(part->buf); - mul = ctx->bm->width - oxe; - acc += MULUQ(tmp, mul); + mul = h_o_val - oxe; + acc += mul_u16_u16(tmp, mul); +#else +/* generic C math */ + /* add saved partial pixel from start of area */ + acc = (acc * h_o_val) + (tmp * mul); + + /* get new pixel , then add its partial coverage to this area */ + tmp = *(part->buf); + mul = h_o_val - oxe; + acc += tmp * mul; +#endif /* CPU */ +#if !(defined(CPU_SH) || defined(TEST_SH_MATH)) /* round, 
divide, and either store or accumulate to output row */ + acc = (acc + (1 << 21)) >> 22; +#endif if (accum) { acc += out_line[ox]; @@ -232,7 +307,7 @@ static bool scale_h_area(void *out_line_ptr, out_line[ox] = acc; /* reset accumulator */ acc = 0; - mul = ctx->bm->width - mul; + mul = oxe; ox += 1; /* inside an area */ } else { @@ -249,56 +324,56 @@ static bool scale_h_area(void *out_line_ptr, /* vertical area average scaler */ static inline bool scale_v_area(struct rowset *rset, struct scaler_context *ctx) { - uint32_t mul, x, oy, iy, oye; + uint32_t mul, oy, iy, oye; +#if defined(CPU_SH) || defined (TEST_SH_MATH) + const uint32_t v_i_val = ctx->src->height, + v_o_val = ctx->bm->height; +#else + const uint32_t v_i_val = ctx->v_i_val, + v_o_val = ctx->v_o_val; +#endif /* Set up rounding and scale factors */ - ctx->divisor *= ctx->src->height; - ctx->round = ctx->divisor >> 1; - ctx->divisor = 1 + (-((ctx->divisor + 1) >> 1)) / ctx->divisor; mul = 0; oy = rset->rowstart; oye = 0; #ifdef HAVE_LCD_COLOR uint32_t *rowacc = (uint32_t *) ctx->buf, - *rowtmp = rowacc + 3 * ctx->bm->width; + *rowtmp = rowacc + 3 * ctx->bm->width, + *rowacc_px, *rowtmp_px; memset((void *)ctx->buf, 0, ctx->bm->width * 2 * sizeof(struct uint32_rgb)); #else uint32_t *rowacc = (uint32_t *) ctx->buf, - *rowtmp = rowacc + ctx->bm->width; + *rowtmp = rowacc + ctx->bm->width, + *rowacc_px, *rowtmp_px; memset((void *)ctx->buf, 0, ctx->bm->width * 2 * sizeof(uint32_t)); #endif SDEBUGF("scale_v_area\n"); /* zero the accumulator and temp rows */ for (iy = 0; iy < (unsigned int)ctx->src->height; iy++) { - oye += ctx->bm->height; + oye += v_o_val; /* end of current area has been reached */ - if (oye >= (unsigned int)ctx->src->height) + if (oye >= v_i_val) { /* "reset" error, which now represents partial coverage of the next row by the next area */ - oye -= ctx->src->height; + oye -= v_i_val; /* add stored partial row to accumulator */ -#ifdef HAVE_LCD_COLOR - for (x = 0; x < 3 * (unsigned 
int)ctx->bm->width; x++) -#else - for (x = 0; x < (unsigned int)ctx->bm->width; x++) -#endif - rowacc[x] = rowacc[x] * ctx->bm->height + mul * rowtmp[x]; + for(rowacc_px = rowacc, rowtmp_px = rowtmp; rowacc_px != rowtmp; + rowacc_px++, rowtmp_px++) + *rowacc_px = *rowacc_px * v_o_val + *rowtmp_px * mul; /* store new scaled row in temp row */ if(!ctx->h_scaler(rowtmp, ctx, false)) return false; /* add partial coverage by new row to this area, then round and scale to final value */ - mul = ctx->bm->height - oye; -#ifdef HAVE_LCD_COLOR - for (x = 0; x < 3 * (unsigned int)ctx->bm->width; x++) -#else - for (x = 0; x < (unsigned int)ctx->bm->width; x++) -#endif - rowacc[x] += mul * rowtmp[x]; + mul = v_o_val - oye; + for(rowacc_px = rowacc, rowtmp_px = rowtmp; rowacc_px != rowtmp; + rowacc_px++, rowtmp_px++) + *rowacc_px += mul * *rowtmp_px; ctx->output_row(oy, (void*)rowacc, ctx); /* clear accumulator row, store partial coverage for next row */ #ifdef HAVE_LCD_COLOR @@ -319,20 +394,18 @@ static inline bool scale_v_area(struct rowset *rset, struct scaler_context *ctx) } #ifdef HAVE_UPSCALER -/* Set up rounding and scale factors for the horizontal scaler. The divisor - is bm->width - 1, so that the first and last pixels in the row align - exactly between input and output -*/ -static inline void scale_h_linear_setup(struct scaler_context *ctx) -{ - ctx->divisor = ctx->bm->width - 1; -} - /* horizontal linear scaler */ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx, bool accum) { unsigned int ix, ox, ixe; +#if defined(CPU_SH) || defined (TEST_SH_MATH) + const uint32_t h_i_val = ctx->src->width - 1, + h_o_val = ctx->bm->width - 1; +#else + const uint32_t h_i_val = ctx->h_i_val, + h_o_val = ctx->h_o_val; +#endif /* type x = x is an ugly hack for hiding an unitialized data warning. The values are conditionally initialized before use, but other values are set such that this will occur before these are used. 
@@ -348,27 +421,35 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx, FILL_BUF_INIT(part,ctx->store_part,ctx->args); ix = 0; /* The error is set so that values are initialized on the first pass. */ - ixe = ctx->bm->width - 1; + ixe = h_o_val; /* give other tasks a chance to run */ yield(); for (ox = 0; ox < (uint32_t)ctx->bm->width; ox++) { #ifdef HAVE_LCD_COLOR - if (ixe >= ((uint32_t)ctx->bm->width - 1)) + if (ixe >= h_o_val) { /* Store the new "current" pixel value in rgbval, and the color step value in rgbinc. */ - ixe -= (ctx->bm->width - 1); + ixe -= h_o_val; rgbinc.r = -(part->buf->red); rgbinc.g = -(part->buf->green); rgbinc.b = -(part->buf->blue); - rgbval.r = (part->buf->red) * (ctx->bm->width - 1); - rgbval.g = (part->buf->green) * (ctx->bm->width - 1); - rgbval.b = (part->buf->blue) * (ctx->bm->width - 1); +#if defined(CPU_COLDFIRE) +/* Coldfire EMAC math */ + MAC(part->buf->red, h_o_val, 0); + MAC(part->buf->green, h_o_val, 1); + MAC(part->buf->blue, h_o_val, 2); +#else +/* generic C math */ + rgbval.r = (part->buf->red) * h_o_val; + rgbval.g = (part->buf->green) * h_o_val; + rgbval.b = (part->buf->blue) * h_o_val; +#endif /* CPU */ ix += 1; /* If this wasn't the last pixel, add the next one to rgbinc. 
*/ - if (ix < (uint32_t)ctx->src->width) { + if (LIKELY(ix < (uint32_t)ctx->src->width)) { part->buf++; part->len--; /* Fetch new pixels if needed */ @@ -379,14 +460,28 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx, /* Add a partial step to rgbval, in this pixel isn't precisely aligned with the new source pixel */ +#if defined(CPU_COLDFIRE) +/* Coldfire EMAC math */ + MAC(rgbinc.r, ixe, 0); + MAC(rgbinc.g, ixe, 1); + MAC(rgbinc.b, ixe, 2); +#else +/* generic C math */ rgbval.r += rgbinc.r * ixe; rgbval.g += rgbinc.g * ixe; rgbval.b += rgbinc.b * ixe; +#endif } - /* Now multiple the color increment to its proper value */ - rgbinc.r *= ctx->src->width - 1; - rgbinc.g *= ctx->src->width - 1; - rgbinc.b *= ctx->src->width - 1; +#if defined(CPU_COLDFIRE) +/* get final EMAC result out of ACC registers */ + MAC_OUT(rgbval.r, 0); + MAC_OUT(rgbval.g, 1); + MAC_OUT(rgbval.b, 2); +#endif + /* Now multiply the color increment to its proper value */ + rgbinc.r *= h_i_val; + rgbinc.g *= h_i_val; + rgbinc.b *= h_i_val; } else { rgbval.r += rgbinc.r; rgbval.g += rgbinc.g; @@ -395,27 +490,36 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx, /* round and scale values, and accumulate or store to output */ if (accum) { - out_line[ox].r += rgbval.r; - out_line[ox].g += rgbval.g; - out_line[ox].b += rgbval.b; + out_line[ox].r += (rgbval.r + (1 << 21)) >> 22; + out_line[ox].g += (rgbval.g + (1 << 21)) >> 22; + out_line[ox].b += (rgbval.b + (1 << 21)) >> 22; } else { - out_line[ox].r = rgbval.r; - out_line[ox].g = rgbval.g; - out_line[ox].b = rgbval.b; + out_line[ox].r = (rgbval.r + (1 << 21)) >> 22; + out_line[ox].g = (rgbval.g + (1 << 21)) >> 22; + out_line[ox].b = (rgbval.b + (1 << 21)) >> 22; } #else - if (ixe >= ((uint32_t)ctx->bm->width - 1)) + if (ixe >= h_o_val) { /* Store the new "current" pixel value in rgbval, and the color step value in rgbinc. 
*/ - ixe -= (ctx->bm->width - 1); + ixe -= h_o_val; val = *(part->buf); inc = -val; - val = MULUQ(val, ctx->bm->width - 1); +#if defined(CPU_COLDFIRE) +/* Coldfire EMAC math */ + MAC(val, h_o_val, 0); +#elif defined(CPU_SH) +/* SH-1 16x16->32 math */ + val = mul_u16_u16(val, h_o_val); +#else +/* generic C math */ + val = val * h_o_val; +#endif ix += 1; /* If this wasn't the last pixel, add the next one to rgbinc. */ - if (ix < (uint32_t)ctx->src->width) { + if (LIKELY(ix < (uint32_t)ctx->src->width)) { part->buf++; part->len--; /* Fetch new pixels if needed */ @@ -424,12 +528,40 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx, /* Add a partial step to rgbval, in this pixel isn't precisely aligned with the new source pixel */ - val += MULQ(inc, ixe); +#if defined(CPU_COLDFIRE) +/* Coldfire EMAC math */ + MAC(inc, ixe, 0); +#elif defined(CPU_SH) +/* SH-1 16x16->32 math */ + val += mul_s16_s16(inc, ixe); +#else +/* generic C math */ + val += inc * ixe; +#endif } +#if defined(CPU_COLDFIRE) +/* get final EMAC result out of ACC register */ + MAC_OUT(val, 0); +#endif /* Now multiply the color increment to its proper value */ - inc = MULQ(inc, ctx->src->width - 1); +#if defined(CPU_SH) +/* SH-1 16x16->32 math */ + inc = mul_s16_s16(inc, h_i_val); +#else +/* generic C math */ + inc *= h_i_val; +#endif } else val += inc; +#if !(defined(CPU_SH) || defined(TEST_SH_MATH)) + /* round and scale values, and accumulate or store to output */ + if (accum) + { + out_line[ox] += (val + (1 << 21)) >> 22; + } else { + out_line[ox] = (val + (1 << 21)) >> 22; + } +#else /* round and scale values, and accumulate or store to output */ if (accum) { @@ -438,7 +570,8 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx, out_line[ox] = val; } #endif - ixe += ctx->src->width - 1; +#endif + ixe += h_i_val; } return true; } @@ -447,71 +580,66 @@ static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx, static inline bool 
scale_v_linear(struct rowset *rset, struct scaler_context *ctx) { - uint32_t mul, x, iy, iye; + uint32_t mul, iy, iye; int32_t oy; - /* Set up scale and rounding factors, the divisor is bm->height - 1 */ - ctx->divisor *= (ctx->bm->height - 1); - ctx->round = ctx->divisor >> 1; - ctx->divisor = 1 + (-((ctx->divisor + 1) >> 1)) / ctx->divisor; - /* Set up our two temp buffers. The names are generic because they'll be - swapped each time a new input row is read +#if defined(CPU_SH) || defined (TEST_SH_MATH) + const uint32_t v_i_val = ctx->src->height - 1, + v_o_val = ctx->bm->height - 1; +#else + const uint32_t v_i_val = ctx->v_i_val, + v_o_val = ctx->v_o_val; +#endif + /* Set up our buffers, to store the increment and current value for each + column, and one temp buffer used to read in new rows. */ #ifdef HAVE_LCD_COLOR uint32_t *rowinc = (uint32_t *)(ctx->buf), *rowval = rowinc + 3 * ctx->bm->width, - *rowtmp = rowval + 3 * ctx->bm->width; + *rowtmp = rowval + 3 * ctx->bm->width, #else uint32_t *rowinc = (uint32_t *)(ctx->buf), *rowval = rowinc + ctx->bm->width, - *rowtmp = rowval + ctx->bm->width; + *rowtmp = rowval + ctx->bm->width, #endif + *rowinc_px, *rowval_px, *rowtmp_px; SDEBUGF("scale_v_linear\n"); mul = 0; iy = 0; - iye = ctx->bm->height - 1; + iye = v_o_val; /* get first scaled row in rowtmp */ if(!ctx->h_scaler((void*)rowtmp, ctx, false)) return false; for (oy = rset->rowstart; oy != rset->rowstop; oy += rset->rowstep) { - if (iye >= (uint32_t)ctx->bm->height - 1) + if (iye >= v_o_val) { - iye -= ctx->bm->height - 1; + iye -= v_o_val; iy += 1; -#ifdef HAVE_LCD_COLOR - for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++) -#else - for (x = 0; x < (uint32_t)ctx->bm->width; x++) -#endif + for(rowinc_px = rowinc, rowtmp_px = rowtmp, rowval_px = rowval; + rowinc_px < rowval; rowinc_px++, rowtmp_px++, rowval_px++) { - rowinc[x] = -rowtmp[x]; - rowval[x] = rowtmp[x] * (ctx->bm->height - 1); + *rowinc_px = -*rowtmp_px; + *rowval_px = *rowtmp_px * v_o_val; } if (iy 
< (uint32_t)ctx->src->height) { if (!ctx->h_scaler((void*)rowtmp, ctx, false)) return false; -#ifdef HAVE_LCD_COLOR - for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++) -#else - for (x = 0; x < (uint32_t)ctx->bm->width; x++) -#endif + for(rowinc_px = rowinc, rowtmp_px = rowtmp, rowval_px = rowval; + rowinc_px < rowval; rowinc_px++, rowtmp_px++, rowval_px++) { - rowinc[x] += rowtmp[x]; - rowval[x] += rowinc[x] * iye; - rowinc[x] *= ctx->src->height - 1; + *rowinc_px += *rowtmp_px; + *rowval_px += *rowinc_px * iye; + *rowinc_px *= v_i_val; } } } else -#ifdef HAVE_LCD_COLOR - for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++) -#else - for (x = 0; x < (uint32_t)ctx->bm->width; x++) -#endif - rowval[x] += rowinc[x]; + for(rowinc_px = rowinc, rowval_px = rowval; rowinc_px < rowval; + rowinc_px++, rowval_px++) + *rowval_px += *rowinc_px; ctx->output_row(oy, (void*)rowval, ctx); - iye += ctx->src->height - 1; + iye += v_i_val; } return true; } @@ -533,9 +661,9 @@ static void output_row_32_native_fromyuv(uint32_t row, void * row_in, for (col = 0; col < ctx->bm->width; col++) { if (ctx->dither) delta = DITHERXDY(col,dy); - y = SC_MUL(qp->b + ctx->round, ctx->divisor); - u = SC_MUL(qp->g + ctx->round, ctx->divisor); - v = SC_MUL(qp->r + ctx->round, ctx->divisor); + y = SC_OUT(qp->b, ctx); + u = SC_OUT(qp->g, ctx); + v = SC_OUT(qp->r, ctx); qp++; yuv_to_rgb(y, u, v, &r, &g, &b); r = (31 * r + (r >> 3) + delta) >> 8; @@ -571,7 +699,7 @@ static void output_row_32_native(uint32_t row, void * row_in, for (col = 0; col < ctx->bm->width; col++) { if (ctx->dither) delta = DITHERXDY(col,dy); - bright = SC_MUL((*qp++) + ctx->round,ctx->divisor); + bright = SC_OUT(*qp++, ctx); bright = (3 * bright + (bright >> 6) + delta) >> 8; data |= (~bright & 3) << shift; shift -= 2; @@ -594,7 +722,7 @@ static void output_row_32_native(uint32_t row, void * row_in, for (col = 0; col < ctx->bm->width; col++) { if (ctx->dither) delta = DITHERXDY(col,dy); - bright = SC_MUL((*qp++) + ctx->round, 
ctx->divisor); + bright = SC_OUT(*qp++, ctx); bright = (3 * bright + (bright >> 6) + delta) >> 8; *dest++ |= (~bright & 3) << shift; } @@ -609,7 +737,7 @@ static void output_row_32_native(uint32_t row, void * row_in, for (col = 0; col < ctx->bm->width; col++) { if (ctx->dither) delta = DITHERXDY(col,dy); - bright = SC_MUL((*qp++) + ctx->round, ctx->divisor); + bright = SC_OUT(*qp++, ctx); bright = (3 * bright + (bright >> 6) + delta) >> 8; *dest++ |= vi_pattern[bright] << shift; } @@ -625,9 +753,9 @@ static void output_row_32_native(uint32_t row, void * row_in, if (ctx->dither) delta = DITHERXDY(col,dy); q0 = *qp++; - r = SC_MUL(q0.r + ctx->round, ctx->divisor); - g = SC_MUL(q0.g + ctx->round, ctx->divisor); - b = SC_MUL(q0.b + ctx->round, ctx->divisor); + r = SC_OUT(q0.r, ctx); + g = SC_OUT(q0.g, ctx); + b = SC_OUT(q0.b, ctx); r = (31 * r + (r >> 3) + delta) >> 8; g = (63 * g + (g >> 2) + delta) >> 8; b = (31 * b + (b >> 3) + delta) >> 8; @@ -664,13 +792,10 @@ int resize_on_load(struct bitmap *bm, bool dither, struct dim *src, struct img_part* (*store_part)(void *args), void *args) { - -#ifdef HAVE_UPSCALER const int sw = src->width; const int sh = src->height; const int dw = bm->width; const int dh = bm->height; -#endif int ret; #ifdef HAVE_LCD_COLOR unsigned int needed = sizeof(struct uint32_rgb) * 3 * bm->width; @@ -721,6 +846,9 @@ int resize_on_load(struct bitmap *bm, bool dither, struct dim *src, ctx.bm = bm; ctx.src = src; ctx.dither = dither; +#if defined(CPU_SH) || defined (TEST_SH_MATH) + uint32_t div; +#endif #if !defined(PLUGIN) #if defined(HAVE_LCD_COLOR) && defined(HAVE_JPEG) ctx.output_row = format_index ? 
output_row_32_native_fromyuv @@ -740,23 +868,56 @@ int resize_on_load(struct bitmap *bm, bool dither, struct dim *src, { #endif ctx.h_scaler = scale_h_area; - scale_h_area_setup(&ctx); +#if defined(CPU_SH) || defined (TEST_SH_MATH) + div = sw; +#else + uint32_t h_div = (1U << 24) / sw; + ctx.h_i_val = sw * h_div; + ctx.h_o_val = dw * h_div; +#endif #ifdef HAVE_UPSCALER } else { ctx.h_scaler = scale_h_linear; - scale_h_linear_setup(&ctx); +#if defined(CPU_SH) || defined (TEST_SH_MATH) + div = dw - 1; +#else + uint32_t h_div = (1U << 24) / (dw - 1); + ctx.h_i_val = (sw - 1) * h_div; + ctx.h_o_val = (dw - 1) * h_div; +#endif } #endif - SC_MUL_INIT; +#ifdef CPU_COLDFIRE + coldfire_set_macsr(EMAC_UNSIGNED); +#endif #ifdef HAVE_UPSCALER if (sh > dh) +#endif + { +#if defined(CPU_SH) || defined (TEST_SH_MATH) + div *= sh; + ctx.recip = ((uint32_t)(-div)) / div + 1; +#else + uint32_t v_div = (1U << 22) / sh; + ctx.v_i_val = sh * v_div; + ctx.v_o_val = dh * v_div; #endif ret = scale_v_area(rset, &ctx); + } #ifdef HAVE_UPSCALER else - ret = scale_v_linear(rset, &ctx); + { +#if defined(CPU_SH) || defined (TEST_SH_MATH) + div *= dh - 1; + ctx.recip = ((uint32_t)(-div)) / div + 1; +#else + uint32_t v_div = (1U << 22) / dh; + ctx.v_i_val = (sh - 1) * v_div; + ctx.v_o_val = (dh - 1) * v_div; +#endif + ret = scale_v_linear(rset, &ctx); + } #endif - SC_MUL_END; #ifdef HAVE_ADJUSTABLE_CPU_FREQ cpu_boost(false); #endif diff --git a/apps/recorder/resize.h b/apps/recorder/resize.h index 2964fcd2a9..ef32066a0d 100644 --- a/apps/recorder/resize.h +++ b/apps/recorder/resize.h @@ -43,67 +43,61 @@ #define MAX_SC_STACK_ALLOC 0 #define HAVE_UPSCALER 1 -#if defined(CPU_COLDFIRE) -#define SC_MUL_INIT \ - unsigned long macsr_st = coldfire_get_macsr(); \ - coldfire_set_macsr(EMAC_UNSIGNED); -#define SC_MUL_END coldfire_set_macsr(macsr_st); -#define SC_MUL(x, y) \ -({ \ - unsigned long t; \ - asm ("mac.l %[a], %[b], %%acc0\n\t" \ - "move.l %%accext01, %[t]\n\t" \ - "move.l #0, %%acc0\n\t" \ - : [t] 
"=r" (t) : [a] "r" (x), [b] "r" (y)); \ - t; \ -}) -#elif (CONFIG_CPU == SH7034) -/* multiply two unsigned 32 bit values and return the top 32 bit - * of the 64 bit result */ -static inline unsigned sc_mul32(unsigned a, unsigned b) +#if defined(CPU_SH) +/* perform 32x32->40 unsigned multiply, round off and return top 8 bits */ +static inline uint32_t sc_mul_u32_rnd(uint32_t m, uint32_t n) { unsigned r, t1, t2, t3; - + unsigned h = 1 << 15; + /* notation: + m = ab, n = cd + final result is (((a *c) << 32) + ((b * c + a * d) << 16) + b * d + + (1 << 31)) >> 32 + */ asm ( - "swap.w %[a], %[t1] \n" /* t1 = ba */ - "mulu %[t1], %[b] \n" /* a * d */ - "swap.w %[b], %[t3] \n" /* t3 = dc */ - "sts macl, %[t2] \n" /* t2 = a * d */ - "mulu %[t1], %[t3] \n" /* a * c */ - "sts macl, %[r] \n" /* hi = a * c */ - "mulu %[a], %[t3] \n" /* b * c */ - "clrt \n" - "sts macl, %[t3] \n" /* t3 = b * c */ - "addc %[t2], %[t3] \n" /* t3 += t2, carry -> t2 */ - "movt %[t2] \n" - "mulu %[a], %[b] \n" /* b * d */ - "mov %[t3], %[t1] \n" /* t1t3 = t2t3 << 16 */ - "xtrct %[t2], %[t1] \n" - "shll16 %[t3] \n" - "sts macl, %[t2] \n" /* lo = b * d */ - "clrt \n" /* hi.lo += t1t3 */ - "addc %[t3], %[t2] \n" - "addc %[t1], %[r] \n" + "swap.w %[m], %[t1]\n\t" /* t1 = ba */ + "mulu %[m], %[n]\n\t" /* b * d */ + "swap.w %[n], %[t3]\n\t" /* t3 = dc */ + "sts macl, %[r]\n\t" /* r = b * d */ + "mulu %[m], %[t3]\n\t" /* b * c */ + "shlr16 %[r]\n\t" + "sts macl, %[t2]\n\t" /* t2 = b * c */ + "mulu %[t1], %[t3]\n\t" /* a * c */ + "add %[t2], %[r]\n\t" + "sts macl, %[t3]\n\t" /* t3 = a * c */ + "mulu %[t1], %[n]\n\t" /* a * d */ + "shll16 %[t3]\n\t" + "sts macl, %[t2]\n\t" /* t2 = a * d */ + "add %[t2], %[r]\n\t" + "add %[t3], %[r]\n\t" /* r = ((b * d) >> 16) + (b * c + a * d) + + ((a * c) << 16) */ + "add %[h], %[r]\n\t" /* round result */ + "shlr16 %[r]\n\t" /* truncate result */ : /* outputs */ [r] "=&r"(r), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3) : /* inputs */ - [a] "r" (a), - [b] "r" (b) + [h] "r" 
(h), + [m] "r" (m), + [n] "r" (n) ); return r; } -#define SC_MUL(x, y) sc_mul32(x, y) -#define SC_MUL_INIT -#define SC_MUL_END +#elif defined(TEST_SH_MATH) +static inline uint32_t sc_mul_u32_rnd(uint32_t op1, uint32_t op2) +{ + uint64_t tmp = (uint64_t)op1 * op2; + tmp += 1LU << 31; + tmp >>= 32; + return tmp; +} +#else +#define SC_OUT(n, c) (((n) + (1 << 23)) >> 24) #endif - -#ifndef SC_MUL -#define SC_MUL(x, y) ((x) * (uint64_t)(y) >> 32) -#define SC_MUL_INIT -#define SC_MUL_END +#ifndef SC_OUT +#define SC_OUT(n, c) (sc_mul_u32_rnd(n, (c)->recip)) #endif struct img_part { @@ -130,8 +124,14 @@ struct uint32_rgb { horizontal scaler, and row output */ struct scaler_context { - uint32_t divisor; - uint32_t round; +#if defined(CPU_SH) || defined(TEST_SH_MATH) + uint32_t recip; +#else + uint32_t h_i_val; + uint32_t h_o_val; + uint32_t v_i_val; + uint32_t v_o_val; +#endif struct bitmap *bm; struct dim *src; unsigned char *buf;