rockbox/firmware/common/unicode.c
Dominik Riebeling 2d9c0bab54 Add support for cp1252 (Western European) codepage.
In Europe Windows defaults to its own codepage cp1252 (also known as "WinLatin"
or "Windows-1252"). cp1252 adds some characters to ISO-8859-1.

Some mp3 tagging software on Windows uses cp1252 instead of ISO-8859-1. This
violates the ID3 specification, which requires tags to be ISO-8859-1 or
Unicode. However, similar violations are made for other codepages and supported
by Rockbox using the "Default Codepage" setting. Add support for cp1252 to
enable people using such broken tools to override the correct decoding to get
their tags displayed properly.

Change-Id: I9f2ec478afe2503e99ee8e6609416c92b0f453e0
Reviewed-on: http://gerrit.rockbox.org/209
Reviewed-by: Jens Arnold <amiconn@rockbox.org>
Tested-by: Jens Arnold <amiconn@rockbox.org>
2012-05-19 01:42:53 +02:00

397 lines
10 KiB
C

/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* Copyright (c) 2004,2005 by Marcoen Hirschberg
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
/* Some conversion functions for handling UTF-8
*
* I got all the info from:
* http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
* and
* http://en.wikipedia.org/wiki/Unicode
*/
#include <stdio.h>
#include "config.h"
#include "file.h"
#include "debug.h"
#include "rbunicode.h"
#include "rbpaths.h"
#ifndef O_BINARY
#define O_BINARY 0
#endif
/* Codepage index iso_decode() uses when it is called with cp == -1;
 * changed through set_codepage(). */
static int default_codepage = 0;
/* Number (1-based, as stored in cp_2_table) of the table currently held in
 * codepage_table; 0 means no table is loaded. Maintained by load_cp_table(). */
static int loaded_cp_table = 0;
#ifdef HAVE_LCD_BITMAP
/* Capacity of codepage_table, counted in 16-bit entries */
#define MAX_CP_TABLE_SIZE 32768
/* Number of codepage table files listed in filename[] */
#define NUM_TABLES 5
/* Codepage table files. load_cp_table() opens filename[table-1], where
 * `table` is the 1-based table number taken from cp_2_table. */
static const char * const filename[NUM_TABLES] =
{
CODEPAGE_DIR"/iso.cp",
CODEPAGE_DIR"/932.cp", /* SJIS */
CODEPAGE_DIR"/936.cp", /* GB2312 */
CODEPAGE_DIR"/949.cp", /* KSX1001 */
CODEPAGE_DIR"/950.cp" /* BIG5 */
};
/* Table number for each codepage index. 0 means load_cp_table() has nothing
 * to load for that codepage. All codepages mapped to table 1 share one file;
 * iso_decode() selects a codepage's slice with (cp-1)*128. */
static const char cp_2_table[NUM_CODEPAGES] =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 0
};
/* Names indexed by codepage index. The extra last entry is what
 * get_codepage_name() returns for an out-of-range index.
 * NOTE(review): the order presumably has to match the codepage enum in
 * rbunicode.h - confirm when adding entries. */
static const char * const name_codepages[NUM_CODEPAGES+1] =
{
"ISO-8859-1",
"ISO-8859-7",
"ISO-8859-8",
"CP1251",
"ISO-8859-11",
"CP1256",
"ISO-8859-9",
"ISO-8859-2",
"CP1250",
"CP1252",
"SJIS",
"GB-2312",
"KSX-1001",
"BIG5",
"UTF-8",
"unknown"
};
#else /* !HAVE_LCD_BITMAP, reduced support */
/* Capacity of codepage_table, counted in 16-bit entries */
#define MAX_CP_TABLE_SIZE 768
/* Number of codepage table files listed in filename[] */
#define NUM_TABLES 1
/* Codepage table files. load_cp_table() opens filename[table-1], where
 * `table` is the 1-based table number taken from cp_2_table. */
static const char * const filename[NUM_TABLES] = {
CODEPAGE_DIR"/isomini.cp"
};
/* Table number for each codepage index. 0 means load_cp_table() has nothing
 * to load for that codepage. */
static const char cp_2_table[NUM_CODEPAGES] =
{
0, 1, 1, 1, 1, 1, 1, 0
};
/* Names indexed by codepage index. The extra last entry is what
 * get_codepage_name() returns for an out-of-range index.
 * NOTE(review): the order presumably has to match the codepage enum in
 * rbunicode.h - confirm when adding entries. */
static const char * const name_codepages[NUM_CODEPAGES+1] =
{
"ISO-8859-1",
"ISO-8859-7",
"CP1251",
"ISO-8859-9",
"ISO-8859-2",
"CP1250",
"CP1252",
"UTF-8",
"unknown"
};
#endif
/* Conversion table of the most recently loaded codepage file. Filled by
 * load_cp_table(); iso_decode() looks up the 16-bit value it then hands to
 * utf8encode(). */
static unsigned short codepage_table[MAX_CP_TABLE_SIZE];
/* Marker bits OR-ed into the first byte of an encoded character by
 * utf8encode(), indexed by the number of continuation bytes that follow. */
static const unsigned char utf8comp[6] =
{
0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};
/* Load codepage file into memory */
static int load_cp_table(int cp)
{
int i = 0;
int table = cp_2_table[cp];
int file, tablesize;
unsigned char tmp[2];
if (table == 0 || table == loaded_cp_table)
return 1;
file = open(filename[table-1], O_RDONLY|O_BINARY);
if (file < 0) {
DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]);
return 0;
}
tablesize = filesize(file) / 2;
if (tablesize > MAX_CP_TABLE_SIZE) {
DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]);
close(file);
return 0;
}
while (i < tablesize) {
if (!read(file, tmp, 2)) {
DEBUGF("Can't read from codepage file: %s.cp\n",
filename[table-1]);
loaded_cp_table = 0;
return 0;
}
codepage_table[i++] = (tmp[1] << 8) | tmp[0];
}
loaded_cp_table = table;
close(file);
return 1;
}
/* Write the UTF-8 encoding of `ucs` to `utf8`.
 *
 * Returns a pointer to the byte just after the encoded character. No
 * terminator is written.
 */
unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
{
    int trailing = 0;   /* number of continuation bytes */
    int shift;

    /* Values up to 0x7F are a single byte; otherwise count how many
       continuation bytes are needed to hold all significant bits. */
    if (ucs >= 0x80) {
        for (; (ucs >> (5*trailing + 6)) != 0; trailing++)
            ;
    }

    /* First byte: length marker plus the topmost payload bits */
    shift = 6 * trailing;
    *utf8++ = (ucs >> shift) | utf8comp[trailing];

    /* Continuation bytes: six payload bits each, most significant first */
    for (shift -= 6; shift >= 0; shift -= 6)
        *utf8++ = ((ucs >> shift) & (MASK ^ 0xFF)) | COMP;

    return utf8;
}
/* Convert `count` bytes of codepage-encoded text at `iso` to UTF-8.
 *
 * iso:   input bytes
 * utf8:  output buffer
 * cp:    codepage index, or -1 to use the one set with set_codepage()
 * count: number of input bytes to convert
 *
 * Returns a pointer just past the last byte written to `utf8`; no
 * terminator is written. If the table for `cp` cannot be loaded, codepage 0
 * is used instead. Characters whose table entry is 0 are emitted as U+FFFD.
 */
unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
                          int cp, int count)
{
    unsigned short code, index;

    if (cp == -1) /* use default codepage */
        cp = default_codepage;

    if (load_cp_table(cp) == 0)
        cp = 0;

    while (count-- != 0) {
        /* 7-bit bytes, and every byte of UTF-8 input, are copied as is */
        if (cp == UTF_8 || *iso < 128) {
            *utf8++ = *iso++;
            continue;
        }

        /* cp tells us which codepage to convert from */
        switch (cp) {
            case ISO_8859_7:  /* Greek */
            case WIN_1252:    /* Western European */
            case WIN_1251:    /* Cyrillic */
            case ISO_8859_9:  /* Turkish */
            case ISO_8859_2:  /* Latin Extended */
            case WIN_1250:    /* Central European */
#ifdef HAVE_LCD_BITMAP
            case ISO_8859_8:  /* Hebrew */
            case ISO_8859_11: /* Thai */
            case WIN_1256:    /* Arabic */
#endif
                /* Each of these codepages owns a 128-entry slice of the
                   shared table, covering bytes 0x80..0xFF */
                index = ((cp-1)*128) + (*iso++ - 128);
                code = codepage_table[index];
                break;

#ifdef HAVE_LCD_BITMAP
            case SJIS: /* Japanese */
                /* Bytes 0xA1..0xDF stand alone and have their own
                   range in the table */
                if (*iso > 0xA0 && *iso < 0xE0) {
                    index = *iso++ | (0xA100 - 0x8000);
                    code = codepage_table[index];
                    break;
                }
                /* fall through: other SJIS bytes start a two-byte char */

            case GB_2312:  /* Simplified Chinese */
            case KSX_1001: /* Korean */
            case BIG_5:    /* Traditional Chinese */
                /* No second byte available: pass the lone byte through */
                if (count < 1 || !iso[1]) {
                    code = *iso++;
                    break;
                }

                /* we assume all cjk strings are written
                   in big endian order */
                index = (unsigned short)(((iso[0] << 8) | iso[1]) - 0x8000);
                iso += 2;
                code = codepage_table[index];
                count--; /* the second byte has been consumed as well */
                break;
#endif /* HAVE_LCD_BITMAP */

            default:
                code = *iso++;
                break;
        }

        if (code == 0) /* unknown char, use replacement char */
            code = 0xfffd;

        utf8 = utf8encode(code, utf8);
    }

    return utf8;
}
/* Recode a UTF-16 string with little-endian byte ordering to UTF-8.
 *
 * utf16: input, `count` 16-bit code units stored low byte first
 * utf8:  output buffer; each code unit produces at most 3 bytes
 * count: number of UTF-16 code units to convert
 *
 * Returns a pointer just past the last byte written to `utf8`; no
 * terminator is written.
 *
 * A high surrogate (D800..DBFF) directly followed by a low surrogate
 * (DC00..DFFF) is combined into one character. Any other surrogate -
 * a lone low one, a high one followed by something else, or a high one in
 * the very last code unit - is replaced by U+FFFD, and only that one code
 * unit is consumed, so nothing beyond `count` units is ever read.
 */
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;
    unsigned long low;

    while (count > 0) {
        ucs = utf16[0] | (utf16[1] << 8);

        if (ucs >= 0xD800 && ucs < 0xE000) {
            /* Only look at the next unit if it exists and this one is a
               high surrogate */
            low = 0;
            if (ucs < 0xDC00 && count >= 2)
                low = utf16[2] | (utf16[3] << 8);

            if (low >= 0xDC00 && low < 0xE000) {
                ucs = 0x10000 + (((ucs - 0xD800) << 10) | (low - 0xDC00));
                utf16 += 4;
                count -= 2;
            } else {
                ucs = 0xfffd; /* unpaired surrogate */
                utf16 += 2;
                count -= 1;
            }
        } else {
            utf16 += 2;
            count -= 1;
        }

        utf8 = utf8encode(ucs, utf8);
    }

    return utf8;
}
/* Recode a UTF-16 string with big-endian byte ordering to UTF-8.
 *
 * utf16: input, `count` 16-bit code units stored high byte first
 * utf8:  output buffer; each code unit produces at most 3 bytes
 * count: number of UTF-16 code units to convert
 *
 * Returns a pointer just past the last byte written to `utf8`; no
 * terminator is written.
 *
 * A high surrogate (D800..DBFF) directly followed by a low surrogate
 * (DC00..DFFF) is combined into one character. Any other surrogate -
 * a lone low one, a high one followed by something else, or a high one in
 * the very last code unit - is replaced by U+FFFD, and only that one code
 * unit is consumed, so nothing beyond `count` units is ever read.
 */
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
                             int count)
{
    unsigned long ucs;
    unsigned long low;

    while (count > 0) {
        ucs = (utf16[0] << 8) | utf16[1];

        if (ucs >= 0xD800 && ucs < 0xE000) {
            /* Only look at the next unit if it exists and this one is a
               high surrogate */
            low = 0;
            if (ucs < 0xDC00 && count >= 2)
                low = (utf16[2] << 8) | utf16[3];

            if (low >= 0xDC00 && low < 0xE000) {
                ucs = 0x10000 + (((ucs - 0xD800) << 10) | (low - 0xDC00));
                utf16 += 4;
                count -= 2;
            } else {
                ucs = 0xfffd; /* unpaired surrogate */
                utf16 += 2;
                count -= 1;
            }
        } else {
            utf16 += 2;
            count -= 1;
        }

        utf8 = utf8encode(ucs, utf8);
    }

    return utf8;
}
#if 0 /* currently unused */
/* Recode a UTF-16 string of either byte order to UTF-8.
 *
 * The first code unit is checked for a byte order mark: FE FF selects
 * big-endian, FF FE little-endian. The mark itself is skipped and not
 * counted. Without a mark the whole input is decoded as big-endian.
 */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
                           unsigned int count)
{
    /* Read the first code unit as if it were big-endian */
    unsigned long bom = (utf16[0] << 8) | utf16[1];

    if (bom == 0xFEFF) /* Check for BOM */
        return utf16BEdecode(utf16 + 2, utf8, count-1);

    if (bom == 0xFFFE)
        return utf16LEdecode(utf16 + 2, utf8, count-1);

    /* ADDME: Should default be LE or BE? */
    return utf16BEdecode(utf16, utf8, count);
}
#endif
/* Count the characters in a NUL-terminated UTF-8 string.
 *
 * Every byte that is not a continuation byte counts as one character.
 */
unsigned long utf8length(const unsigned char *utf8)
{
    unsigned long chars = 0;
    const unsigned char *p;

    for (p = utf8; *p != 0; p++) {
        /* Continuation bytes belong to a character already counted */
        if ((*p & MASK) == COMP)
            continue;
        chars++;
    }

    return chars;
}
/* Decode one UTF-8 character.
 *
 * utf8: pointer to the first byte of the character
 * ucs:  receives the decoded value. 0xfffd is stored for malformed input
 *       and for characters above U+FFFF, which are not supported.
 *
 * Returns a pointer to the byte following the decoded character.
 *
 * The first byte is always consumed (even a NUL), so callers have to check
 * for the end of the string before calling. A byte that is not a valid
 * continuation byte ends the character and is left unconsumed. This
 * includes the string terminator, so the returned pointer never moves past
 * the NUL of a string that ends in the middle of a sequence.
 */
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
{
    unsigned char c = *utf8++;
    unsigned long code;
    int tail = 0;

    if ((c <= 0x7f) || (c >= 0xc2)) {
        /* Start of new character. */
        if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
            code = c;
        } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
            tail = 1;
            code = c & 0x1f;
        } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
            tail = 2;
            code = c & 0x0f;
        } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
            tail = 3;
            code = c & 0x07;
        } else {
            /* Invalid size. */
            code = 0xfffd;
        }

        while (tail--) {
            c = *utf8;
            if ((c & 0xc0) != 0x80) {
                /* Invalid continuation char or end of string: report a
                   replacement char and let the caller see this byte. */
                code = 0xfffd;
                break;
            }
            /* Valid continuation character. */
            utf8++;
            code = (code << 6) | (c & 0x3f);
        }
    } else {
        /* Invalid UTF-8 char */
        code = 0xfffd;
    }

    /* currently we don't support chars above U-FFFF */
    *ucs = (code < 0x10000) ? code : 0xfffd;
    return utf8;
}
/* Select the codepage iso_decode() uses when it is called with cp == -1. */
void set_codepage(int cp)
{
    default_codepage = cp;
}
/* Seek to a given char in a NUL-terminated UTF-8 string.
 *
 * utf8:   the string
 * offset: number of characters to skip
 *
 * Returns the byte position at which character number `offset` starts.
 * If the string has fewer than `offset` characters the position of the
 * terminating NUL is returned, so the result never points outside the
 * string. A negative offset is treated as 0.
 */
int utf8seek(const unsigned char* utf8, int offset)
{
    int pos = 0;

    while (offset-- > 0 && utf8[pos] != 0) {
        /* Step over the first byte of the char, then its continuation
           bytes; the NUL is not a continuation byte and stops the scan. */
        pos++;
        while ((utf8[pos] & MASK) == COMP)
            pos++;
    }

    return pos;
}
/* Return the name of codepage index `cp`.
 *
 * Indices outside 0..NUM_CODEPAGES-1 yield the last entry of
 * name_codepages.
 */
const char* get_codepage_name(int cp)
{
    int index = cp;

    if (index < 0 || index >= NUM_CODEPAGES)
        index = NUM_CODEPAGES;

    return name_codepages[index];
}