/* Some conversion functions for handling UTF-8 * * copyright Marcoen Hirschberg (2004,2005) * * I got all the info from: * http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 * and * http://en.wikipedia.org/wiki/Unicode */ #include #include "file.h" #include "debug.h" #include "rbunicode.h" #ifndef O_BINARY #define O_BINARY 0 #endif #define NUM_TABLES 5 #define NUM_CODEPAGES 13 static int default_codepage = 0; static unsigned short codepage_table[MAX_CP_TABLE_SIZE]; static int loaded_cp_table = 0; static const unsigned char utf8comp[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; static const char *filename[NUM_TABLES] = { CODEPAGE_DIR"/iso.cp", CODEPAGE_DIR"/932.cp", /* SJIS */ CODEPAGE_DIR"/936.cp", /* GB2312 */ CODEPAGE_DIR"/949.cp", /* KSX1001 */ CODEPAGE_DIR"/950.cp" /* BIG5 */ }; static const char cp_2_table[NUM_CODEPAGES] = { 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5 }; /* Load codepage file into memory */ int load_cp_table(int cp) { int i=0; int table = cp_2_table[cp]; int file, tablesize; unsigned char tmp[2]; if (cp == 0 || table == loaded_cp_table) return 1; file = open(filename[table-1], O_RDONLY|O_BINARY); if (file < 0) { DEBUGF("Can't open codepage file: %s.cp\n", filename[table-1]); return 0; } tablesize = lseek(file, 0, SEEK_END) / 2; lseek(file, 0, SEEK_SET); if (tablesize > MAX_CP_TABLE_SIZE) { DEBUGF("Invalid codepage file: %s.cp\n", filename[table-1]); close(file); return 0; } while (i < tablesize) { if (!read(file, tmp, 2)) { DEBUGF("Can't read from codepage file: %s.cp\n", filename[table-1]); loaded_cp_table = 0; return 0; } codepage_table[i++] = (tmp[1] << 8) | tmp[0]; } loaded_cp_table = table; close(file); return 1; } /* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char. */ unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8) { int tail = 0; if (ucs > 0x7F) while (ucs >> (5*tail + 6)) tail++; *utf8++ = (ucs >> (6*tail)) | utf8comp[tail]; while (tail--) *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP; return utf8; } /* Recode an iso encoded string to UTF-8 */ unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8, int cp, int count) { unsigned short ucs, tmp; if (cp == -1) /* use default codepage */ cp = default_codepage; if (!load_cp_table(cp)) cp = 0; while (count--) { if (*iso < 128) *utf8++ = *iso++; else { /* cp tells us which codepage to convert from */ switch (cp) { case 0x01: /* Greek (ISO-8859-7) */ case 0x02: /* Hebrew (ISO-8859-8) */ case 0x03: /* Russian (CP1251) */ case 0x04: /* Thai (ISO-8859-11) */ case 0x05: /* Arabic (ISO-8859-6) */ case 0x06: /* Turkish (ISO-8859-9) */ case 0x07: /* Latin Extended (ISO-8859-2) */ tmp = ((cp-1)*128) + (*iso++ - 128); ucs = codepage_table[tmp]; break; case 0x08: /* Japanese (SJIS) */ if (*iso > 0xA0 && *iso < 0xE0) { tmp = *iso | 0xA100; ucs = codepage_table[tmp]; break; } case 0x09: /* Simplified Chinese (GB2312) */ case 0x0A: /* Korean (KSX1001) */ case 0x0B: /* Traditional Chinese (BIG5) */ if (count < 1 || !iso[1]) { ucs = *iso++; break; } /* we assume all cjk strings are written in big endian order */ tmp = *iso++ << 8; tmp |= *iso++; tmp -= 0x8000; ucs = codepage_table[tmp]; count--; break; case 0x0C: /* UTF-8, do nothing */ default: ucs = *iso++; break; } if (ucs == 0) /* unknown char, assume invalid encoding */ ucs = 0xffff; utf8 = utf8encode(ucs, utf8); } } return utf8; } /* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */ unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count) { unsigned long ucs; while (count != 0) { /* Check for a surrogate pair */ if (utf16[1] >= 0xD8 && utf16[1] < 0xE0) { ucs = 0x10000 + ((utf16[0] << 10) | ((utf16[1] - 0xD8) << 18) | utf16[2] | ((utf16[3] - 0xDC) << 8)); utf16 += 4; count -= 2; } else { ucs = (utf16[0] | (utf16[1] << 8)); utf16 += 2; count -= 1; } utf8 = utf8encode(ucs, utf8); } return utf8; } /* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */ unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count) { unsigned long ucs; while (count != 0) { if (*utf16 >= 0xD8 && *utf16 < 0xE0) { /* Check for a surrogate pair */ ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) | (utf16[1] << 10) | ((utf16[2] - 0xDC) << 8) | utf16[3]); utf16 += 4; count -= 2; } else { ucs = (utf16[0] << 8) | utf16[1]; utf16 += 2; count -= 1; } utf8 = utf8encode(ucs, utf8); } return utf8; } /* Recode any UTF-16 string to UTF-8 */ unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8, unsigned int count) { unsigned long ucs; ucs = *(utf16++) << 8; ucs |= *(utf16++); if (ucs == 0xFEFF) /* Check for BOM */ return utf16BEdecode(utf16, utf8, count-1); else if (ucs == 0xFFFE) return utf16LEdecode(utf16, utf8, count-1); else { /* ADDME: Should default be LE or BE? */ utf16 -= 2; return utf16BEdecode(utf16, utf8, count); } } /* Return the number of UTF-8 chars in a string */ unsigned long utf8length(const unsigned char *utf8) { unsigned long l = 0; while (*utf8 != 0) if ((*utf8++ & MASK) != COMP) l++; return l; } /* Decode 1 UTF-8 char and return a pointer to the next char. */ const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs) { unsigned char c = *utf8++; unsigned long code; int tail = 0; if ((c <= 0x7f) || (c >= 0xc2)) { /* Start of new character. */ if (c < 0x80) { /* U-00000000 - U-0000007F, 1 byte */ code = c; } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */ tail = 1; code = c & 0x1f; } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */ tail = 2; code = c & 0x0f; } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */ tail = 3; code = c & 0x07; } else { /* Invalid size. */ code = 0xffff; } while (tail-- && ((c = *utf8++) != 0)) { if ((c & 0xc0) == 0x80) { /* Valid continuation character. */ code = (code << 6) | (c & 0x3f); } else { /* Invalid continuation char */ code = 0xffff; utf8--; break; } } } else { /* Invalid UTF-8 char */ code = 0xffff; } /* currently we don't support chars above U-FFFF */ *ucs = (code < 0x10000) ? code : 0xffff; return utf8; } void set_codepage(int cp) { default_codepage = cp; return; } /* seek to a given char in a utf8 string and return its start position in the string */ int utf8seek(const unsigned char* utf8, int offset) { int pos = 0; while (offset--) { pos++; while ((utf8[pos] & MASK) == COMP) pos++; } return pos; }