cannam@85: /* cannam@85: * libid3tag - ID3 tag manipulation library cannam@85: * Copyright (C) 2000-2004 Underbit Technologies, Inc. cannam@85: * cannam@85: * This program is free software; you can redistribute it and/or modify cannam@85: * it under the terms of the GNU General Public License as published by cannam@85: * the Free Software Foundation; either version 2 of the License, or cannam@85: * (at your option) any later version. cannam@85: * cannam@85: * This program is distributed in the hope that it will be useful, cannam@85: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@85: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@85: * GNU General Public License for more details. cannam@85: * cannam@85: * You should have received a copy of the GNU General Public License cannam@85: * along with this program; if not, write to the Free Software cannam@85: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA cannam@85: * cannam@85: * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $ cannam@85: */ cannam@85: cannam@85: # ifdef HAVE_CONFIG_H cannam@85: # include "config.h" cannam@85: # endif cannam@85: cannam@85: # include "global.h" cannam@85: cannam@85: # include cannam@85: cannam@85: # include "id3tag.h" cannam@85: # include "utf8.h" cannam@85: # include "ucs4.h" cannam@85: cannam@85: /* cannam@85: * NAME: utf8->length() cannam@85: * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string cannam@85: */ cannam@85: id3_length_t id3_utf8_length(id3_utf8_t const *utf8) cannam@85: { cannam@85: id3_length_t length = 0; cannam@85: cannam@85: while (*utf8) { cannam@85: if ((utf8[0] & 0x80) == 0x00) cannam@85: ++length; cannam@85: else if ((utf8[0] & 0xe0) == 0xc0 && cannam@85: (utf8[1] & 0xc0) == 0x80) { cannam@85: if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) { cannam@85: ++length; cannam@85: utf8 += 1; cannam@85: } cannam@85: } cannam@85: else if ((utf8[0] & 0xf0) == 0xe0 && cannam@85: (utf8[1] & 0xc0) == 0x80 && cannam@85: (utf8[2] & 0xc0) == 0x80) { cannam@85: if ((((utf8[0] & 0x0fL) << 12) | cannam@85: ((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) { cannam@85: ++length; cannam@85: utf8 += 2; cannam@85: } cannam@85: } cannam@85: else if ((utf8[0] & 0xf8) == 0xf0 && cannam@85: (utf8[1] & 0xc0) == 0x80 && cannam@85: (utf8[2] & 0xc0) == 0x80 && cannam@85: (utf8[3] & 0xc0) == 0x80) { cannam@85: if ((((utf8[0] & 0x07L) << 18) | cannam@85: ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) { cannam@85: ++length; cannam@85: utf8 += 3; cannam@85: } cannam@85: } cannam@85: else if ((utf8[0] & 0xfc) == 0xf8 && cannam@85: (utf8[1] & 0xc0) == 0x80 && cannam@85: (utf8[2] & 0xc0) == 0x80 && cannam@85: (utf8[3] & 0xc0) == 0x80 && cannam@85: (utf8[4] & 0xc0) == 0x80) { cannam@85: if ((((utf8[0] & 0x03L) << 24) | cannam@85: ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) { cannam@85: ++length; cannam@85: utf8 += 4; cannam@85: } cannam@85: } cannam@85: else if ((utf8[0] & 0xfe) == 0xfc && cannam@85: (utf8[1] & 0xc0) == 0x80 && cannam@85: (utf8[2] & 0xc0) == 0x80 && cannam@85: (utf8[3] & 0xc0) == 0x80 && cannam@85: (utf8[4] & 0xc0) == 0x80 && cannam@85: (utf8[5] & 0xc0) == 0x80) { cannam@85: if ((((utf8[0] & 0x01L) << 30) | cannam@85: ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) { cannam@85: ++length; cannam@85: utf8 += 5; cannam@85: } cannam@85: } cannam@85: cannam@85: ++utf8; cannam@85: } cannam@85: cannam@85: return length; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->size() cannam@85: * DESCRIPTION: return the encoding size of a utf8 string cannam@85: */ cannam@85: id3_length_t id3_utf8_size(id3_utf8_t const *utf8) cannam@85: { cannam@85: id3_utf8_t const *ptr = utf8; cannam@85: cannam@85: while (*ptr) cannam@85: ++ptr; cannam@85: cannam@85: return ptr - utf8 + 1; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->ucs4duplicate() cannam@85: * DESCRIPTION: duplicate and decode a utf8 string into ucs4 cannam@85: */ cannam@85: id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8) cannam@85: { cannam@85: id3_ucs4_t *ucs4; cannam@85: cannam@85: ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4)); cannam@85: if (ucs4) cannam@85: id3_utf8_decode(utf8, ucs4); cannam@85: cannam@85: return release(ucs4); cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->decodechar() cannam@85: * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char cannam@85: */ cannam@85: id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4) cannam@85: { cannam@85: id3_utf8_t const *start = utf8; cannam@85: cannam@85: while (1) { cannam@85: if ((utf8[0] & 0x80) == 0x00) { cannam@85: *ucs4 = utf8[0]; cannam@85: return utf8 - start + 1; cannam@85: } cannam@85: else if ((utf8[0] & 0xe0) == 0xc0 && cannam@85: (utf8[1] & 0xc0) == 0x80) { cannam@85: *ucs4 = cannam@85: ((utf8[0] & 0x1fL) << 6) | cannam@85: ((utf8[1] & 0x3fL) << 0); cannam@85: if (*ucs4 >= 0x00000080L) cannam@85: return utf8 - start + 2; cannam@85: } cannam@85: else if ((utf8[0] & 0xf0) == 0xe0 && cannam@85: (utf8[1] & 0xc0) == 0x80 && cannam@85: (utf8[2] & 0xc0) == 0x80) { cannam@85: *ucs4 = cannam@85: ((utf8[0] & 0x0fL) << 12) | cannam@85: ((utf8[1] & 0x3fL) << 6) | cannam@85: ((utf8[2] & 0x3fL) << 0); cannam@85: if (*ucs4 >= 0x00000800L) cannam@85: return utf8 - start + 3; cannam@85: } cannam@85: else if ((utf8[0] & 0xf8) == 0xf0 && cannam@85: (utf8[1] & 0xc0) == 0x80 && cannam@85: (utf8[2] & 0xc0) == 0x80 && cannam@85: (utf8[3] & 0xc0) == 0x80) { cannam@85: *ucs4 = cannam@85: ((utf8[0] & 0x07L) << 18) | cannam@85: ((utf8[1] & 0x3fL) << 12) | cannam@85: ((utf8[2] & 0x3fL) << 6) | cannam@85: ((utf8[3] & 0x3fL) << 0); cannam@85: if (*ucs4 >= 0x00010000L) cannam@85: return utf8 - start + 4; cannam@85: } cannam@85: else if ((utf8[0] & 0xfc) == 0xf8 && cannam@85: (utf8[1] & 0xc0) == 0x80 && cannam@85: (utf8[2] & 0xc0) == 0x80 && cannam@85: (utf8[3] & 0xc0) == 0x80 && cannam@85: (utf8[4] & 0xc0) == 0x80) { cannam@85: *ucs4 = cannam@85: ((utf8[0] & 0x03L) << 24) | cannam@85: ((utf8[1] & 0x3fL) << 18) | cannam@85: ((utf8[2] & 0x3fL) << 12) | cannam@85: ((utf8[3] & 0x3fL) << 6) | cannam@85: ((utf8[4] & 0x3fL) << 0); cannam@85: if (*ucs4 >= 0x00200000L) cannam@85: return utf8 - start + 5; cannam@85: } cannam@85: else if ((utf8[0] & 0xfe) == 0xfc && cannam@85: (utf8[1] & 0xc0) == 0x80 && cannam@85: (utf8[2] & 0xc0) == 0x80 && cannam@85: (utf8[3] & 0xc0) == 0x80 && cannam@85: (utf8[4] & 0xc0) == 0x80 && cannam@85: (utf8[5] & 0xc0) == 0x80) { cannam@85: *ucs4 = cannam@85: ((utf8[0] & 0x01L) << 30) | cannam@85: ((utf8[1] & 0x3fL) << 24) | cannam@85: ((utf8[2] & 0x3fL) << 18) | cannam@85: ((utf8[3] & 0x3fL) << 12) | cannam@85: ((utf8[4] & 0x3fL) << 6) | cannam@85: ((utf8[5] & 0x3fL) << 0); cannam@85: if (*ucs4 >= 0x04000000L) cannam@85: return utf8 - start + 6; cannam@85: } cannam@85: cannam@85: ++utf8; cannam@85: } cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->encodechar() cannam@85: * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars cannam@85: */ cannam@85: id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4) cannam@85: { cannam@85: if (ucs4 <= 0x0000007fL) { cannam@85: utf8[0] = ucs4; cannam@85: cannam@85: return 1; cannam@85: } cannam@85: else if (ucs4 <= 0x000007ffL) { cannam@85: utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f); cannam@85: utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f); cannam@85: cannam@85: return 2; cannam@85: } cannam@85: else if (ucs4 <= 0x0000ffffL) { cannam@85: utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f); cannam@85: utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f); cannam@85: utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f); cannam@85: cannam@85: return 3; cannam@85: } cannam@85: else if (ucs4 <= 0x001fffffL) { cannam@85: utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07); cannam@85: utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f); cannam@85: utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f); cannam@85: utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f); cannam@85: cannam@85: return 4; cannam@85: } cannam@85: else if (ucs4 <= 0x03ffffffL) { cannam@85: utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03); cannam@85: utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f); cannam@85: utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f); cannam@85: utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f); cannam@85: utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f); cannam@85: cannam@85: return 5; cannam@85: } cannam@85: else if (ucs4 <= 0x7fffffffL) { cannam@85: utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01); cannam@85: utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f); cannam@85: utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f); cannam@85: utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f); cannam@85: utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f); cannam@85: utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f); cannam@85: cannam@85: return 6; cannam@85: } cannam@85: cannam@85: /* default */ cannam@85: cannam@85: return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR); cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->decode() cannam@85: * DESCRIPTION: decode a complete utf8 string into a ucs4 string cannam@85: */ cannam@85: void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4) cannam@85: { cannam@85: do cannam@85: utf8 += id3_utf8_decodechar(utf8, ucs4); cannam@85: while (*ucs4++); cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->encode() cannam@85: * DESCRIPTION: encode a complete ucs4 string into a utf8 string cannam@85: */ cannam@85: void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4) cannam@85: { cannam@85: do cannam@85: utf8 += id3_utf8_encodechar(utf8, *ucs4); cannam@85: while (*ucs4++); cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->put() cannam@85: * DESCRIPTION: serialize a single utf8 character cannam@85: */ cannam@85: id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8) cannam@85: { cannam@85: if (ptr) cannam@85: *(*ptr)++ = utf8; cannam@85: cannam@85: return 1; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->get() cannam@85: * DESCRIPTION: deserialize a single utf8 character cannam@85: */ cannam@85: id3_utf8_t id3_utf8_get(id3_byte_t const **ptr) cannam@85: { cannam@85: return *(*ptr)++; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->serialize() cannam@85: * DESCRIPTION: serialize a ucs4 string using utf8 encoding cannam@85: */ cannam@85: id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4, cannam@85: int terminate) cannam@85: { cannam@85: id3_length_t size = 0; cannam@85: id3_utf8_t utf8[6], *out; cannam@85: cannam@85: while (*ucs4) { cannam@85: switch (id3_utf8_encodechar(out = utf8, *ucs4++)) { cannam@85: case 6: size += id3_utf8_put(ptr, *out++); cannam@85: case 5: size += id3_utf8_put(ptr, *out++); cannam@85: case 4: size += id3_utf8_put(ptr, *out++); cannam@85: case 3: size += id3_utf8_put(ptr, *out++); cannam@85: case 2: size += id3_utf8_put(ptr, *out++); cannam@85: case 1: size += id3_utf8_put(ptr, *out++); cannam@85: case 0: break; cannam@85: } cannam@85: } cannam@85: cannam@85: if (terminate) cannam@85: size += id3_utf8_put(ptr, 0); cannam@85: cannam@85: return size; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf8->deserialize() cannam@85: * DESCRIPTION: deserialize a ucs4 string using utf8 encoding cannam@85: */ cannam@85: id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length) cannam@85: { cannam@85: id3_byte_t const *end; cannam@85: id3_utf8_t *utf8ptr, *utf8; cannam@85: id3_ucs4_t *ucs4; cannam@85: cannam@85: end = *ptr + length; cannam@85: cannam@85: utf8 = malloc((length + 1) * sizeof(*utf8)); cannam@85: if (utf8 == 0) cannam@85: return 0; cannam@85: cannam@85: utf8ptr = utf8; cannam@85: while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr))) cannam@85: ++utf8ptr; cannam@85: cannam@85: *utf8ptr = 0; cannam@85: cannam@85: ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4)); cannam@85: if (ucs4) cannam@85: id3_utf8_decode(utf8, ucs4); cannam@85: cannam@85: free(utf8); cannam@85: cannam@85: return ucs4; cannam@85: }