cannam@85: /* cannam@85: * libid3tag - ID3 tag manipulation library cannam@85: * Copyright (C) 2000-2004 Underbit Technologies, Inc. cannam@85: * cannam@85: * This program is free software; you can redistribute it and/or modify cannam@85: * it under the terms of the GNU General Public License as published by cannam@85: * the Free Software Foundation; either version 2 of the License, or cannam@85: * (at your option) any later version. cannam@85: * cannam@85: * This program is distributed in the hope that it will be useful, cannam@85: * but WITHOUT ANY WARRANTY; without even the implied warranty of cannam@85: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the cannam@85: * GNU General Public License for more details. cannam@85: * cannam@85: * You should have received a copy of the GNU General Public License cannam@85: * along with this program; if not, write to the Free Software cannam@85: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA cannam@85: * cannam@85: * $Id: utf16.c,v 1.9 2004/01/23 09:41:32 rob Exp $ cannam@85: */ cannam@85: cannam@85: # ifdef HAVE_CONFIG_H cannam@85: # include "config.h" cannam@85: # endif cannam@85: cannam@85: # include "global.h" cannam@85: cannam@85: # include cannam@85: cannam@85: # include "id3tag.h" cannam@85: # include "utf16.h" cannam@85: # include "ucs4.h" cannam@85: cannam@85: /* cannam@85: * NAME: utf16->length() cannam@85: * DESCRIPTION: return the number of ucs4 chars represented by a utf16 string cannam@85: */ cannam@85: id3_length_t id3_utf16_length(id3_utf16_t const *utf16) cannam@85: { cannam@85: id3_length_t length = 0; cannam@85: cannam@85: while (*utf16) { cannam@85: if (utf16[0] < 0xd800 || utf16[0] > 0xdfff) cannam@85: ++length; cannam@85: else if (utf16[0] >= 0xd800 && utf16[0] <= 0xdbff && cannam@85: utf16[1] >= 0xdc00 && utf16[1] <= 0xdfff) { cannam@85: ++length; cannam@85: ++utf16; cannam@85: } cannam@85: cannam@85: ++utf16; cannam@85: } cannam@85: cannam@85: return length; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->size() cannam@85: * DESCRIPTION: return the encoding size of a utf16 string cannam@85: */ cannam@85: id3_length_t id3_utf16_size(id3_utf16_t const *utf16) cannam@85: { cannam@85: id3_utf16_t const *ptr = utf16; cannam@85: cannam@85: while (*ptr) cannam@85: ++ptr; cannam@85: cannam@85: return ptr - utf16 + 1; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->ucs4duplicate() cannam@85: * DESCRIPTION: duplicate and decode a utf16 string into ucs4 cannam@85: */ cannam@85: id3_ucs4_t *id3_utf16_ucs4duplicate(id3_utf16_t const *utf16) cannam@85: { cannam@85: id3_ucs4_t *ucs4; cannam@85: cannam@85: ucs4 = malloc((id3_utf16_length(utf16) + 1) * sizeof(*ucs4)); cannam@85: if (ucs4) cannam@85: id3_utf16_decode(utf16, ucs4); cannam@85: cannam@85: return release(ucs4); cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->decodechar() cannam@85: * DESCRIPTION: decode a series of utf16 chars into a single ucs4 char cannam@85: */ cannam@85: id3_length_t id3_utf16_decodechar(id3_utf16_t const *utf16, id3_ucs4_t *ucs4) cannam@85: { cannam@85: id3_utf16_t const *start = utf16; cannam@85: cannam@85: while (1) { cannam@85: if (utf16[0] < 0xd800 || utf16[0] > 0xdfff) { cannam@85: *ucs4 = utf16[0]; cannam@85: return utf16 - start + 1; cannam@85: } cannam@85: else if (utf16[0] >= 0xd800 && utf16[0] <= 0xdbff && cannam@85: utf16[1] >= 0xdc00 && utf16[1] <= 0xdfff) { cannam@85: *ucs4 = (((utf16[0] & 0x03ffL) << 10) | cannam@85: ((utf16[1] & 0x03ffL) << 0)) + 0x00010000L; cannam@85: return utf16 - start + 2; cannam@85: } cannam@85: cannam@85: ++utf16; cannam@85: } cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->encodechar() cannam@85: * DESCRIPTION: encode a single ucs4 char into a series of up to 2 utf16 chars cannam@85: */ cannam@85: id3_length_t id3_utf16_encodechar(id3_utf16_t *utf16, id3_ucs4_t ucs4) cannam@85: { cannam@85: if (ucs4 < 0x00010000L) { cannam@85: utf16[0] = ucs4; cannam@85: cannam@85: return 1; cannam@85: } cannam@85: else if (ucs4 < 0x00110000L) { cannam@85: ucs4 -= 0x00010000L; cannam@85: cannam@85: utf16[0] = ((ucs4 >> 10) & 0x3ff) | 0xd800; cannam@85: utf16[1] = ((ucs4 >> 0) & 0x3ff) | 0xdc00; cannam@85: cannam@85: return 2; cannam@85: } cannam@85: cannam@85: /* default */ cannam@85: cannam@85: return id3_utf16_encodechar(utf16, ID3_UCS4_REPLACEMENTCHAR); cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->decode() cannam@85: * DESCRIPTION: decode a complete utf16 string into a ucs4 string cannam@85: */ cannam@85: void id3_utf16_decode(id3_utf16_t const *utf16, id3_ucs4_t *ucs4) cannam@85: { cannam@85: do cannam@85: utf16 += id3_utf16_decodechar(utf16, ucs4); cannam@85: while (*ucs4++); cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->encode() cannam@85: * DESCRIPTION: encode a complete ucs4 string into a utf16 string cannam@85: */ cannam@85: void id3_utf16_encode(id3_utf16_t *utf16, id3_ucs4_t const *ucs4) cannam@85: { cannam@85: do cannam@85: utf16 += id3_utf16_encodechar(utf16, *ucs4); cannam@85: while (*ucs4++); cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->put() cannam@85: * DESCRIPTION: serialize a single utf16 character cannam@85: */ cannam@85: id3_length_t id3_utf16_put(id3_byte_t **ptr, id3_utf16_t utf16, cannam@85: enum id3_utf16_byteorder byteorder) cannam@85: { cannam@85: if (ptr) { cannam@85: switch (byteorder) { cannam@85: default: cannam@85: case ID3_UTF16_BYTEORDER_BE: cannam@85: (*ptr)[0] = (utf16 >> 8) & 0xff; cannam@85: (*ptr)[1] = (utf16 >> 0) & 0xff; cannam@85: break; cannam@85: cannam@85: case ID3_UTF16_BYTEORDER_LE: cannam@85: (*ptr)[0] = (utf16 >> 0) & 0xff; cannam@85: (*ptr)[1] = (utf16 >> 8) & 0xff; cannam@85: break; cannam@85: } cannam@85: cannam@85: *ptr += 2; cannam@85: } cannam@85: cannam@85: return 2; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->get() cannam@85: * DESCRIPTION: deserialize a single utf16 character cannam@85: */ cannam@85: id3_utf16_t id3_utf16_get(id3_byte_t const **ptr, cannam@85: enum id3_utf16_byteorder byteorder) cannam@85: { cannam@85: id3_utf16_t utf16; cannam@85: cannam@85: switch (byteorder) { cannam@85: default: cannam@85: case ID3_UTF16_BYTEORDER_BE: cannam@85: utf16 = cannam@85: ((*ptr)[0] << 8) | cannam@85: ((*ptr)[1] << 0); cannam@85: break; cannam@85: cannam@85: case ID3_UTF16_BYTEORDER_LE: cannam@85: utf16 = cannam@85: ((*ptr)[0] << 0) | cannam@85: ((*ptr)[1] << 8); cannam@85: break; cannam@85: } cannam@85: cannam@85: *ptr += 2; cannam@85: cannam@85: return utf16; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->serialize() cannam@85: * DESCRIPTION: serialize a ucs4 string using utf16 encoding cannam@85: */ cannam@85: id3_length_t id3_utf16_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4, cannam@85: enum id3_utf16_byteorder byteorder, cannam@85: int terminate) cannam@85: { cannam@85: id3_length_t size = 0; cannam@85: id3_utf16_t utf16[2], *out; cannam@85: cannam@85: if (byteorder == ID3_UTF16_BYTEORDER_ANY) cannam@85: size += id3_utf16_put(ptr, 0xfeff, byteorder); cannam@85: cannam@85: while (*ucs4) { cannam@85: switch (id3_utf16_encodechar(out = utf16, *ucs4++)) { cannam@85: case 2: size += id3_utf16_put(ptr, *out++, byteorder); cannam@85: case 1: size += id3_utf16_put(ptr, *out++, byteorder); cannam@85: case 0: break; cannam@85: } cannam@85: } cannam@85: cannam@85: if (terminate) cannam@85: size += id3_utf16_put(ptr, 0, byteorder); cannam@85: cannam@85: return size; cannam@85: } cannam@85: cannam@85: /* cannam@85: * NAME: utf16->deserialize() cannam@85: * DESCRIPTION: deserialize a ucs4 string using utf16 encoding cannam@85: */ cannam@85: id3_ucs4_t *id3_utf16_deserialize(id3_byte_t const **ptr, id3_length_t length, cannam@85: enum id3_utf16_byteorder byteorder) cannam@85: { cannam@85: id3_byte_t const *end; cannam@85: id3_utf16_t *utf16ptr, *utf16; cannam@85: id3_ucs4_t *ucs4; cannam@85: cannam@85: end = *ptr + (length & ~1); cannam@85: cannam@85: utf16 = malloc((length / 2 + 1) * sizeof(*utf16)); cannam@85: if (utf16 == 0) cannam@85: return 0; cannam@85: cannam@85: if (byteorder == ID3_UTF16_BYTEORDER_ANY && end - *ptr > 0) { cannam@85: switch (((*ptr)[0] << 8) | cannam@85: ((*ptr)[1] << 0)) { cannam@85: case 0xfeff: cannam@85: byteorder = ID3_UTF16_BYTEORDER_BE; cannam@85: *ptr += 2; cannam@85: break; cannam@85: cannam@85: case 0xfffe: cannam@85: byteorder = ID3_UTF16_BYTEORDER_LE; cannam@85: *ptr += 2; cannam@85: break; cannam@85: } cannam@85: } cannam@85: cannam@85: utf16ptr = utf16; cannam@85: while (end - *ptr > 0 && (*utf16ptr = id3_utf16_get(ptr, byteorder))) cannam@85: ++utf16ptr; cannam@85: cannam@85: *utf16ptr = 0; cannam@85: cannam@85: ucs4 = malloc((id3_utf16_length(utf16) + 1) * sizeof(*ucs4)); cannam@85: if (ucs4) cannam@85: id3_utf16_decode(utf16, ucs4); cannam@85: cannam@85: free(utf16); cannam@85: cannam@85: return ucs4; cannam@85: }