annotate src/libid3tag-0.15.1b/utf8.c @ 23:619f715526df sv_v2.1

Update Vamp plugin SDK to 2.5
author Chris Cannam
date Thu, 09 May 2013 10:52:46 +0100
parents c7265573341e
children
rev   line source
Chris@0 1 /*
Chris@0 2 * libid3tag - ID3 tag manipulation library
Chris@0 3 * Copyright (C) 2000-2004 Underbit Technologies, Inc.
Chris@0 4 *
Chris@0 5 * This program is free software; you can redistribute it and/or modify
Chris@0 6 * it under the terms of the GNU General Public License as published by
Chris@0 7 * the Free Software Foundation; either version 2 of the License, or
Chris@0 8 * (at your option) any later version.
Chris@0 9 *
Chris@0 10 * This program is distributed in the hope that it will be useful,
Chris@0 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@0 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@0 13 * GNU General Public License for more details.
Chris@0 14 *
Chris@0 15 * You should have received a copy of the GNU General Public License
Chris@0 16 * along with this program; if not, write to the Free Software
Chris@0 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Chris@0 18 *
Chris@0 19 * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $
Chris@0 20 */
Chris@0 21
Chris@0 22 # ifdef HAVE_CONFIG_H
Chris@0 23 # include "config.h"
Chris@0 24 # endif
Chris@0 25
Chris@0 26 # include "global.h"
Chris@0 27
Chris@0 28 # include <stdlib.h>
Chris@0 29
Chris@0 30 # include "id3tag.h"
Chris@0 31 # include "utf8.h"
Chris@0 32 # include "ucs4.h"
Chris@0 33
Chris@0 34 /*
Chris@0 35 * NAME: utf8->length()
Chris@0 36 * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string
Chris@0 37 */
Chris@0 38 id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
Chris@0 39 {
Chris@0 40 id3_length_t length = 0;
Chris@0 41
Chris@0 42 while (*utf8) {
Chris@0 43 if ((utf8[0] & 0x80) == 0x00)
Chris@0 44 ++length;
Chris@0 45 else if ((utf8[0] & 0xe0) == 0xc0 &&
Chris@0 46 (utf8[1] & 0xc0) == 0x80) {
Chris@0 47 if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
Chris@0 48 ++length;
Chris@0 49 utf8 += 1;
Chris@0 50 }
Chris@0 51 }
Chris@0 52 else if ((utf8[0] & 0xf0) == 0xe0 &&
Chris@0 53 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 54 (utf8[2] & 0xc0) == 0x80) {
Chris@0 55 if ((((utf8[0] & 0x0fL) << 12) |
Chris@0 56 ((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) {
Chris@0 57 ++length;
Chris@0 58 utf8 += 2;
Chris@0 59 }
Chris@0 60 }
Chris@0 61 else if ((utf8[0] & 0xf8) == 0xf0 &&
Chris@0 62 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 63 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 64 (utf8[3] & 0xc0) == 0x80) {
Chris@0 65 if ((((utf8[0] & 0x07L) << 18) |
Chris@0 66 ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
Chris@0 67 ++length;
Chris@0 68 utf8 += 3;
Chris@0 69 }
Chris@0 70 }
Chris@0 71 else if ((utf8[0] & 0xfc) == 0xf8 &&
Chris@0 72 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 73 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 74 (utf8[3] & 0xc0) == 0x80 &&
Chris@0 75 (utf8[4] & 0xc0) == 0x80) {
Chris@0 76 if ((((utf8[0] & 0x03L) << 24) |
Chris@0 77 ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
Chris@0 78 ++length;
Chris@0 79 utf8 += 4;
Chris@0 80 }
Chris@0 81 }
Chris@0 82 else if ((utf8[0] & 0xfe) == 0xfc &&
Chris@0 83 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 84 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 85 (utf8[3] & 0xc0) == 0x80 &&
Chris@0 86 (utf8[4] & 0xc0) == 0x80 &&
Chris@0 87 (utf8[5] & 0xc0) == 0x80) {
Chris@0 88 if ((((utf8[0] & 0x01L) << 30) |
Chris@0 89 ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
Chris@0 90 ++length;
Chris@0 91 utf8 += 5;
Chris@0 92 }
Chris@0 93 }
Chris@0 94
Chris@0 95 ++utf8;
Chris@0 96 }
Chris@0 97
Chris@0 98 return length;
Chris@0 99 }
Chris@0 100
Chris@0 101 /*
Chris@0 102 * NAME: utf8->size()
Chris@0 103 * DESCRIPTION: return the encoding size of a utf8 string
Chris@0 104 */
Chris@0 105 id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
Chris@0 106 {
Chris@0 107 id3_utf8_t const *ptr = utf8;
Chris@0 108
Chris@0 109 while (*ptr)
Chris@0 110 ++ptr;
Chris@0 111
Chris@0 112 return ptr - utf8 + 1;
Chris@0 113 }
Chris@0 114
Chris@0 115 /*
Chris@0 116 * NAME: utf8->ucs4duplicate()
Chris@0 117 * DESCRIPTION: duplicate and decode a utf8 string into ucs4
Chris@0 118 */
Chris@0 119 id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
Chris@0 120 {
Chris@0 121 id3_ucs4_t *ucs4;
Chris@0 122
Chris@0 123 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
Chris@0 124 if (ucs4)
Chris@0 125 id3_utf8_decode(utf8, ucs4);
Chris@0 126
Chris@0 127 return release(ucs4);
Chris@0 128 }
Chris@0 129
Chris@0 130 /*
Chris@0 131 * NAME: utf8->decodechar()
Chris@0 132 * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char
Chris@0 133 */
Chris@0 134 id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
Chris@0 135 {
Chris@0 136 id3_utf8_t const *start = utf8;
Chris@0 137
Chris@0 138 while (1) {
Chris@0 139 if ((utf8[0] & 0x80) == 0x00) {
Chris@0 140 *ucs4 = utf8[0];
Chris@0 141 return utf8 - start + 1;
Chris@0 142 }
Chris@0 143 else if ((utf8[0] & 0xe0) == 0xc0 &&
Chris@0 144 (utf8[1] & 0xc0) == 0x80) {
Chris@0 145 *ucs4 =
Chris@0 146 ((utf8[0] & 0x1fL) << 6) |
Chris@0 147 ((utf8[1] & 0x3fL) << 0);
Chris@0 148 if (*ucs4 >= 0x00000080L)
Chris@0 149 return utf8 - start + 2;
Chris@0 150 }
Chris@0 151 else if ((utf8[0] & 0xf0) == 0xe0 &&
Chris@0 152 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 153 (utf8[2] & 0xc0) == 0x80) {
Chris@0 154 *ucs4 =
Chris@0 155 ((utf8[0] & 0x0fL) << 12) |
Chris@0 156 ((utf8[1] & 0x3fL) << 6) |
Chris@0 157 ((utf8[2] & 0x3fL) << 0);
Chris@0 158 if (*ucs4 >= 0x00000800L)
Chris@0 159 return utf8 - start + 3;
Chris@0 160 }
Chris@0 161 else if ((utf8[0] & 0xf8) == 0xf0 &&
Chris@0 162 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 163 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 164 (utf8[3] & 0xc0) == 0x80) {
Chris@0 165 *ucs4 =
Chris@0 166 ((utf8[0] & 0x07L) << 18) |
Chris@0 167 ((utf8[1] & 0x3fL) << 12) |
Chris@0 168 ((utf8[2] & 0x3fL) << 6) |
Chris@0 169 ((utf8[3] & 0x3fL) << 0);
Chris@0 170 if (*ucs4 >= 0x00010000L)
Chris@0 171 return utf8 - start + 4;
Chris@0 172 }
Chris@0 173 else if ((utf8[0] & 0xfc) == 0xf8 &&
Chris@0 174 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 175 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 176 (utf8[3] & 0xc0) == 0x80 &&
Chris@0 177 (utf8[4] & 0xc0) == 0x80) {
Chris@0 178 *ucs4 =
Chris@0 179 ((utf8[0] & 0x03L) << 24) |
Chris@0 180 ((utf8[1] & 0x3fL) << 18) |
Chris@0 181 ((utf8[2] & 0x3fL) << 12) |
Chris@0 182 ((utf8[3] & 0x3fL) << 6) |
Chris@0 183 ((utf8[4] & 0x3fL) << 0);
Chris@0 184 if (*ucs4 >= 0x00200000L)
Chris@0 185 return utf8 - start + 5;
Chris@0 186 }
Chris@0 187 else if ((utf8[0] & 0xfe) == 0xfc &&
Chris@0 188 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 189 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 190 (utf8[3] & 0xc0) == 0x80 &&
Chris@0 191 (utf8[4] & 0xc0) == 0x80 &&
Chris@0 192 (utf8[5] & 0xc0) == 0x80) {
Chris@0 193 *ucs4 =
Chris@0 194 ((utf8[0] & 0x01L) << 30) |
Chris@0 195 ((utf8[1] & 0x3fL) << 24) |
Chris@0 196 ((utf8[2] & 0x3fL) << 18) |
Chris@0 197 ((utf8[3] & 0x3fL) << 12) |
Chris@0 198 ((utf8[4] & 0x3fL) << 6) |
Chris@0 199 ((utf8[5] & 0x3fL) << 0);
Chris@0 200 if (*ucs4 >= 0x04000000L)
Chris@0 201 return utf8 - start + 6;
Chris@0 202 }
Chris@0 203
Chris@0 204 ++utf8;
Chris@0 205 }
Chris@0 206 }
Chris@0 207
Chris@0 208 /*
Chris@0 209 * NAME: utf8->encodechar()
Chris@0 210 * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars
Chris@0 211 */
Chris@0 212 id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
Chris@0 213 {
Chris@0 214 if (ucs4 <= 0x0000007fL) {
Chris@0 215 utf8[0] = ucs4;
Chris@0 216
Chris@0 217 return 1;
Chris@0 218 }
Chris@0 219 else if (ucs4 <= 0x000007ffL) {
Chris@0 220 utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f);
Chris@0 221 utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 222
Chris@0 223 return 2;
Chris@0 224 }
Chris@0 225 else if (ucs4 <= 0x0000ffffL) {
Chris@0 226 utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
Chris@0 227 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
Chris@0 228 utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 229
Chris@0 230 return 3;
Chris@0 231 }
Chris@0 232 else if (ucs4 <= 0x001fffffL) {
Chris@0 233 utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
Chris@0 234 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
Chris@0 235 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
Chris@0 236 utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 237
Chris@0 238 return 4;
Chris@0 239 }
Chris@0 240 else if (ucs4 <= 0x03ffffffL) {
Chris@0 241 utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
Chris@0 242 utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
Chris@0 243 utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
Chris@0 244 utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f);
Chris@0 245 utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 246
Chris@0 247 return 5;
Chris@0 248 }
Chris@0 249 else if (ucs4 <= 0x7fffffffL) {
Chris@0 250 utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
Chris@0 251 utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
Chris@0 252 utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
Chris@0 253 utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
Chris@0 254 utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f);
Chris@0 255 utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 256
Chris@0 257 return 6;
Chris@0 258 }
Chris@0 259
Chris@0 260 /* default */
Chris@0 261
Chris@0 262 return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
Chris@0 263 }
Chris@0 264
Chris@0 265 /*
Chris@0 266 * NAME: utf8->decode()
Chris@0 267 * DESCRIPTION: decode a complete utf8 string into a ucs4 string
Chris@0 268 */
Chris@0 269 void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
Chris@0 270 {
Chris@0 271 do
Chris@0 272 utf8 += id3_utf8_decodechar(utf8, ucs4);
Chris@0 273 while (*ucs4++);
Chris@0 274 }
Chris@0 275
Chris@0 276 /*
Chris@0 277 * NAME: utf8->encode()
Chris@0 278 * DESCRIPTION: encode a complete ucs4 string into a utf8 string
Chris@0 279 */
Chris@0 280 void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
Chris@0 281 {
Chris@0 282 do
Chris@0 283 utf8 += id3_utf8_encodechar(utf8, *ucs4);
Chris@0 284 while (*ucs4++);
Chris@0 285 }
Chris@0 286
Chris@0 287 /*
Chris@0 288 * NAME: utf8->put()
Chris@0 289 * DESCRIPTION: serialize a single utf8 character
Chris@0 290 */
Chris@0 291 id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
Chris@0 292 {
Chris@0 293 if (ptr)
Chris@0 294 *(*ptr)++ = utf8;
Chris@0 295
Chris@0 296 return 1;
Chris@0 297 }
Chris@0 298
Chris@0 299 /*
Chris@0 300 * NAME: utf8->get()
Chris@0 301 * DESCRIPTION: deserialize a single utf8 character
Chris@0 302 */
Chris@0 303 id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
Chris@0 304 {
Chris@0 305 return *(*ptr)++;
Chris@0 306 }
Chris@0 307
Chris@0 308 /*
Chris@0 309 * NAME: utf8->serialize()
Chris@0 310 * DESCRIPTION: serialize a ucs4 string using utf8 encoding
Chris@0 311 */
Chris@0 312 id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
Chris@0 313 int terminate)
Chris@0 314 {
Chris@0 315 id3_length_t size = 0;
Chris@0 316 id3_utf8_t utf8[6], *out;
Chris@0 317
Chris@0 318 while (*ucs4) {
Chris@0 319 switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
Chris@0 320 case 6: size += id3_utf8_put(ptr, *out++);
Chris@0 321 case 5: size += id3_utf8_put(ptr, *out++);
Chris@0 322 case 4: size += id3_utf8_put(ptr, *out++);
Chris@0 323 case 3: size += id3_utf8_put(ptr, *out++);
Chris@0 324 case 2: size += id3_utf8_put(ptr, *out++);
Chris@0 325 case 1: size += id3_utf8_put(ptr, *out++);
Chris@0 326 case 0: break;
Chris@0 327 }
Chris@0 328 }
Chris@0 329
Chris@0 330 if (terminate)
Chris@0 331 size += id3_utf8_put(ptr, 0);
Chris@0 332
Chris@0 333 return size;
Chris@0 334 }
Chris@0 335
Chris@0 336 /*
Chris@0 337 * NAME: utf8->deserialize()
Chris@0 338 * DESCRIPTION: deserialize a ucs4 string using utf8 encoding
Chris@0 339 */
Chris@0 340 id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
Chris@0 341 {
Chris@0 342 id3_byte_t const *end;
Chris@0 343 id3_utf8_t *utf8ptr, *utf8;
Chris@0 344 id3_ucs4_t *ucs4;
Chris@0 345
Chris@0 346 end = *ptr + length;
Chris@0 347
Chris@0 348 utf8 = malloc((length + 1) * sizeof(*utf8));
Chris@0 349 if (utf8 == 0)
Chris@0 350 return 0;
Chris@0 351
Chris@0 352 utf8ptr = utf8;
Chris@0 353 while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
Chris@0 354 ++utf8ptr;
Chris@0 355
Chris@0 356 *utf8ptr = 0;
Chris@0 357
Chris@0 358 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
Chris@0 359 if (ucs4)
Chris@0 360 id3_utf8_decode(utf8, ucs4);
Chris@0 361
Chris@0 362 free(utf8);
Chris@0 363
Chris@0 364 return ucs4;
Chris@0 365 }