annotate src/libid3tag-0.15.1b/utf8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents c7265573341e
children
rev   line source
Chris@0 1 /*
Chris@0 2 * libid3tag - ID3 tag manipulation library
Chris@0 3 * Copyright (C) 2000-2004 Underbit Technologies, Inc.
Chris@0 4 *
Chris@0 5 * This program is free software; you can redistribute it and/or modify
Chris@0 6 * it under the terms of the GNU General Public License as published by
Chris@0 7 * the Free Software Foundation; either version 2 of the License, or
Chris@0 8 * (at your option) any later version.
Chris@0 9 *
Chris@0 10 * This program is distributed in the hope that it will be useful,
Chris@0 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@0 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@0 13 * GNU General Public License for more details.
Chris@0 14 *
Chris@0 15 * You should have received a copy of the GNU General Public License
Chris@0 16 * along with this program; if not, write to the Free Software
Chris@0 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Chris@0 18 *
Chris@0 19 * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $
Chris@0 20 */
Chris@0 21
Chris@0 22 # ifdef HAVE_CONFIG_H
Chris@0 23 # include "config.h"
Chris@0 24 # endif
Chris@0 25
Chris@0 26 # include "global.h"
Chris@0 27
Chris@0 28 # include <stdlib.h>
Chris@0 29
Chris@0 30 # include "id3tag.h"
Chris@0 31 # include "utf8.h"
Chris@0 32 # include "ucs4.h"
Chris@0 33
Chris@0 34 /*
Chris@0 35 * NAME: utf8->length()
Chris@0 36 * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string
Chris@0 37 */
Chris@0 38 id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
Chris@0 39 {
Chris@0 40 id3_length_t length = 0;
Chris@0 41
Chris@0 42 while (*utf8) {
Chris@0 43 if ((utf8[0] & 0x80) == 0x00)
Chris@0 44 ++length;
Chris@0 45 else if ((utf8[0] & 0xe0) == 0xc0 &&
Chris@0 46 (utf8[1] & 0xc0) == 0x80) {
Chris@0 47 if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
Chris@0 48 ++length;
Chris@0 49 utf8 += 1;
Chris@0 50 }
Chris@0 51 }
Chris@0 52 else if ((utf8[0] & 0xf0) == 0xe0 &&
Chris@0 53 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 54 (utf8[2] & 0xc0) == 0x80) {
Chris@0 55 if ((((utf8[0] & 0x0fL) << 12) |
Chris@0 56 ((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) {
Chris@0 57 ++length;
Chris@0 58 utf8 += 2;
Chris@0 59 }
Chris@0 60 }
Chris@0 61 else if ((utf8[0] & 0xf8) == 0xf0 &&
Chris@0 62 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 63 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 64 (utf8[3] & 0xc0) == 0x80) {
Chris@0 65 if ((((utf8[0] & 0x07L) << 18) |
Chris@0 66 ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
Chris@0 67 ++length;
Chris@0 68 utf8 += 3;
Chris@0 69 }
Chris@0 70 }
Chris@0 71 else if ((utf8[0] & 0xfc) == 0xf8 &&
Chris@0 72 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 73 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 74 (utf8[3] & 0xc0) == 0x80 &&
Chris@0 75 (utf8[4] & 0xc0) == 0x80) {
Chris@0 76 if ((((utf8[0] & 0x03L) << 24) |
Chris@0 77 ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
Chris@0 78 ++length;
Chris@0 79 utf8 += 4;
Chris@0 80 }
Chris@0 81 }
Chris@0 82 else if ((utf8[0] & 0xfe) == 0xfc &&
Chris@0 83 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 84 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 85 (utf8[3] & 0xc0) == 0x80 &&
Chris@0 86 (utf8[4] & 0xc0) == 0x80 &&
Chris@0 87 (utf8[5] & 0xc0) == 0x80) {
Chris@0 88 if ((((utf8[0] & 0x01L) << 30) |
Chris@0 89 ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
Chris@0 90 ++length;
Chris@0 91 utf8 += 5;
Chris@0 92 }
Chris@0 93 }
Chris@0 94
Chris@0 95 ++utf8;
Chris@0 96 }
Chris@0 97
Chris@0 98 return length;
Chris@0 99 }
Chris@0 100
Chris@0 101 /*
Chris@0 102 * NAME: utf8->size()
Chris@0 103 * DESCRIPTION: return the encoding size of a utf8 string
Chris@0 104 */
Chris@0 105 id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
Chris@0 106 {
Chris@0 107 id3_utf8_t const *ptr = utf8;
Chris@0 108
Chris@0 109 while (*ptr)
Chris@0 110 ++ptr;
Chris@0 111
Chris@0 112 return ptr - utf8 + 1;
Chris@0 113 }
Chris@0 114
Chris@0 115 /*
Chris@0 116 * NAME: utf8->ucs4duplicate()
Chris@0 117 * DESCRIPTION: duplicate and decode a utf8 string into ucs4
Chris@0 118 */
Chris@0 119 id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
Chris@0 120 {
Chris@0 121 id3_ucs4_t *ucs4;
Chris@0 122
Chris@0 123 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
Chris@0 124 if (ucs4)
Chris@0 125 id3_utf8_decode(utf8, ucs4);
Chris@0 126
Chris@0 127 return release(ucs4);
Chris@0 128 }
Chris@0 129
Chris@0 130 /*
Chris@0 131 * NAME: utf8->decodechar()
Chris@0 132 * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char
Chris@0 133 */
Chris@0 134 id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
Chris@0 135 {
Chris@0 136 id3_utf8_t const *start = utf8;
Chris@0 137
Chris@0 138 while (1) {
Chris@0 139 if ((utf8[0] & 0x80) == 0x00) {
Chris@0 140 *ucs4 = utf8[0];
Chris@0 141 return utf8 - start + 1;
Chris@0 142 }
Chris@0 143 else if ((utf8[0] & 0xe0) == 0xc0 &&
Chris@0 144 (utf8[1] & 0xc0) == 0x80) {
Chris@0 145 *ucs4 =
Chris@0 146 ((utf8[0] & 0x1fL) << 6) |
Chris@0 147 ((utf8[1] & 0x3fL) << 0);
Chris@0 148 if (*ucs4 >= 0x00000080L)
Chris@0 149 return utf8 - start + 2;
Chris@0 150 }
Chris@0 151 else if ((utf8[0] & 0xf0) == 0xe0 &&
Chris@0 152 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 153 (utf8[2] & 0xc0) == 0x80) {
Chris@0 154 *ucs4 =
Chris@0 155 ((utf8[0] & 0x0fL) << 12) |
Chris@0 156 ((utf8[1] & 0x3fL) << 6) |
Chris@0 157 ((utf8[2] & 0x3fL) << 0);
Chris@0 158 if (*ucs4 >= 0x00000800L)
Chris@0 159 return utf8 - start + 3;
Chris@0 160 }
Chris@0 161 else if ((utf8[0] & 0xf8) == 0xf0 &&
Chris@0 162 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 163 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 164 (utf8[3] & 0xc0) == 0x80) {
Chris@0 165 *ucs4 =
Chris@0 166 ((utf8[0] & 0x07L) << 18) |
Chris@0 167 ((utf8[1] & 0x3fL) << 12) |
Chris@0 168 ((utf8[2] & 0x3fL) << 6) |
Chris@0 169 ((utf8[3] & 0x3fL) << 0);
Chris@0 170 if (*ucs4 >= 0x00010000L)
Chris@0 171 return utf8 - start + 4;
Chris@0 172 }
Chris@0 173 else if ((utf8[0] & 0xfc) == 0xf8 &&
Chris@0 174 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 175 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 176 (utf8[3] & 0xc0) == 0x80 &&
Chris@0 177 (utf8[4] & 0xc0) == 0x80) {
Chris@0 178 *ucs4 =
Chris@0 179 ((utf8[0] & 0x03L) << 24) |
Chris@0 180 ((utf8[1] & 0x3fL) << 18) |
Chris@0 181 ((utf8[2] & 0x3fL) << 12) |
Chris@0 182 ((utf8[3] & 0x3fL) << 6) |
Chris@0 183 ((utf8[4] & 0x3fL) << 0);
Chris@0 184 if (*ucs4 >= 0x00200000L)
Chris@0 185 return utf8 - start + 5;
Chris@0 186 }
Chris@0 187 else if ((utf8[0] & 0xfe) == 0xfc &&
Chris@0 188 (utf8[1] & 0xc0) == 0x80 &&
Chris@0 189 (utf8[2] & 0xc0) == 0x80 &&
Chris@0 190 (utf8[3] & 0xc0) == 0x80 &&
Chris@0 191 (utf8[4] & 0xc0) == 0x80 &&
Chris@0 192 (utf8[5] & 0xc0) == 0x80) {
Chris@0 193 *ucs4 =
Chris@0 194 ((utf8[0] & 0x01L) << 30) |
Chris@0 195 ((utf8[1] & 0x3fL) << 24) |
Chris@0 196 ((utf8[2] & 0x3fL) << 18) |
Chris@0 197 ((utf8[3] & 0x3fL) << 12) |
Chris@0 198 ((utf8[4] & 0x3fL) << 6) |
Chris@0 199 ((utf8[5] & 0x3fL) << 0);
Chris@0 200 if (*ucs4 >= 0x04000000L)
Chris@0 201 return utf8 - start + 6;
Chris@0 202 }
Chris@0 203
Chris@0 204 ++utf8;
Chris@0 205 }
Chris@0 206 }
Chris@0 207
Chris@0 208 /*
Chris@0 209 * NAME: utf8->encodechar()
Chris@0 210 * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars
Chris@0 211 */
Chris@0 212 id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
Chris@0 213 {
Chris@0 214 if (ucs4 <= 0x0000007fL) {
Chris@0 215 utf8[0] = ucs4;
Chris@0 216
Chris@0 217 return 1;
Chris@0 218 }
Chris@0 219 else if (ucs4 <= 0x000007ffL) {
Chris@0 220 utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f);
Chris@0 221 utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 222
Chris@0 223 return 2;
Chris@0 224 }
Chris@0 225 else if (ucs4 <= 0x0000ffffL) {
Chris@0 226 utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
Chris@0 227 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
Chris@0 228 utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 229
Chris@0 230 return 3;
Chris@0 231 }
Chris@0 232 else if (ucs4 <= 0x001fffffL) {
Chris@0 233 utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
Chris@0 234 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
Chris@0 235 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
Chris@0 236 utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 237
Chris@0 238 return 4;
Chris@0 239 }
Chris@0 240 else if (ucs4 <= 0x03ffffffL) {
Chris@0 241 utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
Chris@0 242 utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
Chris@0 243 utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
Chris@0 244 utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f);
Chris@0 245 utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 246
Chris@0 247 return 5;
Chris@0 248 }
Chris@0 249 else if (ucs4 <= 0x7fffffffL) {
Chris@0 250 utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
Chris@0 251 utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
Chris@0 252 utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
Chris@0 253 utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
Chris@0 254 utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f);
Chris@0 255 utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f);
Chris@0 256
Chris@0 257 return 6;
Chris@0 258 }
Chris@0 259
Chris@0 260 /* default */
Chris@0 261
Chris@0 262 return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
Chris@0 263 }
Chris@0 264
Chris@0 265 /*
Chris@0 266 * NAME: utf8->decode()
Chris@0 267 * DESCRIPTION: decode a complete utf8 string into a ucs4 string
Chris@0 268 */
Chris@0 269 void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
Chris@0 270 {
Chris@0 271 do
Chris@0 272 utf8 += id3_utf8_decodechar(utf8, ucs4);
Chris@0 273 while (*ucs4++);
Chris@0 274 }
Chris@0 275
Chris@0 276 /*
Chris@0 277 * NAME: utf8->encode()
Chris@0 278 * DESCRIPTION: encode a complete ucs4 string into a utf8 string
Chris@0 279 */
Chris@0 280 void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
Chris@0 281 {
Chris@0 282 do
Chris@0 283 utf8 += id3_utf8_encodechar(utf8, *ucs4);
Chris@0 284 while (*ucs4++);
Chris@0 285 }
Chris@0 286
Chris@0 287 /*
Chris@0 288 * NAME: utf8->put()
Chris@0 289 * DESCRIPTION: serialize a single utf8 character
Chris@0 290 */
Chris@0 291 id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
Chris@0 292 {
Chris@0 293 if (ptr)
Chris@0 294 *(*ptr)++ = utf8;
Chris@0 295
Chris@0 296 return 1;
Chris@0 297 }
Chris@0 298
Chris@0 299 /*
Chris@0 300 * NAME: utf8->get()
Chris@0 301 * DESCRIPTION: deserialize a single utf8 character
Chris@0 302 */
Chris@0 303 id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
Chris@0 304 {
Chris@0 305 return *(*ptr)++;
Chris@0 306 }
Chris@0 307
Chris@0 308 /*
Chris@0 309 * NAME: utf8->serialize()
Chris@0 310 * DESCRIPTION: serialize a ucs4 string using utf8 encoding
Chris@0 311 */
Chris@0 312 id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
Chris@0 313 int terminate)
Chris@0 314 {
Chris@0 315 id3_length_t size = 0;
Chris@0 316 id3_utf8_t utf8[6], *out;
Chris@0 317
Chris@0 318 while (*ucs4) {
Chris@0 319 switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
Chris@0 320 case 6: size += id3_utf8_put(ptr, *out++);
Chris@0 321 case 5: size += id3_utf8_put(ptr, *out++);
Chris@0 322 case 4: size += id3_utf8_put(ptr, *out++);
Chris@0 323 case 3: size += id3_utf8_put(ptr, *out++);
Chris@0 324 case 2: size += id3_utf8_put(ptr, *out++);
Chris@0 325 case 1: size += id3_utf8_put(ptr, *out++);
Chris@0 326 case 0: break;
Chris@0 327 }
Chris@0 328 }
Chris@0 329
Chris@0 330 if (terminate)
Chris@0 331 size += id3_utf8_put(ptr, 0);
Chris@0 332
Chris@0 333 return size;
Chris@0 334 }
Chris@0 335
Chris@0 336 /*
Chris@0 337 * NAME: utf8->deserialize()
Chris@0 338 * DESCRIPTION: deserialize a ucs4 string using utf8 encoding
Chris@0 339 */
Chris@0 340 id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
Chris@0 341 {
Chris@0 342 id3_byte_t const *end;
Chris@0 343 id3_utf8_t *utf8ptr, *utf8;
Chris@0 344 id3_ucs4_t *ucs4;
Chris@0 345
Chris@0 346 end = *ptr + length;
Chris@0 347
Chris@0 348 utf8 = malloc((length + 1) * sizeof(*utf8));
Chris@0 349 if (utf8 == 0)
Chris@0 350 return 0;
Chris@0 351
Chris@0 352 utf8ptr = utf8;
Chris@0 353 while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
Chris@0 354 ++utf8ptr;
Chris@0 355
Chris@0 356 *utf8ptr = 0;
Chris@0 357
Chris@0 358 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
Chris@0 359 if (ucs4)
Chris@0 360 id3_utf8_decode(utf8, ucs4);
Chris@0 361
Chris@0 362 free(utf8);
Chris@0 363
Chris@0 364 return ucs4;
Chris@0 365 }