annotate src/libid3tag-0.15.1b/utf8.c @ 169:223a55898ab9 tip default

Add null config files
author Chris Cannam <cannam@all-day-breakfast.com>
date Mon, 02 Mar 2020 14:03:47 +0000
parents 545efbb81310
children
rev   line source
cannam@85 1 /*
cannam@85 2 * libid3tag - ID3 tag manipulation library
cannam@85 3 * Copyright (C) 2000-2004 Underbit Technologies, Inc.
cannam@85 4 *
cannam@85 5 * This program is free software; you can redistribute it and/or modify
cannam@85 6 * it under the terms of the GNU General Public License as published by
cannam@85 7 * the Free Software Foundation; either version 2 of the License, or
cannam@85 8 * (at your option) any later version.
cannam@85 9 *
cannam@85 10 * This program is distributed in the hope that it will be useful,
cannam@85 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
cannam@85 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cannam@85 13 * GNU General Public License for more details.
cannam@85 14 *
cannam@85 15 * You should have received a copy of the GNU General Public License
cannam@85 16 * along with this program; if not, write to the Free Software
cannam@85 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
cannam@85 18 *
cannam@85 19 * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $
cannam@85 20 */
cannam@85 21
cannam@85 22 # ifdef HAVE_CONFIG_H
cannam@85 23 # include "config.h"
cannam@85 24 # endif
cannam@85 25
cannam@85 26 # include "global.h"
cannam@85 27
cannam@85 28 # include <stdlib.h>
cannam@85 29
cannam@85 30 # include "id3tag.h"
cannam@85 31 # include "utf8.h"
cannam@85 32 # include "ucs4.h"
cannam@85 33
cannam@85 34 /*
cannam@85 35 * NAME: utf8->length()
cannam@85 36 * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string
cannam@85 37 */
cannam@85 38 id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
cannam@85 39 {
cannam@85 40 id3_length_t length = 0;
cannam@85 41
cannam@85 42 while (*utf8) {
cannam@85 43 if ((utf8[0] & 0x80) == 0x00)
cannam@85 44 ++length;
cannam@85 45 else if ((utf8[0] & 0xe0) == 0xc0 &&
cannam@85 46 (utf8[1] & 0xc0) == 0x80) {
cannam@85 47 if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
cannam@85 48 ++length;
cannam@85 49 utf8 += 1;
cannam@85 50 }
cannam@85 51 }
cannam@85 52 else if ((utf8[0] & 0xf0) == 0xe0 &&
cannam@85 53 (utf8[1] & 0xc0) == 0x80 &&
cannam@85 54 (utf8[2] & 0xc0) == 0x80) {
cannam@85 55 if ((((utf8[0] & 0x0fL) << 12) |
cannam@85 56 ((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) {
cannam@85 57 ++length;
cannam@85 58 utf8 += 2;
cannam@85 59 }
cannam@85 60 }
cannam@85 61 else if ((utf8[0] & 0xf8) == 0xf0 &&
cannam@85 62 (utf8[1] & 0xc0) == 0x80 &&
cannam@85 63 (utf8[2] & 0xc0) == 0x80 &&
cannam@85 64 (utf8[3] & 0xc0) == 0x80) {
cannam@85 65 if ((((utf8[0] & 0x07L) << 18) |
cannam@85 66 ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
cannam@85 67 ++length;
cannam@85 68 utf8 += 3;
cannam@85 69 }
cannam@85 70 }
cannam@85 71 else if ((utf8[0] & 0xfc) == 0xf8 &&
cannam@85 72 (utf8[1] & 0xc0) == 0x80 &&
cannam@85 73 (utf8[2] & 0xc0) == 0x80 &&
cannam@85 74 (utf8[3] & 0xc0) == 0x80 &&
cannam@85 75 (utf8[4] & 0xc0) == 0x80) {
cannam@85 76 if ((((utf8[0] & 0x03L) << 24) |
cannam@85 77 ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
cannam@85 78 ++length;
cannam@85 79 utf8 += 4;
cannam@85 80 }
cannam@85 81 }
cannam@85 82 else if ((utf8[0] & 0xfe) == 0xfc &&
cannam@85 83 (utf8[1] & 0xc0) == 0x80 &&
cannam@85 84 (utf8[2] & 0xc0) == 0x80 &&
cannam@85 85 (utf8[3] & 0xc0) == 0x80 &&
cannam@85 86 (utf8[4] & 0xc0) == 0x80 &&
cannam@85 87 (utf8[5] & 0xc0) == 0x80) {
cannam@85 88 if ((((utf8[0] & 0x01L) << 30) |
cannam@85 89 ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
cannam@85 90 ++length;
cannam@85 91 utf8 += 5;
cannam@85 92 }
cannam@85 93 }
cannam@85 94
cannam@85 95 ++utf8;
cannam@85 96 }
cannam@85 97
cannam@85 98 return length;
cannam@85 99 }
cannam@85 100
cannam@85 101 /*
cannam@85 102 * NAME: utf8->size()
cannam@85 103 * DESCRIPTION: return the encoding size of a utf8 string
cannam@85 104 */
cannam@85 105 id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
cannam@85 106 {
cannam@85 107 id3_utf8_t const *ptr = utf8;
cannam@85 108
cannam@85 109 while (*ptr)
cannam@85 110 ++ptr;
cannam@85 111
cannam@85 112 return ptr - utf8 + 1;
cannam@85 113 }
cannam@85 114
cannam@85 115 /*
cannam@85 116 * NAME: utf8->ucs4duplicate()
cannam@85 117 * DESCRIPTION: duplicate and decode a utf8 string into ucs4
cannam@85 118 */
cannam@85 119 id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
cannam@85 120 {
cannam@85 121 id3_ucs4_t *ucs4;
cannam@85 122
cannam@85 123 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
cannam@85 124 if (ucs4)
cannam@85 125 id3_utf8_decode(utf8, ucs4);
cannam@85 126
cannam@85 127 return release(ucs4);
cannam@85 128 }
cannam@85 129
cannam@85 130 /*
cannam@85 131 * NAME: utf8->decodechar()
cannam@85 132 * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char
cannam@85 133 */
cannam@85 134 id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
cannam@85 135 {
cannam@85 136 id3_utf8_t const *start = utf8;
cannam@85 137
cannam@85 138 while (1) {
cannam@85 139 if ((utf8[0] & 0x80) == 0x00) {
cannam@85 140 *ucs4 = utf8[0];
cannam@85 141 return utf8 - start + 1;
cannam@85 142 }
cannam@85 143 else if ((utf8[0] & 0xe0) == 0xc0 &&
cannam@85 144 (utf8[1] & 0xc0) == 0x80) {
cannam@85 145 *ucs4 =
cannam@85 146 ((utf8[0] & 0x1fL) << 6) |
cannam@85 147 ((utf8[1] & 0x3fL) << 0);
cannam@85 148 if (*ucs4 >= 0x00000080L)
cannam@85 149 return utf8 - start + 2;
cannam@85 150 }
cannam@85 151 else if ((utf8[0] & 0xf0) == 0xe0 &&
cannam@85 152 (utf8[1] & 0xc0) == 0x80 &&
cannam@85 153 (utf8[2] & 0xc0) == 0x80) {
cannam@85 154 *ucs4 =
cannam@85 155 ((utf8[0] & 0x0fL) << 12) |
cannam@85 156 ((utf8[1] & 0x3fL) << 6) |
cannam@85 157 ((utf8[2] & 0x3fL) << 0);
cannam@85 158 if (*ucs4 >= 0x00000800L)
cannam@85 159 return utf8 - start + 3;
cannam@85 160 }
cannam@85 161 else if ((utf8[0] & 0xf8) == 0xf0 &&
cannam@85 162 (utf8[1] & 0xc0) == 0x80 &&
cannam@85 163 (utf8[2] & 0xc0) == 0x80 &&
cannam@85 164 (utf8[3] & 0xc0) == 0x80) {
cannam@85 165 *ucs4 =
cannam@85 166 ((utf8[0] & 0x07L) << 18) |
cannam@85 167 ((utf8[1] & 0x3fL) << 12) |
cannam@85 168 ((utf8[2] & 0x3fL) << 6) |
cannam@85 169 ((utf8[3] & 0x3fL) << 0);
cannam@85 170 if (*ucs4 >= 0x00010000L)
cannam@85 171 return utf8 - start + 4;
cannam@85 172 }
cannam@85 173 else if ((utf8[0] & 0xfc) == 0xf8 &&
cannam@85 174 (utf8[1] & 0xc0) == 0x80 &&
cannam@85 175 (utf8[2] & 0xc0) == 0x80 &&
cannam@85 176 (utf8[3] & 0xc0) == 0x80 &&
cannam@85 177 (utf8[4] & 0xc0) == 0x80) {
cannam@85 178 *ucs4 =
cannam@85 179 ((utf8[0] & 0x03L) << 24) |
cannam@85 180 ((utf8[1] & 0x3fL) << 18) |
cannam@85 181 ((utf8[2] & 0x3fL) << 12) |
cannam@85 182 ((utf8[3] & 0x3fL) << 6) |
cannam@85 183 ((utf8[4] & 0x3fL) << 0);
cannam@85 184 if (*ucs4 >= 0x00200000L)
cannam@85 185 return utf8 - start + 5;
cannam@85 186 }
cannam@85 187 else if ((utf8[0] & 0xfe) == 0xfc &&
cannam@85 188 (utf8[1] & 0xc0) == 0x80 &&
cannam@85 189 (utf8[2] & 0xc0) == 0x80 &&
cannam@85 190 (utf8[3] & 0xc0) == 0x80 &&
cannam@85 191 (utf8[4] & 0xc0) == 0x80 &&
cannam@85 192 (utf8[5] & 0xc0) == 0x80) {
cannam@85 193 *ucs4 =
cannam@85 194 ((utf8[0] & 0x01L) << 30) |
cannam@85 195 ((utf8[1] & 0x3fL) << 24) |
cannam@85 196 ((utf8[2] & 0x3fL) << 18) |
cannam@85 197 ((utf8[3] & 0x3fL) << 12) |
cannam@85 198 ((utf8[4] & 0x3fL) << 6) |
cannam@85 199 ((utf8[5] & 0x3fL) << 0);
cannam@85 200 if (*ucs4 >= 0x04000000L)
cannam@85 201 return utf8 - start + 6;
cannam@85 202 }
cannam@85 203
cannam@85 204 ++utf8;
cannam@85 205 }
cannam@85 206 }
cannam@85 207
cannam@85 208 /*
cannam@85 209 * NAME: utf8->encodechar()
cannam@85 210 * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars
cannam@85 211 */
cannam@85 212 id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
cannam@85 213 {
cannam@85 214 if (ucs4 <= 0x0000007fL) {
cannam@85 215 utf8[0] = ucs4;
cannam@85 216
cannam@85 217 return 1;
cannam@85 218 }
cannam@85 219 else if (ucs4 <= 0x000007ffL) {
cannam@85 220 utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f);
cannam@85 221 utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f);
cannam@85 222
cannam@85 223 return 2;
cannam@85 224 }
cannam@85 225 else if (ucs4 <= 0x0000ffffL) {
cannam@85 226 utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
cannam@85 227 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
cannam@85 228 utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f);
cannam@85 229
cannam@85 230 return 3;
cannam@85 231 }
cannam@85 232 else if (ucs4 <= 0x001fffffL) {
cannam@85 233 utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
cannam@85 234 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
cannam@85 235 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
cannam@85 236 utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f);
cannam@85 237
cannam@85 238 return 4;
cannam@85 239 }
cannam@85 240 else if (ucs4 <= 0x03ffffffL) {
cannam@85 241 utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
cannam@85 242 utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
cannam@85 243 utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
cannam@85 244 utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f);
cannam@85 245 utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f);
cannam@85 246
cannam@85 247 return 5;
cannam@85 248 }
cannam@85 249 else if (ucs4 <= 0x7fffffffL) {
cannam@85 250 utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
cannam@85 251 utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
cannam@85 252 utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
cannam@85 253 utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
cannam@85 254 utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f);
cannam@85 255 utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f);
cannam@85 256
cannam@85 257 return 6;
cannam@85 258 }
cannam@85 259
cannam@85 260 /* default */
cannam@85 261
cannam@85 262 return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
cannam@85 263 }
cannam@85 264
cannam@85 265 /*
cannam@85 266 * NAME: utf8->decode()
cannam@85 267 * DESCRIPTION: decode a complete utf8 string into a ucs4 string
cannam@85 268 */
cannam@85 269 void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
cannam@85 270 {
cannam@85 271 do
cannam@85 272 utf8 += id3_utf8_decodechar(utf8, ucs4);
cannam@85 273 while (*ucs4++);
cannam@85 274 }
cannam@85 275
cannam@85 276 /*
cannam@85 277 * NAME: utf8->encode()
cannam@85 278 * DESCRIPTION: encode a complete ucs4 string into a utf8 string
cannam@85 279 */
cannam@85 280 void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
cannam@85 281 {
cannam@85 282 do
cannam@85 283 utf8 += id3_utf8_encodechar(utf8, *ucs4);
cannam@85 284 while (*ucs4++);
cannam@85 285 }
cannam@85 286
cannam@85 287 /*
cannam@85 288 * NAME: utf8->put()
cannam@85 289 * DESCRIPTION: serialize a single utf8 character
cannam@85 290 */
cannam@85 291 id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
cannam@85 292 {
cannam@85 293 if (ptr)
cannam@85 294 *(*ptr)++ = utf8;
cannam@85 295
cannam@85 296 return 1;
cannam@85 297 }
cannam@85 298
cannam@85 299 /*
cannam@85 300 * NAME: utf8->get()
cannam@85 301 * DESCRIPTION: deserialize a single utf8 character
cannam@85 302 */
cannam@85 303 id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
cannam@85 304 {
cannam@85 305 return *(*ptr)++;
cannam@85 306 }
cannam@85 307
cannam@85 308 /*
cannam@85 309 * NAME: utf8->serialize()
cannam@85 310 * DESCRIPTION: serialize a ucs4 string using utf8 encoding
cannam@85 311 */
cannam@85 312 id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
cannam@85 313 int terminate)
cannam@85 314 {
cannam@85 315 id3_length_t size = 0;
cannam@85 316 id3_utf8_t utf8[6], *out;
cannam@85 317
cannam@85 318 while (*ucs4) {
cannam@85 319 switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
cannam@85 320 case 6: size += id3_utf8_put(ptr, *out++);
cannam@85 321 case 5: size += id3_utf8_put(ptr, *out++);
cannam@85 322 case 4: size += id3_utf8_put(ptr, *out++);
cannam@85 323 case 3: size += id3_utf8_put(ptr, *out++);
cannam@85 324 case 2: size += id3_utf8_put(ptr, *out++);
cannam@85 325 case 1: size += id3_utf8_put(ptr, *out++);
cannam@85 326 case 0: break;
cannam@85 327 }
cannam@85 328 }
cannam@85 329
cannam@85 330 if (terminate)
cannam@85 331 size += id3_utf8_put(ptr, 0);
cannam@85 332
cannam@85 333 return size;
cannam@85 334 }
cannam@85 335
cannam@85 336 /*
cannam@85 337 * NAME: utf8->deserialize()
cannam@85 338 * DESCRIPTION: deserialize a ucs4 string using utf8 encoding
cannam@85 339 */
cannam@85 340 id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
cannam@85 341 {
cannam@85 342 id3_byte_t const *end;
cannam@85 343 id3_utf8_t *utf8ptr, *utf8;
cannam@85 344 id3_ucs4_t *ucs4;
cannam@85 345
cannam@85 346 end = *ptr + length;
cannam@85 347
cannam@85 348 utf8 = malloc((length + 1) * sizeof(*utf8));
cannam@85 349 if (utf8 == 0)
cannam@85 350 return 0;
cannam@85 351
cannam@85 352 utf8ptr = utf8;
cannam@85 353 while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
cannam@85 354 ++utf8ptr;
cannam@85 355
cannam@85 356 *utf8ptr = 0;
cannam@85 357
cannam@85 358 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
cannam@85 359 if (ucs4)
cannam@85 360 id3_utf8_decode(utf8, ucs4);
cannam@85 361
cannam@85 362 free(utf8);
cannam@85 363
cannam@85 364 return ucs4;
cannam@85 365 }