cannam@85
|
1 /*
|
cannam@85
|
2 * libid3tag - ID3 tag manipulation library
|
cannam@85
|
3 * Copyright (C) 2000-2004 Underbit Technologies, Inc.
|
cannam@85
|
4 *
|
cannam@85
|
5 * This program is free software; you can redistribute it and/or modify
|
cannam@85
|
6 * it under the terms of the GNU General Public License as published by
|
cannam@85
|
7 * the Free Software Foundation; either version 2 of the License, or
|
cannam@85
|
8 * (at your option) any later version.
|
cannam@85
|
9 *
|
cannam@85
|
10 * This program is distributed in the hope that it will be useful,
|
cannam@85
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
cannam@85
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
cannam@85
|
13 * GNU General Public License for more details.
|
cannam@85
|
14 *
|
cannam@85
|
15 * You should have received a copy of the GNU General Public License
|
cannam@85
|
16 * along with this program; if not, write to the Free Software
|
cannam@85
|
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
cannam@85
|
18 *
|
cannam@85
|
19 * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $
|
cannam@85
|
20 */
|
cannam@85
|
21
|
cannam@85
|
22 # ifdef HAVE_CONFIG_H
|
cannam@85
|
23 # include "config.h"
|
cannam@85
|
24 # endif
|
cannam@85
|
25
|
cannam@85
|
26 # include "global.h"
|
cannam@85
|
27
|
cannam@85
|
28 # include <stdlib.h>
|
cannam@85
|
29
|
cannam@85
|
30 # include "id3tag.h"
|
cannam@85
|
31 # include "utf8.h"
|
cannam@85
|
32 # include "ucs4.h"
|
cannam@85
|
33
|
cannam@85
|
34 /*
|
cannam@85
|
35 * NAME: utf8->length()
|
cannam@85
|
36 * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string
|
cannam@85
|
37 */
|
cannam@85
|
38 id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
|
cannam@85
|
39 {
|
cannam@85
|
40 id3_length_t length = 0;
|
cannam@85
|
41
|
cannam@85
|
42 while (*utf8) {
|
cannam@85
|
43 if ((utf8[0] & 0x80) == 0x00)
|
cannam@85
|
44 ++length;
|
cannam@85
|
45 else if ((utf8[0] & 0xe0) == 0xc0 &&
|
cannam@85
|
46 (utf8[1] & 0xc0) == 0x80) {
|
cannam@85
|
47 if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
|
cannam@85
|
48 ++length;
|
cannam@85
|
49 utf8 += 1;
|
cannam@85
|
50 }
|
cannam@85
|
51 }
|
cannam@85
|
52 else if ((utf8[0] & 0xf0) == 0xe0 &&
|
cannam@85
|
53 (utf8[1] & 0xc0) == 0x80 &&
|
cannam@85
|
54 (utf8[2] & 0xc0) == 0x80) {
|
cannam@85
|
55 if ((((utf8[0] & 0x0fL) << 12) |
|
cannam@85
|
56 ((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) {
|
cannam@85
|
57 ++length;
|
cannam@85
|
58 utf8 += 2;
|
cannam@85
|
59 }
|
cannam@85
|
60 }
|
cannam@85
|
61 else if ((utf8[0] & 0xf8) == 0xf0 &&
|
cannam@85
|
62 (utf8[1] & 0xc0) == 0x80 &&
|
cannam@85
|
63 (utf8[2] & 0xc0) == 0x80 &&
|
cannam@85
|
64 (utf8[3] & 0xc0) == 0x80) {
|
cannam@85
|
65 if ((((utf8[0] & 0x07L) << 18) |
|
cannam@85
|
66 ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
|
cannam@85
|
67 ++length;
|
cannam@85
|
68 utf8 += 3;
|
cannam@85
|
69 }
|
cannam@85
|
70 }
|
cannam@85
|
71 else if ((utf8[0] & 0xfc) == 0xf8 &&
|
cannam@85
|
72 (utf8[1] & 0xc0) == 0x80 &&
|
cannam@85
|
73 (utf8[2] & 0xc0) == 0x80 &&
|
cannam@85
|
74 (utf8[3] & 0xc0) == 0x80 &&
|
cannam@85
|
75 (utf8[4] & 0xc0) == 0x80) {
|
cannam@85
|
76 if ((((utf8[0] & 0x03L) << 24) |
|
cannam@85
|
77 ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
|
cannam@85
|
78 ++length;
|
cannam@85
|
79 utf8 += 4;
|
cannam@85
|
80 }
|
cannam@85
|
81 }
|
cannam@85
|
82 else if ((utf8[0] & 0xfe) == 0xfc &&
|
cannam@85
|
83 (utf8[1] & 0xc0) == 0x80 &&
|
cannam@85
|
84 (utf8[2] & 0xc0) == 0x80 &&
|
cannam@85
|
85 (utf8[3] & 0xc0) == 0x80 &&
|
cannam@85
|
86 (utf8[4] & 0xc0) == 0x80 &&
|
cannam@85
|
87 (utf8[5] & 0xc0) == 0x80) {
|
cannam@85
|
88 if ((((utf8[0] & 0x01L) << 30) |
|
cannam@85
|
89 ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
|
cannam@85
|
90 ++length;
|
cannam@85
|
91 utf8 += 5;
|
cannam@85
|
92 }
|
cannam@85
|
93 }
|
cannam@85
|
94
|
cannam@85
|
95 ++utf8;
|
cannam@85
|
96 }
|
cannam@85
|
97
|
cannam@85
|
98 return length;
|
cannam@85
|
99 }
|
cannam@85
|
100
|
cannam@85
|
101 /*
|
cannam@85
|
102 * NAME: utf8->size()
|
cannam@85
|
103 * DESCRIPTION: return the encoding size of a utf8 string
|
cannam@85
|
104 */
|
cannam@85
|
105 id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
|
cannam@85
|
106 {
|
cannam@85
|
107 id3_utf8_t const *ptr = utf8;
|
cannam@85
|
108
|
cannam@85
|
109 while (*ptr)
|
cannam@85
|
110 ++ptr;
|
cannam@85
|
111
|
cannam@85
|
112 return ptr - utf8 + 1;
|
cannam@85
|
113 }
|
cannam@85
|
114
|
cannam@85
|
115 /*
|
cannam@85
|
116 * NAME: utf8->ucs4duplicate()
|
cannam@85
|
117 * DESCRIPTION: duplicate and decode a utf8 string into ucs4
|
cannam@85
|
118 */
|
cannam@85
|
119 id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
|
cannam@85
|
120 {
|
cannam@85
|
121 id3_ucs4_t *ucs4;
|
cannam@85
|
122
|
cannam@85
|
123 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
|
cannam@85
|
124 if (ucs4)
|
cannam@85
|
125 id3_utf8_decode(utf8, ucs4);
|
cannam@85
|
126
|
cannam@85
|
127 return release(ucs4);
|
cannam@85
|
128 }
|
cannam@85
|
129
|
cannam@85
|
130 /*
|
cannam@85
|
131 * NAME: utf8->decodechar()
|
cannam@85
|
132 * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char
|
cannam@85
|
133 */
|
cannam@85
|
134 id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
|
cannam@85
|
135 {
|
cannam@85
|
136 id3_utf8_t const *start = utf8;
|
cannam@85
|
137
|
cannam@85
|
138 while (1) {
|
cannam@85
|
139 if ((utf8[0] & 0x80) == 0x00) {
|
cannam@85
|
140 *ucs4 = utf8[0];
|
cannam@85
|
141 return utf8 - start + 1;
|
cannam@85
|
142 }
|
cannam@85
|
143 else if ((utf8[0] & 0xe0) == 0xc0 &&
|
cannam@85
|
144 (utf8[1] & 0xc0) == 0x80) {
|
cannam@85
|
145 *ucs4 =
|
cannam@85
|
146 ((utf8[0] & 0x1fL) << 6) |
|
cannam@85
|
147 ((utf8[1] & 0x3fL) << 0);
|
cannam@85
|
148 if (*ucs4 >= 0x00000080L)
|
cannam@85
|
149 return utf8 - start + 2;
|
cannam@85
|
150 }
|
cannam@85
|
151 else if ((utf8[0] & 0xf0) == 0xe0 &&
|
cannam@85
|
152 (utf8[1] & 0xc0) == 0x80 &&
|
cannam@85
|
153 (utf8[2] & 0xc0) == 0x80) {
|
cannam@85
|
154 *ucs4 =
|
cannam@85
|
155 ((utf8[0] & 0x0fL) << 12) |
|
cannam@85
|
156 ((utf8[1] & 0x3fL) << 6) |
|
cannam@85
|
157 ((utf8[2] & 0x3fL) << 0);
|
cannam@85
|
158 if (*ucs4 >= 0x00000800L)
|
cannam@85
|
159 return utf8 - start + 3;
|
cannam@85
|
160 }
|
cannam@85
|
161 else if ((utf8[0] & 0xf8) == 0xf0 &&
|
cannam@85
|
162 (utf8[1] & 0xc0) == 0x80 &&
|
cannam@85
|
163 (utf8[2] & 0xc0) == 0x80 &&
|
cannam@85
|
164 (utf8[3] & 0xc0) == 0x80) {
|
cannam@85
|
165 *ucs4 =
|
cannam@85
|
166 ((utf8[0] & 0x07L) << 18) |
|
cannam@85
|
167 ((utf8[1] & 0x3fL) << 12) |
|
cannam@85
|
168 ((utf8[2] & 0x3fL) << 6) |
|
cannam@85
|
169 ((utf8[3] & 0x3fL) << 0);
|
cannam@85
|
170 if (*ucs4 >= 0x00010000L)
|
cannam@85
|
171 return utf8 - start + 4;
|
cannam@85
|
172 }
|
cannam@85
|
173 else if ((utf8[0] & 0xfc) == 0xf8 &&
|
cannam@85
|
174 (utf8[1] & 0xc0) == 0x80 &&
|
cannam@85
|
175 (utf8[2] & 0xc0) == 0x80 &&
|
cannam@85
|
176 (utf8[3] & 0xc0) == 0x80 &&
|
cannam@85
|
177 (utf8[4] & 0xc0) == 0x80) {
|
cannam@85
|
178 *ucs4 =
|
cannam@85
|
179 ((utf8[0] & 0x03L) << 24) |
|
cannam@85
|
180 ((utf8[1] & 0x3fL) << 18) |
|
cannam@85
|
181 ((utf8[2] & 0x3fL) << 12) |
|
cannam@85
|
182 ((utf8[3] & 0x3fL) << 6) |
|
cannam@85
|
183 ((utf8[4] & 0x3fL) << 0);
|
cannam@85
|
184 if (*ucs4 >= 0x00200000L)
|
cannam@85
|
185 return utf8 - start + 5;
|
cannam@85
|
186 }
|
cannam@85
|
187 else if ((utf8[0] & 0xfe) == 0xfc &&
|
cannam@85
|
188 (utf8[1] & 0xc0) == 0x80 &&
|
cannam@85
|
189 (utf8[2] & 0xc0) == 0x80 &&
|
cannam@85
|
190 (utf8[3] & 0xc0) == 0x80 &&
|
cannam@85
|
191 (utf8[4] & 0xc0) == 0x80 &&
|
cannam@85
|
192 (utf8[5] & 0xc0) == 0x80) {
|
cannam@85
|
193 *ucs4 =
|
cannam@85
|
194 ((utf8[0] & 0x01L) << 30) |
|
cannam@85
|
195 ((utf8[1] & 0x3fL) << 24) |
|
cannam@85
|
196 ((utf8[2] & 0x3fL) << 18) |
|
cannam@85
|
197 ((utf8[3] & 0x3fL) << 12) |
|
cannam@85
|
198 ((utf8[4] & 0x3fL) << 6) |
|
cannam@85
|
199 ((utf8[5] & 0x3fL) << 0);
|
cannam@85
|
200 if (*ucs4 >= 0x04000000L)
|
cannam@85
|
201 return utf8 - start + 6;
|
cannam@85
|
202 }
|
cannam@85
|
203
|
cannam@85
|
204 ++utf8;
|
cannam@85
|
205 }
|
cannam@85
|
206 }
|
cannam@85
|
207
|
cannam@85
|
208 /*
|
cannam@85
|
209 * NAME: utf8->encodechar()
|
cannam@85
|
210 * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars
|
cannam@85
|
211 */
|
cannam@85
|
212 id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
|
cannam@85
|
213 {
|
cannam@85
|
214 if (ucs4 <= 0x0000007fL) {
|
cannam@85
|
215 utf8[0] = ucs4;
|
cannam@85
|
216
|
cannam@85
|
217 return 1;
|
cannam@85
|
218 }
|
cannam@85
|
219 else if (ucs4 <= 0x000007ffL) {
|
cannam@85
|
220 utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f);
|
cannam@85
|
221 utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f);
|
cannam@85
|
222
|
cannam@85
|
223 return 2;
|
cannam@85
|
224 }
|
cannam@85
|
225 else if (ucs4 <= 0x0000ffffL) {
|
cannam@85
|
226 utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
|
cannam@85
|
227 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
|
cannam@85
|
228 utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f);
|
cannam@85
|
229
|
cannam@85
|
230 return 3;
|
cannam@85
|
231 }
|
cannam@85
|
232 else if (ucs4 <= 0x001fffffL) {
|
cannam@85
|
233 utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
|
cannam@85
|
234 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
|
cannam@85
|
235 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
|
cannam@85
|
236 utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f);
|
cannam@85
|
237
|
cannam@85
|
238 return 4;
|
cannam@85
|
239 }
|
cannam@85
|
240 else if (ucs4 <= 0x03ffffffL) {
|
cannam@85
|
241 utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
|
cannam@85
|
242 utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
|
cannam@85
|
243 utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
|
cannam@85
|
244 utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f);
|
cannam@85
|
245 utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f);
|
cannam@85
|
246
|
cannam@85
|
247 return 5;
|
cannam@85
|
248 }
|
cannam@85
|
249 else if (ucs4 <= 0x7fffffffL) {
|
cannam@85
|
250 utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
|
cannam@85
|
251 utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
|
cannam@85
|
252 utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
|
cannam@85
|
253 utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
|
cannam@85
|
254 utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f);
|
cannam@85
|
255 utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f);
|
cannam@85
|
256
|
cannam@85
|
257 return 6;
|
cannam@85
|
258 }
|
cannam@85
|
259
|
cannam@85
|
260 /* default */
|
cannam@85
|
261
|
cannam@85
|
262 return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
|
cannam@85
|
263 }
|
cannam@85
|
264
|
cannam@85
|
265 /*
|
cannam@85
|
266 * NAME: utf8->decode()
|
cannam@85
|
267 * DESCRIPTION: decode a complete utf8 string into a ucs4 string
|
cannam@85
|
268 */
|
cannam@85
|
269 void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
|
cannam@85
|
270 {
|
cannam@85
|
271 do
|
cannam@85
|
272 utf8 += id3_utf8_decodechar(utf8, ucs4);
|
cannam@85
|
273 while (*ucs4++);
|
cannam@85
|
274 }
|
cannam@85
|
275
|
cannam@85
|
276 /*
|
cannam@85
|
277 * NAME: utf8->encode()
|
cannam@85
|
278 * DESCRIPTION: encode a complete ucs4 string into a utf8 string
|
cannam@85
|
279 */
|
cannam@85
|
280 void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
|
cannam@85
|
281 {
|
cannam@85
|
282 do
|
cannam@85
|
283 utf8 += id3_utf8_encodechar(utf8, *ucs4);
|
cannam@85
|
284 while (*ucs4++);
|
cannam@85
|
285 }
|
cannam@85
|
286
|
cannam@85
|
287 /*
|
cannam@85
|
288 * NAME: utf8->put()
|
cannam@85
|
289 * DESCRIPTION: serialize a single utf8 character
|
cannam@85
|
290 */
|
cannam@85
|
291 id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
|
cannam@85
|
292 {
|
cannam@85
|
293 if (ptr)
|
cannam@85
|
294 *(*ptr)++ = utf8;
|
cannam@85
|
295
|
cannam@85
|
296 return 1;
|
cannam@85
|
297 }
|
cannam@85
|
298
|
cannam@85
|
299 /*
|
cannam@85
|
300 * NAME: utf8->get()
|
cannam@85
|
301 * DESCRIPTION: deserialize a single utf8 character
|
cannam@85
|
302 */
|
cannam@85
|
303 id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
|
cannam@85
|
304 {
|
cannam@85
|
305 return *(*ptr)++;
|
cannam@85
|
306 }
|
cannam@85
|
307
|
cannam@85
|
308 /*
|
cannam@85
|
309 * NAME: utf8->serialize()
|
cannam@85
|
310 * DESCRIPTION: serialize a ucs4 string using utf8 encoding
|
cannam@85
|
311 */
|
cannam@85
|
312 id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
|
cannam@85
|
313 int terminate)
|
cannam@85
|
314 {
|
cannam@85
|
315 id3_length_t size = 0;
|
cannam@85
|
316 id3_utf8_t utf8[6], *out;
|
cannam@85
|
317
|
cannam@85
|
318 while (*ucs4) {
|
cannam@85
|
319 switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
|
cannam@85
|
320 case 6: size += id3_utf8_put(ptr, *out++);
|
cannam@85
|
321 case 5: size += id3_utf8_put(ptr, *out++);
|
cannam@85
|
322 case 4: size += id3_utf8_put(ptr, *out++);
|
cannam@85
|
323 case 3: size += id3_utf8_put(ptr, *out++);
|
cannam@85
|
324 case 2: size += id3_utf8_put(ptr, *out++);
|
cannam@85
|
325 case 1: size += id3_utf8_put(ptr, *out++);
|
cannam@85
|
326 case 0: break;
|
cannam@85
|
327 }
|
cannam@85
|
328 }
|
cannam@85
|
329
|
cannam@85
|
330 if (terminate)
|
cannam@85
|
331 size += id3_utf8_put(ptr, 0);
|
cannam@85
|
332
|
cannam@85
|
333 return size;
|
cannam@85
|
334 }
|
cannam@85
|
335
|
cannam@85
|
336 /*
|
cannam@85
|
337 * NAME: utf8->deserialize()
|
cannam@85
|
338 * DESCRIPTION: deserialize a ucs4 string using utf8 encoding
|
cannam@85
|
339 */
|
cannam@85
|
340 id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
|
cannam@85
|
341 {
|
cannam@85
|
342 id3_byte_t const *end;
|
cannam@85
|
343 id3_utf8_t *utf8ptr, *utf8;
|
cannam@85
|
344 id3_ucs4_t *ucs4;
|
cannam@85
|
345
|
cannam@85
|
346 end = *ptr + length;
|
cannam@85
|
347
|
cannam@85
|
348 utf8 = malloc((length + 1) * sizeof(*utf8));
|
cannam@85
|
349 if (utf8 == 0)
|
cannam@85
|
350 return 0;
|
cannam@85
|
351
|
cannam@85
|
352 utf8ptr = utf8;
|
cannam@85
|
353 while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
|
cannam@85
|
354 ++utf8ptr;
|
cannam@85
|
355
|
cannam@85
|
356 *utf8ptr = 0;
|
cannam@85
|
357
|
cannam@85
|
358 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
|
cannam@85
|
359 if (ucs4)
|
cannam@85
|
360 id3_utf8_decode(utf8, ucs4);
|
cannam@85
|
361
|
cannam@85
|
362 free(utf8);
|
cannam@85
|
363
|
cannam@85
|
364 return ucs4;
|
cannam@85
|
365 }
|