Mercurial > hg > sv-dependency-builds
comparison src/libid3tag-0.15.1b/utf8.c @ 85:545efbb81310
Import initial set of sources
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Mon, 18 Mar 2013 14:12:14 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 85:545efbb81310 |
---|---|
1 /* | |
2 * libid3tag - ID3 tag manipulation library | |
3 * Copyright (C) 2000-2004 Underbit Technologies, Inc. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
18 * | |
19 * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $ | |
20 */ | |
21 | |
22 # ifdef HAVE_CONFIG_H | |
23 # include "config.h" | |
24 # endif | |
25 | |
26 # include "global.h" | |
27 | |
28 # include <stdlib.h> | |
29 | |
30 # include "id3tag.h" | |
31 # include "utf8.h" | |
32 # include "ucs4.h" | |
33 | |
34 /* | |
35 * NAME: utf8->length() | |
36 * DESCRIPTION: return the number of ucs4 chars represented by a utf8 string | |
37 */ | |
38 id3_length_t id3_utf8_length(id3_utf8_t const *utf8) | |
39 { | |
40 id3_length_t length = 0; | |
41 | |
42 while (*utf8) { | |
43 if ((utf8[0] & 0x80) == 0x00) | |
44 ++length; | |
45 else if ((utf8[0] & 0xe0) == 0xc0 && | |
46 (utf8[1] & 0xc0) == 0x80) { | |
47 if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) { | |
48 ++length; | |
49 utf8 += 1; | |
50 } | |
51 } | |
52 else if ((utf8[0] & 0xf0) == 0xe0 && | |
53 (utf8[1] & 0xc0) == 0x80 && | |
54 (utf8[2] & 0xc0) == 0x80) { | |
55 if ((((utf8[0] & 0x0fL) << 12) | | |
56 ((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) { | |
57 ++length; | |
58 utf8 += 2; | |
59 } | |
60 } | |
61 else if ((utf8[0] & 0xf8) == 0xf0 && | |
62 (utf8[1] & 0xc0) == 0x80 && | |
63 (utf8[2] & 0xc0) == 0x80 && | |
64 (utf8[3] & 0xc0) == 0x80) { | |
65 if ((((utf8[0] & 0x07L) << 18) | | |
66 ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) { | |
67 ++length; | |
68 utf8 += 3; | |
69 } | |
70 } | |
71 else if ((utf8[0] & 0xfc) == 0xf8 && | |
72 (utf8[1] & 0xc0) == 0x80 && | |
73 (utf8[2] & 0xc0) == 0x80 && | |
74 (utf8[3] & 0xc0) == 0x80 && | |
75 (utf8[4] & 0xc0) == 0x80) { | |
76 if ((((utf8[0] & 0x03L) << 24) | | |
77 ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) { | |
78 ++length; | |
79 utf8 += 4; | |
80 } | |
81 } | |
82 else if ((utf8[0] & 0xfe) == 0xfc && | |
83 (utf8[1] & 0xc0) == 0x80 && | |
84 (utf8[2] & 0xc0) == 0x80 && | |
85 (utf8[3] & 0xc0) == 0x80 && | |
86 (utf8[4] & 0xc0) == 0x80 && | |
87 (utf8[5] & 0xc0) == 0x80) { | |
88 if ((((utf8[0] & 0x01L) << 30) | | |
89 ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) { | |
90 ++length; | |
91 utf8 += 5; | |
92 } | |
93 } | |
94 | |
95 ++utf8; | |
96 } | |
97 | |
98 return length; | |
99 } | |
100 | |
101 /* | |
102 * NAME: utf8->size() | |
103 * DESCRIPTION: return the encoding size of a utf8 string | |
104 */ | |
105 id3_length_t id3_utf8_size(id3_utf8_t const *utf8) | |
106 { | |
107 id3_utf8_t const *ptr = utf8; | |
108 | |
109 while (*ptr) | |
110 ++ptr; | |
111 | |
112 return ptr - utf8 + 1; | |
113 } | |
114 | |
115 /* | |
116 * NAME: utf8->ucs4duplicate() | |
117 * DESCRIPTION: duplicate and decode a utf8 string into ucs4 | |
118 */ | |
119 id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8) | |
120 { | |
121 id3_ucs4_t *ucs4; | |
122 | |
123 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4)); | |
124 if (ucs4) | |
125 id3_utf8_decode(utf8, ucs4); | |
126 | |
127 return release(ucs4); | |
128 } | |
129 | |
130 /* | |
131 * NAME: utf8->decodechar() | |
132 * DESCRIPTION: decode a series of utf8 chars into a single ucs4 char | |
133 */ | |
134 id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4) | |
135 { | |
136 id3_utf8_t const *start = utf8; | |
137 | |
138 while (1) { | |
139 if ((utf8[0] & 0x80) == 0x00) { | |
140 *ucs4 = utf8[0]; | |
141 return utf8 - start + 1; | |
142 } | |
143 else if ((utf8[0] & 0xe0) == 0xc0 && | |
144 (utf8[1] & 0xc0) == 0x80) { | |
145 *ucs4 = | |
146 ((utf8[0] & 0x1fL) << 6) | | |
147 ((utf8[1] & 0x3fL) << 0); | |
148 if (*ucs4 >= 0x00000080L) | |
149 return utf8 - start + 2; | |
150 } | |
151 else if ((utf8[0] & 0xf0) == 0xe0 && | |
152 (utf8[1] & 0xc0) == 0x80 && | |
153 (utf8[2] & 0xc0) == 0x80) { | |
154 *ucs4 = | |
155 ((utf8[0] & 0x0fL) << 12) | | |
156 ((utf8[1] & 0x3fL) << 6) | | |
157 ((utf8[2] & 0x3fL) << 0); | |
158 if (*ucs4 >= 0x00000800L) | |
159 return utf8 - start + 3; | |
160 } | |
161 else if ((utf8[0] & 0xf8) == 0xf0 && | |
162 (utf8[1] & 0xc0) == 0x80 && | |
163 (utf8[2] & 0xc0) == 0x80 && | |
164 (utf8[3] & 0xc0) == 0x80) { | |
165 *ucs4 = | |
166 ((utf8[0] & 0x07L) << 18) | | |
167 ((utf8[1] & 0x3fL) << 12) | | |
168 ((utf8[2] & 0x3fL) << 6) | | |
169 ((utf8[3] & 0x3fL) << 0); | |
170 if (*ucs4 >= 0x00010000L) | |
171 return utf8 - start + 4; | |
172 } | |
173 else if ((utf8[0] & 0xfc) == 0xf8 && | |
174 (utf8[1] & 0xc0) == 0x80 && | |
175 (utf8[2] & 0xc0) == 0x80 && | |
176 (utf8[3] & 0xc0) == 0x80 && | |
177 (utf8[4] & 0xc0) == 0x80) { | |
178 *ucs4 = | |
179 ((utf8[0] & 0x03L) << 24) | | |
180 ((utf8[1] & 0x3fL) << 18) | | |
181 ((utf8[2] & 0x3fL) << 12) | | |
182 ((utf8[3] & 0x3fL) << 6) | | |
183 ((utf8[4] & 0x3fL) << 0); | |
184 if (*ucs4 >= 0x00200000L) | |
185 return utf8 - start + 5; | |
186 } | |
187 else if ((utf8[0] & 0xfe) == 0xfc && | |
188 (utf8[1] & 0xc0) == 0x80 && | |
189 (utf8[2] & 0xc0) == 0x80 && | |
190 (utf8[3] & 0xc0) == 0x80 && | |
191 (utf8[4] & 0xc0) == 0x80 && | |
192 (utf8[5] & 0xc0) == 0x80) { | |
193 *ucs4 = | |
194 ((utf8[0] & 0x01L) << 30) | | |
195 ((utf8[1] & 0x3fL) << 24) | | |
196 ((utf8[2] & 0x3fL) << 18) | | |
197 ((utf8[3] & 0x3fL) << 12) | | |
198 ((utf8[4] & 0x3fL) << 6) | | |
199 ((utf8[5] & 0x3fL) << 0); | |
200 if (*ucs4 >= 0x04000000L) | |
201 return utf8 - start + 6; | |
202 } | |
203 | |
204 ++utf8; | |
205 } | |
206 } | |
207 | |
208 /* | |
209 * NAME: utf8->encodechar() | |
210 * DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars | |
211 */ | |
212 id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4) | |
213 { | |
214 if (ucs4 <= 0x0000007fL) { | |
215 utf8[0] = ucs4; | |
216 | |
217 return 1; | |
218 } | |
219 else if (ucs4 <= 0x000007ffL) { | |
220 utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f); | |
221 utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f); | |
222 | |
223 return 2; | |
224 } | |
225 else if (ucs4 <= 0x0000ffffL) { | |
226 utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f); | |
227 utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f); | |
228 utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f); | |
229 | |
230 return 3; | |
231 } | |
232 else if (ucs4 <= 0x001fffffL) { | |
233 utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07); | |
234 utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f); | |
235 utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f); | |
236 utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f); | |
237 | |
238 return 4; | |
239 } | |
240 else if (ucs4 <= 0x03ffffffL) { | |
241 utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03); | |
242 utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f); | |
243 utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f); | |
244 utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f); | |
245 utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f); | |
246 | |
247 return 5; | |
248 } | |
249 else if (ucs4 <= 0x7fffffffL) { | |
250 utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01); | |
251 utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f); | |
252 utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f); | |
253 utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f); | |
254 utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f); | |
255 utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f); | |
256 | |
257 return 6; | |
258 } | |
259 | |
260 /* default */ | |
261 | |
262 return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR); | |
263 } | |
264 | |
265 /* | |
266 * NAME: utf8->decode() | |
267 * DESCRIPTION: decode a complete utf8 string into a ucs4 string | |
268 */ | |
269 void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4) | |
270 { | |
271 do | |
272 utf8 += id3_utf8_decodechar(utf8, ucs4); | |
273 while (*ucs4++); | |
274 } | |
275 | |
276 /* | |
277 * NAME: utf8->encode() | |
278 * DESCRIPTION: encode a complete ucs4 string into a utf8 string | |
279 */ | |
280 void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4) | |
281 { | |
282 do | |
283 utf8 += id3_utf8_encodechar(utf8, *ucs4); | |
284 while (*ucs4++); | |
285 } | |
286 | |
287 /* | |
288 * NAME: utf8->put() | |
289 * DESCRIPTION: serialize a single utf8 character | |
290 */ | |
291 id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8) | |
292 { | |
293 if (ptr) | |
294 *(*ptr)++ = utf8; | |
295 | |
296 return 1; | |
297 } | |
298 | |
299 /* | |
300 * NAME: utf8->get() | |
301 * DESCRIPTION: deserialize a single utf8 character | |
302 */ | |
303 id3_utf8_t id3_utf8_get(id3_byte_t const **ptr) | |
304 { | |
305 return *(*ptr)++; | |
306 } | |
307 | |
308 /* | |
309 * NAME: utf8->serialize() | |
310 * DESCRIPTION: serialize a ucs4 string using utf8 encoding | |
311 */ | |
312 id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4, | |
313 int terminate) | |
314 { | |
315 id3_length_t size = 0; | |
316 id3_utf8_t utf8[6], *out; | |
317 | |
318 while (*ucs4) { | |
319 switch (id3_utf8_encodechar(out = utf8, *ucs4++)) { | |
320 case 6: size += id3_utf8_put(ptr, *out++); | |
321 case 5: size += id3_utf8_put(ptr, *out++); | |
322 case 4: size += id3_utf8_put(ptr, *out++); | |
323 case 3: size += id3_utf8_put(ptr, *out++); | |
324 case 2: size += id3_utf8_put(ptr, *out++); | |
325 case 1: size += id3_utf8_put(ptr, *out++); | |
326 case 0: break; | |
327 } | |
328 } | |
329 | |
330 if (terminate) | |
331 size += id3_utf8_put(ptr, 0); | |
332 | |
333 return size; | |
334 } | |
335 | |
336 /* | |
337 * NAME: utf8->deserialize() | |
338 * DESCRIPTION: deserialize a ucs4 string using utf8 encoding | |
339 */ | |
340 id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length) | |
341 { | |
342 id3_byte_t const *end; | |
343 id3_utf8_t *utf8ptr, *utf8; | |
344 id3_ucs4_t *ucs4; | |
345 | |
346 end = *ptr + length; | |
347 | |
348 utf8 = malloc((length + 1) * sizeof(*utf8)); | |
349 if (utf8 == 0) | |
350 return 0; | |
351 | |
352 utf8ptr = utf8; | |
353 while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr))) | |
354 ++utf8ptr; | |
355 | |
356 *utf8ptr = 0; | |
357 | |
358 ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4)); | |
359 if (ucs4) | |
360 id3_utf8_decode(utf8, ucs4); | |
361 | |
362 free(utf8); | |
363 | |
364 return ucs4; | |
365 } |