cannam@85
|
1 /*
|
cannam@85
|
2 * libid3tag - ID3 tag manipulation library
|
cannam@85
|
3 * Copyright (C) 2000-2004 Underbit Technologies, Inc.
|
cannam@85
|
4 *
|
cannam@85
|
5 * This program is free software; you can redistribute it and/or modify
|
cannam@85
|
6 * it under the terms of the GNU General Public License as published by
|
cannam@85
|
7 * the Free Software Foundation; either version 2 of the License, or
|
cannam@85
|
8 * (at your option) any later version.
|
cannam@85
|
9 *
|
cannam@85
|
10 * This program is distributed in the hope that it will be useful,
|
cannam@85
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
cannam@85
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
cannam@85
|
13 * GNU General Public License for more details.
|
cannam@85
|
14 *
|
cannam@85
|
15 * You should have received a copy of the GNU General Public License
|
cannam@85
|
16 * along with this program; if not, write to the Free Software
|
cannam@85
|
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
cannam@85
|
18 *
|
cannam@85
|
19 * $Id: utf16.c,v 1.9 2004/01/23 09:41:32 rob Exp $
|
cannam@85
|
20 */
|
cannam@85
|
21
|
cannam@85
|
22 # ifdef HAVE_CONFIG_H
|
cannam@85
|
23 # include "config.h"
|
cannam@85
|
24 # endif
|
cannam@85
|
25
|
cannam@85
|
26 # include "global.h"
|
cannam@85
|
27
|
cannam@85
|
28 # include <stdlib.h>
|
cannam@85
|
29
|
cannam@85
|
30 # include "id3tag.h"
|
cannam@85
|
31 # include "utf16.h"
|
cannam@85
|
32 # include "ucs4.h"
|
cannam@85
|
33
|
cannam@85
|
34 /*
|
cannam@85
|
35 * NAME: utf16->length()
|
cannam@85
|
36 * DESCRIPTION: return the number of ucs4 chars represented by a utf16 string
|
cannam@85
|
37 */
|
cannam@85
|
38 id3_length_t id3_utf16_length(id3_utf16_t const *utf16)
|
cannam@85
|
39 {
|
cannam@85
|
40 id3_length_t length = 0;
|
cannam@85
|
41
|
cannam@85
|
42 while (*utf16) {
|
cannam@85
|
43 if (utf16[0] < 0xd800 || utf16[0] > 0xdfff)
|
cannam@85
|
44 ++length;
|
cannam@85
|
45 else if (utf16[0] >= 0xd800 && utf16[0] <= 0xdbff &&
|
cannam@85
|
46 utf16[1] >= 0xdc00 && utf16[1] <= 0xdfff) {
|
cannam@85
|
47 ++length;
|
cannam@85
|
48 ++utf16;
|
cannam@85
|
49 }
|
cannam@85
|
50
|
cannam@85
|
51 ++utf16;
|
cannam@85
|
52 }
|
cannam@85
|
53
|
cannam@85
|
54 return length;
|
cannam@85
|
55 }
|
cannam@85
|
56
|
cannam@85
|
57 /*
|
cannam@85
|
58 * NAME: utf16->size()
|
cannam@85
|
59 * DESCRIPTION: return the encoding size of a utf16 string
|
cannam@85
|
60 */
|
cannam@85
|
61 id3_length_t id3_utf16_size(id3_utf16_t const *utf16)
|
cannam@85
|
62 {
|
cannam@85
|
63 id3_utf16_t const *ptr = utf16;
|
cannam@85
|
64
|
cannam@85
|
65 while (*ptr)
|
cannam@85
|
66 ++ptr;
|
cannam@85
|
67
|
cannam@85
|
68 return ptr - utf16 + 1;
|
cannam@85
|
69 }
|
cannam@85
|
70
|
cannam@85
|
71 /*
|
cannam@85
|
72 * NAME: utf16->ucs4duplicate()
|
cannam@85
|
73 * DESCRIPTION: duplicate and decode a utf16 string into ucs4
|
cannam@85
|
74 */
|
cannam@85
|
75 id3_ucs4_t *id3_utf16_ucs4duplicate(id3_utf16_t const *utf16)
|
cannam@85
|
76 {
|
cannam@85
|
77 id3_ucs4_t *ucs4;
|
cannam@85
|
78
|
cannam@85
|
79 ucs4 = malloc((id3_utf16_length(utf16) + 1) * sizeof(*ucs4));
|
cannam@85
|
80 if (ucs4)
|
cannam@85
|
81 id3_utf16_decode(utf16, ucs4);
|
cannam@85
|
82
|
cannam@85
|
83 return release(ucs4);
|
cannam@85
|
84 }
|
cannam@85
|
85
|
cannam@85
|
86 /*
|
cannam@85
|
87 * NAME: utf16->decodechar()
|
cannam@85
|
88 * DESCRIPTION: decode a series of utf16 chars into a single ucs4 char
|
cannam@85
|
89 */
|
cannam@85
|
90 id3_length_t id3_utf16_decodechar(id3_utf16_t const *utf16, id3_ucs4_t *ucs4)
|
cannam@85
|
91 {
|
cannam@85
|
92 id3_utf16_t const *start = utf16;
|
cannam@85
|
93
|
cannam@85
|
94 while (1) {
|
cannam@85
|
95 if (utf16[0] < 0xd800 || utf16[0] > 0xdfff) {
|
cannam@85
|
96 *ucs4 = utf16[0];
|
cannam@85
|
97 return utf16 - start + 1;
|
cannam@85
|
98 }
|
cannam@85
|
99 else if (utf16[0] >= 0xd800 && utf16[0] <= 0xdbff &&
|
cannam@85
|
100 utf16[1] >= 0xdc00 && utf16[1] <= 0xdfff) {
|
cannam@85
|
101 *ucs4 = (((utf16[0] & 0x03ffL) << 10) |
|
cannam@85
|
102 ((utf16[1] & 0x03ffL) << 0)) + 0x00010000L;
|
cannam@85
|
103 return utf16 - start + 2;
|
cannam@85
|
104 }
|
cannam@85
|
105
|
cannam@85
|
106 ++utf16;
|
cannam@85
|
107 }
|
cannam@85
|
108 }
|
cannam@85
|
109
|
cannam@85
|
110 /*
|
cannam@85
|
111 * NAME: utf16->encodechar()
|
cannam@85
|
112 * DESCRIPTION: encode a single ucs4 char into a series of up to 2 utf16 chars
|
cannam@85
|
113 */
|
cannam@85
|
114 id3_length_t id3_utf16_encodechar(id3_utf16_t *utf16, id3_ucs4_t ucs4)
|
cannam@85
|
115 {
|
cannam@85
|
116 if (ucs4 < 0x00010000L) {
|
cannam@85
|
117 utf16[0] = ucs4;
|
cannam@85
|
118
|
cannam@85
|
119 return 1;
|
cannam@85
|
120 }
|
cannam@85
|
121 else if (ucs4 < 0x00110000L) {
|
cannam@85
|
122 ucs4 -= 0x00010000L;
|
cannam@85
|
123
|
cannam@85
|
124 utf16[0] = ((ucs4 >> 10) & 0x3ff) | 0xd800;
|
cannam@85
|
125 utf16[1] = ((ucs4 >> 0) & 0x3ff) | 0xdc00;
|
cannam@85
|
126
|
cannam@85
|
127 return 2;
|
cannam@85
|
128 }
|
cannam@85
|
129
|
cannam@85
|
130 /* default */
|
cannam@85
|
131
|
cannam@85
|
132 return id3_utf16_encodechar(utf16, ID3_UCS4_REPLACEMENTCHAR);
|
cannam@85
|
133 }
|
cannam@85
|
134
|
cannam@85
|
135 /*
|
cannam@85
|
136 * NAME: utf16->decode()
|
cannam@85
|
137 * DESCRIPTION: decode a complete utf16 string into a ucs4 string
|
cannam@85
|
138 */
|
cannam@85
|
139 void id3_utf16_decode(id3_utf16_t const *utf16, id3_ucs4_t *ucs4)
|
cannam@85
|
140 {
|
cannam@85
|
141 do
|
cannam@85
|
142 utf16 += id3_utf16_decodechar(utf16, ucs4);
|
cannam@85
|
143 while (*ucs4++);
|
cannam@85
|
144 }
|
cannam@85
|
145
|
cannam@85
|
146 /*
|
cannam@85
|
147 * NAME: utf16->encode()
|
cannam@85
|
148 * DESCRIPTION: encode a complete ucs4 string into a utf16 string
|
cannam@85
|
149 */
|
cannam@85
|
150 void id3_utf16_encode(id3_utf16_t *utf16, id3_ucs4_t const *ucs4)
|
cannam@85
|
151 {
|
cannam@85
|
152 do
|
cannam@85
|
153 utf16 += id3_utf16_encodechar(utf16, *ucs4);
|
cannam@85
|
154 while (*ucs4++);
|
cannam@85
|
155 }
|
cannam@85
|
156
|
cannam@85
|
157 /*
|
cannam@85
|
158 * NAME: utf16->put()
|
cannam@85
|
159 * DESCRIPTION: serialize a single utf16 character
|
cannam@85
|
160 */
|
cannam@85
|
161 id3_length_t id3_utf16_put(id3_byte_t **ptr, id3_utf16_t utf16,
|
cannam@85
|
162 enum id3_utf16_byteorder byteorder)
|
cannam@85
|
163 {
|
cannam@85
|
164 if (ptr) {
|
cannam@85
|
165 switch (byteorder) {
|
cannam@85
|
166 default:
|
cannam@85
|
167 case ID3_UTF16_BYTEORDER_BE:
|
cannam@85
|
168 (*ptr)[0] = (utf16 >> 8) & 0xff;
|
cannam@85
|
169 (*ptr)[1] = (utf16 >> 0) & 0xff;
|
cannam@85
|
170 break;
|
cannam@85
|
171
|
cannam@85
|
172 case ID3_UTF16_BYTEORDER_LE:
|
cannam@85
|
173 (*ptr)[0] = (utf16 >> 0) & 0xff;
|
cannam@85
|
174 (*ptr)[1] = (utf16 >> 8) & 0xff;
|
cannam@85
|
175 break;
|
cannam@85
|
176 }
|
cannam@85
|
177
|
cannam@85
|
178 *ptr += 2;
|
cannam@85
|
179 }
|
cannam@85
|
180
|
cannam@85
|
181 return 2;
|
cannam@85
|
182 }
|
cannam@85
|
183
|
cannam@85
|
184 /*
|
cannam@85
|
185 * NAME: utf16->get()
|
cannam@85
|
186 * DESCRIPTION: deserialize a single utf16 character
|
cannam@85
|
187 */
|
cannam@85
|
188 id3_utf16_t id3_utf16_get(id3_byte_t const **ptr,
|
cannam@85
|
189 enum id3_utf16_byteorder byteorder)
|
cannam@85
|
190 {
|
cannam@85
|
191 id3_utf16_t utf16;
|
cannam@85
|
192
|
cannam@85
|
193 switch (byteorder) {
|
cannam@85
|
194 default:
|
cannam@85
|
195 case ID3_UTF16_BYTEORDER_BE:
|
cannam@85
|
196 utf16 =
|
cannam@85
|
197 ((*ptr)[0] << 8) |
|
cannam@85
|
198 ((*ptr)[1] << 0);
|
cannam@85
|
199 break;
|
cannam@85
|
200
|
cannam@85
|
201 case ID3_UTF16_BYTEORDER_LE:
|
cannam@85
|
202 utf16 =
|
cannam@85
|
203 ((*ptr)[0] << 0) |
|
cannam@85
|
204 ((*ptr)[1] << 8);
|
cannam@85
|
205 break;
|
cannam@85
|
206 }
|
cannam@85
|
207
|
cannam@85
|
208 *ptr += 2;
|
cannam@85
|
209
|
cannam@85
|
210 return utf16;
|
cannam@85
|
211 }
|
cannam@85
|
212
|
cannam@85
|
213 /*
|
cannam@85
|
214 * NAME: utf16->serialize()
|
cannam@85
|
215 * DESCRIPTION: serialize a ucs4 string using utf16 encoding
|
cannam@85
|
216 */
|
cannam@85
|
217 id3_length_t id3_utf16_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
|
cannam@85
|
218 enum id3_utf16_byteorder byteorder,
|
cannam@85
|
219 int terminate)
|
cannam@85
|
220 {
|
cannam@85
|
221 id3_length_t size = 0;
|
cannam@85
|
222 id3_utf16_t utf16[2], *out;
|
cannam@85
|
223
|
cannam@85
|
224 if (byteorder == ID3_UTF16_BYTEORDER_ANY)
|
cannam@85
|
225 size += id3_utf16_put(ptr, 0xfeff, byteorder);
|
cannam@85
|
226
|
cannam@85
|
227 while (*ucs4) {
|
cannam@85
|
228 switch (id3_utf16_encodechar(out = utf16, *ucs4++)) {
|
cannam@85
|
229 case 2: size += id3_utf16_put(ptr, *out++, byteorder);
|
cannam@85
|
230 case 1: size += id3_utf16_put(ptr, *out++, byteorder);
|
cannam@85
|
231 case 0: break;
|
cannam@85
|
232 }
|
cannam@85
|
233 }
|
cannam@85
|
234
|
cannam@85
|
235 if (terminate)
|
cannam@85
|
236 size += id3_utf16_put(ptr, 0, byteorder);
|
cannam@85
|
237
|
cannam@85
|
238 return size;
|
cannam@85
|
239 }
|
cannam@85
|
240
|
cannam@85
|
241 /*
|
cannam@85
|
242 * NAME: utf16->deserialize()
|
cannam@85
|
243 * DESCRIPTION: deserialize a ucs4 string using utf16 encoding
|
cannam@85
|
244 */
|
cannam@85
|
245 id3_ucs4_t *id3_utf16_deserialize(id3_byte_t const **ptr, id3_length_t length,
|
cannam@85
|
246 enum id3_utf16_byteorder byteorder)
|
cannam@85
|
247 {
|
cannam@85
|
248 id3_byte_t const *end;
|
cannam@85
|
249 id3_utf16_t *utf16ptr, *utf16;
|
cannam@85
|
250 id3_ucs4_t *ucs4;
|
cannam@85
|
251
|
cannam@85
|
252 end = *ptr + (length & ~1);
|
cannam@85
|
253
|
cannam@85
|
254 utf16 = malloc((length / 2 + 1) * sizeof(*utf16));
|
cannam@85
|
255 if (utf16 == 0)
|
cannam@85
|
256 return 0;
|
cannam@85
|
257
|
cannam@85
|
258 if (byteorder == ID3_UTF16_BYTEORDER_ANY && end - *ptr > 0) {
|
cannam@85
|
259 switch (((*ptr)[0] << 8) |
|
cannam@85
|
260 ((*ptr)[1] << 0)) {
|
cannam@85
|
261 case 0xfeff:
|
cannam@85
|
262 byteorder = ID3_UTF16_BYTEORDER_BE;
|
cannam@85
|
263 *ptr += 2;
|
cannam@85
|
264 break;
|
cannam@85
|
265
|
cannam@85
|
266 case 0xfffe:
|
cannam@85
|
267 byteorder = ID3_UTF16_BYTEORDER_LE;
|
cannam@85
|
268 *ptr += 2;
|
cannam@85
|
269 break;
|
cannam@85
|
270 }
|
cannam@85
|
271 }
|
cannam@85
|
272
|
cannam@85
|
273 utf16ptr = utf16;
|
cannam@85
|
274 while (end - *ptr > 0 && (*utf16ptr = id3_utf16_get(ptr, byteorder)))
|
cannam@85
|
275 ++utf16ptr;
|
cannam@85
|
276
|
cannam@85
|
277 *utf16ptr = 0;
|
cannam@85
|
278
|
cannam@85
|
279 ucs4 = malloc((id3_utf16_length(utf16) + 1) * sizeof(*ucs4));
|
cannam@85
|
280 if (ucs4)
|
cannam@85
|
281 id3_utf16_decode(utf16, ucs4);
|
cannam@85
|
282
|
cannam@85
|
283 free(utf16);
|
cannam@85
|
284
|
cannam@85
|
285 return ucs4;
|
cannam@85
|
286 }
|