annotate src/Support/ConvertUTF.c @ 16:2a5354042241

-Updated the Slaney IIR gammatone to use a cascade of four second-order filters as per the implementation in Slaney's auditory toolbox. This is more numerically stable at high sample rates and low centre frequencies.
author tomwalters
date Sat, 20 Feb 2010 17:56:40 +0000
parents 582cbe817f2c
children
rev   line source
tomwalters@0 1 /*
tomwalters@0 2 * Copyright 2001-2004 Unicode, Inc.
tomwalters@0 3 *
tomwalters@0 4 * Disclaimer
tomwalters@0 5 *
tomwalters@0 6 * This source code is provided as is by Unicode, Inc. No claims are
tomwalters@0 7 * made as to fitness for any particular purpose. No warranties of any
tomwalters@0 8 * kind are expressed or implied. The recipient agrees to determine
tomwalters@0 9 * applicability of information provided. If this file has been
tomwalters@0 10 * purchased on magnetic or optical media from Unicode, Inc., the
tomwalters@0 11 * sole remedy for any claim will be exchange of defective media
tomwalters@0 12 * within 90 days of receipt.
tomwalters@0 13 *
tomwalters@0 14 * Limitations on Rights to Redistribute This Code
tomwalters@0 15 *
tomwalters@0 16 * Unicode, Inc. hereby grants the right to freely use the information
tomwalters@0 17 * supplied in this file in the creation of products supporting the
tomwalters@0 18 * Unicode Standard, and to make copies of this file in any form
tomwalters@0 19 * for internal or external distribution as long as this notice
tomwalters@0 20 * remains attached.
tomwalters@0 21 */
tomwalters@0 22
tomwalters@0 23 /* ---------------------------------------------------------------------
tomwalters@0 24
tomwalters@0 25 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
tomwalters@0 26 Author: Mark E. Davis, 1994.
tomwalters@0 27 Rev History: Rick McGowan, fixes & updates May 2001.
tomwalters@0 28 Sept 2001: fixed const & error conditions per
tomwalters@0 29 mods suggested by S. Parent & A. Lillich.
tomwalters@0 30 June 2002: Tim Dodd added detection and handling of incomplete
tomwalters@0 31 source sequences, enhanced error detection, added casts
tomwalters@0 32 to eliminate compiler warnings.
tomwalters@0 33 July 2003: slight mods to back out aggressive FFFE detection.
tomwalters@0 34 Jan 2004: updated switches in from-UTF8 conversions.
tomwalters@0 35 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
tomwalters@0 36
tomwalters@0 37 See the header file "ConvertUTF.h" for complete documentation.
tomwalters@0 38
tomwalters@0 39 ------------------------------------------------------------------------ */
tomwalters@0 40
tomwalters@0 41
tomwalters@0 42 #include "ConvertUTF.h"
tomwalters@0 43 #ifdef CVTUTF_DEBUG
tomwalters@0 44 #include <stdio.h>
tomwalters@0 45 #endif
tomwalters@0 46
tomwalters@0 47 static const int halfShift = 10; /* used for shifting by 10 bits */
tomwalters@0 48
tomwalters@0 49 static const UTF32 halfBase = 0x0010000UL;
tomwalters@0 50 static const UTF32 halfMask = 0x3FFUL;
tomwalters@0 51
tomwalters@0 52 #define UNI_SUR_HIGH_START (UTF32)0xD800
tomwalters@0 53 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
tomwalters@0 54 #define UNI_SUR_LOW_START (UTF32)0xDC00
tomwalters@0 55 #define UNI_SUR_LOW_END (UTF32)0xDFFF
tomwalters@0 56 #define false 0
tomwalters@0 57 #define true 1
tomwalters@0 58
tomwalters@0 59 /* --------------------------------------------------------------------- */
tomwalters@0 60
tomwalters@0 61 ConversionResult ConvertUTF32toUTF16 (
tomwalters@0 62 const UTF32** sourceStart, const UTF32* sourceEnd,
tomwalters@0 63 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
tomwalters@0 64 ConversionResult result = conversionOK;
tomwalters@0 65 const UTF32* source = *sourceStart;
tomwalters@0 66 UTF16* target = *targetStart;
tomwalters@0 67 while (source < sourceEnd) {
tomwalters@0 68 UTF32 ch;
tomwalters@0 69 if (target >= targetEnd) {
tomwalters@0 70 result = targetExhausted; break;
tomwalters@0 71 }
tomwalters@0 72 ch = *source++;
tomwalters@0 73 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
tomwalters@0 74 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
tomwalters@0 75 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
tomwalters@0 76 if (flags == strictConversion) {
tomwalters@0 77 --source; /* return to the illegal value itself */
tomwalters@0 78 result = sourceIllegal;
tomwalters@0 79 break;
tomwalters@0 80 } else {
tomwalters@0 81 *target++ = UNI_REPLACEMENT_CHAR;
tomwalters@0 82 }
tomwalters@0 83 } else {
tomwalters@0 84 *target++ = (UTF16)ch; /* normal case */
tomwalters@0 85 }
tomwalters@0 86 } else if (ch > UNI_MAX_LEGAL_UTF32) {
tomwalters@0 87 if (flags == strictConversion) {
tomwalters@0 88 result = sourceIllegal;
tomwalters@0 89 } else {
tomwalters@0 90 *target++ = UNI_REPLACEMENT_CHAR;
tomwalters@0 91 }
tomwalters@0 92 } else {
tomwalters@0 93 /* target is a character in range 0xFFFF - 0x10FFFF. */
tomwalters@0 94 if (target + 1 >= targetEnd) {
tomwalters@0 95 --source; /* Back up source pointer! */
tomwalters@0 96 result = targetExhausted; break;
tomwalters@0 97 }
tomwalters@0 98 ch -= halfBase;
tomwalters@0 99 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
tomwalters@0 100 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
tomwalters@0 101 }
tomwalters@0 102 }
tomwalters@0 103 *sourceStart = source;
tomwalters@0 104 *targetStart = target;
tomwalters@0 105 return result;
tomwalters@0 106 }
tomwalters@0 107
tomwalters@0 108 /* --------------------------------------------------------------------- */
tomwalters@0 109
tomwalters@0 110 ConversionResult ConvertUTF16toUTF32 (
tomwalters@0 111 const UTF16** sourceStart, const UTF16* sourceEnd,
tomwalters@0 112 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
tomwalters@0 113 ConversionResult result = conversionOK;
tomwalters@0 114 const UTF16* source = *sourceStart;
tomwalters@0 115 UTF32* target = *targetStart;
tomwalters@0 116 UTF32 ch, ch2;
tomwalters@0 117 while (source < sourceEnd) {
tomwalters@0 118 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
tomwalters@0 119 ch = *source++;
tomwalters@0 120 /* If we have a surrogate pair, convert to UTF32 first. */
tomwalters@0 121 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
tomwalters@0 122 /* If the 16 bits following the high surrogate are in the source buffer... */
tomwalters@0 123 if (source < sourceEnd) {
tomwalters@0 124 ch2 = *source;
tomwalters@0 125 /* If it's a low surrogate, convert to UTF32. */
tomwalters@0 126 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
tomwalters@0 127 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
tomwalters@0 128 + (ch2 - UNI_SUR_LOW_START) + halfBase;
tomwalters@0 129 ++source;
tomwalters@0 130 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
tomwalters@0 131 --source; /* return to the illegal value itself */
tomwalters@0 132 result = sourceIllegal;
tomwalters@0 133 break;
tomwalters@0 134 }
tomwalters@0 135 } else { /* We don't have the 16 bits following the high surrogate. */
tomwalters@0 136 --source; /* return to the high surrogate */
tomwalters@0 137 result = sourceExhausted;
tomwalters@0 138 break;
tomwalters@0 139 }
tomwalters@0 140 } else if (flags == strictConversion) {
tomwalters@0 141 /* UTF-16 surrogate values are illegal in UTF-32 */
tomwalters@0 142 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
tomwalters@0 143 --source; /* return to the illegal value itself */
tomwalters@0 144 result = sourceIllegal;
tomwalters@0 145 break;
tomwalters@0 146 }
tomwalters@0 147 }
tomwalters@0 148 if (target >= targetEnd) {
tomwalters@0 149 source = oldSource; /* Back up source pointer! */
tomwalters@0 150 result = targetExhausted; break;
tomwalters@0 151 }
tomwalters@0 152 *target++ = ch;
tomwalters@0 153 }
tomwalters@0 154 *sourceStart = source;
tomwalters@0 155 *targetStart = target;
tomwalters@0 156 #ifdef CVTUTF_DEBUG
tomwalters@0 157 if (result == sourceIllegal) {
tomwalters@0 158 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
tomwalters@0 159 fflush(stderr);
tomwalters@0 160 }
tomwalters@0 161 #endif
tomwalters@0 162 return result;
tomwalters@0 163 }
tomwalters@0 164
tomwalters@0 165 /* --------------------------------------------------------------------- */
tomwalters@0 166
/*
 * Number of trailing bytes expected after a given first byte of a UTF-8
 * sequence; index the table with that first byte.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries
 * are kept for anyone who wants to handle the longer forms that earlier
 * algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
tomwalters@0 184
tomwalters@0 185 /*
tomwalters@0 186 * Magic values subtracted from a buffer value during UTF8 conversion.
tomwalters@0 187 * This table contains as many values as there might be trailing bytes
tomwalters@0 188 * in a UTF-8 sequence.
tomwalters@0 189 */
tomwalters@0 190 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
tomwalters@0 191 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
tomwalters@0 192
tomwalters@0 193 /*
tomwalters@0 194 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
tomwalters@0 195 * into the first byte, depending on how many bytes follow. There are
tomwalters@0 196 * as many entries in this table as there are UTF-8 sequence types.
tomwalters@0 197 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
tomwalters@0 198 * for *legal* UTF-8 will be 4 or fewer bytes total.
tomwalters@0 199 */
tomwalters@0 200 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
tomwalters@0 201
tomwalters@0 202 /* --------------------------------------------------------------------- */
tomwalters@0 203
tomwalters@0 204 /* The interface converts a whole buffer to avoid function-call overhead.
tomwalters@0 205 * Constants have been gathered. Loops & conditionals have been removed as
tomwalters@0 206 * much as possible for efficiency, in favor of drop-through switches.
tomwalters@0 207 * (See "Note A" at the bottom of the file for equivalent code.)
tomwalters@0 208 * If your compiler supports it, the "isLegalUTF8" call can be turned
tomwalters@0 209 * into an inline function.
tomwalters@0 210 */
tomwalters@0 211
tomwalters@0 212 /* --------------------------------------------------------------------- */
tomwalters@0 213
tomwalters@0 214 ConversionResult ConvertUTF16toUTF8 (
tomwalters@0 215 const UTF16** sourceStart, const UTF16* sourceEnd,
tomwalters@0 216 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
tomwalters@0 217 ConversionResult result = conversionOK;
tomwalters@0 218 const UTF16* source = *sourceStart;
tomwalters@0 219 UTF8* target = *targetStart;
tomwalters@0 220 while (source < sourceEnd) {
tomwalters@0 221 UTF32 ch;
tomwalters@0 222 unsigned short bytesToWrite = 0;
tomwalters@0 223 const UTF32 byteMask = 0xBF;
tomwalters@0 224 const UTF32 byteMark = 0x80;
tomwalters@0 225 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
tomwalters@0 226 ch = *source++;
tomwalters@0 227 /* If we have a surrogate pair, convert to UTF32 first. */
tomwalters@0 228 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
tomwalters@0 229 /* If the 16 bits following the high surrogate are in the source buffer... */
tomwalters@0 230 if (source < sourceEnd) {
tomwalters@0 231 UTF32 ch2 = *source;
tomwalters@0 232 /* If it's a low surrogate, convert to UTF32. */
tomwalters@0 233 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
tomwalters@0 234 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
tomwalters@0 235 + (ch2 - UNI_SUR_LOW_START) + halfBase;
tomwalters@0 236 ++source;
tomwalters@0 237 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
tomwalters@0 238 --source; /* return to the illegal value itself */
tomwalters@0 239 result = sourceIllegal;
tomwalters@0 240 break;
tomwalters@0 241 }
tomwalters@0 242 } else { /* We don't have the 16 bits following the high surrogate. */
tomwalters@0 243 --source; /* return to the high surrogate */
tomwalters@0 244 result = sourceExhausted;
tomwalters@0 245 break;
tomwalters@0 246 }
tomwalters@0 247 } else if (flags == strictConversion) {
tomwalters@0 248 /* UTF-16 surrogate values are illegal in UTF-32 */
tomwalters@0 249 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
tomwalters@0 250 --source; /* return to the illegal value itself */
tomwalters@0 251 result = sourceIllegal;
tomwalters@0 252 break;
tomwalters@0 253 }
tomwalters@0 254 }
tomwalters@0 255 /* Figure out how many bytes the result will require */
tomwalters@0 256 if (ch < (UTF32)0x80) { bytesToWrite = 1;
tomwalters@0 257 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
tomwalters@0 258 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
tomwalters@0 259 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
tomwalters@0 260 } else { bytesToWrite = 3;
tomwalters@0 261 ch = UNI_REPLACEMENT_CHAR;
tomwalters@0 262 }
tomwalters@0 263
tomwalters@0 264 target += bytesToWrite;
tomwalters@0 265 if (target > targetEnd) {
tomwalters@0 266 source = oldSource; /* Back up source pointer! */
tomwalters@0 267 target -= bytesToWrite; result = targetExhausted; break;
tomwalters@0 268 }
tomwalters@0 269 switch (bytesToWrite) { /* note: everything falls through. */
tomwalters@0 270 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
tomwalters@0 271 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
tomwalters@0 272 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
tomwalters@0 273 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
tomwalters@0 274 }
tomwalters@0 275 target += bytesToWrite;
tomwalters@0 276 }
tomwalters@0 277 *sourceStart = source;
tomwalters@0 278 *targetStart = target;
tomwalters@0 279 return result;
tomwalters@0 280 }
tomwalters@0 281
tomwalters@0 282 /* --------------------------------------------------------------------- */
tomwalters@0 283
tomwalters@0 284 /*
tomwalters@0 285 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
tomwalters@0 286 * This must be called with the length pre-determined by the first byte.
tomwalters@0 287 * If not calling this from ConvertUTF8to*, then the length can be set by:
tomwalters@0 288 * length = trailingBytesForUTF8[*source]+1;
tomwalters@0 289 * and the sequence is illegal right away if there aren't that many bytes
tomwalters@0 290 * available.
tomwalters@0 291 * If presented with a length > 4, this returns false. The Unicode
tomwalters@0 292 * definition of UTF-8 goes up to 4-byte sequences.
tomwalters@0 293 */
tomwalters@0 294
tomwalters@0 295 static Boolean isLegalUTF8(const UTF8 *source, int length) {
tomwalters@0 296 UTF8 a;
tomwalters@0 297 const UTF8 *srcptr = source+length;
tomwalters@0 298 switch (length) {
tomwalters@0 299 default: return false;
tomwalters@0 300 /* Everything else falls through when "true"... */
tomwalters@0 301 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
tomwalters@0 302 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
tomwalters@0 303 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
tomwalters@0 304
tomwalters@0 305 switch (*source) {
tomwalters@0 306 /* no fall-through in this inner switch */
tomwalters@0 307 case 0xE0: if (a < 0xA0) return false; break;
tomwalters@0 308 case 0xED: if (a > 0x9F) return false; break;
tomwalters@0 309 case 0xF0: if (a < 0x90) return false; break;
tomwalters@0 310 case 0xF4: if (a > 0x8F) return false; break;
tomwalters@0 311 default: if (a < 0x80) return false;
tomwalters@0 312 }
tomwalters@0 313
tomwalters@0 314 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
tomwalters@0 315 }
tomwalters@0 316 if (*source > 0xF4) return false;
tomwalters@0 317 return true;
tomwalters@0 318 }
tomwalters@0 319
tomwalters@0 320 /* --------------------------------------------------------------------- */
tomwalters@0 321
tomwalters@0 322 /*
tomwalters@0 323 * Exported function to return whether a UTF-8 sequence is legal or not.
tomwalters@0 324 * This is not used here; it's just exported.
tomwalters@0 325 */
tomwalters@0 326 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
tomwalters@0 327 int length = trailingBytesForUTF8[*source]+1;
tomwalters@0 328 if (source+length > sourceEnd) {
tomwalters@0 329 return false;
tomwalters@0 330 }
tomwalters@0 331 return isLegalUTF8(source, length);
tomwalters@0 332 }
tomwalters@0 333
tomwalters@0 334 /* --------------------------------------------------------------------- */
tomwalters@0 335
tomwalters@0 336 ConversionResult ConvertUTF8toUTF16 (
tomwalters@0 337 const UTF8** sourceStart, const UTF8* sourceEnd,
tomwalters@0 338 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
tomwalters@0 339 ConversionResult result = conversionOK;
tomwalters@0 340 const UTF8* source = *sourceStart;
tomwalters@0 341 UTF16* target = *targetStart;
tomwalters@0 342 while (source < sourceEnd) {
tomwalters@0 343 UTF32 ch = 0;
tomwalters@0 344 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
tomwalters@0 345 if (source + extraBytesToRead >= sourceEnd) {
tomwalters@0 346 result = sourceExhausted; break;
tomwalters@0 347 }
tomwalters@0 348 /* Do this check whether lenient or strict */
tomwalters@0 349 if (! isLegalUTF8(source, extraBytesToRead+1)) {
tomwalters@0 350 result = sourceIllegal;
tomwalters@0 351 break;
tomwalters@0 352 }
tomwalters@0 353 /*
tomwalters@0 354 * The cases all fall through. See "Note A" below.
tomwalters@0 355 */
tomwalters@0 356 switch (extraBytesToRead) {
tomwalters@0 357 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
tomwalters@0 358 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
tomwalters@0 359 case 3: ch += *source++; ch <<= 6;
tomwalters@0 360 case 2: ch += *source++; ch <<= 6;
tomwalters@0 361 case 1: ch += *source++; ch <<= 6;
tomwalters@0 362 case 0: ch += *source++;
tomwalters@0 363 }
tomwalters@0 364 ch -= offsetsFromUTF8[extraBytesToRead];
tomwalters@0 365
tomwalters@0 366 if (target >= targetEnd) {
tomwalters@0 367 source -= (extraBytesToRead+1); /* Back up source pointer! */
tomwalters@0 368 result = targetExhausted; break;
tomwalters@0 369 }
tomwalters@0 370 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
tomwalters@0 371 /* UTF-16 surrogate values are illegal in UTF-32 */
tomwalters@0 372 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
tomwalters@0 373 if (flags == strictConversion) {
tomwalters@0 374 source -= (extraBytesToRead+1); /* return to the illegal value itself */
tomwalters@0 375 result = sourceIllegal;
tomwalters@0 376 break;
tomwalters@0 377 } else {
tomwalters@0 378 *target++ = UNI_REPLACEMENT_CHAR;
tomwalters@0 379 }
tomwalters@0 380 } else {
tomwalters@0 381 *target++ = (UTF16)ch; /* normal case */
tomwalters@0 382 }
tomwalters@0 383 } else if (ch > UNI_MAX_UTF16) {
tomwalters@0 384 if (flags == strictConversion) {
tomwalters@0 385 result = sourceIllegal;
tomwalters@0 386 source -= (extraBytesToRead+1); /* return to the start */
tomwalters@0 387 break; /* Bail out; shouldn't continue */
tomwalters@0 388 } else {
tomwalters@0 389 *target++ = UNI_REPLACEMENT_CHAR;
tomwalters@0 390 }
tomwalters@0 391 } else {
tomwalters@0 392 /* target is a character in range 0xFFFF - 0x10FFFF. */
tomwalters@0 393 if (target + 1 >= targetEnd) {
tomwalters@0 394 source -= (extraBytesToRead+1); /* Back up source pointer! */
tomwalters@0 395 result = targetExhausted; break;
tomwalters@0 396 }
tomwalters@0 397 ch -= halfBase;
tomwalters@0 398 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
tomwalters@0 399 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
tomwalters@0 400 }
tomwalters@0 401 }
tomwalters@0 402 *sourceStart = source;
tomwalters@0 403 *targetStart = target;
tomwalters@0 404 return result;
tomwalters@0 405 }
tomwalters@0 406
tomwalters@0 407 /* --------------------------------------------------------------------- */
tomwalters@0 408
tomwalters@0 409 ConversionResult ConvertUTF32toUTF8 (
tomwalters@0 410 const UTF32** sourceStart, const UTF32* sourceEnd,
tomwalters@0 411 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
tomwalters@0 412 ConversionResult result = conversionOK;
tomwalters@0 413 const UTF32* source = *sourceStart;
tomwalters@0 414 UTF8* target = *targetStart;
tomwalters@0 415 while (source < sourceEnd) {
tomwalters@0 416 UTF32 ch;
tomwalters@0 417 unsigned short bytesToWrite = 0;
tomwalters@0 418 const UTF32 byteMask = 0xBF;
tomwalters@0 419 const UTF32 byteMark = 0x80;
tomwalters@0 420 ch = *source++;
tomwalters@0 421 if (flags == strictConversion ) {
tomwalters@0 422 /* UTF-16 surrogate values are illegal in UTF-32 */
tomwalters@0 423 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
tomwalters@0 424 --source; /* return to the illegal value itself */
tomwalters@0 425 result = sourceIllegal;
tomwalters@0 426 break;
tomwalters@0 427 }
tomwalters@0 428 }
tomwalters@0 429 /*
tomwalters@0 430 * Figure out how many bytes the result will require. Turn any
tomwalters@0 431 * illegally large UTF32 things (> Plane 17) into replacement chars.
tomwalters@0 432 */
tomwalters@0 433 if (ch < (UTF32)0x80) { bytesToWrite = 1;
tomwalters@0 434 } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
tomwalters@0 435 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
tomwalters@0 436 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
tomwalters@0 437 } else { bytesToWrite = 3;
tomwalters@0 438 ch = UNI_REPLACEMENT_CHAR;
tomwalters@0 439 result = sourceIllegal;
tomwalters@0 440 }
tomwalters@0 441
tomwalters@0 442 target += bytesToWrite;
tomwalters@0 443 if (target > targetEnd) {
tomwalters@0 444 --source; /* Back up source pointer! */
tomwalters@0 445 target -= bytesToWrite; result = targetExhausted; break;
tomwalters@0 446 }
tomwalters@0 447 switch (bytesToWrite) { /* note: everything falls through. */
tomwalters@0 448 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
tomwalters@0 449 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
tomwalters@0 450 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
tomwalters@0 451 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
tomwalters@0 452 }
tomwalters@0 453 target += bytesToWrite;
tomwalters@0 454 }
tomwalters@0 455 *sourceStart = source;
tomwalters@0 456 *targetStart = target;
tomwalters@0 457 return result;
tomwalters@0 458 }
tomwalters@0 459
tomwalters@0 460 /* --------------------------------------------------------------------- */
tomwalters@0 461
tomwalters@0 462 ConversionResult ConvertUTF8toUTF32 (
tomwalters@0 463 const UTF8** sourceStart, const UTF8* sourceEnd,
tomwalters@0 464 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
tomwalters@0 465 ConversionResult result = conversionOK;
tomwalters@0 466 const UTF8* source = *sourceStart;
tomwalters@0 467 UTF32* target = *targetStart;
tomwalters@0 468 while (source < sourceEnd) {
tomwalters@0 469 UTF32 ch = 0;
tomwalters@0 470 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
tomwalters@0 471 if (source + extraBytesToRead >= sourceEnd) {
tomwalters@0 472 result = sourceExhausted; break;
tomwalters@0 473 }
tomwalters@0 474 /* Do this check whether lenient or strict */
tomwalters@0 475 if (! isLegalUTF8(source, extraBytesToRead+1)) {
tomwalters@0 476 result = sourceIllegal;
tomwalters@0 477 break;
tomwalters@0 478 }
tomwalters@0 479 /*
tomwalters@0 480 * The cases all fall through. See "Note A" below.
tomwalters@0 481 */
tomwalters@0 482 switch (extraBytesToRead) {
tomwalters@0 483 case 5: ch += *source++; ch <<= 6;
tomwalters@0 484 case 4: ch += *source++; ch <<= 6;
tomwalters@0 485 case 3: ch += *source++; ch <<= 6;
tomwalters@0 486 case 2: ch += *source++; ch <<= 6;
tomwalters@0 487 case 1: ch += *source++; ch <<= 6;
tomwalters@0 488 case 0: ch += *source++;
tomwalters@0 489 }
tomwalters@0 490 ch -= offsetsFromUTF8[extraBytesToRead];
tomwalters@0 491
tomwalters@0 492 if (target >= targetEnd) {
tomwalters@0 493 source -= (extraBytesToRead+1); /* Back up the source pointer! */
tomwalters@0 494 result = targetExhausted; break;
tomwalters@0 495 }
tomwalters@0 496 if (ch <= UNI_MAX_LEGAL_UTF32) {
tomwalters@0 497 /*
tomwalters@0 498 * UTF-16 surrogate values are illegal in UTF-32, and anything
tomwalters@0 499 * over Plane 17 (> 0x10FFFF) is illegal.
tomwalters@0 500 */
tomwalters@0 501 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
tomwalters@0 502 if (flags == strictConversion) {
tomwalters@0 503 source -= (extraBytesToRead+1); /* return to the illegal value itself */
tomwalters@0 504 result = sourceIllegal;
tomwalters@0 505 break;
tomwalters@0 506 } else {
tomwalters@0 507 *target++ = UNI_REPLACEMENT_CHAR;
tomwalters@0 508 }
tomwalters@0 509 } else {
tomwalters@0 510 *target++ = ch;
tomwalters@0 511 }
tomwalters@0 512 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
tomwalters@0 513 result = sourceIllegal;
tomwalters@0 514 *target++ = UNI_REPLACEMENT_CHAR;
tomwalters@0 515 }
tomwalters@0 516 }
tomwalters@0 517 *sourceStart = source;
tomwalters@0 518 *targetStart = target;
tomwalters@0 519 return result;
tomwalters@0 520 }
tomwalters@0 521
tomwalters@0 522 /* ---------------------------------------------------------------------
tomwalters@0 523
tomwalters@0 524 Note A.
tomwalters@0 525 The fall-through switches in UTF-8 reading code save a
tomwalters@0 526 temp variable, some decrements & conditionals. The switches
tomwalters@0 527 are equivalent to the following loop:
tomwalters@0 528 {
tomwalters@0 529 int tmpBytesToRead = extraBytesToRead+1;
tomwalters@0 530 do {
tomwalters@0 531 ch += *source++;
tomwalters@0 532 --tmpBytesToRead;
tomwalters@0 533 if (tmpBytesToRead) ch <<= 6;
tomwalters@0 534 } while (tmpBytesToRead > 0);
tomwalters@0 535 }
tomwalters@0 536 In UTF-8 writing code, the switches on "bytesToWrite" are
tomwalters@0 537 similarly unrolled loops.
tomwalters@0 538
tomwalters@0 539 --------------------------------------------------------------------- */