/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* Include guard: the enums and typedefs below must not be seen twice
   in one translation unit. */
#ifndef CONVERT_UTF_H_INCLUDED
#define CONVERT_UTF_H_INCLUDED

/* ---------------------------------------------------------------------

    Conversions between UTF-32, UTF-16, and UTF-8. Header file.

    Several functions are included here, forming a complete set of
    conversions between the three formats. UTF-7 is not included
    here, but is handled in a separate source file.

    Each of these routines takes pointers to input buffers and output
    buffers. The input buffers are const.

    Each routine converts the text between *sourceStart and sourceEnd,
    putting the result into the buffer between *targetStart and
    targetEnd. Note: the end pointers are *after* the last item: e.g.
    *(sourceEnd - 1) is the last item.

    The return result indicates whether the conversion was successful,
    and if not, whether the problem was in the source or target buffers.
    (Only the first encountered problem is indicated.)

    After the conversion, *sourceStart and *targetStart are both
    updated to point to the end of last text successfully converted in
    the respective buffers.

    Input parameters:
        sourceStart - pointer to a pointer to the source buffer.
                The contents of this are modified on return so that
                it points at the next thing to be converted.
        targetStart - similarly, pointer to pointer to the target buffer.
        sourceEnd, targetEnd - respectively pointers to the ends of the
                two buffers, for overflow checking only.

    These conversion functions take a ConversionFlags argument. When this
    flag is set to strict, both irregular sequences and isolated surrogates
    will cause an error. When the flag is set to lenient, both irregular
    sequences and isolated surrogates are converted.

    Whether the flag is strict or lenient, all illegal sequences will cause
    an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
    must check for illegal sequences.

    When the flag is set to lenient, characters over 0x10FFFF are converted
    to the replacement character; otherwise (when the flag is set to strict)
    they constitute an error.

    Output parameters:
        The value "sourceIllegal" is returned from some routines if the input
        sequence is malformed. When "sourceIllegal" is returned, the source
        value will point to the illegal value that caused the problem. E.g.,
        in UTF-8 when a sequence is malformed, it points to the start of the
        malformed sequence.

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
                 Fixes & updates, Sept 2001.

------------------------------------------------------------------------ */

/* ---------------------------------------------------------------------
    The following 4 definitions are compiler-specific.
    The C standard does not guarantee that wchar_t has at least
    16 bits, so wchar_t is no less portable than unsigned short!
    All should be unsigned values to avoid sign extension during
    bit mask & shift operations.
------------------------------------------------------------------------ */

typedef unsigned int    UTF32;   /* at least 32 bits */
typedef unsigned short  UTF16;   /* at least 16 bits */
typedef unsigned char   UTF8;    /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

/* Some fundamental constants.
   Each expansion is fully parenthesized so that the cast binds as a unit
   in any expression context (e.g. `sizeof UNI_MAX_BMP`). */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP          ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16        ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32        ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32  ((UTF32)0x0010FFFF)

/* Status returned by every Convert* routine; only the first problem
   encountered is reported. */
typedef enum {
    conversionOK,       /* conversion successful */
    sourceExhausted,    /* partial character in source, but hit end */
    targetExhausted,    /* insuff. room in target for conversion */
    sourceIllegal       /* source sequence is illegal/malformed */
} ConversionResult;

/* Selects strict or lenient handling of irregular sequences and isolated
   surrogates, as described in the header comment of this file. */
typedef enum {
    strictConversion = 0,
    lenientConversion
} ConversionFlags;

/* This is for C++ and does no harm in C */
#ifdef __cplusplus
extern "C" {
#endif

/*
 * Conversion routines. All six share one contract:
 *   sourceStart / targetStart - in/out; advanced past the text that was
 *                               successfully converted.
 *   sourceEnd / targetEnd     - one past the last item of each buffer.
 *   flags                     - strictConversion or lenientConversion.
 *   returns                   - a ConversionResult describing the first
 *                               problem encountered, or conversionOK.
 */
ConversionResult ConvertUTF8toUTF16(
        const UTF8 **sourceStart, const UTF8 *sourceEnd,
        UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF16toUTF8(
        const UTF16 **sourceStart, const UTF16 *sourceEnd,
        UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF8toUTF32(
        const UTF8 **sourceStart, const UTF8 *sourceEnd,
        UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF32toUTF8(
        const UTF32 **sourceStart, const UTF32 *sourceEnd,
        UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF16toUTF32(
        const UTF16 **sourceStart, const UTF16 *sourceEnd,
        UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF32toUTF16(
        const UTF32 **sourceStart, const UTF32 *sourceEnd,
        UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags);

/* NOTE(review): the implementation is not visible from this header.
   Presumably returns nonzero when the bytes in [source, sourceEnd) form a
   legal UTF-8 sequence - confirm against the accompanying source file. */
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

#ifdef __cplusplus
}
#endif

#endif /* CONVERT_UTF_H_INCLUDED */

/* --------------------------------------------------------------------- */