annotate src/flac-1.2.1/src/share/utf8/utf8.c @ 20:ab7c38c4c577

Ranlib
author Chris Cannam
date Mon, 25 Mar 2013 16:28:19 +0000
parents 05aa0afa9217
children
rev   line source
Chris@1 1 /*
Chris@1 2 * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com>
Chris@1 3 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
Chris@1 4 *
Chris@1 5 * Buffer overflow checking added: Josh Coalson, 9/9/2007
Chris@1 6 *
Chris@1 7 * This program is free software; you can redistribute it and/or modify
Chris@1 8 * it under the terms of the GNU General Public License as published by
Chris@1 9 * the Free Software Foundation; either version 2 of the License, or
Chris@1 10 * (at your option) any later version.
Chris@1 11 *
Chris@1 12 * This program is distributed in the hope that it will be useful,
Chris@1 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@1 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@1 15 * GNU General Public License for more details.
Chris@1 16 *
Chris@1 17 * You should have received a copy of the GNU General Public License
Chris@1 18 * along with this program; if not, write to the Free Software
Chris@1 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Chris@1 20 */
Chris@1 21
Chris@1 22 /*
Chris@1 23 * Convert a string between UTF-8 and the locale's charset.
Chris@1 24 */
Chris@1 25
Chris@1 26 #if HAVE_CONFIG_H
Chris@1 27 # include <config.h>
Chris@1 28 #endif
Chris@1 29
#include <limits.h>
#include <stdlib.h>
#include <string.h>
Chris@1 32
Chris@1 33 #include "share/alloc.h"
Chris@1 34 #include "utf8.h"
Chris@1 35 #include "charset.h"
Chris@1 36
Chris@1 37
Chris@1 38 #ifdef _WIN32
Chris@1 39
Chris@1 40 /* Thanks to Peter Harris <peter.harris@hummingbird.com> for this win32
Chris@1 41 * code.
Chris@1 42 */
Chris@1 43
Chris@1 44 #include <stdio.h>
Chris@1 45 #include <windows.h>
Chris@1 46
/*
 * Convert a NUL-terminated UTF-16 string into a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * Each input element is truncated to 16 bits before use.  A high surrogate
 * immediately followed by a low surrogate is combined into one code point
 * and written as a single 4-byte sequence; an unpaired surrogate is written
 * as a 3-byte sequence, exactly like any other 16-bit value >= 0x0800.
 *
 * Returns the new string (release it with free()), or NULL if the output
 * size would overflow size_t or the allocation fails.
 */
static unsigned char *make_utf8_string(const wchar_t *unicode)
{
	size_t size = 0, n;
	size_t in, o = 0;
	unsigned char *out;
	unsigned long c, lo;

	/* first calculate the size of the target string */
	for(in = 0; (c = (unsigned short)unicode[in]) != 0; in++) {
		if(c < 0x0080) {
			n = 1;
		} else if(c < 0x0800) {
			n = 2;
		} else {
			/* unicode[in] is non-zero, so unicode[in+1] is at worst the
			 * terminator and is always safe to read */
			lo = (unsigned short)unicode[in + 1];
			if(c >= 0xd800 && c <= 0xdbff && lo >= 0xdc00 && lo <= 0xdfff) {
				n = 4;
				in++; /* the low surrogate is part of this code point */
			} else {
				n = 3;
			}
		}
		if(size+n < size) /* overflow check */
			return NULL;
		size += n;
	}

	out = safe_malloc_add_2op_(size, /*+*/1);
	if (out == NULL)
		return NULL;

	/* second pass: same classification as above, now emitting bytes */
	for(in = 0; (c = (unsigned short)unicode[in]) != 0; in++) {
		if(c < 0x0080) {
			out[o++] = (unsigned char)c;
		} else if(c < 0x0800) {
			out[o++] = (unsigned char)(0xc0 | (c >> 6));
			out[o++] = (unsigned char)(0x80 | (c & 0x3f));
		} else {
			lo = (unsigned short)unicode[in + 1];
			if(c >= 0xd800 && c <= 0xdbff && lo >= 0xdc00 && lo <= 0xdfff) {
				/* combine the pair into a code point in 0x10000..0x10ffff */
				c = 0x10000 + ((c - 0xd800) << 10) + (lo - 0xdc00);
				in++;
				out[o++] = (unsigned char)(0xf0 | (c >> 18));
				out[o++] = (unsigned char)(0x80 | ((c >> 12) & 0x3f));
				out[o++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
				out[o++] = (unsigned char)(0x80 | (c & 0x3f));
			} else {
				out[o++] = (unsigned char)(0xe0 | (c >> 12));
				out[o++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
				out[o++] = (unsigned char)(0x80 | (c & 0x3f));
			}
		}
	}
	out[o] = 0x00;

	return out;
}
Chris@1 94
/*
 * Decode the UTF-8 sequence starting at s (s[0] must not be NUL).
 *
 * Stores the decoded code point in *cp and returns the number of bytes
 * consumed (at least 1).  A byte that cannot start a sequence, or a sequence
 * whose continuation bytes are missing, yields U+FFFD and consumes only the
 * bytes that were valid so far.  Because a NUL byte is never a valid
 * continuation byte, this never reads beyond the string terminator.
 */
static size_t decode_utf8_sequence_(const unsigned char *s, unsigned long *cp)
{
	unsigned long c = s[0];
	size_t need, i;

	if(c < 0x80) {
		*cp = c;
		return 1;
	} else if((c & 0xe0) == 0xc0) {
		need = 1;
		c &= 0x1f;
	} else if((c & 0xf0) == 0xe0) {
		need = 2;
		c &= 0x0f;
	} else if((c & 0xf8) == 0xf0) {
		need = 3;
		c &= 0x07;
	} else {
		/* stray continuation byte or invalid lead byte */
		*cp = 0xfffd;
		return 1;
	}

	for(i = 1; i <= need; i++) {
		if((s[i] & 0xc0) != 0x80) {
			/* truncated sequence: do not consume s[i] (it may be the NUL) */
			*cp = 0xfffd;
			return i;
		}
		c = (c << 6) | (unsigned long)(s[i] & 0x3f);
	}

	/* a 4-byte sequence must encode something a surrogate pair can hold */
	if(need == 3 && (c < 0x10000 || c > 0x10ffff))
		c = 0xfffd;

	*cp = c;
	return need + 1;
}

/*
 * Convert a NUL-terminated UTF-8 string into a newly allocated,
 * NUL-terminated UTF-16 string.
 *
 * One-, two- and three-byte sequences produce one output element; four-byte
 * sequences produce a surrogate pair (this code is only compiled under
 * _WIN32 and is written for a 16-bit wchar_t).  Malformed input is replaced
 * by U+FFFD rather than being decoded from whatever bytes happen to follow.
 *
 * Returns the new string (release it with free()), or NULL if the output
 * size would overflow or the allocation fails.
 */
static wchar_t *make_unicode_string(const unsigned char *utf8)
{
	size_t size = 0, units;
	size_t in, o = 0;
	wchar_t *out;
	unsigned long cp;

	/* first calculate the size of the target string */
	for(in = 0; utf8[in] != 0; ) {
		in += decode_utf8_sequence_(utf8 + in, &cp);
		units = (cp >= 0x10000) ? 2 : 1;
		if(size + units < size) /* overflow check */
			return NULL;
		size += units;
	}

	if(size + 1 == 0) /* overflow check */
		return NULL;
	out = safe_malloc_mul_2op_(size+1, /*times*/sizeof(wchar_t));
	if (out == NULL)
		return NULL;

	/* second pass: identical walk over the input, now storing the output */
	for(in = 0; utf8[in] != 0; ) {
		in += decode_utf8_sequence_(utf8 + in, &cp);
		if(cp >= 0x10000) {
			cp -= 0x10000;
			out[o++] = (wchar_t)(0xd800 | (cp >> 10));
			out[o++] = (wchar_t)(0xdc00 | (cp & 0x3ff));
		} else {
			out[o++] = (wchar_t)cp;
		}
	}
	out[o] = 0;

	return out;
}
Chris@1 147
Chris@1 148 int utf8_encode(const char *from, char **to)
Chris@1 149 {
Chris@1 150 wchar_t *unicode;
Chris@1 151 int wchars, err;
Chris@1 152
Chris@1 153 wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
Chris@1 154 strlen(from), NULL, 0);
Chris@1 155
Chris@1 156 if(wchars == 0)
Chris@1 157 {
Chris@1 158 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
Chris@1 159 return -1;
Chris@1 160 }
Chris@1 161
Chris@1 162 if(wchars < 0) /* underflow check */
Chris@1 163 return -1;
Chris@1 164
Chris@1 165 unicode = safe_calloc_((size_t)wchars + 1, sizeof(unsigned short));
Chris@1 166 if(unicode == NULL)
Chris@1 167 {
Chris@1 168 fprintf(stderr, "Out of memory processing string to UTF8\n");
Chris@1 169 return -1;
Chris@1 170 }
Chris@1 171
Chris@1 172 err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
Chris@1 173 strlen(from), unicode, wchars);
Chris@1 174 if(err != wchars)
Chris@1 175 {
Chris@1 176 free(unicode);
Chris@1 177 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
Chris@1 178 return -1;
Chris@1 179 }
Chris@1 180
Chris@1 181 /* On NT-based windows systems, we could use WideCharToMultiByte(), but
Chris@1 182 * MS doesn't actually have a consistent API across win32.
Chris@1 183 */
Chris@1 184 *to = make_utf8_string(unicode);
Chris@1 185
Chris@1 186 free(unicode);
Chris@1 187 return 0;
Chris@1 188 }
Chris@1 189
Chris@1 190 int utf8_decode(const char *from, char **to)
Chris@1 191 {
Chris@1 192 wchar_t *unicode;
Chris@1 193 int chars, err;
Chris@1 194
Chris@1 195 /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but
Chris@1 196 * MS doesn't actually have a consistent API across win32.
Chris@1 197 */
Chris@1 198 unicode = make_unicode_string(from);
Chris@1 199 if(unicode == NULL)
Chris@1 200 {
Chris@1 201 fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n");
Chris@1 202 return -1;
Chris@1 203 }
Chris@1 204
Chris@1 205 chars = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
Chris@1 206 -1, NULL, 0, NULL, NULL);
Chris@1 207
Chris@1 208 if(chars < 0) /* underflow check */
Chris@1 209 return -1;
Chris@1 210
Chris@1 211 if(chars == 0)
Chris@1 212 {
Chris@1 213 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
Chris@1 214 free(unicode);
Chris@1 215 return -1;
Chris@1 216 }
Chris@1 217
Chris@1 218 *to = safe_calloc_((size_t)chars + 1, sizeof(unsigned char));
Chris@1 219 if(*to == NULL)
Chris@1 220 {
Chris@1 221 fprintf(stderr, "Out of memory processing string to local charset\n");
Chris@1 222 free(unicode);
Chris@1 223 return -1;
Chris@1 224 }
Chris@1 225
Chris@1 226 err = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
Chris@1 227 -1, *to, chars, NULL, NULL);
Chris@1 228 if(err != chars)
Chris@1 229 {
Chris@1 230 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
Chris@1 231 free(unicode);
Chris@1 232 free(*to);
Chris@1 233 *to = NULL;
Chris@1 234 return -1;
Chris@1 235 }
Chris@1 236
Chris@1 237 free(unicode);
Chris@1 238 return 0;
Chris@1 239 }
Chris@1 240
Chris@1 241 #else /* End win32. Rest is for real operating systems */
Chris@1 242
Chris@1 243
Chris@1 244 #ifdef HAVE_LANGINFO_CODESET
Chris@1 245 #include <langinfo.h>
Chris@1 246 #endif
Chris@1 247
Chris@1 248 #include "iconvert.h"
Chris@1 249
/*
 * Name of the character set to treat as the local one.
 *
 * Candidates are tried in order and the first non-NULL one wins:
 * nl_langinfo(CODESET) when HAVE_LANGINFO_CODESET is defined, then the
 * CHARSET environment variable, then the literal "US-ASCII".
 */
static const char *current_charset(void)
{
	const char *name = 0;

#ifdef HAVE_LANGINFO_CODESET
	name = nl_langinfo(CODESET);
#endif

	if (name)
		return name;

	name = getenv("CHARSET");
	if (name)
		return name;

	return "US-ASCII";
}
Chris@1 262
/*
 * Convert `fromlen` bytes at `from` from charset `fromcode` to charset
 * `tocode` using whichever backend was compiled in: iconvert() when
 * HAVE_ICONV is defined, charset_convert() otherwise.
 *
 * All arguments are forwarded unchanged and the backend's return value is
 * handed back as-is.  If no backend ran, the result is -1.
 */
static int convert_buffer(const char *fromcode, const char *tocode,
			  const char *from, size_t fromlen,
			  char **to, size_t *tolen)
{
	int status = -1;

#ifdef HAVE_ICONV
	status = iconvert(fromcode, tocode, from, fromlen, to, tolen);
#endif

#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
	if (status == -1)
		status = charset_convert(fromcode, tocode, from, fromlen, to, tolen);
#endif

	return status;
}
Chris@1 283
/*
 * Convert the NUL-terminated string `from` from charset `fromcode` to
 * charset `tocode`, storing the result in *to.
 *
 * convert_buffer() is tried first:
 *   - a result of -2 is reported to the caller as -1;
 *   - any result other than -1 is returned unchanged;
 *   - a result of -1 triggers the fallback below.
 *
 * Fallback: *to receives a freshly allocated copy of `from` in which every
 * byte with a bit set above the low seven is overwritten with `replace`,
 * and 3 is returned.  If that copy cannot be allocated, -1 is returned.
 */
static int convert_string(const char *fromcode, const char *tocode,
			  const char *from, char **to, char replace)
{
	const size_t len = strlen(from);
	char *copy, *p;
	int status;

	status = convert_buffer(fromcode, tocode, from, len, to, 0);
	if (status == -2)
		return -1;
	if (status != -1)
		return status;

	copy = safe_malloc_add_2op_(len, /*+*/1);
	if (!copy)
		return -1;

	/* len + 1 bytes: the characters plus the terminator */
	memcpy(copy, from, len + 1);

	for (p = copy; *p; p++) {
		if (*p & ~0x7f)
			*p = replace;
	}

	*to = copy;
	return 3;
}
Chris@1 308
/*
 * Convert `from` from the charset reported by current_charset() to UTF-8,
 * storing the result in *to.  '#' is passed to convert_string() as the
 * replacement character, and its return value is returned unchanged.
 */
int utf8_encode(const char *from, char **to)
{
	const char *local = current_charset();

	return convert_string(local, "UTF-8", from, to, '#');
}
Chris@1 313
/*
 * Convert `from` from UTF-8 to the charset reported by current_charset(),
 * storing the result in *to.  '?' is passed to convert_string() as the
 * replacement character, and its return value is returned unchanged.
 */
int utf8_decode(const char *from, char **to)
{
	const char *local = current_charset();

	return convert_string("UTF-8", local, from, to, '?');
}
Chris@1 318
Chris@1 319 #endif