| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 1 | /* | 
|  | 2 | * Copyright (C) 2017 The Android Open Source Project | 
|  | 3 | * All rights reserved. | 
|  | 4 | * | 
|  | 5 | * Redistribution and use in source and binary forms, with or without | 
|  | 6 | * modification, are permitted provided that the following conditions | 
|  | 7 | * are met: | 
|  | 8 | *  * Redistributions of source code must retain the above copyright | 
|  | 9 | *    notice, this list of conditions and the following disclaimer. | 
|  | 10 | *  * Redistributions in binary form must reproduce the above copyright | 
|  | 11 | *    notice, this list of conditions and the following disclaimer in | 
|  | 12 | *    the documentation and/or other materials provided with the | 
|  | 13 | *    distribution. | 
|  | 14 | * | 
|  | 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
|  | 16 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
|  | 17 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | 
|  | 18 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | 
|  | 19 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | 
|  | 20 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | 
|  | 21 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS | 
|  | 22 | * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | 
|  | 23 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | 
|  | 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | 
|  | 25 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
|  | 26 | * SUCH DAMAGE. | 
|  | 27 | */ | 
|  | 28 |  | 
|  | 29 | #include <iconv.h> | 
|  | 30 |  | 
|  | 31 | #include <ctype.h> | 
|  | 32 | #include <endian.h> | 
|  | 33 | #include <errno.h> | 
|  | 34 | #include <stdlib.h> | 
|  | 35 | #include <uchar.h> | 
|  | 36 |  | 
|  | 37 | #include "private/bionic_mbstate.h" | 
|  | 38 |  | 
// Sentinel handle: returned by iconv_open() on failure, and rejected with EBADF
// by iconv() and iconv_close().
#define INVALID_ICONV_T (reinterpret_cast<iconv_t>(-1))
|  | 40 |  | 
// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
//
// The set of encodings that can appear on either side of a conversion.
enum Encoding {
  US_ASCII = 0,   // One byte per character; anything above 0x7f is rejected.
  UTF_8 = 1,      // Variable-length, decoded/encoded via mbrtoc32()/c32rtomb().
  UTF_16_LE = 2,  // 16-bit units, little-endian, with surrogate pairs.
  UTF_16_BE = 3,  // 16-bit units, big-endian, with surrogate pairs.
  UTF_32_LE = 4,  // 32-bit units, little-endian.
  UTF_32_BE = 5,  // 32-bit units, big-endian.
  WCHAR_T = 6,    // Read and written exactly like UTF_32_LE.
};
|  | 53 |  | 
// What to do with a character that is invalid in the source encoding or cannot
// be represented in the destination encoding.
enum Mode {
  ERROR = 0,     // Stop and fail (the default when no "//" suffix is given).
  IGNORE = 1,    // Selected by "//IGNORE": skip the character and carry on.
  TRANSLIT = 2,  // Selected by "//TRANSLIT": substitute '?' and count the replacement.
};
|  | 59 |  | 
// Returns true if the user-supplied encoding name `lhs` matches the canonical
// name `rhs` (which must already be lowercase alphanumerics only).
//
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
//
// The rules applied to `lhs` are:
//  * non-alphanumeric characters are ignored, so "UTF-8", "UTF_8", "UTF8" and
//    "UTF 8" are all equivalent;
//  * case is ignored;
//  * a '0' that is not preceded by a digit is ignored, so "UTF-08" matches
//    "utf8" but "UTF-106LE" does not match "utf16le".
//
// As a special case a GNU "//" suffix (such as "//IGNORE") on `lhs` is treated
// as the end of the name.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // Whether the most recently matched character of `lhs` was a digit. This is
  // what decides whether a following '0' is significant.
  bool after_digit = false;

  while (*lhs && *rhs) {
    // Skip everything in lhs that the matching rules tell us to delete.
    for (; *lhs; ++lhs) {
      // <ctype.h> functions need an unsigned char value, not a possibly-negative char.
      const unsigned char ch = *lhs;
      if (!isalnum(ch)) continue;
      if (ch == '0' && !after_digit) continue;
      break;
    }

    // Case doesn't matter either.
    const unsigned char l = *lhs;
    const unsigned char r = *rhs;
    if (tolower(l) != tolower(r)) break;

    after_digit = (isdigit(l) != 0);
    ++lhs;
    ++rhs;
  }

  // The whole of rhs must have been consumed...
  if (*rhs != '\0') return false;
  // ...and lhs must be at its end, or at the start of a GNU "//" suffix.
  return *lhs == '\0' || (lhs[0] == '/' && lhs[1] == '/');
}
|  | 78 |  | 
|  | 79 | static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) { | 
|  | 80 | const char* suffix = strstr(s, "//"); | 
|  | 81 | if (suffix) { | 
|  | 82 | if (!mode) return false; | 
|  | 83 | if (strcmp(suffix, "//IGNORE") == 0) { | 
|  | 84 | *mode = IGNORE; | 
|  | 85 | } else if (strcmp(suffix, "//TRANSLIT") == 0) { | 
|  | 86 | *mode = TRANSLIT; | 
|  | 87 | } else { | 
|  | 88 | return false; | 
|  | 89 | } | 
|  | 90 | } | 
|  | 91 | if (__match_encoding(s, "utf8")) { | 
|  | 92 | *encoding = UTF_8; | 
|  | 93 | } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) { | 
|  | 94 | *encoding = US_ASCII; | 
|  | 95 | } else if (__match_encoding(s, "utf16le")) { | 
|  | 96 | *encoding = UTF_16_LE; | 
|  | 97 | } else if (__match_encoding(s, "utf16be")) { | 
|  | 98 | *encoding = UTF_16_BE; | 
|  | 99 | } else if (__match_encoding(s, "utf32le")) { | 
|  | 100 | *encoding = UTF_32_LE; | 
|  | 101 | } else if (__match_encoding(s, "utf32be")) { | 
|  | 102 | *encoding = UTF_32_BE; | 
|  | 103 | } else if (__match_encoding(s, "wchart")) { | 
|  | 104 | *encoding = WCHAR_T; | 
|  | 105 | } else { | 
|  | 106 | return false; | 
|  | 107 | } | 
|  | 108 | return true; | 
|  | 109 | } | 
|  | 110 |  | 
|  | 111 | struct __iconv_t { | 
|  | 112 | Encoding src_encoding; | 
|  | 113 | Encoding dst_encoding; | 
|  | 114 | Mode mode; | 
|  | 115 |  | 
|  | 116 | __iconv_t() : mode(ERROR) { | 
|  | 117 | } | 
|  | 118 |  | 
|  | 119 | int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) { | 
|  | 120 | // Reset state. | 
|  | 121 | wc = 0; | 
|  | 122 | memset(&ps, 0, sizeof(ps)); | 
|  | 123 | replacement_count = 0; | 
|  | 124 | ignored = false; | 
|  | 125 | src_buf = src_buf0; | 
|  | 126 | src_bytes_left = src_bytes_left0; | 
|  | 127 | dst_buf = dst_buf0; | 
|  | 128 | dst_bytes_left = dst_bytes_left0; | 
|  | 129 |  | 
|  | 130 | while (*src_bytes_left > 0) { | 
|  | 131 | if (!GetNext() || !Convert()) return -1; | 
|  | 132 | } | 
|  | 133 | return Done(); | 
|  | 134 | } | 
|  | 135 |  | 
|  | 136 | private: | 
|  | 137 | char32_t wc; | 
|  | 138 | char buf[16]; | 
|  | 139 | size_t src_bytes_used; | 
|  | 140 | size_t dst_bytes_used; | 
|  | 141 | mbstate_t ps; | 
|  | 142 |  | 
|  | 143 | size_t replacement_count; | 
|  | 144 | bool ignored; | 
|  | 145 |  | 
|  | 146 | char** src_buf; | 
|  | 147 | size_t* src_bytes_left; | 
|  | 148 | char** dst_buf; | 
|  | 149 | size_t* dst_bytes_left; | 
|  | 150 |  | 
|  | 151 | bool GetNext() { | 
|  | 152 | errno = 0; | 
|  | 153 | switch (src_encoding) { | 
|  | 154 | case US_ASCII: | 
|  | 155 | wc = **src_buf; | 
|  | 156 | src_bytes_used = 1; | 
|  | 157 | if (wc > 0x7f) errno = EILSEQ; | 
|  | 158 | break; | 
|  | 159 |  | 
|  | 160 | case UTF_8: | 
|  | 161 | src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps); | 
|  | 162 | if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { | 
|  | 163 | break;  // EILSEQ already set. | 
|  | 164 | } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { | 
|  | 165 | errno = EINVAL; | 
|  | 166 | return false; | 
|  | 167 | } | 
|  | 168 | break; | 
|  | 169 |  | 
|  | 170 | case UTF_16_BE: | 
|  | 171 | case UTF_16_LE: { | 
|  | 172 | if (*src_bytes_left < 2) { | 
|  | 173 | errno = EINVAL; | 
|  | 174 | return false; | 
|  | 175 | } | 
|  | 176 | bool swap = (src_encoding == UTF_16_BE); | 
|  | 177 | wc = In16(*src_buf, swap); | 
|  | 178 | // 0xd800-0xdbff: high surrogates | 
|  | 179 | // 0xdc00-0xdfff: low surrogates | 
|  | 180 | if (wc >= 0xd800 && wc <= 0xdfff) { | 
|  | 181 | if (wc >= 0xdc00) {  // Low surrogate before high surrogate. | 
|  | 182 | errno = EILSEQ; | 
|  | 183 | return false; | 
|  | 184 | } | 
|  | 185 | if (*src_bytes_left < 4) { | 
|  | 186 | errno = EINVAL; | 
|  | 187 | return false; | 
|  | 188 | } | 
|  | 189 | uint16_t hi = wc; | 
|  | 190 | uint16_t lo = In16(*src_buf + 2, swap); | 
|  | 191 | wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00); | 
|  | 192 | src_bytes_used = 4; | 
|  | 193 | } | 
|  | 194 | break; | 
|  | 195 | } | 
|  | 196 |  | 
|  | 197 | case UTF_32_BE: | 
|  | 198 | case UTF_32_LE: | 
|  | 199 | case WCHAR_T: | 
|  | 200 | if (*src_bytes_left < 4) { | 
|  | 201 | errno = EINVAL; | 
|  | 202 | return false; | 
|  | 203 | } | 
|  | 204 | wc = In32(*src_buf, (src_encoding == UTF_32_BE)); | 
|  | 205 | break; | 
|  | 206 | } | 
|  | 207 |  | 
|  | 208 | if (errno == EILSEQ) { | 
|  | 209 | switch (mode) { | 
|  | 210 | case ERROR: | 
|  | 211 | return false; | 
|  | 212 | case IGNORE: | 
|  | 213 | *src_buf += src_bytes_used; | 
|  | 214 | *src_bytes_left -= src_bytes_used; | 
|  | 215 | ignored = true; | 
|  | 216 | return GetNext(); | 
|  | 217 | case TRANSLIT: | 
|  | 218 | wc = '?'; | 
|  | 219 | ++replacement_count; | 
|  | 220 | return true; | 
|  | 221 | } | 
|  | 222 | } | 
|  | 223 | return true; | 
|  | 224 | } | 
|  | 225 |  | 
|  | 226 | bool Convert() { | 
|  | 227 | errno = 0; | 
|  | 228 | switch (dst_encoding) { | 
|  | 229 | case US_ASCII: | 
|  | 230 | buf[0] = wc; | 
|  | 231 | dst_bytes_used = 1; | 
|  | 232 | if (wc > 0x7f) errno = EILSEQ; | 
|  | 233 | break; | 
|  | 234 |  | 
|  | 235 | case UTF_8: | 
|  | 236 | dst_bytes_used = c32rtomb(buf, wc, &ps); | 
|  | 237 | if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { | 
|  | 238 | break;  // EILSEQ already set. | 
|  | 239 | } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { | 
|  | 240 | errno = EINVAL; | 
|  | 241 | return false; | 
|  | 242 | } | 
|  | 243 | break; | 
|  | 244 |  | 
|  | 245 | case UTF_16_BE: | 
|  | 246 | case UTF_16_LE: { | 
|  | 247 | bool swap = (dst_encoding == UTF_16_BE); | 
|  | 248 | if (wc < 0x10000) {  // BMP. | 
|  | 249 | Out16(buf, wc, swap); | 
|  | 250 | } else {  // Supplementary plane; output surrogate pair. | 
|  | 251 | wc -= 0x10000; | 
|  | 252 | char16_t hi = 0xd800 | (wc >> 10); | 
|  | 253 | char16_t lo = 0xdc00 | (wc & 0x3ff); | 
|  | 254 | Out16(buf + 0, hi, swap); | 
|  | 255 | Out16(buf + 2, lo, swap); | 
|  | 256 | dst_bytes_used = 4; | 
|  | 257 | } | 
|  | 258 | } break; | 
|  | 259 |  | 
|  | 260 | case UTF_32_BE: | 
|  | 261 | case UTF_32_LE: | 
|  | 262 | case WCHAR_T: | 
|  | 263 | Out32(wc, (dst_encoding == UTF_32_BE)); | 
|  | 264 | break; | 
|  | 265 | } | 
|  | 266 |  | 
|  | 267 | if (errno == EILSEQ) { | 
|  | 268 | if (mode == IGNORE) { | 
|  | 269 | *src_buf += src_bytes_used; | 
|  | 270 | *src_bytes_left -= src_bytes_used; | 
|  | 271 | ignored = true; | 
|  | 272 | return true; | 
|  | 273 | } else if (mode == TRANSLIT) { | 
|  | 274 | wc = '?'; | 
|  | 275 | ++replacement_count; | 
|  | 276 | return Convert(); | 
|  | 277 | } | 
|  | 278 | return false; | 
|  | 279 | } | 
|  | 280 |  | 
|  | 281 | return Emit(); | 
|  | 282 | } | 
|  | 283 |  | 
|  | 284 | uint16_t In16(const char* buf, bool swap) { | 
|  | 285 | const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); | 
|  | 286 | uint16_t wc = (src[0]) | (src[1] << 8); | 
|  | 287 | if (swap) wc = __swap16(wc); | 
|  | 288 | src_bytes_used = 2; | 
|  | 289 | return wc; | 
|  | 290 | } | 
|  | 291 |  | 
|  | 292 | uint32_t In32(const char* buf, bool swap) { | 
|  | 293 | const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); | 
|  | 294 | uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24); | 
|  | 295 | if (swap) wc = __swap32(wc); | 
|  | 296 | src_bytes_used = 4; | 
|  | 297 | return wc; | 
|  | 298 | } | 
|  | 299 |  | 
|  | 300 | void Out16(char* dst, char16_t ch, bool swap) { | 
|  | 301 | if (swap) ch = __swap16(ch); | 
|  | 302 | dst[0] = ch; | 
|  | 303 | dst[1] = ch >> 8; | 
|  | 304 | dst_bytes_used = 2; | 
|  | 305 | } | 
|  | 306 |  | 
|  | 307 | void Out32(char32_t ch, bool swap) { | 
|  | 308 | if (swap) ch = __swap32(ch); | 
|  | 309 | buf[0] = ch; | 
|  | 310 | buf[1] = ch >> 8; | 
|  | 311 | buf[2] = ch >> 16; | 
|  | 312 | buf[3] = ch >> 24; | 
|  | 313 | dst_bytes_used = 4; | 
|  | 314 | } | 
|  | 315 |  | 
|  | 316 | bool Emit() { | 
|  | 317 | if (dst_bytes_used > *dst_bytes_left) { | 
|  | 318 | errno = E2BIG; | 
|  | 319 | return false; | 
|  | 320 | } | 
|  | 321 |  | 
|  | 322 | memcpy(*dst_buf, buf, dst_bytes_used); | 
|  | 323 | *src_buf += src_bytes_used; | 
|  | 324 | *src_bytes_left -= src_bytes_used; | 
|  | 325 | *dst_buf += dst_bytes_used; | 
|  | 326 | *dst_bytes_left -= dst_bytes_used; | 
|  | 327 | return true; | 
|  | 328 | } | 
|  | 329 |  | 
|  | 330 | int Done() { | 
|  | 331 | if (mode == TRANSLIT) return replacement_count; | 
|  | 332 | if (ignored) { | 
|  | 333 | errno = EILSEQ; | 
|  | 334 | return -1; | 
|  | 335 | } | 
|  | 336 | return 0; | 
|  | 337 | } | 
|  | 338 | }; | 
|  | 339 |  | 
|  | 340 | iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) { | 
|  | 341 | iconv_t result = new __iconv_t; | 
|  | 342 | if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) || | 
|  | 343 | !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) { | 
|  | 344 | delete result; | 
|  | 345 | errno = EINVAL; | 
|  | 346 | return INVALID_ICONV_T; | 
|  | 347 | } | 
|  | 348 | return result; | 
|  | 349 | } | 
|  | 350 |  | 
|  | 351 | size_t iconv(iconv_t __converter, | 
|  | 352 | char** __src_buf, size_t* __src_bytes_left, | 
|  | 353 | char** __dst_buf, size_t* __dst_bytes_left) { | 
|  | 354 | if (__converter == INVALID_ICONV_T) { | 
|  | 355 | errno = EBADF; | 
|  | 356 | return -1; | 
|  | 357 | } | 
|  | 358 | return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left); | 
|  | 359 | } | 
|  | 360 |  | 
|  | 361 | int iconv_close(iconv_t __converter) { | 
|  | 362 | if (__converter == INVALID_ICONV_T) { | 
|  | 363 | errno = EBADF; | 
|  | 364 | return -1; | 
|  | 365 | } | 
|  | 366 | delete __converter; | 
|  | 367 | return 0; | 
|  | 368 | } |