| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 1 | /* | 
|  | 2 | * Copyright (C) 2017 The Android Open Source Project | 
|  | 3 | * All rights reserved. | 
|  | 4 | * | 
|  | 5 | * Redistribution and use in source and binary forms, with or without | 
|  | 6 | * modification, are permitted provided that the following conditions | 
|  | 7 | * are met: | 
|  | 8 | *  * Redistributions of source code must retain the above copyright | 
|  | 9 | *    notice, this list of conditions and the following disclaimer. | 
|  | 10 | *  * Redistributions in binary form must reproduce the above copyright | 
|  | 11 | *    notice, this list of conditions and the following disclaimer in | 
|  | 12 | *    the documentation and/or other materials provided with the | 
|  | 13 | *    distribution. | 
|  | 14 | * | 
|  | 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
|  | 16 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
|  | 17 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | 
|  | 18 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | 
|  | 19 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | 
|  | 20 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | 
|  | 21 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS | 
|  | 22 | * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | 
|  | 23 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | 
|  | 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | 
|  | 25 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
|  | 26 | * SUCH DAMAGE. | 
|  | 27 | */ | 
|  | 28 |  | 
|  | 29 | #include <iconv.h> | 
|  | 30 |  | 
|  | 31 | #include <ctype.h> | 
|  | 32 | #include <endian.h> | 
|  | 33 | #include <errno.h> | 
|  | 34 | #include <stdlib.h> | 
| Dan Albert | 1c78cb0 | 2017-10-11 11:25:25 -0700 | [diff] [blame] | 35 | #include <string.h> | 
| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 36 | #include <uchar.h> | 
|  | 37 |  | 
|  | 38 | #include "private/bionic_mbstate.h" | 
|  | 39 |  | 
// Sentinel handle value: iconv_open() returns it on failure, and iconv() and
// iconv_close() reject it with EBADF.
#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
|  | 41 |  | 
// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
//
// The set of encodings this implementation can convert between.
enum Encoding {
  US_ASCII,   // One byte per character; values above 0x7f are treated as illegal.
  UTF_8,      // Decoded with mbrtoc32() and encoded with c32rtomb().
  UTF_16_LE,  // 16-bit units, least significant byte first; surrogate pairs are combined.
  UTF_16_BE,  // 16-bit units, most significant byte first; surrogate pairs are combined.
  UTF_32_LE,  // 32-bit units, least significant byte first.
  UTF_32_BE,  // 32-bit units, most significant byte first.
  WCHAR_T,    // Read and written through the same path as UTF_32_LE (no byte swap).
};
|  | 54 |  | 
// What to do with a character that is illegal in the source or cannot be
// represented in the destination encoding.
enum Mode {
  ERROR,     // Stop converting and fail with EILSEQ (the default).
  IGNORE,    // Skip the character; the call still ends with -1/EILSEQ once input is consumed.
  TRANSLIT,  // Substitute '?' and count the substitution in the return value.
};
|  | 60 |  | 
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
//
// Returns true if the user-supplied encoding name `lhs` matches the canonical
// name `rhs`. `rhs` must already be in canonical form: lowercase alphanumerics
// only, with no removable zeros. For `lhs`:
//  * non-alphanumeric characters are ignored, so "UTF-8", "UTF_8", "UTF8" and
//    "UTF 8" are all equivalent;
//  * case is ignored;
//  * each '0' that is not preceded by a digit (after the deletions above) is
//    ignored, so "UTF-08" matches "utf8" but "UTF-80" and "UTF-106LE" do not
//    match "utf8" and "utf16le";
//  * a GNU-style "//..." suffix (e.g. "//IGNORE") is treated as end of string.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // Whether the most recently matched (i.e. surviving) character of lhs was a digit.
  bool prev_digit = false;
  while (*lhs && *rhs) {
    // Skip characters of lhs that the matching rules delete.
    for (; *lhs; ++lhs) {
      // <ctype.h> functions are only defined for unsigned char values and EOF.
      const unsigned char c = *lhs;
      if (!isalnum(c)) continue;
      if (c == '0' && !prev_digit) continue;
      break;
    }
    // Case doesn't matter either. If lhs ran out in the skip loop above, this
    // compares '\0' against a non-NUL rhs character and stops the loop.
    const unsigned char l = *lhs;
    const unsigned char r = *rhs;
    if (tolower(l) != tolower(r)) break;
    prev_digit = (isdigit(l) != 0);
    ++lhs;
    ++rhs;
  }
  // As a special case we treat the GNU "//" extensions as end of string.
  const bool lhs_done = (*lhs == '\0' || strncmp(lhs, "//", 2) == 0);
  return lhs_done && *rhs == '\0';
}
|  | 79 |  | 
|  | 80 | static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) { | 
|  | 81 | const char* suffix = strstr(s, "//"); | 
|  | 82 | if (suffix) { | 
|  | 83 | if (!mode) return false; | 
|  | 84 | if (strcmp(suffix, "//IGNORE") == 0) { | 
|  | 85 | *mode = IGNORE; | 
|  | 86 | } else if (strcmp(suffix, "//TRANSLIT") == 0) { | 
|  | 87 | *mode = TRANSLIT; | 
|  | 88 | } else { | 
|  | 89 | return false; | 
|  | 90 | } | 
|  | 91 | } | 
|  | 92 | if (__match_encoding(s, "utf8")) { | 
|  | 93 | *encoding = UTF_8; | 
|  | 94 | } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) { | 
|  | 95 | *encoding = US_ASCII; | 
|  | 96 | } else if (__match_encoding(s, "utf16le")) { | 
|  | 97 | *encoding = UTF_16_LE; | 
|  | 98 | } else if (__match_encoding(s, "utf16be")) { | 
|  | 99 | *encoding = UTF_16_BE; | 
|  | 100 | } else if (__match_encoding(s, "utf32le")) { | 
|  | 101 | *encoding = UTF_32_LE; | 
|  | 102 | } else if (__match_encoding(s, "utf32be")) { | 
|  | 103 | *encoding = UTF_32_BE; | 
|  | 104 | } else if (__match_encoding(s, "wchart")) { | 
|  | 105 | *encoding = WCHAR_T; | 
|  | 106 | } else { | 
|  | 107 | return false; | 
|  | 108 | } | 
|  | 109 | return true; | 
|  | 110 | } | 
|  | 111 |  | 
|  | 112 | struct __iconv_t { | 
|  | 113 | Encoding src_encoding; | 
|  | 114 | Encoding dst_encoding; | 
|  | 115 | Mode mode; | 
|  | 116 |  | 
|  | 117 | __iconv_t() : mode(ERROR) { | 
|  | 118 | } | 
|  | 119 |  | 
|  | 120 | int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) { | 
|  | 121 | // Reset state. | 
|  | 122 | wc = 0; | 
|  | 123 | memset(&ps, 0, sizeof(ps)); | 
|  | 124 | replacement_count = 0; | 
|  | 125 | ignored = false; | 
|  | 126 | src_buf = src_buf0; | 
|  | 127 | src_bytes_left = src_bytes_left0; | 
|  | 128 | dst_buf = dst_buf0; | 
|  | 129 | dst_bytes_left = dst_bytes_left0; | 
|  | 130 |  | 
|  | 131 | while (*src_bytes_left > 0) { | 
|  | 132 | if (!GetNext() || !Convert()) return -1; | 
|  | 133 | } | 
|  | 134 | return Done(); | 
|  | 135 | } | 
|  | 136 |  | 
|  | 137 | private: | 
|  | 138 | char32_t wc; | 
|  | 139 | char buf[16]; | 
|  | 140 | size_t src_bytes_used; | 
|  | 141 | size_t dst_bytes_used; | 
|  | 142 | mbstate_t ps; | 
|  | 143 |  | 
|  | 144 | size_t replacement_count; | 
|  | 145 | bool ignored; | 
|  | 146 |  | 
|  | 147 | char** src_buf; | 
|  | 148 | size_t* src_bytes_left; | 
|  | 149 | char** dst_buf; | 
|  | 150 | size_t* dst_bytes_left; | 
|  | 151 |  | 
|  | 152 | bool GetNext() { | 
|  | 153 | errno = 0; | 
|  | 154 | switch (src_encoding) { | 
|  | 155 | case US_ASCII: | 
|  | 156 | wc = **src_buf; | 
|  | 157 | src_bytes_used = 1; | 
|  | 158 | if (wc > 0x7f) errno = EILSEQ; | 
|  | 159 | break; | 
|  | 160 |  | 
|  | 161 | case UTF_8: | 
|  | 162 | src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps); | 
|  | 163 | if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { | 
|  | 164 | break;  // EILSEQ already set. | 
|  | 165 | } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { | 
|  | 166 | errno = EINVAL; | 
|  | 167 | return false; | 
|  | 168 | } | 
|  | 169 | break; | 
|  | 170 |  | 
|  | 171 | case UTF_16_BE: | 
|  | 172 | case UTF_16_LE: { | 
|  | 173 | if (*src_bytes_left < 2) { | 
|  | 174 | errno = EINVAL; | 
|  | 175 | return false; | 
|  | 176 | } | 
|  | 177 | bool swap = (src_encoding == UTF_16_BE); | 
|  | 178 | wc = In16(*src_buf, swap); | 
|  | 179 | // 0xd800-0xdbff: high surrogates | 
|  | 180 | // 0xdc00-0xdfff: low surrogates | 
|  | 181 | if (wc >= 0xd800 && wc <= 0xdfff) { | 
|  | 182 | if (wc >= 0xdc00) {  // Low surrogate before high surrogate. | 
|  | 183 | errno = EILSEQ; | 
|  | 184 | return false; | 
|  | 185 | } | 
|  | 186 | if (*src_bytes_left < 4) { | 
|  | 187 | errno = EINVAL; | 
|  | 188 | return false; | 
|  | 189 | } | 
|  | 190 | uint16_t hi = wc; | 
|  | 191 | uint16_t lo = In16(*src_buf + 2, swap); | 
|  | 192 | wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00); | 
|  | 193 | src_bytes_used = 4; | 
|  | 194 | } | 
|  | 195 | break; | 
|  | 196 | } | 
|  | 197 |  | 
|  | 198 | case UTF_32_BE: | 
|  | 199 | case UTF_32_LE: | 
|  | 200 | case WCHAR_T: | 
|  | 201 | if (*src_bytes_left < 4) { | 
|  | 202 | errno = EINVAL; | 
|  | 203 | return false; | 
|  | 204 | } | 
|  | 205 | wc = In32(*src_buf, (src_encoding == UTF_32_BE)); | 
|  | 206 | break; | 
|  | 207 | } | 
|  | 208 |  | 
|  | 209 | if (errno == EILSEQ) { | 
|  | 210 | switch (mode) { | 
|  | 211 | case ERROR: | 
|  | 212 | return false; | 
|  | 213 | case IGNORE: | 
|  | 214 | *src_buf += src_bytes_used; | 
|  | 215 | *src_bytes_left -= src_bytes_used; | 
|  | 216 | ignored = true; | 
|  | 217 | return GetNext(); | 
|  | 218 | case TRANSLIT: | 
|  | 219 | wc = '?'; | 
|  | 220 | ++replacement_count; | 
|  | 221 | return true; | 
|  | 222 | } | 
|  | 223 | } | 
|  | 224 | return true; | 
|  | 225 | } | 
|  | 226 |  | 
|  | 227 | bool Convert() { | 
|  | 228 | errno = 0; | 
|  | 229 | switch (dst_encoding) { | 
|  | 230 | case US_ASCII: | 
|  | 231 | buf[0] = wc; | 
|  | 232 | dst_bytes_used = 1; | 
|  | 233 | if (wc > 0x7f) errno = EILSEQ; | 
|  | 234 | break; | 
|  | 235 |  | 
|  | 236 | case UTF_8: | 
|  | 237 | dst_bytes_used = c32rtomb(buf, wc, &ps); | 
|  | 238 | if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { | 
|  | 239 | break;  // EILSEQ already set. | 
|  | 240 | } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { | 
|  | 241 | errno = EINVAL; | 
|  | 242 | return false; | 
|  | 243 | } | 
|  | 244 | break; | 
|  | 245 |  | 
|  | 246 | case UTF_16_BE: | 
|  | 247 | case UTF_16_LE: { | 
|  | 248 | bool swap = (dst_encoding == UTF_16_BE); | 
|  | 249 | if (wc < 0x10000) {  // BMP. | 
|  | 250 | Out16(buf, wc, swap); | 
|  | 251 | } else {  // Supplementary plane; output surrogate pair. | 
|  | 252 | wc -= 0x10000; | 
|  | 253 | char16_t hi = 0xd800 | (wc >> 10); | 
|  | 254 | char16_t lo = 0xdc00 | (wc & 0x3ff); | 
|  | 255 | Out16(buf + 0, hi, swap); | 
|  | 256 | Out16(buf + 2, lo, swap); | 
|  | 257 | dst_bytes_used = 4; | 
|  | 258 | } | 
|  | 259 | } break; | 
|  | 260 |  | 
|  | 261 | case UTF_32_BE: | 
|  | 262 | case UTF_32_LE: | 
|  | 263 | case WCHAR_T: | 
|  | 264 | Out32(wc, (dst_encoding == UTF_32_BE)); | 
|  | 265 | break; | 
|  | 266 | } | 
|  | 267 |  | 
|  | 268 | if (errno == EILSEQ) { | 
|  | 269 | if (mode == IGNORE) { | 
|  | 270 | *src_buf += src_bytes_used; | 
|  | 271 | *src_bytes_left -= src_bytes_used; | 
|  | 272 | ignored = true; | 
|  | 273 | return true; | 
|  | 274 | } else if (mode == TRANSLIT) { | 
|  | 275 | wc = '?'; | 
|  | 276 | ++replacement_count; | 
|  | 277 | return Convert(); | 
|  | 278 | } | 
|  | 279 | return false; | 
|  | 280 | } | 
|  | 281 |  | 
|  | 282 | return Emit(); | 
|  | 283 | } | 
|  | 284 |  | 
|  | 285 | uint16_t In16(const char* buf, bool swap) { | 
|  | 286 | const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); | 
|  | 287 | uint16_t wc = (src[0]) | (src[1] << 8); | 
|  | 288 | if (swap) wc = __swap16(wc); | 
|  | 289 | src_bytes_used = 2; | 
|  | 290 | return wc; | 
|  | 291 | } | 
|  | 292 |  | 
|  | 293 | uint32_t In32(const char* buf, bool swap) { | 
|  | 294 | const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); | 
|  | 295 | uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24); | 
|  | 296 | if (swap) wc = __swap32(wc); | 
|  | 297 | src_bytes_used = 4; | 
|  | 298 | return wc; | 
|  | 299 | } | 
|  | 300 |  | 
|  | 301 | void Out16(char* dst, char16_t ch, bool swap) { | 
|  | 302 | if (swap) ch = __swap16(ch); | 
|  | 303 | dst[0] = ch; | 
|  | 304 | dst[1] = ch >> 8; | 
|  | 305 | dst_bytes_used = 2; | 
|  | 306 | } | 
|  | 307 |  | 
|  | 308 | void Out32(char32_t ch, bool swap) { | 
|  | 309 | if (swap) ch = __swap32(ch); | 
|  | 310 | buf[0] = ch; | 
|  | 311 | buf[1] = ch >> 8; | 
|  | 312 | buf[2] = ch >> 16; | 
|  | 313 | buf[3] = ch >> 24; | 
|  | 314 | dst_bytes_used = 4; | 
|  | 315 | } | 
|  | 316 |  | 
|  | 317 | bool Emit() { | 
|  | 318 | if (dst_bytes_used > *dst_bytes_left) { | 
|  | 319 | errno = E2BIG; | 
|  | 320 | return false; | 
|  | 321 | } | 
|  | 322 |  | 
|  | 323 | memcpy(*dst_buf, buf, dst_bytes_used); | 
|  | 324 | *src_buf += src_bytes_used; | 
|  | 325 | *src_bytes_left -= src_bytes_used; | 
|  | 326 | *dst_buf += dst_bytes_used; | 
|  | 327 | *dst_bytes_left -= dst_bytes_used; | 
|  | 328 | return true; | 
|  | 329 | } | 
|  | 330 |  | 
|  | 331 | int Done() { | 
|  | 332 | if (mode == TRANSLIT) return replacement_count; | 
|  | 333 | if (ignored) { | 
|  | 334 | errno = EILSEQ; | 
|  | 335 | return -1; | 
|  | 336 | } | 
|  | 337 | return 0; | 
|  | 338 | } | 
|  | 339 | }; | 
|  | 340 |  | 
|  | 341 | iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) { | 
|  | 342 | iconv_t result = new __iconv_t; | 
|  | 343 | if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) || | 
|  | 344 | !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) { | 
|  | 345 | delete result; | 
|  | 346 | errno = EINVAL; | 
|  | 347 | return INVALID_ICONV_T; | 
|  | 348 | } | 
|  | 349 | return result; | 
|  | 350 | } | 
|  | 351 |  | 
|  | 352 | size_t iconv(iconv_t __converter, | 
|  | 353 | char** __src_buf, size_t* __src_bytes_left, | 
|  | 354 | char** __dst_buf, size_t* __dst_bytes_left) { | 
|  | 355 | if (__converter == INVALID_ICONV_T) { | 
|  | 356 | errno = EBADF; | 
|  | 357 | return -1; | 
|  | 358 | } | 
|  | 359 | return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left); | 
|  | 360 | } | 
|  | 361 |  | 
|  | 362 | int iconv_close(iconv_t __converter) { | 
|  | 363 | if (__converter == INVALID_ICONV_T) { | 
|  | 364 | errno = EBADF; | 
|  | 365 | return -1; | 
|  | 366 | } | 
|  | 367 | delete __converter; | 
|  | 368 | return 0; | 
|  | 369 | } |