/*
 * Copyright (C) 2017 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
 | 28 |  | 
 | 29 | #include <iconv.h> | 
 | 30 |  | 
 | 31 | #include <ctype.h> | 
 | 32 | #include <endian.h> | 
 | 33 | #include <errno.h> | 
 | 34 | #include <stdlib.h> | 
| Dan Albert | 1c78cb0 | 2017-10-11 11:25:25 -0700 | [diff] [blame] | 35 | #include <string.h> | 
| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 36 | #include <uchar.h> | 
 | 37 |  | 
 | 38 | #include "private/bionic_mbstate.h" | 
 | 39 |  | 
 | 40 | #define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1) | 
 | 41 |  | 
// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
 | 45 | enum Encoding { | 
 | 46 |   US_ASCII, | 
 | 47 |   UTF_8, | 
 | 48 |   UTF_16_LE, | 
 | 49 |   UTF_16_BE, | 
 | 50 |   UTF_32_LE, | 
 | 51 |   UTF_32_BE, | 
 | 52 |   WCHAR_T, | 
 | 53 | }; | 
 | 54 |  | 
 | 55 | enum Mode { | 
 | 56 |   ERROR, | 
 | 57 |   IGNORE, | 
 | 58 |   TRANSLIT, | 
 | 59 | }; | 
 | 60 |  | 
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
 | 63 | static bool __match_encoding(const char* lhs, const char* rhs) { | 
 | 64 |   while (*lhs && *rhs) { | 
 | 65 |     // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent. | 
 | 66 |     // Also implement the "delete each 0 that is not preceded by a digit" rule. | 
 | 67 |     for (; *lhs; ++lhs) { | 
 | 68 |       if (isalnum(*lhs) && (*lhs != '0' || !isdigit(*(lhs + 1)))) break; | 
 | 69 |     } | 
 | 70 |     // Case doesn't matter either. | 
 | 71 |     if (tolower(*lhs) != tolower(*rhs)) break; | 
 | 72 |     ++lhs; | 
 | 73 |     ++rhs; | 
 | 74 |   } | 
 | 75 |   // As a special case we treat the GNU "//" extensions as end of string. | 
 | 76 |   if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true; | 
 | 77 |   return false; | 
 | 78 | } | 
 | 79 |  | 
 | 80 | static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) { | 
 | 81 |   const char* suffix = strstr(s, "//"); | 
 | 82 |   if (suffix) { | 
 | 83 |     if (!mode) return false; | 
 | 84 |     if (strcmp(suffix, "//IGNORE") == 0) { | 
 | 85 |       *mode = IGNORE; | 
 | 86 |     } else if (strcmp(suffix, "//TRANSLIT") == 0) { | 
 | 87 |       *mode = TRANSLIT; | 
 | 88 |     } else { | 
 | 89 |       return false; | 
 | 90 |     } | 
 | 91 |   } | 
 | 92 |   if (__match_encoding(s, "utf8")) { | 
 | 93 |     *encoding = UTF_8; | 
 | 94 |   } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) { | 
 | 95 |     *encoding = US_ASCII; | 
 | 96 |   } else if (__match_encoding(s, "utf16le")) { | 
 | 97 |     *encoding = UTF_16_LE; | 
 | 98 |   } else if (__match_encoding(s, "utf16be")) { | 
 | 99 |     *encoding = UTF_16_BE; | 
 | 100 |   } else if (__match_encoding(s, "utf32le")) { | 
 | 101 |     *encoding = UTF_32_LE; | 
 | 102 |   } else if (__match_encoding(s, "utf32be")) { | 
 | 103 |     *encoding = UTF_32_BE; | 
 | 104 |   } else if (__match_encoding(s, "wchart")) { | 
 | 105 |     *encoding = WCHAR_T; | 
 | 106 |   } else { | 
 | 107 |     return false; | 
 | 108 |   } | 
 | 109 |   return true; | 
 | 110 | } | 
 | 111 |  | 
 | 112 | struct __iconv_t { | 
 | 113 |   Encoding src_encoding; | 
 | 114 |   Encoding dst_encoding; | 
 | 115 |   Mode mode; | 
 | 116 |  | 
 | 117 |   __iconv_t() : mode(ERROR) { | 
 | 118 |   } | 
 | 119 |  | 
 | 120 |   int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) { | 
 | 121 |     // Reset state. | 
 | 122 |     wc = 0; | 
 | 123 |     memset(&ps, 0, sizeof(ps)); | 
 | 124 |     replacement_count = 0; | 
 | 125 |     ignored = false; | 
 | 126 |     src_buf = src_buf0; | 
 | 127 |     src_bytes_left = src_bytes_left0; | 
 | 128 |     dst_buf = dst_buf0; | 
 | 129 |     dst_bytes_left = dst_bytes_left0; | 
 | 130 |  | 
 | 131 |     while (*src_bytes_left > 0) { | 
 | 132 |       if (!GetNext() || !Convert()) return -1; | 
 | 133 |     } | 
 | 134 |     return Done(); | 
 | 135 |   } | 
 | 136 |  | 
 | 137 |  private: | 
 | 138 |   char32_t wc; | 
 | 139 |   char buf[16]; | 
 | 140 |   size_t src_bytes_used; | 
 | 141 |   size_t dst_bytes_used; | 
 | 142 |   mbstate_t ps; | 
 | 143 |  | 
 | 144 |   size_t replacement_count; | 
 | 145 |   bool ignored; | 
 | 146 |  | 
 | 147 |   char** src_buf; | 
 | 148 |   size_t* src_bytes_left; | 
 | 149 |   char** dst_buf; | 
 | 150 |   size_t* dst_bytes_left; | 
 | 151 |  | 
 | 152 |   bool GetNext() { | 
 | 153 |     errno = 0; | 
 | 154 |     switch (src_encoding) { | 
 | 155 |       case US_ASCII: | 
 | 156 |         wc = **src_buf; | 
 | 157 |         src_bytes_used = 1; | 
 | 158 |         if (wc > 0x7f) errno = EILSEQ; | 
 | 159 |         break; | 
 | 160 |  | 
 | 161 |       case UTF_8: | 
 | 162 |         src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps); | 
| Dan Albert | a9e914d | 2023-07-21 21:41:55 +0000 | [diff] [blame] | 163 |         if (src_bytes_used == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) { | 
| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 164 |           break;  // EILSEQ already set. | 
| Dan Albert | a9e914d | 2023-07-21 21:41:55 +0000 | [diff] [blame] | 165 |         } else if (src_bytes_used == BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE) { | 
| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 166 |           errno = EINVAL; | 
 | 167 |           return false; | 
 | 168 |         } | 
 | 169 |         break; | 
 | 170 |  | 
 | 171 |       case UTF_16_BE: | 
 | 172 |       case UTF_16_LE: { | 
 | 173 |         if (*src_bytes_left < 2) { | 
 | 174 |           errno = EINVAL; | 
 | 175 |           return false; | 
 | 176 |         } | 
 | 177 |         bool swap = (src_encoding == UTF_16_BE); | 
 | 178 |         wc = In16(*src_buf, swap); | 
 | 179 |         // 0xd800-0xdbff: high surrogates | 
 | 180 |         // 0xdc00-0xdfff: low surrogates | 
 | 181 |         if (wc >= 0xd800 && wc <= 0xdfff) { | 
 | 182 |           if (wc >= 0xdc00) {  // Low surrogate before high surrogate. | 
 | 183 |             errno = EILSEQ; | 
 | 184 |             return false; | 
 | 185 |           } | 
 | 186 |           if (*src_bytes_left < 4) { | 
 | 187 |             errno = EINVAL; | 
 | 188 |             return false; | 
 | 189 |           } | 
 | 190 |           uint16_t hi = wc; | 
 | 191 |           uint16_t lo = In16(*src_buf + 2, swap); | 
 | 192 |           wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00); | 
 | 193 |           src_bytes_used = 4; | 
 | 194 |         } | 
 | 195 |         break; | 
 | 196 |       } | 
 | 197 |  | 
 | 198 |       case UTF_32_BE: | 
 | 199 |       case UTF_32_LE: | 
 | 200 |       case WCHAR_T: | 
 | 201 |         if (*src_bytes_left < 4) { | 
 | 202 |           errno = EINVAL; | 
 | 203 |           return false; | 
 | 204 |         } | 
 | 205 |         wc = In32(*src_buf, (src_encoding == UTF_32_BE)); | 
 | 206 |         break; | 
 | 207 |     } | 
 | 208 |  | 
 | 209 |     if (errno == EILSEQ) { | 
 | 210 |       switch (mode) { | 
 | 211 |         case ERROR: | 
 | 212 |           return false; | 
 | 213 |         case IGNORE: | 
 | 214 |           *src_buf += src_bytes_used; | 
 | 215 |           *src_bytes_left -= src_bytes_used; | 
 | 216 |           ignored = true; | 
 | 217 |           return GetNext(); | 
 | 218 |         case TRANSLIT: | 
 | 219 |           wc = '?'; | 
 | 220 |           ++replacement_count; | 
 | 221 |           return true; | 
 | 222 |       } | 
 | 223 |     } | 
 | 224 |     return true; | 
 | 225 |   } | 
 | 226 |  | 
 | 227 |   bool Convert() { | 
 | 228 |     errno = 0; | 
 | 229 |     switch (dst_encoding) { | 
 | 230 |       case US_ASCII: | 
 | 231 |         buf[0] = wc; | 
 | 232 |         dst_bytes_used = 1; | 
 | 233 |         if (wc > 0x7f) errno = EILSEQ; | 
 | 234 |         break; | 
 | 235 |  | 
 | 236 |       case UTF_8: | 
 | 237 |         dst_bytes_used = c32rtomb(buf, wc, &ps); | 
| Dan Albert | a9e914d | 2023-07-21 21:41:55 +0000 | [diff] [blame] | 238 |         if (dst_bytes_used == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) { | 
| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 239 |           break;  // EILSEQ already set. | 
| Dan Albert | a9e914d | 2023-07-21 21:41:55 +0000 | [diff] [blame] | 240 |         } else if (dst_bytes_used == BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE) { | 
| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 241 |           errno = EINVAL; | 
 | 242 |           return false; | 
 | 243 |         } | 
 | 244 |         break; | 
 | 245 |  | 
 | 246 |       case UTF_16_BE: | 
 | 247 |       case UTF_16_LE: { | 
 | 248 |         bool swap = (dst_encoding == UTF_16_BE); | 
 | 249 |         if (wc < 0x10000) {  // BMP. | 
 | 250 |           Out16(buf, wc, swap); | 
 | 251 |         } else {  // Supplementary plane; output surrogate pair. | 
 | 252 |           wc -= 0x10000; | 
 | 253 |           char16_t hi = 0xd800 | (wc >> 10); | 
 | 254 |           char16_t lo = 0xdc00 | (wc & 0x3ff); | 
 | 255 |           Out16(buf + 0, hi, swap); | 
 | 256 |           Out16(buf + 2, lo, swap); | 
 | 257 |           dst_bytes_used = 4; | 
 | 258 |         } | 
 | 259 |       } break; | 
 | 260 |  | 
 | 261 |       case UTF_32_BE: | 
 | 262 |       case UTF_32_LE: | 
 | 263 |       case WCHAR_T: | 
 | 264 |         Out32(wc, (dst_encoding == UTF_32_BE)); | 
 | 265 |         break; | 
 | 266 |     } | 
 | 267 |  | 
 | 268 |     if (errno == EILSEQ) { | 
 | 269 |       if (mode == IGNORE) { | 
 | 270 |         *src_buf += src_bytes_used; | 
 | 271 |         *src_bytes_left -= src_bytes_used; | 
 | 272 |         ignored = true; | 
 | 273 |         return true; | 
 | 274 |       } else if (mode == TRANSLIT) { | 
 | 275 |         wc = '?'; | 
 | 276 |         ++replacement_count; | 
 | 277 |         return Convert(); | 
 | 278 |       } | 
 | 279 |       return false; | 
 | 280 |     } | 
 | 281 |  | 
 | 282 |     return Emit(); | 
 | 283 |   } | 
 | 284 |  | 
 | 285 |   uint16_t In16(const char* buf, bool swap) { | 
 | 286 |     const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); | 
 | 287 |     uint16_t wc = (src[0]) | (src[1] << 8); | 
 | 288 |     if (swap) wc = __swap16(wc); | 
 | 289 |     src_bytes_used = 2; | 
 | 290 |     return wc; | 
 | 291 |   } | 
 | 292 |  | 
 | 293 |   uint32_t In32(const char* buf, bool swap) { | 
 | 294 |     const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); | 
 | 295 |     uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24); | 
 | 296 |     if (swap) wc = __swap32(wc); | 
 | 297 |     src_bytes_used = 4; | 
 | 298 |     return wc; | 
 | 299 |   } | 
 | 300 |  | 
 | 301 |   void Out16(char* dst, char16_t ch, bool swap) { | 
 | 302 |     if (swap) ch = __swap16(ch); | 
 | 303 |     dst[0] = ch; | 
 | 304 |     dst[1] = ch >> 8; | 
 | 305 |     dst_bytes_used = 2; | 
 | 306 |   } | 
 | 307 |  | 
 | 308 |   void Out32(char32_t ch, bool swap) { | 
 | 309 |     if (swap) ch = __swap32(ch); | 
 | 310 |     buf[0] = ch; | 
 | 311 |     buf[1] = ch >> 8; | 
 | 312 |     buf[2] = ch >> 16; | 
 | 313 |     buf[3] = ch >> 24; | 
 | 314 |     dst_bytes_used = 4; | 
 | 315 |   } | 
 | 316 |  | 
 | 317 |   bool Emit() { | 
 | 318 |     if (dst_bytes_used > *dst_bytes_left) { | 
 | 319 |       errno = E2BIG; | 
 | 320 |       return false; | 
 | 321 |     } | 
 | 322 |  | 
 | 323 |     memcpy(*dst_buf, buf, dst_bytes_used); | 
 | 324 |     *src_buf += src_bytes_used; | 
 | 325 |     *src_bytes_left -= src_bytes_used; | 
 | 326 |     *dst_buf += dst_bytes_used; | 
 | 327 |     *dst_bytes_left -= dst_bytes_used; | 
 | 328 |     return true; | 
 | 329 |   } | 
 | 330 |  | 
 | 331 |   int Done() { | 
 | 332 |     if (mode == TRANSLIT) return replacement_count; | 
 | 333 |     if (ignored) { | 
 | 334 |       errno = EILSEQ; | 
 | 335 |       return -1; | 
 | 336 |     } | 
 | 337 |     return 0; | 
 | 338 |   } | 
 | 339 | }; | 
 | 340 |  | 
 | 341 | iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) { | 
 | 342 |   iconv_t result = new __iconv_t; | 
 | 343 |   if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) || | 
 | 344 |       !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) { | 
 | 345 |     delete result; | 
 | 346 |     errno = EINVAL; | 
 | 347 |     return INVALID_ICONV_T; | 
 | 348 |   } | 
 | 349 |   return result; | 
 | 350 | } | 
 | 351 |  | 
 | 352 | size_t iconv(iconv_t __converter, | 
 | 353 |              char** __src_buf, size_t* __src_bytes_left, | 
 | 354 |              char** __dst_buf, size_t* __dst_bytes_left) { | 
 | 355 |   if (__converter == INVALID_ICONV_T) { | 
 | 356 |     errno = EBADF; | 
 | 357 |     return -1; | 
 | 358 |   } | 
| Elliott Hughes | 20c023f | 2021-02-18 10:37:22 -0800 | [diff] [blame] | 359 |  | 
 | 360 |   // Since none of our encodings are stateful, state flushing is a no-op. | 
 | 361 |   if (!__src_buf) return 0; | 
 | 362 |  | 
| Elliott Hughes | a648733 | 2017-08-15 23:16:48 -0700 | [diff] [blame] | 363 |   return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left); | 
 | 364 | } | 
 | 365 |  | 
 | 366 | int iconv_close(iconv_t __converter) { | 
 | 367 |   if (__converter == INVALID_ICONV_T) { | 
 | 368 |     errno = EBADF; | 
 | 369 |     return -1; | 
 | 370 |   } | 
 | 371 |   delete __converter; | 
 | 372 |   return 0; | 
 | 373 | } |