blob: b0372a1eb260a85a38892b00e2fb9d8d2cd1a08f [file] [log] [blame]
/*
 * Copyright (C) 2017 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
#include <iconv.h>

#include <ctype.h>
#include <endian.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <uchar.h>

#include "private/bionic_mbstate.h"
38
// The value iconv_open() returns on failure, and that iconv()/iconv_close() reject with EBADF.
#define INVALID_ICONV_T (reinterpret_cast<iconv_t>(-1))
40
// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
//
// The encodings that iconv_open() accepts, for either side of a conversion.
enum Encoding {
  US_ASCII,   // One byte per character; only 0x00-0x7f are valid.
  UTF_8,      // Decoded with mbrtoc32(), encoded with c32rtomb().
  UTF_16_LE,  // 16-bit units, least significant byte first; surrogate pairs above the BMP.
  UTF_16_BE,  // 16-bit units, most significant byte first; surrogate pairs above the BMP.
  UTF_32_LE,  // One 32-bit unit per character, least significant byte first.
  UTF_32_BE,  // One 32-bit unit per character, most significant byte first.
  WCHAR_T,    // Handled exactly like UTF_32_LE by the converter.
};
53
// What the converter does with a character it cannot decode or cannot represent in the
// destination encoding.
enum Mode {
  ERROR,     // Stop and fail the conversion. This is the default.
  IGNORE,    // Drop the character and carry on ("//IGNORE" suffix on the encoding name).
  TRANSLIT,  // Substitute '?' and count the substitution ("//TRANSLIT" suffix).
};
59
// Returns true if the caller-supplied encoding name `lhs` names the encoding whose canonical
// spelling is `rhs`. `rhs` must already be in canonical form: lowercase alphanumerics only,
// with no leading zeros in numbers (for example "utf16le").
//
// This matching is strange but true.
// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // Whether the last character we kept from lhs was a digit. Punctuation that gets skipped
  // does not reset this, because the rule applies to the name with punctuation removed.
  bool after_digit = false;
  while (*lhs && *rhs) {
    // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
    // Also implement the "delete each 0 that is not preceded by a digit" rule, so "UTF-08"
    // is "utf8" but the 0 in "UTF-106" is significant. A 0 that is not followed by a
    // digit is always kept.
    for (; *lhs; ++lhs) {
      // <ctype.h> functions require an unsigned char value; plain char may be negative.
      const unsigned char c = *lhs;
      if (!isalnum(c)) continue;
      const unsigned char next = lhs[1];  // Safe: *lhs is not the terminator here.
      if (c == '0' && !after_digit && isdigit(next)) continue;
      break;
    }
    // Case doesn't matter either.
    const unsigned char l = *lhs;
    const unsigned char r = *rhs;
    if (tolower(l) != tolower(r)) break;
    after_digit = (isdigit(l) != 0);
    ++lhs;
    ++rhs;
  }
  // As a special case we treat the GNU "//" extensions as end of string.
  const bool lhs_done = (*lhs == '\0') || (lhs[0] == '/' && lhs[1] == '/');
  return lhs_done && *rhs == '\0';
}
78
79static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
80 const char* suffix = strstr(s, "//");
81 if (suffix) {
82 if (!mode) return false;
83 if (strcmp(suffix, "//IGNORE") == 0) {
84 *mode = IGNORE;
85 } else if (strcmp(suffix, "//TRANSLIT") == 0) {
86 *mode = TRANSLIT;
87 } else {
88 return false;
89 }
90 }
91 if (__match_encoding(s, "utf8")) {
92 *encoding = UTF_8;
93 } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
94 *encoding = US_ASCII;
95 } else if (__match_encoding(s, "utf16le")) {
96 *encoding = UTF_16_LE;
97 } else if (__match_encoding(s, "utf16be")) {
98 *encoding = UTF_16_BE;
99 } else if (__match_encoding(s, "utf32le")) {
100 *encoding = UTF_32_LE;
101 } else if (__match_encoding(s, "utf32be")) {
102 *encoding = UTF_32_BE;
103 } else if (__match_encoding(s, "wchart")) {
104 *encoding = WCHAR_T;
105 } else {
106 return false;
107 }
108 return true;
109}
110
111struct __iconv_t {
112 Encoding src_encoding;
113 Encoding dst_encoding;
114 Mode mode;
115
116 __iconv_t() : mode(ERROR) {
117 }
118
119 int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
120 // Reset state.
121 wc = 0;
122 memset(&ps, 0, sizeof(ps));
123 replacement_count = 0;
124 ignored = false;
125 src_buf = src_buf0;
126 src_bytes_left = src_bytes_left0;
127 dst_buf = dst_buf0;
128 dst_bytes_left = dst_bytes_left0;
129
130 while (*src_bytes_left > 0) {
131 if (!GetNext() || !Convert()) return -1;
132 }
133 return Done();
134 }
135
136 private:
137 char32_t wc;
138 char buf[16];
139 size_t src_bytes_used;
140 size_t dst_bytes_used;
141 mbstate_t ps;
142
143 size_t replacement_count;
144 bool ignored;
145
146 char** src_buf;
147 size_t* src_bytes_left;
148 char** dst_buf;
149 size_t* dst_bytes_left;
150
151 bool GetNext() {
152 errno = 0;
153 switch (src_encoding) {
154 case US_ASCII:
155 wc = **src_buf;
156 src_bytes_used = 1;
157 if (wc > 0x7f) errno = EILSEQ;
158 break;
159
160 case UTF_8:
161 src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
162 if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
163 break; // EILSEQ already set.
164 } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
165 errno = EINVAL;
166 return false;
167 }
168 break;
169
170 case UTF_16_BE:
171 case UTF_16_LE: {
172 if (*src_bytes_left < 2) {
173 errno = EINVAL;
174 return false;
175 }
176 bool swap = (src_encoding == UTF_16_BE);
177 wc = In16(*src_buf, swap);
178 // 0xd800-0xdbff: high surrogates
179 // 0xdc00-0xdfff: low surrogates
180 if (wc >= 0xd800 && wc <= 0xdfff) {
181 if (wc >= 0xdc00) { // Low surrogate before high surrogate.
182 errno = EILSEQ;
183 return false;
184 }
185 if (*src_bytes_left < 4) {
186 errno = EINVAL;
187 return false;
188 }
189 uint16_t hi = wc;
190 uint16_t lo = In16(*src_buf + 2, swap);
191 wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
192 src_bytes_used = 4;
193 }
194 break;
195 }
196
197 case UTF_32_BE:
198 case UTF_32_LE:
199 case WCHAR_T:
200 if (*src_bytes_left < 4) {
201 errno = EINVAL;
202 return false;
203 }
204 wc = In32(*src_buf, (src_encoding == UTF_32_BE));
205 break;
206 }
207
208 if (errno == EILSEQ) {
209 switch (mode) {
210 case ERROR:
211 return false;
212 case IGNORE:
213 *src_buf += src_bytes_used;
214 *src_bytes_left -= src_bytes_used;
215 ignored = true;
216 return GetNext();
217 case TRANSLIT:
218 wc = '?';
219 ++replacement_count;
220 return true;
221 }
222 }
223 return true;
224 }
225
226 bool Convert() {
227 errno = 0;
228 switch (dst_encoding) {
229 case US_ASCII:
230 buf[0] = wc;
231 dst_bytes_used = 1;
232 if (wc > 0x7f) errno = EILSEQ;
233 break;
234
235 case UTF_8:
236 dst_bytes_used = c32rtomb(buf, wc, &ps);
237 if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
238 break; // EILSEQ already set.
239 } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
240 errno = EINVAL;
241 return false;
242 }
243 break;
244
245 case UTF_16_BE:
246 case UTF_16_LE: {
247 bool swap = (dst_encoding == UTF_16_BE);
248 if (wc < 0x10000) { // BMP.
249 Out16(buf, wc, swap);
250 } else { // Supplementary plane; output surrogate pair.
251 wc -= 0x10000;
252 char16_t hi = 0xd800 | (wc >> 10);
253 char16_t lo = 0xdc00 | (wc & 0x3ff);
254 Out16(buf + 0, hi, swap);
255 Out16(buf + 2, lo, swap);
256 dst_bytes_used = 4;
257 }
258 } break;
259
260 case UTF_32_BE:
261 case UTF_32_LE:
262 case WCHAR_T:
263 Out32(wc, (dst_encoding == UTF_32_BE));
264 break;
265 }
266
267 if (errno == EILSEQ) {
268 if (mode == IGNORE) {
269 *src_buf += src_bytes_used;
270 *src_bytes_left -= src_bytes_used;
271 ignored = true;
272 return true;
273 } else if (mode == TRANSLIT) {
274 wc = '?';
275 ++replacement_count;
276 return Convert();
277 }
278 return false;
279 }
280
281 return Emit();
282 }
283
284 uint16_t In16(const char* buf, bool swap) {
285 const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
286 uint16_t wc = (src[0]) | (src[1] << 8);
287 if (swap) wc = __swap16(wc);
288 src_bytes_used = 2;
289 return wc;
290 }
291
292 uint32_t In32(const char* buf, bool swap) {
293 const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
294 uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24);
295 if (swap) wc = __swap32(wc);
296 src_bytes_used = 4;
297 return wc;
298 }
299
300 void Out16(char* dst, char16_t ch, bool swap) {
301 if (swap) ch = __swap16(ch);
302 dst[0] = ch;
303 dst[1] = ch >> 8;
304 dst_bytes_used = 2;
305 }
306
307 void Out32(char32_t ch, bool swap) {
308 if (swap) ch = __swap32(ch);
309 buf[0] = ch;
310 buf[1] = ch >> 8;
311 buf[2] = ch >> 16;
312 buf[3] = ch >> 24;
313 dst_bytes_used = 4;
314 }
315
316 bool Emit() {
317 if (dst_bytes_used > *dst_bytes_left) {
318 errno = E2BIG;
319 return false;
320 }
321
322 memcpy(*dst_buf, buf, dst_bytes_used);
323 *src_buf += src_bytes_used;
324 *src_bytes_left -= src_bytes_used;
325 *dst_buf += dst_bytes_used;
326 *dst_bytes_left -= dst_bytes_used;
327 return true;
328 }
329
330 int Done() {
331 if (mode == TRANSLIT) return replacement_count;
332 if (ignored) {
333 errno = EILSEQ;
334 return -1;
335 }
336 return 0;
337 }
338};
339
340iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
341 iconv_t result = new __iconv_t;
342 if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) ||
343 !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
344 delete result;
345 errno = EINVAL;
346 return INVALID_ICONV_T;
347 }
348 return result;
349}
350
351size_t iconv(iconv_t __converter,
352 char** __src_buf, size_t* __src_bytes_left,
353 char** __dst_buf, size_t* __dst_bytes_left) {
354 if (__converter == INVALID_ICONV_T) {
355 errno = EBADF;
356 return -1;
357 }
358 return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
359}
360
361int iconv_close(iconv_t __converter) {
362 if (__converter == INVALID_ICONV_T) {
363 errno = EBADF;
364 return -1;
365 }
366 delete __converter;
367 return 0;
368}