Blame - libc/bionic/iconv.cpp - android_bionic

blob: b0372a1eb260a85a38892b00e2fb9d8d2cd1a08f [file] [log] [blame]

Elliott Hughes	a648733	2017-08-15 23:16:48 -0700	[diff] [blame^]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	* All rights reserved.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions
				7	* are met:
				8	* * Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* * Redistributions in binary form must reproduce the above copyright
				11	* notice, this list of conditions and the following disclaimer in
				12	* the documentation and/or other materials provided with the
				13	* distribution.
				14	*
				15	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				16	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				17	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
				18	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
				19	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
				20	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
				21	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
				22	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
				23	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
				24	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
				25	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
				26	* SUCH DAMAGE.
				27	*/
				28
				29	#include <iconv.h>
				30
				31	#include <ctype.h>
				32	#include <endian.h>
				33	#include <errno.h>
				34	#include <stdlib.h>
				35	#include <uchar.h>
				36
				37	#include "private/bionic_mbstate.h"
				38
				39	#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
				40
				41	// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
				42	// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
				43	// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
				44	enum Encoding {
				45	US_ASCII,
				46	UTF_8,
				47	UTF_16_LE,
				48	UTF_16_BE,
				49	UTF_32_LE,
				50	UTF_32_BE,
				51	WCHAR_T,
				52	};
				53
				54	enum Mode {
				55	ERROR,
				56	IGNORE,
				57	TRANSLIT,
				58	};
				59
				60	// This matching is strange but true.
				61	// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
				62	static bool __match_encoding(const char* lhs, const char* rhs) {
				63	while (lhs && rhs) {
				64	// Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
				65	// Also implement the "delete each 0 that is not preceded by a digit" rule.
				66	for (; *lhs; ++lhs) {
				67	if (isalnum(lhs) && (lhs != '0' \|\| !isdigit(*(lhs + 1)))) break;
				68	}
				69	// Case doesn't matter either.
				70	if (tolower(lhs) != tolower(rhs)) break;
				71	++lhs;
				72	++rhs;
				73	}
				74	// As a special case we treat the GNU "//" extensions as end of string.
				75	if ((lhs == '\0' \|\| strstr(lhs, "//") == lhs) && rhs == '\0') return true;
				76	return false;
				77	}
				78
				79	static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
				80	const char* suffix = strstr(s, "//");
				81	if (suffix) {
				82	if (!mode) return false;
				83	if (strcmp(suffix, "//IGNORE") == 0) {
				84	*mode = IGNORE;
				85	} else if (strcmp(suffix, "//TRANSLIT") == 0) {
				86	*mode = TRANSLIT;
				87	} else {
				88	return false;
				89	}
				90	}
				91	if (__match_encoding(s, "utf8")) {
				92	*encoding = UTF_8;
				93	} else if (__match_encoding(s, "ascii") \|\| __match_encoding(s, "usascii")) {
				94	*encoding = US_ASCII;
				95	} else if (__match_encoding(s, "utf16le")) {
				96	*encoding = UTF_16_LE;
				97	} else if (__match_encoding(s, "utf16be")) {
				98	*encoding = UTF_16_BE;
				99	} else if (__match_encoding(s, "utf32le")) {
				100	*encoding = UTF_32_LE;
				101	} else if (__match_encoding(s, "utf32be")) {
				102	*encoding = UTF_32_BE;
				103	} else if (__match_encoding(s, "wchart")) {
				104	*encoding = WCHAR_T;
				105	} else {
				106	return false;
				107	}
				108	return true;
				109	}
				110
				111	struct __iconv_t {
				112	Encoding src_encoding;
				113	Encoding dst_encoding;
				114	Mode mode;
				115
				116	__iconv_t() : mode(ERROR) {
				117	}
				118
				119	int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
				120	// Reset state.
				121	wc = 0;
				122	memset(&ps, 0, sizeof(ps));
				123	replacement_count = 0;
				124	ignored = false;
				125	src_buf = src_buf0;
				126	src_bytes_left = src_bytes_left0;
				127	dst_buf = dst_buf0;
				128	dst_bytes_left = dst_bytes_left0;
				129
				130	while (*src_bytes_left > 0) {
				131	if (!GetNext() \|\| !Convert()) return -1;
				132	}
				133	return Done();
				134	}
				135
				136	private:
				137	char32_t wc;
				138	char buf[16];
				139	size_t src_bytes_used;
				140	size_t dst_bytes_used;
				141	mbstate_t ps;
				142
				143	size_t replacement_count;
				144	bool ignored;
				145
				146	char** src_buf;
				147	size_t* src_bytes_left;
				148	char** dst_buf;
				149	size_t* dst_bytes_left;
				150
				151	bool GetNext() {
				152	errno = 0;
				153	switch (src_encoding) {
				154	case US_ASCII:
				155	wc = **src_buf;
				156	src_bytes_used = 1;
				157	if (wc > 0x7f) errno = EILSEQ;
				158	break;
				159
				160	case UTF_8:
				161	src_bytes_used = mbrtoc32(&wc, src_buf, src_bytes_left, &ps);
				162	if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
				163	break; // EILSEQ already set.
				164	} else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
				165	errno = EINVAL;
				166	return false;
				167	}
				168	break;
				169
				170	case UTF_16_BE:
				171	case UTF_16_LE: {
				172	if (*src_bytes_left < 2) {
				173	errno = EINVAL;
				174	return false;
				175	}
				176	bool swap = (src_encoding == UTF_16_BE);
				177	wc = In16(*src_buf, swap);
				178	// 0xd800-0xdbff: high surrogates
				179	// 0xdc00-0xdfff: low surrogates
				180	if (wc >= 0xd800 && wc <= 0xdfff) {
				181	if (wc >= 0xdc00) { // Low surrogate before high surrogate.
				182	errno = EILSEQ;
				183	return false;
				184	}
				185	if (*src_bytes_left < 4) {
				186	errno = EINVAL;
				187	return false;
				188	}
				189	uint16_t hi = wc;
				190	uint16_t lo = In16(*src_buf + 2, swap);
				191	wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
				192	src_bytes_used = 4;
				193	}
				194	break;
				195	}
				196
				197	case UTF_32_BE:
				198	case UTF_32_LE:
				199	case WCHAR_T:
				200	if (*src_bytes_left < 4) {
				201	errno = EINVAL;
				202	return false;
				203	}
				204	wc = In32(*src_buf, (src_encoding == UTF_32_BE));
				205	break;
				206	}
				207
				208	if (errno == EILSEQ) {
				209	switch (mode) {
				210	case ERROR:
				211	return false;
				212	case IGNORE:
				213	*src_buf += src_bytes_used;
				214	*src_bytes_left -= src_bytes_used;
				215	ignored = true;
				216	return GetNext();
				217	case TRANSLIT:
				218	wc = '?';
				219	++replacement_count;
				220	return true;
				221	}
				222	}
				223	return true;
				224	}
				225
				226	bool Convert() {
				227	errno = 0;
				228	switch (dst_encoding) {
				229	case US_ASCII:
				230	buf[0] = wc;
				231	dst_bytes_used = 1;
				232	if (wc > 0x7f) errno = EILSEQ;
				233	break;
				234
				235	case UTF_8:
				236	dst_bytes_used = c32rtomb(buf, wc, &ps);
				237	if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
				238	break; // EILSEQ already set.
				239	} else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
				240	errno = EINVAL;
				241	return false;
				242	}
				243	break;
				244
				245	case UTF_16_BE:
				246	case UTF_16_LE: {
				247	bool swap = (dst_encoding == UTF_16_BE);
				248	if (wc < 0x10000) { // BMP.
				249	Out16(buf, wc, swap);
				250	} else { // Supplementary plane; output surrogate pair.
				251	wc -= 0x10000;
				252	char16_t hi = 0xd800 \| (wc >> 10);
				253	char16_t lo = 0xdc00 \| (wc & 0x3ff);
				254	Out16(buf + 0, hi, swap);
				255	Out16(buf + 2, lo, swap);
				256	dst_bytes_used = 4;
				257	}
				258	} break;
				259
				260	case UTF_32_BE:
				261	case UTF_32_LE:
				262	case WCHAR_T:
				263	Out32(wc, (dst_encoding == UTF_32_BE));
				264	break;
				265	}
				266
				267	if (errno == EILSEQ) {
				268	if (mode == IGNORE) {
				269	*src_buf += src_bytes_used;
				270	*src_bytes_left -= src_bytes_used;
				271	ignored = true;
				272	return true;
				273	} else if (mode == TRANSLIT) {
				274	wc = '?';
				275	++replacement_count;
				276	return Convert();
				277	}
				278	return false;
				279	}
				280
				281	return Emit();
				282	}
				283
				284	uint16_t In16(const char* buf, bool swap) {
				285	const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
				286	uint16_t wc = (src[0]) \| (src[1] << 8);
				287	if (swap) wc = __swap16(wc);
				288	src_bytes_used = 2;
				289	return wc;
				290	}
				291
				292	uint32_t In32(const char* buf, bool swap) {
				293	const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
				294	uint32_t wc = (src[0]) \| (src[1] << 8) \| (src[2] << 16) \| (src[3] << 24);
				295	if (swap) wc = __swap32(wc);
				296	src_bytes_used = 4;
				297	return wc;
				298	}
				299
				300	void Out16(char* dst, char16_t ch, bool swap) {
				301	if (swap) ch = __swap16(ch);
				302	dst[0] = ch;
				303	dst[1] = ch >> 8;
				304	dst_bytes_used = 2;
				305	}
				306
				307	void Out32(char32_t ch, bool swap) {
				308	if (swap) ch = __swap32(ch);
				309	buf[0] = ch;
				310	buf[1] = ch >> 8;
				311	buf[2] = ch >> 16;
				312	buf[3] = ch >> 24;
				313	dst_bytes_used = 4;
				314	}
				315
				316	bool Emit() {
				317	if (dst_bytes_used > *dst_bytes_left) {
				318	errno = E2BIG;
				319	return false;
				320	}
				321
				322	memcpy(*dst_buf, buf, dst_bytes_used);
				323	*src_buf += src_bytes_used;
				324	*src_bytes_left -= src_bytes_used;
				325	*dst_buf += dst_bytes_used;
				326	*dst_bytes_left -= dst_bytes_used;
				327	return true;
				328	}
				329
				330	int Done() {
				331	if (mode == TRANSLIT) return replacement_count;
				332	if (ignored) {
				333	errno = EILSEQ;
				334	return -1;
				335	}
				336	return 0;
				337	}
				338	};
				339
				340	iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
				341	iconv_t result = new __iconv_t;
				342	if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) \|\|
				343	!__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
				344	delete result;
				345	errno = EINVAL;
				346	return INVALID_ICONV_T;
				347	}
				348	return result;
				349	}
				350
				351	size_t iconv(iconv_t __converter,
				352	char** __src_buf, size_t* __src_bytes_left,
				353	char** __dst_buf, size_t* __dst_bytes_left) {
				354	if (__converter == INVALID_ICONV_T) {
				355	errno = EBADF;
				356	return -1;
				357	}
				358	return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
				359	}
				360
				361	int iconv_close(iconv_t __converter) {
				362	if (__converter == INVALID_ICONV_T) {
				363	errno = EBADF;
				364	return -1;
				365	}
				366	delete __converter;
				367	return 0;
				368	}