Blame - libc/bionic/iconv.cpp - android_bionic

blob: 015d70f5ea369cbf17bec29dce090b1471bfa6f4 [file] [log] [blame]

Elliott Hughes	a648733	2017-08-15 23:16:48 -0700	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	* All rights reserved.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions
				7	* are met:
				8	* * Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* * Redistributions in binary form must reproduce the above copyright
				11	* notice, this list of conditions and the following disclaimer in
				12	* the documentation and/or other materials provided with the
				13	* distribution.
				14	*
				15	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				16	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				17	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
				18	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
				19	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
				20	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
				21	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
				22	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
				23	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
				24	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
				25	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
				26	* SUCH DAMAGE.
				27	*/
				28
				29	#include <iconv.h>
				30
				31	#include <ctype.h>
				32	#include <endian.h>
				33	#include <errno.h>
				34	#include <stdlib.h>
Dan Albert	1c78cb0	2017-10-11 11:25:25 -0700	[diff] [blame]	35	#include <string.h>
Elliott Hughes	a648733	2017-08-15 23:16:48 -0700	[diff] [blame]	36	#include <uchar.h>
				37
				38	#include "private/bionic_mbstate.h"
				39
				40	#define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
				41
				42	// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
				43	// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
				44	// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
				45	enum Encoding {
				46	US_ASCII,
				47	UTF_8,
				48	UTF_16_LE,
				49	UTF_16_BE,
				50	UTF_32_LE,
				51	UTF_32_BE,
				52	WCHAR_T,
				53	};
				54
				55	enum Mode {
				56	ERROR,
				57	IGNORE,
				58	TRANSLIT,
				59	};
				60
				61	// This matching is strange but true.
				62	// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
				63	static bool __match_encoding(const char* lhs, const char* rhs) {
				64	while (lhs && rhs) {
				65	// Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
				66	// Also implement the "delete each 0 that is not preceded by a digit" rule.
				67	for (; *lhs; ++lhs) {
				68	if (isalnum(lhs) && (lhs != '0' \|\| !isdigit(*(lhs + 1)))) break;
				69	}
				70	// Case doesn't matter either.
				71	if (tolower(lhs) != tolower(rhs)) break;
				72	++lhs;
				73	++rhs;
				74	}
				75	// As a special case we treat the GNU "//" extensions as end of string.
				76	if ((lhs == '\0' \|\| strstr(lhs, "//") == lhs) && rhs == '\0') return true;
				77	return false;
				78	}
				79
				80	static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
				81	const char* suffix = strstr(s, "//");
				82	if (suffix) {
				83	if (!mode) return false;
				84	if (strcmp(suffix, "//IGNORE") == 0) {
				85	*mode = IGNORE;
				86	} else if (strcmp(suffix, "//TRANSLIT") == 0) {
				87	*mode = TRANSLIT;
				88	} else {
				89	return false;
				90	}
				91	}
				92	if (__match_encoding(s, "utf8")) {
				93	*encoding = UTF_8;
				94	} else if (__match_encoding(s, "ascii") \|\| __match_encoding(s, "usascii")) {
				95	*encoding = US_ASCII;
				96	} else if (__match_encoding(s, "utf16le")) {
				97	*encoding = UTF_16_LE;
				98	} else if (__match_encoding(s, "utf16be")) {
				99	*encoding = UTF_16_BE;
				100	} else if (__match_encoding(s, "utf32le")) {
				101	*encoding = UTF_32_LE;
				102	} else if (__match_encoding(s, "utf32be")) {
				103	*encoding = UTF_32_BE;
				104	} else if (__match_encoding(s, "wchart")) {
				105	*encoding = WCHAR_T;
				106	} else {
				107	return false;
				108	}
				109	return true;
				110	}
				111
				112	struct __iconv_t {
				113	Encoding src_encoding;
				114	Encoding dst_encoding;
				115	Mode mode;
				116
				117	__iconv_t() : mode(ERROR) {
				118	}
				119
				120	int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
				121	// Reset state.
				122	wc = 0;
				123	memset(&ps, 0, sizeof(ps));
				124	replacement_count = 0;
				125	ignored = false;
				126	src_buf = src_buf0;
				127	src_bytes_left = src_bytes_left0;
				128	dst_buf = dst_buf0;
				129	dst_bytes_left = dst_bytes_left0;
				130
				131	while (*src_bytes_left > 0) {
				132	if (!GetNext() \|\| !Convert()) return -1;
				133	}
				134	return Done();
				135	}
				136
				137	private:
				138	char32_t wc;
				139	char buf[16];
				140	size_t src_bytes_used;
				141	size_t dst_bytes_used;
				142	mbstate_t ps;
				143
				144	size_t replacement_count;
				145	bool ignored;
				146
				147	char** src_buf;
				148	size_t* src_bytes_left;
				149	char** dst_buf;
				150	size_t* dst_bytes_left;
				151
				152	bool GetNext() {
				153	errno = 0;
				154	switch (src_encoding) {
				155	case US_ASCII:
				156	wc = **src_buf;
				157	src_bytes_used = 1;
				158	if (wc > 0x7f) errno = EILSEQ;
				159	break;
				160
				161	case UTF_8:
				162	src_bytes_used = mbrtoc32(&wc, src_buf, src_bytes_left, &ps);
				163	if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
				164	break; // EILSEQ already set.
				165	} else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
				166	errno = EINVAL;
				167	return false;
				168	}
				169	break;
				170
				171	case UTF_16_BE:
				172	case UTF_16_LE: {
				173	if (*src_bytes_left < 2) {
				174	errno = EINVAL;
				175	return false;
				176	}
				177	bool swap = (src_encoding == UTF_16_BE);
				178	wc = In16(*src_buf, swap);
				179	// 0xd800-0xdbff: high surrogates
				180	// 0xdc00-0xdfff: low surrogates
				181	if (wc >= 0xd800 && wc <= 0xdfff) {
				182	if (wc >= 0xdc00) { // Low surrogate before high surrogate.
				183	errno = EILSEQ;
				184	return false;
				185	}
				186	if (*src_bytes_left < 4) {
				187	errno = EINVAL;
				188	return false;
				189	}
				190	uint16_t hi = wc;
				191	uint16_t lo = In16(*src_buf + 2, swap);
				192	wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
				193	src_bytes_used = 4;
				194	}
				195	break;
				196	}
				197
				198	case UTF_32_BE:
				199	case UTF_32_LE:
				200	case WCHAR_T:
				201	if (*src_bytes_left < 4) {
				202	errno = EINVAL;
				203	return false;
				204	}
				205	wc = In32(*src_buf, (src_encoding == UTF_32_BE));
				206	break;
				207	}
				208
				209	if (errno == EILSEQ) {
				210	switch (mode) {
				211	case ERROR:
				212	return false;
				213	case IGNORE:
				214	*src_buf += src_bytes_used;
				215	*src_bytes_left -= src_bytes_used;
				216	ignored = true;
				217	return GetNext();
				218	case TRANSLIT:
				219	wc = '?';
				220	++replacement_count;
				221	return true;
				222	}
				223	}
				224	return true;
				225	}
				226
				227	bool Convert() {
				228	errno = 0;
				229	switch (dst_encoding) {
				230	case US_ASCII:
				231	buf[0] = wc;
				232	dst_bytes_used = 1;
				233	if (wc > 0x7f) errno = EILSEQ;
				234	break;
				235
				236	case UTF_8:
				237	dst_bytes_used = c32rtomb(buf, wc, &ps);
				238	if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
				239	break; // EILSEQ already set.
				240	} else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
				241	errno = EINVAL;
				242	return false;
				243	}
				244	break;
				245
				246	case UTF_16_BE:
				247	case UTF_16_LE: {
				248	bool swap = (dst_encoding == UTF_16_BE);
				249	if (wc < 0x10000) { // BMP.
				250	Out16(buf, wc, swap);
				251	} else { // Supplementary plane; output surrogate pair.
				252	wc -= 0x10000;
				253	char16_t hi = 0xd800 \| (wc >> 10);
				254	char16_t lo = 0xdc00 \| (wc & 0x3ff);
				255	Out16(buf + 0, hi, swap);
				256	Out16(buf + 2, lo, swap);
				257	dst_bytes_used = 4;
				258	}
				259	} break;
				260
				261	case UTF_32_BE:
				262	case UTF_32_LE:
				263	case WCHAR_T:
				264	Out32(wc, (dst_encoding == UTF_32_BE));
				265	break;
				266	}
				267
				268	if (errno == EILSEQ) {
				269	if (mode == IGNORE) {
				270	*src_buf += src_bytes_used;
				271	*src_bytes_left -= src_bytes_used;
				272	ignored = true;
				273	return true;
				274	} else if (mode == TRANSLIT) {
				275	wc = '?';
				276	++replacement_count;
				277	return Convert();
				278	}
				279	return false;
				280	}
				281
				282	return Emit();
				283	}
				284
				285	uint16_t In16(const char* buf, bool swap) {
				286	const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
				287	uint16_t wc = (src[0]) \| (src[1] << 8);
				288	if (swap) wc = __swap16(wc);
				289	src_bytes_used = 2;
				290	return wc;
				291	}
				292
				293	uint32_t In32(const char* buf, bool swap) {
				294	const uint8_t* src = reinterpret_cast<const uint8_t*>(buf);
				295	uint32_t wc = (src[0]) \| (src[1] << 8) \| (src[2] << 16) \| (src[3] << 24);
				296	if (swap) wc = __swap32(wc);
				297	src_bytes_used = 4;
				298	return wc;
				299	}
				300
				301	void Out16(char* dst, char16_t ch, bool swap) {
				302	if (swap) ch = __swap16(ch);
				303	dst[0] = ch;
				304	dst[1] = ch >> 8;
				305	dst_bytes_used = 2;
				306	}
				307
				308	void Out32(char32_t ch, bool swap) {
				309	if (swap) ch = __swap32(ch);
				310	buf[0] = ch;
				311	buf[1] = ch >> 8;
				312	buf[2] = ch >> 16;
				313	buf[3] = ch >> 24;
				314	dst_bytes_used = 4;
				315	}
				316
				317	bool Emit() {
				318	if (dst_bytes_used > *dst_bytes_left) {
				319	errno = E2BIG;
				320	return false;
				321	}
				322
				323	memcpy(*dst_buf, buf, dst_bytes_used);
				324	*src_buf += src_bytes_used;
				325	*src_bytes_left -= src_bytes_used;
				326	*dst_buf += dst_bytes_used;
				327	*dst_bytes_left -= dst_bytes_used;
				328	return true;
				329	}
				330
				331	int Done() {
				332	if (mode == TRANSLIT) return replacement_count;
				333	if (ignored) {
				334	errno = EILSEQ;
				335	return -1;
				336	}
				337	return 0;
				338	}
				339	};
				340
				341	iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
				342	iconv_t result = new __iconv_t;
				343	if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) \|\|
				344	!__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
				345	delete result;
				346	errno = EINVAL;
				347	return INVALID_ICONV_T;
				348	}
				349	return result;
				350	}
				351
				352	size_t iconv(iconv_t __converter,
				353	char** __src_buf, size_t* __src_bytes_left,
				354	char** __dst_buf, size_t* __dst_bytes_left) {
				355	if (__converter == INVALID_ICONV_T) {
				356	errno = EBADF;
				357	return -1;
				358	}
				359	return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
				360	}
				361
				362	int iconv_close(iconv_t __converter) {
				363	if (__converter == INVALID_ICONV_T) {
				364	errno = EBADF;
				365	return -1;
				366	}
				367	delete __converter;
				368	return 0;
				369	}