Blame - libutils/Unicode.cpp - android_system_core

blob: 6e31ce48d5e6082f9bb97a28943aeece3d37b50c [file] [log] [blame]

Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2005 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	17	#include <log/log.h>
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	18	#include <utils/Unicode.h>
				19
				20	#include <stddef.h>
				21
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	22	#include <string>
				23	#include <sstream>
				24
// Byte-order helpers. Windows builds do not pull in <netinet/in.h>, so local
// replacements are defined here; all other platforms use the system header.
#if defined(_WIN32)
// Drop any earlier definitions first so the replacements below do not clash.
# undef ntohl
# undef htonl
# undef ntohs
# undef htons

// These always swap bytes, i.e. they assume a little-endian host. The
// argument is evaluated several times, so it must not have side effects.
# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) )
# define htonl(x) ntohl(x)
# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) )
# define htons(x) ntohs(x)
#else
# include <netinet/in.h>
#endif
				38
				39	extern "C" {
				40
				41	static const char32_t kByteMask = 0x000000BF;
				42	static const char32_t kByteMark = 0x00000080;
				43
				44	// Surrogates aren't valid for UTF-32 characters, so define some
				45	// constants that will let us screen them out.
				46	static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
Andreas Gampe	a53c815	2014-11-24 09:42:07 -0800	[diff] [blame]	47	// Unused, here for completeness:
				48	// static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
				49	// static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	50	static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
				51	static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
				52	static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
				53	static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
				54
				55	// Mask used to set appropriate bits in first byte of UTF-8 sequence,
				56	// indexed by number of bytes in the sequence.
				57	// 0xxxxxxx
				58	// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
				59	// 110yyyyx 10xxxxxx
				60	// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
				61	// 1110yyyy 10yxxxxx 10xxxxxx
				62	// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
				63	// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
				64	// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
				65	static const char32_t kFirstByteMark[] = {
				66	0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
				67	};
				68
				69	// --------------------------------------------------------------------------
				70	// UTF-32
				71	// --------------------------------------------------------------------------
				72
				73	/**
				74	* Return number of UTF-8 bytes required for the character. If the character
				75	* is invalid, return size of 0.
				76	*/
				77	static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
				78	{
				79	// Figure out how many bytes the result will require.
				80	if (srcChar < 0x00000080) {
				81	return 1;
				82	} else if (srcChar < 0x00000800) {
				83	return 2;
				84	} else if (srcChar < 0x00010000) {
				85	if ((srcChar < kUnicodeSurrogateStart) \|\| (srcChar > kUnicodeSurrogateEnd)) {
				86	return 3;
				87	} else {
				88	// Surrogates are invalid UTF-32 characters.
				89	return 0;
				90	}
				91	}
				92	// Max code point for Unicode is 0x0010FFFF.
				93	else if (srcChar <= kUnicodeMaxCodepoint) {
				94	return 4;
				95	} else {
				96	// Invalid UTF-32 character.
				97	return 0;
				98	}
				99	}
				100
				101	// Write out the source character to <dstP>.
				102
				103	static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
				104	{
				105	dstP += bytes;
				106	switch (bytes)
				107	{ /* note: everything falls through. */
				108	case 4: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				109	case 3: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				110	case 2: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				111	case 1: *--dstP = (uint8_t)(srcChar \| kFirstByteMark[bytes]);
				112	}
				113	}
				114
				115	size_t strlen32(const char32_t *s)
				116	{
				117	const char32_t *ss = s;
				118	while ( *ss )
				119	ss++;
				120	return ss-s;
				121	}
				122
				123	size_t strnlen32(const char32_t *s, size_t maxlen)
				124	{
				125	const char32_t *ss = s;
				126	while ((maxlen > 0) && *ss) {
				127	ss++;
				128	maxlen--;
				129	}
				130	return ss-s;
				131	}
				132
				133	static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
				134	{
				135	const char first_char = *cur;
				136	if ((first_char & 0x80) == 0) { // ASCII
				137	*num_read = 1;
				138	return *cur;
				139	}
				140	cur++;
				141	char32_t mask, to_ignore_mask;
				142	size_t num_to_read = 0;
				143	char32_t utf32 = first_char;
				144	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
				145	(first_char & mask);
				146	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				147	// 0x3F == 00111111
				148	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				149	}
				150	to_ignore_mask \|= mask;
				151	utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
				152
				153	*num_read = num_to_read;
				154	return static_cast<int32_t>(utf32);
				155	}
				156
				157	int32_t utf32_from_utf8_at(const char src, size_t src_len, size_t index, size_t next_index)
				158	{
				159	if (index >= src_len) {
				160	return -1;
				161	}
				162	size_t dummy_index;
				163	if (next_index == NULL) {
				164	next_index = &dummy_index;
				165	}
				166	size_t num_read;
				167	int32_t ret = utf32_at_internal(src + index, &num_read);
				168	if (ret >= 0) {
				169	*next_index = index + num_read;
				170	}
				171
				172	return ret;
				173	}
				174
				175	ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
				176	{
				177	if (src == NULL \|\| src_len == 0) {
				178	return -1;
				179	}
				180
				181	size_t ret = 0;
				182	const char32_t *end = src + src_len;
				183	while (src < end) {
				184	ret += utf32_codepoint_utf8_length(*src++);
				185	}
				186	return ret;
				187	}
				188
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	189	void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst, size_t dst_len)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	190	{
				191	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				192	return;
				193	}
				194
				195	const char32_t *cur_utf32 = src;
				196	const char32_t *end_utf32 = src + src_len;
				197	char *cur = dst;
				198	while (cur_utf32 < end_utf32) {
				199	size_t len = utf32_codepoint_utf8_length(*cur_utf32);
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	200	LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	201	utf32_codepoint_to_utf8((uint8_t )cur, cur_utf32++, len);
				202	cur += len;
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	203	dst_len -= len;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	204	}
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	205	LOG_ALWAYS_FATAL_IF(dst_len < 1, "dst_len < 1: %zu < 1", dst_len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	206	*cur = '\0';
				207	}
				208
				209	// --------------------------------------------------------------------------
				210	// UTF-16
				211	// --------------------------------------------------------------------------
				212
				213	int strcmp16(const char16_t s1, const char16_t s2)
				214	{
				215	char16_t ch;
				216	int d = 0;
				217
				218	while ( 1 ) {
				219	d = (int)(ch = s1++) - (int)s2++;
				220	if ( d \|\| !ch )
				221	break;
				222	}
				223
				224	return d;
				225	}
				226
				227	int strncmp16(const char16_t s1, const char16_t s2, size_t n)
				228	{
				229	char16_t ch;
				230	int d = 0;
				231
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	232	if (n == 0) {
				233	return 0;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	234	}
				235
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	236	do {
				237	d = (int)(ch = s1++) - (int)s2++;
				238	if ( d \|\| !ch ) {
				239	break;
				240	}
				241	} while (--n);
				242
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	243	return d;
				244	}
				245
				246	char16_t strcpy16(char16_t dst, const char16_t *src)
				247	{
				248	char16_t *q = dst;
				249	const char16_t *p = src;
				250	char16_t ch;
				251
				252	do {
				253	q++ = ch = p++;
				254	} while ( ch );
				255
				256	return dst;
				257	}
				258
				259	size_t strlen16(const char16_t *s)
				260	{
				261	const char16_t *ss = s;
				262	while ( *ss )
				263	ss++;
				264	return ss-s;
				265	}
				266
				267
				268	char16_t strncpy16(char16_t dst, const char16_t *src, size_t n)
				269	{
				270	char16_t *q = dst;
				271	const char16_t *p = src;
				272	char ch;
				273
				274	while (n) {
				275	n--;
				276	q++ = ch = p++;
				277	if ( !ch )
				278	break;
				279	}
				280
				281	*q = 0;
				282
				283	return dst;
				284	}
				285
				286	size_t strnlen16(const char16_t *s, size_t maxlen)
				287	{
				288	const char16_t *ss = s;
				289
				290	/* Important: the maxlen test must precede the reference through ss;
				291	since the byte beyond the maximum may segfault */
				292	while ((maxlen > 0) && *ss) {
				293	ss++;
				294	maxlen--;
				295	}
				296	return ss-s;
				297	}
				298
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	299	char16_t* strstr16(const char16_t* src, const char16_t* target)
				300	{
				301	const char16_t needle = *target++;
Michael Wright	0fd60eb	2016-05-16 21:23:07 +0100	[diff] [blame]	302	const size_t target_len = strlen16(target);
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	303	if (needle != '\0') {
				304	do {
				305	do {
				306	if (*src == '\0') {
				307	return nullptr;
				308	}
				309	} while (*src++ != needle);
Michael Wright	0fd60eb	2016-05-16 21:23:07 +0100	[diff] [blame]	310	} while (strncmp16(src, target, target_len) != 0);
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	311	src--;
				312	}
				313
				314	return (char16_t*)src;
				315	}
				316
				317
/**
 * Compare two UTF-16 buffers given with explicit lengths (n1 and n2 units).
 *
 * Returns the difference of the first differing pair of units within the
 * common length. When the common part is equal: 0 if the lengths match,
 * otherwise minus the next unit of s2 (n1 < n2) or the next unit of s1
 * (n1 > n2). That next unit may itself be 0, in which case 0 is returned.
 */
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const char16_t* e1 = s1+n1;
    const char16_t* e2 = s2+n2;

    while (s1 < e1 && s2 < e2) {
        const int d = (int)*s1++ - (int)*s2++;
        if (d) {
            return d;
        }
    }

    // The longer buffer still has a unit at this position, so the read below
    // stays inside it.
    return n1 < n2
        ? (0 - (int)*s2)
        : (n1 > n2
           ? ((int)*s1 - 0)
           : 0);
}
				336
				337	int strzcmp16_h_n(const char16_t s1H, size_t n1, const char16_t s2N, size_t n2)
				338	{
				339	const char16_t* e1 = s1H+n1;
				340	const char16_t* e2 = s2N+n2;
				341
				342	while (s1H < e1 && s2N < e2) {
				343	const char16_t c2 = ntohs(*s2N);
				344	const int d = (int)*s1H++ - (int)c2;
				345	s2N++;
				346	if (d) {
				347	return d;
				348	}
				349	}
				350
				351	return n1 < n2
				352	? (0 - (int)ntohs(*s2N))
				353	: (n1 > n2
				354	? ((int)*s1H - 0)
				355	: 0);
				356	}
				357
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	358	void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	359	{
				360	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				361	return;
				362	}
				363
				364	const char16_t* cur_utf16 = src;
				365	const char16_t* const end_utf16 = src + src_len;
				366	char *cur = dst;
				367	while (cur_utf16 < end_utf16) {
				368	char32_t utf32;
				369	// surrogate pairs
Cylen Yao	72299bf	2014-06-04 19:11:27 +0800	[diff] [blame]	370	if((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16
				371	&& (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	372	utf32 = (*cur_utf16++ - 0xD800) << 10;
				373	utf32 \|= *cur_utf16++ - 0xDC00;
				374	utf32 += 0x10000;
				375	} else {
				376	utf32 = (char32_t) *cur_utf16++;
				377	}
				378	const size_t len = utf32_codepoint_utf8_length(utf32);
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	379	LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	380	utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
				381	cur += len;
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	382	dst_len -= len;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	383	}
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	384	LOG_ALWAYS_FATAL_IF(dst_len < 1, "%zu < 1", dst_len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	385	*cur = '\0';
				386	}
				387
				388	// --------------------------------------------------------------------------
				389	// UTF-8
				390	// --------------------------------------------------------------------------
				391
				392	ssize_t utf8_length(const char *src)
				393	{
				394	const char *cur = src;
				395	size_t ret = 0;
				396	while (*cur != '\0') {
				397	const char first_char = *cur++;
				398	if ((first_char & 0x80) == 0) { // ASCII
				399	ret += 1;
				400	continue;
				401	}
				402	// (UTF-8's character must not be like 10xxxxxx,
				403	// but 110xxxxx, 1110xxxx, ... or 1111110x)
				404	if ((first_char & 0x40) == 0) {
				405	return -1;
				406	}
				407
				408	int32_t mask, to_ignore_mask;
				409	size_t num_to_read = 0;
				410	char32_t utf32 = 0;
				411	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
				412	num_to_read < 5 && (first_char & mask);
				413	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				414	if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
				415	return -1;
				416	}
				417	// 0x3F == 00111111
				418	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				419	}
				420	// "first_char" must be (110xxxxx - 11110xxx)
				421	if (num_to_read == 5) {
				422	return -1;
				423	}
				424	to_ignore_mask \|= mask;
				425	utf32 \|= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
				426	if (utf32 > kUnicodeMaxCodepoint) {
				427	return -1;
				428	}
				429
				430	ret += num_to_read;
				431	}
				432	return ret;
				433	}
				434
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	435	// DO NOT USE. Flawed version, kept only to check whether the flaw is being exploited.
				436	static ssize_t flawed_utf16_to_utf8_length(const char16_t *src, size_t src_len)
				437	{
				438	if (src == NULL \|\| src_len == 0) {
				439	return 47;
				440	}
				441
				442	size_t ret = 0;
				443	const char16_t* const end = src + src_len;
				444	while (src < end) {
				445	if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
				446	// Shouldn't increment src here as to be consistent with utf16_to_utf8
				447	&& (*++src & 0xFC00) == 0xDC00) {
				448	// surrogate pairs are always 4 bytes.
				449	ret += 4;
				450	// Should increment src here by two.
				451	src++;
				452	} else {
				453	ret += utf32_codepoint_utf8_length((char32_t) *src++);
				454	}
				455	}
				456	return ret;
				457	}
				458
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	459	ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
				460	{
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	461	// Keep the original pointer to compute the flawed length. Unused if we remove logging.
				462	const char16_t *orig_src = src;
				463
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	464	if (src == NULL \|\| src_len == 0) {
				465	return -1;
				466	}
				467
				468	size_t ret = 0;
				469	const char16_t* const end = src + src_len;
				470	while (src < end) {
				471	if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	472	&& (*(src + 1) & 0xFC00) == 0xDC00) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	473	// surrogate pairs are always 4 bytes.
				474	ret += 4;
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	475	src += 2;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	476	} else {
				477	ret += utf32_codepoint_utf8_length((char32_t) *src++);
				478	}
				479	}
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	480	// Log whether b/29250543 is being exploited. It seems reasonable to assume that
				481	// at least 5 bytes would be needed for an exploit. A single misplaced character might lead to
				482	// a difference of 4, so this would rule out many false positives.
				483	long ret_difference = ret - flawed_utf16_to_utf8_length(orig_src, src_len);
				484	if (ret_difference >= 5) {
				485	// Log the difference between new and old calculation. A high number, or equal numbers
				486	// appearing frequently, would be indicative of an attack.
Sergio Giro	1e61274	2016-07-11 15:43:54 +0100	[diff] [blame^]	487	std::ostringstream logged_string_stream;
				488	logged_string_stream << ret_difference;
				489	std::string logged_string = logged_string_stream.str();
Sergio Giro	c4966a3	2016-06-28 18:02:29 +0100	[diff] [blame]	490	android_errorWriteWithInfoLog(0x534e4554, "29250543", -1 /* int_uid */,
				491	logged_string.c_str(), logged_string.length() + 1);
				492	}
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	493	return ret;
				494	}
				495
				496	/**
				497	* Returns 1-4 based on the number of leading bits.
				498	*
				499	* 1111 -> 4
				500	* 1110 -> 3
				501	* 110x -> 2
				502	* 10xx -> 1
				503	* 0xxx -> 1
				504	*/
				505	static inline size_t utf8_codepoint_len(uint8_t ch)
				506	{
				507	return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1;
				508	}
				509
				510	static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
				511	{
				512	*codePoint <<= 6;
				513	*codePoint \|= 0x3F & byte;
				514	}
				515
				516	size_t utf8_to_utf32_length(const char *src, size_t src_len)
				517	{
				518	if (src == NULL \|\| src_len == 0) {
				519	return 0;
				520	}
				521	size_t ret = 0;
				522	const char* cur;
				523	const char* end;
				524	size_t num_to_skip;
				525	for (cur = src, end = src + src_len, num_to_skip = 1;
				526	cur < end;
				527	cur += num_to_skip, ret++) {
				528	const char first_char = *cur;
				529	num_to_skip = 1;
				530	if ((first_char & 0x80) == 0) { // ASCII
				531	continue;
				532	}
				533	int32_t mask;
				534
				535	for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
				536	}
				537	}
				538	return ret;
				539	}
				540
				541	void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
				542	{
				543	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				544	return;
				545	}
				546
				547	const char* cur = src;
				548	const char* const end = src + src_len;
				549	char32_t* cur_utf32 = dst;
				550	while (cur < end) {
				551	size_t num_read;
				552	*cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
				553	cur += num_read;
				554	}
				555	*cur_utf32 = 0;
				556	}
				557
				558	static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
				559	{
				560	uint32_t unicode;
				561
				562	switch (length)
				563	{
				564	case 1:
				565	return src[0];
				566	case 2:
				567	unicode = src[0] & 0x1f;
				568	utf8_shift_and_mask(&unicode, src[1]);
				569	return unicode;
				570	case 3:
				571	unicode = src[0] & 0x0f;
				572	utf8_shift_and_mask(&unicode, src[1]);
				573	utf8_shift_and_mask(&unicode, src[2]);
				574	return unicode;
				575	case 4:
				576	unicode = src[0] & 0x07;
				577	utf8_shift_and_mask(&unicode, src[1]);
				578	utf8_shift_and_mask(&unicode, src[2]);
				579	utf8_shift_and_mask(&unicode, src[3]);
				580	return unicode;
				581	default:
				582	return 0xffff;
				583	}
				584
				585	//printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
				586	}
				587
				588	ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
				589	{
				590	const uint8_t* const u8end = u8str + u8len;
				591	const uint8_t* u8cur = u8str;
				592
				593	/* Validate that the UTF-8 is the correct len */
				594	size_t u16measuredLen = 0;
				595	while (u8cur < u8end) {
				596	u16measuredLen++;
				597	int u8charLen = utf8_codepoint_len(*u8cur);
				598	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
				599	if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
				600	u8cur += u8charLen;
				601	}
				602
				603	/**
				604	* Make sure that we ended where we thought we would and the output UTF-16
				605	* will be exactly how long we were told it would be.
				606	*/
				607	if (u8cur != u8end) {
				608	return -1;
				609	}
				610
				611	return u16measuredLen;
				612	}
				613
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	614	char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	615	{
				616	const uint8_t* const u8end = u8str + u8len;
				617	const uint8_t* u8cur = u8str;
				618	char16_t* u16cur = u16str;
				619
				620	while (u8cur < u8end) {
				621	size_t u8len = utf8_codepoint_len(*u8cur);
				622	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				623
				624	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				625	if (codepoint <= 0xFFFF) {
				626	// Single UTF16 character
				627	*u16cur++ = (char16_t) codepoint;
				628	} else {
				629	// Multiple UTF16 characters with surrogates
				630	codepoint = codepoint - 0x10000;
				631	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				632	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				633	}
				634
				635	u8cur += u8len;
				636	}
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	637	return u16cur;
				638	}
				639
				640	void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) {
				641	char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str);
				642	*end = 0;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	643	}
				644
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	645	char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) {
				646	const uint8_t* const u8end = src + srcLen;
				647	const uint8_t* u8cur = src;
Mark Salyzyn	5bed803	2014-04-30 11:10:46 -0700	[diff] [blame]	648	const char16_t* const u16end = dst + dstLen;
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	649	char16_t* u16cur = dst;
				650
				651	while (u8cur < u8end && u16cur < u16end) {
				652	size_t u8len = utf8_codepoint_len(*u8cur);
				653	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				654
				655	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				656	if (codepoint <= 0xFFFF) {
				657	// Single UTF16 character
				658	*u16cur++ = (char16_t) codepoint;
				659	} else {
				660	// Multiple UTF16 characters with surrogates
				661	codepoint = codepoint - 0x10000;
				662	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				663	if (u16cur >= u16end) {
				664	// Ooops... not enough room for this surrogate pair.
				665	return u16cur-1;
				666	}
				667	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				668	}
				669
				670	u8cur += u8len;
				671	}
				672	return u16cur;
				673	}
				674
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	675	}