Blame - libutils/Unicode.cpp - android_system_core

blob: a75c258f17f14b11e8491d2a4ee551bbc3c85c7f [file] [log] [blame]

Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2005 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	17	#include <log/log.h>
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	18	#include <utils/Unicode.h>
				19
				20	#include <stddef.h>
				21
				22	#ifdef HAVE_WINSOCK
				23	# undef nhtol
				24	# undef htonl
				25	# undef nhtos
				26	# undef htons
				27
				28	# ifdef HAVE_LITTLE_ENDIAN
				29	# define ntohl(x) ( ((x) << 24) \| (((x) >> 24) & 255) \| (((x) << 8) & 0xff0000) \| (((x) >> 8) & 0xff00) )
				30	# define htonl(x) ntohl(x)
				31	# define ntohs(x) ( (((x) << 8) & 0xff00) \| (((x) >> 8) & 255) )
				32	# define htons(x) ntohs(x)
				33	# else
				34	# define ntohl(x) (x)
				35	# define htonl(x) (x)
				36	# define ntohs(x) (x)
				37	# define htons(x) (x)
				38	# endif
				39	#else
				40	# include <netinet/in.h>
				41	#endif
				42
				43	extern "C" {
				44
				45	static const char32_t kByteMask = 0x000000BF;
				46	static const char32_t kByteMark = 0x00000080;
				47
				48	// Surrogates aren't valid for UTF-32 characters, so define some
				49	// constants that will let us screen them out.
				50	static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
				51	static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
				52	static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
				53	static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
				54	static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
				55	static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
				56	static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
				57
				58	// Mask used to set appropriate bits in first byte of UTF-8 sequence,
				59	// indexed by number of bytes in the sequence.
				60	// 0xxxxxxx
				61	// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
				62	// 110yyyyx 10xxxxxx
				63	// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
				64	// 1110yyyy 10yxxxxx 10xxxxxx
				65	// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
				66	// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
				67	// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
				68	static const char32_t kFirstByteMark[] = {
				69	0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
				70	};
				71
				72	// --------------------------------------------------------------------------
				73	// UTF-32
				74	// --------------------------------------------------------------------------
				75
				76	/**
				77	* Return number of UTF-8 bytes required for the character. If the character
				78	* is invalid, return size of 0.
				79	*/
				80	static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
				81	{
				82	// Figure out how many bytes the result will require.
				83	if (srcChar < 0x00000080) {
				84	return 1;
				85	} else if (srcChar < 0x00000800) {
				86	return 2;
				87	} else if (srcChar < 0x00010000) {
				88	if ((srcChar < kUnicodeSurrogateStart) \|\| (srcChar > kUnicodeSurrogateEnd)) {
				89	return 3;
				90	} else {
				91	// Surrogates are invalid UTF-32 characters.
				92	return 0;
				93	}
				94	}
				95	// Max code point for Unicode is 0x0010FFFF.
				96	else if (srcChar <= kUnicodeMaxCodepoint) {
				97	return 4;
				98	} else {
				99	// Invalid UTF-32 character.
				100	return 0;
				101	}
				102	}
				103
				104	// Write out the source character to <dstP>.
				105
				106	static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
				107	{
				108	dstP += bytes;
				109	switch (bytes)
				110	{ /* note: everything falls through. */
				111	case 4: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				112	case 3: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				113	case 2: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				114	case 1: *--dstP = (uint8_t)(srcChar \| kFirstByteMark[bytes]);
				115	}
				116	}
				117
				118	size_t strlen32(const char32_t *s)
				119	{
				120	const char32_t *ss = s;
				121	while ( *ss )
				122	ss++;
				123	return ss-s;
				124	}
				125
				126	size_t strnlen32(const char32_t *s, size_t maxlen)
				127	{
				128	const char32_t *ss = s;
				129	while ((maxlen > 0) && *ss) {
				130	ss++;
				131	maxlen--;
				132	}
				133	return ss-s;
				134	}
				135
				136	static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
				137	{
				138	const char first_char = *cur;
				139	if ((first_char & 0x80) == 0) { // ASCII
				140	*num_read = 1;
				141	return *cur;
				142	}
				143	cur++;
				144	char32_t mask, to_ignore_mask;
				145	size_t num_to_read = 0;
				146	char32_t utf32 = first_char;
				147	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
				148	(first_char & mask);
				149	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				150	// 0x3F == 00111111
				151	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				152	}
				153	to_ignore_mask \|= mask;
				154	utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
				155
				156	*num_read = num_to_read;
				157	return static_cast<int32_t>(utf32);
				158	}
				159
				160	int32_t utf32_from_utf8_at(const char src, size_t src_len, size_t index, size_t next_index)
				161	{
				162	if (index >= src_len) {
				163	return -1;
				164	}
				165	size_t dummy_index;
				166	if (next_index == NULL) {
				167	next_index = &dummy_index;
				168	}
				169	size_t num_read;
				170	int32_t ret = utf32_at_internal(src + index, &num_read);
				171	if (ret >= 0) {
				172	*next_index = index + num_read;
				173	}
				174
				175	return ret;
				176	}
				177
				178	ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
				179	{
				180	if (src == NULL \|\| src_len == 0) {
				181	return -1;
				182	}
				183
				184	size_t ret = 0;
				185	const char32_t *end = src + src_len;
				186	while (src < end) {
				187	ret += utf32_codepoint_utf8_length(*src++);
				188	}
				189	return ret;
				190	}
				191
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	192	void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst, size_t dst_len)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	193	{
				194	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				195	return;
				196	}
				197
				198	const char32_t *cur_utf32 = src;
				199	const char32_t *end_utf32 = src + src_len;
				200	char *cur = dst;
				201	while (cur_utf32 < end_utf32) {
				202	size_t len = utf32_codepoint_utf8_length(*cur_utf32);
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	203	LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	204	utf32_codepoint_to_utf8((uint8_t )cur, cur_utf32++, len);
				205	cur += len;
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	206	dst_len -= len;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	207	}
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	208	LOG_ALWAYS_FATAL_IF(dst_len < 1, "dst_len < 1: %zu < 1", dst_len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	209	*cur = '\0';
				210	}
				211
				212	// --------------------------------------------------------------------------
				213	// UTF-16
				214	// --------------------------------------------------------------------------
				215
				216	int strcmp16(const char16_t s1, const char16_t s2)
				217	{
				218	char16_t ch;
				219	int d = 0;
				220
				221	while ( 1 ) {
				222	d = (int)(ch = s1++) - (int)s2++;
				223	if ( d \|\| !ch )
				224	break;
				225	}
				226
				227	return d;
				228	}
				229
				230	int strncmp16(const char16_t s1, const char16_t s2, size_t n)
				231	{
				232	char16_t ch;
				233	int d = 0;
				234
				235	while ( n-- ) {
				236	d = (int)(ch = s1++) - (int)s2++;
				237	if ( d \|\| !ch )
				238	break;
				239	}
				240
				241	return d;
				242	}
				243
				244	char16_t strcpy16(char16_t dst, const char16_t *src)
				245	{
				246	char16_t *q = dst;
				247	const char16_t *p = src;
				248	char16_t ch;
				249
				250	do {
				251	q++ = ch = p++;
				252	} while ( ch );
				253
				254	return dst;
				255	}
				256
				257	size_t strlen16(const char16_t *s)
				258	{
				259	const char16_t *ss = s;
				260	while ( *ss )
				261	ss++;
				262	return ss-s;
				263	}
				264
				265
				266	char16_t strncpy16(char16_t dst, const char16_t *src, size_t n)
				267	{
				268	char16_t *q = dst;
				269	const char16_t *p = src;
				270	char ch;
				271
				272	while (n) {
				273	n--;
				274	q++ = ch = p++;
				275	if ( !ch )
				276	break;
				277	}
				278
				279	*q = 0;
				280
				281	return dst;
				282	}
				283
				284	size_t strnlen16(const char16_t *s, size_t maxlen)
				285	{
				286	const char16_t *ss = s;
				287
				288	/* Important: the maxlen test must precede the reference through ss;
				289	since the byte beyond the maximum may segfault */
				290	while ((maxlen > 0) && *ss) {
				291	ss++;
				292	maxlen--;
				293	}
				294	return ss-s;
				295	}
				296
				297	int strzcmp16(const char16_t s1, size_t n1, const char16_t s2, size_t n2)
				298	{
				299	const char16_t* e1 = s1+n1;
				300	const char16_t* e2 = s2+n2;
				301
				302	while (s1 < e1 && s2 < e2) {
				303	const int d = (int)s1++ - (int)s2++;
				304	if (d) {
				305	return d;
				306	}
				307	}
				308
				309	return n1 < n2
				310	? (0 - (int)*s2)
				311	: (n1 > n2
				312	? ((int)*s1 - 0)
				313	: 0);
				314	}
				315
				316	int strzcmp16_h_n(const char16_t s1H, size_t n1, const char16_t s2N, size_t n2)
				317	{
				318	const char16_t* e1 = s1H+n1;
				319	const char16_t* e2 = s2N+n2;
				320
				321	while (s1H < e1 && s2N < e2) {
				322	const char16_t c2 = ntohs(*s2N);
				323	const int d = (int)*s1H++ - (int)c2;
				324	s2N++;
				325	if (d) {
				326	return d;
				327	}
				328	}
				329
				330	return n1 < n2
				331	? (0 - (int)ntohs(*s2N))
				332	: (n1 > n2
				333	? ((int)*s1H - 0)
				334	: 0);
				335	}
				336
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	337	void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	338	{
				339	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				340	return;
				341	}
				342
				343	const char16_t* cur_utf16 = src;
				344	const char16_t* const end_utf16 = src + src_len;
				345	char *cur = dst;
				346	while (cur_utf16 < end_utf16) {
				347	char32_t utf32;
				348	// surrogate pairs
				349	if ((*cur_utf16 & 0xFC00) == 0xD800) {
				350	utf32 = (*cur_utf16++ - 0xD800) << 10;
				351	utf32 \|= *cur_utf16++ - 0xDC00;
				352	utf32 += 0x10000;
				353	} else {
				354	utf32 = (char32_t) *cur_utf16++;
				355	}
				356	const size_t len = utf32_codepoint_utf8_length(utf32);
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	357	LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	358	utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
				359	cur += len;
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	360	dst_len -= len;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	361	}
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	362	LOG_ALWAYS_FATAL_IF(dst_len < 1, "%zu < 1", dst_len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	363	*cur = '\0';
				364	}
				365
				366	// --------------------------------------------------------------------------
				367	// UTF-8
				368	// --------------------------------------------------------------------------
				369
				370	ssize_t utf8_length(const char *src)
				371	{
				372	const char *cur = src;
				373	size_t ret = 0;
				374	while (*cur != '\0') {
				375	const char first_char = *cur++;
				376	if ((first_char & 0x80) == 0) { // ASCII
				377	ret += 1;
				378	continue;
				379	}
				380	// (UTF-8's character must not be like 10xxxxxx,
				381	// but 110xxxxx, 1110xxxx, ... or 1111110x)
				382	if ((first_char & 0x40) == 0) {
				383	return -1;
				384	}
				385
				386	int32_t mask, to_ignore_mask;
				387	size_t num_to_read = 0;
				388	char32_t utf32 = 0;
				389	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
				390	num_to_read < 5 && (first_char & mask);
				391	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				392	if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
				393	return -1;
				394	}
				395	// 0x3F == 00111111
				396	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				397	}
				398	// "first_char" must be (110xxxxx - 11110xxx)
				399	if (num_to_read == 5) {
				400	return -1;
				401	}
				402	to_ignore_mask \|= mask;
				403	utf32 \|= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
				404	if (utf32 > kUnicodeMaxCodepoint) {
				405	return -1;
				406	}
				407
				408	ret += num_to_read;
				409	}
				410	return ret;
				411	}
				412
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	413	// DO NOT USE. Flawed version, kept only to check whether the flaw is being exploited.
				414	static ssize_t flawed_utf16_to_utf8_length(const char16_t *src, size_t src_len)
				415	{
				416	if (src == NULL \|\| src_len == 0) {
				417	return 47;
				418	}
				419
				420	size_t ret = 0;
				421	const char16_t* const end = src + src_len;
				422	while (src < end) {
				423	if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
				424	// Shouldn't increment src here as to be consistent with utf16_to_utf8
				425	&& (*++src & 0xFC00) == 0xDC00) {
				426	// surrogate pairs are always 4 bytes.
				427	ret += 4;
				428	// Should increment src here by two.
				429	src++;
				430	} else {
				431	ret += utf32_codepoint_utf8_length((char32_t) *src++);
				432	}
				433	}
				434	return ret;
				435	}
				436
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	437	ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
				438	{
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	439	// Keep the original pointer to compute the flawed length. Unused if we remove logging.
				440	const char16_t *orig_src = src;
				441
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	442	if (src == NULL \|\| src_len == 0) {
				443	return -1;
				444	}
				445
				446	size_t ret = 0;
				447	const char16_t* const end = src + src_len;
				448	while (src < end) {
				449	if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	450	&& (*(src + 1) & 0xFC00) == 0xDC00) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	451	// surrogate pairs are always 4 bytes.
				452	ret += 4;
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	453	src += 2;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	454	} else {
				455	ret += utf32_codepoint_utf8_length((char32_t) *src++);
				456	}
				457	}
Sergio Giro	53473c1	2016-06-28 18:02:29 +0100	[diff] [blame^]	458	// Log whether b/29250543 is being exploited. It seems reasonable to assume that
				459	// at least 5 bytes would be needed for an exploit. A single misplaced character might lead to
				460	// a difference of 4, so this would rule out many false positives.
				461	long ret_difference = ret - flawed_utf16_to_utf8_length(orig_src, src_len);
				462	if (ret_difference >= 5) {
				463	// Log the difference between new and old calculation. A high number, or equal numbers
				464	// appearing frequently, would be indicative of an attack.
				465	const unsigned long max_logged_string_length = 20;
				466	char logged_string[max_logged_string_length + 1];
				467	unsigned long logged_string_length =
				468	snprintf(logged_string, max_logged_string_length, "%ld", ret_difference);
				469	logged_string[logged_string_length] = '\0';
				470	android_errorWriteWithInfoLog(0x534e4554, "29250543", -1 /* int_uid */,
				471	logged_string, logged_string_length);
				472	}
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	473	return ret;
				474	}
				475
				476	/**
				477	* Returns 1-4 based on the number of leading bits.
				478	*
				479	* 1111 -> 4
				480	* 1110 -> 3
				481	* 110x -> 2
				482	* 10xx -> 1
				483	* 0xxx -> 1
				484	*/
				485	static inline size_t utf8_codepoint_len(uint8_t ch)
				486	{
				487	return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1;
				488	}
				489
				490	static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
				491	{
				492	*codePoint <<= 6;
				493	*codePoint \|= 0x3F & byte;
				494	}
				495
				496	size_t utf8_to_utf32_length(const char *src, size_t src_len)
				497	{
				498	if (src == NULL \|\| src_len == 0) {
				499	return 0;
				500	}
				501	size_t ret = 0;
				502	const char* cur;
				503	const char* end;
				504	size_t num_to_skip;
				505	for (cur = src, end = src + src_len, num_to_skip = 1;
				506	cur < end;
				507	cur += num_to_skip, ret++) {
				508	const char first_char = *cur;
				509	num_to_skip = 1;
				510	if ((first_char & 0x80) == 0) { // ASCII
				511	continue;
				512	}
				513	int32_t mask;
				514
				515	for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
				516	}
				517	}
				518	return ret;
				519	}
				520
				521	void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
				522	{
				523	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				524	return;
				525	}
				526
				527	const char* cur = src;
				528	const char* const end = src + src_len;
				529	char32_t* cur_utf32 = dst;
				530	while (cur < end) {
				531	size_t num_read;
				532	*cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
				533	cur += num_read;
				534	}
				535	*cur_utf32 = 0;
				536	}
				537
				538	static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
				539	{
				540	uint32_t unicode;
				541
				542	switch (length)
				543	{
				544	case 1:
				545	return src[0];
				546	case 2:
				547	unicode = src[0] & 0x1f;
				548	utf8_shift_and_mask(&unicode, src[1]);
				549	return unicode;
				550	case 3:
				551	unicode = src[0] & 0x0f;
				552	utf8_shift_and_mask(&unicode, src[1]);
				553	utf8_shift_and_mask(&unicode, src[2]);
				554	return unicode;
				555	case 4:
				556	unicode = src[0] & 0x07;
				557	utf8_shift_and_mask(&unicode, src[1]);
				558	utf8_shift_and_mask(&unicode, src[2]);
				559	utf8_shift_and_mask(&unicode, src[3]);
				560	return unicode;
				561	default:
				562	return 0xffff;
				563	}
				564
				565	//printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
				566	}
				567
				568	ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
				569	{
				570	const uint8_t* const u8end = u8str + u8len;
				571	const uint8_t* u8cur = u8str;
				572
				573	/* Validate that the UTF-8 is the correct len */
				574	size_t u16measuredLen = 0;
				575	while (u8cur < u8end) {
				576	u16measuredLen++;
				577	int u8charLen = utf8_codepoint_len(*u8cur);
				578	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
				579	if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
				580	u8cur += u8charLen;
				581	}
				582
				583	/**
				584	* Make sure that we ended where we thought we would and the output UTF-16
				585	* will be exactly how long we were told it would be.
				586	*/
				587	if (u8cur != u8end) {
				588	return -1;
				589	}
				590
				591	return u16measuredLen;
				592	}
				593
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	594	char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	595	{
				596	const uint8_t* const u8end = u8str + u8len;
				597	const uint8_t* u8cur = u8str;
				598	char16_t* u16cur = u16str;
				599
				600	while (u8cur < u8end) {
				601	size_t u8len = utf8_codepoint_len(*u8cur);
				602	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				603
				604	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				605	if (codepoint <= 0xFFFF) {
				606	// Single UTF16 character
				607	*u16cur++ = (char16_t) codepoint;
				608	} else {
				609	// Multiple UTF16 characters with surrogates
				610	codepoint = codepoint - 0x10000;
				611	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				612	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				613	}
				614
				615	u8cur += u8len;
				616	}
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	617	return u16cur;
				618	}
				619
				620	void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) {
				621	char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str);
				622	*end = 0;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	623	}
				624
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	625	char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) {
				626	const uint8_t* const u8end = src + srcLen;
				627	const uint8_t* u8cur = src;
				628	const uint16_t* const u16end = dst + dstLen;
				629	char16_t* u16cur = dst;
				630
				631	while (u8cur < u8end && u16cur < u16end) {
				632	size_t u8len = utf8_codepoint_len(*u8cur);
				633	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				634
				635	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				636	if (codepoint <= 0xFFFF) {
				637	// Single UTF16 character
				638	*u16cur++ = (char16_t) codepoint;
				639	} else {
				640	// Multiple UTF16 characters with surrogates
				641	codepoint = codepoint - 0x10000;
				642	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				643	if (u16cur >= u16end) {
				644	// Ooops... not enough room for this surrogate pair.
				645	return u16cur-1;
				646	}
				647	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				648	}
				649
				650	u8cur += u8len;
				651	}
				652	return u16cur;
				653	}
				654
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	655	}