Blame - libutils/Unicode.cpp - android_system_core

blob: 1aca8e782b7fd859afa030ab6a467dd3d97fae94 [file] [log] [blame]

Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2005 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include <utils/Unicode.h>
				18
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame^]	19	#include <limits.h>
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	20	#include <stddef.h>
				21
#if defined(_WIN32)
// Drop any byte-order macros that may already be defined so the local
// fallbacks below do not clash with them.
# undef ntohl
# undef htonl
# undef ntohs
# undef htons

// Fallback byte swaps. These always swap, i.e. they treat the host as
// little-endian. NOTE(review): confirm that holds for every _WIN32 target.
# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) )
# define htonl(x) ntohl(x)
# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) )
# define htons(x) ntohs(x)
#else
# include <netinet/in.h>
#endif
				35
				36	extern "C" {
				37
				38	static const char32_t kByteMask = 0x000000BF;
				39	static const char32_t kByteMark = 0x00000080;
				40
				41	// Surrogates aren't valid for UTF-32 characters, so define some
				42	// constants that will let us screen them out.
				43	static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
Andreas Gampe	a53c815	2014-11-24 09:42:07 -0800	[diff] [blame]	44	// Unused, here for completeness:
				45	// static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
				46	// static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	47	static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
				48	static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
				49	static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
				50	static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
				51
				52	// Mask used to set appropriate bits in first byte of UTF-8 sequence,
				53	// indexed by number of bytes in the sequence.
				54	// 0xxxxxxx
				55	// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
				56	// 110yyyyx 10xxxxxx
				57	// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
				58	// 1110yyyy 10yxxxxx 10xxxxxx
				59	// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
				60	// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
				61	// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
				62	static const char32_t kFirstByteMark[] = {
				63	0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
				64	};
				65
				66	// --------------------------------------------------------------------------
				67	// UTF-32
				68	// --------------------------------------------------------------------------
				69
				70	/**
				71	* Return number of UTF-8 bytes required for the character. If the character
				72	* is invalid, return size of 0.
				73	*/
				74	static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
				75	{
				76	// Figure out how many bytes the result will require.
				77	if (srcChar < 0x00000080) {
				78	return 1;
				79	} else if (srcChar < 0x00000800) {
				80	return 2;
				81	} else if (srcChar < 0x00010000) {
				82	if ((srcChar < kUnicodeSurrogateStart) \|\| (srcChar > kUnicodeSurrogateEnd)) {
				83	return 3;
				84	} else {
				85	// Surrogates are invalid UTF-32 characters.
				86	return 0;
				87	}
				88	}
				89	// Max code point for Unicode is 0x0010FFFF.
				90	else if (srcChar <= kUnicodeMaxCodepoint) {
				91	return 4;
				92	} else {
				93	// Invalid UTF-32 character.
				94	return 0;
				95	}
				96	}
				97
				98	// Write out the source character to <dstP>.
				99
				100	static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
				101	{
				102	dstP += bytes;
				103	switch (bytes)
				104	{ /* note: everything falls through. */
				105	case 4: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				106	case 3: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				107	case 2: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				108	case 1: *--dstP = (uint8_t)(srcChar \| kFirstByteMark[bytes]);
				109	}
				110	}
				111
				112	size_t strlen32(const char32_t *s)
				113	{
				114	const char32_t *ss = s;
				115	while ( *ss )
				116	ss++;
				117	return ss-s;
				118	}
				119
				120	size_t strnlen32(const char32_t *s, size_t maxlen)
				121	{
				122	const char32_t *ss = s;
				123	while ((maxlen > 0) && *ss) {
				124	ss++;
				125	maxlen--;
				126	}
				127	return ss-s;
				128	}
				129
				130	static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
				131	{
				132	const char first_char = *cur;
				133	if ((first_char & 0x80) == 0) { // ASCII
				134	*num_read = 1;
				135	return *cur;
				136	}
				137	cur++;
				138	char32_t mask, to_ignore_mask;
				139	size_t num_to_read = 0;
				140	char32_t utf32 = first_char;
				141	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
				142	(first_char & mask);
				143	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				144	// 0x3F == 00111111
				145	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				146	}
				147	to_ignore_mask \|= mask;
				148	utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
				149
				150	*num_read = num_to_read;
				151	return static_cast<int32_t>(utf32);
				152	}
				153
				154	int32_t utf32_from_utf8_at(const char src, size_t src_len, size_t index, size_t next_index)
				155	{
				156	if (index >= src_len) {
				157	return -1;
				158	}
				159	size_t dummy_index;
				160	if (next_index == NULL) {
				161	next_index = &dummy_index;
				162	}
				163	size_t num_read;
				164	int32_t ret = utf32_at_internal(src + index, &num_read);
				165	if (ret >= 0) {
				166	*next_index = index + num_read;
				167	}
				168
				169	return ret;
				170	}
				171
				172	ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
				173	{
				174	if (src == NULL \|\| src_len == 0) {
				175	return -1;
				176	}
				177
				178	size_t ret = 0;
				179	const char32_t *end = src + src_len;
				180	while (src < end) {
				181	ret += utf32_codepoint_utf8_length(*src++);
				182	}
				183	return ret;
				184	}
				185
				186	void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst)
				187	{
				188	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				189	return;
				190	}
				191
				192	const char32_t *cur_utf32 = src;
				193	const char32_t *end_utf32 = src + src_len;
				194	char *cur = dst;
				195	while (cur_utf32 < end_utf32) {
				196	size_t len = utf32_codepoint_utf8_length(*cur_utf32);
				197	utf32_codepoint_to_utf8((uint8_t )cur, cur_utf32++, len);
				198	cur += len;
				199	}
				200	*cur = '\0';
				201	}
				202
				203	// --------------------------------------------------------------------------
				204	// UTF-16
				205	// --------------------------------------------------------------------------
				206
				207	int strcmp16(const char16_t s1, const char16_t s2)
				208	{
				209	char16_t ch;
				210	int d = 0;
				211
				212	while ( 1 ) {
				213	d = (int)(ch = s1++) - (int)s2++;
				214	if ( d \|\| !ch )
				215	break;
				216	}
				217
				218	return d;
				219	}
				220
				221	int strncmp16(const char16_t s1, const char16_t s2, size_t n)
				222	{
				223	char16_t ch;
				224	int d = 0;
				225
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	226	if (n == 0) {
				227	return 0;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	228	}
				229
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	230	do {
				231	d = (int)(ch = s1++) - (int)s2++;
				232	if ( d \|\| !ch ) {
				233	break;
				234	}
				235	} while (--n);
				236
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	237	return d;
				238	}
				239
				240	char16_t strcpy16(char16_t dst, const char16_t *src)
				241	{
				242	char16_t *q = dst;
				243	const char16_t *p = src;
				244	char16_t ch;
				245
				246	do {
				247	q++ = ch = p++;
				248	} while ( ch );
				249
				250	return dst;
				251	}
				252
				253	size_t strlen16(const char16_t *s)
				254	{
				255	const char16_t *ss = s;
				256	while ( *ss )
				257	ss++;
				258	return ss-s;
				259	}
				260
				261
				262	char16_t strncpy16(char16_t dst, const char16_t *src, size_t n)
				263	{
				264	char16_t *q = dst;
				265	const char16_t *p = src;
				266	char ch;
				267
				268	while (n) {
				269	n--;
				270	q++ = ch = p++;
				271	if ( !ch )
				272	break;
				273	}
				274
				275	*q = 0;
				276
				277	return dst;
				278	}
				279
				280	size_t strnlen16(const char16_t *s, size_t maxlen)
				281	{
				282	const char16_t *ss = s;
				283
				284	/* Important: the maxlen test must precede the reference through ss;
				285	since the byte beyond the maximum may segfault */
				286	while ((maxlen > 0) && *ss) {
				287	ss++;
				288	maxlen--;
				289	}
				290	return ss-s;
				291	}
				292
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	293	char16_t* strstr16(const char16_t* src, const char16_t* target)
				294	{
				295	const char16_t needle = *target++;
Michael Wright	0fd60eb	2016-05-16 21:23:07 +0100	[diff] [blame]	296	const size_t target_len = strlen16(target);
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	297	if (needle != '\0') {
				298	do {
				299	do {
				300	if (*src == '\0') {
				301	return nullptr;
				302	}
				303	} while (*src++ != needle);
Michael Wright	0fd60eb	2016-05-16 21:23:07 +0100	[diff] [blame]	304	} while (strncmp16(src, target, target_len) != 0);
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	305	src--;
				306	}
				307
				308	return (char16_t*)src;
				309	}
				310
				311
// Compare two length-delimited UTF-16 buffers (<s1>,<n1>) and (<s2>,<n2>).
// Returns the difference of the first differing pair within the common
// prefix. If the common prefix is equal: 0 when the lengths match, otherwise
// the first extra unit of the longer buffer, negated when <s2> is the longer.
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const char16_t* e1 = s1+n1;
    const char16_t* e2 = s2+n2;

    while (s1 < e1 && s2 < e2) {
        const int d = (int)*s1++ - (int)*s2++;
        if (d) {
            return d;
        }
    }

    // Here the shorter buffer is exhausted, so the other pointer addresses
    // the first unit beyond the common prefix.
    return n1 < n2
        ? (0 - (int)*s2)
        : (n1 > n2
           ? ((int)*s1 - 0)
           : 0);
}
				330
				331	int strzcmp16_h_n(const char16_t s1H, size_t n1, const char16_t s2N, size_t n2)
				332	{
				333	const char16_t* e1 = s1H+n1;
				334	const char16_t* e2 = s2N+n2;
				335
				336	while (s1H < e1 && s2N < e2) {
				337	const char16_t c2 = ntohs(*s2N);
				338	const int d = (int)*s1H++ - (int)c2;
				339	s2N++;
				340	if (d) {
				341	return d;
				342	}
				343	}
				344
				345	return n1 < n2
				346	? (0 - (int)ntohs(*s2N))
				347	: (n1 > n2
				348	? ((int)*s1H - 0)
				349	: 0);
				350	}
				351
				352	void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst)
				353	{
				354	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				355	return;
				356	}
				357
				358	const char16_t* cur_utf16 = src;
				359	const char16_t* const end_utf16 = src + src_len;
				360	char *cur = dst;
				361	while (cur_utf16 < end_utf16) {
				362	char32_t utf32;
				363	// surrogate pairs
Cylen Yao	72299bf	2014-06-04 19:11:27 +0800	[diff] [blame]	364	if((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16
				365	&& (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	366	utf32 = (*cur_utf16++ - 0xD800) << 10;
				367	utf32 \|= *cur_utf16++ - 0xDC00;
				368	utf32 += 0x10000;
				369	} else {
				370	utf32 = (char32_t) *cur_utf16++;
				371	}
				372	const size_t len = utf32_codepoint_utf8_length(utf32);
				373	utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
				374	cur += len;
				375	}
				376	*cur = '\0';
				377	}
				378
				379	// --------------------------------------------------------------------------
				380	// UTF-8
				381	// --------------------------------------------------------------------------
				382
				383	ssize_t utf8_length(const char *src)
				384	{
				385	const char *cur = src;
				386	size_t ret = 0;
				387	while (*cur != '\0') {
				388	const char first_char = *cur++;
				389	if ((first_char & 0x80) == 0) { // ASCII
				390	ret += 1;
				391	continue;
				392	}
				393	// (UTF-8's character must not be like 10xxxxxx,
				394	// but 110xxxxx, 1110xxxx, ... or 1111110x)
				395	if ((first_char & 0x40) == 0) {
				396	return -1;
				397	}
				398
				399	int32_t mask, to_ignore_mask;
				400	size_t num_to_read = 0;
				401	char32_t utf32 = 0;
				402	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
				403	num_to_read < 5 && (first_char & mask);
				404	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				405	if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
				406	return -1;
				407	}
				408	// 0x3F == 00111111
				409	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				410	}
				411	// "first_char" must be (110xxxxx - 11110xxx)
				412	if (num_to_read == 5) {
				413	return -1;
				414	}
				415	to_ignore_mask \|= mask;
				416	utf32 \|= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
				417	if (utf32 > kUnicodeMaxCodepoint) {
				418	return -1;
				419	}
				420
				421	ret += num_to_read;
				422	}
				423	return ret;
				424	}
				425
				426	ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
				427	{
				428	if (src == NULL \|\| src_len == 0) {
				429	return -1;
				430	}
				431
				432	size_t ret = 0;
				433	const char16_t* const end = src + src_len;
				434	while (src < end) {
				435	if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
				436	&& (*++src & 0xFC00) == 0xDC00) {
				437	// surrogate pairs are always 4 bytes.
				438	ret += 4;
				439	src++;
				440	} else {
				441	ret += utf32_codepoint_utf8_length((char32_t) *src++);
				442	}
				443	}
				444	return ret;
				445	}
				446
				447	/**
				448	* Returns 1-4 based on the number of leading bits.
				449	*
				450	* 1111 -> 4
				451	* 1110 -> 3
				452	* 110x -> 2
				453	* 10xx -> 1
				454	* 0xxx -> 1
				455	*/
				456	static inline size_t utf8_codepoint_len(uint8_t ch)
				457	{
				458	return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1;
				459	}
				460
				461	static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
				462	{
				463	*codePoint <<= 6;
				464	*codePoint \|= 0x3F & byte;
				465	}
				466
				467	size_t utf8_to_utf32_length(const char *src, size_t src_len)
				468	{
				469	if (src == NULL \|\| src_len == 0) {
				470	return 0;
				471	}
				472	size_t ret = 0;
				473	const char* cur;
				474	const char* end;
				475	size_t num_to_skip;
				476	for (cur = src, end = src + src_len, num_to_skip = 1;
				477	cur < end;
				478	cur += num_to_skip, ret++) {
				479	const char first_char = *cur;
				480	num_to_skip = 1;
				481	if ((first_char & 0x80) == 0) { // ASCII
				482	continue;
				483	}
				484	int32_t mask;
				485
				486	for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
				487	}
				488	}
				489	return ret;
				490	}
				491
				492	void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
				493	{
				494	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				495	return;
				496	}
				497
				498	const char* cur = src;
				499	const char* const end = src + src_len;
				500	char32_t* cur_utf32 = dst;
				501	while (cur < end) {
				502	size_t num_read;
				503	*cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
				504	cur += num_read;
				505	}
				506	*cur_utf32 = 0;
				507	}
				508
				509	static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
				510	{
				511	uint32_t unicode;
				512
				513	switch (length)
				514	{
				515	case 1:
				516	return src[0];
				517	case 2:
				518	unicode = src[0] & 0x1f;
				519	utf8_shift_and_mask(&unicode, src[1]);
				520	return unicode;
				521	case 3:
				522	unicode = src[0] & 0x0f;
				523	utf8_shift_and_mask(&unicode, src[1]);
				524	utf8_shift_and_mask(&unicode, src[2]);
				525	return unicode;
				526	case 4:
				527	unicode = src[0] & 0x07;
				528	utf8_shift_and_mask(&unicode, src[1]);
				529	utf8_shift_and_mask(&unicode, src[2]);
				530	utf8_shift_and_mask(&unicode, src[3]);
				531	return unicode;
				532	default:
				533	return 0xffff;
				534	}
				535
				536	//printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
				537	}
				538
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame^]	539	ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len, bool overreadIsFatal)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	540	{
				541	const uint8_t* const u8end = u8str + u8len;
				542	const uint8_t* u8cur = u8str;
				543
				544	/* Validate that the UTF-8 is the correct len */
				545	size_t u16measuredLen = 0;
				546	while (u8cur < u8end) {
				547	u16measuredLen++;
				548	int u8charLen = utf8_codepoint_len(*u8cur);
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame^]	549	// Malformed utf8, some characters are beyond the end.
				550	// Cases:
				551	// If u8charLen == 1, this becomes u8cur >= u8end, which cannot happen as u8cur < u8end,
				552	// then this condition fail and we continue, as expected.
				553	// If u8charLen == 2, this becomes u8cur + 1 >= u8end, which fails only if
				554	// u8cur == u8end - 1, that is, there was only one remaining character to read but we need
				555	// 2 of them. This condition holds and we return -1, as expected.
				556	if (u8cur + u8charLen - 1 >= u8end) {
				557	if (overreadIsFatal) {
				558	LOG_ALWAYS_FATAL("Attempt to overread computing length of utf8 string");
				559	} else {
				560	return -1;
				561	}
				562	}
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	563	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
				564	if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
				565	u8cur += u8charLen;
				566	}
				567
				568	/**
				569	* Make sure that we ended where we thought we would and the output UTF-16
				570	* will be exactly how long we were told it would be.
				571	*/
				572	if (u8cur != u8end) {
				573	return -1;
				574	}
				575
				576	return u16measuredLen;
				577	}
				578
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame^]	579	char16_t* utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str, size_t u16len) {
				580	// A value > SSIZE_MAX is probably a negative value returned as an error and casted.
				581	LOG_ALWAYS_FATAL_IF(u16len == 0 \|\| u16len > SSIZE_MAX, "u16len is %zu", u16len);
				582	char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str, u16len - 1);
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	583	*end = 0;
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame^]	584	return end;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	585	}
				586
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame^]	587	char16_t* utf8_to_utf16_no_null_terminator(
				588	const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) {
				589	if (dstLen == 0) {
				590	return dst;
				591	}
				592	// A value > SSIZE_MAX is probably a negative value returned as an error and casted.
				593	LOG_ALWAYS_FATAL_IF(dstLen > SSIZE_MAX, "dstLen is %zu", dstLen);
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	594	const uint8_t* const u8end = src + srcLen;
				595	const uint8_t* u8cur = src;
Mark Salyzyn	5bed803	2014-04-30 11:10:46 -0700	[diff] [blame]	596	const char16_t* const u16end = dst + dstLen;
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	597	char16_t* u16cur = dst;
				598
				599	while (u8cur < u8end && u16cur < u16end) {
				600	size_t u8len = utf8_codepoint_len(*u8cur);
				601	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				602
				603	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				604	if (codepoint <= 0xFFFF) {
				605	// Single UTF16 character
				606	*u16cur++ = (char16_t) codepoint;
				607	} else {
				608	// Multiple UTF16 characters with surrogates
				609	codepoint = codepoint - 0x10000;
				610	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				611	if (u16cur >= u16end) {
				612	// Ooops... not enough room for this surrogate pair.
				613	return u16cur-1;
				614	}
				615	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				616	}
				617
				618	u8cur += u8len;
				619	}
				620	return u16cur;
				621	}
				622
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	623	}