Blame - libutils/Unicode.cpp - android_system_core

blob: 364a1778c6320832316f17b28024bfdc399ef2cc [file] [log] [blame]

Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2005 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
Mark Salyzyn	cfd5b08	2016-10-17 14:28:00 -0700	[diff] [blame]	17	#define LOG_TAG "unicode"
				18
Chih-Hung Hsieh	502f486	2018-09-13 11:08:41 -0700	[diff] [blame]	19	#include <android-base/macros.h>
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	20	#include <limits.h>
Chih-Hung Hsieh	502f486	2018-09-13 11:08:41 -0700	[diff] [blame]	21	#include <utils/Unicode.h>
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	22
Mark Salyzyn	30f991f	2017-01-10 13:19:54 -0800	[diff] [blame]	23	#include <log/log.h>
Mark Salyzyn	ff2dcd9	2016-09-28 15:54:45 -0700	[diff] [blame]	24
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	25	extern "C" {
				26
				27	static const char32_t kByteMask = 0x000000BF;
				28	static const char32_t kByteMark = 0x00000080;
				29
				30	// Surrogates aren't valid for UTF-32 characters, so define some
				31	// constants that will let us screen them out.
				32	static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
Andreas Gampe	a53c815	2014-11-24 09:42:07 -0800	[diff] [blame]	33	// Unused, here for completeness:
				34	// static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
				35	// static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	36	static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
				37	static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
				38	static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
				39	static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
				40
				41	// Mask used to set appropriate bits in first byte of UTF-8 sequence,
				42	// indexed by number of bytes in the sequence.
				43	// 0xxxxxxx
				44	// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
				45	// 110yyyyx 10xxxxxx
				46	// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
				47	// 1110yyyy 10yxxxxx 10xxxxxx
				48	// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
				49	// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
				50	// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
				51	static const char32_t kFirstByteMark[] = {
				52	0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
				53	};
				54
				55	// --------------------------------------------------------------------------
				56	// UTF-32
				57	// --------------------------------------------------------------------------
				58
				59	/**
				60	* Return number of UTF-8 bytes required for the character. If the character
				61	* is invalid, return size of 0.
				62	*/
				63	static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
				64	{
				65	// Figure out how many bytes the result will require.
				66	if (srcChar < 0x00000080) {
				67	return 1;
				68	} else if (srcChar < 0x00000800) {
				69	return 2;
				70	} else if (srcChar < 0x00010000) {
				71	if ((srcChar < kUnicodeSurrogateStart) \|\| (srcChar > kUnicodeSurrogateEnd)) {
				72	return 3;
				73	} else {
				74	// Surrogates are invalid UTF-32 characters.
				75	return 0;
				76	}
				77	}
				78	// Max code point for Unicode is 0x0010FFFF.
				79	else if (srcChar <= kUnicodeMaxCodepoint) {
				80	return 4;
				81	} else {
				82	// Invalid UTF-32 character.
				83	return 0;
				84	}
				85	}
				86
				87	// Write out the source character to <dstP>.
				88
				89	static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
				90	{
				91	dstP += bytes;
				92	switch (bytes)
				93	{ /* note: everything falls through. */
				94	case 4: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
Chih-Hung Hsieh	502f486	2018-09-13 11:08:41 -0700	[diff] [blame]	95	FALLTHROUGH_INTENDED;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	96	case 3: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
Chih-Hung Hsieh	502f486	2018-09-13 11:08:41 -0700	[diff] [blame]	97	FALLTHROUGH_INTENDED;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	98	case 2: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
Chih-Hung Hsieh	502f486	2018-09-13 11:08:41 -0700	[diff] [blame]	99	FALLTHROUGH_INTENDED;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	100	case 1: *--dstP = (uint8_t)(srcChar \| kFirstByteMark[bytes]);
				101	}
				102	}
				103
// Decodes the UTF-8 sequence that starts at cur.
//
// The number of continuation bytes is taken from the run of 1 bits below the
// top bit of the first byte; continuation bytes are not validated and no
// buffer bound is checked. Stores the number of bytes consumed in *num_read
// and returns the decoded value.
static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
    const char lead = *cur;
    if ((lead & 0x80) == 0) { // ASCII
        *num_read = 1;
        return *cur;
    }

    const char* next = cur + 1;
    char32_t value = lead;
    char32_t strip = 0xFFFFFF80;  // bits of the lead byte that are not payload
    char32_t bit = 0x40;
    size_t consumed = 1;

    // One continuation byte for each additional 1 bit in the lead byte.
    while (lead & bit) {
        // 0x3F == 00111111: keep the six payload bits of the continuation byte.
        value = (value << 6) + (*next++ & 0x3F);
        consumed++;
        strip |= bit;
        bit >>= 1;
    }
    strip |= bit;

    // The lead byte's non-payload bits were shifted up by six for every
    // continuation byte; clear them at their shifted position.
    value &= ~(strip << (6 * (consumed - 1)));

    *num_read = consumed;
    return static_cast<int32_t>(value);
}
				127
				128	int32_t utf32_from_utf8_at(const char src, size_t src_len, size_t index, size_t next_index)
				129	{
				130	if (index >= src_len) {
				131	return -1;
				132	}
Dan Albert	ac4500e	2020-07-27 14:03:56 -0700	[diff] [blame]	133	size_t unused_index;
Yi Kong	e1731a4	2018-07-16 18:11:34 -0700	[diff] [blame]	134	if (next_index == nullptr) {
Dan Albert	ac4500e	2020-07-27 14:03:56 -0700	[diff] [blame]	135	next_index = &unused_index;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	136	}
				137	size_t num_read;
				138	int32_t ret = utf32_at_internal(src + index, &num_read);
				139	if (ret >= 0) {
				140	*next_index = index + num_read;
				141	}
				142
				143	return ret;
				144	}
				145
				146	ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
				147	{
Yi Kong	e1731a4	2018-07-16 18:11:34 -0700	[diff] [blame]	148	if (src == nullptr \|\| src_len == 0) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	149	return -1;
				150	}
				151
				152	size_t ret = 0;
				153	const char32_t *end = src + src_len;
				154	while (src < end) {
Adam Vartanian	47efc67	2017-08-14 15:51:29 +0100	[diff] [blame]	155	size_t char_len = utf32_codepoint_utf8_length(*src++);
				156	if (SSIZE_MAX - char_len < ret) {
				157	// If this happens, we would overflow the ssize_t type when
				158	// returning from this function, so we cannot express how
				159	// long this string is in an ssize_t.
				160	android_errorWriteLog(0x534e4554, "37723026");
				161	return -1;
				162	}
				163	ret += char_len;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	164	}
				165	return ret;
				166	}
				167
Sergio Giro	1cfa56d	2016-06-28 18:02:29 +0100	[diff] [blame]	168	void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst, size_t dst_len)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	169	{
Yi Kong	e1731a4	2018-07-16 18:11:34 -0700	[diff] [blame]	170	if (src == nullptr \|\| src_len == 0 \|\| dst == nullptr) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	171	return;
				172	}
				173
				174	const char32_t *cur_utf32 = src;
				175	const char32_t *end_utf32 = src + src_len;
				176	char *cur = dst;
				177	while (cur_utf32 < end_utf32) {
				178	size_t len = utf32_codepoint_utf8_length(*cur_utf32);
Sergio Giro	1cfa56d	2016-06-28 18:02:29 +0100	[diff] [blame]	179	LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	180	utf32_codepoint_to_utf8((uint8_t )cur, cur_utf32++, len);
				181	cur += len;
Sergio Giro	1cfa56d	2016-06-28 18:02:29 +0100	[diff] [blame]	182	dst_len -= len;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	183	}
Sergio Giro	1cfa56d	2016-06-28 18:02:29 +0100	[diff] [blame]	184	LOG_ALWAYS_FATAL_IF(dst_len < 1, "dst_len < 1: %zu < 1", dst_len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	185	*cur = '\0';
				186	}
				187
				188	// --------------------------------------------------------------------------
				189	// UTF-16
				190	// --------------------------------------------------------------------------
				191
				192	int strcmp16(const char16_t s1, const char16_t s2)
				193	{
				194	char16_t ch;
				195	int d = 0;
				196
				197	while ( 1 ) {
				198	d = (int)(ch = s1++) - (int)s2++;
				199	if ( d \|\| !ch )
				200	break;
				201	}
				202
				203	return d;
				204	}
				205
				206	int strncmp16(const char16_t s1, const char16_t s2, size_t n)
				207	{
				208	char16_t ch;
				209	int d = 0;
				210
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	211	if (n == 0) {
				212	return 0;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	213	}
				214
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	215	do {
				216	d = (int)(ch = s1++) - (int)s2++;
				217	if ( d \|\| !ch ) {
				218	break;
				219	}
				220	} while (--n);
				221
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	222	return d;
				223	}
				224
/**
 * Returns the number of code units in the NUL-terminated UTF-16 string s,
 * not counting the terminator.
 */
size_t strlen16(const char16_t *s)
{
    size_t len = 0;
    while (s[len] != 0) {
        ++len;
    }
    return len;
}
				232
/**
 * Returns the number of code units before the terminator in s, looking at no
 * more than maxlen code units.
 */
size_t strnlen16(const char16_t *s, size_t maxlen)
{
    size_t len = 0;

    /* Important: the bound is tested before s[len] is read, because the
       unit just past the maximum may not be readable. */
    for (; len < maxlen; ++len) {
        if (s[len] == 0) {
            break;
        }
    }
    return len;
}
				245
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	246	char16_t* strstr16(const char16_t* src, const char16_t* target)
				247	{
Branislav Rankov	bf3fff1	2017-10-12 15:08:42 +0200	[diff] [blame]	248	const char16_t needle = *target;
				249	if (needle == '\0') return (char16_t*)src;
				250
				251	const size_t target_len = strlen16(++target);
				252	do {
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	253	do {
Branislav Rankov	bf3fff1	2017-10-12 15:08:42 +0200	[diff] [blame]	254	if (*src == '\0') {
				255	return nullptr;
				256	}
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	257	} while (*src++ != needle);
Branislav Rankov	bf3fff1	2017-10-12 15:08:42 +0200	[diff] [blame]	258	} while (strncmp16(src, target, target_len) != 0);
				259	src--;
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	260
				261	return (char16_t*)src;
				262	}
				263
/**
 * Compares two UTF-16 strings given by explicit lengths (in code units).
 *
 * The common prefix of min(n1, n2) units is compared first; the first
 * differing pair decides the result (s1's unit minus s2's unit). When the
 * prefixes are equal, the result is derived from the first extra unit of the
 * longer string: negative of it when s2 is longer, the unit itself when s1
 * is longer, and 0 when the lengths are equal.
 */
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const char16_t* e1 = s1+n1;
    const char16_t* e2 = s2+n2;

    while (s1 < e1 && s2 < e2) {
        // Compare the code units themselves, advancing both cursors.
        const int d = (int)*s1++ - (int)*s2++;
        if (d) {
            return d;
        }
    }

    // Here the shorter string is exhausted, so the longer one's cursor still
    // points at a valid unit inside its own range.
    return n1 < n2
        ? (0 - (int)*s2)
        : (n1 > n2
           ? ((int)*s1 - 0)
           : 0);
}
				282
// is_any_surrogate() reports whether w lies in the UTF-16 surrogate range
// 0xd800..0xdfff, i.e. is either a high or a low surrogate.
static constexpr bool is_any_surrogate(char16_t w) {
    return w >= 0xd800 && w <= 0xdfff;
}
				287
				288	// is_surrogate_pair() returns true if w1 and w2 form a valid surrogate pair
				289	static constexpr bool is_surrogate_pair(char16_t w1, char16_t w2) {
				290	return ((w1 & 0xfc00) == 0xd800) && ((w2 & 0xfc00) == 0xdc00);
				291	}
				292
				293	// TODO: currently utf16_to_utf8_length() returns -1 if src_len == 0,
				294	// which is inconsistent with utf8_to_utf16_length(), here we keep the
				295	// current behavior as intended not to break compatibility
				296	ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
				297	{
				298	if (src == nullptr \|\| src_len == 0)
				299	return -1;
				300
				301	const char16_t* const end = src + src_len;
				302	const char16_t* in = src;
				303	size_t utf8_len = 0;
				304
				305	while (in < end) {
				306	char16_t w = *in++;
				307	if (LIKELY(w < 0x0080)) {
				308	utf8_len += 1;
				309	continue;
				310	}
				311	if (LIKELY(w < 0x0800)) {
				312	utf8_len += 2;
				313	continue;
				314	}
				315	if (LIKELY(!is_any_surrogate(w))) {
				316	utf8_len += 3;
				317	continue;
				318	}
				319	if (in < end && is_surrogate_pair(w, *in)) {
				320	utf8_len += 4;
				321	in++;
				322	continue;
				323	}
				324	/* skip if at the end of the string or invalid surrogate pair */
				325	}
				326	return (in == end && utf8_len < SSIZE_MAX) ? utf8_len : -1;
				327	}
				328
Sergio Giro	1cfa56d	2016-06-28 18:02:29 +0100	[diff] [blame]	329	void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	330	{
Yi Kong	e1731a4	2018-07-16 18:11:34 -0700	[diff] [blame]	331	if (src == nullptr \|\| src_len == 0 \|\| dst == nullptr) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	332	return;
				333	}
				334
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	335	const char16_t* in = src;
				336	const char16_t* const in_end = src + src_len;
				337	char* out = dst;
				338	const char* const out_end = dst + dst_len;
				339	char16_t w2;
				340
				341	auto err_out = [&out, &out_end, &dst_len]() {
				342	LOG_ALWAYS_FATAL_IF(out >= out_end,
				343	"target utf8 string size %zu too short", dst_len);
				344	};
				345
				346	while (in < in_end) {
				347	char16_t w = *in++;
				348	if (LIKELY(w < 0x0080)) {
				349	if (out + 1 > out_end)
				350	return err_out();
				351	*out++ = (char)(w & 0xff);
				352	continue;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	353	}
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	354	if (LIKELY(w < 0x0800)) {
				355	if (out + 2 > out_end)
				356	return err_out();
				357	*out++ = (char)(0xc0 \| ((w >> 6) & 0x1f));
				358	*out++ = (char)(0x80 \| ((w >> 0) & 0x3f));
				359	continue;
				360	}
				361	if (LIKELY(!is_any_surrogate(w))) {
				362	if (out + 3 > out_end)
				363	return err_out();
				364	*out++ = (char)(0xe0 \| ((w >> 12) & 0xf));
				365	*out++ = (char)(0x80 \| ((w >> 6) & 0x3f));
				366	*out++ = (char)(0x80 \| ((w >> 0) & 0x3f));
				367	continue;
				368	}
				369	/* surrogate pair */
				370	if (in < in_end && (w2 = *in, is_surrogate_pair(w, w2))) {
				371	if (out + 4 > out_end)
				372	return err_out();
				373	char32_t dw = (char32_t)(0x10000 + ((w - 0xd800) << 10) + (w2 - 0xdc00));
				374	*out++ = (char)(0xf0 \| ((dw >> 18) & 0x07));
				375	*out++ = (char)(0x80 \| ((dw >> 12) & 0x3f));
				376	*out++ = (char)(0x80 \| ((dw >> 6) & 0x3f));
				377	*out++ = (char)(0x80 \| ((dw >> 0) & 0x3f));
				378	in++;
				379	}
				380	/* We reach here in two cases:
				381	* 1) (in == in_end), which means end of the input string
				382	* 2) (w2 & 0xfc00) != 0xdc00, which means invalid surrogate pair
				383	* In either case, we intentionally do nothing and skip
				384	*/
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	385	}
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	386	*out = '\0';
				387	return;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	388	}
				389
				390	// --------------------------------------------------------------------------
				391	// UTF-8
				392	// --------------------------------------------------------------------------
				393
// Assembles the value carried by a 4-byte UTF-8 sequence: the low 3 bits of
// c1 followed by the low 6 bits of each of c2, c3 and c4. The bytes are not
// validated.
static char32_t utf8_4b_to_utf32(uint8_t c1, uint8_t c2, uint8_t c3, uint8_t c4) {
    char32_t value = c1 & 0x07;
    value = (value << 6) | (c2 & 0x3f);
    value = (value << 6) | (c3 & 0x3f);
    value = (value << 6) | (c4 & 0x3f);
    return value;
}
				397
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	398	// TODO: current behavior of converting UTF8 to UTF-16 has a few issues below
				399	//
				400	// 1. invalid trailing bytes (i.e. not b'10xxxxxx) are treated as valid trailing
				401	// bytes and follows normal conversion rules
				402	// 2. invalid leading byte (b'10xxxxxx) is treated as a valid single UTF-8 byte
				403	// 3. invalid leading byte (b'11111xxx) is treated as a valid leading byte
				404	// (same as b'11110xxx) for a 4-byte UTF-8 sequence
				405	// 4. an invalid 4-byte UTF-8 sequence that translates to a codepoint < U+10000
				406	// will be converted as a valid UTF-16 character
				407	//
				408	// We keep the current behavior as is but with warnings logged, so as not to
				409	// break compatibility. However, this needs to be addressed later.
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	410
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	411	ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len, bool overreadIsFatal)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	412	{
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	413	if (u8str == nullptr)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	414	return -1;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	415
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	416	const uint8_t* const in_end = u8str + u8len;
				417	const uint8_t* in = u8str;
				418	size_t utf16_len = 0;
				419
				420	while (in < in_end) {
				421	uint8_t c = *in;
				422	utf16_len++;
				423	if (LIKELY((c & 0x80) == 0)) {
				424	in++;
				425	continue;
				426	}
				427	if (UNLIKELY(c < 0xc0)) {
				428	ALOGW("Invalid UTF-8 leading byte: 0x%02x", c);
				429	in++;
				430	continue;
				431	}
				432	if (LIKELY(c < 0xe0)) {
				433	in += 2;
				434	continue;
				435	}
				436	if (LIKELY(c < 0xf0)) {
				437	in += 3;
				438	continue;
				439	} else {
				440	uint8_t c2, c3, c4;
				441	if (UNLIKELY(c >= 0xf8)) {
				442	ALOGW("Invalid UTF-8 leading byte: 0x%02x", c);
				443	}
				444	c2 = in[1]; c3 = in[2]; c4 = in[3];
				445	if (utf8_4b_to_utf32(c, c2, c3, c4) >= 0x10000) {
				446	utf16_len++;
				447	}
				448	in += 4;
				449	continue;
				450	}
				451	}
				452	if (in == in_end) {
				453	return utf16_len < SSIZE_MAX ? utf16_len : -1;
				454	}
				455	if (overreadIsFatal)
				456	LOG_ALWAYS_FATAL("Attempt to overread computing length of utf8 string");
				457	return -1;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	458	}
				459
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	460	char16_t* utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str, size_t u16len) {
				461	// A value > SSIZE_MAX is probably a negative value returned as an error and casted.
				462	LOG_ALWAYS_FATAL_IF(u16len == 0 \|\| u16len > SSIZE_MAX, "u16len is %zu", u16len);
				463	char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str, u16len - 1);
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	464	*end = 0;
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	465	return end;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	466	}
				467
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	468	char16_t* utf8_to_utf16_no_null_terminator(
				469	const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) {
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	470	if (src == nullptr \|\| srcLen == 0 \|\| dstLen == 0) {
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	471	return dst;
				472	}
				473	// A value > SSIZE_MAX is probably a negative value returned as an error and casted.
				474	LOG_ALWAYS_FATAL_IF(dstLen > SSIZE_MAX, "dstLen is %zu", dstLen);
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	475
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	476	const uint8_t* const in_end = src + srcLen;
				477	const uint8_t* in = src;
				478	const char16_t* const out_end = dst + dstLen;
				479	char16_t* out = dst;
				480	uint8_t c, c2, c3, c4;
				481	char32_t w;
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	482
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	483	auto err_in = [&c, &out]() {
				484	ALOGW("Unended UTF-8 byte: 0x%02x", c);
				485	return out;
				486	};
				487
				488	while (in < in_end && out < out_end) {
				489	c = *in++;
				490	if (LIKELY((c & 0x80) == 0)) {
				491	*out++ = (char16_t)(c);
				492	continue;
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	493	}
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	494	if (UNLIKELY(c < 0xc0)) {
				495	ALOGW("Invalid UTF-8 leading byte: 0x%02x", c);
				496	*out++ = (char16_t)(c);
				497	continue;
				498	}
				499	if (LIKELY(c < 0xe0)) {
				500	if (UNLIKELY(in + 1 > in_end)) {
				501	return err_in();
				502	}
				503	c2 = *in++;
				504	*out++ = (char16_t)(((c & 0x1f) << 6) \| (c2 & 0x3f));
				505	continue;
				506	}
				507	if (LIKELY(c < 0xf0)) {
				508	if (UNLIKELY(in + 2 > in_end)) {
				509	return err_in();
				510	}
				511	c2 = in++; c3 = in++;
				512	*out++ = (char16_t)(((c & 0x0f) << 12) \|
				513	((c2 & 0x3f) << 6) \| (c3 & 0x3f));
				514	continue;
				515	} else {
				516	if (UNLIKELY(in + 3 > in_end)) {
				517	return err_in();
				518	}
				519	if (UNLIKELY(c >= 0xf8)) {
				520	ALOGW("Invalid UTF-8 leading byte: 0x%02x", c);
				521	}
				522	// Multiple UTF16 characters with surrogates
				523	c2 = in++; c3 = in++; c4 = *in++;
				524	w = utf8_4b_to_utf32(c, c2, c3, c4);
				525	if (UNLIKELY(w < 0x10000)) {
				526	*out++ = (char16_t)(w);
				527	} else {
				528	if (UNLIKELY(out + 2 > out_end)) {
				529	// Ooops.... not enough room for this surrogate pair.
				530	return out;
				531	}
				532	*out++ = (char16_t)(((w - 0x10000) >> 10) + 0xd800);
				533	*out++ = (char16_t)(((w - 0x10000) & 0x3ff) + 0xdc00);
				534	}
				535	continue;
				536	}
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	537	}
Eric Miao	cb199b4	2022-11-30 16:05:49 -0800	[diff] [blame^]	538	return out;
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	539	}
				540
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	541	}