Blame - libutils/Unicode.cpp - android_system_core

blob: fb876c91f0909d396497306ba953f15abb84af88 [file] [log] [blame]

Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2005 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include <utils/Unicode.h>
				18
				19	#include <stddef.h>
				20
				21	#ifdef HAVE_WINSOCK
				22	# undef nhtol
				23	# undef htonl
				24	# undef nhtos
				25	# undef htons
				26
Elliott Hughes	97ac0e1	2014-11-21 23:01:59 -0800	[diff] [blame]	27	# define ntohl(x) ( ((x) << 24) \| (((x) >> 24) & 255) \| (((x) << 8) & 0xff0000) \| (((x) >> 8) & 0xff00) )
				28	# define htonl(x) ntohl(x)
				29	# define ntohs(x) ( (((x) << 8) & 0xff00) \| (((x) >> 8) & 255) )
				30	# define htons(x) ntohs(x)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	31	#else
				32	# include <netinet/in.h>
				33	#endif
				34
				35	extern "C" {
				36
				37	static const char32_t kByteMask = 0x000000BF;
				38	static const char32_t kByteMark = 0x00000080;
				39
				40	// Surrogates aren't valid for UTF-32 characters, so define some
				41	// constants that will let us screen them out.
				42	static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
Andreas Gampe	a53c815	2014-11-24 09:42:07 -0800	[diff] [blame]	43	// Unused, here for completeness:
				44	// static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
				45	// static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	46	static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
				47	static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
				48	static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
				49	static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
				50
				51	// Mask used to set appropriate bits in first byte of UTF-8 sequence,
				52	// indexed by number of bytes in the sequence.
				53	// 0xxxxxxx
				54	// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
				55	// 110yyyyx 10xxxxxx
				56	// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
				57	// 1110yyyy 10yxxxxx 10xxxxxx
				58	// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
				59	// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
				60	// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
				61	static const char32_t kFirstByteMark[] = {
				62	0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
				63	};
				64
				65	// --------------------------------------------------------------------------
				66	// UTF-32
				67	// --------------------------------------------------------------------------
				68
				69	/**
				70	* Return number of UTF-8 bytes required for the character. If the character
				71	* is invalid, return size of 0.
				72	*/
				73	static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
				74	{
				75	// Figure out how many bytes the result will require.
				76	if (srcChar < 0x00000080) {
				77	return 1;
				78	} else if (srcChar < 0x00000800) {
				79	return 2;
				80	} else if (srcChar < 0x00010000) {
				81	if ((srcChar < kUnicodeSurrogateStart) \|\| (srcChar > kUnicodeSurrogateEnd)) {
				82	return 3;
				83	} else {
				84	// Surrogates are invalid UTF-32 characters.
				85	return 0;
				86	}
				87	}
				88	// Max code point for Unicode is 0x0010FFFF.
				89	else if (srcChar <= kUnicodeMaxCodepoint) {
				90	return 4;
				91	} else {
				92	// Invalid UTF-32 character.
				93	return 0;
				94	}
				95	}
				96
				97	// Write out the source character to <dstP>.
				98
				99	static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
				100	{
				101	dstP += bytes;
				102	switch (bytes)
				103	{ /* note: everything falls through. */
				104	case 4: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				105	case 3: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				106	case 2: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				107	case 1: *--dstP = (uint8_t)(srcChar \| kFirstByteMark[bytes]);
				108	}
				109	}
				110
				111	size_t strlen32(const char32_t *s)
				112	{
				113	const char32_t *ss = s;
				114	while ( *ss )
				115	ss++;
				116	return ss-s;
				117	}
				118
				119	size_t strnlen32(const char32_t *s, size_t maxlen)
				120	{
				121	const char32_t *ss = s;
				122	while ((maxlen > 0) && *ss) {
				123	ss++;
				124	maxlen--;
				125	}
				126	return ss-s;
				127	}
				128
				129	static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
				130	{
				131	const char first_char = *cur;
				132	if ((first_char & 0x80) == 0) { // ASCII
				133	*num_read = 1;
				134	return *cur;
				135	}
				136	cur++;
				137	char32_t mask, to_ignore_mask;
				138	size_t num_to_read = 0;
				139	char32_t utf32 = first_char;
				140	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
				141	(first_char & mask);
				142	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				143	// 0x3F == 00111111
				144	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				145	}
				146	to_ignore_mask \|= mask;
				147	utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
				148
				149	*num_read = num_to_read;
				150	return static_cast<int32_t>(utf32);
				151	}
				152
				153	int32_t utf32_from_utf8_at(const char src, size_t src_len, size_t index, size_t next_index)
				154	{
				155	if (index >= src_len) {
				156	return -1;
				157	}
				158	size_t dummy_index;
				159	if (next_index == NULL) {
				160	next_index = &dummy_index;
				161	}
				162	size_t num_read;
				163	int32_t ret = utf32_at_internal(src + index, &num_read);
				164	if (ret >= 0) {
				165	*next_index = index + num_read;
				166	}
				167
				168	return ret;
				169	}
				170
				171	ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
				172	{
				173	if (src == NULL \|\| src_len == 0) {
				174	return -1;
				175	}
				176
				177	size_t ret = 0;
				178	const char32_t *end = src + src_len;
				179	while (src < end) {
				180	ret += utf32_codepoint_utf8_length(*src++);
				181	}
				182	return ret;
				183	}
				184
				185	void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst)
				186	{
				187	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				188	return;
				189	}
				190
				191	const char32_t *cur_utf32 = src;
				192	const char32_t *end_utf32 = src + src_len;
				193	char *cur = dst;
				194	while (cur_utf32 < end_utf32) {
				195	size_t len = utf32_codepoint_utf8_length(*cur_utf32);
				196	utf32_codepoint_to_utf8((uint8_t )cur, cur_utf32++, len);
				197	cur += len;
				198	}
				199	*cur = '\0';
				200	}
				201
				202	// --------------------------------------------------------------------------
				203	// UTF-16
				204	// --------------------------------------------------------------------------
				205
				206	int strcmp16(const char16_t s1, const char16_t s2)
				207	{
				208	char16_t ch;
				209	int d = 0;
				210
				211	while ( 1 ) {
				212	d = (int)(ch = s1++) - (int)s2++;
				213	if ( d \|\| !ch )
				214	break;
				215	}
				216
				217	return d;
				218	}
				219
				220	int strncmp16(const char16_t s1, const char16_t s2, size_t n)
				221	{
				222	char16_t ch;
				223	int d = 0;
				224
				225	while ( n-- ) {
				226	d = (int)(ch = s1++) - (int)s2++;
				227	if ( d \|\| !ch )
				228	break;
				229	}
				230
				231	return d;
				232	}
				233
				234	char16_t strcpy16(char16_t dst, const char16_t *src)
				235	{
				236	char16_t *q = dst;
				237	const char16_t *p = src;
				238	char16_t ch;
				239
				240	do {
				241	q++ = ch = p++;
				242	} while ( ch );
				243
				244	return dst;
				245	}
				246
				247	size_t strlen16(const char16_t *s)
				248	{
				249	const char16_t *ss = s;
				250	while ( *ss )
				251	ss++;
				252	return ss-s;
				253	}
				254
				255
				256	char16_t strncpy16(char16_t dst, const char16_t *src, size_t n)
				257	{
				258	char16_t *q = dst;
				259	const char16_t *p = src;
				260	char ch;
				261
				262	while (n) {
				263	n--;
				264	q++ = ch = p++;
				265	if ( !ch )
				266	break;
				267	}
				268
				269	*q = 0;
				270
				271	return dst;
				272	}
				273
				274	size_t strnlen16(const char16_t *s, size_t maxlen)
				275	{
				276	const char16_t *ss = s;
				277
				278	/* Important: the maxlen test must precede the reference through ss;
				279	since the byte beyond the maximum may segfault */
				280	while ((maxlen > 0) && *ss) {
				281	ss++;
				282	maxlen--;
				283	}
				284	return ss-s;
				285	}
				286
				287	int strzcmp16(const char16_t s1, size_t n1, const char16_t s2, size_t n2)
				288	{
				289	const char16_t* e1 = s1+n1;
				290	const char16_t* e2 = s2+n2;
				291
				292	while (s1 < e1 && s2 < e2) {
				293	const int d = (int)s1++ - (int)s2++;
				294	if (d) {
				295	return d;
				296	}
				297	}
				298
				299	return n1 < n2
				300	? (0 - (int)*s2)
				301	: (n1 > n2
				302	? ((int)*s1 - 0)
				303	: 0);
				304	}
				305
				306	int strzcmp16_h_n(const char16_t s1H, size_t n1, const char16_t s2N, size_t n2)
				307	{
				308	const char16_t* e1 = s1H+n1;
				309	const char16_t* e2 = s2N+n2;
				310
				311	while (s1H < e1 && s2N < e2) {
				312	const char16_t c2 = ntohs(*s2N);
				313	const int d = (int)*s1H++ - (int)c2;
				314	s2N++;
				315	if (d) {
				316	return d;
				317	}
				318	}
				319
				320	return n1 < n2
				321	? (0 - (int)ntohs(*s2N))
				322	: (n1 > n2
				323	? ((int)*s1H - 0)
				324	: 0);
				325	}
				326
				327	void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst)
				328	{
				329	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				330	return;
				331	}
				332
				333	const char16_t* cur_utf16 = src;
				334	const char16_t* const end_utf16 = src + src_len;
				335	char *cur = dst;
				336	while (cur_utf16 < end_utf16) {
				337	char32_t utf32;
				338	// surrogate pairs
Cylen Yao	72299bf	2014-06-04 19:11:27 +0800	[diff] [blame]	339	if((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16
				340	&& (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	341	utf32 = (*cur_utf16++ - 0xD800) << 10;
				342	utf32 \|= *cur_utf16++ - 0xDC00;
				343	utf32 += 0x10000;
				344	} else {
				345	utf32 = (char32_t) *cur_utf16++;
				346	}
				347	const size_t len = utf32_codepoint_utf8_length(utf32);
				348	utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
				349	cur += len;
				350	}
				351	*cur = '\0';
				352	}
				353
				354	// --------------------------------------------------------------------------
				355	// UTF-8
				356	// --------------------------------------------------------------------------
				357
				358	ssize_t utf8_length(const char *src)
				359	{
				360	const char *cur = src;
				361	size_t ret = 0;
				362	while (*cur != '\0') {
				363	const char first_char = *cur++;
				364	if ((first_char & 0x80) == 0) { // ASCII
				365	ret += 1;
				366	continue;
				367	}
				368	// (UTF-8's character must not be like 10xxxxxx,
				369	// but 110xxxxx, 1110xxxx, ... or 1111110x)
				370	if ((first_char & 0x40) == 0) {
				371	return -1;
				372	}
				373
				374	int32_t mask, to_ignore_mask;
				375	size_t num_to_read = 0;
				376	char32_t utf32 = 0;
				377	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
				378	num_to_read < 5 && (first_char & mask);
				379	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				380	if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
				381	return -1;
				382	}
				383	// 0x3F == 00111111
				384	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				385	}
				386	// "first_char" must be (110xxxxx - 11110xxx)
				387	if (num_to_read == 5) {
				388	return -1;
				389	}
				390	to_ignore_mask \|= mask;
				391	utf32 \|= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
				392	if (utf32 > kUnicodeMaxCodepoint) {
				393	return -1;
				394	}
				395
				396	ret += num_to_read;
				397	}
				398	return ret;
				399	}
				400
				401	ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
				402	{
				403	if (src == NULL \|\| src_len == 0) {
				404	return -1;
				405	}
				406
				407	size_t ret = 0;
				408	const char16_t* const end = src + src_len;
				409	while (src < end) {
				410	if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
				411	&& (*++src & 0xFC00) == 0xDC00) {
				412	// surrogate pairs are always 4 bytes.
				413	ret += 4;
				414	src++;
				415	} else {
				416	ret += utf32_codepoint_utf8_length((char32_t) *src++);
				417	}
				418	}
				419	return ret;
				420	}
				421
				422	/**
				423	* Returns 1-4 based on the number of leading bits.
				424	*
				425	* 1111 -> 4
				426	* 1110 -> 3
				427	* 110x -> 2
				428	* 10xx -> 1
				429	* 0xxx -> 1
				430	*/
				431	static inline size_t utf8_codepoint_len(uint8_t ch)
				432	{
				433	return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1;
				434	}
				435
				436	static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
				437	{
				438	*codePoint <<= 6;
				439	*codePoint \|= 0x3F & byte;
				440	}
				441
				442	size_t utf8_to_utf32_length(const char *src, size_t src_len)
				443	{
				444	if (src == NULL \|\| src_len == 0) {
				445	return 0;
				446	}
				447	size_t ret = 0;
				448	const char* cur;
				449	const char* end;
				450	size_t num_to_skip;
				451	for (cur = src, end = src + src_len, num_to_skip = 1;
				452	cur < end;
				453	cur += num_to_skip, ret++) {
				454	const char first_char = *cur;
				455	num_to_skip = 1;
				456	if ((first_char & 0x80) == 0) { // ASCII
				457	continue;
				458	}
				459	int32_t mask;
				460
				461	for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
				462	}
				463	}
				464	return ret;
				465	}
				466
				467	void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
				468	{
				469	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				470	return;
				471	}
				472
				473	const char* cur = src;
				474	const char* const end = src + src_len;
				475	char32_t* cur_utf32 = dst;
				476	while (cur < end) {
				477	size_t num_read;
				478	*cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
				479	cur += num_read;
				480	}
				481	*cur_utf32 = 0;
				482	}
				483
				484	static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
				485	{
				486	uint32_t unicode;
				487
				488	switch (length)
				489	{
				490	case 1:
				491	return src[0];
				492	case 2:
				493	unicode = src[0] & 0x1f;
				494	utf8_shift_and_mask(&unicode, src[1]);
				495	return unicode;
				496	case 3:
				497	unicode = src[0] & 0x0f;
				498	utf8_shift_and_mask(&unicode, src[1]);
				499	utf8_shift_and_mask(&unicode, src[2]);
				500	return unicode;
				501	case 4:
				502	unicode = src[0] & 0x07;
				503	utf8_shift_and_mask(&unicode, src[1]);
				504	utf8_shift_and_mask(&unicode, src[2]);
				505	utf8_shift_and_mask(&unicode, src[3]);
				506	return unicode;
				507	default:
				508	return 0xffff;
				509	}
				510
				511	//printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
				512	}
				513
				514	ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
				515	{
				516	const uint8_t* const u8end = u8str + u8len;
				517	const uint8_t* u8cur = u8str;
				518
				519	/* Validate that the UTF-8 is the correct len */
				520	size_t u16measuredLen = 0;
				521	while (u8cur < u8end) {
				522	u16measuredLen++;
				523	int u8charLen = utf8_codepoint_len(*u8cur);
				524	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
				525	if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
				526	u8cur += u8charLen;
				527	}
				528
				529	/**
				530	* Make sure that we ended where we thought we would and the output UTF-16
				531	* will be exactly how long we were told it would be.
				532	*/
				533	if (u8cur != u8end) {
				534	return -1;
				535	}
				536
				537	return u16measuredLen;
				538	}
				539
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	540	char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	541	{
				542	const uint8_t* const u8end = u8str + u8len;
				543	const uint8_t* u8cur = u8str;
				544	char16_t* u16cur = u16str;
				545
				546	while (u8cur < u8end) {
				547	size_t u8len = utf8_codepoint_len(*u8cur);
				548	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				549
				550	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				551	if (codepoint <= 0xFFFF) {
				552	// Single UTF16 character
				553	*u16cur++ = (char16_t) codepoint;
				554	} else {
				555	// Multiple UTF16 characters with surrogates
				556	codepoint = codepoint - 0x10000;
				557	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				558	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				559	}
				560
				561	u8cur += u8len;
				562	}
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	563	return u16cur;
				564	}
				565
				566	void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) {
				567	char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str);
				568	*end = 0;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	569	}
				570
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	571	char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) {
				572	const uint8_t* const u8end = src + srcLen;
				573	const uint8_t* u8cur = src;
Mark Salyzyn	5bed803	2014-04-30 11:10:46 -0700	[diff] [blame]	574	const char16_t* const u16end = dst + dstLen;
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	575	char16_t* u16cur = dst;
				576
				577	while (u8cur < u8end && u16cur < u16end) {
				578	size_t u8len = utf8_codepoint_len(*u8cur);
				579	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				580
				581	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				582	if (codepoint <= 0xFFFF) {
				583	// Single UTF16 character
				584	*u16cur++ = (char16_t) codepoint;
				585	} else {
				586	// Multiple UTF16 characters with surrogates
				587	codepoint = codepoint - 0x10000;
				588	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				589	if (u16cur >= u16end) {
				590	// Ooops... not enough room for this surrogate pair.
				591	return u16cur-1;
				592	}
				593	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				594	}
				595
				596	u8cur += u8len;
				597	}
				598	return u16cur;
				599	}
				600
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	601	}