Blame - include/utils/Unicode.h - android_system_core

blob: a13f347793030fe8f307997ccbb0918239fbbfd9 [file] [log] [blame]

Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2005 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef ANDROID_UNICODE_H
				18	#define ANDROID_UNICODE_H
				19
				20	#include <sys/types.h>
				21	#include <stdint.h>
				22
				23	extern "C" {
				24
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	25	// Standard string functions on char16_t strings (all strings are passed by pointer).
				26	int strcmp16(const char16_t *, const char16_t *);
				27	int strncmp16(const char16_t *s1, const char16_t *s2, size_t n);
				28	size_t strlen16(const char16_t *);
				29	size_t strnlen16(const char16_t *, size_t);
				30	char16_t *strcpy16(char16_t *, const char16_t *);
				31	char16_t *strncpy16(char16_t *, const char16_t *, size_t);
Michael Wright	5bacef3	2016-05-09 14:43:31 +0100	[diff] [blame]	32	char16_t *strstr16(const char16_t*, const char16_t*);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	33
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	34	// Version of comparison that supports embedded NULs.
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	35	// This is different than strncmp() because we don't stop
				36	// at a nul character and consider the strings to be different
				37	// if the lengths are different (thus we need to supply the
				38	// lengths of both strings). This can also be used when
				39	// your string is not nul-terminated as it will have the
				40	// equivalent result as strcmp16 (unlike strncmp16).
				41	int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2);
				42
				43	// Version of strzcmp16 for comparing strings in different endianness.
				44	int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2);
				45
				46	// Standard string functions on char32_t strings.
				47	size_t strlen32(const char32_t *);
				48	size_t strnlen32(const char32_t *, size_t);
				49
				50	/**
				51	* Measure the length of a UTF-32 string in UTF-8. If the string is invalid
				52	* such as containing a surrogate character, -1 will be returned.
				53	*/
				54	ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len);
				55
				56	/**
				57	* Stores a UTF-8 string converted from "src" in "dst", if "dst_length" is not
				58	* large enough to store the string, the part of the "src" string is stored
				59	* into "dst" as much as possible. See the examples for more detail.
				60	* Returns the size actually used for storing the string.
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	61	* "dst" is not nul-terminated when dst_len is fully used (like strncpy).
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	62	*
				63	* Example 1
				64	* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
				65	* "src_len" == 2
				66	* "dst_len" >= 7
				67	* ->
				68	* Returned value == 6
				69	* "dst" becomes \xE3\x81\x82\xE3\x81\x84\0
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	70	* (note that "dst" is nul-terminated)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	71	*
				72	* Example 2
				73	* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
				74	* "src_len" == 2
				75	* "dst_len" == 5
				76	* ->
				77	* Returned value == 3
				78	* "dst" becomes \xE3\x81\x82\0
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	79	* (note that "dst" is nul-terminated, but \u3044 is not stored in "dst"
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	80	* since "dst" does not have enough size to store the character)
				81	*
				82	* Example 3
				83	* "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84)
				84	* "src_len" == 2
				85	* "dst_len" == 6
				86	* ->
				87	* Returned value == 6
				88	* "dst" becomes \xE3\x81\x82\xE3\x81\x84
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	89	* (note that "dst" is NOT nul-terminated, like strncpy)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	90	*/
Sergio Giro	1cfa56d	2016-06-28 18:02:29 +0100	[diff] [blame]	91	void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst, size_t dst_len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	92
				93	/**
				94	* Returns the unicode value at "index".
				95	* Returns -1 when the index is invalid (equal to or greater than "src_len").
				96	* If the returned value is positive, it can be converted to char32_t, which
				97	* is unsigned. Then, if "next_index" is not NULL, the next index to be used is
				98	* stored in "next_index". "next_index" can be NULL.
				99	*/
				100	int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index);
				101
				102
				103	/**
				104	* Returns the UTF-8 length of UTF-16 string "src".
				105	*/
				106	ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len);
				107
				108	/**
				109	* Converts a UTF-16 string to UTF-8. The destination buffer must be large
				110	* enough to fit the UTF-16 as measured by utf16_to_utf8_length with an added
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	111	* NUL terminator.
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	112	*/
Sergio Giro	1cfa56d	2016-06-28 18:02:29 +0100	[diff] [blame]	113	void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	114
				115	/**
				116	* Returns the length of "src" when "src" is valid UTF-8 string.
				117	* Returns 0 if src is NULL or 0-length string. Returns -1 when the source
				118	* is an invalid string.
				119	*
				120	* This function should be used to determine whether "src" is valid UTF-8
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	121	* characters with valid unicode codepoints. "src" must be nul-terminated.
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	122	*
				123	* If you are going to use other utf8_to_... functions defined in this header
				124	* with a string which may not be valid UTF-8 with valid codepoints (from 0 to
				125	* 0x10FFFF), you should use this function before calling others, since the
				126	* other functions do not check whether the string is valid UTF-8 or not.
				127	*
				128	* If you do not care whether "src" is valid UTF-8 or not, you should use
				129	* strlen() as usual, which should be much faster.
				130	*/
				131	ssize_t utf8_length(const char *src);
				132
				133	/**
				134	* Measure the length, in UTF-32 characters, of the UTF-8 string "src".
				135	*/
				136	size_t utf8_to_utf32_length(const char *src, size_t src_len);
				137
				138	/**
				139	* Stores a UTF-32 string converted from "src" in "dst". "dst" must be large
				140	* enough to store the entire converted string as measured by
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	141	* utf8_to_utf32_length plus space for a NUL terminator.
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	142	*/
				143	void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst);
				144
				145	/**
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	146	* Returns the UTF-16 length of UTF-8 string "src". Returns -1 in case
				147	* it's invalid utf8. No buffer over-read occurs because of bound checks. Using overreadIsFatal you
				148	* can ask to log a message and fail in case the invalid utf8 could have caused an over-read if no
				149	* bound checks were used (otherwise -1 is returned).
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	150	*/
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	151	ssize_t utf8_to_utf16_length(const uint8_t* src, size_t srcLen, bool overreadIsFatal = false);
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	152
				153	/**
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	154	* Convert UTF-8 to UTF-16 including surrogate pairs.
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	155	* Returns a pointer to the end of the string (where a NUL terminator might go
				156	* if you wanted to add one). At most dstLen characters are written; it won't emit half a surrogate
				157	* pair. If dstLen == 0 nothing is written and dst is returned. If dstLen > SSIZE_MAX it aborts
				158	* (this probably being a negative number returned as an error and cast to unsigned).
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	159	*/
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	160	char16_t* utf8_to_utf16_no_null_terminator(
				161	const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen);
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	162
				163	/**
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	164	* Convert UTF-8 to UTF-16 including surrogate pairs. At most dstLen - 1
				165	* characters are written; it won't emit half a surrogate pair; and a NUL terminator is appended
				166	* after. dstLen - 1 can be measured beforehand using utf8_to_utf16_length. Aborts if dstLen == 0
				167	* (at least one character is needed for the NUL terminator) or dstLen > SSIZE_MAX (the latter
				168	* case likely being a negative number returned as an error and cast to unsigned). Returns a
				169	* pointer to the NUL terminator.
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	170	*/
Sergio Giro	9de6776	2016-07-20 20:01:33 +0100	[diff] [blame]	171	char16_t *utf8_to_utf16(
				172	const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen);
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	173
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	174	}
				175
				176	#endif