Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2005 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
Mark Salyzyn | cfd5b08 | 2016-10-17 14:28:00 -0700 | [diff] [blame] | 17 | #define LOG_TAG "unicode" |
| 18 | |
Sergio Giro | 9de6776 | 2016-07-20 20:01:33 +0100 | [diff] [blame] | 19 | #include <limits.h> |
Chih-Hung Hsieh | 502f486 | 2018-09-13 11:08:41 -0700 | [diff] [blame] | 20 | #include <utils/Unicode.h> |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 21 | |
Mark Salyzyn | 30f991f | 2017-01-10 13:19:54 -0800 | [diff] [blame] | 22 | #include <log/log.h> |
Mark Salyzyn | ff2dcd9 | 2016-09-28 15:54:45 -0700 | [diff] [blame] | 23 | |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 24 | extern "C" { |
| 25 | |
// Continuation bytes are produced as (value | kByteMark) & kByteMask: the
// mark sets bit 7 and the mask clears bit 6, leaving "10xxxxxx".
static const char32_t kByteMask = 0xBF;
static const char32_t kByteMark = 0x80;

// UTF-32 text must not contain surrogate code points; these bounds are used
// to reject them.
static const char32_t kUnicodeSurrogateHighStart = 0xD800;
// Not referenced anywhere, listed for completeness only:
// static const char32_t kUnicodeSurrogateHighEnd = 0xDBFF;
// static const char32_t kUnicodeSurrogateLowStart = 0xDC00;
static const char32_t kUnicodeSurrogateLowEnd = 0xDFFF;
static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
static const char32_t kUnicodeMaxCodepoint = 0x10FFFF;

// Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the
// total number of bytes in that sequence (index 0 is unused padding).
//
//   1 byte : 0xxxxxxx                             (00-7f)                    7 bits, mark 0x00
//   2 bytes: 110yyyyx 10xxxxxx                    (c0-df)(80-bf)            11 bits, mark 0xC0
//   3 bytes: 1110yyyy 10yxxxxx 10xxxxxx           (e0-ef)(80-bf)(80-bf)     16 bits, mark 0xE0
//   4 bytes: 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx  (f0-f7)(80-bf)x3          21 bits, mark 0xF0
static const char32_t kFirstByteMark[] = {
    0x00,  // unused
    0x00,  // 1-byte sequence
    0xC0,  // 2-byte sequence
    0xE0,  // 3-byte sequence
    0xF0,  // 4-byte sequence
};
| 53 | |
| 54 | // -------------------------------------------------------------------------- |
| 55 | // UTF-32 |
| 56 | // -------------------------------------------------------------------------- |
| 57 | |
| 58 | /** |
| 59 | * Return number of UTF-8 bytes required for the character. If the character |
| 60 | * is invalid, return size of 0. |
| 61 | */ |
| 62 | static inline size_t utf32_codepoint_utf8_length(char32_t srcChar) |
| 63 | { |
| 64 | // Figure out how many bytes the result will require. |
| 65 | if (srcChar < 0x00000080) { |
| 66 | return 1; |
| 67 | } else if (srcChar < 0x00000800) { |
| 68 | return 2; |
| 69 | } else if (srcChar < 0x00010000) { |
| 70 | if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) { |
| 71 | return 3; |
| 72 | } else { |
| 73 | // Surrogates are invalid UTF-32 characters. |
| 74 | return 0; |
| 75 | } |
| 76 | } |
| 77 | // Max code point for Unicode is 0x0010FFFF. |
| 78 | else if (srcChar <= kUnicodeMaxCodepoint) { |
| 79 | return 4; |
| 80 | } else { |
| 81 | // Invalid UTF-32 character. |
| 82 | return 0; |
| 83 | } |
| 84 | } |
| 85 | |
| 86 | // Write out the source character to <dstP>. |
| 87 | |
| 88 | static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) |
| 89 | { |
| 90 | dstP += bytes; |
| 91 | switch (bytes) |
| 92 | { /* note: everything falls through. */ |
| 93 | case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 94 | [[fallthrough]]; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 95 | case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 96 | [[fallthrough]]; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 97 | case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 98 | [[fallthrough]]; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 99 | case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); |
| 100 | } |
| 101 | } |
| 102 | |
// Number of bytes a sequence starting with <lead> claims to occupy: 1 for
// ASCII and for a stray continuation byte (10xxxxxx), otherwise one plus the
// count of consecutive 1 bits that follow bit 7 (so 2..8).
static inline size_t utf8_lead_byte_span(uint8_t lead)
{
    size_t span = 1;
    if ((lead & 0x80) != 0) {
        for (unsigned mask = 0x40; (lead & mask) != 0; mask >>= 1) {
            span++;
        }
    }
    return span;
}

// Decode the sequence starting at <cur> and store the number of bytes consumed
// in <num_read>. Performs no bounds checking: the caller must guarantee that
// utf8_lead_byte_span(*cur) bytes are readable and that the span is <= 6 so
// that the result fits in a non-negative int32_t.
static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
    const uint8_t* bytes = reinterpret_cast<const uint8_t*>(cur);
    const uint8_t lead = bytes[0];
    const size_t span = utf8_lead_byte_span(lead);
    *num_read = span;

    if ((lead & 0x80) == 0) {  // ASCII
        return lead;
    }

    // The payload of the lead byte is everything below its length marker and
    // the 0 bit that terminates it, i.e. the low (7 - span) bits.
    char32_t utf32 = lead & (0x7F >> span);
    for (size_t i = 1; i < span; i++) {
        // 0x3F == 00111111: each continuation byte contributes six bits.
        utf32 = (utf32 << 6) | (bytes[i] & 0x3F);
    }
    return static_cast<int32_t>(utf32);
}

/**
 * Decode the UTF-8 sequence that starts at src[index].
 *
 * @param src        UTF-8 buffer.
 * @param src_len    Number of readable bytes in src.
 * @param index      Offset of the first byte of the sequence.
 * @param next_index If non-null, receives the offset just past the decoded
 *                   sequence on success; left untouched on failure.
 * @return The decoded value, or -1 when src is null, index is out of range,
 *         the lead byte is 0xFE/0xFF, or the sequence announced by the lead
 *         byte would run past src_len.
 */
int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index)
{
    if (src == nullptr || index >= src_len) {
        return -1;
    }

    // Validate the announced length before touching any continuation byte:
    // a truncated sequence must not read beyond the buffer, and spans of 7 or
    // 8 bytes (lead 0xFE/0xFF) cannot be represented in the result.
    const size_t span = utf8_lead_byte_span(static_cast<uint8_t>(src[index]));
    if (span > 6 || span > src_len - index) {
        return -1;
    }

    size_t unused_index;
    if (next_index == nullptr) {
        next_index = &unused_index;
    }

    size_t num_read;
    const int32_t ret = utf32_at_internal(src + index, &num_read);
    if (ret >= 0) {
        *next_index = index + num_read;
    }
    return ret;
}
| 144 | |
| 145 | ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len) |
| 146 | { |
Yi Kong | e1731a4 | 2018-07-16 18:11:34 -0700 | [diff] [blame] | 147 | if (src == nullptr || src_len == 0) { |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 148 | return -1; |
| 149 | } |
| 150 | |
| 151 | size_t ret = 0; |
| 152 | const char32_t *end = src + src_len; |
| 153 | while (src < end) { |
Adam Vartanian | 47efc67 | 2017-08-14 15:51:29 +0100 | [diff] [blame] | 154 | size_t char_len = utf32_codepoint_utf8_length(*src++); |
| 155 | if (SSIZE_MAX - char_len < ret) { |
| 156 | // If this happens, we would overflow the ssize_t type when |
| 157 | // returning from this function, so we cannot express how |
| 158 | // long this string is in an ssize_t. |
| 159 | android_errorWriteLog(0x534e4554, "37723026"); |
| 160 | return -1; |
| 161 | } |
| 162 | ret += char_len; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 163 | } |
| 164 | return ret; |
| 165 | } |
| 166 | |
Sergio Giro | 1cfa56d | 2016-06-28 18:02:29 +0100 | [diff] [blame] | 167 | void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst, size_t dst_len) |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 168 | { |
Yi Kong | e1731a4 | 2018-07-16 18:11:34 -0700 | [diff] [blame] | 169 | if (src == nullptr || src_len == 0 || dst == nullptr) { |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 170 | return; |
| 171 | } |
| 172 | |
| 173 | const char32_t *cur_utf32 = src; |
| 174 | const char32_t *end_utf32 = src + src_len; |
| 175 | char *cur = dst; |
| 176 | while (cur_utf32 < end_utf32) { |
| 177 | size_t len = utf32_codepoint_utf8_length(*cur_utf32); |
Sergio Giro | 1cfa56d | 2016-06-28 18:02:29 +0100 | [diff] [blame] | 178 | LOG_ALWAYS_FATAL_IF(dst_len < len, "%zu < %zu", dst_len, len); |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 179 | utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len); |
| 180 | cur += len; |
Sergio Giro | 1cfa56d | 2016-06-28 18:02:29 +0100 | [diff] [blame] | 181 | dst_len -= len; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 182 | } |
Sergio Giro | 1cfa56d | 2016-06-28 18:02:29 +0100 | [diff] [blame] | 183 | LOG_ALWAYS_FATAL_IF(dst_len < 1, "dst_len < 1: %zu < 1", dst_len); |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 184 | *cur = '\0'; |
| 185 | } |
| 186 | |
| 187 | // -------------------------------------------------------------------------- |
| 188 | // UTF-16 |
| 189 | // -------------------------------------------------------------------------- |
| 190 | |
// Compare two NUL-terminated UTF-16 strings unit by unit. Returns the
// difference between the first pair of units that differ, or 0 when both
// strings end together.
int strcmp16(const char16_t *s1, const char16_t *s2)
{
    for (;; s1++, s2++) {
        const int diff = (int)*s1 - (int)*s2;
        if (diff != 0 || *s1 == 0) {
            return diff;
        }
    }
}
| 204 | |
// Like strcmp16(), but examines at most n units of each string. Returns 0
// when n is 0 or when the first n units match.
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        // Stop at the first mismatch or once both strings have ended.
        if (diff != 0 || s1[i] == 0) {
            return diff;
        }
    }
    return 0;
}
| 223 | |
// Number of UTF-16 units in s before its terminating NUL unit.
size_t strlen16(const char16_t *s)
{
    size_t len = 0;
    while (s[len] != 0) {
        len++;
    }
    return len;
}
| 231 | |
// Length of s in UTF-16 units, capped at maxlen.
size_t strnlen16(const char16_t *s, size_t maxlen)
{
    size_t len = 0;
    // The bound is tested before s[len] is read so that the unit just past
    // the limit is never dereferenced.
    while (len < maxlen && s[len] != 0) {
        len++;
    }
    return len;
}
| 244 | |
Michael Wright | 5bacef3 | 2016-05-09 14:43:31 +0100 | [diff] [blame] | 245 | char16_t* strstr16(const char16_t* src, const char16_t* target) |
| 246 | { |
Branislav Rankov | bf3fff1 | 2017-10-12 15:08:42 +0200 | [diff] [blame] | 247 | const char16_t needle = *target; |
| 248 | if (needle == '\0') return (char16_t*)src; |
| 249 | |
| 250 | const size_t target_len = strlen16(++target); |
| 251 | do { |
Michael Wright | 5bacef3 | 2016-05-09 14:43:31 +0100 | [diff] [blame] | 252 | do { |
Branislav Rankov | bf3fff1 | 2017-10-12 15:08:42 +0200 | [diff] [blame] | 253 | if (*src == '\0') { |
| 254 | return nullptr; |
| 255 | } |
Michael Wright | 5bacef3 | 2016-05-09 14:43:31 +0100 | [diff] [blame] | 256 | } while (*src++ != needle); |
Branislav Rankov | bf3fff1 | 2017-10-12 15:08:42 +0200 | [diff] [blame] | 257 | } while (strncmp16(src, target, target_len) != 0); |
| 258 | src--; |
Michael Wright | 5bacef3 | 2016-05-09 14:43:31 +0100 | [diff] [blame] | 259 | |
| 260 | return (char16_t*)src; |
| 261 | } |
| 262 | |
// Compare two length-delimited UTF-16 strings. The shared prefix is compared
// unit by unit; if it matches and the lengths differ, the result is derived
// from the first extra unit of the longer string (negated when s2 is the
// longer one). Note that this yields 0 when that extra unit is itself 0.
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;

    for (size_t i = 0; i < common; i++) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)s2[common];
    }
    if (n1 > n2) {
        return (int)s1[common];
    }
    return 0;
}
| 281 | |
// is_any_surrogate() reports whether w lies in the surrogate block
// U+D800..U+DFFF, i.e. is either a high or a low surrogate.
static constexpr bool is_any_surrogate(char16_t w) {
    return w >= 0xd800 && w <= 0xdfff;
}
| 286 | |
// is_surrogate_pair() reports whether w1 is a high surrogate (U+D800..U+DBFF)
// and w2 is a low surrogate (U+DC00..U+DFFF), i.e. they form a valid pair.
static constexpr bool is_surrogate_pair(char16_t w1, char16_t w2) {
    return (w1 >= 0xd800 && w1 <= 0xdbff) && (w2 >= 0xdc00 && w2 <= 0xdfff);
}
| 291 | |
| 292 | // TODO: currently utf16_to_utf8_length() returns -1 if src_len == 0, |
| 293 | // which is inconsistent with utf8_to_utf16_length(), here we keep the |
| 294 | // current behavior as intended not to break compatibility |
| 295 | ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len) |
| 296 | { |
| 297 | if (src == nullptr || src_len == 0) |
| 298 | return -1; |
| 299 | |
| 300 | const char16_t* const end = src + src_len; |
| 301 | const char16_t* in = src; |
| 302 | size_t utf8_len = 0; |
| 303 | |
| 304 | while (in < end) { |
| 305 | char16_t w = *in++; |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 306 | if (w < 0x0080) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 307 | utf8_len += 1; |
| 308 | continue; |
| 309 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 310 | if (w < 0x0800) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 311 | utf8_len += 2; |
| 312 | continue; |
| 313 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 314 | if (!is_any_surrogate(w)) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 315 | utf8_len += 3; |
| 316 | continue; |
| 317 | } |
| 318 | if (in < end && is_surrogate_pair(w, *in)) { |
| 319 | utf8_len += 4; |
| 320 | in++; |
| 321 | continue; |
| 322 | } |
| 323 | /* skip if at the end of the string or invalid surrogate pair */ |
| 324 | } |
| 325 | return (in == end && utf8_len < SSIZE_MAX) ? utf8_len : -1; |
| 326 | } |
| 327 | |
Sergio Giro | 1cfa56d | 2016-06-28 18:02:29 +0100 | [diff] [blame] | 328 | void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len) |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 329 | { |
Yi Kong | e1731a4 | 2018-07-16 18:11:34 -0700 | [diff] [blame] | 330 | if (src == nullptr || src_len == 0 || dst == nullptr) { |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 331 | return; |
| 332 | } |
| 333 | |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 334 | const char16_t* in = src; |
| 335 | const char16_t* const in_end = src + src_len; |
| 336 | char* out = dst; |
| 337 | const char* const out_end = dst + dst_len; |
| 338 | char16_t w2; |
| 339 | |
| 340 | auto err_out = [&out, &out_end, &dst_len]() { |
| 341 | LOG_ALWAYS_FATAL_IF(out >= out_end, |
| 342 | "target utf8 string size %zu too short", dst_len); |
| 343 | }; |
| 344 | |
| 345 | while (in < in_end) { |
| 346 | char16_t w = *in++; |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 347 | if (w < 0x0080) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 348 | if (out + 1 > out_end) |
| 349 | return err_out(); |
| 350 | *out++ = (char)(w & 0xff); |
| 351 | continue; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 352 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 353 | if (w < 0x0800) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 354 | if (out + 2 > out_end) |
| 355 | return err_out(); |
| 356 | *out++ = (char)(0xc0 | ((w >> 6) & 0x1f)); |
| 357 | *out++ = (char)(0x80 | ((w >> 0) & 0x3f)); |
| 358 | continue; |
| 359 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 360 | if (!is_any_surrogate(w)) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 361 | if (out + 3 > out_end) |
| 362 | return err_out(); |
| 363 | *out++ = (char)(0xe0 | ((w >> 12) & 0xf)); |
| 364 | *out++ = (char)(0x80 | ((w >> 6) & 0x3f)); |
| 365 | *out++ = (char)(0x80 | ((w >> 0) & 0x3f)); |
| 366 | continue; |
| 367 | } |
| 368 | /* surrogate pair */ |
| 369 | if (in < in_end && (w2 = *in, is_surrogate_pair(w, w2))) { |
| 370 | if (out + 4 > out_end) |
| 371 | return err_out(); |
| 372 | char32_t dw = (char32_t)(0x10000 + ((w - 0xd800) << 10) + (w2 - 0xdc00)); |
| 373 | *out++ = (char)(0xf0 | ((dw >> 18) & 0x07)); |
| 374 | *out++ = (char)(0x80 | ((dw >> 12) & 0x3f)); |
| 375 | *out++ = (char)(0x80 | ((dw >> 6) & 0x3f)); |
| 376 | *out++ = (char)(0x80 | ((dw >> 0) & 0x3f)); |
| 377 | in++; |
| 378 | } |
| 379 | /* We reach here in two cases: |
| 380 | * 1) (in == in_end), which means end of the input string |
| 381 | * 2) (w2 & 0xfc00) != 0xdc00, which means invalid surrogate pair |
| 382 | * In either case, we intentionally do nothing and skip |
| 383 | */ |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 384 | } |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 385 | *out = '\0'; |
| 386 | return; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 387 | } |
| 388 | |
| 389 | // -------------------------------------------------------------------------- |
| 390 | // UTF-8 |
| 391 | // -------------------------------------------------------------------------- |
| 392 | |
// Assemble a code point from the four bytes of a 4-byte UTF-8 sequence: the
// low 3 bits of c1 followed by the low 6 bits of each of c2, c3 and c4. The
// marker bits of the bytes are ignored, not validated.
static char32_t utf8_4b_to_utf32(uint8_t c1, uint8_t c2, uint8_t c3, uint8_t c4) {
    char32_t code = c1 & 0x07;
    code = (code << 6) | (c2 & 0x3f);
    code = (code << 6) | (c3 & 0x3f);
    code = (code << 6) | (c4 & 0x3f);
    return code;
}
| 396 | |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 397 | // TODO: current behavior of converting UTF8 to UTF-16 has a few issues below |
| 398 | // |
| 399 | // 1. invalid trailing bytes (i.e. not b'10xxxxxx) are treated as valid trailing |
| 400 | // bytes and follows normal conversion rules |
| 401 | // 2. invalid leading byte (b'10xxxxxx) is treated as a valid single UTF-8 byte |
| 402 | // 3. invalid leading byte (b'11111xxx) is treated as a valid leading byte |
| 403 | // (same as b'11110xxx) for a 4-byte UTF-8 sequence |
| 404 | // 4. an invalid 4-byte UTF-8 sequence that translates to a codepoint < U+10000 |
| 405 | // will be converted as a valid UTF-16 character |
| 406 | // |
| 407 | // We keep the current behavior as is but with warnings logged, so as not to |
| 408 | // break compatibility. However, this needs to be addressed later. |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 409 | |
Sergio Giro | 9de6776 | 2016-07-20 20:01:33 +0100 | [diff] [blame] | 410 | ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len, bool overreadIsFatal) |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 411 | { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 412 | if (u8str == nullptr) |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 413 | return -1; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 414 | |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 415 | const uint8_t* const in_end = u8str + u8len; |
| 416 | const uint8_t* in = u8str; |
| 417 | size_t utf16_len = 0; |
| 418 | |
| 419 | while (in < in_end) { |
| 420 | uint8_t c = *in; |
| 421 | utf16_len++; |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 422 | if ((c & 0x80) == 0) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 423 | in++; |
| 424 | continue; |
| 425 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 426 | if (c < 0xc0) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 427 | ALOGW("Invalid UTF-8 leading byte: 0x%02x", c); |
| 428 | in++; |
| 429 | continue; |
| 430 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 431 | if (c < 0xe0) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 432 | in += 2; |
| 433 | continue; |
| 434 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 435 | if (c < 0xf0) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 436 | in += 3; |
| 437 | continue; |
| 438 | } else { |
| 439 | uint8_t c2, c3, c4; |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 440 | if (c >= 0xf8) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 441 | ALOGW("Invalid UTF-8 leading byte: 0x%02x", c); |
| 442 | } |
| 443 | c2 = in[1]; c3 = in[2]; c4 = in[3]; |
| 444 | if (utf8_4b_to_utf32(c, c2, c3, c4) >= 0x10000) { |
| 445 | utf16_len++; |
| 446 | } |
| 447 | in += 4; |
| 448 | continue; |
| 449 | } |
| 450 | } |
| 451 | if (in == in_end) { |
| 452 | return utf16_len < SSIZE_MAX ? utf16_len : -1; |
| 453 | } |
| 454 | if (overreadIsFatal) |
| 455 | LOG_ALWAYS_FATAL("Attempt to overread computing length of utf8 string"); |
| 456 | return -1; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 457 | } |
| 458 | |
Sergio Giro | 9de6776 | 2016-07-20 20:01:33 +0100 | [diff] [blame] | 459 | char16_t* utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str, size_t u16len) { |
| 460 | // A value > SSIZE_MAX is probably a negative value returned as an error and casted. |
| 461 | LOG_ALWAYS_FATAL_IF(u16len == 0 || u16len > SSIZE_MAX, "u16len is %zu", u16len); |
| 462 | char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str, u16len - 1); |
Jeff Brown | aa983c9 | 2011-10-07 13:28:18 -0700 | [diff] [blame] | 463 | *end = 0; |
Sergio Giro | 9de6776 | 2016-07-20 20:01:33 +0100 | [diff] [blame] | 464 | return end; |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 465 | } |
| 466 | |
Sergio Giro | 9de6776 | 2016-07-20 20:01:33 +0100 | [diff] [blame] | 467 | char16_t* utf8_to_utf16_no_null_terminator( |
| 468 | const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 469 | if (src == nullptr || srcLen == 0 || dstLen == 0) { |
Sergio Giro | 9de6776 | 2016-07-20 20:01:33 +0100 | [diff] [blame] | 470 | return dst; |
| 471 | } |
| 472 | // A value > SSIZE_MAX is probably a negative value returned as an error and casted. |
| 473 | LOG_ALWAYS_FATAL_IF(dstLen > SSIZE_MAX, "dstLen is %zu", dstLen); |
Dianne Hackborn | 0f10d0a | 2013-07-31 16:04:39 -0700 | [diff] [blame] | 474 | |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 475 | const uint8_t* const in_end = src + srcLen; |
| 476 | const uint8_t* in = src; |
| 477 | const char16_t* const out_end = dst + dstLen; |
| 478 | char16_t* out = dst; |
| 479 | uint8_t c, c2, c3, c4; |
| 480 | char32_t w; |
Dianne Hackborn | 0f10d0a | 2013-07-31 16:04:39 -0700 | [diff] [blame] | 481 | |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 482 | auto err_in = [&c, &out]() { |
| 483 | ALOGW("Unended UTF-8 byte: 0x%02x", c); |
| 484 | return out; |
| 485 | }; |
| 486 | |
| 487 | while (in < in_end && out < out_end) { |
| 488 | c = *in++; |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 489 | if ((c & 0x80) == 0) [[likely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 490 | *out++ = (char16_t)(c); |
| 491 | continue; |
Dianne Hackborn | 0f10d0a | 2013-07-31 16:04:39 -0700 | [diff] [blame] | 492 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 493 | if (c < 0xc0) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 494 | ALOGW("Invalid UTF-8 leading byte: 0x%02x", c); |
| 495 | *out++ = (char16_t)(c); |
| 496 | continue; |
| 497 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 498 | if (c < 0xe0) [[likely]] { |
| 499 | if (in + 1 > in_end) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 500 | return err_in(); |
| 501 | } |
| 502 | c2 = *in++; |
| 503 | *out++ = (char16_t)(((c & 0x1f) << 6) | (c2 & 0x3f)); |
| 504 | continue; |
| 505 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 506 | if (c < 0xf0) [[likely]] { |
| 507 | if (in + 2 > in_end) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 508 | return err_in(); |
| 509 | } |
| 510 | c2 = *in++; c3 = *in++; |
| 511 | *out++ = (char16_t)(((c & 0x0f) << 12) | |
| 512 | ((c2 & 0x3f) << 6) | (c3 & 0x3f)); |
| 513 | continue; |
| 514 | } else { |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 515 | if (in + 3 > in_end) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 516 | return err_in(); |
| 517 | } |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 518 | if (c >= 0xf8) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 519 | ALOGW("Invalid UTF-8 leading byte: 0x%02x", c); |
| 520 | } |
| 521 | // Multiple UTF16 characters with surrogates |
| 522 | c2 = *in++; c3 = *in++; c4 = *in++; |
| 523 | w = utf8_4b_to_utf32(c, c2, c3, c4); |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 524 | if (w < 0x10000) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 525 | *out++ = (char16_t)(w); |
| 526 | } else { |
Steven Moreland | c738370 | 2023-10-21 00:43:52 +0000 | [diff] [blame] | 527 | if (out + 2 > out_end) [[unlikely]] { |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 528 | // Ooops.... not enough room for this surrogate pair. |
| 529 | return out; |
| 530 | } |
| 531 | *out++ = (char16_t)(((w - 0x10000) >> 10) + 0xd800); |
| 532 | *out++ = (char16_t)(((w - 0x10000) & 0x3ff) + 0xdc00); |
| 533 | } |
| 534 | continue; |
| 535 | } |
Dianne Hackborn | 0f10d0a | 2013-07-31 16:04:39 -0700 | [diff] [blame] | 536 | } |
Eric Miao | cb199b4 | 2022-11-30 16:05:49 -0800 | [diff] [blame] | 537 | return out; |
Dianne Hackborn | 0f10d0a | 2013-07-31 16:04:39 -0700 | [diff] [blame] | 538 | } |
| 539 | |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 540 | } |