Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2010 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #define LOG_TAG "Unicode_test" |
Branislav Rankov | bf3fff1 | 2017-10-12 15:08:42 +0200 | [diff] [blame] | 18 | |
| 19 | #include <sys/mman.h> |
| 20 | #include <unistd.h> |
| 21 | |
| 22 | #include <log/log.h> |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 23 | #include <utils/Unicode.h> |
| 24 | |
| 25 | #include <gtest/gtest.h> |
| 26 | |
| 27 | namespace android { |
| 28 | |
| 29 | class UnicodeTest : public testing::Test { |
| 30 | protected: |
| 31 | virtual void SetUp() { |
| 32 | } |
| 33 | |
| 34 | virtual void TearDown() { |
| 35 | } |
Michael Wright | 5bacef3 | 2016-05-09 14:43:31 +0100 | [diff] [blame] | 36 | |
| 37 | char16_t const * const kSearchString = u"I am a leaf on the wind."; |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 38 | |
| 39 | constexpr static size_t BUFSIZE = 64; // large enough for all tests |
| 40 | |
| 41 | void TestUTF8toUTF16(std::initializer_list<uint8_t> input, |
| 42 | std::initializer_list<char16_t> expect, |
| 43 | const char* err_msg_length = "", |
| 44 | ssize_t expected_length = 0) { |
| 45 | uint8_t empty_str[] = {}; |
| 46 | char16_t output[BUFSIZE]; |
| 47 | |
| 48 | const size_t inlen = input.size(), outlen = expect.size(); |
| 49 | ASSERT_LT(outlen, BUFSIZE); |
| 50 | |
| 51 | const uint8_t *input_data = inlen ? std::data(input) : empty_str; |
| 52 | ssize_t measured = utf8_to_utf16_length(input_data, inlen); |
| 53 | EXPECT_EQ(expected_length ? : (ssize_t)outlen, measured) << err_msg_length; |
| 54 | |
| 55 | utf8_to_utf16(input_data, inlen, output, outlen + 1); |
| 56 | for (size_t i = 0; i < outlen; i++) { |
| 57 | EXPECT_EQ(std::data(expect)[i], output[i]); |
| 58 | } |
| 59 | EXPECT_EQ(0, output[outlen]) << "should be null terminated"; |
| 60 | } |
| 61 | |
| 62 | void TestUTF16toUTF8(std::initializer_list<char16_t> input, |
| 63 | std::initializer_list<char> expect, |
| 64 | const char* err_msg_length = "", |
| 65 | ssize_t expected_length = 0) { |
| 66 | char16_t empty_str[] = {}; |
| 67 | char output[BUFSIZE]; |
| 68 | |
| 69 | const size_t inlen = input.size(), outlen = expect.size(); |
| 70 | ASSERT_LT(outlen, BUFSIZE); |
| 71 | |
| 72 | const char16_t *input_data = inlen ? std::data(input) : empty_str; |
| 73 | ssize_t measured = utf16_to_utf8_length(input_data, inlen); |
| 74 | EXPECT_EQ(expected_length ? : (ssize_t)outlen, measured) << err_msg_length; |
| 75 | |
| 76 | utf16_to_utf8(input_data, inlen, output, outlen + 1); |
| 77 | for (size_t i = 0; i < outlen; i++) { |
| 78 | EXPECT_EQ(std::data(expect)[i], output[i]); |
| 79 | } |
| 80 | EXPECT_EQ(0, output[outlen]) << "should be null terminated"; |
| 81 | } |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 82 | }; |
| 83 | |
| 84 | TEST_F(UnicodeTest, UTF8toUTF16ZeroLength) { |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 85 | TestUTF8toUTF16({}, {}, |
| 86 | "Zero length input should return zero length output."); |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 87 | } |
| 88 | |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 89 | TEST_F(UnicodeTest, UTF8toUTF16ASCII) { |
| 90 | TestUTF8toUTF16( |
| 91 | { 0x30 }, // U+0030 or ASCII '0' |
| 92 | { 0x0030 }, |
| 93 | "ASCII codepoints should have a length of 1 char16_t"); |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 94 | } |
| 95 | |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 96 | TEST_F(UnicodeTest, UTF8toUTF16Plane1) { |
| 97 | TestUTF8toUTF16( |
| 98 | { 0xE2, 0x8C, 0xA3 }, // U+2323 SMILE |
| 99 | { 0x2323 }, |
| 100 | "Plane 1 codepoints should have a length of 1 char16_t"); |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 101 | } |
| 102 | |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 103 | TEST_F(UnicodeTest, UTF8toUTF16Surrogate) { |
| 104 | TestUTF8toUTF16( |
| 105 | { 0xF0, 0x90, 0x80, 0x80 }, // U+10000 |
| 106 | { 0xD800, 0xDC00 }, |
| 107 | "Surrogate pairs should have a length of 2 char16_t"); |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 108 | } |
| 109 | |
| 110 | TEST_F(UnicodeTest, UTF8toUTF16TruncatedUTF8) { |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 111 | TestUTF8toUTF16( |
| 112 | { 0xE2, 0x8C }, // Truncated U+2323 SMILE |
| 113 | { }, // Conversion should still work but produce nothing |
| 114 | "Truncated UTF-8 should return -1 to indicate invalid", |
| 115 | -1); |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 116 | } |
| 117 | |
| 118 | TEST_F(UnicodeTest, UTF8toUTF16Normal) { |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 119 | TestUTF8toUTF16({ |
| 120 | 0x30, // U+0030, 1 UTF-16 character |
| 121 | 0xC4, 0x80, // U+0100, 1 UTF-16 character |
| 122 | 0xE2, 0x8C, 0xA3, // U+2323, 1 UTF-16 character |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 123 | 0xF0, 0x90, 0x80, 0x80, // U+10000, 2 UTF-16 character |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 124 | }, { |
| 125 | 0x0030, |
| 126 | 0x0100, |
| 127 | 0x2323, |
| 128 | 0xD800, 0xDC00 |
| 129 | }); |
| 130 | } |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 131 | |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 132 | TEST_F(UnicodeTest, UTF8toUTF16Invalid) { |
| 133 | // TODO: The current behavior of utf8_to_utf16 is to treat invalid |
| 134 | // leading byte (>= 0xf8) as a 4-byte UTF8 sequence, and to treat |
| 135 | // invalid trailing byte(s) (i.e. bytes not having MSB set) as if |
| 136 | // they are valid and do the normal conversion. However, a better |
| 137 | // handling would be to treat invalid sequences as errors, such |
| 138 | // cases need to be reported and invalid characters (e.g. U+FFFD) |
| 139 | // could be produced at the place of error. Until a fix is ready |
| 140 | // and compatibility is not an issue, we will keep testing the |
| 141 | // current behavior |
| 142 | TestUTF8toUTF16({ |
| 143 | 0xf8, // invalid leading byte |
| 144 | 0xc4, 0x00, // U+0100 with invalid trailing byte |
| 145 | 0xe2, 0x0c, 0xa3, // U+2323 with invalid trailing bytes |
| 146 | 0xf0, 0x10, 0x00, 0x00, // U+10000 with invalid trailing bytes |
| 147 | }, { |
| 148 | 0x4022, // invalid leading byte (>=0xfc) is treated |
| 149 | // as valid for 4-byte UTF8 sequence |
| 150 | 0x000C, |
| 151 | 0x00A3, // invalid leadnig byte (b'10xxxxxx) is |
| 152 | // treated as valid single UTF-8 byte |
| 153 | 0xD800, // invalid trailing bytes are treated |
| 154 | 0xDC00, // as valid bytes and follow normal |
| 155 | }); |
| 156 | } |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 157 | |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 158 | TEST_F(UnicodeTest, UTF16toUTF8ZeroLength) { |
| 159 | // TODO: The current behavior of utf16_to_utf8_length() is that |
| 160 | // it returns -1 if the input is a zero length UTF16 string. |
| 161 | // This is inconsistent with utf8_to_utf16_length() where a zero |
| 162 | // length string returns 0. However, to fix the current behavior, |
| 163 | // we could have compatibility issue. Until then, we will keep |
| 164 | // testing the current behavior |
| 165 | TestUTF16toUTF8({}, {}, |
| 166 | "Zero length UTF16 input should return length of -1.", -1); |
| 167 | } |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 168 | |
Eric Miao | c252707 | 2022-11-30 16:04:55 -0800 | [diff] [blame^] | 169 | TEST_F(UnicodeTest, UTF16toUTF8ASCII) { |
| 170 | TestUTF16toUTF8( |
| 171 | { 0x0030 }, // U+0030 or ASCII '0' |
| 172 | { '\x30' }, |
| 173 | "ASCII codepoints in UTF16 should give a length of 1 in UTF8"); |
| 174 | } |
| 175 | |
| 176 | TEST_F(UnicodeTest, UTF16toUTF8Plane1) { |
| 177 | TestUTF16toUTF8( |
| 178 | { 0x2323 }, // U+2323 SMILE |
| 179 | { '\xE2', '\x8C', '\xA3' }, |
| 180 | "Plane 1 codepoints should have a length of 3 char in UTF-8"); |
| 181 | } |
| 182 | |
| 183 | TEST_F(UnicodeTest, UTF16toUTF8Surrogate) { |
| 184 | TestUTF16toUTF8( |
| 185 | { 0xD800, 0xDC00 }, // U+10000 |
| 186 | { '\xF0', '\x90', '\x80', '\x80' }, |
| 187 | "Surrogate pairs should have a length of 4 chars"); |
| 188 | } |
| 189 | |
| 190 | TEST_F(UnicodeTest, UTF16toUTF8UnpairedSurrogate) { |
| 191 | TestUTF16toUTF8( |
| 192 | { 0xD800 }, // U+10000 with high surrogate pair only |
| 193 | { }, // Unpaired surrogate should be ignored |
| 194 | "A single unpaired high surrogate should have a length of 0 chars"); |
| 195 | |
| 196 | TestUTF16toUTF8( |
| 197 | { 0xDC00 }, // U+10000 with low surrogate pair only |
| 198 | { }, // Unpaired surrogate should be ignored |
| 199 | "A single unpaired low surrogate should have a length of 0 chars"); |
| 200 | |
| 201 | TestUTF16toUTF8( |
| 202 | // U+0030, U+0100, U+10000 with high surrogate pair only, U+2323 |
| 203 | { 0x0030, 0x0100, 0xDC00, 0x2323 }, |
| 204 | { '\x30', '\xC4', '\x80', '\xE2', '\x8C', '\xA3' }, |
| 205 | "Unpaired high surrogate should be skipped in the middle"); |
| 206 | |
| 207 | TestUTF16toUTF8( |
| 208 | // U+0030, U+0100, U+10000 with high surrogate pair only, U+2323 |
| 209 | { 0x0030, 0x0100, 0xDC00, 0x2323 }, |
| 210 | { '\x30', '\xC4', '\x80', '\xE2', '\x8C', '\xA3' }, |
| 211 | "Unpaired low surrogate should be skipped in the middle"); |
| 212 | } |
| 213 | |
| 214 | TEST_F(UnicodeTest, UTF16toUTF8CorrectInvalidSurrogate) { |
| 215 | // http://b/29250543 |
| 216 | // d841d8 is an invalid start for a surrogate pair. Make sure this is handled by ignoring the |
| 217 | // first character in the pair and handling the rest correctly. |
| 218 | TestUTF16toUTF8( |
| 219 | { 0xD841, 0xD841, 0xDC41 }, // U+20441 |
| 220 | { '\xF0', '\xA0', '\x91', '\x81' }, |
| 221 | "Invalid start for a surrogate pair should be ignored"); |
| 222 | } |
| 223 | |
| 224 | TEST_F(UnicodeTest, UTF16toUTF8Normal) { |
| 225 | TestUTF16toUTF8({ |
| 226 | 0x0024, // U+0024 ($) --> 0x24, 1 UTF-8 byte |
| 227 | 0x00A3, // U+00A3 (£) --> 0xC2 0xA3, 2 UTF-8 bytes |
| 228 | 0x0939, // U+0939 (ह) --> 0xE0 0xA4 0xB9, 3 UTF-8 bytes |
| 229 | 0x20AC, // U+20AC (€) --> 0xE2 0x82 0xAC, 3 UTF-8 bytes |
| 230 | 0xD55C, // U+D55C (한)--> 0xED 0x95 0x9C, 3 UTF-8 bytes |
| 231 | 0xD801, 0xDC37, // U+10437 (𐐷) --> 0xF0 0x90 0x90 0xB7, 4 UTF-8 bytes |
| 232 | }, { |
| 233 | '\x24', |
| 234 | '\xC2', '\xA3', |
| 235 | '\xE0', '\xA4', '\xB9', |
| 236 | '\xE2', '\x82', '\xAC', |
| 237 | '\xED', '\x95', '\x9C', |
| 238 | '\xF0', '\x90', '\x90', '\xB7', |
| 239 | }); |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 240 | } |
| 241 | |
Michael Wright | 5bacef3 | 2016-05-09 14:43:31 +0100 | [diff] [blame] | 242 | TEST_F(UnicodeTest, strstr16EmptyTarget) { |
| 243 | EXPECT_EQ(strstr16(kSearchString, u""), kSearchString) |
| 244 | << "should return the original pointer"; |
| 245 | } |
| 246 | |
Branislav Rankov | bf3fff1 | 2017-10-12 15:08:42 +0200 | [diff] [blame] | 247 | TEST_F(UnicodeTest, strstr16EmptyTarget_bug) { |
| 248 | // In the original code when target is an empty string strlen16() would |
| 249 | // start reading the memory until a "terminating null" (that is, zero) |
| 250 | // character is found. This happens because "*target++" in the original |
| 251 | // code would increment the pointer beyond the actual string. |
| 252 | void* memptr; |
| 253 | const size_t alignment = sysconf(_SC_PAGESIZE); |
| 254 | const size_t size = 2 * alignment; |
| 255 | ASSERT_EQ(posix_memalign(&memptr, alignment, size), 0); |
| 256 | // Fill allocated memory. |
| 257 | memset(memptr, 'A', size); |
| 258 | // Create a pointer to an "empty" string on the first page. |
| 259 | char16_t* const emptyString = (char16_t* const)((char*)memptr + alignment - 4); |
| 260 | *emptyString = (char16_t)0; |
| 261 | // Protect the second page to show that strstr16() violates that. |
| 262 | ASSERT_EQ(mprotect((char*)memptr + alignment, alignment, PROT_NONE), 0); |
| 263 | // Test strstr16(): when bug is present a segmentation fault is raised. |
| 264 | ASSERT_EQ(strstr16((char16_t*)memptr, emptyString), (char16_t*)memptr) |
| 265 | << "should not read beyond the first char16_t."; |
| 266 | // Reset protection of the second page |
| 267 | ASSERT_EQ(mprotect((char*)memptr + alignment, alignment, PROT_READ | PROT_WRITE), 0); |
| 268 | // Free allocated memory. |
| 269 | free(memptr); |
| 270 | } |
| 271 | |
Michael Wright | 5bacef3 | 2016-05-09 14:43:31 +0100 | [diff] [blame] | 272 | TEST_F(UnicodeTest, strstr16SameString) { |
| 273 | const char16_t* result = strstr16(kSearchString, kSearchString); |
| 274 | EXPECT_EQ(kSearchString, result) |
| 275 | << "should return the original pointer"; |
| 276 | } |
| 277 | |
| 278 | TEST_F(UnicodeTest, strstr16TargetStartOfString) { |
| 279 | const char16_t* result = strstr16(kSearchString, u"I am"); |
| 280 | EXPECT_EQ(kSearchString, result) |
| 281 | << "should return the original pointer"; |
| 282 | } |
| 283 | |
| 284 | |
| 285 | TEST_F(UnicodeTest, strstr16TargetEndOfString) { |
| 286 | const char16_t* result = strstr16(kSearchString, u"wind."); |
| 287 | EXPECT_EQ(kSearchString+19, result); |
| 288 | } |
| 289 | |
| 290 | TEST_F(UnicodeTest, strstr16TargetWithinString) { |
| 291 | const char16_t* result = strstr16(kSearchString, u"leaf"); |
| 292 | EXPECT_EQ(kSearchString+7, result); |
| 293 | } |
| 294 | |
| 295 | TEST_F(UnicodeTest, strstr16TargetNotPresent) { |
| 296 | const char16_t* result = strstr16(kSearchString, u"soar"); |
| 297 | EXPECT_EQ(nullptr, result); |
| 298 | } |
| 299 | |
Sergio Giro | 1dcc0c8 | 2016-07-20 20:01:33 +0100 | [diff] [blame] | 300 | // http://b/29267949 |
| 301 | // Test that overreading in utf8_to_utf16_length is detected |
| 302 | TEST_F(UnicodeTest, InvalidUtf8OverreadDetected) { |
| 303 | // An utf8 char starting with \xc4 is two bytes long. |
| 304 | // Add extra zeros so no extra memory is read in case the code doesn't |
| 305 | // work as expected. |
| 306 | static char utf8[] = "\xc4\x00\x00\x00"; |
| 307 | ASSERT_DEATH(utf8_to_utf16_length((uint8_t *) utf8, strlen(utf8), |
| 308 | true /* overreadIsFatal */), "" /* regex for ASSERT_DEATH */); |
| 309 | } |
| 310 | |
Kenny Root | ba0165b | 2010-11-09 14:37:23 -0800 | [diff] [blame] | 311 | } |