Elliott Hughes | c1fd492 | 2015-11-11 18:02:29 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2015 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
#include "base/utf8.h"

#include <errno.h>

#include <gtest/gtest.h>

#include "base/macros.h"
| 22 | |
| 23 | namespace android { |
| 24 | namespace base { |
| 25 | |
| 26 | TEST(UTFStringConversionsTest, ConvertInvalidUTF8) { |
| 27 | std::wstring wide; |
| 28 | |
Spencer Low | d21dc82 | 2015-11-12 15:20:15 -0800 | [diff] [blame^] | 29 | errno = 0; |
| 30 | |
Elliott Hughes | c1fd492 | 2015-11-11 18:02:29 +0000 | [diff] [blame] | 31 | // Standalone \xa2 is an invalid UTF-8 sequence, so this should return an |
| 32 | // error. Concatenate two C/C++ literal string constants to prevent the |
| 33 | // compiler from giving an error about "\xa2af" containing a "hex escape |
| 34 | // sequence out of range". |
| 35 | EXPECT_FALSE(android::base::UTF8ToWide("before\xa2" "after", &wide)); |
| 36 | |
Spencer Low | d21dc82 | 2015-11-12 15:20:15 -0800 | [diff] [blame^] | 37 | EXPECT_EQ(EILSEQ, errno); |
| 38 | |
Elliott Hughes | c1fd492 | 2015-11-11 18:02:29 +0000 | [diff] [blame] | 39 | // Even if an invalid character is encountered, UTF8ToWide() should still do |
| 40 | // its best to convert the rest of the string. sysdeps_win32.cpp: |
| 41 | // _console_write_utf8() depends on this behavior. |
| 42 | // |
| 43 | // Thus, we verify that the valid characters are converted, but we ignore the |
| 44 | // specific replacement character that UTF8ToWide() may replace the invalid |
| 45 | // UTF-8 characters with because we want to allow that to change if the |
| 46 | // implementation changes. |
| 47 | EXPECT_EQ(0, wide.find(L"before")); |
| 48 | const wchar_t after_wide[] = L"after"; |
| 49 | EXPECT_EQ(wide.length() - (arraysize(after_wide) - 1), wide.find(after_wide)); |
| 50 | } |
| 51 | |
| 52 | // Below is adapted from https://chromium.googlesource.com/chromium/src/+/master/base/strings/utf_string_conversions_unittest.cc |
| 53 | |
| 54 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 55 | // Use of this source code is governed by a BSD-style license that can be |
| 56 | // found in the LICENSE file. |
| 57 | |
| 58 | // The tests below from utf_string_conversions_unittest.cc check for this |
| 59 | // preprocessor symbol, so define it, as it is appropriate for Windows. |
| 60 | #define WCHAR_T_IS_UTF16 |
| 61 | static_assert(sizeof(wchar_t) == 2, "wchar_t is not 2 bytes"); |
| 62 | |
| 63 | // The tests below from utf_string_conversions_unittest.cc call versions of |
| 64 | // UTF8ToWide() and WideToUTF8() that don't return success/failure, so these are |
| 65 | // stub implementations with that signature. These are just for testing and |
| 66 | // should not be moved to base because they assert/expect no errors which is |
| 67 | // probably not a good idea (or at least it is something that should be left |
| 68 | // up to the caller, not a base library). |
| 69 | |
| 70 | static std::wstring UTF8ToWide(const std::string& utf8) { |
| 71 | std::wstring utf16; |
| 72 | EXPECT_TRUE(UTF8ToWide(utf8, &utf16)); |
| 73 | return utf16; |
| 74 | } |
| 75 | |
| 76 | static std::string WideToUTF8(const std::wstring& utf16) { |
| 77 | std::string utf8; |
| 78 | EXPECT_TRUE(WideToUTF8(utf16, &utf8)); |
| 79 | return utf8; |
| 80 | } |
| 81 | |
namespace {

// Wide strings that the round-trip test converts to UTF-8 and back again.
// Adjacent literals are split on word boundaries; the compiler concatenates
// them into a single string per entry.
const wchar_t* const kConvertRoundtripCases[] = {
    L"Google Video",

    // "网页 图片 资讯更多 »"
    L"\x7f51\x9875\x0020\x56fe\x7247\x0020"
    L"\x8d44\x8baf\x66f4\x591a\x0020\x00bb",

    // "Παγκόσμιος Ιστός"
    L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9\x03bf\x03c2"
    L"\x0020\x0399\x03c3\x03c4\x03cc\x03c2",

    // "Поиск страниц на русском"
    L"\x041f\x043e\x0438\x0441\x043a\x0020"
    L"\x0441\x0442\x0440\x0430\x043d\x0438\x0446\x0020"
    L"\x043d\x0430\x0020"
    L"\x0440\x0443\x0441\x0441\x043a\x043e\x043c",

    // "전체서비스"
    L"\xc804\xccb4\xc11c\xbe44\xc2a4",

// Characters that need more than 16 bits. How they are spelled depends on
// whether wchar_t is 16 or 32 bits wide.
#if defined(WCHAR_T_IS_UTF16)
    // U+10300 written as a surrogate pair.
    L"\xd800\xdf00",
    // Five consecutive code points, U+11D40 through U+11D44, each written as
    // a surrogate pair.
    L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44",
#elif defined(WCHAR_T_IS_UTF32)
    // U+10300 as a single 32-bit unit.
    L"\x10300",
    // Five consecutive code points, U+11D40 through U+11D44.
    L"\x11d40\x11d41\x11d42\x11d43\x11d44",
#endif
};

}  // namespace
| 112 | |
| 113 | TEST(UTFStringConversionsTest, ConvertUTF8AndWide) { |
| 114 | // we round-trip all the wide strings through UTF-8 to make sure everything |
| 115 | // agrees on the conversion. This uses the stream operators to test them |
| 116 | // simultaneously. |
| 117 | for (size_t i = 0; i < arraysize(kConvertRoundtripCases); ++i) { |
| 118 | std::ostringstream utf8; |
| 119 | utf8 << WideToUTF8(kConvertRoundtripCases[i]); |
| 120 | std::wostringstream wide; |
| 121 | wide << UTF8ToWide(utf8.str()); |
| 122 | |
| 123 | EXPECT_EQ(kConvertRoundtripCases[i], wide.str()); |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | TEST(UTFStringConversionsTest, ConvertUTF8AndWideEmptyString) { |
| 128 | // An empty std::wstring should be converted to an empty std::string, |
| 129 | // and vice versa. |
| 130 | std::wstring wempty; |
| 131 | std::string empty; |
| 132 | EXPECT_EQ(empty, WideToUTF8(wempty)); |
| 133 | EXPECT_EQ(wempty, UTF8ToWide(empty)); |
| 134 | } |
| 135 | |
| 136 | TEST(UTFStringConversionsTest, ConvertUTF8ToWide) { |
| 137 | struct UTF8ToWideCase { |
| 138 | const char* utf8; |
| 139 | const wchar_t* wide; |
| 140 | bool success; |
| 141 | } convert_cases[] = { |
| 142 | // Regular UTF-8 input. |
| 143 | {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true}, |
| 144 | // Non-character is passed through. |
| 145 | {"\xef\xbf\xbfHello", L"\xffffHello", true}, |
| 146 | // Truncated UTF-8 sequence. |
| 147 | {"\xe4\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false}, |
| 148 | // Truncated off the end. |
| 149 | {"\xe5\xa5\xbd\xe4\xa0", L"\x597d\xfffd", false}, |
| 150 | // Non-shortest-form UTF-8. |
| 151 | {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false}, |
| 152 | // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal. |
| 153 | // Note that for whatever reason, this test fails on Windows XP. |
| 154 | {"\xed\xb0\x80", L"\xfffd", false}, |
| 155 | // Non-BMP characters. The second is a non-character regarded as valid. |
| 156 | // The result will either be in UTF-16 or UTF-32. |
| 157 | #if defined(WCHAR_T_IS_UTF16) |
| 158 | {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true}, |
| 159 | {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true}, |
| 160 | #elif defined(WCHAR_T_IS_UTF32) |
| 161 | {"A\xF0\x90\x8C\x80z", L"A\x10300z", true}, |
| 162 | {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true}, |
| 163 | #endif |
| 164 | }; |
| 165 | |
| 166 | for (size_t i = 0; i < arraysize(convert_cases); i++) { |
| 167 | std::wstring converted; |
Spencer Low | d21dc82 | 2015-11-12 15:20:15 -0800 | [diff] [blame^] | 168 | errno = 0; |
Elliott Hughes | c1fd492 | 2015-11-11 18:02:29 +0000 | [diff] [blame] | 169 | const bool success = UTF8ToWide(convert_cases[i].utf8, |
| 170 | strlen(convert_cases[i].utf8), |
| 171 | &converted); |
| 172 | EXPECT_EQ(convert_cases[i].success, success); |
| 173 | // The original test always compared expected and converted, but don't do |
| 174 | // that because our implementation of UTF8ToWide() does not guarantee to |
| 175 | // produce the same output in error situations. |
| 176 | if (success) { |
| 177 | std::wstring expected(convert_cases[i].wide); |
| 178 | EXPECT_EQ(expected, converted); |
Spencer Low | d21dc82 | 2015-11-12 15:20:15 -0800 | [diff] [blame^] | 179 | } else { |
| 180 | EXPECT_EQ(EILSEQ, errno); |
Elliott Hughes | c1fd492 | 2015-11-11 18:02:29 +0000 | [diff] [blame] | 181 | } |
| 182 | } |
| 183 | |
| 184 | // Manually test an embedded NULL. |
| 185 | std::wstring converted; |
| 186 | EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted)); |
| 187 | ASSERT_EQ(3U, converted.length()); |
| 188 | EXPECT_EQ(static_cast<wchar_t>(0), converted[0]); |
| 189 | EXPECT_EQ('Z', converted[1]); |
| 190 | EXPECT_EQ('\t', converted[2]); |
| 191 | |
| 192 | // Make sure that conversion replaces, not appends. |
| 193 | EXPECT_TRUE(UTF8ToWide("B", 1, &converted)); |
| 194 | ASSERT_EQ(1U, converted.length()); |
| 195 | EXPECT_EQ('B', converted[0]); |
| 196 | } |
| 197 | |
| 198 | #if defined(WCHAR_T_IS_UTF16) |
| 199 | // This test is only valid when wchar_t == UTF-16. |
| 200 | TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) { |
| 201 | struct WideToUTF8Case { |
| 202 | const wchar_t* utf16; |
| 203 | const char* utf8; |
| 204 | bool success; |
| 205 | } convert_cases[] = { |
| 206 | // Regular UTF-16 input. |
| 207 | {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, |
| 208 | // Test a non-BMP character. |
| 209 | {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true}, |
| 210 | // Non-characters are passed through. |
| 211 | {L"\xffffHello", "\xEF\xBF\xBFHello", true}, |
| 212 | {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true}, |
| 213 | // The first character is a truncated UTF-16 character. |
| 214 | // Note that for whatever reason, this test fails on Windows XP. |
| 215 | {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", |
| 216 | #if (WINVER >= 0x0600) |
| 217 | // Only Vista and later has a new API/flag that correctly returns false. |
| 218 | false |
| 219 | #else |
| 220 | true |
| 221 | #endif |
| 222 | }, |
| 223 | // Truncated at the end. |
| 224 | // Note that for whatever reason, this test fails on Windows XP. |
| 225 | {L"\x597d\xd800", "\xe5\xa5\xbd\xef\xbf\xbd", |
| 226 | #if (WINVER >= 0x0600) |
| 227 | // Only Vista and later has a new API/flag that correctly returns false. |
| 228 | false |
| 229 | #else |
| 230 | true |
| 231 | #endif |
| 232 | }, |
| 233 | }; |
| 234 | |
| 235 | for (size_t i = 0; i < arraysize(convert_cases); i++) { |
| 236 | std::string converted; |
Spencer Low | d21dc82 | 2015-11-12 15:20:15 -0800 | [diff] [blame^] | 237 | errno = 0; |
Elliott Hughes | c1fd492 | 2015-11-11 18:02:29 +0000 | [diff] [blame] | 238 | const bool success = WideToUTF8(convert_cases[i].utf16, |
| 239 | wcslen(convert_cases[i].utf16), |
| 240 | &converted); |
| 241 | EXPECT_EQ(convert_cases[i].success, success); |
| 242 | // The original test always compared expected and converted, but don't do |
| 243 | // that because our implementation of WideToUTF8() does not guarantee to |
| 244 | // produce the same output in error situations. |
| 245 | if (success) { |
| 246 | std::string expected(convert_cases[i].utf8); |
| 247 | EXPECT_EQ(expected, converted); |
Spencer Low | d21dc82 | 2015-11-12 15:20:15 -0800 | [diff] [blame^] | 248 | } else { |
| 249 | EXPECT_EQ(EILSEQ, errno); |
Elliott Hughes | c1fd492 | 2015-11-11 18:02:29 +0000 | [diff] [blame] | 250 | } |
| 251 | } |
| 252 | } |
| 253 | |
| 254 | #elif defined(WCHAR_T_IS_UTF32) |
| 255 | // This test is only valid when wchar_t == UTF-32. |
| 256 | TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) { |
| 257 | struct WideToUTF8Case { |
| 258 | const wchar_t* utf32; |
| 259 | const char* utf8; |
| 260 | bool success; |
| 261 | } convert_cases[] = { |
| 262 | // Regular 16-bit input. |
| 263 | {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true}, |
| 264 | // Test a non-BMP character. |
| 265 | {L"A\x10300z", "A\xF0\x90\x8C\x80z", true}, |
| 266 | // Non-characters are passed through. |
| 267 | {L"\xffffHello", "\xEF\xBF\xBFHello", true}, |
| 268 | {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true}, |
| 269 | // Invalid Unicode code points. |
| 270 | {L"\xfffffffHello", "\xEF\xBF\xBDHello", false}, |
| 271 | // The first character is a truncated UTF-16 character. |
| 272 | {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false}, |
| 273 | {L"\xdc01Hello", "\xef\xbf\xbdHello", false}, |
| 274 | }; |
| 275 | |
| 276 | for (size_t i = 0; i < arraysize(convert_cases); i++) { |
| 277 | std::string converted; |
| 278 | EXPECT_EQ(convert_cases[i].success, |
| 279 | WideToUTF8(convert_cases[i].utf32, |
| 280 | wcslen(convert_cases[i].utf32), |
| 281 | &converted)); |
| 282 | std::string expected(convert_cases[i].utf8); |
| 283 | EXPECT_EQ(expected, converted); |
| 284 | } |
| 285 | } |
| 286 | #endif // defined(WCHAR_T_IS_UTF32) |
| 287 | |
| 288 | // The test below uses these types and functions, so just do enough to get the |
| 289 | // test running. |
// Stand-ins for the char16/string16 names used by the adapted test below;
// here they are simply the platform wide-character types.
using char16 = wchar_t;
using string16 = std::wstring;
| 292 | |
// Resizes the string |*t| to hold |size| - 1 characters and returns a pointer
// to its first character. |size| counts the NULL terminator, which
// std::(w)string::resize() already provides room for, hence the - 1.
template <typename T>
static void* WriteInto(T* t, size_t size) {
  T& target = *t;
  const size_t length_without_terminator = size - 1;
  target.resize(length_without_terminator);
  return &target[0];
}
| 299 | |
| 300 | // A stub implementation that calls a helper from above, just to get the test |
| 301 | // below working. This is just for testing and should not be moved to base |
| 302 | // because this ignores errors which is probably not a good idea, plus it takes |
| 303 | // a string16 type which we don't really have. |
| 304 | static std::string UTF16ToUTF8(const string16& utf16) { |
| 305 | return WideToUTF8(utf16); |
| 306 | } |
| 307 | |
| 308 | TEST(UTFStringConversionsTest, ConvertMultiString) { |
| 309 | static char16 multi16[] = { |
| 310 | 'f', 'o', 'o', '\0', |
| 311 | 'b', 'a', 'r', '\0', |
| 312 | 'b', 'a', 'z', '\0', |
| 313 | '\0' |
| 314 | }; |
| 315 | static char multi[] = { |
| 316 | 'f', 'o', 'o', '\0', |
| 317 | 'b', 'a', 'r', '\0', |
| 318 | 'b', 'a', 'z', '\0', |
| 319 | '\0' |
| 320 | }; |
| 321 | string16 multistring16; |
| 322 | memcpy(WriteInto(&multistring16, arraysize(multi16)), multi16, |
| 323 | sizeof(multi16)); |
| 324 | EXPECT_EQ(arraysize(multi16) - 1, multistring16.length()); |
| 325 | std::string expected; |
| 326 | memcpy(WriteInto(&expected, arraysize(multi)), multi, sizeof(multi)); |
| 327 | EXPECT_EQ(arraysize(multi) - 1, expected.length()); |
| 328 | const std::string& converted = UTF16ToUTF8(multistring16); |
| 329 | EXPECT_EQ(arraysize(multi) - 1, converted.length()); |
| 330 | EXPECT_EQ(expected, converted); |
| 331 | } |
| 332 | |
| 333 | // The tests below from sys_string_conversions_unittest.cc call SysWideToUTF8() |
| 334 | // and SysUTF8ToWide(), so these are stub implementations that call the helpers |
| 335 | // above. These are just for testing and should not be moved to base because |
| 336 | // they ignore errors which is probably not a good idea. |
| 337 | |
| 338 | static std::string SysWideToUTF8(const std::wstring& utf16) { |
| 339 | return WideToUTF8(utf16); |
| 340 | } |
| 341 | |
| 342 | static std::wstring SysUTF8ToWide(const std::string& utf8) { |
| 343 | return UTF8ToWide(utf8); |
| 344 | } |
| 345 | |
| 346 | // Below is adapted from https://chromium.googlesource.com/chromium/src/+/master/base/strings/sys_string_conversions_unittest.cc |
| 347 | |
| 348 | // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 349 | // Use of this source code is governed by a BSD-style license that can be |
| 350 | // found in the LICENSE file. |
| 351 | |
// The wide-string spelling of a character that needs more than 16 bits: one
// 32-bit unit when WCHAR_T_IS_UTF32 is defined, a surrogate pair otherwise.
static const std::wstring kSysWideOldItalicLetterA =
#if defined(WCHAR_T_IS_UTF32)
    L"\x10300";
#else
    L"\xd800\xdf00";
#endif
| 357 | |
| 358 | TEST(SysStrings, SysWideToUTF8) { |
| 359 | EXPECT_EQ("Hello, world", SysWideToUTF8(L"Hello, world")); |
| 360 | EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToUTF8(L"\x4f60\x597d")); |
| 361 | |
| 362 | // >16 bits |
| 363 | EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToUTF8(kSysWideOldItalicLetterA)); |
| 364 | |
| 365 | // Error case. When Windows finds a UTF-16 character going off the end of |
| 366 | // a string, it just converts that literal value to UTF-8, even though this |
| 367 | // is invalid. |
| 368 | // |
| 369 | // This is what XP does, but Vista has different behavior, so we don't bother |
| 370 | // verifying it: |
| 371 | // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw", |
| 372 | // SysWideToUTF8(L"\x4f60\xd800zyxw")); |
| 373 | |
| 374 | // Test embedded NULLs. |
| 375 | std::wstring wide_null(L"a"); |
| 376 | wide_null.push_back(0); |
| 377 | wide_null.push_back('b'); |
| 378 | |
| 379 | std::string expected_null("a"); |
| 380 | expected_null.push_back(0); |
| 381 | expected_null.push_back('b'); |
| 382 | |
| 383 | EXPECT_EQ(expected_null, SysWideToUTF8(wide_null)); |
| 384 | } |
| 385 | |
| 386 | TEST(SysStrings, SysUTF8ToWide) { |
| 387 | EXPECT_EQ(L"Hello, world", SysUTF8ToWide("Hello, world")); |
| 388 | EXPECT_EQ(L"\x4f60\x597d", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5\xbd")); |
| 389 | // >16 bits |
| 390 | EXPECT_EQ(kSysWideOldItalicLetterA, SysUTF8ToWide("\xF0\x90\x8C\x80")); |
| 391 | |
| 392 | // Error case. When Windows finds an invalid UTF-8 character, it just skips |
| 393 | // it. This seems weird because it's inconsistent with the reverse conversion. |
| 394 | // |
| 395 | // This is what XP does, but Vista has different behavior, so we don't bother |
| 396 | // verifying it: |
| 397 | // EXPECT_EQ(L"\x4f60zyxw", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5zyxw")); |
| 398 | |
| 399 | // Test embedded NULLs. |
| 400 | std::string utf8_null("a"); |
| 401 | utf8_null.push_back(0); |
| 402 | utf8_null.push_back('b'); |
| 403 | |
| 404 | std::wstring expected_null(L"a"); |
| 405 | expected_null.push_back(0); |
| 406 | expected_null.push_back('b'); |
| 407 | |
| 408 | EXPECT_EQ(expected_null, SysUTF8ToWide(utf8_null)); |
| 409 | } |
| 410 | |
| 411 | } // namespace base |
| 412 | } // namespace android |