Fix some wcwidth() special cases. Detailed explanation in the code comments. Test: treehugger Change-Id: I2aef2510724c1c622b83e226b51d4c8429b88272

commit: 20a9f99b3b600bf49b11bf9169d37d86a8fd21d6 [log] [tgz]
author: Elliott Hughes <enh@google.com> Wed May 29 21:45:51 2024 +0000
committer: Elliott Hughes <enh@google.com> Wed May 29 21:45:51 2024 +0000
tree: f7e14a08e95f183fb5feac9483b5d10131693df2
parent: 7c22b4c3723243f91406335d4a8689795a291c51 [diff] [blame]
diff --git a/tests/wchar_test.cpp b/tests/wchar_test.cpp
index 5256b08..387d23b 100644
--- a/tests/wchar_test.cpp
+++ b/tests/wchar_test.cpp

@@ -1075,10 +1075,39 @@
 
   EXPECT_EQ(0, wcwidth(0x0300)); // Combining grave.
   EXPECT_EQ(0, wcwidth(0x20dd)); // Combining enclosing circle.
-  EXPECT_EQ(0, wcwidth(0x00ad)); // Soft hyphen (SHY).
   EXPECT_EQ(0, wcwidth(0x200b)); // Zero width space.
 }
 
+TEST(wchar, wcwidth_non_spacing_special_cases) {
+  if (!have_dl()) return;
+
+  // U+00AD is a soft hyphen, which normally shouldn't be rendered at all.
+  // I think the assumption here is that you elide the soft hyphen character
+  // completely in that case, and never call wcwidth() if you don't want to
+  // render it as an actual hyphen. Whereas if you do want to render it,
+  // you call wcwidth(), and 1 is the right answer. This is what Markus Kuhn's
+  // original https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c did,
+  // and glibc and iOS do the same.
+  // See also: https://en.wikipedia.org/wiki/Soft_hyphen#Text_to_be_formatted_by_the_recipient
+  EXPECT_EQ(1, wcwidth(0x00ad)); // Soft hyphen (SHY).
+
+  // U+115F is the Hangeul choseong filler (for a degenerate composed
+  // character missing an initial consonant (as opposed to one with a
+  // leading ieung). Since the code points for combining jungseong (medial
+  // vowels) and jongseong (trailing consonants) have width 0, the choseong
+  // (initial consonant) has width 2 to cover the entire syllable. So unless
+  // U+115f has width 2, a degenerate composed "syllable" without an initial
+  // consonant or ieung would have a total width of 0, which is silly.
+  // The following sequence is effectively "약" without the leading ieung...
+  EXPECT_EQ(2, wcwidth(0x115f)); // Hangeul choseong filler.
+  EXPECT_EQ(0, wcwidth(0x1163)); // Hangeul jungseong "ya".
+  EXPECT_EQ(0, wcwidth(0x11a8)); // Hangeul jongseong "kiyeok".
+
+  // U+1160, the jungseong filler, has width 0 because it must have been
+  // preceded by either a choseong or choseong filler.
+  EXPECT_EQ(0, wcwidth(0x1160));
+}
+
 TEST(wchar, wcwidth_cjk) {
   if (!have_dl()) return;
 
@@ -1102,8 +1131,10 @@
   if (!have_dl()) return;
 
   EXPECT_EQ(2, wcwidth(0xac00)); // Start of block.
-  EXPECT_EQ(2, wcwidth(0xd7a3)); // End of defined code points in Unicode 7.
-  // Undefined characters at the end of the block have width 1.
+  EXPECT_EQ(2, wcwidth(0xd7a3)); // End of defined code points as of Unicode 15.
+
+  // Undefined characters at the end of the block currently have width 1,
+  // but since they're undefined, we don't test that.
 }
 
 TEST(wchar, wcwidth_kana) {
@@ -1137,11 +1168,21 @@
   EXPECT_EQ(0, wcwidth(0xe0000)); // ...through 0xe0fff.
 }
 
-TEST(wchar, wcwidth_korean_common_non_syllables) {
+TEST(wchar, wcwidth_hangeul_compatibility_jamo) {
   if (!have_dl()) return;
 
-  EXPECT_EQ(2, wcwidth(L'ㅜ')); // Korean "crying" emoticon.
-  EXPECT_EQ(2, wcwidth(L'ㅋ')); // Korean "laughing" emoticon.
+  // These are actually the *compatibility* jamo code points, *not* the regular
+  // jamo code points (U+1100-U+11FF) using a jungseong filler. If you use the
+  // Android IME to type any of these, you get these code points.
+
+  // (Half of) the Korean "crying" emoticon "ㅠㅠ".
+  // Actually U+3160 "Hangeul Letter Yu" from Hangeul Compatibility Jamo.
+  EXPECT_EQ(2, wcwidth(L'ㅠ'));
+  // The two halves of the Korean internet shorthand "ㄱㅅ" (short for 감사).
+  // Actually U+3131 "Hangeul Letter Kiyeok" and U+3145 "Hangeul Letter Sios"
+  // from Hangeul Compatibility Jamo.
+  EXPECT_EQ(2, wcwidth(L'ㄱ'));
+  EXPECT_EQ(2, wcwidth(L'ㅅ'));
 }
 
 TEST(wchar, wcswidth) {
commit	20a9f99b3b600bf49b11bf9169d37d86a8fd21d6	[log] [tgz]
author	Elliott Hughes <enh@google.com>	Wed May 29 21:45:51 2024 +0000
committer	Elliott Hughes <enh@google.com>	Wed May 29 21:45:51 2024 +0000
tree	f7e14a08e95f183fb5feac9483b5d10131693df2
parent	7c22b4c3723243f91406335d4a8689795a291c51 [diff] [blame]