Fix some wcwidth() special cases.
Detailed explanation in the code comments.
Test: treehugger
Change-Id: I2aef2510724c1c622b83e226b51d4c8429b88272
diff --git a/tests/wchar_test.cpp b/tests/wchar_test.cpp
index 5256b08..387d23b 100644
--- a/tests/wchar_test.cpp
+++ b/tests/wchar_test.cpp
@@ -1075,10 +1075,39 @@
EXPECT_EQ(0, wcwidth(0x0300)); // Combining grave.
EXPECT_EQ(0, wcwidth(0x20dd)); // Combining enclosing circle.
- EXPECT_EQ(0, wcwidth(0x00ad)); // Soft hyphen (SHY).
EXPECT_EQ(0, wcwidth(0x200b)); // Zero width space.
}
+TEST(wchar, wcwidth_non_spacing_special_cases) {
+ if (!have_dl()) return;
+
+ // U+00AD is a soft hyphen, which normally shouldn't be rendered at all.
+ // I think the assumption here is that you elide the soft hyphen character
+ // completely in that case, and never call wcwidth() if you don't want to
+ // render it as an actual hyphen. Whereas if you do want to render it,
+ // you call wcwidth(), and 1 is the right answer. This is what Markus Kuhn's
+ // original https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c did,
+ // and glibc and iOS do the same.
+ // See also: https://en.wikipedia.org/wiki/Soft_hyphen#Text_to_be_formatted_by_the_recipient
+ EXPECT_EQ(1, wcwidth(0x00ad)); // Soft hyphen (SHY).
+
+ // U+115F is the Hangeul choseong filler (for a degenerate composed
+ // character missing an initial consonant (as opposed to one with a
+ // leading ieung). Since the code points for combining jungseong (medial
+ // vowels) and jongseong (trailing consonants) have width 0, the choseong
+ // (initial consonant) has width 2 to cover the entire syllable. So unless
+ // U+115f has width 2, a degenerate composed "syllable" without an initial
+ // consonant or ieung would have a total width of 0, which is silly.
+ // The following sequence is effectively "약" without the leading ieung...
+ EXPECT_EQ(2, wcwidth(0x115f)); // Hangeul choseong filler.
+ EXPECT_EQ(0, wcwidth(0x1163)); // Hangeul jungseong "ya".
+ EXPECT_EQ(0, wcwidth(0x11a8)); // Hangeul jongseong "kiyeok".
+
+ // U+1160, the jungseong filler, has width 0 because it must have been
+ // preceded by either a choseong or choseong filler.
+ EXPECT_EQ(0, wcwidth(0x1160));
+}
+
TEST(wchar, wcwidth_cjk) {
if (!have_dl()) return;
@@ -1102,8 +1131,10 @@
if (!have_dl()) return;
EXPECT_EQ(2, wcwidth(0xac00)); // Start of block.
- EXPECT_EQ(2, wcwidth(0xd7a3)); // End of defined code points in Unicode 7.
- // Undefined characters at the end of the block have width 1.
+ EXPECT_EQ(2, wcwidth(0xd7a3)); // End of defined code points as of Unicode 15.
+
+ // Undefined characters at the end of the block currently have width 1,
+ // but since they're undefined, we don't test that.
}
TEST(wchar, wcwidth_kana) {
@@ -1137,11 +1168,21 @@
EXPECT_EQ(0, wcwidth(0xe0000)); // ...through 0xe0fff.
}
-TEST(wchar, wcwidth_korean_common_non_syllables) {
+TEST(wchar, wcwidth_hangeul_compatibility_jamo) {
if (!have_dl()) return;
- EXPECT_EQ(2, wcwidth(L'ㅜ')); // Korean "crying" emoticon.
- EXPECT_EQ(2, wcwidth(L'ㅋ')); // Korean "laughing" emoticon.
+ // These are actually the *compatibility* jamo code points, *not* the regular
+ // jamo code points (U+1100-U+11FF) using a jungseong filler. If you use the
+ // Android IME to type any of these, you get these code points.
+
+ // (Half of) the Korean "crying" emoticon "ㅠㅠ".
+ // Actually U+3160 "Hangeul Letter Yu" from Hangeul Compatibility Jamo.
+ EXPECT_EQ(2, wcwidth(L'ㅠ'));
+ // The two halves of the Korean internet shorthand "ㄱㅅ" (short for 감사).
+ // Actually U+3131 "Hangeul Letter Kiyeok" and U+3145 "Hangeul Letter Sios"
+ // from Hangeul Compatibility Jamo.
+ EXPECT_EQ(2, wcwidth(L'ㄱ'));
+ EXPECT_EQ(2, wcwidth(L'ㅅ'));
}
TEST(wchar, wcswidth) {