Fix some long-standing UTF-8 bugs.
We were incorrectly rejecting U+fffe and U+ffff, and incorrectly accepting
characters above U+10ffff (see https://tools.ietf.org/html/rfc3629
section 12 for that restriction).
Bug: http://lists.landley.net/pipermail/toybox-landley.net/2017-September/009146.html
Test: ran tests
Test: also ran the exhaustive test from that email thread
Change-Id: I8ae8e41cef01b02933bd4f653ee07791932b79a5
diff --git a/libc/bionic/mbrtoc32.cpp b/libc/bionic/mbrtoc32.cpp
index f004b78..88a077c 100644
--- a/libc/bionic/mbrtoc32.cpp
+++ b/libc/bionic/mbrtoc32.cpp
@@ -127,7 +127,7 @@
// Malformed input; redundant encoding.
return mbstate_reset_and_return_illegal(EILSEQ, state);
}
- if ((c32 >= 0xd800 && c32 <= 0xdfff) || c32 == 0xfffe || c32 == 0xffff) {
+ if ((c32 >= 0xd800 && c32 <= 0xdfff) || (c32 > 0x10ffff)) {
// Malformed input; invalid code points.
return mbstate_reset_and_return_illegal(EILSEQ, state);
}
diff --git a/tests/uchar_test.cpp b/tests/uchar_test.cpp
index 8b29667..2b4c4d3 100644
--- a/tests/uchar_test.cpp
+++ b/tests/uchar_test.cpp
@@ -322,6 +322,35 @@
#endif
}
+TEST(uchar, mbrtoc32_valid_non_characters) {  // Checks that mbrtoc32 decodes U+fffe and U+ffff instead of rejecting them.
+#if HAVE_UCHAR
+ ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));  // Select the UTF-8 locale; the test fails here if it cannot be set.
+ uselocale(LC_GLOBAL_LOCALE);  // Use the global locale (set on the line above) for this thread.
+
+ char32_t out[8] = {};
+ ASSERT_EQ(3U, mbrtoc32(out, "\xef\xbf\xbe", 3, nullptr));  // ef bf be is the 3-byte encoding of U+fffe; expect 3 bytes consumed.
+ ASSERT_EQ(0xfffeU, out[0]);
+ ASSERT_EQ(3U, mbrtoc32(out, "\xef\xbf\xbf", 3, nullptr));  // ef bf bf is the 3-byte encoding of U+ffff; expect 3 bytes consumed.
+ ASSERT_EQ(0xffffU, out[0]);
+#else
+ GTEST_LOG_(INFO) << "uchar.h is unavailable.\n";
+#endif
+}
+
+TEST(uchar, mbrtoc32_out_of_range) {  // Checks that mbrtoc32 rejects a sequence encoding a value above U+10ffff.
+#if HAVE_UCHAR
+ ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));  // Select the UTF-8 locale; the test fails here if it cannot be set.
+ uselocale(LC_GLOBAL_LOCALE);  // Use the global locale (set on the line above) for this thread.
+
+ char32_t out[8] = {};
+ errno = 0;  // Clear errno so the EILSEQ check below reflects this call only.
+ ASSERT_EQ(static_cast<size_t>(-1), mbrtoc32(out, "\xf5\x80\x80\x80", 4, nullptr));  // f5 80 80 80 encodes 0x140000, beyond the U+10ffff limit.
+ ASSERT_EQ(EILSEQ, errno);
+#else
+ GTEST_LOG_(INFO) << "uchar.h is unavailable.\n";
+#endif
+}
+
TEST(uchar, mbrtoc32) {
#if HAVE_UCHAR
char32_t out[8];
diff --git a/tests/wchar_test.cpp b/tests/wchar_test.cpp
index b42e13c..e2def07 100644
--- a/tests/wchar_test.cpp
+++ b/tests/wchar_test.cpp
@@ -305,6 +305,28 @@
ASSERT_EQ(EILSEQ, errno);
}
+TEST(wchar, mbrtowc_valid_non_characters) {  // Checks that mbrtowc decodes U+fffe and U+ffff instead of rejecting them.
+ ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));  // Select the UTF-8 locale; the test fails here if it cannot be set.
+ uselocale(LC_GLOBAL_LOCALE);  // Use the global locale (set on the line above) for this thread.
+
+ wchar_t out[8] = {};
+
+ ASSERT_EQ(3U, mbrtowc(out, "\xef\xbf\xbe", 3, nullptr));  // ef bf be is the 3-byte encoding of U+fffe; expect 3 bytes consumed.
+ ASSERT_EQ(static_cast<wchar_t>(0xfffe), out[0]);
+ ASSERT_EQ(3U, mbrtowc(out, "\xef\xbf\xbf", 3, nullptr));  // ef bf bf is the 3-byte encoding of U+ffff; expect 3 bytes consumed.
+ ASSERT_EQ(static_cast<wchar_t>(0xffff), out[0]);
+}
+
+TEST(wchar, mbrtowc_out_of_range) {  // Checks that mbrtowc rejects a sequence encoding a value above U+10ffff.
+ ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));  // Select the UTF-8 locale; the test fails here if it cannot be set.
+ uselocale(LC_GLOBAL_LOCALE);  // Use the global locale (set on the line above) for this thread.
+
+ wchar_t out[8] = {};
+ errno = 0;  // Clear errno so the EILSEQ check below reflects this call only.
+ ASSERT_EQ(static_cast<size_t>(-1), mbrtowc(out, "\xf5\x80\x80\x80", 4, nullptr));  // f5 80 80 80 encodes 0x140000, beyond the U+10ffff limit.
+ ASSERT_EQ(EILSEQ, errno);
+}
+
static void test_mbrtowc_incomplete(mbstate_t* ps) {
ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
uselocale(LC_GLOBAL_LOCALE);