Fix mismatched return types for surrogate pairs.
We've had these backward all this time. The relevant quote is in a
code comment in the implementation, but the first call after
completely decoding a code point that requires a surrogate pair should
return the number of bytes decoded by the most recent call, and the
second call should return -3 (if only C had given those some named
constants that might have been more obviously wrong).
Bug: https://issuetracker.google.com/289419882
Test: Fixed the test, tests run against glibc and musl to confirm
Change-Id: Idabf01075b1cad35b604ede8d676d6f0b1dc91e6
diff --git a/libc/bionic/mbrtoc16.cpp b/libc/bionic/mbrtoc16.cpp
index 154b8a3..e87991a 100644
--- a/libc/bionic/mbrtoc16.cpp
+++ b/libc/bionic/mbrtoc16.cpp
@@ -47,15 +47,36 @@
mbstate_set_byte(state, 3, nconv & 0xff);
*pc16 = ((c32 & 0xffc00) >> 10) | 0xd800;
- // Defined by POSIX as return value for first surrogate character.
- return static_cast<size_t>(-3);
+ // https://issuetracker.google.com/289419882
+ //
+ // We misread the spec when implementing this. The first call should return
+ // the length of the decoded character, and the second call should return -3
+ // to indicate that the output is a continuation of the character decoded by
+ // the first call.
+ //
+ // C23 7.30.1.3.4:
+ //
+ // between 1 and n inclusive if the next n or fewer bytes complete a valid
+ // multibyte character (which is the value stored); the value returned is
+ // the number of bytes that complete the multibyte character.
+ //
+ // (size_t)(-3) if the next character resulting from a previous call has
+ // been stored (no bytes from the input have been consumed by this call).
+ //
+ // The first call returns the number of bytes consumed, and the second call
+ // returns -3.
+ //
+ // All UTF-8 sequences that encode a surrogate pair are 4 bytes, but we may
+ // not have seen the full sequence yet.
+ return nconv;
}
static size_t finish_surrogate(char16_t* pc16, mbstate_t* state) {
char16_t trail = mbstate_get_byte(state, 1) << 8 |
mbstate_get_byte(state, 0);
*pc16 = trail;
- return mbstate_reset_and_return(mbstate_get_byte(state, 3), state);
+ mbstate_reset(state);
+ return static_cast<size_t>(-3);
}
size_t mbrtoc16(char16_t* pc16, const char* s, size_t n, mbstate_t* ps) {
diff --git a/libc/private/bionic_mbstate.h b/libc/private/bionic_mbstate.h
index 29f5aa6..0e5f861 100644
--- a/libc/private/bionic_mbstate.h
+++ b/libc/private/bionic_mbstate.h
@@ -57,14 +57,18 @@
return ps->__seq[n];
}
+static inline void mbstate_reset(mbstate_t* ps) {
+ *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+}
+
static inline __wur size_t mbstate_reset_and_return_illegal(int _errno, mbstate_t* ps) {
errno = _errno;
- *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+ mbstate_reset(ps);
return BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE;
}
static inline __wur size_t mbstate_reset_and_return(size_t _return, mbstate_t* ps) {
- *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+ mbstate_reset(ps);
return _return;
}
diff --git a/tests/uchar_test.cpp b/tests/uchar_test.cpp
index 9e42f58..c08b574 100644
--- a/tests/uchar_test.cpp
+++ b/tests/uchar_test.cpp
@@ -224,32 +224,6 @@
ASSERT_EQ(3U, mbrtoc16(&out, "\xe2\x82\xac" "def", 6, nullptr));
ASSERT_EQ(static_cast<char16_t>(0x20ac), out);
// 4-byte UTF-8 will be returned as a surrogate pair...
-#ifdef __BIONIC__
- // https://issuetracker.google.com/289419882
- //
- // We misread the spec when implementing this. The first call should return
- // the length of the decoded character, and the second call should return -3
- // to indicate that the output is a continuation of the character decoded by
- // the first call.
- //
- // C23 7.30.1.3.4:
- //
- // between 1 and n inclusive if the next n or fewer bytes complete a valid
- // multibyte character (which is the value stored); the value returned is
- // the number of bytes that complete the multibyte character.
- //
- // (size_t)(-3) if the next character resulting from a previous call has
- // been stored (no bytes from the input have been consumed by this call).
- //
- // Leaving the test for the wrong outputs here while we clean up and improve
- // the rest of the tests to get a better handle on the behavior differences
- // before fixing the bug.
- ASSERT_EQ(static_cast<size_t>(-3),
- mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr));
- ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
- ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d" "ef", 6, nullptr));
- ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#else
ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr));
ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out,
@@ -257,7 +231,6 @@
"ef",
6, nullptr));
ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#endif
}
TEST(uchar, mbrtoc16_long_sequences) {
@@ -326,14 +299,6 @@
// 4-byte UTF-8.
ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\xf4", 1, ps));
ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\x8a\xaf", 2, ps));
-#ifdef __BIONIC__
- // https://issuetracker.google.com/289419882
- // See explanation in mbrtoc16 test for the same bug.
- ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out, "\x8d" "ef", 3, ps));
- ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
- ASSERT_EQ(1U, mbrtoc16(&out, "\x80" "ef", 3, ps));
- ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#else
ASSERT_EQ(1U, mbrtoc16(&out,
"\x8d"
"ef",
@@ -344,7 +309,6 @@
"ef",
3, ps));
ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#endif
ASSERT_TRUE(mbsinit(ps));
// Invalid 2-byte