Fix mismatched return types for surrogate pairs. We've had these backward all this time. The relevant quote is in a code comment in the implementation, but the first call after completely decoding a code point that requires a surrogate pair should return the number of bytes decoded by the most recent call, and the second call should return -3 (if only C had given those some named constants that might have been more obviously wrong). Bug: https://issuetracker.google.com/289419882 Test: Fixed the test, tests run against glibc and musl to confirm Change-Id: Idabf01075b1cad35b604ede8d676d6f0b1dc91e6

commit: 3a8fed1ac339b279000ba89f85041d9e28829288 [log] [tgz]
author: Dan Albert <danalbert@google.com> Thu Jul 20 23:08:22 2023 +0000
committer: Dan Albert <danalbert@google.com> Fri Aug 04 19:48:35 2023 +0000
tree: c5179cf54ec70c3fce54e39fda35eb7acee35386
parent: 013d747dd3a20780336178992a0f85d7beba4db6 [diff]
diff --git a/libc/bionic/mbrtoc16.cpp b/libc/bionic/mbrtoc16.cpp
index 154b8a3..e87991a 100644
--- a/libc/bionic/mbrtoc16.cpp
+++ b/libc/bionic/mbrtoc16.cpp

@@ -47,15 +47,36 @@
   mbstate_set_byte(state, 3, nconv & 0xff);
 
   *pc16 = ((c32 & 0xffc00) >> 10) | 0xd800;
-  // Defined by POSIX as return value for first surrogate character.
-  return static_cast<size_t>(-3);
+  // https://issuetracker.google.com/289419882
+  //
+  // We misread the spec when implementing this. The first call should return
+  // the length of the decoded character, and the second call should return -3
+  // to indicate that the output is a continuation of the character decoded by
+  // the first call.
+  //
+  // C23 7.30.1.3.4:
+  //
+  //     between 1 and n inclusive if the next n or fewer bytes complete a valid
+  //     multibyte character (which is the value stored); the value returned is
+  //     the number of bytes that complete the multibyte character.
+  //
+  //     (size_t)(-3) if the next character resulting from a previous call has
+  //     been stored (no bytes from the input have been consumed by this call).
+  //
+  // The first call returns the number of bytes consumed, and the second call
+  // returns -3.
+  //
+  // All UTF-8 sequences that encode a surrogate pair are 4 bytes, but we may
+  // not have seen the full sequence yet.
+  return nconv;
 }
 
 static size_t finish_surrogate(char16_t* pc16, mbstate_t* state) {
   char16_t trail = mbstate_get_byte(state, 1) << 8 |
                    mbstate_get_byte(state, 0);
   *pc16 = trail;
-  return mbstate_reset_and_return(mbstate_get_byte(state, 3), state);
+  mbstate_reset(state);
+  return static_cast<size_t>(-3);
 }
 
 size_t mbrtoc16(char16_t* pc16, const char* s, size_t n, mbstate_t* ps) {

diff --git a/libc/private/bionic_mbstate.h b/libc/private/bionic_mbstate.h
index 29f5aa6..0e5f861 100644
--- a/libc/private/bionic_mbstate.h
+++ b/libc/private/bionic_mbstate.h

@@ -57,14 +57,18 @@
   return ps->__seq[n];
 }
 
+static inline void mbstate_reset(mbstate_t* ps) {
+  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+}
+
 static inline __wur size_t mbstate_reset_and_return_illegal(int _errno, mbstate_t* ps) {
   errno = _errno;
-  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+  mbstate_reset(ps);
   return BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE;
 }
 
 static inline __wur size_t mbstate_reset_and_return(size_t _return, mbstate_t* ps) {
-  *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+  mbstate_reset(ps);
   return _return;
 }
 

diff --git a/tests/uchar_test.cpp b/tests/uchar_test.cpp
index 9e42f58..c08b574 100644
--- a/tests/uchar_test.cpp
+++ b/tests/uchar_test.cpp

@@ -224,32 +224,6 @@
   ASSERT_EQ(3U, mbrtoc16(&out, "\xe2\x82\xac" "def", 6, nullptr));
   ASSERT_EQ(static_cast<char16_t>(0x20ac), out);
   // 4-byte UTF-8 will be returned as a surrogate pair...
-#ifdef __BIONIC__
-  // https://issuetracker.google.com/289419882
-  //
-  // We misread the spec when implementing this. The first call should return
-  // the length of the decoded character, and the second call should return -3
-  // to indicate that the output is a continuation of the character decoded by
-  // the first call.
-  //
-  // C23 7.30.1.3.4:
-  //
-  //     between 1 and n inclusive if the next n or fewer bytes complete a valid
-  //     multibyte character (which is the value stored); the value returned is
-  //     the number of bytes that complete the multibyte character.
-  //
-  //     (size_t)(-3) if the next character resulting from a previous call has
-  //     been stored (no bytes from the input have been consumed by this call).
-  //
-  // Leaving the test for the wrong outputs here while we clean up and improve
-  // the rest of the tests to get a better handle on the behavior differences
-  // before fixing the bug.
-  ASSERT_EQ(static_cast<size_t>(-3),
-            mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr));
-  ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
-  ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d" "ef", 6, nullptr));
-  ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#else
   ASSERT_EQ(4U, mbrtoc16(&out, "\xf4\x8a\xaf\x8d", 6, nullptr));
   ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
   ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out,
@@ -257,7 +231,6 @@
                                               "ef",
                                               6, nullptr));
   ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#endif
 }
 
 TEST(uchar, mbrtoc16_long_sequences) {
@@ -326,14 +299,6 @@
   // 4-byte UTF-8.
   ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\xf4", 1, ps));
   ASSERT_EQ(static_cast<size_t>(-2), mbrtoc16(&out, "\x8a\xaf", 2, ps));
-#ifdef __BIONIC__
-  // https://issuetracker.google.com/289419882
-  // See explanation in mbrtoc16 test for the same bug.
-  ASSERT_EQ(static_cast<size_t>(-3), mbrtoc16(&out, "\x8d" "ef", 3, ps));
-  ASSERT_EQ(static_cast<char16_t>(0xdbea), out);
-  ASSERT_EQ(1U, mbrtoc16(&out, "\x80" "ef", 3, ps));
-  ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#else
   ASSERT_EQ(1U, mbrtoc16(&out,
                          "\x8d"
                          "ef",
@@ -344,7 +309,6 @@
                                               "ef",
                                               3, ps));
   ASSERT_EQ(static_cast<char16_t>(0xdfcd), out);
-#endif
   ASSERT_TRUE(mbsinit(ps));
 
   // Invalid 2-byte
commit	3a8fed1ac339b279000ba89f85041d9e28829288	[log] [tgz]
author	Dan Albert <danalbert@google.com>	Thu Jul 20 23:08:22 2023 +0000
committer	Dan Albert <danalbert@google.com>	Fri Aug 04 19:48:35 2023 +0000
tree	c5179cf54ec70c3fce54e39fda35eb7acee35386
parent	013d747dd3a20780336178992a0f85d7beba4db6 [diff]