Adds functionality specified by uchar.h
The UTF-8 conversion code moves out of mbrtowc and wcrtomb and into mbrtoc32
and c32rtomb. The wc functions now simply call the c32 functions.
Bug: 14646575
Change-Id: I49d4b95fed0f9d790260c996c4d0f8bfd1686324
diff --git a/libc/bionic/wchar.cpp b/libc/bionic/wchar.cpp
index 5da882f..acb2761 100644
--- a/libc/bionic/wchar.cpp
+++ b/libc/bionic/wchar.cpp
@@ -27,9 +27,12 @@
*/
#include <errno.h>
-#include <string.h>
#include <sys/param.h>
+#include <string.h>
#include <wchar.h>
+#include <uchar.h>
+
+#include "private/bionic_mbstate.h"
//
// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
@@ -50,36 +53,6 @@
// function pointers.
//
-#define ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1)
-#define ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2)
-
-static size_t mbstate_bytes_so_far(const mbstate_t* ps) {
- return
- (ps->__seq[2] != 0) ? 3 :
- (ps->__seq[1] != 0) ? 2 :
- (ps->__seq[0] != 0) ? 1 : 0;
-}
-
-static void mbstate_set_byte(mbstate_t* ps, int i, char byte) {
- ps->__seq[i] = static_cast<uint8_t>(byte);
-}
-
-static uint8_t mbstate_get_byte(const mbstate_t* ps, int n) {
- return ps->__seq[n];
-}
-
-static size_t reset_and_return_illegal(int _errno, mbstate_t* ps) {
- errno = _errno;
- *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
- return ERR_ILLEGAL_SEQUENCE;
-}
-
-static size_t reset_and_return(int _return, mbstate_t* ps) {
- *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
- return _return;
-}
-
-
int mbsinit(const mbstate_t* ps) {
return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0));
}
@@ -88,104 +61,8 @@
static mbstate_t __private_state;
mbstate_t* state = (ps == NULL) ? &__private_state : ps;
- // We should never get to a state which has all 4 bytes of the sequence set.
- // Full state verification is done when decoding the sequence (after we have
- // all the bytes).
- if (mbstate_get_byte(state, 3) != 0) {
- return reset_and_return_illegal(EINVAL, state);
- }
-
- if (s == NULL) {
- s = "";
- n = 1;
- pwc = NULL;
- }
-
- if (n == 0) {
- return 0;
- }
-
- uint8_t ch;
- if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) {
- // Fast path for plain ASCII characters.
- if (pwc != NULL) {
- *pwc = ch;
- }
- return (ch != '\0' ? 1 : 0);
- }
-
- // Determine the number of octets that make up this character
- // from the first octet, and a mask that extracts the
- // interesting bits of the first octet. We already know
- // the character is at least two bytes long.
- size_t length;
- int mask;
-
- // We also specify a lower bound for the character code to
- // detect redundant, non-"shortest form" encodings. For
- // example, the sequence C0 80 is _not_ a legal representation
- // of the null character. This enforces a 1-to-1 mapping
- // between character codes and their multibyte representations.
- wchar_t lower_bound;
-
- // The first byte in the state (if any) tells the length.
- size_t bytes_so_far = mbstate_bytes_so_far(state);
- ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s);
- if ((ch & 0x80) == 0) {
- mask = 0x7f;
- length = 1;
- lower_bound = 0;
- } else if ((ch & 0xe0) == 0xc0) {
- mask = 0x1f;
- length = 2;
- lower_bound = 0x80;
- } else if ((ch & 0xf0) == 0xe0) {
- mask = 0x0f;
- length = 3;
- lower_bound = 0x800;
- } else if ((ch & 0xf8) == 0xf0) {
- mask = 0x07;
- length = 4;
- lower_bound = 0x10000;
- } else {
- // Malformed input; input is not UTF-8. See RFC 3629.
- return reset_and_return_illegal(EILSEQ, state);
- }
-
- // Fill in the state.
- size_t bytes_wanted = length - bytes_so_far;
- size_t i;
- for (i = 0; i < MIN(bytes_wanted, n); i++) {
- if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) {
- // Malformed input; bad characters in the middle of a character.
- return reset_and_return_illegal(EILSEQ, state);
- }
- mbstate_set_byte(state, bytes_so_far + i, *s++);
- }
- if (i < bytes_wanted) {
- return ERR_INCOMPLETE_SEQUENCE;
- }
-
- // Decode the octet sequence representing the character in chunks
- // of 6 bits, most significant first.
- wchar_t wch = mbstate_get_byte(state, 0) & mask;
- for (i = 1; i < length; i++) {
- wch <<= 6;
- wch |= mbstate_get_byte(state, i) & 0x3f;
- }
-
- if (wch < lower_bound) {
- // Malformed input; redundant encoding.
- return reset_and_return_illegal(EILSEQ, state);
- }
- if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) {
- // Malformed input; invalid code points.
- return reset_and_return_illegal(EILSEQ, state);
- }
- if (pwc != NULL) {
- *pwc = wch;
- }
- return reset_and_return(wch == L'\0' ? 0 : bytes_wanted, state);
+ // Our wchar_t is UTF-32
+ return mbrtoc32(reinterpret_cast<char32_t*>(pwc), s, n, state);
}
size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
@@ -212,10 +89,10 @@
r = 1;
} else {
r = mbrtowc(NULL, *src + i, nmc - i, state);
- if (r == ERR_ILLEGAL_SEQUENCE) {
+ if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
return reset_and_return_illegal(EILSEQ, state);
}
- if (r == ERR_INCOMPLETE_SEQUENCE) {
+ if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
return reset_and_return_illegal(EILSEQ, state);
}
if (r == 0) {
@@ -246,11 +123,11 @@
r = 1;
} else {
r = mbrtowc(dst + o, *src + i, nmc - i, state);
- if (r == ERR_ILLEGAL_SEQUENCE) {
+ if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
*src += i;
return reset_and_return_illegal(EILSEQ, state);
}
- if (r == ERR_INCOMPLETE_SEQUENCE) {
+ if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
*src += nmc;
return reset_and_return(EILSEQ, state);
}
@@ -272,63 +149,8 @@
static mbstate_t __private_state;
mbstate_t* state = (ps == NULL) ? &__private_state : ps;
- if (s == NULL) {
- // Equivalent to wcrtomb(buf, L'\0', ps).
- return reset_and_return(1, state);
- }
-
- // POSIX states that if wc is a null wide character, a null byte shall be
- // stored, preceded by any shift sequence needed to restore the initial shift
- // state. Since shift states are not supported, only the null byte is stored.
- if (wc == L'\0') {
- *s = '\0';
- reset_and_return(1, state);
- }
-
- if (!mbsinit(state)) {
- return reset_and_return_illegal(EILSEQ, state);
- }
-
- if ((wc & ~0x7f) == 0) {
- // Fast path for plain ASCII characters.
- *s = wc;
- return 1;
- }
-
- // Determine the number of octets needed to represent this character.
- // We always output the shortest sequence possible. Also specify the
- // first few bits of the first octet, which contains the information
- // about the sequence length.
- uint8_t lead;
- size_t length;
- if ((wc & ~0x7f) == 0) {
- lead = 0;
- length = 1;
- } else if ((wc & ~0x7ff) == 0) {
- lead = 0xc0;
- length = 2;
- } else if ((wc & ~0xffff) == 0) {
- lead = 0xe0;
- length = 3;
- } else if ((wc & ~0x1fffff) == 0) {
- lead = 0xf0;
- length = 4;
- } else {
- errno = EILSEQ;
- return ERR_ILLEGAL_SEQUENCE;
- }
-
- // Output the octets representing the character in chunks
- // of 6 bits, least significant last. The first octet is
- // a special case because it contains the sequence length
- // information.
- for (size_t i = length - 1; i > 0; i--) {
- s[i] = (wc & 0x3f) | 0x80;
- wc >>= 6;
- }
- *s = (wc & 0xff) | lead;
-
- return length;
+ // Our wchar_t is UTF-32
+ return c32rtomb(s, static_cast<char32_t>(wc), state);
}
size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
@@ -352,7 +174,7 @@
r = 1;
} else {
r = wcrtomb(buf, wc, state);
- if (r == ERR_ILLEGAL_SEQUENCE) {
+ if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
return r;
}
}
@@ -373,14 +195,14 @@
} else if (len - o >= sizeof(buf)) {
// Enough space to translate in-place.
r = wcrtomb(dst + o, wc, state);
- if (r == ERR_ILLEGAL_SEQUENCE) {
+ if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
*src += i;
return r;
}
} else {
// May not be enough space; use temp buffer.
r = wcrtomb(buf, wc, state);
- if (r == ERR_ILLEGAL_SEQUENCE) {
+ if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
*src += i;
return r;
}