| /* | 
 |  * Copyright (C) 2014 The Android Open Source Project | 
 |  * All rights reserved. | 
 |  * | 
 |  * Redistribution and use in source and binary forms, with or without | 
 |  * modification, are permitted provided that the following conditions | 
 |  * are met: | 
 |  *  * Redistributions of source code must retain the above copyright | 
 |  *    notice, this list of conditions and the following disclaimer. | 
 |  *  * Redistributions in binary form must reproduce the above copyright | 
 |  *    notice, this list of conditions and the following disclaimer in | 
 |  *    the documentation and/or other materials provided with the | 
 |  *    distribution. | 
 |  * | 
 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | 
 |  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | 
 |  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | 
 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | 
 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS | 
 |  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | 
 |  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | 
 |  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | 
 |  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
 |  * SUCH DAMAGE. | 
 |  */ | 
 |  | 
 | #include <errno.h> | 
 | #include <sys/param.h> | 
 | #include <uchar.h> | 
 | #include <wchar.h> | 
 |  | 
 | #include "private/bionic_mbstate.h" | 
 |  | 
 | size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps) { | 
 |   static mbstate_t __private_state; | 
 |   mbstate_t* state = (ps == nullptr) ? &__private_state : ps; | 
 |  | 
 |   // We should never get to a state which has all 4 bytes of the sequence set. | 
 |   // Full state verification is done when decoding the sequence (after we have | 
 |   // all the bytes). | 
 |   if (mbstate_get_byte(state, 3) != 0) { | 
 |     return mbstate_reset_and_return_illegal(EINVAL, state); | 
 |   } | 
 |  | 
 |   if (s == nullptr) { | 
 |     s = ""; | 
 |     n = 1; | 
 |     pc32 = nullptr; | 
 |   } | 
 |  | 
 |   if (n == 0) { | 
 |     // C23 7.30.1 (for each `mbrtoc*` function) says: | 
 |     // | 
 |     // Returns: | 
 |     // | 
 |     //     0 if the next n or fewer bytes complete the multibyte character that | 
 |     //     corresponds to the null wide character (which is the value stored). | 
 |     // | 
 |     //     (size_t)(-2) if the next n bytes contribute to an incomplete (but | 
 |     //     potentially valid) multibyte character, and all n bytes have been | 
 |     //     processed (no value is stored). | 
 |     // | 
 |     // Bionic historically interpreted the behavior when n is 0 to be the next 0 | 
 |     // bytes decoding to the null. That's a pretty bad interpretation, and both | 
 |     // glibc and musl return -2 for that case. | 
 |     return BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE; | 
 |   } | 
 |  | 
 |   uint8_t ch; | 
 |   if (mbstate_is_initial(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) { | 
 |     // Fast path for plain ASCII characters. | 
 |     if (pc32 != nullptr) { | 
 |       *pc32 = ch; | 
 |     } | 
 |     return (ch != '\0' ? 1 : 0); | 
 |   } | 
 |  | 
 |   // Determine the number of octets that make up this character | 
 |   // from the first octet, and a mask that extracts the | 
 |   // interesting bits of the first octet. We already know | 
 |   // the character is at least two bytes long. | 
 |   size_t length; | 
 |   int mask; | 
 |  | 
 |   // We also specify a lower bound for the character code to | 
 |   // detect redundant, non-"shortest form" encodings. For | 
 |   // example, the sequence C0 80 is _not_ a legal representation | 
 |   // of the null character. This enforces a 1-to-1 mapping | 
 |   // between character codes and their multibyte representations. | 
 |   char32_t lower_bound; | 
 |  | 
 |   // The first byte in the state (if any) tells the length. | 
 |   size_t bytes_so_far = mbstate_bytes_so_far(state); | 
 |   ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s); | 
 |   // We already handled the 1-byte case above, so we go straight to 2-bytes... | 
 |   if ((ch & 0xe0) == 0xc0) { | 
 |     mask = 0x1f; | 
 |     length = 2; | 
 |     lower_bound = 0x80; | 
 |   } else if ((ch & 0xf0) == 0xe0) { | 
 |     mask = 0x0f; | 
 |     length = 3; | 
 |     lower_bound = 0x800; | 
 |   } else if ((ch & 0xf8) == 0xf0) { | 
 |     mask = 0x07; | 
 |     length = 4; | 
 |     lower_bound = 0x10000; | 
 |   } else { | 
 |     // Malformed input; input is not UTF-8. See RFC 3629. | 
 |     return mbstate_reset_and_return_illegal(EILSEQ, state); | 
 |   } | 
 |  | 
 |   // Fill in the state. | 
 |   size_t bytes_wanted = length - bytes_so_far; | 
 |   size_t i; | 
 |   for (i = 0; i < MIN(bytes_wanted, n); i++) { | 
 |     if (!mbstate_is_initial(state) && ((*s & 0xc0) != 0x80)) { | 
 |       // Malformed input; bad characters in the middle of a character. | 
 |       return mbstate_reset_and_return_illegal(EILSEQ, state); | 
 |     } | 
 |     mbstate_set_byte(state, bytes_so_far + i, *s++); | 
 |   } | 
 |   if (i < bytes_wanted) { | 
 |     return BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE; | 
 |   } | 
 |  | 
 |   // Decode the octet sequence representing the character in chunks | 
 |   // of 6 bits, most significant first. | 
 |   char32_t c32 = mbstate_get_byte(state, 0) & mask; | 
 |   for (i = 1; i < length; i++) { | 
 |     c32 <<= 6; | 
 |     c32 |= mbstate_get_byte(state, i) & 0x3f; | 
 |   } | 
 |  | 
 |   if (c32 < lower_bound) { | 
 |     // Malformed input; redundant encoding. | 
 |     return mbstate_reset_and_return_illegal(EILSEQ, state); | 
 |   } | 
 |   if ((c32 >= 0xd800 && c32 <= 0xdfff) || (c32 > 0x10ffff)) { | 
 |     // Malformed input; invalid code points. | 
 |     return mbstate_reset_and_return_illegal(EILSEQ, state); | 
 |   } | 
 |   if (pc32 != nullptr) { | 
 |     *pc32 = c32; | 
 |   } | 
 |   return mbstate_reset_and_return(c32 == U'\0' ? 0 : bytes_wanted, state); | 
 | } |