/*
 * Copyright (C) 2014 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
 | 28 |  | 
 | 29 | #include <errno.h> | 
 | 30 | #include <sys/param.h> | 
 | 31 | #include <uchar.h> | 
 | 32 | #include <wchar.h> | 
 | 33 |  | 
 | 34 | #include "private/bionic_mbstate.h" | 
 | 35 |  | 
 | 36 | size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps) { | 
 | 37 |   static mbstate_t __private_state; | 
| Yi Kong | 32bc0fc | 2018-08-02 17:31:13 -0700 | [diff] [blame] | 38 |   mbstate_t* state = (ps == nullptr) ? &__private_state : ps; | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 39 |  | 
 | 40 |   // We should never get to a state which has all 4 bytes of the sequence set. | 
 | 41 |   // Full state verification is done when decoding the sequence (after we have | 
 | 42 |   // all the bytes). | 
 | 43 |   if (mbstate_get_byte(state, 3) != 0) { | 
| Elliott Hughes | 697f42a | 2017-07-14 17:00:05 -0700 | [diff] [blame] | 44 |     return mbstate_reset_and_return_illegal(EINVAL, state); | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 45 |   } | 
 | 46 |  | 
| Yi Kong | 32bc0fc | 2018-08-02 17:31:13 -0700 | [diff] [blame] | 47 |   if (s == nullptr) { | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 48 |     s = ""; | 
 | 49 |     n = 1; | 
| Yi Kong | 32bc0fc | 2018-08-02 17:31:13 -0700 | [diff] [blame] | 50 |     pc32 = nullptr; | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 51 |   } | 
 | 52 |  | 
 | 53 |   if (n == 0) { | 
 | 54 |     return 0; | 
 | 55 |   } | 
 | 56 |  | 
 | 57 |   uint8_t ch; | 
 | 58 |   if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) { | 
 | 59 |     // Fast path for plain ASCII characters. | 
| Yi Kong | 32bc0fc | 2018-08-02 17:31:13 -0700 | [diff] [blame] | 60 |     if (pc32 != nullptr) { | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 61 |       *pc32 = ch; | 
 | 62 |     } | 
 | 63 |     return (ch != '\0' ? 1 : 0); | 
 | 64 |   } | 
 | 65 |  | 
 | 66 |   // Determine the number of octets that make up this character | 
 | 67 |   // from the first octet, and a mask that extracts the | 
 | 68 |   // interesting bits of the first octet. We already know | 
 | 69 |   // the character is at least two bytes long. | 
 | 70 |   size_t length; | 
 | 71 |   int mask; | 
 | 72 |  | 
 | 73 |   // We also specify a lower bound for the character code to | 
 | 74 |   // detect redundant, non-"shortest form" encodings. For | 
 | 75 |   // example, the sequence C0 80 is _not_ a legal representation | 
 | 76 |   // of the null character. This enforces a 1-to-1 mapping | 
 | 77 |   // between character codes and their multibyte representations. | 
 | 78 |   char32_t lower_bound; | 
 | 79 |  | 
 | 80 |   // The first byte in the state (if any) tells the length. | 
 | 81 |   size_t bytes_so_far = mbstate_bytes_so_far(state); | 
 | 82 |   ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s); | 
 | 83 |   if ((ch & 0x80) == 0) { | 
 | 84 |     mask = 0x7f; | 
 | 85 |     length = 1; | 
 | 86 |     lower_bound = 0; | 
 | 87 |   } else if ((ch & 0xe0) == 0xc0) { | 
 | 88 |     mask = 0x1f; | 
 | 89 |     length = 2; | 
 | 90 |     lower_bound = 0x80; | 
 | 91 |   } else if ((ch & 0xf0) == 0xe0) { | 
 | 92 |     mask = 0x0f; | 
 | 93 |     length = 3; | 
 | 94 |     lower_bound = 0x800; | 
 | 95 |   } else if ((ch & 0xf8) == 0xf0) { | 
 | 96 |     mask = 0x07; | 
 | 97 |     length = 4; | 
 | 98 |     lower_bound = 0x10000; | 
 | 99 |   } else { | 
 | 100 |     // Malformed input; input is not UTF-8. See RFC 3629. | 
| Elliott Hughes | 697f42a | 2017-07-14 17:00:05 -0700 | [diff] [blame] | 101 |     return mbstate_reset_and_return_illegal(EILSEQ, state); | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 102 |   } | 
 | 103 |  | 
 | 104 |   // Fill in the state. | 
 | 105 |   size_t bytes_wanted = length - bytes_so_far; | 
 | 106 |   size_t i; | 
 | 107 |   for (i = 0; i < MIN(bytes_wanted, n); i++) { | 
 | 108 |     if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) { | 
 | 109 |       // Malformed input; bad characters in the middle of a character. | 
| Elliott Hughes | 697f42a | 2017-07-14 17:00:05 -0700 | [diff] [blame] | 110 |       return mbstate_reset_and_return_illegal(EILSEQ, state); | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 111 |     } | 
 | 112 |     mbstate_set_byte(state, bytes_so_far + i, *s++); | 
 | 113 |   } | 
 | 114 |   if (i < bytes_wanted) { | 
 | 115 |     return __MB_ERR_INCOMPLETE_SEQUENCE; | 
 | 116 |   } | 
 | 117 |  | 
 | 118 |   // Decode the octet sequence representing the character in chunks | 
 | 119 |   // of 6 bits, most significant first. | 
 | 120 |   char32_t c32 = mbstate_get_byte(state, 0) & mask; | 
 | 121 |   for (i = 1; i < length; i++) { | 
 | 122 |     c32 <<= 6; | 
 | 123 |     c32 |= mbstate_get_byte(state, i) & 0x3f; | 
 | 124 |   } | 
 | 125 |  | 
 | 126 |   if (c32 < lower_bound) { | 
 | 127 |     // Malformed input; redundant encoding. | 
| Elliott Hughes | 697f42a | 2017-07-14 17:00:05 -0700 | [diff] [blame] | 128 |     return mbstate_reset_and_return_illegal(EILSEQ, state); | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 129 |   } | 
| Elliott Hughes | 402c762 | 2018-07-06 17:18:05 -0700 | [diff] [blame] | 130 |   if ((c32 >= 0xd800 && c32 <= 0xdfff) || (c32 > 0x10ffff)) { | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 131 |     // Malformed input; invalid code points. | 
| Elliott Hughes | 697f42a | 2017-07-14 17:00:05 -0700 | [diff] [blame] | 132 |     return mbstate_reset_and_return_illegal(EILSEQ, state); | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 133 |   } | 
| Yi Kong | 32bc0fc | 2018-08-02 17:31:13 -0700 | [diff] [blame] | 134 |   if (pc32 != nullptr) { | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 135 |     *pc32 = c32; | 
 | 136 |   } | 
| Elliott Hughes | 697f42a | 2017-07-14 17:00:05 -0700 | [diff] [blame] | 137 |   return mbstate_reset_and_return(c32 == U'\0' ? 0 : bytes_wanted, state); | 
| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 138 | } |