| Dan Albert | 7a7f995 | 2014-06-02 11:33:04 -0700 | [diff] [blame] | 1 | /* | 
|  | 2 | * Copyright (C) 2014 The Android Open Source Project | 
|  | 3 | * All rights reserved. | 
|  | 4 | * | 
|  | 5 | * Redistribution and use in source and binary forms, with or without | 
|  | 6 | * modification, are permitted provided that the following conditions | 
|  | 7 | * are met: | 
|  | 8 | *  * Redistributions of source code must retain the above copyright | 
|  | 9 | *    notice, this list of conditions and the following disclaimer. | 
|  | 10 | *  * Redistributions in binary form must reproduce the above copyright | 
|  | 11 | *    notice, this list of conditions and the following disclaimer in | 
|  | 12 | *    the documentation and/or other materials provided with the | 
|  | 13 | *    distribution. | 
|  | 14 | * | 
|  | 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
|  | 16 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
|  | 17 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | 
|  | 18 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | 
|  | 19 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | 
|  | 20 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | 
|  | 21 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS | 
|  | 22 | * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | 
|  | 23 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | 
|  | 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | 
|  | 25 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
|  | 26 | * SUCH DAMAGE. | 
|  | 27 | */ | 
|  | 28 |  | 
|  | 29 | #include <errno.h> | 
|  | 30 | #include <sys/param.h> | 
|  | 31 | #include <uchar.h> | 
|  | 32 | #include <wchar.h> | 
|  | 33 |  | 
|  | 34 | #include "private/bionic_mbstate.h" | 
|  | 35 |  | 
|  | 36 | size_t mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps) { | 
|  | 37 | static mbstate_t __private_state; | 
|  | 38 | mbstate_t* state = (ps == NULL) ? &__private_state : ps; | 
|  | 39 |  | 
|  | 40 | // We should never get to a state which has all 4 bytes of the sequence set. | 
|  | 41 | // Full state verification is done when decoding the sequence (after we have | 
|  | 42 | // all the bytes). | 
|  | 43 | if (mbstate_get_byte(state, 3) != 0) { | 
|  | 44 | return reset_and_return_illegal(EINVAL, state); | 
|  | 45 | } | 
|  | 46 |  | 
|  | 47 | if (s == NULL) { | 
|  | 48 | s = ""; | 
|  | 49 | n = 1; | 
|  | 50 | pc32 = NULL; | 
|  | 51 | } | 
|  | 52 |  | 
|  | 53 | if (n == 0) { | 
|  | 54 | return 0; | 
|  | 55 | } | 
|  | 56 |  | 
|  | 57 | uint8_t ch; | 
|  | 58 | if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) { | 
|  | 59 | // Fast path for plain ASCII characters. | 
|  | 60 | if (pc32 != NULL) { | 
|  | 61 | *pc32 = ch; | 
|  | 62 | } | 
|  | 63 | return (ch != '\0' ? 1 : 0); | 
|  | 64 | } | 
|  | 65 |  | 
|  | 66 | // Determine the number of octets that make up this character | 
|  | 67 | // from the first octet, and a mask that extracts the | 
|  | 68 | // interesting bits of the first octet. We already know | 
|  | 69 | // the character is at least two bytes long. | 
|  | 70 | size_t length; | 
|  | 71 | int mask; | 
|  | 72 |  | 
|  | 73 | // We also specify a lower bound for the character code to | 
|  | 74 | // detect redundant, non-"shortest form" encodings. For | 
|  | 75 | // example, the sequence C0 80 is _not_ a legal representation | 
|  | 76 | // of the null character. This enforces a 1-to-1 mapping | 
|  | 77 | // between character codes and their multibyte representations. | 
|  | 78 | char32_t lower_bound; | 
|  | 79 |  | 
|  | 80 | // The first byte in the state (if any) tells the length. | 
|  | 81 | size_t bytes_so_far = mbstate_bytes_so_far(state); | 
|  | 82 | ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s); | 
|  | 83 | if ((ch & 0x80) == 0) { | 
|  | 84 | mask = 0x7f; | 
|  | 85 | length = 1; | 
|  | 86 | lower_bound = 0; | 
|  | 87 | } else if ((ch & 0xe0) == 0xc0) { | 
|  | 88 | mask = 0x1f; | 
|  | 89 | length = 2; | 
|  | 90 | lower_bound = 0x80; | 
|  | 91 | } else if ((ch & 0xf0) == 0xe0) { | 
|  | 92 | mask = 0x0f; | 
|  | 93 | length = 3; | 
|  | 94 | lower_bound = 0x800; | 
|  | 95 | } else if ((ch & 0xf8) == 0xf0) { | 
|  | 96 | mask = 0x07; | 
|  | 97 | length = 4; | 
|  | 98 | lower_bound = 0x10000; | 
|  | 99 | } else { | 
|  | 100 | // Malformed input; input is not UTF-8. See RFC 3629. | 
|  | 101 | return reset_and_return_illegal(EILSEQ, state); | 
|  | 102 | } | 
|  | 103 |  | 
|  | 104 | // Fill in the state. | 
|  | 105 | size_t bytes_wanted = length - bytes_so_far; | 
|  | 106 | size_t i; | 
|  | 107 | for (i = 0; i < MIN(bytes_wanted, n); i++) { | 
|  | 108 | if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) { | 
|  | 109 | // Malformed input; bad characters in the middle of a character. | 
|  | 110 | return reset_and_return_illegal(EILSEQ, state); | 
|  | 111 | } | 
|  | 112 | mbstate_set_byte(state, bytes_so_far + i, *s++); | 
|  | 113 | } | 
|  | 114 | if (i < bytes_wanted) { | 
|  | 115 | return __MB_ERR_INCOMPLETE_SEQUENCE; | 
|  | 116 | } | 
|  | 117 |  | 
|  | 118 | // Decode the octet sequence representing the character in chunks | 
|  | 119 | // of 6 bits, most significant first. | 
|  | 120 | char32_t c32 = mbstate_get_byte(state, 0) & mask; | 
|  | 121 | for (i = 1; i < length; i++) { | 
|  | 122 | c32 <<= 6; | 
|  | 123 | c32 |= mbstate_get_byte(state, i) & 0x3f; | 
|  | 124 | } | 
|  | 125 |  | 
|  | 126 | if (c32 < lower_bound) { | 
|  | 127 | // Malformed input; redundant encoding. | 
|  | 128 | return reset_and_return_illegal(EILSEQ, state); | 
|  | 129 | } | 
|  | 130 | if ((c32 >= 0xd800 && c32 <= 0xdfff) || c32 == 0xfffe || c32 == 0xffff) { | 
|  | 131 | // Malformed input; invalid code points. | 
|  | 132 | return reset_and_return_illegal(EILSEQ, state); | 
|  | 133 | } | 
|  | 134 | if (pc32 != NULL) { | 
|  | 135 | *pc32 = c32; | 
|  | 136 | } | 
|  | 137 | return reset_and_return(c32 == U'\0' ? 0 : bytes_wanted, state); | 
|  | 138 | } |