Ken Wakasa | 07cab72 | 2010-04-20 01:24:57 +0900 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2010 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #ifndef LATINIME_CHAR_UTILS_H |
| 18 | #define LATINIME_CHAR_UTILS_H |
| 19 | |
Ken Wakasa | de8a9a8 | 2012-08-17 13:06:28 +0900 | [diff] [blame] | 20 | #include <cctype> |
Ken Wakasa | 1e61493 | 2012-10-29 18:06:22 +0900 | [diff] [blame^] | 21 | |
| 22 | #include "defines.h" |
Ken Wakasa | de8a9a8 | 2012-08-17 13:06:28 +0900 | [diff] [blame] | 23 | |
Ken Wakasa | 07cab72 | 2010-04-20 01:24:57 +0900 | [diff] [blame] | 24 | namespace latinime { |
| 25 | |
/**
 * Returns true only when c is one of the ASCII capital letters 'A'..'Z'.
 *
 * The range is tested by hand instead of calling isupper(...): isupper(...) reports
 * false positives for some Cyrillic characters, which would then be lower-cased with
 * toAsciiLower(...) rather than latin_tolower(...).
 */
inline static bool isAsciiUpper(int c) {
    const bool outsideCapitals = (c < 'A' || c > 'Z');
    return !outsideCapitals;
}
| 31 | |
/**
 * Converts an ASCII capital letter ('A'..'Z') to the matching lower-case letter.
 *
 * Any other value is returned unchanged. Previously the 'a' - 'A' offset was applied
 * unconditionally, so calling this on something that was not an ASCII capital
 * (for example toAsciiLower('a')) silently produced an unrelated code point.
 *
 * @param c the code point to convert
 * @return the lower-case ASCII letter for 'A'..'Z', otherwise c itself
 */
inline static int toAsciiLower(int c) {
    if (c < 'A' || c > 'Z') {
        // Not an ASCII capital: there is nothing to convert.
        return c;
    }
    return c - 'A' + 'a';
}
| 35 | |
/**
 * Returns true when c lies in the 7-bit ASCII range [0, 0x7F].
 *
 * The range is checked directly rather than through isascii(): isascii() is a
 * non-standard extension that <cctype> is not required to declare, so relying on it
 * makes this header non-portable. The result is the same: negative values and values
 * above 0x7F are not ASCII.
 *
 * @param c the code point to test
 * @return true if 0 <= c <= 0x7F, false otherwise
 */
inline static bool isAscii(int c) {
    return c >= 0 && c <= 0x7F;
}
| 39 | |
/**
 * Lower-cases a 16-bit character value. toLowerCase() in this header calls it for
 * every code point that is not ASCII.
 *
 * The definition is not part of this header.
 * NOTE(review): the exact set of scripts handled and the result for characters that
 * have no lower-case form are not visible here -- confirm against the definition.
 *
 * The parameter is declared without a top-level const: on a by-value parameter in a
 * declaration it has no effect on the function's type, so the definition may keep or
 * drop it freely.
 */
unsigned short latin_tolower(unsigned short c);
Ken Wakasa | 07cab72 | 2010-04-20 01:24:57 +0900 | [diff] [blame] | 41 | |
/**
 * Table mapping most combined Latin, Greek, and Cyrillic characters
 * to their base characters. For a code point c in [0, BASE_CHARS_SIZE),
 * BASE_CHARS[c] == c if c is not a combined character, or the base
 * character if it is combined.
 *
 * Only the declaration lives in this header; the table contents are defined elsewhere.
 */
static const int BASE_CHARS_SIZE = 0x0500;
extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE];

/**
 * Maps a code point to its base character using BASE_CHARS.
 *
 * @param c the code point to look up
 * @return BASE_CHARS[c] when c is inside the table, otherwise c unchanged
 */
inline static int toBaseCodePoint(int c) {
    // The lower bound matters as much as the upper one: c is a signed int, and a
    // negative value would otherwise be used as an index and read memory in front
    // of the table.
    if (c >= 0 && c < BASE_CHARS_SIZE) {
        return static_cast<int>(BASE_CHARS[c]);
    }
    return c;
}
| 57 | |
Ken Wakasa | 1e61493 | 2012-10-29 18:06:22 +0900 | [diff] [blame^] | 58 | inline static int toLowerCase(const int c) { |
Tadashi G. Takaoka | 6e3cb27 | 2011-11-11 14:26:13 +0900 | [diff] [blame] | 59 | if (isAsciiUpper(c)) { |
| 60 | return toAsciiLower(c); |
| 61 | } else if (isAscii(c)) { |
| 62 | return c; |
| 63 | } |
Ken Wakasa | 1e61493 | 2012-10-29 18:06:22 +0900 | [diff] [blame^] | 64 | return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); |
Tadashi G. Takaoka | 6e3cb27 | 2011-11-11 14:26:13 +0900 | [diff] [blame] | 65 | } |
| 66 | |
Ken Wakasa | 1e61493 | 2012-10-29 18:06:22 +0900 | [diff] [blame^] | 67 | inline static int toBaseLowerCase(const int c) { |
| 68 | return toLowerCase(toBaseCodePoint(c)); |
Jean Chalard | e9a86e2 | 2012-06-28 21:01:29 +0900 | [diff] [blame] | 69 | } |
Ken Wakasa | 5150e15 | 2012-09-27 19:21:25 +0900 | [diff] [blame] | 70 | |
Ken Wakasa | 1e61493 | 2012-10-29 18:06:22 +0900 | [diff] [blame^] | 71 | inline static bool isSkippableCodePoint(const int codePoint) { |
Ken Wakasa | 5150e15 | 2012-09-27 19:21:25 +0900 | [diff] [blame] | 72 | // TODO: Do not hardcode here |
Ken Wakasa | 1e61493 | 2012-10-29 18:06:22 +0900 | [diff] [blame^] | 73 | return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; |
Ken Wakasa | 5150e15 | 2012-09-27 19:21:25 +0900 | [diff] [blame] | 74 | } |
| 75 | |
Ken Wakasa | ce9e52a | 2011-06-18 13:09:55 +0900 | [diff] [blame] | 76 | } // namespace latinime |
Ken Wakasa | 07cab72 | 2010-04-20 01:24:57 +0900 | [diff] [blame] | 77 | #endif // LATINIME_CHAR_UTILS_H |