Reorganize char_utils.h and basechars.h * make BASE_CHARS[] const * add several inline menthods for ASCII character handling Change-Id: I49664f219af88faf0aef43ac350cfc216570b185

commit: 6e3cb27cffa525d555b289111678f6fa0495447e [log] [tgz]
author: Tadashi G. Takaoka <takaoka@google.com> Fri Nov 11 14:26:13 2011 +0900
committer: Tadashi G. Takaoka <takaoka@google.com> Fri Nov 11 19:44:08 2011 +0900
tree: 37d9df7774aa03c4897860c8fda6941549398c3f
parent: 85170a9c17e7fbefb3d9a6dba0e211b72899aeae [diff]
diff --git a/native/src/basechars.h b/native/src/basechars.cpp
similarity index 98%
rename from native/src/basechars.h
rename to native/src/basechars.cpp
index 3843e11..31f1e18 100644
--- a/native/src/basechars.h
+++ b/native/src/basechars.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2009 The Android Open Source Project
+ * Copyright (C) 2011 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,8 +14,9 @@
  * limitations under the License.
  */
 
-#ifndef LATINIME_BASECHARS_H
-#define LATINIME_BASECHARS_H
+#include "char_utils.h"
+
+namespace latinime {
 
 /**
  * Table mapping most combined Latin, Greek, and Cyrillic characters
@@ -23,7 +24,7 @@
  * if c is not a combined character, or the base character if it
  * is combined.
  */
-static unsigned short BASE_CHARS[] = {
+const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = {
     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
     0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
@@ -189,4 +190,5 @@
 
 // generated with:
 // cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }'
-#endif // LATINIME_BASECHARS_H
+
+} // namespace latinime

diff --git a/native/src/char_utils.h b/native/src/char_utils.h
index a69a35e..607dc51 100644
--- a/native/src/char_utils.h
+++ b/native/src/char_utils.h

@@ -19,8 +19,47 @@
 
 namespace latinime {
 
+inline static int isAsciiUpper(unsigned short c) {
+    return c >= 'A' && c <= 'Z';
+}
+
+inline static unsigned short toAsciiLower(unsigned short c) {
+    return c - 'A' + 'a';
+}
+
+inline static int isAscii(unsigned short c) {
+    return c <= 127;
+}
+
 unsigned short latin_tolower(unsigned short c);
 
+/**
+ * Table mapping most combined Latin, Greek, and Cyrillic characters
+ * to their base characters.  If c is in range, BASE_CHARS[c] == c
+ * if c is not a combined character, or the base character if it
+ * is combined.
+ */
+
+static const int BASE_CHARS_SIZE = 0x0500;
+extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
+
+inline static unsigned short toBaseChar(unsigned short c) {
+    if (c < BASE_CHARS_SIZE) {
+        return BASE_CHARS[c];
+    }
+    return c;
+}
+
+inline static unsigned short toBaseLowerCase(unsigned short c) {
+    c = toBaseChar(c);
+    if (isAsciiUpper(c)) {
+        return toAsciiLower(c);
+    } else if (isAscii(c)) {
+        return c;
+    }
+    return latin_tolower(c);
+}
+
 } // namespace latinime
 
 #endif // LATINIME_CHAR_UTILS_H

diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index f6b7eb6..8b6d3b2 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp

@@ -21,6 +21,7 @@
 
 #define LOG_TAG "LatinIME: correction.cpp"
 
+#include "char_utils.h"
 #include "correction.h"
 #include "dictionary.h"
 #include "proximity_info.h"
@@ -48,13 +49,13 @@
 
     for (int i = 0; i < li - 1; ++i) {
         for (int j = 0; j < lo - 1; ++j) {
-            const uint32_t ci = Dictionary::toBaseLowerCase(input[i]);
-            const uint32_t co = Dictionary::toBaseLowerCase(output[j]);
+            const uint32_t ci = toBaseLowerCase(input[i]);
+            const uint32_t co = toBaseLowerCase(output[j]);
             const uint16_t cost = (ci == co) ? 0 : 1;
             dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
                     min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
-            if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1])
-                    && co == Dictionary::toBaseLowerCase(input[i - 1])) {
+            if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1])
+                    && co == toBaseLowerCase(input[i - 1])) {
                 dp[(i + 1) * lo + (j + 1)] = min(
                         dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
             }
@@ -89,15 +90,13 @@
     const int *const prevprev =
             outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0;
     current[0] = outputLength;
-    const uint32_t co = Dictionary::toBaseLowerCase(output[outputLength - 1]);
-    const uint32_t prevCO =
-            outputLength >= 2 ? Dictionary::toBaseLowerCase(output[outputLength - 2]) : 0;
+    const uint32_t co = toBaseLowerCase(output[outputLength - 1]);
+    const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0;
     for (int i = 1; i <= inputLength; ++i) {
-        const uint32_t ci = Dictionary::toBaseLowerCase(input[i - 1]);
+        const uint32_t ci = toBaseLowerCase(input[i - 1]);
         const uint16_t cost = (ci == co) ? 0 : 1;
         current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost));
-        if (i >= 2 && prevprev && ci == prevCO
-                && co == Dictionary::toBaseLowerCase(input[i - 2])) {
+        if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) {
             current[i] = min(current[i], prevprev[i - 2] + 1);
         }
     }
@@ -607,13 +606,7 @@
 }
 
 inline static bool isUpperCase(unsigned short c) {
-     if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
-         c = BASE_CHARS[c];
-     }
-     if (isupper(c)) {
-         return true;
-     }
-     return false;
+     return isAsciiUpper(toBaseChar(c));
 }
 
 //////////////////////

diff --git a/native/src/dictionary.h b/native/src/dictionary.h
index d5de008..f891e74 100644
--- a/native/src/dictionary.h
+++ b/native/src/dictionary.h

@@ -17,7 +17,6 @@
 #ifndef LATINIME_DICTIONARY_H
 #define LATINIME_DICTIONARY_H
 
-#include "basechars.h"
 #include "bigram_dictionary.h"
 #include "char_utils.h"
 #include "defines.h"
@@ -63,7 +62,6 @@
     static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion,
             const int pos, unsigned short *c, int *childrenPosition,
             bool *terminal, int *freq);
-    static inline unsigned short toBaseLowerCase(unsigned short c);
 
 private:
     bool hasBigram();
@@ -156,19 +154,6 @@
     return position;
 }
 
-
-inline unsigned short Dictionary::toBaseLowerCase(unsigned short c) {
-    if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
-        c = BASE_CHARS[c];
-    }
-    if (c >='A' && c <= 'Z') {
-        c |= 32;
-    } else if (c > 127) {
-        c = latin_tolower(c);
-    }
-    return c;
-}
-
 } // namespace latinime
 
 #endif // LATINIME_DICTIONARY_H

diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp
index d635588..6857caf 100644
--- a/native/src/proximity_info.cpp
+++ b/native/src/proximity_info.cpp

@@ -167,7 +167,7 @@
         // We do not have the coordinate data
         return NOT_A_INDEX;
     }
-    const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
+    const unsigned short baseLowerC = toBaseLowerCase(c);
     if (baseLowerC > MAX_CHAR_CODE) {
         return NOT_A_INDEX;
     }
@@ -232,7 +232,7 @@
         const unsigned short c, const bool checkProximityChars, int *proximityIndex) const {
     const int *currentChars = getProximityCharsAt(index);
     const int firstChar = currentChars[0];
-    const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
+    const unsigned short baseLowerC = toBaseLowerCase(c);
 
     // The first char in the array is what user typed. If it matches right away,
     // that means the user typed that same char for this pos.
@@ -245,7 +245,7 @@
     // If the non-accented, lowercased version of that first character matches c,
     // then we have a non-accented version of the accented character the user
     // typed. Treat it as a close char.
-    if (Dictionary::toBaseLowerCase(firstChar) == baseLowerC)
+    if (toBaseLowerCase(firstChar) == baseLowerC)
         return NEAR_PROXIMITY_CHAR;
 
     // Not an exact nor an accent-alike match: search the list of close keys

diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 7ff2f4a..647bfde 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp

@@ -464,8 +464,8 @@
     const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
     int pos = startPos;
     int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
-    int32_t baseChar = Dictionary::toBaseLowerCase(character);
-    const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]);
+    int32_t baseChar = toBaseLowerCase(character);
+    const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);
 
     if (baseChar != wChar) {
         *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
@@ -477,8 +477,8 @@
     if (hasMultipleChars) {
         character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
         while (NOT_A_CHARACTER != character) {
-            baseChar = Dictionary::toBaseLowerCase(character);
-            if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
+            baseChar = toBaseLowerCase(character);
+            if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
                 *outPos = BinaryFormat::skipOtherCharacters(root, pos);
                 *outInputIndex = startInputIndex;
                 return false;
commit	6e3cb27cffa525d555b289111678f6fa0495447e	[log] [tgz]
author	Tadashi G. Takaoka <takaoka@google.com>	Fri Nov 11 14:26:13 2011 +0900
committer	Tadashi G. Takaoka <takaoka@google.com>	Fri Nov 11 19:44:08 2011 +0900
tree	37d9df7774aa03c4897860c8fda6941549398c3f
parent	85170a9c17e7fbefb3d9a6dba0e211b72899aeae [diff]