Merge "add reconstructBigramFrequency" into jb-mr1-dev

commit: 72c0f4de1dfaaa1e404f46da48d7c91f28b76f74 [log] [tgz]
author: Ken Wakasa <kwakasa@google.com> Fri Aug 17 03:19:12 2012 -0700
committer: Android (Google) Code Review <android-gerrit@google.com> Fri Aug 17 03:19:12 2012 -0700
tree: acf13449b24eb5eaf4885586befbfb54f0bd8281
parent: 1edd557cf2b7d480275b1747d1915d919c788990 [diff]
parent: c0a75c8ecbd373c4eaee4f866e4080c0b800470b [diff]
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
index 273ee32..7f04233 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java

@@ -783,10 +783,10 @@
         // their lower bound and exclude their higher bound so we need to have the first step
         // start at exactly 1 unit higher than floor(unigramFreq + half a step).
         // Note : to reconstruct the score, the dictionary reader will need to divide
-        // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise, and add
-        // (discretizedFrequency + 0.5) times this value to get the median value of the step,
-        // which is the best approximation. This is how we get the most precise result with
-        // only four bits.
+        // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step,
+        // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best
+        // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the
+        // step pointed to by the discretized frequency.)
         final float stepSize =
                 (MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5f + MAX_BIGRAM_FREQUENCY);
         final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f);

diff --git a/native/jni/src/bigram_dictionary.h b/native/jni/src/bigram_dictionary.h
index d676cca..5f11ae8 100644
--- a/native/jni/src/bigram_dictionary.h
+++ b/native/jni/src/bigram_dictionary.h

@@ -29,8 +29,6 @@
     BigramDictionary(const unsigned char *dict, int maxWordLength, int maxPredictions);
     int getBigrams(const int32_t *word, int length, int *inputCodes, int codesSize,
             unsigned short *outWords, int *frequencies, int *outputTypes) const;
-    int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength,
-            const bool forceLowerCaseSearch) const;
     void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
             std::map<int, int> *map, uint8_t *filter) const;
     bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const;
@@ -45,6 +43,8 @@
     bool getFirstBitOfByte(int *pos) { return (DICT[*pos] & 0x80) > 0; }
     bool getSecondBitOfByte(int *pos) { return (DICT[*pos] & 0x40) > 0; }
     bool checkFirstCharacter(unsigned short *word, int *inputCodes) const;
+    int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength,
+            const bool forceLowerCaseSearch) const;
 
     const unsigned char *DICT;
     const int MAX_WORD_LENGTH;

diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 4cabc84..d8f3e83 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h

@@ -61,13 +61,6 @@
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
 
- private:
-    DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
-    const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
-    const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
-    const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
-
- public:
     const static int UNKNOWN_FORMAT = -1;
     // Originally, format version 1 had a 16-bit magic number, then the version number `01'
     // then options that must be 0. Hence the first 32-bits of the format are always as follow
@@ -94,7 +87,6 @@
     static int skipFrequency(const uint8_t flags, const int pos);
     static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
     static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
-    static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos);
     static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
             const int pos);
     static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos);
@@ -118,6 +110,13 @@
         REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
     };
     const static unsigned int NO_FLAGS = 0;
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
+    const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
+    const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
+    const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
+    static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos);
 };
 
 inline int BinaryFormat::detectFormat(const uint8_t *const dict) {

diff --git a/native/jni/src/char_utils.cpp b/native/jni/src/char_utils.cpp
index 223291f..9d886da31 100644
--- a/native/jni/src/char_utils.cpp
+++ b/native/jni/src/char_utils.cpp

@@ -889,7 +889,7 @@
             - static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital);
 }
 
-unsigned short latin_tolower(unsigned short c) {
+unsigned short latin_tolower(const unsigned short c) {
     struct LatinCapitalSmallPair *p =
             static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP,
                     sizeof(SORTED_CHAR_MAP) / sizeof(SORTED_CHAR_MAP[0]),

diff --git a/native/jni/src/char_utils.h b/native/jni/src/char_utils.h
index edd96bb..b30677f 100644
--- a/native/jni/src/char_utils.h
+++ b/native/jni/src/char_utils.h

@@ -17,21 +17,23 @@
 #ifndef LATINIME_CHAR_UTILS_H
 #define LATINIME_CHAR_UTILS_H
 
+#include <cctype>
+
 namespace latinime {
 
-inline static int isAsciiUpper(unsigned short c) {
-    return c >= 'A' && c <= 'Z';
+inline static bool isAsciiUpper(unsigned short c) {
+    return isupper(static_cast<int>(c)) != 0;
 }
 
 inline static unsigned short toAsciiLower(unsigned short c) {
     return c - 'A' + 'a';
 }
 
-inline static int isAscii(unsigned short c) {
-    return c <= 127;
+inline static bool isAscii(unsigned short c) {
+    return isascii(static_cast<int>(c)) != 0;
 }
 
-unsigned short latin_tolower(unsigned short c);
+unsigned short latin_tolower(const unsigned short c);
 
 /**
  * Table mapping most combined Latin, Greek, and Cyrillic characters

diff --git a/tools/maketext/Android.mk b/tools/maketext/Android.mk
index 98731b7..77914ca 100644
--- a/tools/maketext/Android.mk
+++ b/tools/maketext/Android.mk

@@ -19,7 +19,6 @@
 LOCAL_SRC_FILES += $(call all-java-files-under,src)
 LOCAL_JAR_MANIFEST := etc/manifest.txt
 LOCAL_JAVA_RESOURCE_DIRS := res
-LOCAL_MODULE_TAGS := eng
 LOCAL_MODULE := maketext
 
 include $(BUILD_HOST_JAVA_LIBRARY)

diff --git a/tools/maketext/etc/Android.mk b/tools/maketext/etc/Android.mk
index 4fa194b..475676b 100644
--- a/tools/maketext/etc/Android.mk
+++ b/tools/maketext/etc/Android.mk

@@ -15,7 +15,6 @@
 LOCAL_PATH := $(call my-dir)
 include $(CLEAR_VARS)
 
-LOCAL_MODULE_TAGS := eng
-
 LOCAL_PREBUILT_EXECUTABLES := maketext
+
 include $(BUILD_HOST_PREBUILT)
commit	72c0f4de1dfaaa1e404f46da48d7c91f28b76f74	[log] [tgz]
author	Ken Wakasa <kwakasa@google.com>	Fri Aug 17 03:19:12 2012 -0700
committer	Android (Google) Code Review <android-gerrit@google.com>	Fri Aug 17 03:19:12 2012 -0700
tree	acf13449b24eb5eaf4885586befbfb54f0bd8281
parent	1edd557cf2b7d480275b1747d1915d919c788990 [diff]
parent	c0a75c8ecbd373c4eaee4f866e4080c0b800470b [diff]