Reconcile with jb-mr1-release - do not merge

Change-Id: I759cbab32b17e45dd3d3e97ccf5e6039cfdff691
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index bc34e4e..2201711 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -126,7 +126,7 @@
 
         // codesSize == 0 means we are trying to find bigram predictions.
         if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) {
-            const int bigramFreqTemp = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+            const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
             // Due to space constraints, the frequency for bigrams is approximate - the lower the
             // unigram frequency, the worse the precision. The theoritical maximum error in
             // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
@@ -139,7 +139,7 @@
                 ++bigramCount;
             }
         }
-    } while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
+    } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
     return bigramCount;
 }
 
@@ -154,8 +154,8 @@
 
     if (NOT_VALID_WORD == pos) return 0;
     const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
-    if (0 == (flags & UnigramDictionary::FLAG_HAS_BIGRAMS)) return 0;
-    if (0 == (flags & UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS)) {
+    if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0;
+    if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
         BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
     } else {
         pos = BinaryFormat::skipOtherCharacters(root, pos);
@@ -182,12 +182,12 @@
     int bigramFlags;
     do {
         bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
-        const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+        const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
         const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
                 &pos);
         (*map)[bigramPos] = frequency;
         setInFilter(filter, bigramPos);
-    } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
+    } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
 }
 
 bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const {
@@ -223,7 +223,7 @@
         if (bigramPos == nextWordPos) {
             return true;
         }
-    } while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
+    } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
     return false;
 }
 
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 4155ef4..fdc13bf 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -18,13 +18,47 @@
 #define LATINIME_BINARY_FORMAT_H
 
 #include <limits>
+#include <map>
 #include "bloom_filter.h"
 #include "char_utils.h"
-#include "unigram_dictionary.h"
 
 namespace latinime {
 
 class BinaryFormat {
+ public:
+    // Mask and flags for children address type selection.
+    static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
+    static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
+    static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
+    static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
+    static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
+
+    // Flag for single/multiple char group
+    static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
+
+    // Flag for terminal groups
+    static const int FLAG_IS_TERMINAL = 0x10;
+
+    // Flag for shortcut targets presence
+    static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
+    // Flag for bigram presence
+    static const int FLAG_HAS_BIGRAMS = 0x04;
+
+    // Attribute (bigram/shortcut) related flags:
+    // Flag for presence of more attributes
+    static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
+    // Flag for sign of offset. If this flag is set, the offset value must be negated.
+    static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
+
+    // Mask for attribute frequency, stored on 4 bits inside the flags byte.
+    static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
+
+    // Mask and flags for attribute address type selection.
+    static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
+    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
+    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
+    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+
  private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
     const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
@@ -174,13 +208,13 @@
 
 static inline int attributeAddressSize(const uint8_t flags) {
     static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
-    return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
+    return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
     /* Note: this is a value-dependant optimization of what may probably be
        more readably written this way:
-       switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
-       case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
-       case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
-       case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
+       switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
+       case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
+       case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
+       case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
        default: return 0;
        }
     */
@@ -189,7 +223,7 @@
 static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
     int currentPos = pos;
     uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
-    while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
+    while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
         currentPos += attributeAddressSize(flags);
         flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
     }
@@ -199,7 +233,7 @@
 
 static inline int childrenAddressSize(const uint8_t flags) {
     static const int CHILDREN_ADDRESS_SHIFT = 6;
-    return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
+    return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
     /* See the note in attributeAddressSize. The same applies here */
 }
 
@@ -212,12 +246,12 @@
 }
 
 inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
-    return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
+    return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
 }
 
 inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
         const int pos) {
-    if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
+    if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
         return pos + shortcutByteSize(dict, pos);
     } else {
         return pos;
@@ -226,7 +260,7 @@
 
 inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
         const int pos) {
-    if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
+    if (FLAG_HAS_BIGRAMS & flags) {
         return skipExistingBigrams(dict, pos);
     } else {
         return pos;
@@ -253,15 +287,15 @@
 inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
         const int pos) {
     int offset = 0;
-    switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
-        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
+    switch (MASK_GROUP_ADDRESS_TYPE & flags) {
+        case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
             offset = dict[pos];
             break;
-        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
+        case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
             offset = dict[pos] << 8;
             offset += dict[pos + 1];
             break;
-        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
+        case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
             offset = dict[pos] << 16;
             offset += dict[pos + 1] << 8;
             offset += dict[pos + 2];
@@ -275,32 +309,31 @@
 }
 
 inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
-    return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
-            != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
+    return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
 }
 
 inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
         const uint8_t flags, int *pos) {
     int offset = 0;
     const int origin = *pos;
-    switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
-        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
+    switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
+        case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
             offset = dict[origin];
             *pos = origin + 1;
             break;
-        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
+        case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
             offset = dict[origin] << 8;
             offset += dict[origin + 1];
             *pos = origin + 2;
             break;
-        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
+        case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
             offset = dict[origin] << 16;
             offset += dict[origin + 1] << 8;
             offset += dict[origin + 2];
             *pos = origin + 3;
             break;
     }
-    if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
+    if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
         return origin - offset;
     } else {
         return origin + offset;
@@ -332,7 +365,7 @@
                 // char within a node, so either we found our match in this node, or there is
                 // no match and we can return NOT_VALID_WORD. So we will check all the characters
                 // in this character group indeed does match.
-                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                if (FLAG_HAS_MULTIPLE_CHARS & flags) {
                     character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
                     while (NOT_A_CHARACTER != character) {
                         ++wordPos;
@@ -350,14 +383,13 @@
                 // If we don't match the length AND don't have children, then a word in the
                 // dictionary fully matches a prefix of the searched word but not the full word.
                 ++wordPos;
-                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
+                if (FLAG_IS_TERMINAL & flags) {
                     if (wordPos == length) {
                         return charGroupPos;
                     }
-                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
+                    pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
                 }
-                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
-                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
+                if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
                     return NOT_VALID_WORD;
                 }
                 // We have children and we are still shorter than the word we are searching for, so
@@ -367,7 +399,7 @@
                 break;
             } else {
                 // This chargroup does not match, so skip the remaining part and go to the next.
-                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                if (FLAG_HAS_MULTIPLE_CHARS & flags) {
                     pos = BinaryFormat::skipOtherCharacters(root, pos);
                 }
                 pos = BinaryFormat::skipFrequency(flags, pos);
@@ -420,7 +452,7 @@
                 // We found the address. Copy the rest of the word in the buffer and return
                 // the length.
                 outWord[wordPos] = character;
-                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                if (FLAG_HAS_MULTIPLE_CHARS & flags) {
                     int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
                     // We count chars in order to avoid infinite loops if the file is broken or
                     // if there is some other bug
@@ -435,7 +467,7 @@
             }
             // We need to skip past this char group, so skip any remaining chars after the
             // first and possibly the frequency.
-            if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+            if (FLAG_HAS_MULTIPLE_CHARS & flags) {
                 pos = skipOtherCharacters(root, pos);
             }
             pos = skipFrequency(flags, pos);
@@ -443,8 +475,8 @@
             // The fact that this group has children is very important. Since we already know
             // that this group does not match, if it has no children we know it is irrelevant
             // to what we are searching for.
-            const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
-                    (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
+            const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
+                    (MASK_GROUP_ADDRESS_TYPE & flags));
             // We will write in `found' whether we have passed the children address we are
             // searching for. For example if we search for "beer", the children of b are less
             // than the address we are searching for and the children of c are greater. When we
@@ -484,7 +516,7 @@
                             getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
                     // We copy all the characters in this group to the buffer
                     outWord[wordPos] = lastChar;
-                    if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
+                    if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
                         int32_t nextChar =
                                 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
                         int charCount = maxDepth;
diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h
index 755635f..d633645 100644
--- a/native/jni/src/terminal_attributes.h
+++ b/native/jni/src/terminal_attributes.h
@@ -17,7 +17,7 @@
 #ifndef LATINIME_TERMINAL_ATTRIBUTES_H
 #define LATINIME_TERMINAL_ATTRIBUTES_H
 
-#include "unigram_dictionary.h"
+#include "binary_format.h"
 
 namespace latinime {
 
@@ -36,7 +36,7 @@
      public:
         ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) : mDict(dict),
                 mPos(pos) {
-            mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS));
+            mHasNextShortcutTarget = (0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS));
         }
 
         inline bool hasNextShortcutTarget() const {
@@ -49,7 +49,7 @@
         inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord) {
             const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
             mHasNextShortcutTarget =
-                    0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
+                    0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT);
             unsigned int i;
             for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
                 const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index b2fc870..efcf512 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -707,7 +707,7 @@
         const uint8_t *const root, const int startPos,
         const uint16_t *const inWord, const int startInputIndex,
         int32_t *outNewWord, int *outInputIndex, int *outPos) {
-    const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
+    const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
     int pos = startPos;
     int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
     int32_t baseChar = toBaseLowerCase(character);
@@ -780,7 +780,7 @@
             // into inputIndex if there is a match.
             const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
                     inputIndex, newWord, &inputIndex, &pos);
-            if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
+            if (isAlike && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
                 const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
                 onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
             }
@@ -823,7 +823,7 @@
         return NOT_A_PROBABILITY;
     }
     const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
-    const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
+    const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
     if (hasMultipleChars) {
         pos = BinaryFormat::skipOtherCharacters(root, pos);
     } else {
@@ -871,8 +871,8 @@
     // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
     // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
     const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
-    const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
-    const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags));
+    const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
+    const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
 
     bool needsToInvokeOnTerminal = false;
 
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index ac17f50..6083f01 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -32,39 +32,6 @@
     typedef struct { int first; int second; int replacement; } digraph_t;
 
  public:
-    // Mask and flags for children address type selection.
-    static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
-    static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
-    static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
-    static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
-    static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
-
-    // Flag for single/multiple char group
-    static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
-
-    // Flag for terminal groups
-    static const int FLAG_IS_TERMINAL = 0x10;
-
-    // Flag for shortcut targets presence
-    static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
-    // Flag for bigram presence
-    static const int FLAG_HAS_BIGRAMS = 0x04;
-
-    // Attribute (bigram/shortcut) related flags:
-    // Flag for presence of more attributes
-    static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
-    // Flag for sign of offset. If this flag is set, the offset value must be negated.
-    static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
-
-    // Mask for attribute frequency, stored on 4 bits inside the flags byte.
-    static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
-
-    // Mask and flags for attribute address type selection.
-    static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
-    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
-    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
-    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
-
     // Error tolerances
     static const int DEFAULT_MAX_ERRORS = 2;
     static const int MAX_ERRORS_FOR_TWO_WORDS = 1;