Small native refactoring. Move a purely dictionary-format-related function that is needed both by unigrams and bigrams to the binary format handling file. Also remove the empty UnigramDictionary::getBigrams placeholder function, on grounds that it should be in the BigramDictionary class. Bug: 5046459 Change-Id: I8a67a25f72122e2fa0b19ae1d936db25eb0b20ba

commit: 6a0e9642a8d1046e3b730c6dd1a633a4ec0f656f [log] [tgz]
author: Jean Chalard <jchalard@google.com> Mon Jul 25 18:17:11 2011 +0900
committer: Jean Chalard <jchalard@google.com> Tue Jul 26 16:13:53 2011 +0900
tree: 1c9a255fe94a89ccea85c55c8f226c808ab69574
parent: 537ad5a56f791e74d485eb19463f00de4a1247e3 [diff]
diff --git a/native/src/binary_format.h b/native/src/binary_format.h
index 7deec27..a946b1e 100644
--- a/native/src/binary_format.h
+++ b/native/src/binary_format.h

@@ -48,6 +48,8 @@
     static bool hasChildrenInFlags(const uint8_t flags);
     static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
             int *pos);
+    static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
+            const int length);
 };
 
 inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
@@ -217,6 +219,77 @@
     }
 }
 
+// Returns the byte position of the last chargroup of the word in the dictionary that exactly
+// matches the `length` first characters of inWord, or NOT_VALID_WORD if there is no such word.
+inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
+        const uint16_t* const inWord, const int length) {
+    int pos = 0;
+    int wordPos = 0;
+
+    while (true) {
+        // Once every character of the word has been consumed, going deeper cannot match. Bail
+        // out before inWord[wordPos] is read so that we never index past the end of the word.
+        if (wordPos >= length) return NOT_VALID_WORD;
+        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
+        const uint16_t wChar = inWord[wordPos];
+        while (true) {
+            // If there are no more character groups in this node, it means we could not
+            // find a matching character for this depth, therefore there is no match.
+            if (0 >= charGroupCount) return NOT_VALID_WORD;
+            const int charGroupPos = pos;
+            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
+            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+            if (character == wChar) {
+                // This is the correct node. Only one character group may start with the same
+                // char within a node, so either we found our match in this node, or there is
+                // no match and we can return NOT_VALID_WORD. So we check that all the
+                // characters in this character group do indeed match.
+                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+                    while (NOT_A_CHARACTER != character) {
+                        ++wordPos;
+                        // If we reach the end of the word we search for, or if we find a single
+                        // character that does not match, as explained above, it means the word is
+                        // not in the dictionary. Test the length first (with >=) so that the
+                        // comparison below never reads inWord[length], which is out of bounds.
+                        if (wordPos >= length) return NOT_VALID_WORD;
+                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
+                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+                    }
+                }
+                // If we come here we know that so far, we do match. Either we are on a terminal
+                // and we match the length, in which case we found it, or we traverse children.
+                // If we don't match the length AND don't have children, then a word in the
+                // dictionary fully matches a prefix of the searched word but not the full word.
+                ++wordPos;
+                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
+                    if (wordPos == length) {
+                        return charGroupPos;
+                    }
+                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
+                }
+                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
+                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
+                    return NOT_VALID_WORD;
+                }
+                // We have children and we are still shorter than the word we are searching for, so
+                // we need to traverse children. Put the pointer on the children position, and
+                // break out of the inner loop to start scanning the child node.
+                pos = BinaryFormat::readChildrenPosition(root, flags, pos);
+                break;
+            } else {
+                // This chargroup does not match, so skip the remaining part and go to the next.
+                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                    pos = BinaryFormat::skipOtherCharacters(root, pos);
+                }
+                pos = BinaryFormat::skipFrequency(flags, pos);
+                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
+            }
+            --charGroupCount;
+        }
+    }
+}
+
 } // namespace latinime
 
 #endif // LATINIME_BINARY_FORMAT_H

diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 3cfed6f..bccd37a 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp

@@ -1055,86 +1055,8 @@
     return maxFreq;
 }
 
-// This function gets the byte position of the last chargroup of the exact matching word in the
-// dictionary. If no match is found, it returns NOT_VALID_WORD.
-static inline int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
-        const int length) {
-    int pos = 0;
-    int wordPos = 0;
-
-    while (true) {
-        // If we already traversed the tree further than the word is long, there means
-        // there was no match (or we would have found it).
-        if (wordPos > length) return NOT_VALID_WORD;
-        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
-        const uint16_t wChar = inWord[wordPos];
-        while (true) {
-            // If there are no more character groups in this node, it means we could not
-            // find a matching character for this depth, therefore there is no match.
-            if (0 >= charGroupCount) return NOT_VALID_WORD;
-            const int charGroupPos = pos;
-            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
-            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
-            if (character == wChar) {
-                // This is the correct node. Only one character group may start with the same
-                // char within a node, so either we found our match in this node, or there is
-                // no match and we can return NOT_VALID_WORD. So we will check all the characters
-                // in this character group indeed does match.
-                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
-                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
-                    while (NOT_A_CHARACTER != character) {
-                        ++wordPos;
-                        // If we shoot the length of the word we search for, or if we find a single
-                        // character that does not match, as explained above, it means the word is
-                        // not in the dictionary (by virtue of this chargroup being the only one to
-                        // match the word on the first character, but not matching the whole word).
-                        if (wordPos > length) return NOT_VALID_WORD;
-                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
-                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
-                    }
-                }
-                // If we come here we know that so far, we do match. Either we are on a terminal
-                // and we match the length, in which case we found it, or we traverse children.
-                // If we don't match the length AND don't have children, then a word in the
-                // dictionary fully matches a prefix of the searched word but not the full word.
-                ++wordPos;
-                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
-                    if (wordPos == length) {
-                        return charGroupPos;
-                    }
-                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
-                }
-                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
-                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
-                    return NOT_VALID_WORD;
-                }
-                // We have children and we are still shorter than the word we are searching for, so
-                // we need to traverse children. Put the pointer on the children position, and
-                // break
-                pos = BinaryFormat::readChildrenPosition(root, flags, pos);
-                break;
-            } else {
-                // This chargroup does not match, so skip the remaining part and go to the next.
-                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
-                    pos = BinaryFormat::skipOtherCharacters(root, pos);
-                }
-                pos = BinaryFormat::skipFrequency(flags, pos);
-                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
-            }
-            --charGroupCount;
-        }
-    }
-}
-
 bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
-    return NOT_VALID_WORD != getTerminalPosition(DICT_ROOT, inWord, length);
-}
-
-int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,
-        unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
-        int maxAlternatives) {
-    // TODO: add implementation.
-    return 0;
+    return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
 }
 
 // TODO: remove this function.

diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 55771ee..97198ef 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h

@@ -71,9 +71,6 @@
     bool isValidWord(unsigned short *word, int length);
 #else // NEW_DICTIONARY_FORMAT
     bool isValidWord(const uint16_t* const inWord, const int length) const;
-    int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
-            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
-            int maxAlternatives);
 #endif // NEW_DICTIONARY_FORMAT
     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
     int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
commit	6a0e9642a8d1046e3b730c6dd1a633a4ec0f656f	[log] [tgz]
author	Jean Chalard <jchalard@google.com>	Mon Jul 25 18:17:11 2011 +0900
committer	Jean Chalard <jchalard@google.com>	Tue Jul 26 16:13:53 2011 +0900
tree	1c9a255fe94a89ccea85c55c8f226c808ab69574
parent	537ad5a56f791e74d485eb19463f00de4a1247e3 [diff]