Fix a bug that We can't suggest words with missing space if one of the words starts with a capitalized character. Bug: 3268825 Change-Id: I0634a243ad1e45dd096b30824b463c366a2e7f0f

commit: aee09dc5fadba630bbbca137cafb4871fb823099 [log] [tgz]
author: satok <satok@google.com> Thu Dec 09 19:21:51 2010 +0900
committer: satok <satok@google.com> Thu Dec 09 21:41:26 2010 +0900
tree: c2bbb88b9514ba64b71def7d19809dba1f9cb03f
parent: 8ec36be4f5d7f85a658235ccfeb9c6cb424ddb2a [diff]
diff --git a/native/src/defines.h b/native/src/defines.h
index b459ba7..98e93b9 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h

@@ -25,10 +25,12 @@
 #endif
 #define DEBUG_DICT true
 #define DEBUG_SHOW_FOUND_WORD false
+#define DEBUG_NODE true
 #else // FLAG_DBG
 #define LOGI
 #define DEBUG_DICT false
 #define DEBUG_SHOW_FOUND_WORD false
+#define DEBUG_NODE false
 #endif // FLAG_DBG
 
 #ifndef U_SHORT_MAX

diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 3856cb7..55d879f 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp

@@ -64,7 +64,7 @@
     }
 
     // Suggestions with missing space
-    if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER) {
+    if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
         for (int i = 1; i < codesSize; ++i) {
             if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);
             getMissingSpaceWords(mInputLength, i);
@@ -236,33 +236,30 @@
 }
 
 bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
-    if (missingSpacePos <= 0 || missingSpacePos >= inputLength) return false;
-    const int firstFreq = getWordFreq(0, missingSpacePos);
-    const int secondFreq = getWordFreq(missingSpacePos, inputLength - missingSpacePos);
-    if (DEBUG_DICT) LOGI("First freq: %d, Second freq:  %d", firstFreq, secondFreq);
-
-    if (firstFreq <= 0 || secondFreq <= 0) return false;
-    int pairFreq = (firstFreq + secondFreq) / 2;
-    for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
+    if (missingSpacePos <= 0 || missingSpacePos >= inputLength
+            || inputLength >= MAX_WORD_LENGTH) return false;
     const int newWordLength = inputLength + 1;
     // Allocating variable length array on stack
     unsigned short word[newWordLength];
-    int j = 0;
+    const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);
+    if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);
+    if (firstFreq <= 0) return false;
+
     for (int i = 0; i < missingSpacePos; ++i) {
-        // Down-casting
-        if (DEBUG_DICT) {
-            assert((*(mInputCodes + i * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
-        }
-        word[i] = (unsigned short) *(mInputCodes + i * MAX_PROXIMITY_CHARS);
+        word[i] = mWord[i];
     }
+
+    const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
+    if (DEBUG_DICT) LOGI("Second freq:  %d", secondFreq);
+    if (secondFreq <= 0) return false;
+
     word[missingSpacePos] = SPACE;
     for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {
-        // Down-casting
-        if (DEBUG_DICT) {
-            assert((*(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
-        }
-        word[i] = (unsigned short) *(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS);
+        word[i] = mWord[i - missingSpacePos - 1];
     }
+
+    int pairFreq = ((firstFreq + secondFreq) / 2);
+    for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
     addWord(word, newWordLength, pairFreq);
     return true;
 }
@@ -418,48 +415,89 @@
     return needsToTraverseChildrenNodes;
 }
 
-inline int UnigramDictionary::getWordFreq(const int startInputIndex, const int inputLength) {
+inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,
+        unsigned short *word) {
     int pos = ROOT_POS;
     int count = Dictionary::getCount(DICT, &pos);
-    int freq = 0;
+    int maxFreq = 0;
+    int depth = 0;
+    unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
     bool terminal = false;
 
-    for (int i = 0; i < inputLength; ++i) {
-        bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(pos, count,
-                startInputIndex + i, &pos, &count, &terminal, &freq);
-        if (!needsToTraverseChildrenNodes && (i < inputLength - 1)) {
-            return 0;
+    mStackChildCount[0] = count;
+    mStackSiblingPos[0] = pos;
+
+    while (depth >= 0) {
+        if (mStackChildCount[depth] > 0) {
+            --mStackChildCount[depth];
+            int firstChildPos;
+            int newFreq;
+            int siblingPos = mStackSiblingPos[depth];
+            const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
+                    startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
+                    &siblingPos);
+            mStackSiblingPos[depth] = siblingPos;
+            if (depth == (inputLength - 1)) {
+                // Traverse sibling node
+                if (terminal) {
+                    if (newFreq > maxFreq) {
+                        for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
+                        if (DEBUG_DICT && DEBUG_NODE) {
+                            char s[inputLength + 1];
+                            for (int i = 0; i < inputLength; ++i) s[i] = word[i];
+                            s[inputLength] = 0;
+                            LOGI("New missing space word found: %d > %d (%s), %d, %d",
+                                    newFreq, maxFreq, s, inputLength, depth);
+                        }
+                        maxFreq = newFreq;
+                    }
+                }
+            } else if (needsToTraverseChildrenNodes) {
+                // Traverse children nodes
+                ++depth;
+                mStackChildCount[depth] = count;
+                mStackSiblingPos[depth] = firstChildPos;
+            }
+        } else {
+            // Traverse parent node
+            --depth;
         }
     }
-    if (terminal) {
-        return freq;
-    } else {
-        return 0;
-    }
+
+    word[inputLength] = 0;
+    return maxFreq;
 }
 
 inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
-        const int count, const int inputIndex, int *newChildPosition, int *newCount,
-        bool *newTerminal, int *newFreq) {
+        const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
+        int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
+    const int inputIndex = startInputIndex + depth;
     const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
-    int pos = firstChildPos;
     unsigned short c;
-    for (int i = 0; i < count; ++i) {
-        pos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
-                newChildPosition, newTerminal, newFreq);
-        const unsigned int inputC = currentChars[0];
-        const unsigned short lowerC = toLowerCase(c);
-        const bool matched = (inputC == lowerC || inputC == c);
-        const bool hasChild = *newChildPosition != 0;
-        if (matched) {
-            if (hasChild) {
-                *newCount = Dictionary::getCount(DICT, newChildPosition);
-                return true;
-            } else {
-                return false;
-            }
+    *siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,
+            newChildPosition, newTerminal, newFreq);
+    const unsigned int inputC = currentChars[0];
+    if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);
+    const unsigned short lowerC = toLowerCase(c);
+    const bool matched = (inputC == lowerC || inputC == c);
+    const bool hasChild = *newChildPosition != 0;
+    if (matched) {
+        word[depth] = c;
+        if (DEBUG_DICT && DEBUG_NODE) {
+            LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
+            if (*newTerminal) LOGI("Terminal %d", *newFreq);
         }
+        if (hasChild) {
+            *newCount = Dictionary::getCount(DICT, newChildPosition);
+            return true;
+        } else {
+            return false;
+        }
+    } else {
+        // If this node is not user typed character, this method treats this word as unmatched.
+        // Thus newTerminal shouldn't be true.
+        *newTerminal = false;
+        return false;
     }
-    return false;
 }
 } // namespace latinime

diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index d16d3fd..cdec465 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h

@@ -64,11 +64,11 @@
             const int nextLettersSize, int *newCount, int *newChildPosition,
             bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
             int *nextSiblingPosition);
-    int getWordFreq(const int startInputIndex, const int inputLength);
+    int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);
     // Process a node by considering missing space
-    bool processCurrentNodeForExactMatch(const int firstChildPos, const int count,
-            const int inputIndex, int *newChildPosition, int *newCount, bool *newTerminal,
-            int *newFreq);
+    bool processCurrentNodeForExactMatch(const int firstChildPos,
+            const int startInputIndex, const int depth, unsigned short *word,
+            int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
 
     const unsigned char *DICT;
     const int MAX_WORDS;
commit	aee09dc5fadba630bbbca137cafb4871fb823099	[log] [tgz]
author	satok <satok@google.com>	Thu Dec 09 19:21:51 2010 +0900
committer	satok <satok@google.com>	Thu Dec 09 21:41:26 2010 +0900
tree	c2bbb88b9514ba64b71def7d19809dba1f9cb03f
parent	8ec36be4f5d7f85a658235ccfeb9c6cb424ddb2a [diff]