Promote full matches with differing accents. Stop considering accented characters as different from their base character for proximity scoring. Also give a huge boost (basically overriding frequency) to a word fully matched with only differing accents. Bug: 2550587 Change-Id: I2da7a71229fb3868d9e4a53703ccf8caeb6fcf10

commit: 8dc754a41129cad5371b7c39b6d5826758de550a [log] [tgz]
author: Jean Chalard <jchalard@google.com> Thu Jan 27 14:20:22 2011 +0900
committer: Jean Chalard <jchalard@google.com> Thu Jan 27 17:29:24 2011 +0900
tree: 8aad7ceff315daead8a99ea44e75821b7181184b
parent: 588d2a525c444c0126f88791fcb097deba5d4644 [diff]
diff --git a/native/src/defines.h b/native/src/defines.h
index 7374526..c1eaf0d 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h

@@ -129,6 +129,7 @@
 #define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
 #define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
 
+// The following rates are percentages: each is applied as a multiplier, then divided by 100.
 #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
 #define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
@@ -136,6 +137,9 @@
 #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120
 
+// This is used as a bare multiplier (the result is not divided by 100).
+#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2
+
 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
 #define MAX_WORD_LENGTH_INTERNAL 48

diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index e27939d..dfbe822 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp

@@ -363,9 +363,14 @@
     }
     int lengthFreq = TYPED_LETTER_MULTIPLIER;
     for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER;
-    if (depth > 1 && lengthFreq == snr) {
-        if (DEBUG_DICT) LOGI("Found full matched word.");
-        multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
+    if (lengthFreq == snr) {
+        if (depth > 1) {
+            if (DEBUG_DICT) LOGI("Found full matched word.");
+            multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
+        }
+        if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
+            finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+        }
     }
     if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
     return finalFreq;
@@ -385,10 +390,9 @@
 
 inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
         unsigned short *word, const int inputIndex, const int depth, const int snr,
-        const int skipPos, const int excessivePos, const int transposedPos, const int freq,
-        const int addedWeight) {
+        const int skipPos, const int excessivePos, const int transposedPos, const int freq) {
     if (sameAsTyped(word, depth + 1)) return;
-    const int finalFreq = calculateFinalFreq(inputIndex, depth, snr * addedWeight, skipPos,
+    const int finalFreq = calculateFinalFreq(inputIndex, depth, snr, skipPos,
             excessivePos, transposedPos, freq, true);
     // Proximity collection will promote a word of the same length as what user typed.
     if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
@@ -424,9 +428,9 @@
     return false;
 }
 
-inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
-        const unsigned short c, const int skipPos, const int excessivePos,
-        const int transposedPos) {
+inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
+        const int *currentChars, const unsigned short c, const int skipPos,
+        const int excessivePos, const int transposedPos) {
     const unsigned short lowerC = toLowerCase(c);
     int j = 0;
     while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
@@ -434,18 +438,19 @@
         // If skipPos is defined, not to search proximity collections.
         // First char is what user  typed.
         if (matched) {
-            return j;
+            if (j > 0) return NEAR_PROXIMITY_CHAR;
+            return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
         } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
             // Not to check proximity characters
-            return -1;
+            return UNRELATED_CHAR;
         }
         ++j;
     }
-    return -1;
+    return UNRELATED_CHAR;
 }
 
 inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
-        const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
+        const int maxDepth, const bool traverseAllNodes, int snr, int inputIndex,
         const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
         int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
         bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
@@ -492,22 +497,24 @@
 
         int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
                 transposedPos);
-        if (matchedProximityCharId < 0) return false;
+        if (UNRELATED_CHAR == matchedProximityCharId) return false;
         mWord[depth] = c;
         // If inputIndex is greater than mInputLength, that means there is no
         // proximity chars. So, we don't need to check proximity.
-        const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;
+        if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
+            snr = snr * TYPED_LETTER_MULTIPLIER;
+        }
         bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
                 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
         if (isSameAsUserTypedLength && terminal) {
             onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, snr,
-                    skipPos, excessivePos, transposedPos, freq, addedWeight);
+                    skipPos, excessivePos, transposedPos, freq);
         }
         if (!needsToTraverseChildrenNodes) return false;
         // Start traversing all nodes after the index exceeds the user typed length
         *newTraverseAllNodes = isSameAsUserTypedLength;
-        *newSnr = snr * addedWeight;
-        *newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);
+        *newSnr = snr;
+        *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
         *newInputIndex = inputIndex + 1;
     }
     // Optimization: Prune out words that are too long compared to how much was typed.

diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 95f9655..90c9814 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h

@@ -22,6 +22,13 @@
 namespace latinime {
 
 class UnigramDictionary {
+
+    typedef enum {                             // Used as a return value for character comparison
+        SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR,  // Same char, possibly with different case or accent
+        NEAR_PROXIMITY_CHAR,                   // It is a char located nearby on the keyboard
+        UNRELATED_CHAR                         // It is an unrelated char
+    } ProximityType;
+
 public:
     UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
             int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
@@ -60,11 +67,11 @@
             const int transposedPos, const int freq);
     void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word,
             const int inputIndex, const int depth, const int snr, const int skipPos,
-            const int excessivePos, const int transposedPos, const int freq, const int addedWeight);
+            const int excessivePos, const int transposedPos, const int freq);
     bool needsToSkipCurrentNode(const unsigned short c,
             const int inputIndex, const int skipPos, const int depth);
-    int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos,
-            const int excessivePos, const int transposedPos);
+    ProximityType getMatchedProximityId(const int *currentChars, const unsigned short c,
+            const int skipPos, const int excessivePos, const int transposedPos);
     // Process a node by considering proximity, missing and excessive character
     bool processCurrentNode(const int pos, const int depth,
             const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
commit	8dc754a41129cad5371b7c39b6d5826758de550a	[log] [tgz]
author	Jean Chalard <jchalard@google.com>	Thu Jan 27 14:20:22 2011 +0900
committer	Jean Chalard <jchalard@google.com>	Thu Jan 27 17:29:24 2011 +0900
tree	8aad7ceff315daead8a99ea44e75821b7181184b
parent	588d2a525c444c0126f88791fcb097deba5d4644 [diff]