Merge "Make DistracterFilter interface."

commit: 85befc0873e2765f229ad9c5c9072f2b59ce93ff [log] [tgz]
author: Keisuke Kuroyanagi <ksk@google.com> Fri May 23 10:11:56 2014 +0000
committer: Android (Google) Code Review <android-gerrit@google.com> Fri May 23 10:11:57 2014 +0000
tree: b6b56bbc206ccaa9a32f50d5875d06ee0591c72d
parent: 52fdaa2cd8b6903fcf1dcab23b786e917323241c [diff]
parent: f498e53933f64a344c7f9321a49d874a57611169 [diff]
diff --git a/java/src/com/android/inputmethod/dictionarypack/MetadataDbHelper.java b/java/src/com/android/inputmethod/dictionarypack/MetadataDbHelper.java
index 668eb92..743bc80 100644
--- a/java/src/com/android/inputmethod/dictionarypack/MetadataDbHelper.java
+++ b/java/src/com/android/inputmethod/dictionarypack/MetadataDbHelper.java

@@ -47,7 +47,7 @@
     // used to identify the versions for upgrades. This should never change going forward.
     private static final int METADATA_DATABASE_VERSION_WITH_CLIENTID = 6;
     // The current database version.
-    private static final int CURRENT_METADATA_DATABASE_VERSION = 8;
+    private static final int CURRENT_METADATA_DATABASE_VERSION = 9;
 
     private final static long NOT_A_DOWNLOAD_ID = -1;
 

diff --git a/java/src/com/android/inputmethod/latin/Constants.java b/java/src/com/android/inputmethod/latin/Constants.java
index 67ca595..efc5a61 100644
--- a/java/src/com/android/inputmethod/latin/Constants.java
+++ b/java/src/com/android/inputmethod/latin/Constants.java

@@ -192,7 +192,6 @@
     public static final int CODE_SPACE = ' ';
     public static final int CODE_PERIOD = '.';
     public static final int CODE_COMMA = ',';
-    public static final int CODE_ARMENIAN_PERIOD = 0x0589;
     public static final int CODE_DASH = '-';
     public static final int CODE_SINGLE_QUOTE = '\'';
     public static final int CODE_DOUBLE_QUOTE = '"';
@@ -208,6 +207,8 @@
     public static final int CODE_CLOSING_SQUARE_BRACKET = ']';
     public static final int CODE_CLOSING_CURLY_BRACKET = '}';
     public static final int CODE_CLOSING_ANGLE_BRACKET = '>';
+    public static final int CODE_INVERTED_QUESTION_MARK = 0xBF; // ¿
+    public static final int CODE_INVERTED_EXCLAMATION_MARK = 0xA1; // ¡
 
     /**
      * Special keys code. Must be negative.

diff --git a/java/src/com/android/inputmethod/latin/utils/CapsModeUtils.java b/java/src/com/android/inputmethod/latin/utils/CapsModeUtils.java
index 702688f..9362193 100644
--- a/java/src/com/android/inputmethod/latin/utils/CapsModeUtils.java
+++ b/java/src/com/android/inputmethod/latin/utils/CapsModeUtils.java

@@ -62,6 +62,22 @@
     }
 
     /**
+     * Helper method to find out if a code point is starting punctuation.
+     *
+     * This includes the Unicode START_PUNCTUATION category, but also some other symbols that are
+     * starting, like the inverted question mark or the double quote.
+     *
+     * @param codePoint the code point
+     * @return true if it's starting punctuation, false otherwise.
+     */
+    private static boolean isStartPunctuation(final int codePoint) {
+        return (codePoint == Constants.CODE_DOUBLE_QUOTE || codePoint == Constants.CODE_SINGLE_QUOTE
+                || codePoint == Constants.CODE_INVERTED_QUESTION_MARK
+                || codePoint == Constants.CODE_INVERTED_EXCLAMATION_MARK
+                || Character.getType(codePoint) == Character.START_PUNCTUATION);
+    }
+
+    /**
      * Determine what caps mode should be in effect at the current offset in
      * the text. Only the mode bits set in <var>reqModes</var> will be
      * checked. Note that the caps mode flags here are explicitly defined
@@ -115,8 +131,7 @@
         } else {
             for (i = cs.length(); i > 0; i--) {
                 final char c = cs.charAt(i - 1);
-                if (c != Constants.CODE_DOUBLE_QUOTE && c != Constants.CODE_SINGLE_QUOTE
-                        && Character.getType(c) != Character.START_PUNCTUATION) {
+                if (!isStartPunctuation(c)) {
                     break;
                 }
             }
@@ -210,11 +225,14 @@
 
         // We found out that we have a period. We need to determine if this is a full stop or
         // otherwise sentence-ending period, or an abbreviation like "e.g.". An abbreviation
-        // looks like (\w\.){2,}
+        // looks like (\w\.){2,}. Moreover, in German, you put periods after digits for dates
+        // and some other things, and in German specifically we need to not go into autocaps after
+        // a whitespace-digits-period sequence.
         // To find out, we will have a simple state machine with the following states :
-        // START, WORD, PERIOD, ABBREVIATION
+        // START, WORD, PERIOD, LETTER, NUMBER
         // On START : (just before the first period)
         //           letter => WORD
+        //           digit => NUMBER if German; end with caps otherwise
         //           whitespace => end with no caps (it was a stand-alone period)
         //           otherwise => end with caps (several periods/symbols in a row)
         // On WORD : (within the word just before the first period)
@@ -228,6 +246,11 @@
         //           letter => LETTER
         //           period => PERIOD
         //           otherwise => end with no caps (it was an abbreviation)
+        // On NUMBER : (period immediately preceded by one or more digits)
+        //           digit => NUMBER
+        //           letter => WORD (promote to word)
+        //           otherwise => end with no caps (it was a whitespace-digits-period sequence,
+        //            or a punctuation-digits-period sequence like "11.11.")
         // "Not an abbreviation" in the above chart essentially covers cases like "...yes.". This
         // should capitalize.
 
@@ -235,6 +258,7 @@
         final int WORD = 1;
         final int PERIOD = 2;
         final int LETTER = 3;
+        final int NUMBER = 4;
         final int caps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
                 | TextUtils.CAP_MODE_SENTENCES) & reqModes;
         final int noCaps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
@@ -247,6 +271,8 @@
                     state = WORD;
                 } else if (Character.isWhitespace(c)) {
                     return noCaps;
+                } else if (Character.isDigit(c) && spacingAndPunctuations.mUsesGermanRules) {
+                    state = NUMBER;
                 } else {
                     return caps;
                 }
@@ -275,6 +301,15 @@
                 } else {
                     return noCaps;
                 }
+                break;
+            case NUMBER:
+                if (Character.isLetter(c)) {
+                    state = WORD;
+                } else if (Character.isDigit(c)) {
+                    state = NUMBER;
+                } else {
+                    return noCaps;
+                }
             }
         }
         // Here we arrived at the start of the line. This should behave exactly like whitespace.

diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index 6223f86..5ad2114 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp

@@ -341,8 +341,8 @@
         shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
     }
     // Use 1 for count to indicate the word has inputted.
-    const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
-            probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
+    const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
+            isBlacklisted, probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
     dictionary->addUnigramEntry(codePoints, codePointCount, &unigramProperty);
 }
 
@@ -450,8 +450,9 @@
             shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
         }
         // Use 1 for count to indicate the word has inputted.
-        const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
-                unigramProbability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
+        const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
+                isBlacklisted, unigramProbability, timestamp, 0 /* level */, 1 /* count */,
+                &shortcuts);
         dictionary->addUnigramEntry(word1CodePoints, word1Length, &unigramProperty);
         if (word0) {
             jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);

diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp
index fe3167a..bcf7d59 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp

@@ -82,6 +82,12 @@
 
 void Dictionary::addUnigramEntry(const int *const word, const int length,
         const UnigramProperty *const unigramProperty) {
+    if (unigramProperty->representsBeginningOfSentence()
+            && !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy()
+                    ->supportsBeginningOfSentence()) {
+        AKLOGE("The dictionary doesn't support Beginning-of-Sentence.");
+        return;
+    }
     TimeKeeper::setCurrentTime();
     mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty);
 }

diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h
index d255105..902eb00 100644
--- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h
+++ b/native/jni/src/suggest/core/dictionary/property/unigram_property.h

@@ -48,15 +48,21 @@
     };
 
     UnigramProperty()
-            : mIsNotAWord(false), mIsBlacklisted(false), mProbability(NOT_A_PROBABILITY),
-              mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), mShortcuts() {}
+            : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false),
+              mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0),
+              mShortcuts() {}
 
-    UnigramProperty(const bool isNotAWord, const bool isBlacklisted, const int probability,
-            const int timestamp, const int level, const int count,
-            const std::vector<ShortcutProperty> *const shortcuts)
-            : mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isBlacklisted, const int probability, const int timestamp, const int level,
+            const int count, const std::vector<ShortcutProperty> *const shortcuts)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
               mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {}
 
+    bool representsBeginningOfSentence() const {
+        return mRepresentsBeginningOfSentence;
+    }
+
     bool isNotAWord() const {
         return mIsNotAWord;
     }
@@ -94,6 +100,7 @@
     DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
 
     // TODO: Make members const.
+    bool mRepresentsBeginningOfSentence;
     bool mIsNotAWord;
     bool mIsBlacklisted;
     int mProbability;

diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
index 845e629..a612276 100644
--- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h

@@ -51,6 +51,8 @@
 
     virtual const std::vector<int> *getLocale() const = 0;
 
+    virtual bool supportsBeginningOfSentence() const = 0;
+
  protected:
     DictionaryHeaderStructurePolicy() {}
 

diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
index 479d151..281c5a8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h

@@ -246,6 +246,10 @@
         return &mLocale;
     }
 
+    bool supportsBeginningOfSentence() const {
+        return mDictFormatVersion == FormatUtils::VERSION_4_DEV;
+    }
+
  private:
     DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
 

diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp
index 97e1120..557a0b4 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp

@@ -432,8 +432,8 @@
             shortcuts.emplace_back(&target, shortcutProbability);
         }
     }
-    const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
-            ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
+    const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
+            ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
             historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
             historicalInfo->getCount(), &shortcuts);
     return WordProperty(&codePointVector, &unigramProperty, &bigrams);

diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
index 5704c2e..b2e60a8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h

@@ -160,7 +160,12 @@
     }
 
     AK_FORCE_INLINE bool representsNonWordInfo() const {
-        return getCodePointCount() > 0 && CharUtils::isInUnicodeSpace(getCodePoints()[0])
+        return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0])
+                && isNotAWord();
+    }
+
+    AK_FORCE_INLINE int representsBeginningOfSentence() const {
+        return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE
                 && isNotAWord();
     }
 

diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
index 30dcfba..a6a470c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp

@@ -383,8 +383,8 @@
             shortcuts.emplace_back(&shortcutTarget, shortcutProbability);
         }
     }
-    const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
-            ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
+    const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
+            ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
             NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
     return WordProperty(&codePointVector, &unigramProperty, &bigrams);
 }

diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index 439e90e..1858441 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp

@@ -61,7 +61,7 @@
             isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
         }
         readingHelper.readNextSiblingNode(ptNodeParams);
-        if (!ptNodeParams.representsNonWordInfo()) {
+        if (ptNodeParams.representsNonWordInfo()) {
             // Skip PtNodes that represent non-word information.
             continue;
         }
@@ -430,8 +430,8 @@
             shortcuts.emplace_back(&target, shortcutProbability);
         }
     }
-    const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
-            ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
+    const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
+            ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
             historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
             historicalInfo->getCount(), &shortcuts);
     return WordProperty(&codePointVector, &unigramProperty, &bigrams);

diff --git a/tests/src/com/android/inputmethod/latin/ShiftModeTests.java b/tests/src/com/android/inputmethod/latin/ShiftModeTests.java
index 6fc9df7..de5538e 100644
--- a/tests/src/com/android/inputmethod/latin/ShiftModeTests.java
+++ b/tests/src/com/android/inputmethod/latin/ShiftModeTests.java

@@ -78,4 +78,35 @@
         runMessages();
         assertTrue("Caps after a while after repeating Backspace a lot", isCapsModeAutoShifted());
     }
+
+    public void testAutoCapsAfterDigitsPeriod() {
+        changeLanguage("en");
+        type("On 22.11.");
+        assertFalse("(English) Auto caps after digits-period", isCapsModeAutoShifted());
+        type(" ");
+        assertTrue("(English) Auto caps after digits-period-whitespace", isCapsModeAutoShifted());
+        mEditText.setText("");
+        changeLanguage("fr");
+        type("Le 22.");
+        assertFalse("(French) Auto caps after digits-period", isCapsModeAutoShifted());
+        type(" ");
+        assertTrue("(French) Auto caps after digits-period-whitespace", isCapsModeAutoShifted());
+        mEditText.setText("");
+        changeLanguage("de");
+        type("Am 22.");
+        assertFalse("(German) Auto caps after digits-period", isCapsModeAutoShifted());
+        type(" ");
+        // For German, no auto-caps in this case
+        assertFalse("(German) Auto caps after digits-period-whitespace", isCapsModeAutoShifted());
+    }
+
+    public void testAutoCapsAfterInvertedMarks() {
+        changeLanguage("es");
+        assertTrue("(Spanish) Auto caps at start", isCapsModeAutoShifted());
+        type("Hey. ¿");
+        assertTrue("(Spanish) Auto caps after inverted what", isCapsModeAutoShifted());
+        mEditText.setText("");
+        type("¡");
+        assertTrue("(Spanish) Auto caps after inverted bang", isCapsModeAutoShifted());
+    }
 }
commit	85befc0873e2765f229ad9c5c9072f2b59ce93ff	[log] [tgz]
author	Keisuke Kuroyanagi <ksk@google.com>	Fri May 23 10:11:56 2014 +0000
committer	Android (Google) Code Review <android-gerrit@google.com>	Fri May 23 10:11:57 2014 +0000
tree	b6b56bbc206ccaa9a32f50d5875d06ee0591c72d
parent	52fdaa2cd8b6903fcf1dcab23b786e917323241c [diff]
parent	f498e53933f64a344c7f9321a49d874a57611169 [diff]