Demote split two-word suggestions where only one word is capitalized

Bug: 5371514

+1       4
-1       2
+2       0
-2       0
+3       0
-3       0
+4       1
-4       3
+5       0
-5      12
+6       3
-6       3
+7      12
-7       0

Change-Id: I6b46e43f9059f1e8a1cc02a626ea6eb8f1f9924f
diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index 5128c2e..9e75ffc 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <assert.h>
+#include <ctype.h>
 #include <stdio.h>
 #include <string.h>
 
@@ -89,8 +90,10 @@
     }
 }
 
-int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq) {
-    return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(firstFreq, secondFreq, this);
+int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
+        const unsigned short *word) {
+    return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
+            firstFreq, secondFreq, this, word);
 }
 
 int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
@@ -498,6 +501,16 @@
     return quoteCount;
 }
 
+// Whether c is upper case after mapping it through BASE_CHARS (when c is within that table).
+inline static bool isUpperCase(unsigned short c) {
+    if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
+        c = BASE_CHARS[c];
+    }
+    // isupper() is only defined for EOF and values representable as unsigned char,
+    // so code points above 0xFF are never reported as upper case.
+    return c <= 0xFF && isupper(c) != 0;
+}
+
 /* static */
 inline static int editDistance(
         int* editDistanceTable, const unsigned short* input,
@@ -749,7 +762,8 @@
 
 /* static */
 int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
-        const int firstFreq, const int secondFreq, const Correction* correction) {
+        const int firstFreq, const int secondFreq, const Correction* correction,
+        const unsigned short *word) {
     const int spaceProximityPos = correction->mSpaceProximityPos;
     const int missingSpacePos = correction->mMissingSpacePos;
     if (DEBUG_DICT) {
@@ -761,11 +775,27 @@
     const bool isSpaceProximity = spaceProximityPos >= 0;
     const int inputLength = correction->mInputLength;
     const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
-    const int secondWordLength = isSpaceProximity
-            ? (inputLength - spaceProximityPos - 1)
+    const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
             : (inputLength - missingSpacePos);
     const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
 
+    bool firstCapitalizedWordDemotion = false;
+    if (firstWordLength >= 2) {
+        firstCapitalizedWordDemotion = isUpperCase(word[0]);
+    }
+
+    bool secondCapitalizedWordDemotion = false;
+    if (secondWordLength >= 2) {
+        secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
+    }
+
+    const bool capitalizedWordDemotion =
+            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
+
+    if (DEBUG_DICT_FULL) {
+        LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion);
+    }
+
     if (firstWordLength == 0 || secondWordLength == 0) {
         return 0;
     }
@@ -815,6 +845,11 @@
     }
 
     multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
+
+    if (capitalizedWordDemotion) {
+        multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
+    }
+
     return totalFreq;
 }
 
diff --git a/native/src/correction.h b/native/src/correction.h
index 84e0752..a630646 100644
--- a/native/src/correction.h
+++ b/native/src/correction.h
@@ -73,7 +73,8 @@
 
     bool needsToPrune() const;
 
-    int getFreqForSplitTwoWords(const int firstFreq, const int secondFreq);
+    int getFreqForSplitTwoWords(
+            const int firstFreq, const int secondFreq, const unsigned short *word);
     int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
 
     CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);
@@ -151,7 +152,7 @@
         static int calculateFinalFreq(const int inputIndex, const int depth,
                 const int freq, int *editDistanceTable, const Correction* correction);
         static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
-                const Correction* correction);
+                const Correction* correction, const unsigned short *word);
     };
 };
 } // namespace latinime
diff --git a/native/src/defines.h b/native/src/defines.h
index dab8629..57bd9f7 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -189,6 +189,7 @@
 #define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45
 #define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70
 #define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96
+#define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50
 
 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index f23bd32..8eb5a97 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -431,7 +431,7 @@
         word[i] = mWord[i - firstWordLength - 1];
     }
 
-    const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq);
+    const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
     if (DEBUG_DICT) {
         LOGI("Split two words:  %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
     }