Merge "[Rlog56] Buffer words before pushing out LogUnit"
diff --git a/java/src/com/android/inputmethod/research/FixedLogBuffer.java b/java/src/com/android/inputmethod/research/FixedLogBuffer.java
index 9613c2d..7771119 100644
--- a/java/src/com/android/inputmethod/research/FixedLogBuffer.java
+++ b/java/src/com/android/inputmethod/research/FixedLogBuffer.java
@@ -81,7 +81,7 @@
         return logUnit;
     }
 
-    private void shiftOutThroughFirstWord() {
+    public void shiftOutThroughFirstWord() {
         final LinkedList<LogUnit> logUnits = getLogUnits();
         while (!logUnits.isEmpty()) {
             final LogUnit logUnit = logUnits.removeFirst();
diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java
index 898a042..a8f255a 100644
--- a/java/src/com/android/inputmethod/research/MainLogBuffer.java
+++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java
@@ -26,18 +26,42 @@
 import java.util.Random;
 
 /**
- * Provide a log buffer of fixed length that enforces privacy restrictions.
+ * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
  *
- * The privacy restrictions include making sure that no numbers are logged, that all logged words
- * are in the dictionary, and that words are recorded infrequently enough that the user's meaning
- * cannot be easily determined.
+ * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
+ * be logged in enough detail to determine their contents, 2) only a subset of words are logged
+ * in detail, such as 10%, and 3) no numbers are logged.
+ *
+ * This class maintains a list of LogUnits, each corresponding to a word.  As the user completes
+ * words, they are added here.  But if the user backs up over their current word to edit a word
+ * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
+ * the LogUnit, and it is pushed back in here when the user is done.  Because words may be pulled
+ * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
+ * quickly.  However, we cannot let the contents pile up either, or it will limit the editing that
+ * a user can perform.
+ *
+ * To balance these requirements (keep history so user can edit, flush history so it does not pile
+ * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
+ * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
+ * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
+ * However, the additional non-detailed words are retained, in case the user backspaces to edit
+ * them.  The MainLogBuffer then continues to add words, publishing individual non-detailed words
+ * as new words arrive.  After enough non-detailed words have been pushed out to account for the
+ * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
+ *
+ * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
+ * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
+ * dictionary words.
+ *
+ * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
+ * n-gram containing dictionary words.
  */
 public class MainLogBuffer extends FixedLogBuffer {
     private static final String TAG = MainLogBuffer.class.getSimpleName();
     private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
 
     // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
-    private static final int N_GRAM_SIZE = 2;
+    public static final int N_GRAM_SIZE = 2;
     // The number of words between n-grams to omit from the log.  If debugging, record 50% of all
     // words.  Otherwise, only record 10%.
     private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =
@@ -46,49 +70,31 @@
     private final ResearchLog mResearchLog;
     private Suggest mSuggest;
 
-    // The minimum periodicity with which n-grams can be sampled.  E.g. mWinWordPeriod is 10 if
-    // every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.
-    // for 11-18, and the bigram at words 19 and 20.  If an n-gram is not safe (e.g. it  contains a
-    // number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe
-    // n-gram does appear.
-    /* package for test */ int mMinWordPeriod;
+    /* package for test */ int mNumWordsBetweenNGrams;
 
     // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
     // after a sample is taken.
-    /* package for test */ int mWordsUntilSafeToSample;
+    /* package for test */ int mNumWordsUntilSafeToSample;
 
     public MainLogBuffer(final ResearchLog researchLog) {
-        super(N_GRAM_SIZE);
+        super(N_GRAM_SIZE + DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES);
         mResearchLog = researchLog;
-        mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
+        mNumWordsBetweenNGrams = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES;
         final Random random = new Random();
-        mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
+        mNumWordsUntilSafeToSample = DEBUG ? 0 : random.nextInt(mNumWordsBetweenNGrams + 1);
     }
 
     public void setSuggest(final Suggest suggest) {
         mSuggest = suggest;
     }
 
-    @Override
-    public void shiftIn(final LogUnit newLogUnit) {
-        super.shiftIn(newLogUnit);
-        if (newLogUnit.hasWord()) {
-            if (mWordsUntilSafeToSample > 0) {
-                mWordsUntilSafeToSample--;
-            }
-        }
-        if (DEBUG) {
-            Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : ""));
-        }
-    }
-
     public void resetWordCounter() {
-        mWordsUntilSafeToSample = mMinWordPeriod;
+        mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
     }
 
     /**
-     * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
-     * form and still protect the user's privacy.
+     * Determines whether uploading the n words at the front of the MainLogBuffer will not
+     * violate user privacy.
      *
      * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
      * non-character data that is typed between words.  The decision about privacy is made based on
@@ -97,10 +103,10 @@
      * the screen orientation and other characteristics about the device can be uploaded without
      * revealing much about the user.
      */
-    public boolean isSafeToLog() {
+    public boolean isNGramSafe() {
         // Check that we are not sampling too frequently.  Having sampled recently might disclose
         // too much of the user's intended meaning.
-        if (mWordsUntilSafeToSample > 0) {
+        if (mNumWordsUntilSafeToSample > 0) {
             return false;
         }
         if (mSuggest == null || !mSuggest.hasMainDictionary()) {
@@ -119,8 +125,8 @@
         // complete buffer contents in detail.
         final LinkedList<LogUnit> logUnits = getLogUnits();
         final int length = logUnits.size();
-        int wordsFound = 0;
-        for (int i = 0; i < length; i++) {
+        int wordsNeeded = N_GRAM_SIZE;
+        for (int i = 0; i < length && wordsNeeded > 0; i++) {
             final LogUnit logUnit = logUnits.get(i);
             final String word = logUnit.getWord();
             if (word == null) {
@@ -136,26 +142,41 @@
                                 + ", isValid: " + (dictionary.isValidWord(word)));
                     }
                     return false;
-                } else {
-                    wordsFound++;
                 }
             }
         }
-        if (wordsFound < N_GRAM_SIZE) {
-            // Not enough words.  Not unsafe, but reject anyway.
-            if (DEBUG) {
-                Log.d(TAG, "not enough words");
-            }
-            return false;
-        }
         // All checks have passed; this buffer's content can be safely uploaded.
         return true;
     }
 
+    public boolean isNGramComplete() {
+        // Counts word-bearing LogUnits from the front; complete once N_GRAM_SIZE are seen.
+        int remaining = N_GRAM_SIZE;
+        for (final LogUnit unit : getLogUnits()) {
+            if (remaining <= 0) {
+                break;
+            }
+            if (unit.getWord() != null) {
+                remaining--;
+            }
+        }
+        return remaining == 0;
+    }
+
     @Override
     protected void onShiftOut(final LogUnit logUnit) {
         if (mResearchLog != null) {
-            mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
+            mResearchLog.publish(logUnit,
+                    ResearchLogger.IS_LOGGING_EVERYTHING /* isIncludingPrivateData */);
+        }
+        if (logUnit.hasWord() && mNumWordsUntilSafeToSample > 0) {
+            mNumWordsUntilSafeToSample--;  // One fewer word to suppress before sampling.
+            if (DEBUG) {
+                Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
+            }
+        }
+        if (DEBUG) {
+            Log.d(TAG, "shiftedOut " + (logUnit.hasWord() ? logUnit.getWord() : ""));
         }
     }
 }
diff --git a/java/src/com/android/inputmethod/research/ResearchLogger.java b/java/src/com/android/inputmethod/research/ResearchLogger.java
index b61db27..f464fac 100644
--- a/java/src/com/android/inputmethod/research/ResearchLogger.java
+++ b/java/src/com/android/inputmethod/research/ResearchLogger.java
@@ -85,7 +85,7 @@
     private static final String TAG = ResearchLogger.class.getSimpleName();
     private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
     // Whether all n-grams should be logged.  true will disclose private info.
-    private static final boolean IS_LOGGING_EVERYTHING = false
+    public static final boolean IS_LOGGING_EVERYTHING = false
             && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
     // Whether the TextView contents are logged at the end of the session.  true will disclose
     // private info.
@@ -394,8 +394,16 @@
         commitCurrentLogUnit();
 
         if (mMainLogBuffer != null) {
-            publishLogBuffer(mMainLogBuffer, mMainResearchLog,
-                    IS_LOGGING_EVERYTHING /* isIncludingPrivateData */);
+            while (!mMainLogBuffer.isEmpty()) {
+                if ((mMainLogBuffer.isNGramSafe() || IS_LOGGING_EVERYTHING) &&
+                        mMainResearchLog != null) {
+                    publishLogBuffer(mMainLogBuffer, mMainResearchLog,
+                            true /* isIncludingPrivateData */);
+                    mMainLogBuffer.resetWordCounter();
+                } else {
+                    mMainLogBuffer.shiftOutThroughFirstWord();
+                }
+            }
             mMainResearchLog.close(null /* callback */);
             mMainLogBuffer = null;
         }
@@ -702,8 +710,9 @@
         }
         if (!mCurrentLogUnit.isEmpty()) {
             if (mMainLogBuffer != null) {
-                if ((mMainLogBuffer.isSafeToLog() || IS_LOGGING_EVERYTHING)
-                        && mMainResearchLog != null) {
+                if ((mMainLogBuffer.isNGramSafe() || IS_LOGGING_EVERYTHING) &&
+                        mMainLogBuffer.isNGramComplete() &&
+                        mMainResearchLog != null) {
                     publishLogBuffer(mMainLogBuffer, mMainResearchLog,
                             true /* isIncludingPrivateData */);
                     mMainLogBuffer.resetWordCounter();
@@ -714,6 +723,10 @@
                 mFeedbackLogBuffer.shiftIn(mCurrentLogUnit);
             }
             mCurrentLogUnit = new LogUnit();
+        } else {
+            if (DEBUG) {
+                Log.d(TAG, "Warning: tried to commit empty log unit.");
+            }
         }
     }
 
@@ -756,8 +769,8 @@
             mFeedbackLogBuffer.unshiftIn();
         }
         if (DEBUG) {
-            Log.d(TAG, "uncommitCurrentLogUnit back to " + (mCurrentLogUnit.hasWord()
-                    ? ": '" + mCurrentLogUnit.getWord() + "'" : ""));
+            Log.d(TAG, "uncommitCurrentLogUnit (dump=" + dumpCurrentLogUnit + ") back to "
+                    + (mCurrentLogUnit.hasWord() ? ": '" + mCurrentLogUnit.getWord() + "'" : ""));
         }
     }
 
@@ -773,12 +786,16 @@
                 isIncludingPrivateData);
         researchLog.publish(openingLogUnit, true /* isIncludingPrivateData */);
         LogUnit logUnit;
-        while ((logUnit = logBuffer.shiftOut()) != null) {
+        int numWordsToPublish = MainLogBuffer.N_GRAM_SIZE;
+        while ((logUnit = logBuffer.shiftOut()) != null && numWordsToPublish > 0) {
             if (DEBUG) {
                 Log.d(TAG, "publishLogBuffer: " + (logUnit.hasWord() ? logUnit.getWord()
                         : "<wordless>"));
             }
             researchLog.publish(logUnit, isIncludingPrivateData);
+            if (logUnit.getWord() != null) {
+                numWordsToPublish--;
+            }
         }
         final LogUnit closingLogUnit = new LogUnit();
         closingLogUnit.addLogStatement(LOGSTATEMENT_LOG_SEGMENT_CLOSING,
@@ -1254,9 +1271,12 @@
     public static void latinIME_revertCommit(final String committedWord,
             final String originallyTypedWord, final boolean isBatchMode) {
         final ResearchLogger researchLogger = getInstance();
-        final LogUnit logUnit = researchLogger.mMainLogBuffer.peekLastLogUnit();
+        // Assume that mCurrentLogUnit has been restored to contain the reverted word.
+        final LogUnit logUnit = researchLogger.mCurrentLogUnit;
         if (originallyTypedWord.length() > 0 && hasLetters(originallyTypedWord)) {
             if (logUnit != null) {
+                // Probably not necessary, but setting as a precaution in case the word isn't
+                // committed later.
                 logUnit.setWord(originallyTypedWord);
             }
         }