Separate shortcut targets from the trie file.

Bug: 10920165
Change-Id: I340759eadbde7fb64cb3b9a3c619ee3a768cedf8
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
index 2c3d134..216492b 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
@@ -23,11 +23,11 @@
 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
 
-import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.util.ArrayList;
@@ -219,14 +219,14 @@
         }
 
         /**
-         * Writes a string with our character format to a ByteArrayOutputStream.
+         * Writes a string with our character format to an OutputStream.
          *
          * This will also write the terminator byte.
          *
-         * @param buffer the ByteArrayOutputStream to write to.
+         * @param buffer the OutputStream to write to.
          * @param word the string to write.
          */
-        static void writeString(final ByteArrayOutputStream buffer, final String word) {
+        static void writeString(final OutputStream buffer, final String word) throws IOException {
             final int length = word.length();
             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
                 final int codePoint = word.codePointAt(i);
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java
index b602424..f761829 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java
@@ -383,8 +383,8 @@
                 nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray,
                         nodeSize + size, ptNode.mChildren));
             }
-            nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
             if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) {
+                nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
                 if (null != ptNode.mBigrams) {
                     for (WeightedString bigram : ptNode.mBigrams) {
                         final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
index a5516bd..5a5d7af 100644
--- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
@@ -266,15 +266,28 @@
     // tat = Terminal Address Table
     static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
     static final String BIGRAM_FILE_EXTENSION = ".bigram";
+    static final String SHORTCUT_FILE_EXTENSION = ".shortcut";
     static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup";
     static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
     static final int FREQUENCY_AND_FLAGS_SIZE = 2;
     static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
+
+    // With the English main dictionary as of October 2013, the size of bigram address table is
+    // 584KB with the block size being 4.
+    // This is 91% of that of full address table.
     static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
     static final int BIGRAM_CONTENT_COUNT = 1;
     static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
     static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
 
+    static final int SHORTCUT_CONTENT_COUNT = 1;
+    static final int SHORTCUT_CONTENT_INDEX = 0;
+    // With the English main dictionary as of October 2013, the size of shortcut address table is
+    // 29KB with the block size being 64.
+    // This is only 4.4% of that of full address table.
+    static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
+    static final String SHORTCUT_CONTENT_ID = "_shortcut";
+
     static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
     static final int NO_PARENT_ADDRESS = 0;
     static final int NO_FORWARD_LINK_ADDRESS = 0;
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
index 5089687..2d2da5f 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
@@ -23,6 +23,7 @@
 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
+import com.android.inputmethod.latin.utils.CollectionUtils;
 
 import android.util.Log;
 
@@ -43,6 +44,7 @@
     private static final int FILETYPE_FREQUENCY = 2;
     private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
     private static final int FILETYPE_BIGRAM_FREQ = 4;
+    private static final int FILETYPE_SHORTCUT = 5;
 
     private final File mDictDirectory;
     private final DictionaryBufferFactory mBufferFactory;
@@ -50,7 +52,9 @@
     private DictBuffer mFrequencyBuffer;
     private DictBuffer mTerminalAddressTableBuffer;
     private DictBuffer mBigramBuffer;
+    private DictBuffer mShortcutBuffer;
     private SparseTable mBigramAddressTable;
+    private SparseTable mShortcutAddressTable;
 
     @UsedForTesting
     /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
@@ -89,6 +93,10 @@
             return new File(mDictDirectory,
                     mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION
                             + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
+        } else if (fileType == FILETYPE_SHORTCUT) {
+            return new File(mDictDirectory,
+                    mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION
+                            + FormatSpec.SHORTCUT_CONTENT_ID);
         } else {
             throw new RuntimeException("Unsupported kind of file : " + fileType);
         }
@@ -102,6 +110,8 @@
                 getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
         mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ));
         loadBigramAddressSparseTable();
+        mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT));
+        loadShortcutAddressSparseTable();
     }
 
     @Override
@@ -136,6 +146,18 @@
                 FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
     }
 
+    // TODO: Let's have something like SparseTableContentsReader in this class.
+    private void loadShortcutAddressSparseTable() throws IOException {
+        final String baseName = mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION;
+        final File lookupTableFile = new File(mDictDirectory,
+                baseName + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
+        final File[] contentFiles = { new File(mDictDirectory, baseName
+                + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + FormatSpec.SHORTCUT_CONTENT_ID) };
+        mShortcutAddressTable = SparseTable.readFromFiles(lookupTableFile, contentFiles,
+                FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
+    }
+
+
     protected static class PtNodeReader extends DictDecoder.PtNodeReader {
         protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) {
             frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1);
@@ -147,6 +169,23 @@
         }
     }
 
+    // Returns the shortcut targets stored for terminalId, or null if the table has no entry.
+    private ArrayList<WeightedString> readShortcuts(final int terminalId) {
+        final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX,
+                terminalId);
+        if (posOfShortcuts == SparseTable.NOT_EXIST) return null;
+        final ArrayList<WeightedString> ret = CollectionUtils.newArrayList();
+        mShortcutBuffer.position(posOfShortcuts);
+        while (true) {
+            final int flags = mShortcutBuffer.readUnsignedByte();
+            final String word = CharEncoding.readString(mShortcutBuffer);
+            ret.add(new WeightedString(word,
+                    flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
+            if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
+        }
+        return ret;
+    }
+
     // TODO: Make this buffer thread safe.
     // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH.
     private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
@@ -197,14 +236,7 @@
             childrenAddress += addressPointer;
         }
         addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options);
-        final ArrayList<WeightedString> shortcutTargets;
-        if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) {
-            // readShortcut will add shortcuts to shortcutTargets.
-            shortcutTargets = new ArrayList<WeightedString>();
-            addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets);
-        } else {
-            shortcutTargets = null;
-        }
+        final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId);
 
         final ArrayList<PendingAttribute> bigrams;
         if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
index b38c330..f9dcacf 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
@@ -49,6 +49,7 @@
     private File mDictDir;
     private String mBaseFilename;
     private BigramContentWriter mBigramWriter;
+    private ShortcutContentWriter mShortcutWriter;
 
     @UsedForTesting
     public Ver4DictEncoder(final File dictPlacedDir) {
@@ -152,6 +153,39 @@
         }
     }
 
+    private static class ShortcutContentWriter extends SparseTableContentWriter {
+        public ShortcutContentWriter(final String name, final int initialCapacity,
+                final File baseDir) {
+            super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, FormatSpec.SHORTCUT_CONTENT_COUNT,
+                    initialCapacity, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
+                    new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
+                    new String[] { FormatSpec.SHORTCUT_CONTENT_ID });
+        }
+
+        public void writeShortcutForOneWord(final int terminalId,
+                final Iterator<WeightedString> shortcutIterator) throws IOException {
+            final SparseTableContentWriterInterface targetListWriter =
+                    new SparseTableContentWriterInterface() {
+                        @Override
+                        public void write(final OutputStream outStream) throws IOException {
+                            writeShortcutForOneWordInternal(outStream, shortcutIterator);
+                        }
+                    };
+            write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, targetListWriter);
+        }
+
+        private void writeShortcutForOneWordInternal(final OutputStream outStream,
+                final Iterator<WeightedString> shortcutIterator) throws IOException {
+            while (shortcutIterator.hasNext()) {
+                final WeightedString shortcut = shortcutIterator.next();
+                BinaryDictEncoderUtils.writeUIntToStream(outStream,
+                        BinaryDictEncoderUtils.makeShortcutFlags(shortcutIterator.hasNext(),
+                                shortcut.mFrequency), FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
+                CharEncoding.writeString(outStream, shortcut.mWord);
+            }
+        }
+    }
+
     private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
             throws FileNotFoundException, IOException {
         final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
@@ -225,6 +259,8 @@
         writeTerminalData(flatNodes, terminalCount);
         mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir);
         writeBigrams(flatNodes, dict);
+        mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir);
+        writeShortcuts(flatNodes);
 
         final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
         final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
@@ -306,29 +342,6 @@
         }
     }
 
-    private void writeShortcuts(ArrayList<WeightedString> shortcuts) {
-        if (null == shortcuts || shortcuts.isEmpty()) return;
-
-        final int indexOfShortcutByteSize = mTriePos;
-        mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
-        final Iterator<WeightedString> shortcutIterator = shortcuts.iterator();
-        while (shortcutIterator.hasNext()) {
-            final WeightedString target = shortcutIterator.next();
-            final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
-                    shortcutIterator.hasNext(), target.mFrequency);
-            mTrieBuf[mTriePos++] = (byte)shortcutFlags;
-            final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos,
-                    target.mWord);
-            mTriePos += shortcutShift;
-        }
-        final int shortcutByteSize = mTriePos - indexOfShortcutByteSize;
-        if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) {
-            throw new RuntimeException("Shortcut list too large : " + shortcutByteSize);
-        }
-        BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize,
-                shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
-    }
-
     private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
             throws IOException {
         mBigramWriter.openStreams();
@@ -343,6 +356,19 @@
         mBigramWriter.closeStreams();
     }
 
+    private void writeShortcuts(final ArrayList<PtNodeArray> flatNodes) throws IOException {
+        mShortcutWriter.openStreams();
+        for (final PtNodeArray nodeArray : flatNodes) {
+            for (final PtNode ptNode : nodeArray.mData) {
+                final ArrayList<WeightedString> targets = ptNode.mShortcutTargets;
+                // Skip nodes that have no shortcut targets; nothing is written for them.
+                if (null == targets || targets.isEmpty()) continue;
+                mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId, targets.iterator());
+            }
+        }
+        mShortcutWriter.closeStreams();
+    }
+
     @Override
     public void writeForwardLinkAddress(int forwardLinkAddress) {
         mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
@@ -359,7 +385,6 @@
             writeTerminalId(ptNode.mTerminalId);
         }
         writeChildrenPosition(ptNode, formatOptions);
-        writeShortcuts(ptNode.mShortcutTargets);
     }
 
     private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes,