Add a variable-length header region to the binary format.

Also bump up the format version to 2.

Bug: 5686638
Change-Id: I3aafdd7e42c422202122998ec093280051aa8e07
diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp
index 19b6446..84048d7 100644
--- a/native/src/bigram_dictionary.cpp
+++ b/native/src/bigram_dictionary.cpp
@@ -28,7 +28,7 @@
 BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
         int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram,
         Dictionary *parentDictionary)
-    : DICT(dict + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength),
+    : DICT(dict), MAX_WORD_LENGTH(maxWordLength),
     MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
     HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
     if (DEBUG_DICT) {
diff --git a/native/src/binary_format.h b/native/src/binary_format.h
index 1d74998..ab033ad 100644
--- a/native/src/binary_format.h
+++ b/native/src/binary_format.h
@@ -17,6 +17,7 @@
 #ifndef LATINIME_BINARY_FORMAT_H
 #define LATINIME_BINARY_FORMAT_H
 
+#include <limits>
 #include "unigram_dictionary.h"
 
 namespace latinime {
@@ -29,10 +30,18 @@
 
  public:
     const static int UNKNOWN_FORMAT = -1;
-    const static int FORMAT_VERSION_1 = 1;
-    const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
+    // Originally, format version 1 had a 16-bit magic number, then the version number `01'
+    // then options that must be 0. Hence the first 32-bits of the format are always as follow
+    // and it's okay to consider them a magic number as a whole.
+    const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
+    const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
+    // The versions of Latin IME that only handle format version 1 only test for the magic
+    // number, so we had to change it so that version 2 files would be rejected by older
+    // implementations. On this occasion, we made the magic number 32 bits long.
+    const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
 
     static int detectFormat(const uint8_t* const dict);
+    static unsigned int getHeaderSize(const uint8_t* const dict);
     static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
     static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
     static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
@@ -55,9 +64,37 @@
 };
 
 inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
-    const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
-    if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
-    return UNKNOWN_FORMAT;
+    // The magic number is stored big-endian.
+    const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
+    switch (magicNumber) {
+    case FORMAT_VERSION_1_MAGIC_NUMBER:
+        // Format 1 header is exactly 5 bytes long and looks like:
+        // Magic number (2 bytes) 0x78 0xB1
+        // Version number (1 byte) 0x01
+        // Options (2 bytes) must be 0x00 0x00
+        return 1;
+    case FORMAT_VERSION_2_MAGIC_NUMBER:
+        // Format 2 header is as follows:
+        // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
+        // Version number (2 bytes) 0x00 0x02
+        // Options (2 bytes) must be 0x00 0x00
+        // Header size (4 bytes) : integer, big endian
+        return (dict[4] << 8) + dict[5];
+    default:
+        return UNKNOWN_FORMAT;
+    }
+}
+
+inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {
+    switch (detectFormat(dict)) {
+    case 1:
+        return FORMAT_VERSION_1_HEADER_SIZE;
+    case 2:
+        // See the format of the header in the comment in detectFormat() above
+        return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11];
+    default:
+        return std::numeric_limits<unsigned int>::max();
+    }
 }
 
 inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
diff --git a/native/src/defines.h b/native/src/defines.h
index f402efa..afa1e04 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -162,9 +162,6 @@
 #define FLAG_BIGRAM_FREQ 0x7F
 
 #define DICTIONARY_VERSION_MIN 200
-// TODO: remove this constant when the switch to the new dict format is over
-#define DICTIONARY_HEADER_SIZE 2
-#define NEW_DICTIONARY_HEADER_SIZE 5
 #define NOT_VALID_WORD -99
 #define NOT_A_CHARACTER -1
 #define NOT_A_DISTANCE -1
diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp
index 822c215..8e252f7 100644
--- a/native/src/dictionary.cpp
+++ b/native/src/dictionary.cpp
@@ -19,6 +19,7 @@
 
 #define LOG_TAG "LatinIME: dictionary.cpp"
 
+#include "binary_format.h"
 #include "dictionary.h"
 
 namespace latinime {
@@ -41,10 +42,11 @@
     mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier);
     mWordsPriorityQueuePool = new WordsPriorityQueuePool(
             maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength);
-    mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier,
-            maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
-    mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives,
-            IS_LATEST_DICT_VERSION, hasBigram(), this);
+    const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict);
+    mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier,
+            fullWordMultiplier, maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
+    mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength, maxAlternatives,
+            IS_LATEST_DICT_VERSION, true /* hasBigram */, this);
 }
 
 Dictionary::~Dictionary() {
@@ -54,10 +56,6 @@
     delete mBigramDictionary;
 }
 
-bool Dictionary::hasBigram() {
-    return ((mDict[1] & 0xFF) == 1);
-}
-
 bool Dictionary::isValidWord(unsigned short *word, int length) {
     return mUnigramDictionary->isValidWord(word, length);
 }
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 155bdcb..0b646d1 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -38,8 +38,7 @@
 UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
         int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
         const bool isLatestDictVersion)
-    : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
-    MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
+    : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
     MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
     TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
       // TODO : remove this variable.