Load main dictionary in native code
Follow-up to Id57dce51
bug: 3219819
Change-Id: I00e11ef21d0252ffa88c12dffb9c55b0f2e19a66
diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp
index eebd69b..5ec310f 100644
--- a/native/src/bigram_dictionary.cpp
+++ b/native/src/bigram_dictionary.cpp
@@ -31,7 +31,7 @@
MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
if (DEBUG_DICT) LOGI("BigramDictionary - constructor");
- if (DEBUG_DICT) LOGI("Has Bigram : %d \n", hasBigram);
+ if (DEBUG_DICT) LOGI("Has Bigram : %d", hasBigram);
}
BigramDictionary::~BigramDictionary() {
@@ -42,7 +42,7 @@
if (DEBUG_DICT) {
char s[length + 1];
for (int i = 0; i <= length; i++) s[i] = word[i];
- LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
+ LOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
}
// Find the right insertion point
@@ -54,7 +54,7 @@
}
insertAt++;
}
- if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
+ if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
if (insertAt < mMaxBigrams) {
memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
(char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
@@ -68,7 +68,7 @@
*dest++ = *word++;
}
*dest = 0; // NULL terminate
- if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
+ if (DEBUG_DICT) LOGI("Bigram: Added word at %d", insertAt);
return true;
}
return false;
@@ -107,7 +107,7 @@
if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
int pos = mParentDictionary->isValidWordRec(
DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
- if (DEBUG_DICT) LOGI("Pos -> %d\n", pos);
+ if (DEBUG_DICT) LOGI("Pos -> %d", pos);
if (pos < 0) {
return 0;
}
@@ -151,7 +151,7 @@
}
pos = followDownBranchAddress; // pos start at count
int count = DICT[pos] & 0xFF;
- if (DEBUG_DICT) LOGI("count - %d\n",count);
+ if (DEBUG_DICT) LOGI("count - %d",count);
pos++;
for (int i = 0; i < count; i++) {
// pos at data
diff --git a/native/src/defines.h b/native/src/defines.h
index 59eaa41..71aaf28 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -36,45 +36,47 @@
static double profile_old[PROF_BUF_SIZE];
static unsigned int profile_counter[PROF_BUF_SIZE];
-#define PROF_RESET prof_reset();
-#define PROF_COUNT(prof_buf_id) ++profile_counter[prof_buf_id];
-#define PROF_OPEN PROF_RESET;PROF_START(PROF_BUF_SIZE - 1);
-#define PROF_START(prof_buf_id) PROF_COUNT(prof_buf_id);profile_old[prof_buf_id] = (clock());
-#define PROF_CLOSE PROF_END(PROF_BUF_SIZE - 1);PROF_OUTALL;
-#define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id]);
-#define PROF_CLOCKOUT(prof_buf_id) LOGI("%s : clock is %f", __FUNCTION__,\
- (clock() - profile_old[prof_buf_id]));
-#define PROF_OUTALL LOGI("--- %s ---", __FUNCTION__); prof_out();
+#define PROF_RESET prof_reset()
+#define PROF_COUNT(prof_buf_id) ++profile_counter[prof_buf_id]
+#define PROF_OPEN do { PROF_RESET; PROF_START(PROF_BUF_SIZE - 1); } while(0)
+#define PROF_START(prof_buf_id) do { \
+ PROF_COUNT(prof_buf_id); profile_old[prof_buf_id] = (clock()); } while(0)
+#define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while(0)
+#define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id])
+#define PROF_CLOCKOUT(prof_buf_id) \
+ LOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id]))
+#define PROF_OUTALL do { LOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0)
-static void prof_reset(void){
- for(int i = 0;i < PROF_BUF_SIZE;++i){
+static void prof_reset(void) {
+ for (int i = 0; i < PROF_BUF_SIZE; ++i) {
profile_buf[i] = 0;
profile_old[i] = 0;
profile_counter[i] = 0;
}
}
-static void prof_out(void){
+static void prof_out(void) {
if (profile_counter[PROF_BUF_SIZE - 1] != 1) {
LOGI("Error: You must call PROF_OPEN before PROF_CLOSE.");
}
LOGI("Total time is %6.3f ms.",
- profile_buf[PROF_BUF_SIZE - 1] * 1000 / (double) CLOCKS_PER_SEC);
+ profile_buf[PROF_BUF_SIZE - 1] * 1000 / (double)CLOCKS_PER_SEC);
double all = 0;
- for(int i = 0; i < PROF_BUF_SIZE - 1; ++i){
+ for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
all += profile_buf[i];
}
- if(all == 0) all = 1;
- for(int i = 0; i < PROF_BUF_SIZE - 1; ++i){
- if(profile_buf[i] != 0) {
+ if (all == 0) all = 1;
+ for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
+ if (profile_buf[i] != 0) {
LOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.",
- i, (profile_buf[i] * 100 /all),
- profile_buf[i] * 1000 / (double) CLOCKS_PER_SEC, profile_counter[i]);
- }
+ i, (profile_buf[i] * 100 / all),
+ profile_buf[i] * 1000 / (double)CLOCKS_PER_SEC, profile_counter[i]);
+ }
}
}
#else // FLAG_DBG
+#define LOGE
#define LOGI
#define DEBUG_DICT false
#define DEBUG_DICT_FULL false
@@ -99,6 +101,11 @@
#define U_SHORT_MAX 1 << 16
#endif
+// Define this to load the dictionary with mmap(); undefine it to load with malloc() instead.
+// We measured and compared the performance of both, and found that mmap() is fairly good in
+// loading time and acceptable even for the first several lookups, which involve page faults.
+#define USE_MMAP_FOR_DICTIONARY
+
// 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words
#define ADDRESS_MASK 0x3FFFFF
diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp
index 8d32909..fe33757 100644
--- a/native/src/dictionary.cpp
+++ b/native/src/dictionary.cpp
@@ -23,21 +23,23 @@
namespace latinime {
-Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
+Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust,
+ int typedLetterMultiplier, int fullWordMultiplier,
int maxWordLength, int maxWords, int maxAlternatives)
- : DICT((unsigned char*) dict),
+ : mDict((unsigned char*) dict), mDictSize(dictSize),
+ mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust),
// Checks whether it has the latest dictionary or the old dictionary
IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) {
if (DEBUG_DICT) {
if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) {
LOGI("Max word length (%d) is greater than %d",
maxWordLength, MAX_WORD_LENGTH_INTERNAL);
- LOGI("IN NATIVE SUGGEST Version: %d \n", (DICT[0] & 0xFF));
+ LOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF));
}
}
- mUnigramDictionary = new UnigramDictionary(DICT, typedLetterMultiplier, fullWordMultiplier,
+ mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier,
maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
- mBigramDictionary = new BigramDictionary(DICT, maxWordLength, maxAlternatives,
+ mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives,
IS_LATEST_DICT_VERSION, hasBigram(), this);
}
@@ -47,7 +49,7 @@
}
bool Dictionary::hasBigram() {
- return ((DICT[1] & 0xFF) == 1);
+ return ((mDict[1] & 0xFF) == 1);
}
// TODO: use uint16_t instead of unsigned short
@@ -64,12 +66,12 @@
// returns address of bigram data of that word
// return -99 if not found
- int count = Dictionary::getCount(DICT, &pos);
+ int count = Dictionary::getCount(mDict, &pos);
unsigned short currentChar = (unsigned short) word[offset];
for (int j = 0; j < count; j++) {
- unsigned short c = Dictionary::getChar(DICT, &pos);
- int terminal = Dictionary::getTerminal(DICT, &pos);
- int childPos = Dictionary::getAddress(DICT, &pos);
+ unsigned short c = Dictionary::getChar(mDict, &pos);
+ int terminal = Dictionary::getTerminal(mDict, &pos);
+ int childPos = Dictionary::getAddress(mDict, &pos);
if (c == currentChar) {
if (offset == length - 1) {
if (terminal) {
@@ -85,7 +87,7 @@
}
}
if (terminal) {
- Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &pos);
+ Dictionary::getFreq(mDict, IS_LATEST_DICT_VERSION, &pos);
}
// There could be two instances of each alphabet - upper and lower case. So continue
// looking ...
diff --git a/native/src/dictionary.h b/native/src/dictionary.h
index da87624..cef1cf9 100644
--- a/native/src/dictionary.h
+++ b/native/src/dictionary.h
@@ -25,8 +25,8 @@
class Dictionary {
public:
- Dictionary(void *dict, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength,
- int maxWords, int maxAlternatives);
+ Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler,
+ int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives);
int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
int *nextLetters, int nextLettersSize) {
return mUnigramDictionary->getSuggestions(codes, codesSize, outWords, frequencies,
@@ -42,8 +42,10 @@
}
bool isValidWord(unsigned short *word, int length);
int isValidWordRec(int pos, unsigned short *word, int offset, int length);
- void setAsset(void *asset) { mAsset = asset; }
- void *getAsset() { return mAsset; }
+ void *getDict() { return (void *)mDict; }
+ int getDictSize() { return mDictSize; }
+ int getMmapFd() { return mMmapFd; }
+ int getDictBufAdjust() { return mDictBufAdjust; }
~Dictionary();
// public static utility methods
@@ -62,11 +64,17 @@
private:
bool hasBigram();
- const unsigned char *DICT;
+ const unsigned char *mDict;
+
+ // Meaningful only for the mmap version of dictionary loading; the malloc version still
+ // carries them, but only as dummy values.
+ const int mDictSize;
+ const int mMmapFd;
+ const int mDictBufAdjust;
+
const bool IS_LATEST_DICT_VERSION;
- void *mAsset;
- BigramDictionary *mBigramDictionary;
UnigramDictionary *mUnigramDictionary;
+ BigramDictionary *mBigramDictionary;
};
// ----------------------------------------------------------------------------
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index af2cc97..3f9bcd7 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -113,7 +113,6 @@
LOGI("%c = %d,", k, nextLetters[k]);
}
}
- LOGI("\n");
}
PROF_END(6);
PROF_CLOSE;
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 445ff7a..7f7b7bd 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -80,13 +80,13 @@
bool existsAdjacentProximityChars(const int inputIndex, const int inputLength);
int* getInputCharsAt(const int index) {return mInputCodes + (index * MAX_PROXIMITY_CHARS);}
const unsigned char *DICT;
- const int MAX_WORDS;
const int MAX_WORD_LENGTH;
+ const int MAX_WORDS;
const int MAX_PROXIMITY_CHARS;
const bool IS_LATEST_DICT_VERSION;
- const int ROOT_POS;
const int TYPED_LETTER_MULTIPLIER;
const int FULL_WORD_MULTIPLIER;
+ const int ROOT_POS;
int *mFrequencies;
unsigned short *mOutputChars;