native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME - Gitiles

 /*
 **
 ** Copyright 2010, The Android Open Source Project
 **
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 **
 **     http://www.apache.org/licenses/LICENSE-2.0
 **
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 */

 #include <stdio.h>
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <string.h>

 #ifdef FLAG_DBG
 #define LOG_TAG "LatinIME: dictionary.cpp"
 #include <cutils/log.h>
 #define DEBUG_DICT 1
 #else // FLAG_DBG
 #define LOGI
 #define DEBUG_DICT 0
 #endif // FLAG_DBG

 #include "unigram_dictionary.h"
 #include "basechars.h"
 #include "char_utils.h"

 #define DICTIONARY_VERSION_MIN 200
 #define DICTIONARY_HEADER_SIZE 2
 #define NOT_VALID_WORD -99

 #define SUGGEST_MISSING_CHARACTERS true
 #define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5


 namespace latinime {

 /**
  * Builds a unigram dictionary over an in-memory dictionary image.
  *
  * The three size limits go into the MAX_* members through the initializer list. The other
  * arguments are copied into their fields, and finally the two leading bytes of the image
  * are decoded by getVersionNumber().
  */
 UnigramDictionary::UnigramDictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
         int maxWordLength, int maxWords, int maxAlternatives, Dictionary *parentDictionary)
     : MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_ALTERNATIVES(maxAlternatives)
 {
     LOGI("UnigramDictionary - constructor");
     mParentDictionary = parentDictionary;
     mFullWordMultiplier = fullWordMultiplier;
     mTypedLetterMultiplier = typedLetterMultiplier;
     mDict = static_cast<unsigned char *>(dict);
     // Must come after mDict is assigned: it reads mDict[0] and mDict[1].
     getVersionNumber();
 }

 // Empty destructor: nothing is released here, and the buffer behind mDict is left untouched.
 UnigramDictionary::~UnigramDictionary() {}

 /**
  * Produces suggestions for the typed codes and returns how many were found.
  *
  * A first search is run with no skipped position. If it yields fewer than
  * SUGGEST_MISSING_CHARACTERS_THRESHOLD words (and SUGGEST_MISSING_CHARACTERS is enabled),
  * the search is repeated once per position with that position treated as a wild card,
  * stopping at the first pass that reports more words than were already known.
  *
  * @param codes           typed codes, handed to initSuggestions()
  * @param codesSize       number of typed positions
  * @param outWords        output character buffer, handed to initSuggestions()
  * @param frequencies     output frequency array, handed to initSuggestions()
  * @param nextLetters     per-character counters, only passed to the first search
  * @param nextLettersSize number of entries in nextLetters
  * @return number of suggestions reported by the last counted search
  */
 int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
         int *nextLetters, int nextLettersSize)
 {
     initSuggestions(codes, codesSize, outWords, frequencies);

     // First pass: skipPos == -1 means no position is skipped.
     int wordCount = getSuggestionCandidates(codesSize, -1, nextLetters, nextLettersSize);

     // This feature is not ready for prime-time: the ranking of wild-card matches against
     // proximity corrections and completions still has to be worked out.
     const bool tryWildCards = SUGGEST_MISSING_CHARACTERS
             && wordCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD;
     if (tryWildCards) {
         int wildCardPos = 0;
         while (wildCardPos < codesSize) {
             const int countWithSkip = getSuggestionCandidates(codesSize, wildCardPos, NULL, 0);
             if (countWithSkip > wordCount) {
                 wordCount = countWithSkip;
                 break;
             }
             ++wildCardPos;
         }
     }

     if (!DEBUG_DICT) {
         return wordCount;
     }

     // Debug builds only: dump the result count and every non-zero next-letter counter.
     LOGI("Returning %d words", wordCount);
     LOGI("Next letters: ");
     for (int letter = 0; letter < nextLettersSize; letter++) {
         if (nextLetters[letter] > 0) {
             LOGI("%c = %d,", letter, nextLetters[letter]);
         }
     }
     LOGI("\n");
     return wordCount;
 }

 /**
  * Stores the caller's input and output buffers in the member fields used by the search,
  * and derives the edit-distance budget from the input length.
  */
 void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,
         int *frequencies) {
     mInputCodes = codes;
     mInputLength = codesSize;
     mOutputChars = outWords;
     mFrequencies = frequencies;

     // Inputs shorter than five codes get a fixed budget of 2; longer ones get half their length.
     if (mInputLength < 5) {
         mMaxEditDistance = 2;
     } else {
         mMaxEditDistance = mInputLength / 2;
     }
 }

 /**
  * Runs one recursive search from the dictionary root and counts the resulting suggestions.
  *
  * The root is at DICTIONARY_HEADER_SIZE when checkIfDictVersionIsLatest() holds and at
  * offset 0 otherwise. The search depth is capped at three times the input length.
  *
  * @return number of leading entries of mFrequencies (at most MAX_WORDS) that are positive
  */
 int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
         int *nextLetters, int nextLettersSize) {
     const int rootPos = checkIfDictVersionIsLatest() ? DICTIONARY_HEADER_SIZE : 0;
     getWordsRec(rootPos, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters,
             nextLettersSize);

     // Count filled slots: stop at the first non-positive frequency.
     int filled = 0;
     for (; filled < MAX_WORDS; ++filled) {
         if (mFrequencies[filled] <= 0) {
             break;
         }
     }
     return filled;
 }

 // Bumps the counter for character c; characters at or beyond nextLettersSize are ignored.
 void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
     if (c >= nextLettersSize) {
         return;
     }
     nextLetters[c] += 1;
 }

 // TODO: Should be a const static variable calculated in the constructor
 // Decodes the two leading bytes of the image: byte 0 into mVersion, byte 1 into mBigram.
 void
 UnigramDictionary::getVersionNumber()
 {
     const unsigned char *header = mDict;
     mVersion = (header[0] & 0xFF);
     mBigram = (header[1] & 0xFF);
     LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
 }

 // TODO: Should be a const static variable calculated in the constructor
 // Tells the latest dictionary format from the old one: the version must be at least
 // DICTIONARY_VERSION_MIN and the bigram byte must be 0 or 1.
 bool
 UnigramDictionary::checkIfDictVersionIsLatest()
 {
     if (mVersion < DICTIONARY_VERSION_MIN) {
         return false;
     }
     return mBigram == 0 || mBigram == 1;
 }

 // Reads one character at *pos and advances *pos past it. A first byte of 0xFF is an
 // escape: the actual 16-bit code follows in the next two bytes, most significant first.
 unsigned short
 UnigramDictionary::getChar(int *pos)
 {
     int cursor = *pos;
     unsigned short code = (unsigned short) (mDict[cursor++] & 0xFF);
     if (code == 0xFF) {
         const int high = mDict[cursor] & 0xFF;
         const int low = mDict[cursor + 1] & 0xFF;
         code = (unsigned short) ((high << 8) | low);
         cursor += 2;
     }
     *pos = cursor;
     return code;
 }

 // Reads the children address stored at *pos and advances *pos past it.
 // When the FLAG_ADDRESS_MASK bit of the first byte is clear, only that one byte is
 // consumed and 0 is returned. Otherwise three bytes are consumed and combined into the
 // address, the first one masked with the top byte of ADDRESS_MASK.
 int
 UnigramDictionary::getAddress(int *pos)
 {
     const int start = *pos;
     if ((mDict[start] & FLAG_ADDRESS_MASK) == 0) {
         *pos = start + 1;
         return 0;
     }

     int address = 0;
     address += (mDict[start] & (ADDRESS_MASK >> 16)) << 16;
     address += (mDict[start + 1] & 0xFF) << 8;
     address += (mDict[start + 2] & 0xFF);
     *pos = start + 3;
     return address;
 }

 // Reads the one-byte frequency at *pos and advances *pos past it. For dictionaries in the
 // latest format, *pos is additionally moved past the bigram data that follows.
 int
 UnigramDictionary::getFreq(int *pos)
 {
     const int freq = mDict[(*pos)++] & 0xFF;
     if (!checkIfDictVersionIsLatest()) {
         return freq;
     }

     // skipping bigram
     const int hasBigrams = (mDict[*pos] & FLAG_BIGRAM_READ);
     if (hasBigrams > 0) {
         // Each entry is three bytes followed by one byte whose FLAG_BIGRAM_CONTINUED bit
         // says whether another entry follows.
         int more;
         do {
             (*pos) += 3;
             more = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
         } while (more > 0);
     } else {
         (*pos)++;
     }
     return freq;
 }

 int
 UnigramDictionary::wideStrLen(unsigned short *str)
 {
     if (!str) return 0;
     unsigned short *end = str;
     while (*end)
         end++;
     return end - str;
 }

 /**
  * Inserts a candidate into the suggestion arrays, which are kept ordered by descending
  * frequency. On a frequency tie, the new word goes ahead of a stored word that is longer.
  * Entries at and after the insertion point move down one slot and the last one is dropped.
  *
  * @param word      characters of the candidate; word[length] is overwritten with 0
  * @param length    number of characters in word
  * @param frequency score used for ordering
  * @return true if the word was stored; false if it ranked below all MAX_WORDS entries or
  *         cannot fit in an output slot
  */
 bool
 UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
 {
     // Every output slot is MAX_WORD_LENGTH shorts wide and has to hold the terminator too.
     // A longer word would spill into the following slot, or past the last one, so it is
     // rejected before anything is written (including word[length]).
     if (length < 0 || length >= MAX_WORD_LENGTH) {
         return false;
     }

     word[length] = 0;
     if (DEBUG_DICT) {
         char s[length + 1];
         for (int i = 0; i <= length; i++) s[i] = word[i];
         LOGI("Found word = %s, freq = %d : \n", s, frequency);
     }

     // Find the right insertion point
     int insertAt = 0;
     while (insertAt < MAX_WORDS) {
         if (frequency > mFrequencies[insertAt]
                  || (mFrequencies[insertAt] == frequency
                      && length < wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
             break;
         }
         insertAt++;
     }
     if (insertAt >= MAX_WORDS) {
         return false;
     }

     // Shift everything from insertAt onwards down by one slot. memmove is required because
     // source and destination overlap.
     const int slotsToShift = MAX_WORDS - insertAt - 1;
     memmove(mFrequencies + insertAt + 1, mFrequencies + insertAt,
             slotsToShift * sizeof(mFrequencies[0]));
     mFrequencies[insertAt] = frequency;

     unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
     memmove(dest + MAX_WORD_LENGTH, dest,
             slotsToShift * MAX_WORD_LENGTH * sizeof(mOutputChars[0]));
     for (int i = 0; i < length; i++) {
         dest[i] = word[i];
     }
     dest[length] = 0; // NULL terminate
     if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
     return true;
 }

 /**
  * Inserts a bigram candidate into the bigram output arrays, which are kept ordered by
  * descending frequency. On a frequency tie, the new word goes ahead of a stored word that
  * is longer. Entries at and after the insertion point move down one slot and the last one
  * is dropped.
  *
  * @param word      characters of the candidate; word[length] is overwritten with 0
  * @param length    number of characters in word
  * @param frequency score used for ordering
  * @return true if the word was stored; false if it ranked below all mMaxBigrams entries or
  *         cannot fit in an output slot
  */
 bool
 UnigramDictionary::addWordBigram(unsigned short *word, int length, int frequency)
 {
     // Every output slot is MAX_WORD_LENGTH shorts wide and has to hold the terminator too.
     // searchForTerminalNode() passes a buffer of exactly MAX_WORD_LENGTH elements, so with
     // length == MAX_WORD_LENGTH even the word[length] store below would be out of bounds.
     if (length < 0 || length >= MAX_WORD_LENGTH) {
         return false;
     }

     word[length] = 0;
     if (DEBUG_DICT) {
         char s[length + 1];
         for (int i = 0; i <= length; i++) s[i] = word[i];
         LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
     }

     // Find the right insertion point
     int insertAt = 0;
     while (insertAt < mMaxBigrams) {
         if (frequency > mBigramFreq[insertAt]
                  || (mBigramFreq[insertAt] == frequency
                      && length < wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
             break;
         }
         insertAt++;
     }
     LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
     if (insertAt >= mMaxBigrams) {
         return false;
     }

     // Shift everything from insertAt onwards down by one slot. memmove is required because
     // source and destination overlap.
     const int slotsToShift = mMaxBigrams - insertAt - 1;
     memmove(mBigramFreq + insertAt + 1, mBigramFreq + insertAt,
             slotsToShift * sizeof(mBigramFreq[0]));
     mBigramFreq[insertAt] = frequency;

     unsigned short *dest = mBigramChars + insertAt * MAX_WORD_LENGTH;
     memmove(dest + MAX_WORD_LENGTH, dest,
             slotsToShift * MAX_WORD_LENGTH * sizeof(mBigramChars[0]));
     for (int i = 0; i < length; i++) {
         dest[i] = word[i];
     }
     dest[length] = 0; // NULL terminate
     if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
     return true;
 }

 // Maps a character to the form used for matching against typed input: it is first replaced
 // through the BASE_CHARS table (when inside the table), then ASCII upper-case letters get
 // bit 0x20 set and characters above 127 are passed through latin_tolower().
 unsigned short
 UnigramDictionary::toLowerCase(unsigned short c) {
     unsigned short folded = c;
     if (folded < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
         folded = BASE_CHARS[folded];
     }

     const bool asciiUpper = (folded >= 'A' && folded <= 'Z');
     if (asciiUpper) {
         folded |= 32;
         return folded;
     }
     if (folded > 127) {
         folded = latin_tolower(folded);
     }
     return folded;
 }

 bool
 UnigramDictionary::sameAsTyped(unsigned short *word, int length)
 {
     if (length != mInputLength) {
         return false;
     }
     int *inputCodes = mInputCodes;
     while (length--) {
         if ((unsigned int) *inputCodes != (unsigned int) *word) {
             return false;
         }
         inputCodes += MAX_ALTERNATIVES;
         word++;
     }
     return true;
 }

 // Apostrophe character; compared against dictionary and typed characters, never modified.
 static const char QUOTE = '\'';

 /**
  * Recursively walks the dictionary from the node group at pos, building candidate words in
  * mWord and handing terminal ones to addWord().
  *
  * @param pos             offset in mDict of the node group to scan; its entry count is read
  *                        with getCount()
  * @param depth           index in mWord at which this level's character is written
  * @param maxDepth        the call returns immediately once depth exceeds this value
  * @param completion      when true, every character is accepted without being compared to
  *                        the input; forced to true once inputIndex has reached mInputLength
  * @param snr             multiplier applied to a terminal's frequency; multiplied by
  *                        addedWeight on every matched input position
  * @param inputIndex      index of the input position to match at this level
  * @param diffs           number of matches so far made with a non-first alternative; the call
  *                        returns immediately once it exceeds mMaxEditDistance
  * @param skipPos         depth at which a dictionary character is accepted without consuming
  *                        an input position; a negative value disables skipping
  * @param nextLetters     counters passed to registerNextLetter()
  * @param nextLettersSize number of entries in nextLetters
  */
 void
 UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
                         int diffs, int skipPos, int *nextLetters, int nextLettersSize)
 {
     // Optimization: Prune out words that are too long compared to how much was typed.
     if (depth > maxDepth) {
         return;
     }
     // Too many proximity substitutions along this path.
     if (diffs > mMaxEditDistance) {
         return;
     }
     int count = getCount(&pos);
     // Alternatives for the current input position; stays NULL only when the whole input has
     // been consumed, in which case completion is true and it is never dereferenced.
     int *currentChars = NULL;
     if (mInputLength <= inputIndex) {
         completion = true;
     } else {
         currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
     }

     // NOTE(review): mWord[depth] is written below with depth bounded only by maxDepth; the
     // size of mWord is not visible here -- confirm it can hold maxDepth + 2 entries.
     for (int i = 0; i < count; i++) {
         // The entry must be decoded in exactly this order: each getter advances pos.
         // -- at char
         unsigned short c = getChar(&pos);
         // -- at flag/add
         unsigned short lowerC = toLowerCase(c);
         bool terminal = getTerminal(&pos);
         int childrenAddress = getAddress(&pos);
         // -- after address or flag
         int freq = 1;
         if (terminal) freq = getFreq(&pos);
         // -- after add or freq

         // If we are only doing completions, no need to look at the typed characters.
         if (completion) {
             mWord[depth] = c;
             if (terminal) {
                 addWord(mWord, depth + 1, freq * snr);
                 // Only the non-skipping search records which character follows the typed
                 // prefix.
                 if (depth >= mInputLength && skipPos < 0) {
                     registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
                 }
             }
             if (childrenAddress != 0) {
                 getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
                         diffs, skipPos, nextLetters, nextLettersSize);
             }
         } else if ((c == QUOTE && currentChars[0] != QUOTE) || skipPos == depth) {
             // Skip the ' or other letter and continue deeper
             // (inputIndex is passed unchanged: no typed position is consumed).
             mWord[depth] = c;
             if (childrenAddress != 0) {
                 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
                         skipPos, nextLetters, nextLettersSize);
             }
         } else {
             // Compare against each alternative for this input position; the list ends at the
             // first non-positive code.
             // NOTE(review): the scan is not bounded by MAX_ALTERNATIVES -- confirm callers
             // always terminate each group.
             int j = 0;
             while (currentChars[j] > 0) {
                 if (currentChars[j] == lowerC || currentChars[j] == c) {
                     // Only the first alternative (j == 0) earns the typed-letter multiplier.
                     int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
                     mWord[depth] = c;
                     if (mInputLength == inputIndex + 1) {
                         // This match consumes the last input position.
                         if (terminal) {
                             if (//INCLUDE_TYPED_WORD_IF_VALID ||
                                 !sameAsTyped(mWord, depth + 1)) {
                                 int finalFreq = freq * snr * addedWeight;
                                 // The full-word bonus is withheld when a position was skipped.
                                 if (skipPos < 0) finalFreq *= mFullWordMultiplier;
                                 addWord(mWord, depth + 1, finalFreq);
                             }
                         }
                         // Everything below this node is a completion of the typed input.
                         if (childrenAddress != 0) {
                             getWordsRec(childrenAddress, depth + 1,
                                     maxDepth, true, snr * addedWeight, inputIndex + 1,
                                     diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
                         }
                     } else if (childrenAddress != 0) {
                         getWordsRec(childrenAddress, depth + 1, maxDepth,
                                 false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
                                 skipPos, nextLetters, nextLettersSize);
                     }
                 }
                 j++;
                 // When a position is being skipped, only the first alternative is tried.
                 if (skipPos >= 0) break;
             }
         }
     }
 }

 // Decodes the 22-bit address stored in the three bytes at *pos (low six bits of the first
 // byte, then two full bytes). *pos is moved past those bytes only when advance is true.
 int
 UnigramDictionary::getBigramAddress(int *pos, bool advance)
 {
     const int base = *pos;
     const int address = ((mDict[base] & 0x3F) << 16)
             + ((mDict[base + 1] & 0xFF) << 8)
             + (mDict[base + 2] & 0xFF);
     if (advance) {
         *pos = base + 3;
     }
     return address;
 }

 // Returns the byte at *pos masked with FLAG_BIGRAM_FREQ and advances *pos by one.
 int
 UnigramDictionary::getBigramFreq(int *pos)
 {
     const int at = (*pos)++;
     return mDict[at] & FLAG_BIGRAM_FREQ;
 }


 /**
  * Looks up the bigram entries stored for prevWord and feeds each one to
  * searchForTerminalNode(), which records matching words in the caller's bigram arrays.
  *
  * Nothing is done (and 0 is returned) unless the dictionary has mBigram == 1 and is in the
  * latest format, and prevWord is found by isValidWordRec().
  *
  * @param prevWord        word whose bigram entries are wanted
  * @param prevWordLength  number of characters in prevWord
  * @param codes           typed codes, stored in mInputCodes
  * @param codesSize       number of typed positions, stored in mInputLength
  * @param bigramChars     output character buffer, stored in mBigramChars
  * @param bigramFreq      output frequency array, stored in mBigramFreq
  * @param maxWordLength   not used by this function
  * @param maxBigrams      upper bound on the number of entries processed
  * @param maxAlternatives not used by this function
  * @return number of bigram entries processed
  */
 int
 UnigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
         unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
         int maxAlternatives)
 {
     mInputCodes = codes;
     mInputLength = codesSize;
     mBigramChars = bigramChars;
     mBigramFreq = bigramFreq;
     mMaxBigrams = maxBigrams;

     if (mBigram != 1 || !checkIfDictVersionIsLatest()) {
         return 0;
     }

     int pos = isValidWordRec(DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
     LOGI("Pos -> %d\n", pos);
     if (pos < 0) {
         return 0;
     }

     int bigramCount = 0;
     const int hasBigrams = (mDict[pos] & FLAG_BIGRAM_READ);
     if (hasBigrams > 0) {
         // Each entry: a three-byte address, then one byte carrying both the frequency bits
         // and the FLAG_BIGRAM_CONTINUED bit.
         for (int more = 1; more > 0 && bigramCount < maxBigrams; bigramCount++) {
             const int bigramAddress = getBigramAddress(&pos, true);
             const int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
             // search for all bigrams and store them
             searchForTerminalNode(bigramAddress, frequency);
             more = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
         }
     }
     return bigramCount;
 }

 /**
  * Reconstructs the word whose terminal node sits at offset addressLookingFor in mDict and,
  * if its first character matches the typed input (checkFirstCharacter()), records it with
  * addWordBigram() using the given frequency.
  *
  * The search starts at DICTIONARY_HEADER_SIZE. Each pass of the outer loop scans one node
  * group, looks for the terminal entry whose character byte is at addressLookingFor, and
  * otherwise picks a child address that is not greater than addressLookingFor to descend
  * into on the next pass.
  *
  * NOTE(review): characters are read as single bytes here; the 0xFF two-byte escape handled
  * by getChar() is not decoded -- confirm bigram targets never contain such characters.
  * NOTE(review): word[] has MAX_WORD_LENGTH entries and depth is not checked against it.
  */
 void
 UnigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency)
 {
     // track word with such address and store it in an array
     unsigned short word[MAX_WORD_LENGTH];

     int pos;
     // Address of the node group to scan on the next pass.
     int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
     bool found = false;
     // Character of the branch chosen on the previous pass.
     // NOTE(review): stored as char and later widened to unsigned short; if char is signed on
     // the target, bytes >= 0x80 would sign-extend -- confirm.
     char followingChar = ' ';
     // Index of the last character written to word[]; -1 while still at the root.
     int depth = -1;

     while(!found) {
         bool followDownAddressSearchStop = false;
         bool firstAddress = true;
         bool haveToSearchAll = true;

         // Commit the character of the branch that led to this node group.
         if (depth >= 0) {
             word[depth] = (unsigned short) followingChar;
         }
         pos = followDownBranchAddress; // pos start at count
         int count = mDict[pos] & 0xFF;
         LOGI("count - %d\n",count);
         pos++;
         for (int i = 0; i < count; i++) {
             // pos at data
             pos++;
             // pos now at flag
             if (!getFirstBitOfByte(&pos)) { // non-terminal
                 if (!followDownAddressSearchStop) {
                     // Peek at the child address without moving pos.
                     int addr = getBigramAddress(&pos, false);
                     if (addr > addressLookingFor) {
                         // This child starts past the target: stop updating the candidate.
                         followDownAddressSearchStop = true;
                         if (firstAddress) {
                             firstAddress = false;
                             haveToSearchAll = true;
                         } else if (!haveToSearchAll) {
                             break;
                         }
                     } else {
                         // Candidate branch; pos - 1 is this entry's character byte.
                         followDownBranchAddress = addr;
                         followingChar = (char)(0xFF & mDict[pos-1]);
                         if (firstAddress) {
                             firstAddress = false;
                             haveToSearchAll = false;
                         }
                     }
                 }
                 // Move past the three address bytes.
                 pos += 3;
             } else if (getFirstBitOfByte(&pos)) { // terminal
                 // The target address is the offset of the entry's character byte.
                 if (addressLookingFor == (pos-1)) { // found !!
                     depth++;
                     word[depth] = (0xFF & mDict[pos-1]);
                     found = true;
                     break;
                 }
                 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
                     if (!followDownAddressSearchStop) {
                         int addr = getBigramAddress(&pos, false);
                         if (addr > addressLookingFor) {
                             followDownAddressSearchStop = true;
                             if (firstAddress) {
                                 firstAddress = false;
                                 haveToSearchAll = true;
                             } else if (!haveToSearchAll) {
                                 break;
                             }
                         } else {
                             followDownBranchAddress = addr;
                             followingChar = (char)(0xFF & mDict[pos-1]);
                             if (firstAddress) {
                                 firstAddress = false;
                                 // NOTE(review): the non-terminal branch above sets this to
                                 // false in the same situation -- confirm the difference is
                                 // intended.
                                 haveToSearchAll = true;
                             }
                         }
                     }
                     pos += 4;
                 } else { // freq only (2 byte)
                     pos += 2;
                 }

                 // skipping bigram
                 int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
                 if (bigramExist > 0) {
                     int nextBigramExist = 1;
                     while (nextBigramExist > 0) {
                         pos += 3;
                         nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
                     }
                 } else {
                     pos++;
                 }
             }
         }
         // One level deeper; after a successful match this makes depth the word length.
         depth++;
         if (followDownBranchAddress == 0) {
             LOGI("ERROR!!! Cannot find bigram!!");
             break;
         }
     }
     // NOTE(review): this is also reached through the error break above with found == false,
     // in which case word[] may be only partially filled -- confirm that is acceptable.
     if (checkFirstCharacter(word)) {
         addWordBigram(word, depth, frequency);
     }
 }

 bool
 UnigramDictionary::checkFirstCharacter(unsigned short *word)
 {
     // Checks whether this word starts with same character or neighboring characters of
     // what user typed.

     int *inputCodes = mInputCodes;
     int maxAlt = MAX_ALTERNATIVES;
     while (maxAlt > 0) {
         if ((unsigned int) *inputCodes == (unsigned int) *word) {
             return true;
         }
         inputCodes++;
         maxAlt--;
     }
     return false;
 }

 // TODO: Move to parent dictionary
 // Returns true when the word is present in the dictionary. The lookup starts after the
 // header for dictionaries in the latest format and at offset 0 otherwise.
 bool
 UnigramDictionary::isValidWord(unsigned short *word, int length)
 {
     const int rootPos = checkIfDictVersionIsLatest() ? DICTIONARY_HEADER_SIZE : 0;
     const int result = isValidWordRec(rootPos, word, 0, length);
     return result != NOT_VALID_WORD;
 }

 int
 UnigramDictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
     // returns address of bigram data of that word
     // return -99 if not found

     int count = getCount(&pos);
     unsigned short currentChar = (unsigned short) word[offset];
     for (int j = 0; j < count; j++) {
         unsigned short c = getChar(&pos);
         int terminal = getTerminal(&pos);
         int childPos = getAddress(&pos);
         if (c == currentChar) {
             if (offset == length - 1) {
                 if (terminal) {
                     return (pos+1);
                 }
             } else {
                 if (childPos != 0) {
                     int t = isValidWordRec(childPos, word, offset + 1, length);
                     if (t > 0) {
                         return t;
                     }
                 }
             }
         }
         if (terminal) {
             getFreq(&pos);
         }
         // There could be two instances of each alphabet - upper and lower case. So continue
         // looking ...
     }
     return NOT_VALID_WORD;
 }
 } // namespace latinime
	/*
	**
	** Copyright 2010, The Android Open Source Project
	**
	** Licensed under the Apache License, Version 2.0 (the "License");
	** you may not use this file except in compliance with the License.
	** You may obtain a copy of the License at
	**
	** http://www.apache.org/licenses/LICENSE-2.0
	**
	** Unless required by applicable law or agreed to in writing, software
	** distributed under the License is distributed on an "AS IS" BASIS,
	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	** See the License for the specific language governing permissions and
	** limitations under the License.
	*/

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/mman.h>
	#include <string.h>

	#ifdef FLAG_DBG
	#define LOG_TAG "LatinIME: dictionary.cpp"
	#include <cutils/log.h>
	#define DEBUG_DICT 1
	#else // FLAG_DBG
	#define LOGI
	#define DEBUG_DICT 0
	#endif // FLAG_DBG

	#include "unigram_dictionary.h"
	#include "basechars.h"
	#include "char_utils.h"

	#define DICTIONARY_VERSION_MIN 200
	#define DICTIONARY_HEADER_SIZE 2
	#define NOT_VALID_WORD -99

	#define SUGGEST_MISSING_CHARACTERS true
	#define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5


	namespace latinime {

	UnigramDictionary::UnigramDictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
	int maxWordLength, int maxWords, int maxAlternatives, Dictionary *parentDictionary)
	: MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords), MAX_ALTERNATIVES(maxAlternatives)
	{
	LOGI("UnigramDictionary - constructor");
	mDict = (unsigned char*) dict;
	mTypedLetterMultiplier = typedLetterMultiplier;
	mFullWordMultiplier = fullWordMultiplier;
	mParentDictionary = parentDictionary;
	getVersionNumber();
	}

	UnigramDictionary::~UnigramDictionary()
	{
	}

	int UnigramDictionary::getSuggestions(int codes, int codesSize, unsigned short outWords, int *frequencies,
	int *nextLetters, int nextLettersSize)
	{

	initSuggestions(codes, codesSize, outWords, frequencies);

	int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,
	nextLettersSize);

	// If there aren't sufficient suggestions, search for words by allowing wild cards at
	// the different character positions. This feature is not ready for prime-time as we need
	// to figure out the best ranking for such words compared to proximity corrections and
	// completions.
	if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
	for (int i = 0; i < codesSize; ++i) {
	int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);
	if (tempCount > suggestedWordsCount) {
	suggestedWordsCount = tempCount;
	break;
	}
	}
	}

	if (DEBUG_DICT) {
	LOGI("Returning %d words", suggestedWordsCount);
	LOGI("Next letters: ");
	for (int k = 0; k < nextLettersSize; k++) {
	if (nextLetters[k] > 0) {
	LOGI("%c = %d,", k, nextLetters[k]);
	}
	}
	LOGI("\n");
	}
	return suggestedWordsCount;
	}

	void UnigramDictionary::initSuggestions(int codes, int codesSize, unsigned short outWords,
	int *frequencies) {
	mFrequencies = frequencies;
	mOutputChars = outWords;
	mInputCodes = codes;
	mInputLength = codesSize;
	mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
	}

	int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
	int *nextLetters, int nextLettersSize) {
	if (checkIfDictVersionIsLatest()) {
	getWordsRec(DICTIONARY_HEADER_SIZE, 0, inputLength * 3, false, 1, 0, 0, skipPos,
	nextLetters, nextLettersSize);
	} else {
	getWordsRec(0, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
	}

	// Get the word count
	int suggestedWordsCount = 0;
	while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
	suggestedWordsCount++;
	}
	return suggestedWordsCount;
	}

	void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
	if (c < nextLettersSize) {
	nextLetters[c]++;
	}
	}

	// TODO: Should be const static variable calculate in the constructor
	void
	UnigramDictionary::getVersionNumber()
	{
	mVersion = (mDict[0] & 0xFF);
	mBigram = (mDict[1] & 0xFF);
	LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
	}

	// TODO: Should be const static variable calculate in the constructor
	// Checks whether it has the latest dictionary or the old dictionary
	bool
	UnigramDictionary::checkIfDictVersionIsLatest()
	{
	return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 \|\| mBigram == 0);
	}

	unsigned short
	UnigramDictionary::getChar(int *pos)
	{
	unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
	// If the code is 255, then actual 16 bit code follows (in big endian)
	if (ch == 0xFF) {
	ch = ((mDict[pos] & 0xFF) << 8) \| (mDict[pos + 1] & 0xFF);
	(*pos) += 2;
	}
	return ch;
	}

	int
	UnigramDictionary::getAddress(int *pos)
	{
	int address = 0;
	if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
	*pos += 1;
	} else {
	address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
	address += (mDict[*pos + 1] & 0xFF) << 8;
	address += (mDict[*pos + 2] & 0xFF);
	*pos += 3;
	}
	return address;
	}

	int
	UnigramDictionary::getFreq(int *pos)
	{
	int freq = mDict[(*pos)++] & 0xFF;

	if (checkIfDictVersionIsLatest()) {
	// skipping bigram
	int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ);
	if (bigramExist > 0) {
	int nextBigramExist = 1;
	while (nextBigramExist > 0) {
	(*pos) += 3;
	nextBigramExist = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
	}
	} else {
	(*pos)++;
	}
	}

	return freq;
	}

	int
	UnigramDictionary::wideStrLen(unsigned short *str)
	{
	if (!str) return 0;
	unsigned short *end = str;
	while (*end)
	end++;
	return end - str;
	}

	bool
	UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
	{
	word[length] = 0;
	if (DEBUG_DICT) {
	char s[length + 1];
	for (int i = 0; i <= length; i++) s[i] = word[i];
	LOGI("Found word = %s, freq = %d : \n", s, frequency);
	}

	// Find the right insertion point
	int insertAt = 0;
	while (insertAt < MAX_WORDS) {
	if (frequency > mFrequencies[insertAt]
	\|\| (mFrequencies[insertAt] == frequency
	&& length < wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
	break;
	}
	insertAt++;
	}
	if (insertAt < MAX_WORDS) {
	memmove((char) mFrequencies + (insertAt + 1) sizeof(mFrequencies[0]),
	(char) mFrequencies + insertAt sizeof(mFrequencies[0]),
	(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
	mFrequencies[insertAt] = frequency;
	memmove((char) mOutputChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
	(char) mOutputChars + (insertAt ) MAX_WORD_LENGTH * sizeof(short),
	(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
	unsigned short dest = mOutputChars + (insertAt ) MAX_WORD_LENGTH;
	while (length--) {
	dest++ = word++;
	}
	*dest = 0; // NULL terminate
	if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
	return true;
	}
	return false;
	}

	bool
	UnigramDictionary::addWordBigram(unsigned short *word, int length, int frequency)
	{
	word[length] = 0;
	if (DEBUG_DICT) {
	char s[length + 1];
	for (int i = 0; i <= length; i++) s[i] = word[i];
	LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
	}

	// Find the right insertion point
	int insertAt = 0;
	while (insertAt < mMaxBigrams) {
	if (frequency > mBigramFreq[insertAt]
	\|\| (mBigramFreq[insertAt] == frequency
	&& length < wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
	break;
	}
	insertAt++;
	}
	LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
	if (insertAt < mMaxBigrams) {
	memmove((char) mBigramFreq + (insertAt + 1) sizeof(mBigramFreq[0]),
	(char) mBigramFreq + insertAt sizeof(mBigramFreq[0]),
	(mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
	mBigramFreq[insertAt] = frequency;
	memmove((char) mBigramChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
	(char) mBigramChars + (insertAt ) MAX_WORD_LENGTH * sizeof(short),
	(mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
	unsigned short dest = mBigramChars + (insertAt ) MAX_WORD_LENGTH;
	while (length--) {
	dest++ = word++;
	}
	*dest = 0; // NULL terminate
	if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
	return true;
	}
	return false;
	}

	unsigned short
	UnigramDictionary::toLowerCase(unsigned short c) {
	if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
	c = BASE_CHARS[c];
	}
	if (c >='A' && c <= 'Z') {
	c \|= 32;
	} else if (c > 127) {
	c = latin_tolower(c);
	}
	return c;
	}

	bool
	UnigramDictionary::sameAsTyped(unsigned short *word, int length)
	{
	if (length != mInputLength) {
	return false;
	}
	int *inputCodes = mInputCodes;
	while (length--) {
	if ((unsigned int) inputCodes != (unsigned int) word) {
	return false;
	}
	inputCodes += MAX_ALTERNATIVES;
	word++;
	}
	return true;
	}

	static char QUOTE = '\'';

	// Recursively visits the node group at `pos`, matching each node's character against the
	// typed input and reporting candidate words through addWord().
	//
	//   pos             - dictionary offset of the node group (positioned at its count).
	//   depth           - index in mWord where this level's character is written.
	//   maxDepth        - the recursion stops once depth exceeds this value.
	//   completion      - when true the typed characters are not consulted any more; it is
	//                     forced to true once inputIndex reaches mInputLength.
	//   snr             - weight accumulated along the path; multiplied into reported scores.
	//   inputIndex      - index of the typed position currently being matched.
	//   diffs           - grows by one each time a match uses an alternative other than the
	//                     first one; the branch is dropped when it exceeds mMaxEditDistance.
	//   skipPos         - depth at which a dictionary character is accepted without consuming
	//                     a typed position; callers pass a negative value for "none".
	//   nextLetters,
	//   nextLettersSize - forwarded unchanged to registerNextLetter().
	void
	UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
	        int diffs, int skipPos, int *nextLetters, int nextLettersSize)
	{
	    // Optimization: Prune out words that are too long compared to how much was typed.
	    if (depth > maxDepth) {
	        return;
	    }
	    if (diffs > mMaxEditDistance) {
	        return;
	    }
	    int count = getCount(&pos);
	    int *currentChars = NULL;
	    if (mInputLength <= inputIndex) {
	        // Every typed position has been consumed; currentChars stays NULL and is never
	        // dereferenced because only the completion branch below can run.
	        completion = true;
	    } else {
	        currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
	    }

	    for (int i = 0; i < count; i++) {
	        // -- at char
	        unsigned short c = getChar(&pos);
	        // -- at flag/add
	        unsigned short lowerC = toLowerCase(c);
	        bool terminal = getTerminal(&pos);
	        int childrenAddress = getAddress(&pos);
	        // -- after address or flag
	        int freq = 1;
	        if (terminal) freq = getFreq(&pos);
	        // -- after add or freq

	        // If we are only doing completions, no need to look at the typed characters.
	        if (completion) {
	            mWord[depth] = c;
	            if (terminal) {
	                addWord(mWord, depth + 1, freq * snr);
	                if (depth >= mInputLength && skipPos < 0) {
	                    registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
	                }
	            }
	            if (childrenAddress != 0) {
	                getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
	                        diffs, skipPos, nextLetters, nextLettersSize);
	            }
	        } else if ((c == QUOTE && currentChars[0] != QUOTE) || skipPos == depth) {
	            // Skip the ' or other letter and continue deeper
	            // (inputIndex is not advanced: the typed position is matched again one level down).
	            mWord[depth] = c;
	            if (childrenAddress != 0) {
	                getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
	                        skipPos, nextLetters, nextLettersSize);
	            }
	        } else {
	            // Try the alternatives for this typed position; the list ends at the first
	            // entry that is not positive.
	            int j = 0;
	            while (currentChars[j] > 0) {
	                if (currentChars[j] == lowerC || currentChars[j] == c) {
	                    // Only the first alternative earns the typed-letter multiplier.
	                    int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
	                    mWord[depth] = c;
	                    if (mInputLength == inputIndex + 1) {
	                        // This node matches the last typed position.
	                        if (terminal) {
	                            if (//INCLUDE_TYPED_WORD_IF_VALID ||
	                                    !sameAsTyped(mWord, depth + 1)) {
	                                int finalFreq = freq * snr * addedWeight;
	                                if (skipPos < 0) finalFreq *= mFullWordMultiplier;
	                                addWord(mWord, depth + 1, finalFreq);
	                            }
	                        }
	                        if (childrenAddress != 0) {
	                            getWordsRec(childrenAddress, depth + 1,
	                                    maxDepth, true, snr * addedWeight, inputIndex + 1,
	                                    diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
	                        }
	                    } else if (childrenAddress != 0) {
	                        getWordsRec(childrenAddress, depth + 1, maxDepth,
	                                false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
	                                skipPos, nextLetters, nextLettersSize);
	                    }
	                }
	                j++;
	                // When a skip position is in use only the first alternative is tried.
	                if (skipPos >= 0) break;
	            }
	        }
	    }
	}

	// Decodes the address stored in the three dictionary bytes starting at *pos: the low six
	// bits of the first byte are the most significant part, followed by the next two bytes.
	// When `advance` is true, *pos is moved past those three bytes.
	int
	UnigramDictionary::getBigramAddress(int *pos, bool advance)
	{
	    const int start = *pos;
	    const int highBits = (mDict[start] & 0x3F) << 16;
	    const int middleBits = (mDict[start + 1] & 0xFF) << 8;
	    const int lowBits = (mDict[start + 2] & 0xFF);

	    if (advance) {
	        *pos = start + 3;
	    }

	    return highBits + middleBits + lowBits;
	}

	// Reads the dictionary byte at *pos, advances *pos by one, and returns that byte masked
	// with FLAG_BIGRAM_FREQ.
	int
	UnigramDictionary::getBigramFreq(int *pos)
	{
	    const int rawByte = mDict[*pos];
	    *pos += 1;
	    return rawByte & FLAG_BIGRAM_FREQ;
	}


	// Looks up the bigram list attached to `prevWord` and hands every listed entry to
	// searchForTerminalNode().
	//
	//   prevWord, prevWordLength - the previous word, looked up with isValidWordRec().
	//   codes, codesSize         - typed input codes; stored in mInputCodes / mInputLength.
	//   bigramChars, bigramFreq  - caller-owned buffers; stored in mBigramChars / mBigramFreq.
	//   maxBigrams               - upper bound on the number of entries processed.
	//   maxWordLength,
	//   maxAlternatives          - not used by this function.
	//
	// Returns the number of bigram entries processed, or 0 when mBigram is not 1, the
	// dictionary version check fails, or isValidWordRec() reports a negative position.
	//
	// The buffer parameters are pointers: they are assigned to pointer members and prevWord
	// is forwarded to isValidWordRec(), which takes an unsigned short *.
	int
	UnigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
	        unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
	        int maxAlternatives)
	{
	    mBigramFreq = bigramFreq;
	    mBigramChars = bigramChars;
	    mInputCodes = codes;
	    mInputLength = codesSize;
	    mMaxBigrams = maxBigrams;

	    if (mBigram == 1 && checkIfDictVersionIsLatest()) {
	        int pos = isValidWordRec(
	                DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
	        LOGI("Pos -> %d\n", pos);
	        if (pos < 0) {
	            return 0;
	        }

	        int bigramCount = 0;
	        int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
	        if (bigramExist > 0) {
	            int nextBigramExist = 1;
	            while (nextBigramExist > 0 && bigramCount < maxBigrams) {
	                // Three address bytes, then one byte holding the frequency bits and the
	                // "continued" flag.
	                int bigramAddress = getBigramAddress(&pos, true);
	                int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
	                // search for all bigrams and store them
	                searchForTerminalNode(bigramAddress, frequency);
	                nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
	                bigramCount++;
	            }
	        }

	        return bigramCount;
	    }
	    return 0;
	}

	// Walks down from the root node group (at DICTIONARY_HEADER_SIZE) looking for the
	// terminal node whose character byte sits at `addressLookingFor`, rebuilding the word one
	// character per level. When the node is found and checkFirstCharacter() accepts the word,
	// it is reported through addWordBigram() with `frequency`.
	//
	// At each level the child address to descend into is the last one seen that is not
	// greater than addressLookingFor.
	//
	// Nothing is reported when the node is not found; the search is also abandoned once the
	// word would no longer fit in MAX_WORD_LENGTH characters, which bounds the loop as well.
	void
	UnigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency)
	{
	    // track word with such address and store it in an array
	    unsigned short word[MAX_WORD_LENGTH];

	    int pos;
	    int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
	    bool found = false;
	    char followingChar = ' ';
	    int depth = -1;

	    while(!found) {
	        // One pass may write word[depth] and word[depth + 1]; stop before either write
	        // could land outside the buffer.
	        if (depth + 1 >= MAX_WORD_LENGTH) {
	            break;
	        }

	        bool followDownAddressSearchStop = false;
	        bool firstAddress = true;
	        bool haveToSearchAll = true;

	        if (depth >= 0) {
	            word[depth] = (unsigned short) followingChar;
	        }
	        pos = followDownBranchAddress; // pos start at count
	        int count = mDict[pos] & 0xFF;
	        LOGI("count - %d\n",count);
	        pos++;
	        for (int i = 0; i < count; i++) {
	            // pos at data
	            pos++;
	            // pos now at flag
	            if (!getFirstBitOfByte(&pos)) { // non-terminal
	                if (!followDownAddressSearchStop) {
	                    int addr = getBigramAddress(&pos, false);
	                    if (addr > addressLookingFor) {
	                        followDownAddressSearchStop = true;
	                        if (firstAddress) {
	                            firstAddress = false;
	                            haveToSearchAll = true;
	                        } else if (!haveToSearchAll) {
	                            break;
	                        }
	                    } else {
	                        followDownBranchAddress = addr;
	                        followingChar = (char)(0xFF & mDict[pos-1]);
	                        if (firstAddress) {
	                            firstAddress = false;
	                            haveToSearchAll = false;
	                        }
	                    }
	                }
	                pos += 3;
	            } else if (getFirstBitOfByte(&pos)) { // terminal
	                if (addressLookingFor == (pos-1)) { // found !!
	                    depth++;
	                    word[depth] = (0xFF & mDict[pos-1]);
	                    found = true;
	                    break;
	                }
	                if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
	                    if (!followDownAddressSearchStop) {
	                        int addr = getBigramAddress(&pos, false);
	                        if (addr > addressLookingFor) {
	                            followDownAddressSearchStop = true;
	                            if (firstAddress) {
	                                firstAddress = false;
	                                haveToSearchAll = true;
	                            } else if (!haveToSearchAll) {
	                                break;
	                            }
	                        } else {
	                            followDownBranchAddress = addr;
	                            followingChar = (char)(0xFF & mDict[pos-1]);
	                            if (firstAddress) {
	                                firstAddress = false;
	                                haveToSearchAll = true;
	                            }
	                        }
	                    }
	                    pos += 4;
	                } else { // freq only (2 byte)
	                    pos += 2;
	                }

	                // skipping bigram
	                int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
	                if (bigramExist > 0) {
	                    int nextBigramExist = 1;
	                    while (nextBigramExist > 0) {
	                        pos += 3;
	                        nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
	                    }
	                } else {
	                    pos++;
	                }
	            }
	        }
	        // After a successful match this turns the last index into the word length.
	        depth++;
	        if (followDownBranchAddress == 0) {
	            LOGI("ERROR!!! Cannot find bigram!!");
	            break;
	        }
	    }
	    // Only report a word that was actually located; otherwise `word` holds a partial
	    // (possibly uninitialised) path.
	    if (found && checkFirstCharacter(word)) {
	        addWordBigram(word, depth, frequency);
	    }
	}

	// Checks whether this word starts with same character or neighboring characters of
	// what user typed.
	//
	// Compares word[0] with each of the first MAX_ALTERNATIVES entries of mInputCodes and
	// returns true on the first equal value, false if none matches. The values are compared,
	// not the pointers: comparing addresses could never match a typed code.
	bool
	UnigramDictionary::checkFirstCharacter(unsigned short *word)
	{
	    int *inputCodes = mInputCodes;
	    int maxAlt = MAX_ALTERNATIVES;
	    while (maxAlt > 0) {
	        if ((unsigned int) *inputCodes == (unsigned int) *word) {
	            return true;
	        }
	        inputCodes++;
	        maxAlt--;
	    }
	    return false;
	}

	// TODO: Move to parent dictionary
	// Returns true when isValidWordRec() finds `word` (of `length` characters). The search
	// starts after the header when checkIfDictVersionIsLatest() holds, and at offset 0
	// otherwise.
	bool
	UnigramDictionary::isValidWord(unsigned short *word, int length)
	{
	    const int rootPos = checkIfDictVersionIsLatest() ? DICTIONARY_HEADER_SIZE : 0;
	    const int lookupResult = isValidWordRec(rootPos, word, 0, length);
	    return lookupResult != NOT_VALID_WORD;
	}

	// Recursively matches word[offset..length-1] against the node group at `pos`.
	//
	// returns address of bigram data of that word
	// return -99 (NOT_VALID_WORD) if not found
	//
	//   pos    - dictionary offset of the node group to scan (positioned at its count).
	//   word   - characters to look up; compared exactly against the stored characters.
	//   offset - index of the character matched at this level.
	//   length - number of characters in `word`.
	int
	UnigramDictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
	    // Nothing left to match (e.g. an empty word): bail out before reading word[offset].
	    // Such a call could never reach the "last character" test below anyway.
	    if (offset >= length) {
	        return NOT_VALID_WORD;
	    }

	    int count = getCount(&pos);
	    unsigned short currentChar = (unsigned short) word[offset];
	    for (int j = 0; j < count; j++) {
	        unsigned short c = getChar(&pos);
	        int terminal = getTerminal(&pos);
	        int childPos = getAddress(&pos);
	        if (c == currentChar) {
	            if (offset == length - 1) {
	                if (terminal) {
	                    // pos is at the frequency here; the returned position is one past it.
	                    return (pos+1);
	                }
	            } else {
	                if (childPos != 0) {
	                    int t = isValidWordRec(childPos, word, offset + 1, length);
	                    if (t > 0) {
	                        return t;
	                    }
	                }
	            }
	        }
	        if (terminal) {
	            // Result unused: the call only moves pos past the frequency.
	            getFreq(&pos);
	        }
	        // There could be two instances of each alphabet - upper and lower case. So continue
	        // looking ...
	    }
	    return NOT_VALID_WORD;
	}
	} // namespace latinime