blob: 42951bf7a08fc4552fb4fc9a6c531efd63cd6d58 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "basechars.h"
24#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090025#include "dictionary.h"
26#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090027
28namespace latinime {
29
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090030const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
31 { { 'a', 'e' },
32 { 'o', 'e' },
33 { 'u', 'e' } };
34
Jean Chalard293ece02011-06-16 20:55:16 +090035// TODO: check the header
36UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090037 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090038 const bool isLatestDictVersion)
Jean Chalard293ece02011-06-16 20:55:16 +090039 : DICT_ROOT(streamStart),
40 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090041 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
42 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090043 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalarda787dba2011-03-04 12:17:48 +090044 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
45 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090046 if (DEBUG_DICT) {
47 LOGI("UnigramDictionary - constructor");
48 }
satok30088252010-12-01 21:22:15 +090049}
50
satok18c28f42010-12-02 18:11:54 +090051UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090052
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090053static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
54 const int MAX_PROXIMITY_CHARS) {
55 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
56}
57
58bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
59
60 // There can't be a digraph if we don't have at least 2 characters to examine
61 if (i + 2 > codesSize) return false;
62
63 // Search for the first char of some digraph
64 int lastDigraphIndex = -1;
65 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
66 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
67 lastDigraphIndex >= 0; --lastDigraphIndex) {
68 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
69 }
70 // No match: return early
71 if (lastDigraphIndex < 0) return false;
72
73 // It's an interesting digraph if the second char matches too.
74 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
75}
76
77// Mostly the same arguments as the non-recursive version, except:
78// codes is the original value. It points to the start of the work buffer, and gets passed as is.
79// codesSize is the size of the user input (thus, it is the size of codesSrc).
80// codesDest is the current point in the work buffer.
81// codesSrc is the current point in the user-input, original, content-unmodified buffer.
82// codesRemain is the remaining size in codesSrc.
83void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
84 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
85 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080086 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090087
Jean Chalarda787dba2011-03-04 12:17:48 +090088 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
89 for (int i = 0; i < codesRemain; ++i) {
90 if (isDigraph(codesSrc, i, codesRemain)) {
91 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090092
Jean Chalarda787dba2011-03-04 12:17:48 +090093 // Copy the word up to the first char of the digraph, then continue processing
94 // on the remaining part of the word, skipping the second char of the digraph.
95 // In our example, copy "pru" and continue running on "fen"
96 // Make i the index of the second char of the digraph for simplicity. Forgetting
97 // to do that results in an infinite recursion so take care!
98 ++i;
99 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
100 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
101 codesBuffer, codesBufferSize, flags,
102 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
103 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
104 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900105
Jean Chalarda787dba2011-03-04 12:17:48 +0900106 // Copy the second char of the digraph in place, then continue processing on
107 // the remaining part of the word.
108 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
109 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
110 BYTES_IN_ONE_CHAR);
111 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
112 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
113 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
114 outWords, frequencies);
115 return;
116 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900117 }
118 }
119
120 // If we come here, we hit the end of the word: let's check it against the dictionary.
121 // In our example, we'll come here once for "prufen" and then once for "pruefen".
122 // If the word contains several digraphs, we'll come it for the product of them.
123 // eg. if the word is "ueberpruefen" we'll test, in order, against
124 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
125 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
126 if (0 != remainingBytes)
127 memcpy(codesDest, codesSrc, remainingBytes);
128
129 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
130 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
131}
132
133int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
134 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
135 unsigned short *outWords, int *frequencies) {
136
137 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
138 { // Incrementally tune the word and try all possibilities
139 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
140 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900141 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900142 } else { // Normal processing
143 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
144 outWords, frequencies);
145 }
146
satok817e5172011-03-04 06:06:45 -0800147 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900148 // Get the word count
149 int suggestedWordsCount = 0;
150 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
151 suggestedWordsCount++;
152 }
153
154 if (DEBUG_DICT) {
155 LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard980d6b62011-06-30 17:02:23 +0900156 /// Print the returned words
157 for (int j = 0; j < suggestedWordsCount; ++j) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700158#ifdef FLAG_DBG
Jean Chalard980d6b62011-06-30 17:02:23 +0900159 short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
160 char s[MAX_WORD_LENGTH];
161 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
Doug Kwance9efbf2011-07-07 22:53:50 -0700162#endif
Jean Chalard980d6b62011-06-30 17:02:23 +0900163 LOGI("%s %i", s, mFrequencies[j]);
164 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900165 LOGI("Next letters: ");
166 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
167 if (mNextLettersFrequency[k] > 0) {
168 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
169 }
170 }
171 }
satok817e5172011-03-04 06:06:45 -0800172 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900173 PROF_CLOSE;
174 return suggestedWordsCount;
175}
176
177void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
178 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
179 unsigned short *outWords, int *frequencies) {
180
satok61e2f852011-01-05 14:13:07 +0900181 PROF_OPEN;
182 PROF_START(0);
satok30088252010-12-01 21:22:15 +0900183 initSuggestions(codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900184 if (DEBUG_DICT) assert(codesSize == mInputLength);
185
satoka3d78f62010-12-09 22:08:33 +0900186 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900187 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900188
satok61e2f852011-01-05 14:13:07 +0900189 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900190 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900191 PROF_END(1);
192
193 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900194 // Suggestion with missing character
195 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900196 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900197 if (DEBUG_DICT) {
198 LOGI("--- Suggest missing characters %d", i);
199 }
satok54fe9e02010-12-13 14:42:35 +0900200 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900201 }
202 }
satok61e2f852011-01-05 14:13:07 +0900203 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900204
satok61e2f852011-01-05 14:13:07 +0900205 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900206 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900207 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
208 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900209 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900210 if (DEBUG_DICT) {
211 LOGI("--- Suggest excessive characters %d", i);
212 }
satok54fe9e02010-12-13 14:42:35 +0900213 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900214 }
215 }
satok61e2f852011-01-05 14:13:07 +0900216 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900217
satok61e2f852011-01-05 14:13:07 +0900218 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900219 // Suggestion with transposed characters
220 // Only suggest words that length is mInputLength
221 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
222 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900223 if (DEBUG_DICT) {
224 LOGI("--- Suggest transposed characters %d", i);
225 }
satok54fe9e02010-12-13 14:42:35 +0900226 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900227 }
228 }
satok61e2f852011-01-05 14:13:07 +0900229 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900230
satok61e2f852011-01-05 14:13:07 +0900231 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900232 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900233 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
234 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900235 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900236 if (DEBUG_DICT) {
237 LOGI("--- Suggest missing space characters %d", i);
238 }
satok662fe692010-12-08 17:05:39 +0900239 getMissingSpaceWords(mInputLength, i);
240 }
241 }
satok61e2f852011-01-05 14:13:07 +0900242 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800243
244 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900245 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800246 // The first and last "mistyped spaces" are taken care of by excessive character handling
247 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900248 if (DEBUG_DICT) {
249 LOGI("--- Suggest words with proximity space %d", i);
250 }
satok817e5172011-03-04 06:06:45 -0800251 const int x = xcoordinates[i];
252 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900253 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800254 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
255 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900256 }
satok817e5172011-03-04 06:06:45 -0800257 if (proximityInfo->hasSpaceProximity(x, y)) {
258 getMistypedSpaceWords(mInputLength, i);
259 }
satok817e5172011-03-04 06:06:45 -0800260 }
261 }
262 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900263}
264
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900265void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
266 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900267 if (DEBUG_DICT) {
268 LOGI("initSuggest");
269 }
satok30088252010-12-01 21:22:15 +0900270 mFrequencies = frequencies;
271 mOutputChars = outWords;
272 mInputCodes = codes;
273 mInputLength = codesSize;
274 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
275}
276
Jean Chalard8124e642011-06-16 22:33:41 +0900277static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900278 if (c < nextLettersSize) {
279 nextLetters[c]++;
280 }
281}
282
satok662fe692010-12-08 17:05:39 +0900283// TODO: We need to optimize addWord by using STL or something
Jean Chalardca5ef282011-06-17 15:36:26 +0900284// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok28bd03b2010-12-03 16:39:16 +0900285bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900286 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900287 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700288#ifdef FLAG_DBG
satok30088252010-12-01 21:22:15 +0900289 char s[length + 1];
290 for (int i = 0; i <= length; i++) s[i] = word[i];
Doug Kwance9efbf2011-07-07 22:53:50 -0700291#endif
satok662fe692010-12-08 17:05:39 +0900292 LOGI("Found word = %s, freq = %d", s, frequency);
satok30088252010-12-01 21:22:15 +0900293 }
satokf5cded12010-12-06 21:28:24 +0900294 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900295 if (DEBUG_DICT) {
296 LOGI("Exceeded max word length.");
297 }
satokf5cded12010-12-06 21:28:24 +0900298 return false;
299 }
satok30088252010-12-01 21:22:15 +0900300
301 // Find the right insertion point
302 int insertAt = 0;
303 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900304 // TODO: How should we sort words with the same frequency?
305 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900306 break;
307 }
308 insertAt++;
309 }
310 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900311 if (DEBUG_DICT) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700312#ifdef FLAG_DBG
satokcdbbea72010-12-08 16:04:16 +0900313 char s[length + 1];
314 for (int i = 0; i <= length; i++) s[i] = word[i];
Doug Kwance9efbf2011-07-07 22:53:50 -0700315#endif
satokb2e5e592011-04-26 14:50:54 +0900316 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satokcdbbea72010-12-08 16:04:16 +0900317 }
satok30088252010-12-01 21:22:15 +0900318 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
319 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
320 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
321 mFrequencies[insertAt] = frequency;
322 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900323 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900324 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900325 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900326 while (length--) {
327 *dest++ = *word++;
328 }
329 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900330 if (DEBUG_DICT) {
331 LOGI("Added word at %d", insertAt);
332 }
satok30088252010-12-01 21:22:15 +0900333 return true;
334 }
335 return false;
336}
337
Jean Chalard8124e642011-06-16 22:33:41 +0900338static inline unsigned short toBaseLowerCase(unsigned short c) {
satok30088252010-12-01 21:22:15 +0900339 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
340 c = BASE_CHARS[c];
341 }
342 if (c >='A' && c <= 'Z') {
343 c |= 32;
344 } else if (c > 127) {
345 c = latin_tolower(c);
346 }
347 return c;
348}
349
Jean Chalardca5ef282011-06-17 15:36:26 +0900350bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {
satok30088252010-12-01 21:22:15 +0900351 if (length != mInputLength) {
352 return false;
353 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900354 const int *inputCodes = mInputCodes;
satok30088252010-12-01 21:22:15 +0900355 while (length--) {
356 if ((unsigned int) *inputCodes != (unsigned int) *word) {
357 return false;
358 }
satok662fe692010-12-08 17:05:39 +0900359 inputCodes += MAX_PROXIMITY_CHARS;
satok30088252010-12-01 21:22:15 +0900360 word++;
361 }
362 return true;
363}
364
satok715514d2010-12-02 20:19:59 +0900365static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900366static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900367
satok54fe9e02010-12-13 14:42:35 +0900368void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900369 const int excessivePos, const int transposedPos, int *nextLetters,
370 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900371 if (DEBUG_DICT) {
372 LOGI("getSuggestionCandidates %d", maxDepth);
373 assert(transposedPos + 1 < mInputLength);
374 assert(excessivePos < mInputLength);
375 assert(missingPos < mInputLength);
376 }
satok662fe692010-12-08 17:05:39 +0900377 int rootPosition = ROOT_POS;
Jean Chalard980d6b62011-06-30 17:02:23 +0900378 // Get the number of children of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900379 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900380 int depth = 0;
381
382 mStackChildCount[0] = childCount;
383 mStackTraverseAll[0] = (mInputLength <= 0);
384 mStackNodeFreq[0] = 1;
385 mStackInputIndex[0] = 0;
386 mStackDiffs[0] = 0;
387 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900388 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900389
satok662fe692010-12-08 17:05:39 +0900390 // Depth first search
satokd2997922010-12-07 13:08:39 +0900391 while (depth >= 0) {
392 if (mStackChildCount[depth] > 0) {
393 --mStackChildCount[depth];
394 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900395 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900396 int inputIndex = mStackInputIndex[depth];
397 int diffs = mStackDiffs[depth];
398 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900399 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900400 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900401 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900402 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900403 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900404 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
405 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
406 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900407 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900408 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900409 mStackSiblingPos[depth] = siblingPos;
410 if (needsToTraverseChildrenNodes) {
411 // Goes to child node
412 ++depth;
413 mStackChildCount[depth] = childCount;
414 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900415 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900416 mStackInputIndex[depth] = inputIndex;
417 mStackDiffs[depth] = diffs;
418 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900419 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900420 }
421 } else {
satokcdbbea72010-12-08 16:04:16 +0900422 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900423 --depth;
424 }
425 }
426}
427
satokb2e5e592011-04-26 14:50:54 +0900428static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
429static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
430 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
431}
432
433static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
434inline static void multiplyIntCapped(const int multiplier, int *base) {
435 const int temp = *base;
436 if (temp != S_INT_MAX) {
437 // Branch if multiplier == 2 for the optimization
438 if (multiplier == 2) {
439 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
440 } else {
441 const int tempRetval = temp * multiplier;
442 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
443 }
444 }
445}
446
447inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900448 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900449 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900450 } else {
satokb2e5e592011-04-26 14:50:54 +0900451 int ret = base;
452 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
453 return ret;
454 }
455}
456
457inline static void multiplyRate(const int rate, int *freq) {
458 if (*freq != S_INT_MAX) {
459 if (*freq > 1000000) {
460 *freq /= 100;
461 multiplyIntCapped(rate, freq);
462 } else {
463 multiplyIntCapped(rate, freq);
464 *freq /= 100;
465 }
satokf7425bb2011-01-05 16:37:53 +0900466 }
467}
468
satok4c981d32011-04-19 13:58:42 +0900469inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900470 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
471 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900472 if (firstWordLength == 0 || secondWordLength == 0) {
473 return 0;
474 }
475 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
476 int tempFirstFreq = firstFreq;
477 multiplyRate(firstDemotionRate, &tempFirstFreq);
478
479 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
480 int tempSecondFreq = secondFreq;
481 multiplyRate(secondDemotionRate, &tempSecondFreq);
482
483 const int totalLength = firstWordLength + secondWordLength;
484
485 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
486 // length.
487 int totalFreq = tempFirstFreq + tempSecondFreq;
488
489 // This is a workaround to try offsetting the not-enough-demotion which will be done in
490 // calcNormalizedScore in Utils.java.
491 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
492 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
493 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
494 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
495 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
496
497 // At this moment, totalFreq is calculated by the following formula:
498 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
499 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
500
satokb2e5e592011-04-26 14:50:54 +0900501 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900502
503 // This is another workaround to offset the demotion which will be done in
504 // calcNormalizedScore in Utils.java.
505 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
506 // the same amount because we already have adjusted the synthetic freq of this "missing or
507 // mistyped space" suggestion candidate above in this method.
508 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
509 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
510
satokd8db9f82011-05-18 15:31:04 +0900511 if (isSpaceProximity) {
512 // A word pair with one space proximity correction
513 if (DEBUG_DICT) {
514 LOGI("Found a word pair with space proximity correction.");
515 }
516 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
517 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
518 }
519
satok4c981d32011-04-19 13:58:42 +0900520 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
521 return totalFreq;
522}
523
satok817e5172011-03-04 06:06:45 -0800524bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
525 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900526 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800527}
528
529bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
530 return getSplitTwoWordsSuggestion(
531 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900532 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800533}
534
satok58c49b92011-01-27 03:23:39 +0900535inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900536 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900537 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900538 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900539 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900540 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900541 if (mInputLength >= 2) {
542 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
543 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
544 / (10 * mInputLength
545 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900546 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900547 LOGI("Demotion rate for missing character is %d.", demotionRate);
548 }
satokdc5301e2011-04-11 16:14:45 +0900549 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900550 } else {
551 finalFreq = 0;
552 }
553 }
satokf7425bb2011-01-05 16:37:53 +0900554 if (transposedPos >= 0) multiplyRate(
555 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900556 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900557 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900558 if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
satokf7425bb2011-01-05 16:37:53 +0900559 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900560 }
561 }
satok58c49b92011-01-27 03:23:39 +0900562 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900563 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900564 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900565 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900566 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900567 if (DEBUG_DICT) {
568 LOGI("Found full matched word.");
569 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900570 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
571 }
572 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900573 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900574 }
satok9674f652011-04-20 17:15:27 +0900575 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900576 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900577 if (DEBUG_DICT) {
578 LOGI("Found one proximity correction.");
579 }
satokb2e5e592011-04-26 14:50:54 +0900580 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900581 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900582 }
satok9674f652011-04-20 17:15:27 +0900583 if (DEBUG_DICT) {
584 LOGI("calc: %d, %d", depth, sameLength);
585 }
satokb2e5e592011-04-26 14:50:54 +0900586 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900587 return finalFreq;
588}
satoka3d78f62010-12-09 22:08:33 +0900589
satok28bd03b2010-12-03 16:39:16 +0900590inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900591 const int inputIndex, const int skipPos, const int depth) {
satok8fbd5522011-02-22 17:28:55 +0900592 const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
satok28bd03b2010-12-03 16:39:16 +0900593 // Skip the ' or other letter and continue deeper
594 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
595}
596
satoke07baa62010-12-09 21:55:40 +0900597inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
Jean Chalard07a84062011-03-03 10:22:10 +0900598 const int inputLength) const {
satoke07baa62010-12-09 21:55:40 +0900599 if (inputIndex < 0 || inputIndex >= inputLength) return false;
600 const int currentChar = *getInputCharsAt(inputIndex);
601 const int leftIndex = inputIndex - 1;
602 if (leftIndex >= 0) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900603 const int *leftChars = getInputCharsAt(leftIndex);
satoke07baa62010-12-09 21:55:40 +0900604 int i = 0;
605 while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
606 if (leftChars[i++] == currentChar) return true;
607 }
608 }
609 const int rightIndex = inputIndex + 1;
610 if (rightIndex < inputLength) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900611 const int *rightChars = getInputCharsAt(rightIndex);
satoke07baa62010-12-09 21:55:40 +0900612 int i = 0;
613 while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
614 if (rightChars[i++] == currentChar) return true;
615 }
616 }
617 return false;
618}
619
Jean Chalarda5d58492011-02-18 17:50:58 +0900620// In the following function, c is the current character of the dictionary word
621// currently examined.
622// currentChars is an array containing the keys close to the character the
623// user actually typed at the same position. We want to see if c is in it: if so,
624// then the word contains at that position a character close to what the user
625// typed.
626// What the user typed is actually the first character of the array.
627// Notice : accented characters do not have a proximity list, so they are alone
628// in their list. The non-accented version of the character should be considered
629// "close", but not the other keys close to the non-accented version.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900630inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
631 const int *currentChars, const unsigned short c, const int skipPos,
632 const int excessivePos, const int transposedPos) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900633 const unsigned short baseLowerC = toBaseLowerCase(c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900634
635 // The first char in the array is what user typed. If it matches right away,
636 // that means the user typed that same char for this pos.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900637 if (currentChars[0] == baseLowerC || currentChars[0] == c)
Jean Chalarda5d58492011-02-18 17:50:58 +0900638 return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
639
640 // If one of those is true, we should not check for close characters at all.
641 if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
642 return UNRELATED_CHAR;
643
644 // If the non-accented, lowercased version of that first character matches c,
645 // then we have a non-accented version of the accented character the user
646 // typed. Treat it as a close char.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900647 if (toBaseLowerCase(currentChars[0]) == baseLowerC)
Jean Chalarda5d58492011-02-18 17:50:58 +0900648 return NEAR_PROXIMITY_CHAR;
649
650 // Not an exact nor an accent-alike match: search the list of close keys
651 int j = 1;
satoke07baa62010-12-09 21:55:40 +0900652 while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900653 const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900654 if (matched) return NEAR_PROXIMITY_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900655 ++j;
656 }
Jean Chalarda5d58492011-02-18 17:50:58 +0900657
658 // Was not included, signal this as an unrelated character.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900659 return UNRELATED_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900660}
661
Jean Chalardca5ef282011-06-17 15:36:26 +0900662inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900663 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900664 const int inputIndex, const int matchWeight, const int skipPos,
665 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
666 int* nextLetters, const int nextLettersSize) {
667
668 const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;
Jean Chalard980d6b62011-06-30 17:02:23 +0900669 if (isSameAsTyped) return;
Jean Chalardca5ef282011-06-17 15:36:26 +0900670
671 if (depth >= MIN_SUGGEST_DEPTH) {
672 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
673 excessivePos, transposedPos, freq, sameLength);
674 if (!isSameAsTyped)
675 addWord(word, depth + 1, finalFreq);
Jean Chalardca5ef282011-06-17 15:36:26 +0900676 }
677
678 if (sameLength && depth >= mInputLength && skipPos < 0) {
679 registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);
680 }
681}
682
Jean Chalarde6715e32011-06-30 19:47:25 +0900683bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
684 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
685 const int secondWordLength, const bool isSpaceProximity) {
686 if (inputLength >= MAX_WORD_LENGTH) return false;
687 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
688 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
689 return false;
690 const int newWordLength = firstWordLength + secondWordLength + 1;
691 // Allocating variable length array on stack
692 unsigned short word[newWordLength];
693 const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
694 if (DEBUG_DICT) {
695 LOGI("First freq: %d", firstFreq);
696 }
697 if (firstFreq <= 0) return false;
698
699 for (int i = 0; i < firstWordLength; ++i) {
700 word[i] = mWord[i];
701 }
702
703 const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
704 if (DEBUG_DICT) {
705 LOGI("Second freq: %d", secondFreq);
706 }
707 if (secondFreq <= 0) return false;
708
709 word[firstWordLength] = SPACE;
710 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
711 word[i] = mWord[i - firstWordLength - 1];
712 }
713
714 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
715 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
716 if (DEBUG_DICT) {
717 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
718 TYPED_LETTER_MULTIPLIER);
719 }
720 addWord(word, newWordLength, pairFreq);
721 return true;
722}
723
Jean Chalardbc90c722011-06-20 21:09:04 +0900724#ifndef NEW_DICTIONARY_FORMAT
725// TODO: Don't forget to bring inline functions back to over where they are used.
satokcdbbea72010-12-08 16:04:16 +0900726
Jean Chalardbc90c722011-06-20 21:09:04 +0900727// The following functions will be entirely replaced with new implementations.
728void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
729 const int excessivePos, const int transposedPos,int *nextLetters,
730 const int nextLettersSize) {
731 int initialPosition = initialPos;
732 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
733 getWordsRec(count, initialPosition, 0,
734 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
735 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
736 nextLettersSize);
737}
Jean Chalardca5ef282011-06-17 15:36:26 +0900738
Jean Chalardbc90c722011-06-20 21:09:04 +0900739void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
740 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
741 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
742 const int transposedPos, int *nextLetters, const int nextLettersSize) {
743 int siblingPos = pos;
744 for (int i = 0; i < childrenCount; ++i) {
745 int newCount;
746 int newChildPosition;
747 bool newTraverseAllNodes;
748 int newMatchRate;
749 int newInputIndex;
750 int newDiffs;
751 int newSiblingPos;
752 int newOutputIndex;
753 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
754 traverseAllNodes, matchWeight, inputIndex, diffs,
755 skipPos, excessivePos, transposedPos,
756 nextLetters, nextLettersSize,
757 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
758 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
759 siblingPos = newSiblingPos;
satokcdbbea72010-12-08 16:04:16 +0900760
Jean Chalardbc90c722011-06-20 21:09:04 +0900761 if (needsToTraverseChildrenNodes) {
762 getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
763 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
764 nextLetters, nextLettersSize);
satok48e432c2010-12-06 17:38:58 +0900765 }
satok48e432c2010-12-06 17:38:58 +0900766 }
satok48e432c2010-12-06 17:38:58 +0900767}
768
Jean Chalard980d6b62011-06-30 17:02:23 +0900769inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
770 const int inputLength, unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900771 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900772 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900773 int maxFreq = 0;
774 int depth = 0;
775 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900776 bool terminal = false;
777
satokaee09dc2010-12-09 19:21:51 +0900778 mStackChildCount[0] = count;
779 mStackSiblingPos[0] = pos;
780
781 while (depth >= 0) {
782 if (mStackChildCount[depth] > 0) {
783 --mStackChildCount[depth];
784 int firstChildPos;
785 int newFreq;
786 int siblingPos = mStackSiblingPos[depth];
787 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
788 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
789 &siblingPos);
790 mStackSiblingPos[depth] = siblingPos;
791 if (depth == (inputLength - 1)) {
792 // Traverse sibling node
793 if (terminal) {
794 if (newFreq > maxFreq) {
795 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
796 if (DEBUG_DICT && DEBUG_NODE) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700797#ifdef FLAG_DBG
satokaee09dc2010-12-09 19:21:51 +0900798 char s[inputLength + 1];
799 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
800 s[inputLength] = 0;
Doug Kwance9efbf2011-07-07 22:53:50 -0700801#endif
satokaee09dc2010-12-09 19:21:51 +0900802 LOGI("New missing space word found: %d > %d (%s), %d, %d",
803 newFreq, maxFreq, s, inputLength, depth);
804 }
805 maxFreq = newFreq;
806 }
807 }
808 } else if (needsToTraverseChildrenNodes) {
809 // Traverse children nodes
810 ++depth;
811 mStackChildCount[depth] = count;
812 mStackSiblingPos[depth] = firstChildPos;
813 }
814 } else {
815 // Traverse parent node
816 --depth;
satok662fe692010-12-08 17:05:39 +0900817 }
818 }
satokaee09dc2010-12-09 19:21:51 +0900819
820 word[inputLength] = 0;
821 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900822}
823
824inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900825 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
826 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
827 const int inputIndex = startInputIndex + depth;
satok8fbd5522011-02-22 17:28:55 +0900828 const int *currentChars = getInputCharsAt(inputIndex);
satok662fe692010-12-08 17:05:39 +0900829 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900830 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
831 &c, newChildPosition, newTerminal, newFreq);
satokaee09dc2010-12-09 19:21:51 +0900832 const unsigned int inputC = currentChars[0];
Ken Wakasade3070a2011-03-19 09:16:42 +0900833 if (DEBUG_DICT) {
834 assert(inputC <= U_SHORT_MAX);
835 }
Jean Chalardf5f834a2011-02-22 15:12:46 +0900836 const unsigned short baseLowerC = toBaseLowerCase(c);
837 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900838 const bool hasChild = *newChildPosition != 0;
839 if (matched) {
840 word[depth] = c;
841 if (DEBUG_DICT && DEBUG_NODE) {
842 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900843 if (*newTerminal) {
844 LOGI("Terminal %d", *newFreq);
845 }
satok662fe692010-12-08 17:05:39 +0900846 }
satokaee09dc2010-12-09 19:21:51 +0900847 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900848 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900849 return true;
850 } else {
851 return false;
852 }
853 } else {
854 // If this node is not user typed character, this method treats this word as unmatched.
855 // Thus newTerminal shouldn't be true.
856 *newTerminal = false;
857 return false;
satok662fe692010-12-08 17:05:39 +0900858 }
satok662fe692010-12-08 17:05:39 +0900859}
Jean Chalard8124e642011-06-16 22:33:41 +0900860
861// TODO: use uint32_t instead of unsigned short
862bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
863 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900864 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900865 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900866 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900867 }
868}
869
Jean Chalard17e44a72011-06-16 22:51:11 +0900870
871// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900872int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
873 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900874 // returns address of bigram data of that word
875 // return -99 if not found
876
877 int count = Dictionary::getCount(DICT_ROOT, &pos);
878 unsigned short currentChar = (unsigned short) word[offset];
879 for (int j = 0; j < count; j++) {
880 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
881 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
882 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
883 if (c == currentChar) {
884 if (offset == length - 1) {
885 if (terminal) {
886 return (pos+1);
887 }
888 } else {
889 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900890 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900891 if (t > 0) {
892 return t;
893 }
894 }
895 }
896 }
897 if (terminal) {
898 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
899 }
900 // There could be two instances of each alphabet - upper and lower case. So continue
901 // looking ...
902 }
903 return NOT_VALID_WORD;
904}
905
Jean Chalardbc90c722011-06-20 21:09:04 +0900906// The following functions will be modified.
Jean Chalard0584f022011-06-30 19:23:16 +0900907inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
908 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
909 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalardbc90c722011-06-20 21:09:04 +0900910 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
911 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
912 int *nextSiblingPosition, int *nextOutputIndex) {
913 if (DEBUG_DICT) {
914 int inputCount = 0;
915 if (skipPos >= 0) ++inputCount;
916 if (excessivePos >= 0) ++inputCount;
917 if (transposedPos >= 0) ++inputCount;
918 assert(inputCount <= 1);
919 }
920 unsigned short c;
921 int childPosition;
922 bool terminal;
923 int freq;
924 bool isSameAsUserTypedLength = false;
925
Jean Chalard0584f022011-06-30 19:23:16 +0900926 const int pos = initialPos;
927 const int depth = initialDepth;
928 const int traverseAllNodes = initialTraverseAllNodes;
929 const int diffs = initialDiffs;
930
Jean Chalardbc90c722011-06-20 21:09:04 +0900931 const uint8_t flags = 0; // No flags for now
932
933 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
934
935 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
936 &c, &childPosition, &terminal, &freq);
937 *nextOutputIndex = depth + 1;
938
939 const bool needsToTraverseChildrenNodes = childPosition != 0;
940
941 // If we are only doing traverseAllNodes, no need to look at the typed characters.
942 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
943 mWord[depth] = c;
944 if (traverseAllNodes && terminal) {
945 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
946 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
947 }
948 if (!needsToTraverseChildrenNodes) return false;
949 *newTraverseAllNodes = traverseAllNodes;
950 *newMatchRate = matchWeight;
951 *newDiffs = diffs;
952 *newInputIndex = inputIndex;
953 } else {
954 const int *currentChars = getInputCharsAt(inputIndex);
955
956 if (transposedPos >= 0) {
957 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
958 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
959 }
960
961 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
962 transposedPos);
963 if (UNRELATED_CHAR == matchedProximityCharId) return false;
964 mWord[depth] = c;
965 // If inputIndex is greater than mInputLength, that means there is no
966 // proximity chars. So, we don't need to check proximity.
967 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
968 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
969 }
970 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
971 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
972 if (isSameAsUserTypedLength && terminal) {
973 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
974 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
975 }
976 if (!needsToTraverseChildrenNodes) return false;
977 // Start traversing all nodes after the index exceeds the user typed length
978 *newTraverseAllNodes = isSameAsUserTypedLength;
979 *newMatchRate = matchWeight;
980 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
981 *newInputIndex = inputIndex + 1;
982 }
983 // Optimization: Prune out words that are too long compared to how much was typed.
984 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
985 return false;
986 }
987
988 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
989 // TODO: Check if this can be isSameAsUserTypedLength only.
990 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
991 *newTraverseAllNodes = true;
992 }
993 // get the count of nodes and increment childAddress.
994 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
995 *newChildPosition = childPosition;
996 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
997 return needsToTraverseChildrenNodes;
998}
999
1000#else // NEW_DICTIONARY_FORMAT
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001001
Jean Chalard0584f022011-06-30 19:23:16 +09001002inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
1003 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
1004 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001005 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
1006 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
Jean Chalard432789a2011-06-30 17:50:48 +09001007 int *nextSiblingPosition, int *newOutputIndex) {
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001008 if (DEBUG_DICT) {
1009 int inputCount = 0;
1010 if (skipPos >= 0) ++inputCount;
1011 if (excessivePos >= 0) ++inputCount;
1012 if (transposedPos >= 0) ++inputCount;
1013 assert(inputCount <= 1);
1014 }
1015 unsigned short c;
1016 int childPosition;
1017 bool terminal;
1018 int freq;
1019 bool isSameAsUserTypedLength = false;
1020
Jean Chalard0584f022011-06-30 19:23:16 +09001021 int pos = initialPos;
1022 int depth = initialDepth;
1023 int traverseAllNodes = initialTraverseAllNodes;
1024 int diffs = initialDiffs;
1025
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001026 const uint8_t flags = 0; // No flags for now
1027
1028 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
1029
1030 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
1031 &c, &childPosition, &terminal, &freq);
Jean Chalard432789a2011-06-30 17:50:48 +09001032 *newOutputIndex = depth + 1;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001033
1034 const bool needsToTraverseChildrenNodes = childPosition != 0;
1035
1036 // If we are only doing traverseAllNodes, no need to look at the typed characters.
1037 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
1038 mWord[depth] = c;
1039 if (traverseAllNodes && terminal) {
1040 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1041 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
1042 }
1043 if (!needsToTraverseChildrenNodes) return false;
1044 *newTraverseAllNodes = traverseAllNodes;
1045 *newMatchRate = matchWeight;
1046 *newDiffs = diffs;
1047 *newInputIndex = inputIndex;
1048 } else {
1049 const int *currentChars = getInputCharsAt(inputIndex);
1050
1051 if (transposedPos >= 0) {
1052 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
1053 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
1054 }
1055
1056 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
1057 transposedPos);
1058 if (UNRELATED_CHAR == matchedProximityCharId) return false;
1059 mWord[depth] = c;
1060 // If inputIndex is greater than mInputLength, that means there is no
1061 // proximity chars. So, we don't need to check proximity.
1062 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
1063 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
1064 }
1065 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
1066 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
1067 if (isSameAsUserTypedLength && terminal) {
1068 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1069 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
1070 }
1071 if (!needsToTraverseChildrenNodes) return false;
1072 // Start traversing all nodes after the index exceeds the user typed length
1073 *newTraverseAllNodes = isSameAsUserTypedLength;
1074 *newMatchRate = matchWeight;
1075 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
1076 *newInputIndex = inputIndex + 1;
1077 }
1078 // Optimization: Prune out words that are too long compared to how much was typed.
1079 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
1080 return false;
1081 }
1082
1083 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
1084 // TODO: Check if this can be isSameAsUserTypedLength only.
1085 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
1086 *newTraverseAllNodes = true;
1087 }
1088 // get the count of nodes and increment childAddress.
1089 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
1090 *newChildPosition = childPosition;
1091 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
1092 return needsToTraverseChildrenNodes;
1093}
1094
Jean Chalardbc90c722011-06-20 21:09:04 +09001095#endif // NEW_DICTIONARY_FORMAT
1096
satok30088252010-12-01 21:22:15 +09001097} // namespace latinime