blob: 3c25eda0364e6b64ab63eb320efe3d940dd63694 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "basechars.h"
24#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090025#include "dictionary.h"
26#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090027
28namespace latinime {
29
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090030const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
31 { { 'a', 'e' },
32 { 'o', 'e' },
33 { 'u', 'e' } };
34
Jean Chalard293ece02011-06-16 20:55:16 +090035// TODO: check the header
36UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090037 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090038 const bool isLatestDictVersion)
Jean Chalard293ece02011-06-16 20:55:16 +090039 : DICT_ROOT(streamStart),
40 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090041 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
42 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090043 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalarda787dba2011-03-04 12:17:48 +090044 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
45 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090046 if (DEBUG_DICT) {
47 LOGI("UnigramDictionary - constructor");
48 }
satok30088252010-12-01 21:22:15 +090049}
50
satok18c28f42010-12-02 18:11:54 +090051UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090052
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090053static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
54 const int MAX_PROXIMITY_CHARS) {
55 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
56}
57
58bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
59
60 // There can't be a digraph if we don't have at least 2 characters to examine
61 if (i + 2 > codesSize) return false;
62
63 // Search for the first char of some digraph
64 int lastDigraphIndex = -1;
65 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
66 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
67 lastDigraphIndex >= 0; --lastDigraphIndex) {
68 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
69 }
70 // No match: return early
71 if (lastDigraphIndex < 0) return false;
72
73 // It's an interesting digraph if the second char matches too.
74 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
75}
76
77// Mostly the same arguments as the non-recursive version, except:
78// codes is the original value. It points to the start of the work buffer, and gets passed as is.
79// codesSize is the size of the user input (thus, it is the size of codesSrc).
80// codesDest is the current point in the work buffer.
81// codesSrc is the current point in the user-input, original, content-unmodified buffer.
82// codesRemain is the remaining size in codesSrc.
83void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
84 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
85 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080086 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090087
Jean Chalarda787dba2011-03-04 12:17:48 +090088 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
89 for (int i = 0; i < codesRemain; ++i) {
90 if (isDigraph(codesSrc, i, codesRemain)) {
91 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090092
Jean Chalarda787dba2011-03-04 12:17:48 +090093 // Copy the word up to the first char of the digraph, then continue processing
94 // on the remaining part of the word, skipping the second char of the digraph.
95 // In our example, copy "pru" and continue running on "fen"
96 // Make i the index of the second char of the digraph for simplicity. Forgetting
97 // to do that results in an infinite recursion so take care!
98 ++i;
99 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
100 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
101 codesBuffer, codesBufferSize, flags,
102 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
103 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
104 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900105
Jean Chalarda787dba2011-03-04 12:17:48 +0900106 // Copy the second char of the digraph in place, then continue processing on
107 // the remaining part of the word.
108 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
109 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
110 BYTES_IN_ONE_CHAR);
111 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
112 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
113 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
114 outWords, frequencies);
115 return;
116 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900117 }
118 }
119
120 // If we come here, we hit the end of the word: let's check it against the dictionary.
121 // In our example, we'll come here once for "prufen" and then once for "pruefen".
122 // If the word contains several digraphs, we'll come it for the product of them.
123 // eg. if the word is "ueberpruefen" we'll test, in order, against
124 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
125 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
126 if (0 != remainingBytes)
127 memcpy(codesDest, codesSrc, remainingBytes);
128
129 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
130 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
131}
132
133int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
134 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
135 unsigned short *outWords, int *frequencies) {
136
137 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
138 { // Incrementally tune the word and try all possibilities
139 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
140 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900141 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900142 } else { // Normal processing
143 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
144 outWords, frequencies);
145 }
146
satok817e5172011-03-04 06:06:45 -0800147 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900148 // Get the word count
149 int suggestedWordsCount = 0;
150 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
151 suggestedWordsCount++;
152 }
153
154 if (DEBUG_DICT) {
155 LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard980d6b62011-06-30 17:02:23 +0900156 /// Print the returned words
157 for (int j = 0; j < suggestedWordsCount; ++j) {
158 short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
159 char s[MAX_WORD_LENGTH];
160 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
161 LOGI("%s %i", s, mFrequencies[j]);
162 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900163 LOGI("Next letters: ");
164 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
165 if (mNextLettersFrequency[k] > 0) {
166 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
167 }
168 }
169 }
satok817e5172011-03-04 06:06:45 -0800170 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900171 PROF_CLOSE;
172 return suggestedWordsCount;
173}
174
175void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
176 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
177 unsigned short *outWords, int *frequencies) {
178
satok61e2f852011-01-05 14:13:07 +0900179 PROF_OPEN;
180 PROF_START(0);
satok30088252010-12-01 21:22:15 +0900181 initSuggestions(codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900182 if (DEBUG_DICT) assert(codesSize == mInputLength);
183
satoka3d78f62010-12-09 22:08:33 +0900184 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900185 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900186
satok61e2f852011-01-05 14:13:07 +0900187 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900188 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900189 PROF_END(1);
190
191 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900192 // Suggestion with missing character
193 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900194 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900195 if (DEBUG_DICT) {
196 LOGI("--- Suggest missing characters %d", i);
197 }
satok54fe9e02010-12-13 14:42:35 +0900198 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900199 }
200 }
satok61e2f852011-01-05 14:13:07 +0900201 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900202
satok61e2f852011-01-05 14:13:07 +0900203 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900204 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900205 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
206 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900207 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900208 if (DEBUG_DICT) {
209 LOGI("--- Suggest excessive characters %d", i);
210 }
satok54fe9e02010-12-13 14:42:35 +0900211 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900212 }
213 }
satok61e2f852011-01-05 14:13:07 +0900214 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900215
satok61e2f852011-01-05 14:13:07 +0900216 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900217 // Suggestion with transposed characters
218 // Only suggest words that length is mInputLength
219 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
220 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900221 if (DEBUG_DICT) {
222 LOGI("--- Suggest transposed characters %d", i);
223 }
satok54fe9e02010-12-13 14:42:35 +0900224 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900225 }
226 }
satok61e2f852011-01-05 14:13:07 +0900227 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900228
satok61e2f852011-01-05 14:13:07 +0900229 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900230 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900231 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
232 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900233 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900234 if (DEBUG_DICT) {
235 LOGI("--- Suggest missing space characters %d", i);
236 }
satok662fe692010-12-08 17:05:39 +0900237 getMissingSpaceWords(mInputLength, i);
238 }
239 }
satok61e2f852011-01-05 14:13:07 +0900240 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800241
242 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900243 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800244 // The first and last "mistyped spaces" are taken care of by excessive character handling
245 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900246 if (DEBUG_DICT) {
247 LOGI("--- Suggest words with proximity space %d", i);
248 }
satok817e5172011-03-04 06:06:45 -0800249 const int x = xcoordinates[i];
250 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900251 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800252 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
253 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900254 }
satok817e5172011-03-04 06:06:45 -0800255 if (proximityInfo->hasSpaceProximity(x, y)) {
256 getMistypedSpaceWords(mInputLength, i);
257 }
satok817e5172011-03-04 06:06:45 -0800258 }
259 }
260 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900261}
262
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900263void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
264 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900265 if (DEBUG_DICT) {
266 LOGI("initSuggest");
267 }
satok30088252010-12-01 21:22:15 +0900268 mFrequencies = frequencies;
269 mOutputChars = outWords;
270 mInputCodes = codes;
271 mInputLength = codesSize;
272 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
273}
274
Jean Chalard8124e642011-06-16 22:33:41 +0900275static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900276 if (c < nextLettersSize) {
277 nextLetters[c]++;
278 }
279}
280
satok662fe692010-12-08 17:05:39 +0900281// TODO: We need to optimize addWord by using STL or something
Jean Chalardca5ef282011-06-17 15:36:26 +0900282// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok28bd03b2010-12-03 16:39:16 +0900283bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900284 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900285 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
satok30088252010-12-01 21:22:15 +0900286 char s[length + 1];
287 for (int i = 0; i <= length; i++) s[i] = word[i];
satok662fe692010-12-08 17:05:39 +0900288 LOGI("Found word = %s, freq = %d", s, frequency);
satok30088252010-12-01 21:22:15 +0900289 }
satokf5cded12010-12-06 21:28:24 +0900290 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900291 if (DEBUG_DICT) {
292 LOGI("Exceeded max word length.");
293 }
satokf5cded12010-12-06 21:28:24 +0900294 return false;
295 }
satok30088252010-12-01 21:22:15 +0900296
297 // Find the right insertion point
298 int insertAt = 0;
299 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900300 // TODO: How should we sort words with the same frequency?
301 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900302 break;
303 }
304 insertAt++;
305 }
306 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900307 if (DEBUG_DICT) {
308 char s[length + 1];
309 for (int i = 0; i <= length; i++) s[i] = word[i];
satokb2e5e592011-04-26 14:50:54 +0900310 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satokcdbbea72010-12-08 16:04:16 +0900311 }
satok30088252010-12-01 21:22:15 +0900312 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
313 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
314 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
315 mFrequencies[insertAt] = frequency;
316 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900317 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900318 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900319 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900320 while (length--) {
321 *dest++ = *word++;
322 }
323 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900324 if (DEBUG_DICT) {
325 LOGI("Added word at %d", insertAt);
326 }
satok30088252010-12-01 21:22:15 +0900327 return true;
328 }
329 return false;
330}
331
Jean Chalard8124e642011-06-16 22:33:41 +0900332static inline unsigned short toBaseLowerCase(unsigned short c) {
satok30088252010-12-01 21:22:15 +0900333 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
334 c = BASE_CHARS[c];
335 }
336 if (c >='A' && c <= 'Z') {
337 c |= 32;
338 } else if (c > 127) {
339 c = latin_tolower(c);
340 }
341 return c;
342}
343
Jean Chalardca5ef282011-06-17 15:36:26 +0900344bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {
satok30088252010-12-01 21:22:15 +0900345 if (length != mInputLength) {
346 return false;
347 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900348 const int *inputCodes = mInputCodes;
satok30088252010-12-01 21:22:15 +0900349 while (length--) {
350 if ((unsigned int) *inputCodes != (unsigned int) *word) {
351 return false;
352 }
satok662fe692010-12-08 17:05:39 +0900353 inputCodes += MAX_PROXIMITY_CHARS;
satok30088252010-12-01 21:22:15 +0900354 word++;
355 }
356 return true;
357}
358
satok715514d2010-12-02 20:19:59 +0900359static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900360static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900361
satok54fe9e02010-12-13 14:42:35 +0900362void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900363 const int excessivePos, const int transposedPos, int *nextLetters,
364 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900365 if (DEBUG_DICT) {
366 LOGI("getSuggestionCandidates %d", maxDepth);
367 assert(transposedPos + 1 < mInputLength);
368 assert(excessivePos < mInputLength);
369 assert(missingPos < mInputLength);
370 }
satok662fe692010-12-08 17:05:39 +0900371 int rootPosition = ROOT_POS;
Jean Chalard980d6b62011-06-30 17:02:23 +0900372 // Get the number of children of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900373 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900374 int depth = 0;
375
376 mStackChildCount[0] = childCount;
377 mStackTraverseAll[0] = (mInputLength <= 0);
378 mStackNodeFreq[0] = 1;
379 mStackInputIndex[0] = 0;
380 mStackDiffs[0] = 0;
381 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900382 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900383
satok662fe692010-12-08 17:05:39 +0900384 // Depth first search
satokd2997922010-12-07 13:08:39 +0900385 while (depth >= 0) {
386 if (mStackChildCount[depth] > 0) {
387 --mStackChildCount[depth];
388 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900389 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900390 int inputIndex = mStackInputIndex[depth];
391 int diffs = mStackDiffs[depth];
392 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900393 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900394 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900395 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900396 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900397 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900398 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
399 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
400 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900401 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900402 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900403 mStackSiblingPos[depth] = siblingPos;
404 if (needsToTraverseChildrenNodes) {
405 // Goes to child node
406 ++depth;
407 mStackChildCount[depth] = childCount;
408 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900409 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900410 mStackInputIndex[depth] = inputIndex;
411 mStackDiffs[depth] = diffs;
412 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900413 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900414 }
415 } else {
satokcdbbea72010-12-08 16:04:16 +0900416 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900417 --depth;
418 }
419 }
420}
421
satokb2e5e592011-04-26 14:50:54 +0900422static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
423static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
424 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
425}
426
427static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
428inline static void multiplyIntCapped(const int multiplier, int *base) {
429 const int temp = *base;
430 if (temp != S_INT_MAX) {
431 // Branch if multiplier == 2 for the optimization
432 if (multiplier == 2) {
433 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
434 } else {
435 const int tempRetval = temp * multiplier;
436 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
437 }
438 }
439}
440
441inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900442 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900443 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900444 } else {
satokb2e5e592011-04-26 14:50:54 +0900445 int ret = base;
446 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
447 return ret;
448 }
449}
450
451inline static void multiplyRate(const int rate, int *freq) {
452 if (*freq != S_INT_MAX) {
453 if (*freq > 1000000) {
454 *freq /= 100;
455 multiplyIntCapped(rate, freq);
456 } else {
457 multiplyIntCapped(rate, freq);
458 *freq /= 100;
459 }
satokf7425bb2011-01-05 16:37:53 +0900460 }
461}
462
satok4c981d32011-04-19 13:58:42 +0900463inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900464 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
465 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900466 if (firstWordLength == 0 || secondWordLength == 0) {
467 return 0;
468 }
469 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
470 int tempFirstFreq = firstFreq;
471 multiplyRate(firstDemotionRate, &tempFirstFreq);
472
473 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
474 int tempSecondFreq = secondFreq;
475 multiplyRate(secondDemotionRate, &tempSecondFreq);
476
477 const int totalLength = firstWordLength + secondWordLength;
478
479 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
480 // length.
481 int totalFreq = tempFirstFreq + tempSecondFreq;
482
483 // This is a workaround to try offsetting the not-enough-demotion which will be done in
484 // calcNormalizedScore in Utils.java.
485 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
486 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
487 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
488 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
489 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
490
491 // At this moment, totalFreq is calculated by the following formula:
492 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
493 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
494
satokb2e5e592011-04-26 14:50:54 +0900495 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900496
497 // This is another workaround to offset the demotion which will be done in
498 // calcNormalizedScore in Utils.java.
499 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
500 // the same amount because we already have adjusted the synthetic freq of this "missing or
501 // mistyped space" suggestion candidate above in this method.
502 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
503 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
504
satokd8db9f82011-05-18 15:31:04 +0900505 if (isSpaceProximity) {
506 // A word pair with one space proximity correction
507 if (DEBUG_DICT) {
508 LOGI("Found a word pair with space proximity correction.");
509 }
510 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
511 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
512 }
513
satok4c981d32011-04-19 13:58:42 +0900514 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
515 return totalFreq;
516}
517
satok817e5172011-03-04 06:06:45 -0800518bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
519 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900520 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800521}
522
523bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
524 return getSplitTwoWordsSuggestion(
525 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900526 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800527}
528
satok58c49b92011-01-27 03:23:39 +0900529inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900530 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900531 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900532 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900533 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900534 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900535 if (mInputLength >= 2) {
536 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
537 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
538 / (10 * mInputLength
539 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900540 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900541 LOGI("Demotion rate for missing character is %d.", demotionRate);
542 }
satokdc5301e2011-04-11 16:14:45 +0900543 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900544 } else {
545 finalFreq = 0;
546 }
547 }
satokf7425bb2011-01-05 16:37:53 +0900548 if (transposedPos >= 0) multiplyRate(
549 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900550 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900551 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900552 if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
satokf7425bb2011-01-05 16:37:53 +0900553 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900554 }
555 }
satok58c49b92011-01-27 03:23:39 +0900556 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900557 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900558 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900559 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900560 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900561 if (DEBUG_DICT) {
562 LOGI("Found full matched word.");
563 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900564 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
565 }
566 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900567 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900568 }
satok9674f652011-04-20 17:15:27 +0900569 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900570 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900571 if (DEBUG_DICT) {
572 LOGI("Found one proximity correction.");
573 }
satokb2e5e592011-04-26 14:50:54 +0900574 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900575 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900576 }
satok9674f652011-04-20 17:15:27 +0900577 if (DEBUG_DICT) {
578 LOGI("calc: %d, %d", depth, sameLength);
579 }
satokb2e5e592011-04-26 14:50:54 +0900580 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900581 return finalFreq;
582}
satoka3d78f62010-12-09 22:08:33 +0900583
satok28bd03b2010-12-03 16:39:16 +0900584inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900585 const int inputIndex, const int skipPos, const int depth) {
satok8fbd5522011-02-22 17:28:55 +0900586 const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
satok28bd03b2010-12-03 16:39:16 +0900587 // Skip the ' or other letter and continue deeper
588 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
589}
590
satoke07baa62010-12-09 21:55:40 +0900591inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
Jean Chalard07a84062011-03-03 10:22:10 +0900592 const int inputLength) const {
satoke07baa62010-12-09 21:55:40 +0900593 if (inputIndex < 0 || inputIndex >= inputLength) return false;
594 const int currentChar = *getInputCharsAt(inputIndex);
595 const int leftIndex = inputIndex - 1;
596 if (leftIndex >= 0) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900597 const int *leftChars = getInputCharsAt(leftIndex);
satoke07baa62010-12-09 21:55:40 +0900598 int i = 0;
599 while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
600 if (leftChars[i++] == currentChar) return true;
601 }
602 }
603 const int rightIndex = inputIndex + 1;
604 if (rightIndex < inputLength) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900605 const int *rightChars = getInputCharsAt(rightIndex);
satoke07baa62010-12-09 21:55:40 +0900606 int i = 0;
607 while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
608 if (rightChars[i++] == currentChar) return true;
609 }
610 }
611 return false;
612}
613
Jean Chalarda5d58492011-02-18 17:50:58 +0900614// In the following function, c is the current character of the dictionary word
615// currently examined.
616// currentChars is an array containing the keys close to the character the
617// user actually typed at the same position. We want to see if c is in it: if so,
618// then the word contains at that position a character close to what the user
619// typed.
620// What the user typed is actually the first character of the array.
621// Notice : accented characters do not have a proximity list, so they are alone
622// in their list. The non-accented version of the character should be considered
623// "close", but not the other keys close to the non-accented version.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900624inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
625 const int *currentChars, const unsigned short c, const int skipPos,
626 const int excessivePos, const int transposedPos) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900627 const unsigned short baseLowerC = toBaseLowerCase(c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900628
629 // The first char in the array is what user typed. If it matches right away,
630 // that means the user typed that same char for this pos.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900631 if (currentChars[0] == baseLowerC || currentChars[0] == c)
Jean Chalarda5d58492011-02-18 17:50:58 +0900632 return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
633
634 // If one of those is true, we should not check for close characters at all.
635 if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
636 return UNRELATED_CHAR;
637
638 // If the non-accented, lowercased version of that first character matches c,
639 // then we have a non-accented version of the accented character the user
640 // typed. Treat it as a close char.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900641 if (toBaseLowerCase(currentChars[0]) == baseLowerC)
Jean Chalarda5d58492011-02-18 17:50:58 +0900642 return NEAR_PROXIMITY_CHAR;
643
644 // Not an exact nor an accent-alike match: search the list of close keys
645 int j = 1;
satoke07baa62010-12-09 21:55:40 +0900646 while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900647 const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900648 if (matched) return NEAR_PROXIMITY_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900649 ++j;
650 }
Jean Chalarda5d58492011-02-18 17:50:58 +0900651
652 // Was not included, signal this as an unrelated character.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900653 return UNRELATED_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900654}
655
Jean Chalardca5ef282011-06-17 15:36:26 +0900656inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900657 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900658 const int inputIndex, const int matchWeight, const int skipPos,
659 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
660 int* nextLetters, const int nextLettersSize) {
661
662 const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;
Jean Chalard980d6b62011-06-30 17:02:23 +0900663 if (isSameAsTyped) return;
Jean Chalardca5ef282011-06-17 15:36:26 +0900664
665 if (depth >= MIN_SUGGEST_DEPTH) {
666 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
667 excessivePos, transposedPos, freq, sameLength);
668 if (!isSameAsTyped)
669 addWord(word, depth + 1, finalFreq);
Jean Chalardca5ef282011-06-17 15:36:26 +0900670 }
671
672 if (sameLength && depth >= mInputLength && skipPos < 0) {
673 registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);
674 }
675}
676
Jean Chalardbc90c722011-06-20 21:09:04 +0900677#ifndef NEW_DICTIONARY_FORMAT
678// TODO: Don't forget to bring inline functions back to over where they are used.
satokcdbbea72010-12-08 16:04:16 +0900679
Jean Chalardbc90c722011-06-20 21:09:04 +0900680// The following functions will be entirely replaced with new implementations.
681void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
682 const int excessivePos, const int transposedPos,int *nextLetters,
683 const int nextLettersSize) {
684 int initialPosition = initialPos;
685 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
686 getWordsRec(count, initialPosition, 0,
687 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
688 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
689 nextLettersSize);
690}
Jean Chalardca5ef282011-06-17 15:36:26 +0900691
Jean Chalardbc90c722011-06-20 21:09:04 +0900692void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
693 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
694 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
695 const int transposedPos, int *nextLetters, const int nextLettersSize) {
696 int siblingPos = pos;
697 for (int i = 0; i < childrenCount; ++i) {
698 int newCount;
699 int newChildPosition;
700 bool newTraverseAllNodes;
701 int newMatchRate;
702 int newInputIndex;
703 int newDiffs;
704 int newSiblingPos;
705 int newOutputIndex;
706 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
707 traverseAllNodes, matchWeight, inputIndex, diffs,
708 skipPos, excessivePos, transposedPos,
709 nextLetters, nextLettersSize,
710 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
711 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
712 siblingPos = newSiblingPos;
satokcdbbea72010-12-08 16:04:16 +0900713
Jean Chalardbc90c722011-06-20 21:09:04 +0900714 if (needsToTraverseChildrenNodes) {
715 getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
716 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
717 nextLetters, nextLettersSize);
satok48e432c2010-12-06 17:38:58 +0900718 }
satok48e432c2010-12-06 17:38:58 +0900719 }
satok48e432c2010-12-06 17:38:58 +0900720}
721
Jean Chalard980d6b62011-06-30 17:02:23 +0900722inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
723 const int inputLength, unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900724 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900725 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900726 int maxFreq = 0;
727 int depth = 0;
728 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900729 bool terminal = false;
730
satokaee09dc2010-12-09 19:21:51 +0900731 mStackChildCount[0] = count;
732 mStackSiblingPos[0] = pos;
733
734 while (depth >= 0) {
735 if (mStackChildCount[depth] > 0) {
736 --mStackChildCount[depth];
737 int firstChildPos;
738 int newFreq;
739 int siblingPos = mStackSiblingPos[depth];
740 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
741 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
742 &siblingPos);
743 mStackSiblingPos[depth] = siblingPos;
744 if (depth == (inputLength - 1)) {
745 // Traverse sibling node
746 if (terminal) {
747 if (newFreq > maxFreq) {
748 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
749 if (DEBUG_DICT && DEBUG_NODE) {
750 char s[inputLength + 1];
751 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
752 s[inputLength] = 0;
753 LOGI("New missing space word found: %d > %d (%s), %d, %d",
754 newFreq, maxFreq, s, inputLength, depth);
755 }
756 maxFreq = newFreq;
757 }
758 }
759 } else if (needsToTraverseChildrenNodes) {
760 // Traverse children nodes
761 ++depth;
762 mStackChildCount[depth] = count;
763 mStackSiblingPos[depth] = firstChildPos;
764 }
765 } else {
766 // Traverse parent node
767 --depth;
satok662fe692010-12-08 17:05:39 +0900768 }
769 }
satokaee09dc2010-12-09 19:21:51 +0900770
771 word[inputLength] = 0;
772 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900773}
774
775inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900776 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
777 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
778 const int inputIndex = startInputIndex + depth;
satok8fbd5522011-02-22 17:28:55 +0900779 const int *currentChars = getInputCharsAt(inputIndex);
satok662fe692010-12-08 17:05:39 +0900780 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900781 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
782 &c, newChildPosition, newTerminal, newFreq);
satokaee09dc2010-12-09 19:21:51 +0900783 const unsigned int inputC = currentChars[0];
Ken Wakasade3070a2011-03-19 09:16:42 +0900784 if (DEBUG_DICT) {
785 assert(inputC <= U_SHORT_MAX);
786 }
Jean Chalardf5f834a2011-02-22 15:12:46 +0900787 const unsigned short baseLowerC = toBaseLowerCase(c);
788 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900789 const bool hasChild = *newChildPosition != 0;
790 if (matched) {
791 word[depth] = c;
792 if (DEBUG_DICT && DEBUG_NODE) {
793 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900794 if (*newTerminal) {
795 LOGI("Terminal %d", *newFreq);
796 }
satok662fe692010-12-08 17:05:39 +0900797 }
satokaee09dc2010-12-09 19:21:51 +0900798 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900799 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900800 return true;
801 } else {
802 return false;
803 }
804 } else {
805 // If this node is not user typed character, this method treats this word as unmatched.
806 // Thus newTerminal shouldn't be true.
807 *newTerminal = false;
808 return false;
satok662fe692010-12-08 17:05:39 +0900809 }
satok662fe692010-12-08 17:05:39 +0900810}
Jean Chalard8124e642011-06-16 22:33:41 +0900811
812// TODO: use uint32_t instead of unsigned short
813bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
814 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900815 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900816 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900817 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900818 }
819}
820
Jean Chalard17e44a72011-06-16 22:51:11 +0900821
822// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900823int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
824 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900825 // returns address of bigram data of that word
826 // return -99 if not found
827
828 int count = Dictionary::getCount(DICT_ROOT, &pos);
829 unsigned short currentChar = (unsigned short) word[offset];
830 for (int j = 0; j < count; j++) {
831 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
832 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
833 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
834 if (c == currentChar) {
835 if (offset == length - 1) {
836 if (terminal) {
837 return (pos+1);
838 }
839 } else {
840 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900841 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900842 if (t > 0) {
843 return t;
844 }
845 }
846 }
847 }
848 if (terminal) {
849 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
850 }
851 // There could be two instances of each alphabet - upper and lower case. So continue
852 // looking ...
853 }
854 return NOT_VALID_WORD;
855}
856
Jean Chalardbc90c722011-06-20 21:09:04 +0900857
858// The following functions will be modified.
859bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
860 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
861 const int secondWordLength, const bool isSpaceProximity) {
862 if (inputLength >= MAX_WORD_LENGTH) return false;
863 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
864 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
865 return false;
866 const int newWordLength = firstWordLength + secondWordLength + 1;
867 // Allocating variable length array on stack
868 unsigned short word[newWordLength];
Jean Chalardffefdb62011-06-30 17:15:32 +0900869 const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
Jean Chalardbc90c722011-06-20 21:09:04 +0900870 if (DEBUG_DICT) {
871 LOGI("First freq: %d", firstFreq);
872 }
873 if (firstFreq <= 0) return false;
874
875 for (int i = 0; i < firstWordLength; ++i) {
876 word[i] = mWord[i];
877 }
878
Jean Chalardffefdb62011-06-30 17:15:32 +0900879 const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
Jean Chalardbc90c722011-06-20 21:09:04 +0900880 if (DEBUG_DICT) {
881 LOGI("Second freq: %d", secondFreq);
882 }
883 if (secondFreq <= 0) return false;
884
885 word[firstWordLength] = SPACE;
886 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
887 word[i] = mWord[i - firstWordLength - 1];
888 }
889
890 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
891 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
892 if (DEBUG_DICT) {
893 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
894 TYPED_LETTER_MULTIPLIER);
895 }
896 addWord(word, newWordLength, pairFreq);
897 return true;
898}
899
900inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
901 const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,
902 const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
903 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
904 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
905 int *nextSiblingPosition, int *nextOutputIndex) {
906 if (DEBUG_DICT) {
907 int inputCount = 0;
908 if (skipPos >= 0) ++inputCount;
909 if (excessivePos >= 0) ++inputCount;
910 if (transposedPos >= 0) ++inputCount;
911 assert(inputCount <= 1);
912 }
913 unsigned short c;
914 int childPosition;
915 bool terminal;
916 int freq;
917 bool isSameAsUserTypedLength = false;
918
919 const uint8_t flags = 0; // No flags for now
920
921 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
922
923 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
924 &c, &childPosition, &terminal, &freq);
925 *nextOutputIndex = depth + 1;
926
927 const bool needsToTraverseChildrenNodes = childPosition != 0;
928
929 // If we are only doing traverseAllNodes, no need to look at the typed characters.
930 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
931 mWord[depth] = c;
932 if (traverseAllNodes && terminal) {
933 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
934 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
935 }
936 if (!needsToTraverseChildrenNodes) return false;
937 *newTraverseAllNodes = traverseAllNodes;
938 *newMatchRate = matchWeight;
939 *newDiffs = diffs;
940 *newInputIndex = inputIndex;
941 } else {
942 const int *currentChars = getInputCharsAt(inputIndex);
943
944 if (transposedPos >= 0) {
945 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
946 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
947 }
948
949 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
950 transposedPos);
951 if (UNRELATED_CHAR == matchedProximityCharId) return false;
952 mWord[depth] = c;
953 // If inputIndex is greater than mInputLength, that means there is no
954 // proximity chars. So, we don't need to check proximity.
955 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
956 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
957 }
958 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
959 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
960 if (isSameAsUserTypedLength && terminal) {
961 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
962 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
963 }
964 if (!needsToTraverseChildrenNodes) return false;
965 // Start traversing all nodes after the index exceeds the user typed length
966 *newTraverseAllNodes = isSameAsUserTypedLength;
967 *newMatchRate = matchWeight;
968 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
969 *newInputIndex = inputIndex + 1;
970 }
971 // Optimization: Prune out words that are too long compared to how much was typed.
972 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
973 return false;
974 }
975
976 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
977 // TODO: Check if this can be isSameAsUserTypedLength only.
978 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
979 *newTraverseAllNodes = true;
980 }
981 // get the count of nodes and increment childAddress.
982 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
983 *newChildPosition = childPosition;
984 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
985 return needsToTraverseChildrenNodes;
986}
987
988#else // NEW_DICTIONARY_FORMAT
Jean Chalard85a1d1e2011-06-21 22:23:21 +0900989
990bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
991 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
992 const int secondWordLength, const bool isSpaceProximity) {
993 if (inputLength >= MAX_WORD_LENGTH) return false;
994 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
995 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
996 return false;
997 const int newWordLength = firstWordLength + secondWordLength + 1;
998 // Allocating variable length array on stack
999 unsigned short word[newWordLength];
Jean Chalardffefdb62011-06-30 17:15:32 +09001000 const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001001 if (DEBUG_DICT) {
1002 LOGI("First freq: %d", firstFreq);
1003 }
1004 if (firstFreq <= 0) return false;
1005
1006 for (int i = 0; i < firstWordLength; ++i) {
1007 word[i] = mWord[i];
1008 }
1009
Jean Chalardffefdb62011-06-30 17:15:32 +09001010 const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001011 if (DEBUG_DICT) {
1012 LOGI("Second freq: %d", secondFreq);
1013 }
1014 if (secondFreq <= 0) return false;
1015
1016 word[firstWordLength] = SPACE;
1017 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
1018 word[i] = mWord[i - firstWordLength - 1];
1019 }
1020
1021 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
1022 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
1023 if (DEBUG_DICT) {
1024 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
1025 TYPED_LETTER_MULTIPLIER);
1026 }
1027 addWord(word, newWordLength, pairFreq);
1028 return true;
1029}
1030
1031inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
1032 const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,
1033 const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
1034 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
1035 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
1036 int *nextSiblingPosition, int *nextOutputIndex) {
1037 if (DEBUG_DICT) {
1038 int inputCount = 0;
1039 if (skipPos >= 0) ++inputCount;
1040 if (excessivePos >= 0) ++inputCount;
1041 if (transposedPos >= 0) ++inputCount;
1042 assert(inputCount <= 1);
1043 }
1044 unsigned short c;
1045 int childPosition;
1046 bool terminal;
1047 int freq;
1048 bool isSameAsUserTypedLength = false;
1049
1050 const uint8_t flags = 0; // No flags for now
1051
1052 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
1053
1054 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
1055 &c, &childPosition, &terminal, &freq);
1056 *nextOutputIndex = depth + 1;
1057
1058 const bool needsToTraverseChildrenNodes = childPosition != 0;
1059
1060 // If we are only doing traverseAllNodes, no need to look at the typed characters.
1061 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
1062 mWord[depth] = c;
1063 if (traverseAllNodes && terminal) {
1064 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1065 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
1066 }
1067 if (!needsToTraverseChildrenNodes) return false;
1068 *newTraverseAllNodes = traverseAllNodes;
1069 *newMatchRate = matchWeight;
1070 *newDiffs = diffs;
1071 *newInputIndex = inputIndex;
1072 } else {
1073 const int *currentChars = getInputCharsAt(inputIndex);
1074
1075 if (transposedPos >= 0) {
1076 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
1077 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
1078 }
1079
1080 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
1081 transposedPos);
1082 if (UNRELATED_CHAR == matchedProximityCharId) return false;
1083 mWord[depth] = c;
1084 // If inputIndex is greater than mInputLength, that means there is no
1085 // proximity chars. So, we don't need to check proximity.
1086 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
1087 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
1088 }
1089 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
1090 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
1091 if (isSameAsUserTypedLength && terminal) {
1092 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1093 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
1094 }
1095 if (!needsToTraverseChildrenNodes) return false;
1096 // Start traversing all nodes after the index exceeds the user typed length
1097 *newTraverseAllNodes = isSameAsUserTypedLength;
1098 *newMatchRate = matchWeight;
1099 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
1100 *newInputIndex = inputIndex + 1;
1101 }
1102 // Optimization: Prune out words that are too long compared to how much was typed.
1103 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
1104 return false;
1105 }
1106
1107 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
1108 // TODO: Check if this can be isSameAsUserTypedLength only.
1109 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
1110 *newTraverseAllNodes = true;
1111 }
1112 // get the count of nodes and increment childAddress.
1113 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
1114 *newChildPosition = childPosition;
1115 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
1116 return needsToTraverseChildrenNodes;
1117}
1118
Jean Chalardbc90c722011-06-20 21:09:04 +09001119#endif // NEW_DICTIONARY_FORMAT
1120
satok30088252010-12-01 21:22:15 +09001121} // namespace latinime