blob: c294cf3b54a58ea8a4087ed048eebb02e4f4287e [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "basechars.h"
24#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090025#include "dictionary.h"
26#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090027
28namespace latinime {
29
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090030const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
31 { { 'a', 'e' },
32 { 'o', 'e' },
33 { 'u', 'e' } };
34
Jean Chalard293ece02011-06-16 20:55:16 +090035// TODO: check the header
36UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090037 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090038 const bool isLatestDictVersion)
Jean Chalard293ece02011-06-16 20:55:16 +090039 : DICT_ROOT(streamStart),
40 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090041 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
42 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090043 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalarda787dba2011-03-04 12:17:48 +090044 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
45 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090046 if (DEBUG_DICT) {
47 LOGI("UnigramDictionary - constructor");
48 }
satok30088252010-12-01 21:22:15 +090049}
50
satok18c28f42010-12-02 18:11:54 +090051UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090052
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090053static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
54 const int MAX_PROXIMITY_CHARS) {
55 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
56}
57
58bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
59
60 // There can't be a digraph if we don't have at least 2 characters to examine
61 if (i + 2 > codesSize) return false;
62
63 // Search for the first char of some digraph
64 int lastDigraphIndex = -1;
65 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
66 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
67 lastDigraphIndex >= 0; --lastDigraphIndex) {
68 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
69 }
70 // No match: return early
71 if (lastDigraphIndex < 0) return false;
72
73 // It's an interesting digraph if the second char matches too.
74 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
75}
76
77// Mostly the same arguments as the non-recursive version, except:
78// codes is the original value. It points to the start of the work buffer, and gets passed as is.
79// codesSize is the size of the user input (thus, it is the size of codesSrc).
80// codesDest is the current point in the work buffer.
81// codesSrc is the current point in the user-input, original, content-unmodified buffer.
82// codesRemain is the remaining size in codesSrc.
83void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
84 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
85 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080086 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090087
Jean Chalarda787dba2011-03-04 12:17:48 +090088 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
89 for (int i = 0; i < codesRemain; ++i) {
90 if (isDigraph(codesSrc, i, codesRemain)) {
91 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090092
Jean Chalarda787dba2011-03-04 12:17:48 +090093 // Copy the word up to the first char of the digraph, then continue processing
94 // on the remaining part of the word, skipping the second char of the digraph.
95 // In our example, copy "pru" and continue running on "fen"
96 // Make i the index of the second char of the digraph for simplicity. Forgetting
97 // to do that results in an infinite recursion so take care!
98 ++i;
99 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
100 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
101 codesBuffer, codesBufferSize, flags,
102 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
103 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
104 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900105
Jean Chalarda787dba2011-03-04 12:17:48 +0900106 // Copy the second char of the digraph in place, then continue processing on
107 // the remaining part of the word.
108 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
109 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
110 BYTES_IN_ONE_CHAR);
111 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
112 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
113 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
114 outWords, frequencies);
115 return;
116 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900117 }
118 }
119
120 // If we come here, we hit the end of the word: let's check it against the dictionary.
121 // In our example, we'll come here once for "prufen" and then once for "pruefen".
122 // If the word contains several digraphs, we'll come it for the product of them.
123 // eg. if the word is "ueberpruefen" we'll test, in order, against
124 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
125 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
126 if (0 != remainingBytes)
127 memcpy(codesDest, codesSrc, remainingBytes);
128
129 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
130 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
131}
132
133int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
134 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
135 unsigned short *outWords, int *frequencies) {
136
137 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
138 { // Incrementally tune the word and try all possibilities
139 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
140 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900141 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900142 } else { // Normal processing
143 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
144 outWords, frequencies);
145 }
146
satok817e5172011-03-04 06:06:45 -0800147 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900148 // Get the word count
149 int suggestedWordsCount = 0;
150 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
151 suggestedWordsCount++;
152 }
153
154 if (DEBUG_DICT) {
155 LOGI("Returning %d words", suggestedWordsCount);
156 LOGI("Next letters: ");
157 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
158 if (mNextLettersFrequency[k] > 0) {
159 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
160 }
161 }
162 }
satok817e5172011-03-04 06:06:45 -0800163 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900164 PROF_CLOSE;
165 return suggestedWordsCount;
166}
167
168void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
169 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
170 unsigned short *outWords, int *frequencies) {
171
satok61e2f852011-01-05 14:13:07 +0900172 PROF_OPEN;
173 PROF_START(0);
satok30088252010-12-01 21:22:15 +0900174 initSuggestions(codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900175 if (DEBUG_DICT) assert(codesSize == mInputLength);
176
satoka3d78f62010-12-09 22:08:33 +0900177 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900178 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900179
satok61e2f852011-01-05 14:13:07 +0900180 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900181 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900182 PROF_END(1);
183
184 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900185 // Suggestion with missing character
186 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900187 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900188 if (DEBUG_DICT) {
189 LOGI("--- Suggest missing characters %d", i);
190 }
satok54fe9e02010-12-13 14:42:35 +0900191 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900192 }
193 }
satok61e2f852011-01-05 14:13:07 +0900194 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900195
satok61e2f852011-01-05 14:13:07 +0900196 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900197 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900198 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
199 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900200 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900201 if (DEBUG_DICT) {
202 LOGI("--- Suggest excessive characters %d", i);
203 }
satok54fe9e02010-12-13 14:42:35 +0900204 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900205 }
206 }
satok61e2f852011-01-05 14:13:07 +0900207 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900208
satok61e2f852011-01-05 14:13:07 +0900209 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900210 // Suggestion with transposed characters
211 // Only suggest words that length is mInputLength
212 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
213 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900214 if (DEBUG_DICT) {
215 LOGI("--- Suggest transposed characters %d", i);
216 }
satok54fe9e02010-12-13 14:42:35 +0900217 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900218 }
219 }
satok61e2f852011-01-05 14:13:07 +0900220 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900221
satok61e2f852011-01-05 14:13:07 +0900222 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900223 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900224 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
225 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900226 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900227 if (DEBUG_DICT) {
228 LOGI("--- Suggest missing space characters %d", i);
229 }
satok662fe692010-12-08 17:05:39 +0900230 getMissingSpaceWords(mInputLength, i);
231 }
232 }
satok61e2f852011-01-05 14:13:07 +0900233 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800234
235 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900236 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800237 // The first and last "mistyped spaces" are taken care of by excessive character handling
238 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900239 if (DEBUG_DICT) {
240 LOGI("--- Suggest words with proximity space %d", i);
241 }
satok817e5172011-03-04 06:06:45 -0800242 const int x = xcoordinates[i];
243 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900244 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800245 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
246 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900247 }
satok817e5172011-03-04 06:06:45 -0800248 if (proximityInfo->hasSpaceProximity(x, y)) {
249 getMistypedSpaceWords(mInputLength, i);
250 }
satok817e5172011-03-04 06:06:45 -0800251 }
252 }
253 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900254}
255
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900256void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
257 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900258 if (DEBUG_DICT) {
259 LOGI("initSuggest");
260 }
satok30088252010-12-01 21:22:15 +0900261 mFrequencies = frequencies;
262 mOutputChars = outWords;
263 mInputCodes = codes;
264 mInputLength = codesSize;
265 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
266}
267
Jean Chalard8124e642011-06-16 22:33:41 +0900268static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900269 if (c < nextLettersSize) {
270 nextLetters[c]++;
271 }
272}
273
satok662fe692010-12-08 17:05:39 +0900274// TODO: We need to optimize addWord by using STL or something
satok28bd03b2010-12-03 16:39:16 +0900275bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900276 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900277 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
satok30088252010-12-01 21:22:15 +0900278 char s[length + 1];
279 for (int i = 0; i <= length; i++) s[i] = word[i];
satok662fe692010-12-08 17:05:39 +0900280 LOGI("Found word = %s, freq = %d", s, frequency);
satok30088252010-12-01 21:22:15 +0900281 }
satokf5cded12010-12-06 21:28:24 +0900282 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900283 if (DEBUG_DICT) {
284 LOGI("Exceeded max word length.");
285 }
satokf5cded12010-12-06 21:28:24 +0900286 return false;
287 }
satok30088252010-12-01 21:22:15 +0900288
289 // Find the right insertion point
290 int insertAt = 0;
291 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900292 // TODO: How should we sort words with the same frequency?
293 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900294 break;
295 }
296 insertAt++;
297 }
298 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900299 if (DEBUG_DICT) {
300 char s[length + 1];
301 for (int i = 0; i <= length; i++) s[i] = word[i];
satokb2e5e592011-04-26 14:50:54 +0900302 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satokcdbbea72010-12-08 16:04:16 +0900303 }
satok30088252010-12-01 21:22:15 +0900304 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
305 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
306 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
307 mFrequencies[insertAt] = frequency;
308 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900309 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900310 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900311 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900312 while (length--) {
313 *dest++ = *word++;
314 }
315 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900316 if (DEBUG_DICT) {
317 LOGI("Added word at %d", insertAt);
318 }
satok30088252010-12-01 21:22:15 +0900319 return true;
320 }
321 return false;
322}
323
Jean Chalard8124e642011-06-16 22:33:41 +0900324static inline unsigned short toBaseLowerCase(unsigned short c) {
satok30088252010-12-01 21:22:15 +0900325 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
326 c = BASE_CHARS[c];
327 }
328 if (c >='A' && c <= 'Z') {
329 c |= 32;
330 } else if (c > 127) {
331 c = latin_tolower(c);
332 }
333 return c;
334}
335
satok28bd03b2010-12-03 16:39:16 +0900336bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {
satok30088252010-12-01 21:22:15 +0900337 if (length != mInputLength) {
338 return false;
339 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900340 const int *inputCodes = mInputCodes;
satok30088252010-12-01 21:22:15 +0900341 while (length--) {
342 if ((unsigned int) *inputCodes != (unsigned int) *word) {
343 return false;
344 }
satok662fe692010-12-08 17:05:39 +0900345 inputCodes += MAX_PROXIMITY_CHARS;
satok30088252010-12-01 21:22:15 +0900346 word++;
347 }
348 return true;
349}
350
satok715514d2010-12-02 20:19:59 +0900351static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900352static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900353
satok54fe9e02010-12-13 14:42:35 +0900354void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900355 const int excessivePos, const int transposedPos, int *nextLetters,
356 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900357 if (DEBUG_DICT) {
358 LOGI("getSuggestionCandidates %d", maxDepth);
359 assert(transposedPos + 1 < mInputLength);
360 assert(excessivePos < mInputLength);
361 assert(missingPos < mInputLength);
362 }
satok662fe692010-12-08 17:05:39 +0900363 int rootPosition = ROOT_POS;
satokd2997922010-12-07 13:08:39 +0900364 // Get the number of child of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900365 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900366 int depth = 0;
367
368 mStackChildCount[0] = childCount;
369 mStackTraverseAll[0] = (mInputLength <= 0);
370 mStackNodeFreq[0] = 1;
371 mStackInputIndex[0] = 0;
372 mStackDiffs[0] = 0;
373 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900374 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900375
satok662fe692010-12-08 17:05:39 +0900376 // Depth first search
satokd2997922010-12-07 13:08:39 +0900377 while (depth >= 0) {
378 if (mStackChildCount[depth] > 0) {
379 --mStackChildCount[depth];
380 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900381 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900382 int inputIndex = mStackInputIndex[depth];
383 int diffs = mStackDiffs[depth];
384 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900385 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900386 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900387 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900388 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900389 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900390 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
391 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
392 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900393 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900394 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900395 mStackSiblingPos[depth] = siblingPos;
396 if (needsToTraverseChildrenNodes) {
397 // Goes to child node
398 ++depth;
399 mStackChildCount[depth] = childCount;
400 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900401 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900402 mStackInputIndex[depth] = inputIndex;
403 mStackDiffs[depth] = diffs;
404 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900405 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900406 }
407 } else {
satokcdbbea72010-12-08 16:04:16 +0900408 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900409 --depth;
410 }
411 }
412}
413
satokb2e5e592011-04-26 14:50:54 +0900414static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
415static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
416 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
417}
418
419static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
420inline static void multiplyIntCapped(const int multiplier, int *base) {
421 const int temp = *base;
422 if (temp != S_INT_MAX) {
423 // Branch if multiplier == 2 for the optimization
424 if (multiplier == 2) {
425 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
426 } else {
427 const int tempRetval = temp * multiplier;
428 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
429 }
430 }
431}
432
433inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900434 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900435 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900436 } else {
satokb2e5e592011-04-26 14:50:54 +0900437 int ret = base;
438 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
439 return ret;
440 }
441}
442
443inline static void multiplyRate(const int rate, int *freq) {
444 if (*freq != S_INT_MAX) {
445 if (*freq > 1000000) {
446 *freq /= 100;
447 multiplyIntCapped(rate, freq);
448 } else {
449 multiplyIntCapped(rate, freq);
450 *freq /= 100;
451 }
satokf7425bb2011-01-05 16:37:53 +0900452 }
453}
454
satok4c981d32011-04-19 13:58:42 +0900455inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900456 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
457 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900458 if (firstWordLength == 0 || secondWordLength == 0) {
459 return 0;
460 }
461 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
462 int tempFirstFreq = firstFreq;
463 multiplyRate(firstDemotionRate, &tempFirstFreq);
464
465 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
466 int tempSecondFreq = secondFreq;
467 multiplyRate(secondDemotionRate, &tempSecondFreq);
468
469 const int totalLength = firstWordLength + secondWordLength;
470
471 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
472 // length.
473 int totalFreq = tempFirstFreq + tempSecondFreq;
474
475 // This is a workaround to try offsetting the not-enough-demotion which will be done in
476 // calcNormalizedScore in Utils.java.
477 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
478 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
479 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
480 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
481 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
482
483 // At this moment, totalFreq is calculated by the following formula:
484 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
485 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
486
satokb2e5e592011-04-26 14:50:54 +0900487 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900488
489 // This is another workaround to offset the demotion which will be done in
490 // calcNormalizedScore in Utils.java.
491 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
492 // the same amount because we already have adjusted the synthetic freq of this "missing or
493 // mistyped space" suggestion candidate above in this method.
494 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
495 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
496
satokd8db9f82011-05-18 15:31:04 +0900497 if (isSpaceProximity) {
498 // A word pair with one space proximity correction
499 if (DEBUG_DICT) {
500 LOGI("Found a word pair with space proximity correction.");
501 }
502 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
503 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
504 }
505
satok4c981d32011-04-19 13:58:42 +0900506 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
507 return totalFreq;
508}
509
satok817e5172011-03-04 06:06:45 -0800510bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
511 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
satokd8db9f82011-05-18 15:31:04 +0900512 const int secondWordLength, const bool isSpaceProximity) {
satok817e5172011-03-04 06:06:45 -0800513 if (inputLength >= MAX_WORD_LENGTH) return false;
514 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
satok3c4bb772011-03-04 22:50:19 -0800515 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
satok817e5172011-03-04 06:06:45 -0800516 return false;
517 const int newWordLength = firstWordLength + secondWordLength + 1;
satok662fe692010-12-08 17:05:39 +0900518 // Allocating variable length array on stack
519 unsigned short word[newWordLength];
satok817e5172011-03-04 06:06:45 -0800520 const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord);
Ken Wakasade3070a2011-03-19 09:16:42 +0900521 if (DEBUG_DICT) {
522 LOGI("First freq: %d", firstFreq);
523 }
satokaee09dc2010-12-09 19:21:51 +0900524 if (firstFreq <= 0) return false;
525
satok817e5172011-03-04 06:06:45 -0800526 for (int i = 0; i < firstWordLength; ++i) {
satokaee09dc2010-12-09 19:21:51 +0900527 word[i] = mWord[i];
satok662fe692010-12-08 17:05:39 +0900528 }
satokaee09dc2010-12-09 19:21:51 +0900529
satok817e5172011-03-04 06:06:45 -0800530 const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord);
Ken Wakasade3070a2011-03-19 09:16:42 +0900531 if (DEBUG_DICT) {
532 LOGI("Second freq: %d", secondFreq);
533 }
satokaee09dc2010-12-09 19:21:51 +0900534 if (secondFreq <= 0) return false;
535
satok817e5172011-03-04 06:06:45 -0800536 word[firstWordLength] = SPACE;
537 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
538 word[i] = mWord[i - firstWordLength - 1];
satok662fe692010-12-08 17:05:39 +0900539 }
satokaee09dc2010-12-09 19:21:51 +0900540
satokd8db9f82011-05-18 15:31:04 +0900541 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
542 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
satoka4374d22011-04-18 11:40:22 +0900543 if (DEBUG_DICT) {
satokb2e5e592011-04-26 14:50:54 +0900544 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
satoka4374d22011-04-18 11:40:22 +0900545 TYPED_LETTER_MULTIPLIER);
546 }
satok662fe692010-12-08 17:05:39 +0900547 addWord(word, newWordLength, pairFreq);
548 return true;
549}
550
satok817e5172011-03-04 06:06:45 -0800551bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
552 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900553 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800554}
555
556bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
557 return getSplitTwoWordsSuggestion(
558 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900559 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800560}
561
satok662fe692010-12-08 17:05:39 +0900562// Keep this for comparing spec to new getWords
563void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900564 const int excessivePos, const int transposedPos,int *nextLetters,
565 const int nextLettersSize) {
satok662fe692010-12-08 17:05:39 +0900566 int initialPosition = initialPos;
Jean Chalard293ece02011-06-16 20:55:16 +0900567 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
satok662fe692010-12-08 17:05:39 +0900568 getWordsRec(count, initialPosition, 0,
569 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
satoka3d78f62010-12-09 22:08:33 +0900570 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
571 nextLettersSize);
satok662fe692010-12-08 17:05:39 +0900572}
573
satok68319262010-12-03 19:38:08 +0900574void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900575 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
576 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
577 const int transposedPos, int *nextLetters, const int nextLettersSize) {
satok48e432c2010-12-06 17:38:58 +0900578 int siblingPos = pos;
satok68319262010-12-03 19:38:08 +0900579 for (int i = 0; i < childrenCount; ++i) {
satok48e432c2010-12-06 17:38:58 +0900580 int newCount;
581 int newChildPosition;
satokd2997922010-12-07 13:08:39 +0900582 const int newDepth = depth + 1;
satok48e432c2010-12-06 17:38:58 +0900583 bool newTraverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900584 int newMatchRate;
satok48e432c2010-12-06 17:38:58 +0900585 int newInputIndex;
586 int newDiffs;
587 int newSiblingPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900588 int newOutputIndex;
satok48e432c2010-12-06 17:38:58 +0900589 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900590 traverseAllNodes, matchWeight, inputIndex, diffs,
591 skipPos, excessivePos, transposedPos,
satoka3d78f62010-12-09 22:08:33 +0900592 nextLetters, nextLettersSize,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900593 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
Jean Chalard17e44a72011-06-16 22:51:11 +0900594 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
satok48e432c2010-12-06 17:38:58 +0900595 siblingPos = newSiblingPos;
satok30088252010-12-01 21:22:15 +0900596
satok48e432c2010-12-06 17:38:58 +0900597 if (needsToTraverseChildrenNodes) {
598 getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900599 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
satoka3d78f62010-12-09 22:08:33 +0900600 nextLetters, nextLettersSize);
satok30088252010-12-01 21:22:15 +0900601 }
602 }
603}
604
satok58c49b92011-01-27 03:23:39 +0900605inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900606 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900607 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900608 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900609 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900610 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900611 if (mInputLength >= 2) {
612 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
613 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
614 / (10 * mInputLength
615 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900616 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900617 LOGI("Demotion rate for missing character is %d.", demotionRate);
618 }
satokdc5301e2011-04-11 16:14:45 +0900619 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900620 } else {
621 finalFreq = 0;
622 }
623 }
satokf7425bb2011-01-05 16:37:53 +0900624 if (transposedPos >= 0) multiplyRate(
625 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900626 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900627 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900628 if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
satokf7425bb2011-01-05 16:37:53 +0900629 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900630 }
631 }
satok58c49b92011-01-27 03:23:39 +0900632 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900633 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900634 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900635 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900636 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900637 if (DEBUG_DICT) {
638 LOGI("Found full matched word.");
639 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900640 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
641 }
642 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900643 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900644 }
satok9674f652011-04-20 17:15:27 +0900645 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900646 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900647 if (DEBUG_DICT) {
648 LOGI("Found one proximity correction.");
649 }
satokb2e5e592011-04-26 14:50:54 +0900650 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900651 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900652 }
satok9674f652011-04-20 17:15:27 +0900653 if (DEBUG_DICT) {
654 LOGI("calc: %d, %d", depth, sameLength);
655 }
satokb2e5e592011-04-26 14:50:54 +0900656 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900657 return finalFreq;
658}
satoka3d78f62010-12-09 22:08:33 +0900659
satok54fe9e02010-12-13 14:42:35 +0900660inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
Jean Chalardf5f834a2011-02-22 15:12:46 +0900661 unsigned short *word, const int inputIndex, const int depth, const int matchWeight,
satok54fe9e02010-12-13 14:42:35 +0900662 int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,
663 const int transposedPos, const int freq) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900664 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, excessivePos,
satok58c49b92011-01-27 03:23:39 +0900665 transposedPos, freq, false);
satoka3d78f62010-12-09 22:08:33 +0900666 if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900667 if (depth >= mInputLength && skipPos < 0) {
satok715514d2010-12-02 20:19:59 +0900668 registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
669 }
670}
671
672inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
Jean Chalardf5f834a2011-02-22 15:12:46 +0900673 unsigned short *word, const int inputIndex, const int depth, const int matchWeight,
Jean Chalard8dc754a2011-01-27 14:20:22 +0900674 const int skipPos, const int excessivePos, const int transposedPos, const int freq) {
satok54fe9e02010-12-13 14:42:35 +0900675 if (sameAsTyped(word, depth + 1)) return;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900676 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
satok54fe9e02010-12-13 14:42:35 +0900677 excessivePos, transposedPos, freq, true);
678 // Proximity collection will promote a word of the same length as what user typed.
679 if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
satok715514d2010-12-02 20:19:59 +0900680}
satok28bd03b2010-12-03 16:39:16 +0900681
682inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900683 const int inputIndex, const int skipPos, const int depth) {
satok8fbd5522011-02-22 17:28:55 +0900684 const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
satok28bd03b2010-12-03 16:39:16 +0900685 // Skip the ' or other letter and continue deeper
686 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
687}
688
satoke07baa62010-12-09 21:55:40 +0900689inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
Jean Chalard07a84062011-03-03 10:22:10 +0900690 const int inputLength) const {
satoke07baa62010-12-09 21:55:40 +0900691 if (inputIndex < 0 || inputIndex >= inputLength) return false;
692 const int currentChar = *getInputCharsAt(inputIndex);
693 const int leftIndex = inputIndex - 1;
694 if (leftIndex >= 0) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900695 const int *leftChars = getInputCharsAt(leftIndex);
satoke07baa62010-12-09 21:55:40 +0900696 int i = 0;
697 while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
698 if (leftChars[i++] == currentChar) return true;
699 }
700 }
701 const int rightIndex = inputIndex + 1;
702 if (rightIndex < inputLength) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900703 const int *rightChars = getInputCharsAt(rightIndex);
satoke07baa62010-12-09 21:55:40 +0900704 int i = 0;
705 while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
706 if (rightChars[i++] == currentChar) return true;
707 }
708 }
709 return false;
710}
711
Jean Chalarda5d58492011-02-18 17:50:58 +0900712
713// In the following function, c is the current character of the dictionary word
714// currently examined.
715// currentChars is an array containing the keys close to the character the
716// user actually typed at the same position. We want to see if c is in it: if so,
717// then the word contains at that position a character close to what the user
718// typed.
719// What the user typed is actually the first character of the array.
720// Notice : accented characters do not have a proximity list, so they are alone
721// in their list. The non-accented version of the character should be considered
722// "close", but not the other keys close to the non-accented version.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900723inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
724 const int *currentChars, const unsigned short c, const int skipPos,
725 const int excessivePos, const int transposedPos) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900726 const unsigned short baseLowerC = toBaseLowerCase(c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900727
728 // The first char in the array is what user typed. If it matches right away,
729 // that means the user typed that same char for this pos.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900730 if (currentChars[0] == baseLowerC || currentChars[0] == c)
Jean Chalarda5d58492011-02-18 17:50:58 +0900731 return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
732
733 // If one of those is true, we should not check for close characters at all.
734 if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
735 return UNRELATED_CHAR;
736
737 // If the non-accented, lowercased version of that first character matches c,
738 // then we have a non-accented version of the accented character the user
739 // typed. Treat it as a close char.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900740 if (toBaseLowerCase(currentChars[0]) == baseLowerC)
Jean Chalarda5d58492011-02-18 17:50:58 +0900741 return NEAR_PROXIMITY_CHAR;
742
743 // Not an exact nor an accent-alike match: search the list of close keys
744 int j = 1;
satoke07baa62010-12-09 21:55:40 +0900745 while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900746 const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900747 if (matched) return NEAR_PROXIMITY_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900748 ++j;
749 }
Jean Chalarda5d58492011-02-18 17:50:58 +0900750
751 // Was not included, signal this as an unrelated character.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900752 return UNRELATED_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900753}
754
satok48e432c2010-12-06 17:38:58 +0900755inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900756 const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,
satoka3d78f62010-12-09 22:08:33 +0900757 const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
758 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900759 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900760 int *nextSiblingPosition, int *nextOutputIndex) {
satoka3d78f62010-12-09 22:08:33 +0900761 if (DEBUG_DICT) {
762 int inputCount = 0;
763 if (skipPos >= 0) ++inputCount;
764 if (excessivePos >= 0) ++inputCount;
765 if (transposedPos >= 0) ++inputCount;
766 assert(inputCount <= 1);
767 }
satok48e432c2010-12-06 17:38:58 +0900768 unsigned short c;
769 int childPosition;
770 bool terminal;
771 int freq;
satokfd16f1d2011-01-27 16:25:16 +0900772 bool isSameAsUserTypedLength = false;
satokcdbbea72010-12-08 16:04:16 +0900773
satokfd16f1d2011-01-27 16:25:16 +0900774 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
satokcdbbea72010-12-08 16:04:16 +0900775
Jean Chalard293ece02011-06-16 20:55:16 +0900776 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
777 &c, &childPosition, &terminal, &freq);
Jean Chalard17e44a72011-06-16 22:51:11 +0900778 *nextOutputIndex = depth + 1;
satok48e432c2010-12-06 17:38:58 +0900779
780 const bool needsToTraverseChildrenNodes = childPosition != 0;
781
782 // If we are only doing traverseAllNodes, no need to look at the typed characters.
783 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
784 mWord[depth] = c;
785 if (traverseAllNodes && terminal) {
satok54fe9e02010-12-13 14:42:35 +0900786 onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, inputIndex, depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900787 matchWeight, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos,
788 freq);
satok48e432c2010-12-06 17:38:58 +0900789 }
790 if (!needsToTraverseChildrenNodes) return false;
791 *newTraverseAllNodes = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900792 *newMatchRate = matchWeight;
satok48e432c2010-12-06 17:38:58 +0900793 *newDiffs = diffs;
794 *newInputIndex = inputIndex;
satok48e432c2010-12-06 17:38:58 +0900795 } else {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900796 const int *currentChars = getInputCharsAt(inputIndex);
satoka3d78f62010-12-09 22:08:33 +0900797
798 if (transposedPos >= 0) {
799 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
800 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
801 }
802
803 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
804 transposedPos);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900805 if (UNRELATED_CHAR == matchedProximityCharId) return false;
satok48e432c2010-12-06 17:38:58 +0900806 mWord[depth] = c;
807 // If inputIndex is greater than mInputLength, that means there is no
808 // proximity chars. So, we don't need to check proximity.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900809 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
satokb2e5e592011-04-26 14:50:54 +0900810 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900811 }
satokfd16f1d2011-01-27 16:25:16 +0900812 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
813 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
satok48e432c2010-12-06 17:38:58 +0900814 if (isSameAsUserTypedLength && terminal) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900815 onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, matchWeight,
Jean Chalard8dc754a2011-01-27 14:20:22 +0900816 skipPos, excessivePos, transposedPos, freq);
satok48e432c2010-12-06 17:38:58 +0900817 }
818 if (!needsToTraverseChildrenNodes) return false;
819 // Start traversing all nodes after the index exceeds the user typed length
820 *newTraverseAllNodes = isSameAsUserTypedLength;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900821 *newMatchRate = matchWeight;
Jean Chalard8dc754a2011-01-27 14:20:22 +0900822 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
satok48e432c2010-12-06 17:38:58 +0900823 *newInputIndex = inputIndex + 1;
satok48e432c2010-12-06 17:38:58 +0900824 }
825 // Optimization: Prune out words that are too long compared to how much was typed.
satokd2997922010-12-07 13:08:39 +0900826 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
satok48e432c2010-12-06 17:38:58 +0900827 return false;
828 }
829
830 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
satokfd16f1d2011-01-27 16:25:16 +0900831 // TODO: Check if this can be isSameAsUserTypedLength only.
832 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
satok48e432c2010-12-06 17:38:58 +0900833 *newTraverseAllNodes = true;
834 }
835 // get the count of nodes and increment childAddress.
Jean Chalard293ece02011-06-16 20:55:16 +0900836 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
satok48e432c2010-12-06 17:38:58 +0900837 *newChildPosition = childPosition;
838 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
839 return needsToTraverseChildrenNodes;
840}
841
satokaee09dc2010-12-09 19:21:51 +0900842inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,
843 unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900844 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900845 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900846 int maxFreq = 0;
847 int depth = 0;
848 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900849 bool terminal = false;
850
satokaee09dc2010-12-09 19:21:51 +0900851 mStackChildCount[0] = count;
852 mStackSiblingPos[0] = pos;
853
854 while (depth >= 0) {
855 if (mStackChildCount[depth] > 0) {
856 --mStackChildCount[depth];
857 int firstChildPos;
858 int newFreq;
859 int siblingPos = mStackSiblingPos[depth];
860 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
861 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
862 &siblingPos);
863 mStackSiblingPos[depth] = siblingPos;
864 if (depth == (inputLength - 1)) {
865 // Traverse sibling node
866 if (terminal) {
867 if (newFreq > maxFreq) {
868 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
869 if (DEBUG_DICT && DEBUG_NODE) {
870 char s[inputLength + 1];
871 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
872 s[inputLength] = 0;
873 LOGI("New missing space word found: %d > %d (%s), %d, %d",
874 newFreq, maxFreq, s, inputLength, depth);
875 }
876 maxFreq = newFreq;
877 }
878 }
879 } else if (needsToTraverseChildrenNodes) {
880 // Traverse children nodes
881 ++depth;
882 mStackChildCount[depth] = count;
883 mStackSiblingPos[depth] = firstChildPos;
884 }
885 } else {
886 // Traverse parent node
887 --depth;
satok662fe692010-12-08 17:05:39 +0900888 }
889 }
satokaee09dc2010-12-09 19:21:51 +0900890
891 word[inputLength] = 0;
892 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900893}
894
895inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900896 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
897 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
898 const int inputIndex = startInputIndex + depth;
satok8fbd5522011-02-22 17:28:55 +0900899 const int *currentChars = getInputCharsAt(inputIndex);
satok662fe692010-12-08 17:05:39 +0900900 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900901 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
902 &c, newChildPosition, newTerminal, newFreq);
satokaee09dc2010-12-09 19:21:51 +0900903 const unsigned int inputC = currentChars[0];
Ken Wakasade3070a2011-03-19 09:16:42 +0900904 if (DEBUG_DICT) {
905 assert(inputC <= U_SHORT_MAX);
906 }
Jean Chalardf5f834a2011-02-22 15:12:46 +0900907 const unsigned short baseLowerC = toBaseLowerCase(c);
908 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900909 const bool hasChild = *newChildPosition != 0;
910 if (matched) {
911 word[depth] = c;
912 if (DEBUG_DICT && DEBUG_NODE) {
913 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900914 if (*newTerminal) {
915 LOGI("Terminal %d", *newFreq);
916 }
satok662fe692010-12-08 17:05:39 +0900917 }
satokaee09dc2010-12-09 19:21:51 +0900918 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900919 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900920 return true;
921 } else {
922 return false;
923 }
924 } else {
925 // If this node is not user typed character, this method treats this word as unmatched.
926 // Thus newTerminal shouldn't be true.
927 *newTerminal = false;
928 return false;
satok662fe692010-12-08 17:05:39 +0900929 }
satok662fe692010-12-08 17:05:39 +0900930}
Jean Chalard8124e642011-06-16 22:33:41 +0900931
932// TODO: use uint32_t instead of unsigned short
933bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
934 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900935 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900936 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900937 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900938 }
939}
940
Jean Chalard17e44a72011-06-16 22:51:11 +0900941
942// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900943int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
944 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900945 // returns address of bigram data of that word
946 // return -99 if not found
947
948 int count = Dictionary::getCount(DICT_ROOT, &pos);
949 unsigned short currentChar = (unsigned short) word[offset];
950 for (int j = 0; j < count; j++) {
951 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
952 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
953 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
954 if (c == currentChar) {
955 if (offset == length - 1) {
956 if (terminal) {
957 return (pos+1);
958 }
959 } else {
960 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900961 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900962 if (t > 0) {
963 return t;
964 }
965 }
966 }
967 }
968 if (terminal) {
969 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
970 }
971 // There could be two instances of each alphabet - upper and lower case. So continue
972 // looking ...
973 }
974 return NOT_VALID_WORD;
975}
976
satok30088252010-12-01 21:22:15 +0900977} // namespace latinime