blob: 290e9f997557f1bd0702034ec7508e31e6fbbcaa [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "basechars.h"
24#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090025#include "dictionary.h"
26#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090027
28namespace latinime {
29
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090030const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
31 { { 'a', 'e' },
32 { 'o', 'e' },
33 { 'u', 'e' } };
34
Jean Chalard293ece02011-06-16 20:55:16 +090035// TODO: check the header
36UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090037 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090038 const bool isLatestDictVersion)
Jean Chalard293ece02011-06-16 20:55:16 +090039 : DICT_ROOT(streamStart),
40 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090041 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
42 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090043 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalarda787dba2011-03-04 12:17:48 +090044 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
45 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090046 if (DEBUG_DICT) {
47 LOGI("UnigramDictionary - constructor");
48 }
satok30088252010-12-01 21:22:15 +090049}
50
satok18c28f42010-12-02 18:11:54 +090051UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090052
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090053static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
54 const int MAX_PROXIMITY_CHARS) {
55 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
56}
57
58bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
59
60 // There can't be a digraph if we don't have at least 2 characters to examine
61 if (i + 2 > codesSize) return false;
62
63 // Search for the first char of some digraph
64 int lastDigraphIndex = -1;
65 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
66 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
67 lastDigraphIndex >= 0; --lastDigraphIndex) {
68 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
69 }
70 // No match: return early
71 if (lastDigraphIndex < 0) return false;
72
73 // It's an interesting digraph if the second char matches too.
74 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
75}
76
77// Mostly the same arguments as the non-recursive version, except:
78// codes is the original value. It points to the start of the work buffer, and gets passed as is.
79// codesSize is the size of the user input (thus, it is the size of codesSrc).
80// codesDest is the current point in the work buffer.
81// codesSrc is the current point in the user-input, original, content-unmodified buffer.
82// codesRemain is the remaining size in codesSrc.
83void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
84 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
85 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080086 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090087
Jean Chalarda787dba2011-03-04 12:17:48 +090088 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
89 for (int i = 0; i < codesRemain; ++i) {
90 if (isDigraph(codesSrc, i, codesRemain)) {
91 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090092
Jean Chalarda787dba2011-03-04 12:17:48 +090093 // Copy the word up to the first char of the digraph, then continue processing
94 // on the remaining part of the word, skipping the second char of the digraph.
95 // In our example, copy "pru" and continue running on "fen"
96 // Make i the index of the second char of the digraph for simplicity. Forgetting
97 // to do that results in an infinite recursion so take care!
98 ++i;
99 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
100 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
101 codesBuffer, codesBufferSize, flags,
102 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
103 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
104 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900105
Jean Chalarda787dba2011-03-04 12:17:48 +0900106 // Copy the second char of the digraph in place, then continue processing on
107 // the remaining part of the word.
108 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
109 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
110 BYTES_IN_ONE_CHAR);
111 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
112 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
113 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
114 outWords, frequencies);
115 return;
116 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900117 }
118 }
119
120 // If we come here, we hit the end of the word: let's check it against the dictionary.
121 // In our example, we'll come here once for "prufen" and then once for "pruefen".
122 // If the word contains several digraphs, we'll come it for the product of them.
123 // eg. if the word is "ueberpruefen" we'll test, in order, against
124 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
125 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
126 if (0 != remainingBytes)
127 memcpy(codesDest, codesSrc, remainingBytes);
128
129 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
130 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
131}
132
133int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
134 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
135 unsigned short *outWords, int *frequencies) {
136
137 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
138 { // Incrementally tune the word and try all possibilities
139 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
140 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900141 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900142 } else { // Normal processing
143 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
144 outWords, frequencies);
145 }
146
satok817e5172011-03-04 06:06:45 -0800147 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900148 // Get the word count
149 int suggestedWordsCount = 0;
150 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
151 suggestedWordsCount++;
152 }
153
154 if (DEBUG_DICT) {
155 LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard980d6b62011-06-30 17:02:23 +0900156 /// Print the returned words
157 for (int j = 0; j < suggestedWordsCount; ++j) {
158 short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
159 char s[MAX_WORD_LENGTH];
160 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
161 LOGI("%s %i", s, mFrequencies[j]);
162 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900163 LOGI("Next letters: ");
164 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
165 if (mNextLettersFrequency[k] > 0) {
166 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
167 }
168 }
169 }
satok817e5172011-03-04 06:06:45 -0800170 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900171 PROF_CLOSE;
172 return suggestedWordsCount;
173}
174
175void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
176 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
177 unsigned short *outWords, int *frequencies) {
178
satok61e2f852011-01-05 14:13:07 +0900179 PROF_OPEN;
180 PROF_START(0);
satok30088252010-12-01 21:22:15 +0900181 initSuggestions(codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900182 if (DEBUG_DICT) assert(codesSize == mInputLength);
183
satoka3d78f62010-12-09 22:08:33 +0900184 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900185 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900186
satok61e2f852011-01-05 14:13:07 +0900187 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900188 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900189 PROF_END(1);
190
191 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900192 // Suggestion with missing character
193 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900194 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900195 if (DEBUG_DICT) {
196 LOGI("--- Suggest missing characters %d", i);
197 }
satok54fe9e02010-12-13 14:42:35 +0900198 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900199 }
200 }
satok61e2f852011-01-05 14:13:07 +0900201 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900202
satok61e2f852011-01-05 14:13:07 +0900203 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900204 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900205 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
206 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900207 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900208 if (DEBUG_DICT) {
209 LOGI("--- Suggest excessive characters %d", i);
210 }
satok54fe9e02010-12-13 14:42:35 +0900211 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900212 }
213 }
satok61e2f852011-01-05 14:13:07 +0900214 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900215
satok61e2f852011-01-05 14:13:07 +0900216 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900217 // Suggestion with transposed characters
218 // Only suggest words that length is mInputLength
219 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
220 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900221 if (DEBUG_DICT) {
222 LOGI("--- Suggest transposed characters %d", i);
223 }
satok54fe9e02010-12-13 14:42:35 +0900224 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900225 }
226 }
satok61e2f852011-01-05 14:13:07 +0900227 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900228
satok61e2f852011-01-05 14:13:07 +0900229 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900230 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900231 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
232 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900233 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900234 if (DEBUG_DICT) {
235 LOGI("--- Suggest missing space characters %d", i);
236 }
satok662fe692010-12-08 17:05:39 +0900237 getMissingSpaceWords(mInputLength, i);
238 }
239 }
satok61e2f852011-01-05 14:13:07 +0900240 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800241
242 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900243 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800244 // The first and last "mistyped spaces" are taken care of by excessive character handling
245 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900246 if (DEBUG_DICT) {
247 LOGI("--- Suggest words with proximity space %d", i);
248 }
satok817e5172011-03-04 06:06:45 -0800249 const int x = xcoordinates[i];
250 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900251 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800252 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
253 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900254 }
satok817e5172011-03-04 06:06:45 -0800255 if (proximityInfo->hasSpaceProximity(x, y)) {
256 getMistypedSpaceWords(mInputLength, i);
257 }
satok817e5172011-03-04 06:06:45 -0800258 }
259 }
260 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900261}
262
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900263void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
264 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900265 if (DEBUG_DICT) {
266 LOGI("initSuggest");
267 }
satok30088252010-12-01 21:22:15 +0900268 mFrequencies = frequencies;
269 mOutputChars = outWords;
270 mInputCodes = codes;
271 mInputLength = codesSize;
272 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
273}
274
Jean Chalard8124e642011-06-16 22:33:41 +0900275static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900276 if (c < nextLettersSize) {
277 nextLetters[c]++;
278 }
279}
280
satok662fe692010-12-08 17:05:39 +0900281// TODO: We need to optimize addWord by using STL or something
Jean Chalardca5ef282011-06-17 15:36:26 +0900282// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok28bd03b2010-12-03 16:39:16 +0900283bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900284 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900285 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
satok30088252010-12-01 21:22:15 +0900286 char s[length + 1];
287 for (int i = 0; i <= length; i++) s[i] = word[i];
satok662fe692010-12-08 17:05:39 +0900288 LOGI("Found word = %s, freq = %d", s, frequency);
satok30088252010-12-01 21:22:15 +0900289 }
satokf5cded12010-12-06 21:28:24 +0900290 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900291 if (DEBUG_DICT) {
292 LOGI("Exceeded max word length.");
293 }
satokf5cded12010-12-06 21:28:24 +0900294 return false;
295 }
satok30088252010-12-01 21:22:15 +0900296
297 // Find the right insertion point
298 int insertAt = 0;
299 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900300 // TODO: How should we sort words with the same frequency?
301 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900302 break;
303 }
304 insertAt++;
305 }
306 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900307 if (DEBUG_DICT) {
308 char s[length + 1];
309 for (int i = 0; i <= length; i++) s[i] = word[i];
satokb2e5e592011-04-26 14:50:54 +0900310 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satokcdbbea72010-12-08 16:04:16 +0900311 }
satok30088252010-12-01 21:22:15 +0900312 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
313 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
314 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
315 mFrequencies[insertAt] = frequency;
316 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900317 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900318 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900319 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900320 while (length--) {
321 *dest++ = *word++;
322 }
323 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900324 if (DEBUG_DICT) {
325 LOGI("Added word at %d", insertAt);
326 }
satok30088252010-12-01 21:22:15 +0900327 return true;
328 }
329 return false;
330}
331
Jean Chalard8124e642011-06-16 22:33:41 +0900332static inline unsigned short toBaseLowerCase(unsigned short c) {
satok30088252010-12-01 21:22:15 +0900333 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
334 c = BASE_CHARS[c];
335 }
336 if (c >='A' && c <= 'Z') {
337 c |= 32;
338 } else if (c > 127) {
339 c = latin_tolower(c);
340 }
341 return c;
342}
343
Jean Chalardca5ef282011-06-17 15:36:26 +0900344bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {
satok30088252010-12-01 21:22:15 +0900345 if (length != mInputLength) {
346 return false;
347 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900348 const int *inputCodes = mInputCodes;
satok30088252010-12-01 21:22:15 +0900349 while (length--) {
350 if ((unsigned int) *inputCodes != (unsigned int) *word) {
351 return false;
352 }
satok662fe692010-12-08 17:05:39 +0900353 inputCodes += MAX_PROXIMITY_CHARS;
satok30088252010-12-01 21:22:15 +0900354 word++;
355 }
356 return true;
357}
358
satok715514d2010-12-02 20:19:59 +0900359static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900360static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900361
satok54fe9e02010-12-13 14:42:35 +0900362void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900363 const int excessivePos, const int transposedPos, int *nextLetters,
364 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900365 if (DEBUG_DICT) {
366 LOGI("getSuggestionCandidates %d", maxDepth);
367 assert(transposedPos + 1 < mInputLength);
368 assert(excessivePos < mInputLength);
369 assert(missingPos < mInputLength);
370 }
satok662fe692010-12-08 17:05:39 +0900371 int rootPosition = ROOT_POS;
Jean Chalard980d6b62011-06-30 17:02:23 +0900372 // Get the number of children of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900373 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900374 int depth = 0;
375
376 mStackChildCount[0] = childCount;
377 mStackTraverseAll[0] = (mInputLength <= 0);
378 mStackNodeFreq[0] = 1;
379 mStackInputIndex[0] = 0;
380 mStackDiffs[0] = 0;
381 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900382 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900383
satok662fe692010-12-08 17:05:39 +0900384 // Depth first search
satokd2997922010-12-07 13:08:39 +0900385 while (depth >= 0) {
386 if (mStackChildCount[depth] > 0) {
387 --mStackChildCount[depth];
388 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900389 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900390 int inputIndex = mStackInputIndex[depth];
391 int diffs = mStackDiffs[depth];
392 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900393 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900394 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900395 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900396 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900397 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900398 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
399 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
400 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900401 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900402 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900403 mStackSiblingPos[depth] = siblingPos;
404 if (needsToTraverseChildrenNodes) {
405 // Goes to child node
406 ++depth;
407 mStackChildCount[depth] = childCount;
408 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900409 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900410 mStackInputIndex[depth] = inputIndex;
411 mStackDiffs[depth] = diffs;
412 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900413 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900414 }
415 } else {
satokcdbbea72010-12-08 16:04:16 +0900416 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900417 --depth;
418 }
419 }
420}
421
satokb2e5e592011-04-26 14:50:54 +0900422static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
423static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
424 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
425}
426
427static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
428inline static void multiplyIntCapped(const int multiplier, int *base) {
429 const int temp = *base;
430 if (temp != S_INT_MAX) {
431 // Branch if multiplier == 2 for the optimization
432 if (multiplier == 2) {
433 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
434 } else {
435 const int tempRetval = temp * multiplier;
436 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
437 }
438 }
439}
440
441inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900442 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900443 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900444 } else {
satokb2e5e592011-04-26 14:50:54 +0900445 int ret = base;
446 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
447 return ret;
448 }
449}
450
451inline static void multiplyRate(const int rate, int *freq) {
452 if (*freq != S_INT_MAX) {
453 if (*freq > 1000000) {
454 *freq /= 100;
455 multiplyIntCapped(rate, freq);
456 } else {
457 multiplyIntCapped(rate, freq);
458 *freq /= 100;
459 }
satokf7425bb2011-01-05 16:37:53 +0900460 }
461}
462
satok4c981d32011-04-19 13:58:42 +0900463inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900464 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
465 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900466 if (firstWordLength == 0 || secondWordLength == 0) {
467 return 0;
468 }
469 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
470 int tempFirstFreq = firstFreq;
471 multiplyRate(firstDemotionRate, &tempFirstFreq);
472
473 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
474 int tempSecondFreq = secondFreq;
475 multiplyRate(secondDemotionRate, &tempSecondFreq);
476
477 const int totalLength = firstWordLength + secondWordLength;
478
479 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
480 // length.
481 int totalFreq = tempFirstFreq + tempSecondFreq;
482
483 // This is a workaround to try offsetting the not-enough-demotion which will be done in
484 // calcNormalizedScore in Utils.java.
485 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
486 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
487 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
488 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
489 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
490
491 // At this moment, totalFreq is calculated by the following formula:
492 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
493 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
494
satokb2e5e592011-04-26 14:50:54 +0900495 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900496
497 // This is another workaround to offset the demotion which will be done in
498 // calcNormalizedScore in Utils.java.
499 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
500 // the same amount because we already have adjusted the synthetic freq of this "missing or
501 // mistyped space" suggestion candidate above in this method.
502 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
503 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
504
satokd8db9f82011-05-18 15:31:04 +0900505 if (isSpaceProximity) {
506 // A word pair with one space proximity correction
507 if (DEBUG_DICT) {
508 LOGI("Found a word pair with space proximity correction.");
509 }
510 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
511 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
512 }
513
satok4c981d32011-04-19 13:58:42 +0900514 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
515 return totalFreq;
516}
517
satok817e5172011-03-04 06:06:45 -0800518bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
519 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900520 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800521}
522
523bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
524 return getSplitTwoWordsSuggestion(
525 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900526 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800527}
528
satok58c49b92011-01-27 03:23:39 +0900529inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900530 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900531 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900532 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900533 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900534 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900535 if (mInputLength >= 2) {
536 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
537 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
538 / (10 * mInputLength
539 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900540 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900541 LOGI("Demotion rate for missing character is %d.", demotionRate);
542 }
satokdc5301e2011-04-11 16:14:45 +0900543 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900544 } else {
545 finalFreq = 0;
546 }
547 }
satokf7425bb2011-01-05 16:37:53 +0900548 if (transposedPos >= 0) multiplyRate(
549 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900550 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900551 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900552 if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
satokf7425bb2011-01-05 16:37:53 +0900553 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900554 }
555 }
satok58c49b92011-01-27 03:23:39 +0900556 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900557 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900558 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900559 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900560 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900561 if (DEBUG_DICT) {
562 LOGI("Found full matched word.");
563 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900564 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
565 }
566 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900567 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900568 }
satok9674f652011-04-20 17:15:27 +0900569 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900570 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900571 if (DEBUG_DICT) {
572 LOGI("Found one proximity correction.");
573 }
satokb2e5e592011-04-26 14:50:54 +0900574 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900575 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900576 }
satok9674f652011-04-20 17:15:27 +0900577 if (DEBUG_DICT) {
578 LOGI("calc: %d, %d", depth, sameLength);
579 }
satokb2e5e592011-04-26 14:50:54 +0900580 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900581 return finalFreq;
582}
satoka3d78f62010-12-09 22:08:33 +0900583
satok28bd03b2010-12-03 16:39:16 +0900584inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900585 const int inputIndex, const int skipPos, const int depth) {
satok8fbd5522011-02-22 17:28:55 +0900586 const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
satok28bd03b2010-12-03 16:39:16 +0900587 // Skip the ' or other letter and continue deeper
588 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
589}
590
satoke07baa62010-12-09 21:55:40 +0900591inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
Jean Chalard07a84062011-03-03 10:22:10 +0900592 const int inputLength) const {
satoke07baa62010-12-09 21:55:40 +0900593 if (inputIndex < 0 || inputIndex >= inputLength) return false;
594 const int currentChar = *getInputCharsAt(inputIndex);
595 const int leftIndex = inputIndex - 1;
596 if (leftIndex >= 0) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900597 const int *leftChars = getInputCharsAt(leftIndex);
satoke07baa62010-12-09 21:55:40 +0900598 int i = 0;
599 while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
600 if (leftChars[i++] == currentChar) return true;
601 }
602 }
603 const int rightIndex = inputIndex + 1;
604 if (rightIndex < inputLength) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900605 const int *rightChars = getInputCharsAt(rightIndex);
satoke07baa62010-12-09 21:55:40 +0900606 int i = 0;
607 while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
608 if (rightChars[i++] == currentChar) return true;
609 }
610 }
611 return false;
612}
613
Jean Chalarda5d58492011-02-18 17:50:58 +0900614// In the following function, c is the current character of the dictionary word
615// currently examined.
616// currentChars is an array containing the keys close to the character the
617// user actually typed at the same position. We want to see if c is in it: if so,
618// then the word contains at that position a character close to what the user
619// typed.
620// What the user typed is actually the first character of the array.
621// Notice : accented characters do not have a proximity list, so they are alone
622// in their list. The non-accented version of the character should be considered
623// "close", but not the other keys close to the non-accented version.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900624inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
625 const int *currentChars, const unsigned short c, const int skipPos,
626 const int excessivePos, const int transposedPos) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900627 const unsigned short baseLowerC = toBaseLowerCase(c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900628
629 // The first char in the array is what user typed. If it matches right away,
630 // that means the user typed that same char for this pos.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900631 if (currentChars[0] == baseLowerC || currentChars[0] == c)
Jean Chalarda5d58492011-02-18 17:50:58 +0900632 return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
633
634 // If one of those is true, we should not check for close characters at all.
635 if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
636 return UNRELATED_CHAR;
637
638 // If the non-accented, lowercased version of that first character matches c,
639 // then we have a non-accented version of the accented character the user
640 // typed. Treat it as a close char.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900641 if (toBaseLowerCase(currentChars[0]) == baseLowerC)
Jean Chalarda5d58492011-02-18 17:50:58 +0900642 return NEAR_PROXIMITY_CHAR;
643
644 // Not an exact nor an accent-alike match: search the list of close keys
645 int j = 1;
satoke07baa62010-12-09 21:55:40 +0900646 while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900647 const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900648 if (matched) return NEAR_PROXIMITY_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900649 ++j;
650 }
Jean Chalarda5d58492011-02-18 17:50:58 +0900651
652 // Was not included, signal this as an unrelated character.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900653 return UNRELATED_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900654}
655
Jean Chalardca5ef282011-06-17 15:36:26 +0900656inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900657 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900658 const int inputIndex, const int matchWeight, const int skipPos,
659 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
660 int* nextLetters, const int nextLettersSize) {
661
662 const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;
Jean Chalard980d6b62011-06-30 17:02:23 +0900663 if (isSameAsTyped) return;
Jean Chalardca5ef282011-06-17 15:36:26 +0900664
665 if (depth >= MIN_SUGGEST_DEPTH) {
666 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
667 excessivePos, transposedPos, freq, sameLength);
668 if (!isSameAsTyped)
669 addWord(word, depth + 1, finalFreq);
Jean Chalardca5ef282011-06-17 15:36:26 +0900670 }
671
672 if (sameLength && depth >= mInputLength && skipPos < 0) {
673 registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);
674 }
675}
676
Jean Chalarde6715e32011-06-30 19:47:25 +0900677bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
678 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
679 const int secondWordLength, const bool isSpaceProximity) {
680 if (inputLength >= MAX_WORD_LENGTH) return false;
681 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
682 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
683 return false;
684 const int newWordLength = firstWordLength + secondWordLength + 1;
685 // Allocating variable length array on stack
686 unsigned short word[newWordLength];
687 const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
688 if (DEBUG_DICT) {
689 LOGI("First freq: %d", firstFreq);
690 }
691 if (firstFreq <= 0) return false;
692
693 for (int i = 0; i < firstWordLength; ++i) {
694 word[i] = mWord[i];
695 }
696
697 const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
698 if (DEBUG_DICT) {
699 LOGI("Second freq: %d", secondFreq);
700 }
701 if (secondFreq <= 0) return false;
702
703 word[firstWordLength] = SPACE;
704 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
705 word[i] = mWord[i - firstWordLength - 1];
706 }
707
708 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
709 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
710 if (DEBUG_DICT) {
711 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
712 TYPED_LETTER_MULTIPLIER);
713 }
714 addWord(word, newWordLength, pairFreq);
715 return true;
716}
717
Jean Chalardbc90c722011-06-20 21:09:04 +0900718#ifndef NEW_DICTIONARY_FORMAT
719// TODO: Don't forget to bring inline functions back to over where they are used.
satokcdbbea72010-12-08 16:04:16 +0900720
Jean Chalardbc90c722011-06-20 21:09:04 +0900721// The following functions will be entirely replaced with new implementations.
722void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
723 const int excessivePos, const int transposedPos,int *nextLetters,
724 const int nextLettersSize) {
725 int initialPosition = initialPos;
726 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
727 getWordsRec(count, initialPosition, 0,
728 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
729 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
730 nextLettersSize);
731}
Jean Chalardca5ef282011-06-17 15:36:26 +0900732
Jean Chalardbc90c722011-06-20 21:09:04 +0900733void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
734 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
735 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
736 const int transposedPos, int *nextLetters, const int nextLettersSize) {
737 int siblingPos = pos;
738 for (int i = 0; i < childrenCount; ++i) {
739 int newCount;
740 int newChildPosition;
741 bool newTraverseAllNodes;
742 int newMatchRate;
743 int newInputIndex;
744 int newDiffs;
745 int newSiblingPos;
746 int newOutputIndex;
747 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
748 traverseAllNodes, matchWeight, inputIndex, diffs,
749 skipPos, excessivePos, transposedPos,
750 nextLetters, nextLettersSize,
751 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
752 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
753 siblingPos = newSiblingPos;
satokcdbbea72010-12-08 16:04:16 +0900754
Jean Chalardbc90c722011-06-20 21:09:04 +0900755 if (needsToTraverseChildrenNodes) {
756 getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
757 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
758 nextLetters, nextLettersSize);
satok48e432c2010-12-06 17:38:58 +0900759 }
satok48e432c2010-12-06 17:38:58 +0900760 }
satok48e432c2010-12-06 17:38:58 +0900761}
762
Jean Chalard980d6b62011-06-30 17:02:23 +0900763inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
764 const int inputLength, unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900765 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900766 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900767 int maxFreq = 0;
768 int depth = 0;
769 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900770 bool terminal = false;
771
satokaee09dc2010-12-09 19:21:51 +0900772 mStackChildCount[0] = count;
773 mStackSiblingPos[0] = pos;
774
775 while (depth >= 0) {
776 if (mStackChildCount[depth] > 0) {
777 --mStackChildCount[depth];
778 int firstChildPos;
779 int newFreq;
780 int siblingPos = mStackSiblingPos[depth];
781 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
782 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
783 &siblingPos);
784 mStackSiblingPos[depth] = siblingPos;
785 if (depth == (inputLength - 1)) {
786 // Traverse sibling node
787 if (terminal) {
788 if (newFreq > maxFreq) {
789 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
790 if (DEBUG_DICT && DEBUG_NODE) {
791 char s[inputLength + 1];
792 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
793 s[inputLength] = 0;
794 LOGI("New missing space word found: %d > %d (%s), %d, %d",
795 newFreq, maxFreq, s, inputLength, depth);
796 }
797 maxFreq = newFreq;
798 }
799 }
800 } else if (needsToTraverseChildrenNodes) {
801 // Traverse children nodes
802 ++depth;
803 mStackChildCount[depth] = count;
804 mStackSiblingPos[depth] = firstChildPos;
805 }
806 } else {
807 // Traverse parent node
808 --depth;
satok662fe692010-12-08 17:05:39 +0900809 }
810 }
satokaee09dc2010-12-09 19:21:51 +0900811
812 word[inputLength] = 0;
813 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900814}
815
816inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900817 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
818 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
819 const int inputIndex = startInputIndex + depth;
satok8fbd5522011-02-22 17:28:55 +0900820 const int *currentChars = getInputCharsAt(inputIndex);
satok662fe692010-12-08 17:05:39 +0900821 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900822 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
823 &c, newChildPosition, newTerminal, newFreq);
satokaee09dc2010-12-09 19:21:51 +0900824 const unsigned int inputC = currentChars[0];
Ken Wakasade3070a2011-03-19 09:16:42 +0900825 if (DEBUG_DICT) {
826 assert(inputC <= U_SHORT_MAX);
827 }
Jean Chalardf5f834a2011-02-22 15:12:46 +0900828 const unsigned short baseLowerC = toBaseLowerCase(c);
829 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900830 const bool hasChild = *newChildPosition != 0;
831 if (matched) {
832 word[depth] = c;
833 if (DEBUG_DICT && DEBUG_NODE) {
834 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900835 if (*newTerminal) {
836 LOGI("Terminal %d", *newFreq);
837 }
satok662fe692010-12-08 17:05:39 +0900838 }
satokaee09dc2010-12-09 19:21:51 +0900839 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900840 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900841 return true;
842 } else {
843 return false;
844 }
845 } else {
846 // If this node is not user typed character, this method treats this word as unmatched.
847 // Thus newTerminal shouldn't be true.
848 *newTerminal = false;
849 return false;
satok662fe692010-12-08 17:05:39 +0900850 }
satok662fe692010-12-08 17:05:39 +0900851}
Jean Chalard8124e642011-06-16 22:33:41 +0900852
853// TODO: use uint32_t instead of unsigned short
854bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
855 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900856 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900857 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900858 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900859 }
860}
861
Jean Chalard17e44a72011-06-16 22:51:11 +0900862
863// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900864int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
865 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900866 // returns address of bigram data of that word
867 // return -99 if not found
868
869 int count = Dictionary::getCount(DICT_ROOT, &pos);
870 unsigned short currentChar = (unsigned short) word[offset];
871 for (int j = 0; j < count; j++) {
872 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
873 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
874 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
875 if (c == currentChar) {
876 if (offset == length - 1) {
877 if (terminal) {
878 return (pos+1);
879 }
880 } else {
881 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900882 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900883 if (t > 0) {
884 return t;
885 }
886 }
887 }
888 }
889 if (terminal) {
890 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
891 }
892 // There could be two instances of each alphabet - upper and lower case. So continue
893 // looking ...
894 }
895 return NOT_VALID_WORD;
896}
897
Jean Chalardbc90c722011-06-20 21:09:04 +0900898// The following functions will be modified.
Jean Chalard0584f022011-06-30 19:23:16 +0900899inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
900 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
901 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalardbc90c722011-06-20 21:09:04 +0900902 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
903 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
904 int *nextSiblingPosition, int *nextOutputIndex) {
905 if (DEBUG_DICT) {
906 int inputCount = 0;
907 if (skipPos >= 0) ++inputCount;
908 if (excessivePos >= 0) ++inputCount;
909 if (transposedPos >= 0) ++inputCount;
910 assert(inputCount <= 1);
911 }
912 unsigned short c;
913 int childPosition;
914 bool terminal;
915 int freq;
916 bool isSameAsUserTypedLength = false;
917
Jean Chalard0584f022011-06-30 19:23:16 +0900918 const int pos = initialPos;
919 const int depth = initialDepth;
920 const int traverseAllNodes = initialTraverseAllNodes;
921 const int diffs = initialDiffs;
922
Jean Chalardbc90c722011-06-20 21:09:04 +0900923 const uint8_t flags = 0; // No flags for now
924
925 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
926
927 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
928 &c, &childPosition, &terminal, &freq);
929 *nextOutputIndex = depth + 1;
930
931 const bool needsToTraverseChildrenNodes = childPosition != 0;
932
933 // If we are only doing traverseAllNodes, no need to look at the typed characters.
934 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
935 mWord[depth] = c;
936 if (traverseAllNodes && terminal) {
937 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
938 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
939 }
940 if (!needsToTraverseChildrenNodes) return false;
941 *newTraverseAllNodes = traverseAllNodes;
942 *newMatchRate = matchWeight;
943 *newDiffs = diffs;
944 *newInputIndex = inputIndex;
945 } else {
946 const int *currentChars = getInputCharsAt(inputIndex);
947
948 if (transposedPos >= 0) {
949 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
950 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
951 }
952
953 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
954 transposedPos);
955 if (UNRELATED_CHAR == matchedProximityCharId) return false;
956 mWord[depth] = c;
957 // If inputIndex is greater than mInputLength, that means there is no
958 // proximity chars. So, we don't need to check proximity.
959 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
960 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
961 }
962 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
963 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
964 if (isSameAsUserTypedLength && terminal) {
965 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
966 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
967 }
968 if (!needsToTraverseChildrenNodes) return false;
969 // Start traversing all nodes after the index exceeds the user typed length
970 *newTraverseAllNodes = isSameAsUserTypedLength;
971 *newMatchRate = matchWeight;
972 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
973 *newInputIndex = inputIndex + 1;
974 }
975 // Optimization: Prune out words that are too long compared to how much was typed.
976 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
977 return false;
978 }
979
980 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
981 // TODO: Check if this can be isSameAsUserTypedLength only.
982 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
983 *newTraverseAllNodes = true;
984 }
985 // get the count of nodes and increment childAddress.
986 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
987 *newChildPosition = childPosition;
988 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
989 return needsToTraverseChildrenNodes;
990}
991
992#else // NEW_DICTIONARY_FORMAT
Jean Chalard85a1d1e2011-06-21 22:23:21 +0900993
Jean Chalard0584f022011-06-30 19:23:16 +0900994inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
995 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
996 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard85a1d1e2011-06-21 22:23:21 +0900997 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
998 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
Jean Chalard432789a2011-06-30 17:50:48 +0900999 int *nextSiblingPosition, int *newOutputIndex) {
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001000 if (DEBUG_DICT) {
1001 int inputCount = 0;
1002 if (skipPos >= 0) ++inputCount;
1003 if (excessivePos >= 0) ++inputCount;
1004 if (transposedPos >= 0) ++inputCount;
1005 assert(inputCount <= 1);
1006 }
1007 unsigned short c;
1008 int childPosition;
1009 bool terminal;
1010 int freq;
1011 bool isSameAsUserTypedLength = false;
1012
Jean Chalard0584f022011-06-30 19:23:16 +09001013 int pos = initialPos;
1014 int depth = initialDepth;
1015 int traverseAllNodes = initialTraverseAllNodes;
1016 int diffs = initialDiffs;
1017
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001018 const uint8_t flags = 0; // No flags for now
1019
1020 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
1021
1022 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
1023 &c, &childPosition, &terminal, &freq);
Jean Chalard432789a2011-06-30 17:50:48 +09001024 *newOutputIndex = depth + 1;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001025
1026 const bool needsToTraverseChildrenNodes = childPosition != 0;
1027
1028 // If we are only doing traverseAllNodes, no need to look at the typed characters.
1029 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
1030 mWord[depth] = c;
1031 if (traverseAllNodes && terminal) {
1032 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1033 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
1034 }
1035 if (!needsToTraverseChildrenNodes) return false;
1036 *newTraverseAllNodes = traverseAllNodes;
1037 *newMatchRate = matchWeight;
1038 *newDiffs = diffs;
1039 *newInputIndex = inputIndex;
1040 } else {
1041 const int *currentChars = getInputCharsAt(inputIndex);
1042
1043 if (transposedPos >= 0) {
1044 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
1045 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
1046 }
1047
1048 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
1049 transposedPos);
1050 if (UNRELATED_CHAR == matchedProximityCharId) return false;
1051 mWord[depth] = c;
1052 // If inputIndex is greater than mInputLength, that means there is no
1053 // proximity chars. So, we don't need to check proximity.
1054 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
1055 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
1056 }
1057 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
1058 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
1059 if (isSameAsUserTypedLength && terminal) {
1060 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1061 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
1062 }
1063 if (!needsToTraverseChildrenNodes) return false;
1064 // Start traversing all nodes after the index exceeds the user typed length
1065 *newTraverseAllNodes = isSameAsUserTypedLength;
1066 *newMatchRate = matchWeight;
1067 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
1068 *newInputIndex = inputIndex + 1;
1069 }
1070 // Optimization: Prune out words that are too long compared to how much was typed.
1071 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
1072 return false;
1073 }
1074
1075 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
1076 // TODO: Check if this can be isSameAsUserTypedLength only.
1077 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
1078 *newTraverseAllNodes = true;
1079 }
1080 // get the count of nodes and increment childAddress.
1081 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
1082 *newChildPosition = childPosition;
1083 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
1084 return needsToTraverseChildrenNodes;
1085}
1086
Jean Chalardbc90c722011-06-20 21:09:04 +09001087#endif // NEW_DICTIONARY_FORMAT
1088
satok30088252010-12-01 21:22:15 +09001089} // namespace latinime