blob: b1dd1e2a2520949c4a74d3179a1c816ab3e41f60 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "basechars.h"
24#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090025#include "dictionary.h"
26#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090027
Jean Chalard1059f272011-06-28 20:45:05 +090028#ifdef NEW_DICTIONARY_FORMAT
29#include "binary_format.h"
30#endif // NEW_DICTIONARY_FORMAT
31
satok30088252010-12-01 21:22:15 +090032namespace latinime {
33
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090034const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
35 { { 'a', 'e' },
36 { 'o', 'e' },
37 { 'u', 'e' } };
38
Jean Chalard293ece02011-06-16 20:55:16 +090039// TODO: check the header
40UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090041 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090042 const bool isLatestDictVersion)
Jean Chalard1059f272011-06-28 20:45:05 +090043#ifndef NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +090044 : DICT_ROOT(streamStart),
Jean Chalard1059f272011-06-28 20:45:05 +090045#else // NEW_DICTIONARY_FORMAT
46 : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
47#endif // NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +090048 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090049 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
50 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalard1059f272011-06-28 20:45:05 +090051#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090052 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalard1059f272011-06-28 20:45:05 +090053#else // NEW_DICTIONARY_FORMAT
54 // TODO : remove this variable.
55 ROOT_POS(0),
56#endif // NEW_DICTIONARY_FORMAT
Jean Chalarda787dba2011-03-04 12:17:48 +090057 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
58 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090059 if (DEBUG_DICT) {
60 LOGI("UnigramDictionary - constructor");
61 }
satok30088252010-12-01 21:22:15 +090062}
63
satok18c28f42010-12-02 18:11:54 +090064UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090065
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090066static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
67 const int MAX_PROXIMITY_CHARS) {
68 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
69}
70
71bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
72
73 // There can't be a digraph if we don't have at least 2 characters to examine
74 if (i + 2 > codesSize) return false;
75
76 // Search for the first char of some digraph
77 int lastDigraphIndex = -1;
78 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
79 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
80 lastDigraphIndex >= 0; --lastDigraphIndex) {
81 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
82 }
83 // No match: return early
84 if (lastDigraphIndex < 0) return false;
85
86 // It's an interesting digraph if the second char matches too.
87 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
88}
89
90// Mostly the same arguments as the non-recursive version, except:
91// codes is the original value. It points to the start of the work buffer, and gets passed as is.
92// codesSize is the size of the user input (thus, it is the size of codesSrc).
93// codesDest is the current point in the work buffer.
94// codesSrc is the current point in the user-input, original, content-unmodified buffer.
95// codesRemain is the remaining size in codesSrc.
96void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
97 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
98 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080099 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900100
Jean Chalarda787dba2011-03-04 12:17:48 +0900101 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
102 for (int i = 0; i < codesRemain; ++i) {
103 if (isDigraph(codesSrc, i, codesRemain)) {
104 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900105
Jean Chalarda787dba2011-03-04 12:17:48 +0900106 // Copy the word up to the first char of the digraph, then continue processing
107 // on the remaining part of the word, skipping the second char of the digraph.
108 // In our example, copy "pru" and continue running on "fen"
109 // Make i the index of the second char of the digraph for simplicity. Forgetting
110 // to do that results in an infinite recursion so take care!
111 ++i;
112 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
113 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
114 codesBuffer, codesBufferSize, flags,
115 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
116 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
117 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900118
Jean Chalarda787dba2011-03-04 12:17:48 +0900119 // Copy the second char of the digraph in place, then continue processing on
120 // the remaining part of the word.
121 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
122 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
123 BYTES_IN_ONE_CHAR);
124 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
125 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
126 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
127 outWords, frequencies);
128 return;
129 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900130 }
131 }
132
133 // If we come here, we hit the end of the word: let's check it against the dictionary.
134 // In our example, we'll come here once for "prufen" and then once for "pruefen".
135 // If the word contains several digraphs, we'll come it for the product of them.
136 // eg. if the word is "ueberpruefen" we'll test, in order, against
137 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
138 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
139 if (0 != remainingBytes)
140 memcpy(codesDest, codesSrc, remainingBytes);
141
142 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
143 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
144}
145
146int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
147 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
148 unsigned short *outWords, int *frequencies) {
149
150 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
151 { // Incrementally tune the word and try all possibilities
152 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
153 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900154 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900155 } else { // Normal processing
156 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
157 outWords, frequencies);
158 }
159
satok817e5172011-03-04 06:06:45 -0800160 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900161 // Get the word count
162 int suggestedWordsCount = 0;
163 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
164 suggestedWordsCount++;
165 }
166
167 if (DEBUG_DICT) {
168 LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard980d6b62011-06-30 17:02:23 +0900169 /// Print the returned words
170 for (int j = 0; j < suggestedWordsCount; ++j) {
171 short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
172 char s[MAX_WORD_LENGTH];
173 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
174 LOGI("%s %i", s, mFrequencies[j]);
175 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900176 LOGI("Next letters: ");
177 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
178 if (mNextLettersFrequency[k] > 0) {
179 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
180 }
181 }
182 }
satok817e5172011-03-04 06:06:45 -0800183 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900184 PROF_CLOSE;
185 return suggestedWordsCount;
186}
187
188void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
189 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
190 unsigned short *outWords, int *frequencies) {
191
satok61e2f852011-01-05 14:13:07 +0900192 PROF_OPEN;
193 PROF_START(0);
satok30088252010-12-01 21:22:15 +0900194 initSuggestions(codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900195 if (DEBUG_DICT) assert(codesSize == mInputLength);
196
satoka3d78f62010-12-09 22:08:33 +0900197 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900198 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900199
satok61e2f852011-01-05 14:13:07 +0900200 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900201 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900202 PROF_END(1);
203
204 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900205 // Suggestion with missing character
206 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900207 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900208 if (DEBUG_DICT) {
209 LOGI("--- Suggest missing characters %d", i);
210 }
satok54fe9e02010-12-13 14:42:35 +0900211 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900212 }
213 }
satok61e2f852011-01-05 14:13:07 +0900214 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900215
satok61e2f852011-01-05 14:13:07 +0900216 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900217 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900218 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
219 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900220 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900221 if (DEBUG_DICT) {
222 LOGI("--- Suggest excessive characters %d", i);
223 }
satok54fe9e02010-12-13 14:42:35 +0900224 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900225 }
226 }
satok61e2f852011-01-05 14:13:07 +0900227 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900228
satok61e2f852011-01-05 14:13:07 +0900229 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900230 // Suggestion with transposed characters
231 // Only suggest words that length is mInputLength
232 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
233 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900234 if (DEBUG_DICT) {
235 LOGI("--- Suggest transposed characters %d", i);
236 }
satok54fe9e02010-12-13 14:42:35 +0900237 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900238 }
239 }
satok61e2f852011-01-05 14:13:07 +0900240 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900241
satok61e2f852011-01-05 14:13:07 +0900242 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900243 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900244 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
245 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900246 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900247 if (DEBUG_DICT) {
248 LOGI("--- Suggest missing space characters %d", i);
249 }
satok662fe692010-12-08 17:05:39 +0900250 getMissingSpaceWords(mInputLength, i);
251 }
252 }
satok61e2f852011-01-05 14:13:07 +0900253 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800254
255 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900256 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800257 // The first and last "mistyped spaces" are taken care of by excessive character handling
258 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900259 if (DEBUG_DICT) {
260 LOGI("--- Suggest words with proximity space %d", i);
261 }
satok817e5172011-03-04 06:06:45 -0800262 const int x = xcoordinates[i];
263 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900264 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800265 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
266 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900267 }
satok817e5172011-03-04 06:06:45 -0800268 if (proximityInfo->hasSpaceProximity(x, y)) {
269 getMistypedSpaceWords(mInputLength, i);
270 }
satok817e5172011-03-04 06:06:45 -0800271 }
272 }
273 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900274}
275
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900276void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
277 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900278 if (DEBUG_DICT) {
279 LOGI("initSuggest");
280 }
satok30088252010-12-01 21:22:15 +0900281 mFrequencies = frequencies;
282 mOutputChars = outWords;
283 mInputCodes = codes;
284 mInputLength = codesSize;
285 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
286}
287
Jean Chalard8124e642011-06-16 22:33:41 +0900288static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900289 if (c < nextLettersSize) {
290 nextLetters[c]++;
291 }
292}
293
satok662fe692010-12-08 17:05:39 +0900294// TODO: We need to optimize addWord by using STL or something
Jean Chalardca5ef282011-06-17 15:36:26 +0900295// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok28bd03b2010-12-03 16:39:16 +0900296bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900297 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900298 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
satok30088252010-12-01 21:22:15 +0900299 char s[length + 1];
300 for (int i = 0; i <= length; i++) s[i] = word[i];
satok662fe692010-12-08 17:05:39 +0900301 LOGI("Found word = %s, freq = %d", s, frequency);
satok30088252010-12-01 21:22:15 +0900302 }
satokf5cded12010-12-06 21:28:24 +0900303 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900304 if (DEBUG_DICT) {
305 LOGI("Exceeded max word length.");
306 }
satokf5cded12010-12-06 21:28:24 +0900307 return false;
308 }
satok30088252010-12-01 21:22:15 +0900309
310 // Find the right insertion point
311 int insertAt = 0;
312 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900313 // TODO: How should we sort words with the same frequency?
314 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900315 break;
316 }
317 insertAt++;
318 }
319 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900320 if (DEBUG_DICT) {
321 char s[length + 1];
322 for (int i = 0; i <= length; i++) s[i] = word[i];
satokb2e5e592011-04-26 14:50:54 +0900323 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satokcdbbea72010-12-08 16:04:16 +0900324 }
satok30088252010-12-01 21:22:15 +0900325 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
326 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
327 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
328 mFrequencies[insertAt] = frequency;
329 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900330 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900331 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900332 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900333 while (length--) {
334 *dest++ = *word++;
335 }
336 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900337 if (DEBUG_DICT) {
338 LOGI("Added word at %d", insertAt);
339 }
satok30088252010-12-01 21:22:15 +0900340 return true;
341 }
342 return false;
343}
344
Jean Chalard8124e642011-06-16 22:33:41 +0900345static inline unsigned short toBaseLowerCase(unsigned short c) {
satok30088252010-12-01 21:22:15 +0900346 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
347 c = BASE_CHARS[c];
348 }
349 if (c >='A' && c <= 'Z') {
350 c |= 32;
351 } else if (c > 127) {
352 c = latin_tolower(c);
353 }
354 return c;
355}
356
Jean Chalardca5ef282011-06-17 15:36:26 +0900357bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {
satok30088252010-12-01 21:22:15 +0900358 if (length != mInputLength) {
359 return false;
360 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900361 const int *inputCodes = mInputCodes;
satok30088252010-12-01 21:22:15 +0900362 while (length--) {
363 if ((unsigned int) *inputCodes != (unsigned int) *word) {
364 return false;
365 }
satok662fe692010-12-08 17:05:39 +0900366 inputCodes += MAX_PROXIMITY_CHARS;
satok30088252010-12-01 21:22:15 +0900367 word++;
368 }
369 return true;
370}
371
satok715514d2010-12-02 20:19:59 +0900372static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900373static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900374
satok54fe9e02010-12-13 14:42:35 +0900375void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900376 const int excessivePos, const int transposedPos, int *nextLetters,
377 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900378 if (DEBUG_DICT) {
379 LOGI("getSuggestionCandidates %d", maxDepth);
380 assert(transposedPos + 1 < mInputLength);
381 assert(excessivePos < mInputLength);
382 assert(missingPos < mInputLength);
383 }
satok662fe692010-12-08 17:05:39 +0900384 int rootPosition = ROOT_POS;
Jean Chalard980d6b62011-06-30 17:02:23 +0900385 // Get the number of children of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900386 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900387 int depth = 0;
388
389 mStackChildCount[0] = childCount;
390 mStackTraverseAll[0] = (mInputLength <= 0);
391 mStackNodeFreq[0] = 1;
392 mStackInputIndex[0] = 0;
393 mStackDiffs[0] = 0;
394 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900395 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900396
satok662fe692010-12-08 17:05:39 +0900397 // Depth first search
satokd2997922010-12-07 13:08:39 +0900398 while (depth >= 0) {
399 if (mStackChildCount[depth] > 0) {
400 --mStackChildCount[depth];
401 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900402 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900403 int inputIndex = mStackInputIndex[depth];
404 int diffs = mStackDiffs[depth];
405 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900406 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900407 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900408 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900409 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900410 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900411 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
412 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
413 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900414 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900415 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900416 mStackSiblingPos[depth] = siblingPos;
417 if (needsToTraverseChildrenNodes) {
418 // Goes to child node
419 ++depth;
420 mStackChildCount[depth] = childCount;
421 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900422 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900423 mStackInputIndex[depth] = inputIndex;
424 mStackDiffs[depth] = diffs;
425 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900426 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900427 }
428 } else {
satokcdbbea72010-12-08 16:04:16 +0900429 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900430 --depth;
431 }
432 }
433}
434
satokb2e5e592011-04-26 14:50:54 +0900435static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
436static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
437 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
438}
439
440static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
441inline static void multiplyIntCapped(const int multiplier, int *base) {
442 const int temp = *base;
443 if (temp != S_INT_MAX) {
444 // Branch if multiplier == 2 for the optimization
445 if (multiplier == 2) {
446 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
447 } else {
448 const int tempRetval = temp * multiplier;
449 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
450 }
451 }
452}
453
454inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900455 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900456 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900457 } else {
satokb2e5e592011-04-26 14:50:54 +0900458 int ret = base;
459 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
460 return ret;
461 }
462}
463
464inline static void multiplyRate(const int rate, int *freq) {
465 if (*freq != S_INT_MAX) {
466 if (*freq > 1000000) {
467 *freq /= 100;
468 multiplyIntCapped(rate, freq);
469 } else {
470 multiplyIntCapped(rate, freq);
471 *freq /= 100;
472 }
satokf7425bb2011-01-05 16:37:53 +0900473 }
474}
475
satok4c981d32011-04-19 13:58:42 +0900476inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900477 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
478 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900479 if (firstWordLength == 0 || secondWordLength == 0) {
480 return 0;
481 }
482 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
483 int tempFirstFreq = firstFreq;
484 multiplyRate(firstDemotionRate, &tempFirstFreq);
485
486 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
487 int tempSecondFreq = secondFreq;
488 multiplyRate(secondDemotionRate, &tempSecondFreq);
489
490 const int totalLength = firstWordLength + secondWordLength;
491
492 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
493 // length.
494 int totalFreq = tempFirstFreq + tempSecondFreq;
495
496 // This is a workaround to try offsetting the not-enough-demotion which will be done in
497 // calcNormalizedScore in Utils.java.
498 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
499 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
500 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
501 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
502 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
503
504 // At this moment, totalFreq is calculated by the following formula:
505 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
506 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
507
satokb2e5e592011-04-26 14:50:54 +0900508 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900509
510 // This is another workaround to offset the demotion which will be done in
511 // calcNormalizedScore in Utils.java.
512 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
513 // the same amount because we already have adjusted the synthetic freq of this "missing or
514 // mistyped space" suggestion candidate above in this method.
515 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
516 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
517
satokd8db9f82011-05-18 15:31:04 +0900518 if (isSpaceProximity) {
519 // A word pair with one space proximity correction
520 if (DEBUG_DICT) {
521 LOGI("Found a word pair with space proximity correction.");
522 }
523 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
524 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
525 }
526
satok4c981d32011-04-19 13:58:42 +0900527 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
528 return totalFreq;
529}
530
satok817e5172011-03-04 06:06:45 -0800531bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
532 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900533 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800534}
535
536bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
537 return getSplitTwoWordsSuggestion(
538 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900539 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800540}
541
satok58c49b92011-01-27 03:23:39 +0900542inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900543 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900544 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900545 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900546 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900547 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900548 if (mInputLength >= 2) {
549 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
550 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
551 / (10 * mInputLength
552 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900553 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900554 LOGI("Demotion rate for missing character is %d.", demotionRate);
555 }
satokdc5301e2011-04-11 16:14:45 +0900556 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900557 } else {
558 finalFreq = 0;
559 }
560 }
satokf7425bb2011-01-05 16:37:53 +0900561 if (transposedPos >= 0) multiplyRate(
562 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900563 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900564 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900565 if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
satokf7425bb2011-01-05 16:37:53 +0900566 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900567 }
568 }
satok58c49b92011-01-27 03:23:39 +0900569 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900570 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900571 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900572 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900573 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900574 if (DEBUG_DICT) {
575 LOGI("Found full matched word.");
576 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900577 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
578 }
579 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900580 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900581 }
satok9674f652011-04-20 17:15:27 +0900582 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900583 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900584 if (DEBUG_DICT) {
585 LOGI("Found one proximity correction.");
586 }
satokb2e5e592011-04-26 14:50:54 +0900587 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900588 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900589 }
satok9674f652011-04-20 17:15:27 +0900590 if (DEBUG_DICT) {
591 LOGI("calc: %d, %d", depth, sameLength);
592 }
satokb2e5e592011-04-26 14:50:54 +0900593 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900594 return finalFreq;
595}
satoka3d78f62010-12-09 22:08:33 +0900596
satok28bd03b2010-12-03 16:39:16 +0900597inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900598 const int inputIndex, const int skipPos, const int depth) {
satok8fbd5522011-02-22 17:28:55 +0900599 const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
satok28bd03b2010-12-03 16:39:16 +0900600 // Skip the ' or other letter and continue deeper
601 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
602}
603
satoke07baa62010-12-09 21:55:40 +0900604inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
Jean Chalard07a84062011-03-03 10:22:10 +0900605 const int inputLength) const {
satoke07baa62010-12-09 21:55:40 +0900606 if (inputIndex < 0 || inputIndex >= inputLength) return false;
607 const int currentChar = *getInputCharsAt(inputIndex);
608 const int leftIndex = inputIndex - 1;
609 if (leftIndex >= 0) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900610 const int *leftChars = getInputCharsAt(leftIndex);
satoke07baa62010-12-09 21:55:40 +0900611 int i = 0;
612 while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
613 if (leftChars[i++] == currentChar) return true;
614 }
615 }
616 const int rightIndex = inputIndex + 1;
617 if (rightIndex < inputLength) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900618 const int *rightChars = getInputCharsAt(rightIndex);
satoke07baa62010-12-09 21:55:40 +0900619 int i = 0;
620 while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
621 if (rightChars[i++] == currentChar) return true;
622 }
623 }
624 return false;
625}
626
Jean Chalarda5d58492011-02-18 17:50:58 +0900627// In the following function, c is the current character of the dictionary word
628// currently examined.
629// currentChars is an array containing the keys close to the character the
630// user actually typed at the same position. We want to see if c is in it: if so,
631// then the word contains at that position a character close to what the user
632// typed.
633// What the user typed is actually the first character of the array.
634// Notice : accented characters do not have a proximity list, so they are alone
635// in their list. The non-accented version of the character should be considered
636// "close", but not the other keys close to the non-accented version.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900637inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
638 const int *currentChars, const unsigned short c, const int skipPos,
639 const int excessivePos, const int transposedPos) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900640 const unsigned short baseLowerC = toBaseLowerCase(c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900641
642 // The first char in the array is what user typed. If it matches right away,
643 // that means the user typed that same char for this pos.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900644 if (currentChars[0] == baseLowerC || currentChars[0] == c)
Jean Chalarda5d58492011-02-18 17:50:58 +0900645 return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
646
647 // If one of those is true, we should not check for close characters at all.
648 if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
649 return UNRELATED_CHAR;
650
651 // If the non-accented, lowercased version of that first character matches c,
652 // then we have a non-accented version of the accented character the user
653 // typed. Treat it as a close char.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900654 if (toBaseLowerCase(currentChars[0]) == baseLowerC)
Jean Chalarda5d58492011-02-18 17:50:58 +0900655 return NEAR_PROXIMITY_CHAR;
656
657 // Not an exact nor an accent-alike match: search the list of close keys
658 int j = 1;
satoke07baa62010-12-09 21:55:40 +0900659 while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900660 const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900661 if (matched) return NEAR_PROXIMITY_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900662 ++j;
663 }
Jean Chalarda5d58492011-02-18 17:50:58 +0900664
665 // Was not included, signal this as an unrelated character.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900666 return UNRELATED_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900667}
668
Jean Chalardca5ef282011-06-17 15:36:26 +0900669inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900670 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900671 const int inputIndex, const int matchWeight, const int skipPos,
672 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
673 int* nextLetters, const int nextLettersSize) {
674
675 const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;
Jean Chalard980d6b62011-06-30 17:02:23 +0900676 if (isSameAsTyped) return;
Jean Chalardca5ef282011-06-17 15:36:26 +0900677
678 if (depth >= MIN_SUGGEST_DEPTH) {
679 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
680 excessivePos, transposedPos, freq, sameLength);
681 if (!isSameAsTyped)
682 addWord(word, depth + 1, finalFreq);
Jean Chalardca5ef282011-06-17 15:36:26 +0900683 }
684
685 if (sameLength && depth >= mInputLength && skipPos < 0) {
686 registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);
687 }
688}
689
Jean Chalarde6715e32011-06-30 19:47:25 +0900690bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
691 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
692 const int secondWordLength, const bool isSpaceProximity) {
693 if (inputLength >= MAX_WORD_LENGTH) return false;
694 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
695 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
696 return false;
697 const int newWordLength = firstWordLength + secondWordLength + 1;
698 // Allocating variable length array on stack
699 unsigned short word[newWordLength];
700 const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
701 if (DEBUG_DICT) {
702 LOGI("First freq: %d", firstFreq);
703 }
704 if (firstFreq <= 0) return false;
705
706 for (int i = 0; i < firstWordLength; ++i) {
707 word[i] = mWord[i];
708 }
709
710 const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
711 if (DEBUG_DICT) {
712 LOGI("Second freq: %d", secondFreq);
713 }
714 if (secondFreq <= 0) return false;
715
716 word[firstWordLength] = SPACE;
717 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
718 word[i] = mWord[i - firstWordLength - 1];
719 }
720
721 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
722 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
723 if (DEBUG_DICT) {
724 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
725 TYPED_LETTER_MULTIPLIER);
726 }
727 addWord(word, newWordLength, pairFreq);
728 return true;
729}
730
Jean Chalardbc90c722011-06-20 21:09:04 +0900731#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardbc90c722011-06-20 21:09:04 +0900732// The following functions will be entirely replaced with new implementations.
733void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
734 const int excessivePos, const int transposedPos,int *nextLetters,
735 const int nextLettersSize) {
736 int initialPosition = initialPos;
737 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
738 getWordsRec(count, initialPosition, 0,
739 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
740 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
741 nextLettersSize);
742}
Jean Chalardca5ef282011-06-17 15:36:26 +0900743
Jean Chalardbc90c722011-06-20 21:09:04 +0900744void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
745 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
746 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
747 const int transposedPos, int *nextLetters, const int nextLettersSize) {
748 int siblingPos = pos;
749 for (int i = 0; i < childrenCount; ++i) {
750 int newCount;
751 int newChildPosition;
752 bool newTraverseAllNodes;
753 int newMatchRate;
754 int newInputIndex;
755 int newDiffs;
756 int newSiblingPos;
757 int newOutputIndex;
758 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
759 traverseAllNodes, matchWeight, inputIndex, diffs,
760 skipPos, excessivePos, transposedPos,
761 nextLetters, nextLettersSize,
762 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
763 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
764 siblingPos = newSiblingPos;
satokcdbbea72010-12-08 16:04:16 +0900765
Jean Chalardbc90c722011-06-20 21:09:04 +0900766 if (needsToTraverseChildrenNodes) {
767 getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
768 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
769 nextLetters, nextLettersSize);
satok48e432c2010-12-06 17:38:58 +0900770 }
satok48e432c2010-12-06 17:38:58 +0900771 }
satok48e432c2010-12-06 17:38:58 +0900772}
773
Jean Chalard980d6b62011-06-30 17:02:23 +0900774inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
775 const int inputLength, unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900776 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900777 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900778 int maxFreq = 0;
779 int depth = 0;
780 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900781 bool terminal = false;
782
satokaee09dc2010-12-09 19:21:51 +0900783 mStackChildCount[0] = count;
784 mStackSiblingPos[0] = pos;
785
786 while (depth >= 0) {
787 if (mStackChildCount[depth] > 0) {
788 --mStackChildCount[depth];
789 int firstChildPos;
790 int newFreq;
791 int siblingPos = mStackSiblingPos[depth];
792 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
793 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
794 &siblingPos);
795 mStackSiblingPos[depth] = siblingPos;
796 if (depth == (inputLength - 1)) {
797 // Traverse sibling node
798 if (terminal) {
799 if (newFreq > maxFreq) {
800 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
801 if (DEBUG_DICT && DEBUG_NODE) {
802 char s[inputLength + 1];
803 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
804 s[inputLength] = 0;
805 LOGI("New missing space word found: %d > %d (%s), %d, %d",
806 newFreq, maxFreq, s, inputLength, depth);
807 }
808 maxFreq = newFreq;
809 }
810 }
811 } else if (needsToTraverseChildrenNodes) {
812 // Traverse children nodes
813 ++depth;
814 mStackChildCount[depth] = count;
815 mStackSiblingPos[depth] = firstChildPos;
816 }
817 } else {
818 // Traverse parent node
819 --depth;
satok662fe692010-12-08 17:05:39 +0900820 }
821 }
satokaee09dc2010-12-09 19:21:51 +0900822
823 word[inputLength] = 0;
824 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900825}
826
827inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900828 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
829 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
830 const int inputIndex = startInputIndex + depth;
satok8fbd5522011-02-22 17:28:55 +0900831 const int *currentChars = getInputCharsAt(inputIndex);
satok662fe692010-12-08 17:05:39 +0900832 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900833 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
834 &c, newChildPosition, newTerminal, newFreq);
satokaee09dc2010-12-09 19:21:51 +0900835 const unsigned int inputC = currentChars[0];
Ken Wakasade3070a2011-03-19 09:16:42 +0900836 if (DEBUG_DICT) {
837 assert(inputC <= U_SHORT_MAX);
838 }
Jean Chalardf5f834a2011-02-22 15:12:46 +0900839 const unsigned short baseLowerC = toBaseLowerCase(c);
840 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900841 const bool hasChild = *newChildPosition != 0;
842 if (matched) {
843 word[depth] = c;
844 if (DEBUG_DICT && DEBUG_NODE) {
845 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900846 if (*newTerminal) {
847 LOGI("Terminal %d", *newFreq);
848 }
satok662fe692010-12-08 17:05:39 +0900849 }
satokaee09dc2010-12-09 19:21:51 +0900850 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900851 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900852 return true;
853 } else {
854 return false;
855 }
856 } else {
857 // If this node is not user typed character, this method treats this word as unmatched.
858 // Thus newTerminal shouldn't be true.
859 *newTerminal = false;
860 return false;
satok662fe692010-12-08 17:05:39 +0900861 }
satok662fe692010-12-08 17:05:39 +0900862}
Jean Chalard8124e642011-06-16 22:33:41 +0900863
864// TODO: use uint32_t instead of unsigned short
865bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
866 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900867 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900868 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900869 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900870 }
871}
872
Jean Chalard17e44a72011-06-16 22:51:11 +0900873
874// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900875int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
876 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900877 // returns address of bigram data of that word
878 // return -99 if not found
879
880 int count = Dictionary::getCount(DICT_ROOT, &pos);
881 unsigned short currentChar = (unsigned short) word[offset];
882 for (int j = 0; j < count; j++) {
883 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
884 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
885 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
886 if (c == currentChar) {
887 if (offset == length - 1) {
888 if (terminal) {
889 return (pos+1);
890 }
891 } else {
892 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900893 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900894 if (t > 0) {
895 return t;
896 }
897 }
898 }
899 }
900 if (terminal) {
901 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
902 }
903 // There could be two instances of each alphabet - upper and lower case. So continue
904 // looking ...
905 }
906 return NOT_VALID_WORD;
907}
908
Jean Chalardbc90c722011-06-20 21:09:04 +0900909// The following functions will be modified.
Jean Chalard0584f022011-06-30 19:23:16 +0900910inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
911 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
912 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalardbc90c722011-06-20 21:09:04 +0900913 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
914 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
915 int *nextSiblingPosition, int *nextOutputIndex) {
916 if (DEBUG_DICT) {
917 int inputCount = 0;
918 if (skipPos >= 0) ++inputCount;
919 if (excessivePos >= 0) ++inputCount;
920 if (transposedPos >= 0) ++inputCount;
921 assert(inputCount <= 1);
922 }
923 unsigned short c;
924 int childPosition;
925 bool terminal;
926 int freq;
927 bool isSameAsUserTypedLength = false;
928
Jean Chalard0584f022011-06-30 19:23:16 +0900929 const int pos = initialPos;
930 const int depth = initialDepth;
931 const int traverseAllNodes = initialTraverseAllNodes;
932 const int diffs = initialDiffs;
933
Jean Chalardbc90c722011-06-20 21:09:04 +0900934 const uint8_t flags = 0; // No flags for now
935
936 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
937
938 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
939 &c, &childPosition, &terminal, &freq);
940 *nextOutputIndex = depth + 1;
941
942 const bool needsToTraverseChildrenNodes = childPosition != 0;
943
944 // If we are only doing traverseAllNodes, no need to look at the typed characters.
945 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
946 mWord[depth] = c;
947 if (traverseAllNodes && terminal) {
948 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
949 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
950 }
951 if (!needsToTraverseChildrenNodes) return false;
952 *newTraverseAllNodes = traverseAllNodes;
953 *newMatchRate = matchWeight;
954 *newDiffs = diffs;
955 *newInputIndex = inputIndex;
956 } else {
957 const int *currentChars = getInputCharsAt(inputIndex);
958
959 if (transposedPos >= 0) {
960 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
961 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
962 }
963
964 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
965 transposedPos);
966 if (UNRELATED_CHAR == matchedProximityCharId) return false;
967 mWord[depth] = c;
968 // If inputIndex is greater than mInputLength, that means there is no
969 // proximity chars. So, we don't need to check proximity.
970 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
971 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
972 }
973 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
974 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
975 if (isSameAsUserTypedLength && terminal) {
976 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
977 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
978 }
979 if (!needsToTraverseChildrenNodes) return false;
980 // Start traversing all nodes after the index exceeds the user typed length
981 *newTraverseAllNodes = isSameAsUserTypedLength;
982 *newMatchRate = matchWeight;
983 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
984 *newInputIndex = inputIndex + 1;
985 }
986 // Optimization: Prune out words that are too long compared to how much was typed.
987 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
988 return false;
989 }
990
991 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
992 // TODO: Check if this can be isSameAsUserTypedLength only.
993 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
994 *newTraverseAllNodes = true;
995 }
996 // get the count of nodes and increment childAddress.
997 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
998 *newChildPosition = childPosition;
999 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
1000 return needsToTraverseChildrenNodes;
1001}
1002
1003#else // NEW_DICTIONARY_FORMAT
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001004
Jean Chalard1059f272011-06-28 20:45:05 +09001005// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
1006// interface.
1007inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
1008 const int inputLength, unsigned short *word) {
1009 uint16_t inWord[inputLength];
1010
1011 for (int i = 0; i < inputLength; ++i) {
1012 inWord[i] = *getInputCharsAt(startInputIndex + i);
1013 }
1014 return getMostFrequentWordLikeInner(inWord, inputLength, word);
1015}
1016
1017// This function will take the position of a character array within a CharGroup,
1018// and check it actually like-matches the word in inWord starting at startInputIndex,
1019// that is, it matches it with case and accents squashed.
1020// The function returns true if there was a full match, false otherwise.
1021// The function will copy on-the-fly the characters in the CharGroup to outNewWord.
1022// It will also place the end position of the array in outPos; in outInputIndex,
1023// it will place the index of the first char AFTER the match if there was a match,
1024// and the initial position if there was not. It makes sense because if there was
1025// a match we want to continue searching, but if there was not, we want to go to
1026// the next CharGroup.
1027// In and out parameters may point to the same location. This function takes care
1028// not to use any input parameters after it wrote into its outputs.
1029static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
1030 const uint8_t* const root, const int startPos,
1031 const uint16_t* const inWord, const int startInputIndex,
1032 int32_t* outNewWord, int* outInputIndex, int* outPos) {
1033 const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
1034 int pos = startPos;
1035 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1036 int32_t baseChar = toBaseLowerCase(character);
1037 const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);
1038
1039 if (baseChar != wChar) {
1040 *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
1041 *outInputIndex = startInputIndex;
1042 return false;
1043 }
1044 int inputIndex = startInputIndex;
1045 outNewWord[inputIndex] = character;
1046 if (hasMultipleChars) {
1047 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1048 while (NOT_A_CHARACTER != character) {
1049 baseChar = toBaseLowerCase(character);
1050 if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
1051 *outPos = BinaryFormat::skipOtherCharacters(root, pos);
1052 *outInputIndex = startInputIndex;
1053 return false;
1054 }
1055 outNewWord[inputIndex] = character;
1056 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1057 }
1058 }
1059 *outInputIndex = inputIndex + 1;
1060 *outPos = pos;
1061 return true;
1062}
1063
1064// This function is invoked when a word like the word searched for is found.
1065// It will compare the frequency to the max frequency, and if greater, will
1066// copy the word into the output buffer. In output value maxFreq, it will
1067// write the new maximum frequency if it changed.
1068static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,
1069 short unsigned int* outWord, int* maxFreq) {
1070 if (freq > *maxFreq) {
1071 for (int q = 0; q < length; ++q)
1072 outWord[q] = newWord[q];
1073 outWord[length] = 0;
1074 *maxFreq = freq;
1075 }
1076}
1077
1078// Will find the highest frequency of the words like the one passed as an argument,
1079// that is, everything that only differs by case/accents.
1080int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,
1081 const int length, short unsigned int* outWord) {
1082 int32_t newWord[MAX_WORD_LENGTH_INTERNAL];
1083 int depth = 0;
1084 int maxFreq = -1;
1085 const uint8_t* const root = DICT_ROOT;
1086
1087 mStackChildCount[0] = root[0];
1088 mStackInputIndex[0] = 0;
1089 mStackSiblingPos[0] = 1;
1090 while (depth >= 0) {
1091 const int charGroupCount = mStackChildCount[depth];
1092 int pos = mStackSiblingPos[depth];
1093 for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
1094 int inputIndex = mStackInputIndex[depth];
1095 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
1096 // Test whether all chars in this group match with the word we are searching for. If so,
1097 // we want to traverse its children (or if the length match, evaluate its frequency).
1098 // Note that this function will output the position regardless, but will only write
1099 // into inputIndex if there is a match.
1100 const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
1101 inputIndex, newWord, &inputIndex, &pos);
1102 if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
1103 const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
1104 onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
1105 }
1106 pos = BinaryFormat::skipFrequency(flags, pos);
1107 const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
1108 const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
1109 // If we had a match and the word has children, we want to traverse them. We don't have
1110 // to traverse words longer than the one we are searching for, since they will not match
1111 // anyway, so don't traverse unless inputIndex < length.
1112 if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {
1113 // Save position for this depth, to get back to this once children are done
1114 mStackChildCount[depth] = charGroupIndex;
1115 mStackSiblingPos[depth] = siblingPos;
1116 // Prepare stack values for next depth
1117 ++depth;
1118 int childrenPos = childrenNodePos;
1119 mStackChildCount[depth] =
1120 BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
1121 mStackSiblingPos[depth] = childrenPos;
1122 mStackInputIndex[depth] = inputIndex;
1123 pos = childrenPos;
1124 // Go to the next depth level.
1125 ++depth;
1126 break;
1127 } else {
1128 // No match, or no children, or word too long to ever match: go the next sibling.
1129 pos = siblingPos;
1130 }
1131 }
1132 --depth;
1133 }
1134 return maxFreq;
1135}
1136
1137// This function gets the frequency of the exact matching word in the dictionary.
1138// If no match is found, it returns -1.
1139int UnigramDictionary::getFrequency(const uint16_t* const inWord, const int length) const {
1140 int pos = 0;
1141 int wordPos = 0;
1142 const uint8_t* const root = DICT_ROOT;
1143
1144 while (true) {
1145 // If we already traversed the tree further than the word is long, there means
1146 // there was no match (or we would have found it).
1147 if (wordPos > length) return -1;
1148 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
1149 const uint16_t wChar = inWord[wordPos];
1150 while (true) {
1151 // If there are no more character groups in this node, it means we could not
1152 // find a matching character for this depth, therefore there is no match.
1153 if (0 >= charGroupCount) return -1;
1154 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
1155 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1156 if (character == wChar) {
1157 // This is the correct node. Only one character group may start with the same
1158 // char within a node, so either we found our match in this node, or there is
1159 // no match and we can return -1. So we will check all the characters in this
1160 // character group indeed does match.
1161 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
1162 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1163 while (NOT_A_CHARACTER != character) {
1164 ++wordPos;
1165 // If we shoot the length of the word we search for, or if we find a single
1166 // character that does not match, as explained above, it means the word is
1167 // not in the dictionary (by virtue of this chargroup being the only one to
1168 // match the word on the first character, but not matching the whole word).
1169 if (wordPos > length) return -1;
1170 if (inWord[wordPos] != character) return -1;
1171 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1172 }
1173 }
1174 // If we come here we know that so far, we do match. Either we are on a terminal
1175 // and we match the length, in which case we found it, or we traverse children.
1176 // If we don't match the length AND don't have children, then a word in the
1177 // dictionary fully matches a prefix of the searched word but not the full word.
1178 ++wordPos;
1179 if (FLAG_IS_TERMINAL & flags) {
1180 if (wordPos == length) {
1181 return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
1182 }
1183 pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
1184 }
1185 if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags))
1186 return -1;
1187 // We have children and we are still shorter than the word we are searching for, so
1188 // we need to traverse children. Put the pointer on the children position, and
1189 // break
1190 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
1191 break;
1192 } else {
1193 // This chargroup does not match, so skip the remaining part and go to the next.
1194 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
1195 pos = BinaryFormat::skipOtherCharacters(root, pos);
1196 }
1197 pos = BinaryFormat::skipFrequency(flags, pos);
1198 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
1199 }
1200 --charGroupCount;
1201 }
1202 }
1203}
1204
1205bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
1206 return -1 != getFrequency(inWord, length);
1207}
1208
1209int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,
1210 unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
1211 int maxAlternatives) {
1212 // TODO: add implementation.
1213 return 0;
1214}
1215
1216// TODO: remove this function.
1217int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
1218 int length) const {
1219 return -1;
1220}
1221
1222// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
1223// If the return value is false, then the caller should read in the output "nextSiblingPosition"
1224// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
1225// It is worthy to note that when false is returned, the output values other than
1226// nextSiblingPosition are undefined.
1227// If the return value is true, then the caller must proceed to traverse the children of this
1228// node. processCurrentNode will output the information about the children: their count in
1229// newCount, their position in newChildrenPosition, the traverseAllNodes flag in
1230// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
1231// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
1232// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
1233// there aren't any more nodes at this level, it merely returns the address of the first byte after
1234// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
1235// given level, as output into newCount when traversing this level's parent.
Jean Chalard0584f022011-06-30 19:23:16 +09001236inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
1237 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
1238 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard1059f272011-06-28 20:45:05 +09001239 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001240 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
Jean Chalard432789a2011-06-30 17:50:48 +09001241 int *nextSiblingPosition, int *newOutputIndex) {
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001242 if (DEBUG_DICT) {
1243 int inputCount = 0;
1244 if (skipPos >= 0) ++inputCount;
1245 if (excessivePos >= 0) ++inputCount;
1246 if (transposedPos >= 0) ++inputCount;
1247 assert(inputCount <= 1);
1248 }
Jean Chalard0584f022011-06-30 19:23:16 +09001249 int pos = initialPos;
1250 int depth = initialDepth;
1251 int traverseAllNodes = initialTraverseAllNodes;
1252 int diffs = initialDiffs;
1253
Jean Chalard1059f272011-06-28 20:45:05 +09001254 // Flags contain the following information:
1255 // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
1256 // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
1257 // is on the specified number of bytes.
1258 // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
1259 // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
1260 // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
1261 // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
1262 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
1263 const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001264
Jean Chalard1059f272011-06-28 20:45:05 +09001265 // This gets only ONE character from the stream. Next there will be:
1266 // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
1267 // else if FLAG_IS_TERMINAL: the frequency
1268 // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
1269 // Note that you can't have a node that both is not a terminal and has no children.
1270 int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
1271 assert(NOT_A_CHARACTER != c);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001272
Jean Chalard1059f272011-06-28 20:45:05 +09001273 // We are going to loop through each character and make it look like it's a different
1274 // node each time. To do that, we will process characters in this node in order until
1275 // we find the character terminator. This is signalled by getCharCode* returning
1276 // NOT_A_CHARACTER.
1277 // As a special case, if there is only one character in this node, we must not read the
1278 // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.
1279 // This way, each loop run will look like a "virtual node".
1280 do {
1281 // We prefetch the next char. If 'c' is the last char of this node, we will have
1282 // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node
1283 // should behave as a terminal or not and whether we have children.
1284 const int32_t nextc = hasMultipleChars
1285 ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;
1286 const bool isLastChar = (NOT_A_CHARACTER == nextc);
1287 // If there are more chars in this nodes, then this virtual node is not a terminal.
1288 // If we are on the last char, this virtual node is a terminal if this node is.
1289 const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags));
1290 // If there are more chars in this node, then this virtual node has children.
1291 // If we are on the last char, this virtual node has children if this node has.
1292 const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001293
Jean Chalard1059f272011-06-28 20:45:05 +09001294 // This has to be done for each virtual char (this forwards the "inputIndex" which
1295 // is the index in the user-inputted chars, as read by getInputCharsAt.
1296 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
1297 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
1298 mWord[depth] = c;
1299 if (traverseAllNodes && isTerminal) {
1300 // The frequency should be here, because we come here only if this is actually
1301 // a terminal node, and we are on its last char.
1302 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
1303 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1304 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
1305 }
1306 if (!hasChildren) {
1307 // If we don't have children here, that means we finished processing all
1308 // characters of this node (we are on the last virtual node), AND we are in
1309 // traverseAllNodes mode, which means we are searching for *completions*. We
1310 // should skip the frequency if we have a terminal, and report the position
1311 // of the next sibling. We don't have to return other values because we are
1312 // returning false, as in "don't traverse children".
1313 if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos);
1314 *nextSiblingPosition =
1315 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1316 return false;
1317 }
1318 } else {
1319 const int *currentChars = getInputCharsAt(inputIndex);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001320
Jean Chalard1059f272011-06-28 20:45:05 +09001321 if (transposedPos >= 0) {
1322 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
1323 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
1324 }
1325
1326 const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos,
1327 excessivePos, transposedPos);
1328 if (UNRELATED_CHAR == matchedProximityCharId) {
1329 // We found that this is an unrelated character, so we should give up traversing
1330 // this node and its children entirely.
1331 // However we may not be on the last virtual node yet so we skip the remaining
1332 // characters in this node, the frequency if it's there, read the next sibling
1333 // position to output it, then return false.
1334 // We don't have to output other values because we return false, as in
1335 // "don't traverse children".
1336 if (!isLastChar) {
1337 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
1338 }
1339 pos = BinaryFormat::skipFrequency(flags, pos);
1340 *nextSiblingPosition =
1341 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1342 return false;
1343 }
1344 mWord[depth] = c;
1345 // If inputIndex is greater than mInputLength, that means there is no
1346 // proximity chars. So, we don't need to check proximity.
1347 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
1348 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
1349 }
1350 const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
1351 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
1352 if (isSameAsUserTypedLength && isTerminal) {
1353 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
1354 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1355 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
1356 }
1357 // This character matched the typed character (enough to traverse the node at least)
1358 // so we just evaluated it. Now we should evaluate this virtual node's children - that
1359 // is, if it has any. If it has no children, we're done here - so we skip the end of
1360 // the node, output the siblings position, and return false "don't traverse children".
1361 // Note that !hasChildren implies isLastChar, so we know we don't have to skip any
1362 // remaining char in this group for there can't be any.
1363 if (!hasChildren) {
1364 pos = BinaryFormat::skipFrequency(flags, pos);
1365 *nextSiblingPosition =
1366 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1367 return false;
1368 }
1369 // Start traversing all nodes after the index exceeds the user typed length
1370 traverseAllNodes = isSameAsUserTypedLength;
1371 diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
1372 // Finally, we are ready to go to the next character, the next "virtual node".
1373 // We should advance the input index.
1374 // We do this in this branch of the 'if traverseAllNodes' because we are still matching
1375 // characters to input; the other branch is not matching them but searching for
1376 // completions, this is why it does not have to do it.
1377 ++inputIndex;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001378 }
Jean Chalard1059f272011-06-28 20:45:05 +09001379 // Optimization: Prune out words that are too long compared to how much was typed.
1380 if (depth >= maxDepth || diffs > mMaxEditDistance) {
1381 // We are giving up parsing this node and its children. Skip the rest of the node,
1382 // output the sibling position, and return that we don't want to traverse children.
1383 if (!isLastChar) {
1384 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
1385 }
1386 pos = BinaryFormat::skipFrequency(flags, pos);
1387 *nextSiblingPosition =
1388 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1389 return false;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001390 }
1391
Jean Chalard1059f272011-06-28 20:45:05 +09001392 // Prepare for the next character. Promote the prefetched char to current char - the loop
1393 // will take care of prefetching the next. If we finally found our last char, nextc will
1394 // contain NOT_A_CHARACTER.
1395 c = nextc;
1396 // Also, the next char is one "virtual node" depth more than this char.
1397 ++depth;
1398 } while (NOT_A_CHARACTER != c);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001399
1400 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
Jean Chalard1059f272011-06-28 20:45:05 +09001401 // Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength.
1402 if (mInputLength <= *newInputIndex) {
1403 traverseAllNodes = true;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001404 }
Jean Chalard1059f272011-06-28 20:45:05 +09001405
1406 // All the output values that are purely computation by this function are held in local
1407 // variables. Output them to the caller.
1408 *newTraverseAllNodes = traverseAllNodes;
1409 *newMatchRate = matchWeight;
1410 *newDiffs = diffs;
1411 *newInputIndex = inputIndex;
1412 *newOutputIndex = depth;
1413
1414 // Now we finished processing this node, and we want to traverse children. If there are no
1415 // children, we can't come here.
1416 assert(BinaryFormat::hasChildrenInFlags(flags));
1417
1418 // If this node was a terminal it still has the frequency under the pointer (it may have been
1419 // read, but not skipped - see readFrequencyWithoutMovingPointer).
1420 // Next come the children position, then possibly attributes (attributes are bigrams only for
1421 // now, maybe something related to shortcuts in the future).
1422 // Once this is read, we still need to output the number of nodes in the immediate children of
1423 // this node, so we read and output it before returning true, as in "please traverse children".
1424 pos = BinaryFormat::skipFrequency(flags, pos);
1425 int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
1426 *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1427 *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
1428 *newChildrenPosition = childrenPos;
1429 return true;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001430}
1431
Jean Chalardbc90c722011-06-20 21:09:04 +09001432#endif // NEW_DICTIONARY_FORMAT
1433
satok30088252010-12-01 21:22:15 +09001434} // namespace latinime