blob: 5e72c764f563bd7c0a1be7eb3f49d5f2651d5aa2 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "basechars.h"
24#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090025#include "dictionary.h"
26#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090027
Jean Chalard1059f272011-06-28 20:45:05 +090028#ifdef NEW_DICTIONARY_FORMAT
29#include "binary_format.h"
30#endif // NEW_DICTIONARY_FORMAT
31
satok30088252010-12-01 21:22:15 +090032namespace latinime {
33
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090034const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
35 { { 'a', 'e' },
36 { 'o', 'e' },
37 { 'u', 'e' } };
38
Jean Chalard293ece02011-06-16 20:55:16 +090039// TODO: check the header
40UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090041 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090042 const bool isLatestDictVersion)
Jean Chalard1059f272011-06-28 20:45:05 +090043#ifndef NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +090044 : DICT_ROOT(streamStart),
Jean Chalard1059f272011-06-28 20:45:05 +090045#else // NEW_DICTIONARY_FORMAT
46 : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
47#endif // NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +090048 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090049 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
50 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalard1059f272011-06-28 20:45:05 +090051#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090052 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalard1059f272011-06-28 20:45:05 +090053#else // NEW_DICTIONARY_FORMAT
54 // TODO : remove this variable.
55 ROOT_POS(0),
56#endif // NEW_DICTIONARY_FORMAT
Jean Chalarda787dba2011-03-04 12:17:48 +090057 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
58 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090059 if (DEBUG_DICT) {
60 LOGI("UnigramDictionary - constructor");
61 }
satok30088252010-12-01 21:22:15 +090062}
63
satok18c28f42010-12-02 18:11:54 +090064UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090065
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090066static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
67 const int MAX_PROXIMITY_CHARS) {
68 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
69}
70
71bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
72
73 // There can't be a digraph if we don't have at least 2 characters to examine
74 if (i + 2 > codesSize) return false;
75
76 // Search for the first char of some digraph
77 int lastDigraphIndex = -1;
78 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
79 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
80 lastDigraphIndex >= 0; --lastDigraphIndex) {
81 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
82 }
83 // No match: return early
84 if (lastDigraphIndex < 0) return false;
85
86 // It's an interesting digraph if the second char matches too.
87 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
88}
89
90// Mostly the same arguments as the non-recursive version, except:
91// codes is the original value. It points to the start of the work buffer, and gets passed as is.
92// codesSize is the size of the user input (thus, it is the size of codesSrc).
93// codesDest is the current point in the work buffer.
94// codesSrc is the current point in the user-input, original, content-unmodified buffer.
95// codesRemain is the remaining size in codesSrc.
96void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
97 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
98 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080099 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900100
Jean Chalarda787dba2011-03-04 12:17:48 +0900101 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
102 for (int i = 0; i < codesRemain; ++i) {
103 if (isDigraph(codesSrc, i, codesRemain)) {
104 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900105
Jean Chalarda787dba2011-03-04 12:17:48 +0900106 // Copy the word up to the first char of the digraph, then continue processing
107 // on the remaining part of the word, skipping the second char of the digraph.
108 // In our example, copy "pru" and continue running on "fen"
109 // Make i the index of the second char of the digraph for simplicity. Forgetting
110 // to do that results in an infinite recursion so take care!
111 ++i;
112 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
113 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
114 codesBuffer, codesBufferSize, flags,
115 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
116 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
117 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900118
Jean Chalarda787dba2011-03-04 12:17:48 +0900119 // Copy the second char of the digraph in place, then continue processing on
120 // the remaining part of the word.
121 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
122 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
123 BYTES_IN_ONE_CHAR);
124 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
125 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
126 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
127 outWords, frequencies);
128 return;
129 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900130 }
131 }
132
133 // If we come here, we hit the end of the word: let's check it against the dictionary.
134 // In our example, we'll come here once for "prufen" and then once for "pruefen".
135 // If the word contains several digraphs, we'll come it for the product of them.
136 // eg. if the word is "ueberpruefen" we'll test, in order, against
137 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
138 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
139 if (0 != remainingBytes)
140 memcpy(codesDest, codesSrc, remainingBytes);
141
142 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
143 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
144}
145
146int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
147 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
148 unsigned short *outWords, int *frequencies) {
149
150 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
151 { // Incrementally tune the word and try all possibilities
152 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
153 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900154 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900155 } else { // Normal processing
156 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
157 outWords, frequencies);
158 }
159
satok817e5172011-03-04 06:06:45 -0800160 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900161 // Get the word count
162 int suggestedWordsCount = 0;
163 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
164 suggestedWordsCount++;
165 }
166
167 if (DEBUG_DICT) {
168 LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard980d6b62011-06-30 17:02:23 +0900169 /// Print the returned words
170 for (int j = 0; j < suggestedWordsCount; ++j) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700171#ifdef FLAG_DBG
Jean Chalard980d6b62011-06-30 17:02:23 +0900172 short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
173 char s[MAX_WORD_LENGTH];
174 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
175 LOGI("%s %i", s, mFrequencies[j]);
satok787945b2011-07-14 08:32:57 +0900176#endif
Jean Chalard980d6b62011-06-30 17:02:23 +0900177 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900178 LOGI("Next letters: ");
179 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
180 if (mNextLettersFrequency[k] > 0) {
181 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
182 }
183 }
184 }
satok817e5172011-03-04 06:06:45 -0800185 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900186 PROF_CLOSE;
187 return suggestedWordsCount;
188}
189
190void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
191 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
192 unsigned short *outWords, int *frequencies) {
193
satok61e2f852011-01-05 14:13:07 +0900194 PROF_OPEN;
195 PROF_START(0);
satok30088252010-12-01 21:22:15 +0900196 initSuggestions(codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900197 if (DEBUG_DICT) assert(codesSize == mInputLength);
198
satoka3d78f62010-12-09 22:08:33 +0900199 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900200 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900201
satok61e2f852011-01-05 14:13:07 +0900202 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900203 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900204 PROF_END(1);
205
206 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900207 // Suggestion with missing character
208 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900209 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900210 if (DEBUG_DICT) {
211 LOGI("--- Suggest missing characters %d", i);
212 }
satok54fe9e02010-12-13 14:42:35 +0900213 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900214 }
215 }
satok61e2f852011-01-05 14:13:07 +0900216 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900217
satok61e2f852011-01-05 14:13:07 +0900218 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900219 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900220 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
221 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900222 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900223 if (DEBUG_DICT) {
224 LOGI("--- Suggest excessive characters %d", i);
225 }
satok54fe9e02010-12-13 14:42:35 +0900226 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900227 }
228 }
satok61e2f852011-01-05 14:13:07 +0900229 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900230
satok61e2f852011-01-05 14:13:07 +0900231 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900232 // Suggestion with transposed characters
233 // Only suggest words that length is mInputLength
234 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
235 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900236 if (DEBUG_DICT) {
237 LOGI("--- Suggest transposed characters %d", i);
238 }
satok54fe9e02010-12-13 14:42:35 +0900239 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900240 }
241 }
satok61e2f852011-01-05 14:13:07 +0900242 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900243
satok61e2f852011-01-05 14:13:07 +0900244 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900245 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900246 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
247 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900248 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900249 if (DEBUG_DICT) {
250 LOGI("--- Suggest missing space characters %d", i);
251 }
satok662fe692010-12-08 17:05:39 +0900252 getMissingSpaceWords(mInputLength, i);
253 }
254 }
satok61e2f852011-01-05 14:13:07 +0900255 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800256
257 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900258 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800259 // The first and last "mistyped spaces" are taken care of by excessive character handling
260 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900261 if (DEBUG_DICT) {
262 LOGI("--- Suggest words with proximity space %d", i);
263 }
satok817e5172011-03-04 06:06:45 -0800264 const int x = xcoordinates[i];
265 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900266 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800267 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
268 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900269 }
satok817e5172011-03-04 06:06:45 -0800270 if (proximityInfo->hasSpaceProximity(x, y)) {
271 getMistypedSpaceWords(mInputLength, i);
272 }
satok817e5172011-03-04 06:06:45 -0800273 }
274 }
275 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900276}
277
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900278void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
279 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900280 if (DEBUG_DICT) {
281 LOGI("initSuggest");
282 }
satok30088252010-12-01 21:22:15 +0900283 mFrequencies = frequencies;
284 mOutputChars = outWords;
285 mInputCodes = codes;
286 mInputLength = codesSize;
287 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
288}
289
Jean Chalard8124e642011-06-16 22:33:41 +0900290static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900291 if (c < nextLettersSize) {
292 nextLetters[c]++;
293 }
294}
295
satok662fe692010-12-08 17:05:39 +0900296// TODO: We need to optimize addWord by using STL or something
Jean Chalardca5ef282011-06-17 15:36:26 +0900297// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok28bd03b2010-12-03 16:39:16 +0900298bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900299 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900300 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700301#ifdef FLAG_DBG
satok30088252010-12-01 21:22:15 +0900302 char s[length + 1];
303 for (int i = 0; i <= length; i++) s[i] = word[i];
satok662fe692010-12-08 17:05:39 +0900304 LOGI("Found word = %s, freq = %d", s, frequency);
satok787945b2011-07-14 08:32:57 +0900305#endif
satok30088252010-12-01 21:22:15 +0900306 }
satokf5cded12010-12-06 21:28:24 +0900307 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900308 if (DEBUG_DICT) {
309 LOGI("Exceeded max word length.");
310 }
satokf5cded12010-12-06 21:28:24 +0900311 return false;
312 }
satok30088252010-12-01 21:22:15 +0900313
314 // Find the right insertion point
315 int insertAt = 0;
316 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900317 // TODO: How should we sort words with the same frequency?
318 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900319 break;
320 }
321 insertAt++;
322 }
323 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900324 if (DEBUG_DICT) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700325#ifdef FLAG_DBG
satokcdbbea72010-12-08 16:04:16 +0900326 char s[length + 1];
327 for (int i = 0; i <= length; i++) s[i] = word[i];
satokb2e5e592011-04-26 14:50:54 +0900328 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satok787945b2011-07-14 08:32:57 +0900329#endif
satokcdbbea72010-12-08 16:04:16 +0900330 }
satok30088252010-12-01 21:22:15 +0900331 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
332 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
333 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
334 mFrequencies[insertAt] = frequency;
335 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900336 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900337 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900338 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900339 while (length--) {
340 *dest++ = *word++;
341 }
342 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900343 if (DEBUG_DICT) {
344 LOGI("Added word at %d", insertAt);
345 }
satok30088252010-12-01 21:22:15 +0900346 return true;
347 }
348 return false;
349}
350
Jean Chalard8124e642011-06-16 22:33:41 +0900351static inline unsigned short toBaseLowerCase(unsigned short c) {
satok30088252010-12-01 21:22:15 +0900352 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
353 c = BASE_CHARS[c];
354 }
355 if (c >='A' && c <= 'Z') {
356 c |= 32;
357 } else if (c > 127) {
358 c = latin_tolower(c);
359 }
360 return c;
361}
362
Jean Chalardca5ef282011-06-17 15:36:26 +0900363bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {
satok30088252010-12-01 21:22:15 +0900364 if (length != mInputLength) {
365 return false;
366 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900367 const int *inputCodes = mInputCodes;
satok30088252010-12-01 21:22:15 +0900368 while (length--) {
369 if ((unsigned int) *inputCodes != (unsigned int) *word) {
370 return false;
371 }
satok662fe692010-12-08 17:05:39 +0900372 inputCodes += MAX_PROXIMITY_CHARS;
satok30088252010-12-01 21:22:15 +0900373 word++;
374 }
375 return true;
376}
377
satok715514d2010-12-02 20:19:59 +0900378static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900379static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900380
satok54fe9e02010-12-13 14:42:35 +0900381void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900382 const int excessivePos, const int transposedPos, int *nextLetters,
383 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900384 if (DEBUG_DICT) {
385 LOGI("getSuggestionCandidates %d", maxDepth);
386 assert(transposedPos + 1 < mInputLength);
387 assert(excessivePos < mInputLength);
388 assert(missingPos < mInputLength);
389 }
satok662fe692010-12-08 17:05:39 +0900390 int rootPosition = ROOT_POS;
Jean Chalard980d6b62011-06-30 17:02:23 +0900391 // Get the number of children of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900392 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900393 int depth = 0;
394
395 mStackChildCount[0] = childCount;
396 mStackTraverseAll[0] = (mInputLength <= 0);
397 mStackNodeFreq[0] = 1;
398 mStackInputIndex[0] = 0;
399 mStackDiffs[0] = 0;
400 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900401 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900402
satok662fe692010-12-08 17:05:39 +0900403 // Depth first search
satokd2997922010-12-07 13:08:39 +0900404 while (depth >= 0) {
405 if (mStackChildCount[depth] > 0) {
406 --mStackChildCount[depth];
407 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900408 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900409 int inputIndex = mStackInputIndex[depth];
410 int diffs = mStackDiffs[depth];
411 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900412 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900413 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900414 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900415 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900416 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900417 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
418 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
419 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900420 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900421 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900422 mStackSiblingPos[depth] = siblingPos;
423 if (needsToTraverseChildrenNodes) {
424 // Goes to child node
425 ++depth;
426 mStackChildCount[depth] = childCount;
427 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900428 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900429 mStackInputIndex[depth] = inputIndex;
430 mStackDiffs[depth] = diffs;
431 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900432 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900433 }
434 } else {
satokcdbbea72010-12-08 16:04:16 +0900435 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900436 --depth;
437 }
438 }
439}
440
satokb2e5e592011-04-26 14:50:54 +0900441static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
442static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
443 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
444}
445
446static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
447inline static void multiplyIntCapped(const int multiplier, int *base) {
448 const int temp = *base;
449 if (temp != S_INT_MAX) {
450 // Branch if multiplier == 2 for the optimization
451 if (multiplier == 2) {
452 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
453 } else {
454 const int tempRetval = temp * multiplier;
455 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
456 }
457 }
458}
459
460inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900461 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900462 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900463 } else {
satokb2e5e592011-04-26 14:50:54 +0900464 int ret = base;
465 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
466 return ret;
467 }
468}
469
470inline static void multiplyRate(const int rate, int *freq) {
471 if (*freq != S_INT_MAX) {
472 if (*freq > 1000000) {
473 *freq /= 100;
474 multiplyIntCapped(rate, freq);
475 } else {
476 multiplyIntCapped(rate, freq);
477 *freq /= 100;
478 }
satokf7425bb2011-01-05 16:37:53 +0900479 }
480}
481
satok4c981d32011-04-19 13:58:42 +0900482inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900483 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
484 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900485 if (firstWordLength == 0 || secondWordLength == 0) {
486 return 0;
487 }
488 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
489 int tempFirstFreq = firstFreq;
490 multiplyRate(firstDemotionRate, &tempFirstFreq);
491
492 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
493 int tempSecondFreq = secondFreq;
494 multiplyRate(secondDemotionRate, &tempSecondFreq);
495
496 const int totalLength = firstWordLength + secondWordLength;
497
498 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
499 // length.
500 int totalFreq = tempFirstFreq + tempSecondFreq;
501
502 // This is a workaround to try offsetting the not-enough-demotion which will be done in
503 // calcNormalizedScore in Utils.java.
504 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
505 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
506 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
507 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
508 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
509
510 // At this moment, totalFreq is calculated by the following formula:
511 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
512 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
513
satokb2e5e592011-04-26 14:50:54 +0900514 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900515
516 // This is another workaround to offset the demotion which will be done in
517 // calcNormalizedScore in Utils.java.
518 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
519 // the same amount because we already have adjusted the synthetic freq of this "missing or
520 // mistyped space" suggestion candidate above in this method.
521 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
522 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
523
satokd8db9f82011-05-18 15:31:04 +0900524 if (isSpaceProximity) {
525 // A word pair with one space proximity correction
526 if (DEBUG_DICT) {
527 LOGI("Found a word pair with space proximity correction.");
528 }
529 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
530 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
531 }
532
satok4c981d32011-04-19 13:58:42 +0900533 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
534 return totalFreq;
535}
536
satok817e5172011-03-04 06:06:45 -0800537bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
538 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900539 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800540}
541
542bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
543 return getSplitTwoWordsSuggestion(
544 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900545 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800546}
547
satok58c49b92011-01-27 03:23:39 +0900548inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900549 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900550 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900551 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900552 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900553 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900554 if (mInputLength >= 2) {
555 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
556 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
557 / (10 * mInputLength
558 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900559 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900560 LOGI("Demotion rate for missing character is %d.", demotionRate);
561 }
satokdc5301e2011-04-11 16:14:45 +0900562 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900563 } else {
564 finalFreq = 0;
565 }
566 }
satokf7425bb2011-01-05 16:37:53 +0900567 if (transposedPos >= 0) multiplyRate(
568 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900569 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900570 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900571 if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
satokf7425bb2011-01-05 16:37:53 +0900572 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900573 }
574 }
satok58c49b92011-01-27 03:23:39 +0900575 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900576 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900577 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900578 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900579 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900580 if (DEBUG_DICT) {
581 LOGI("Found full matched word.");
582 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900583 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
584 }
585 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900586 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900587 }
satok9674f652011-04-20 17:15:27 +0900588 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900589 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900590 if (DEBUG_DICT) {
591 LOGI("Found one proximity correction.");
592 }
satokb2e5e592011-04-26 14:50:54 +0900593 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900594 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900595 }
satok9674f652011-04-20 17:15:27 +0900596 if (DEBUG_DICT) {
597 LOGI("calc: %d, %d", depth, sameLength);
598 }
satokb2e5e592011-04-26 14:50:54 +0900599 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900600 return finalFreq;
601}
satoka3d78f62010-12-09 22:08:33 +0900602
satok28bd03b2010-12-03 16:39:16 +0900603inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900604 const int inputIndex, const int skipPos, const int depth) {
satok8fbd5522011-02-22 17:28:55 +0900605 const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
satok28bd03b2010-12-03 16:39:16 +0900606 // Skip the ' or other letter and continue deeper
607 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
608}
609
satoke07baa62010-12-09 21:55:40 +0900610inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
Jean Chalard07a84062011-03-03 10:22:10 +0900611 const int inputLength) const {
satoke07baa62010-12-09 21:55:40 +0900612 if (inputIndex < 0 || inputIndex >= inputLength) return false;
613 const int currentChar = *getInputCharsAt(inputIndex);
614 const int leftIndex = inputIndex - 1;
615 if (leftIndex >= 0) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900616 const int *leftChars = getInputCharsAt(leftIndex);
satoke07baa62010-12-09 21:55:40 +0900617 int i = 0;
618 while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
619 if (leftChars[i++] == currentChar) return true;
620 }
621 }
622 const int rightIndex = inputIndex + 1;
623 if (rightIndex < inputLength) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900624 const int *rightChars = getInputCharsAt(rightIndex);
satoke07baa62010-12-09 21:55:40 +0900625 int i = 0;
626 while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
627 if (rightChars[i++] == currentChar) return true;
628 }
629 }
630 return false;
631}
632
Jean Chalarda5d58492011-02-18 17:50:58 +0900633// In the following function, c is the current character of the dictionary word
634// currently examined.
635// currentChars is an array containing the keys close to the character the
636// user actually typed at the same position. We want to see if c is in it: if so,
637// then the word contains at that position a character close to what the user
638// typed.
639// What the user typed is actually the first character of the array.
640// Notice : accented characters do not have a proximity list, so they are alone
641// in their list. The non-accented version of the character should be considered
642// "close", but not the other keys close to the non-accented version.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900643inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
644 const int *currentChars, const unsigned short c, const int skipPos,
645 const int excessivePos, const int transposedPos) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900646 const unsigned short baseLowerC = toBaseLowerCase(c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900647
648 // The first char in the array is what user typed. If it matches right away,
649 // that means the user typed that same char for this pos.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900650 if (currentChars[0] == baseLowerC || currentChars[0] == c)
Jean Chalarda5d58492011-02-18 17:50:58 +0900651 return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
652
653 // If one of those is true, we should not check for close characters at all.
654 if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
655 return UNRELATED_CHAR;
656
657 // If the non-accented, lowercased version of that first character matches c,
658 // then we have a non-accented version of the accented character the user
659 // typed. Treat it as a close char.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900660 if (toBaseLowerCase(currentChars[0]) == baseLowerC)
Jean Chalarda5d58492011-02-18 17:50:58 +0900661 return NEAR_PROXIMITY_CHAR;
662
663 // Not an exact nor an accent-alike match: search the list of close keys
664 int j = 1;
satoke07baa62010-12-09 21:55:40 +0900665 while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900666 const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900667 if (matched) return NEAR_PROXIMITY_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900668 ++j;
669 }
Jean Chalarda5d58492011-02-18 17:50:58 +0900670
671 // Was not included, signal this as an unrelated character.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900672 return UNRELATED_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900673}
674
Jean Chalardca5ef282011-06-17 15:36:26 +0900675inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900676 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900677 const int inputIndex, const int matchWeight, const int skipPos,
678 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
679 int* nextLetters, const int nextLettersSize) {
680
681 const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;
Jean Chalard980d6b62011-06-30 17:02:23 +0900682 if (isSameAsTyped) return;
Jean Chalardca5ef282011-06-17 15:36:26 +0900683
684 if (depth >= MIN_SUGGEST_DEPTH) {
685 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
686 excessivePos, transposedPos, freq, sameLength);
687 if (!isSameAsTyped)
688 addWord(word, depth + 1, finalFreq);
Jean Chalardca5ef282011-06-17 15:36:26 +0900689 }
690
691 if (sameLength && depth >= mInputLength && skipPos < 0) {
692 registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);
693 }
694}
695
Jean Chalarde6715e32011-06-30 19:47:25 +0900696bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
697 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
698 const int secondWordLength, const bool isSpaceProximity) {
699 if (inputLength >= MAX_WORD_LENGTH) return false;
700 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
701 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
702 return false;
703 const int newWordLength = firstWordLength + secondWordLength + 1;
704 // Allocating variable length array on stack
705 unsigned short word[newWordLength];
706 const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
707 if (DEBUG_DICT) {
708 LOGI("First freq: %d", firstFreq);
709 }
710 if (firstFreq <= 0) return false;
711
712 for (int i = 0; i < firstWordLength; ++i) {
713 word[i] = mWord[i];
714 }
715
716 const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
717 if (DEBUG_DICT) {
718 LOGI("Second freq: %d", secondFreq);
719 }
720 if (secondFreq <= 0) return false;
721
722 word[firstWordLength] = SPACE;
723 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
724 word[i] = mWord[i - firstWordLength - 1];
725 }
726
727 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
728 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
729 if (DEBUG_DICT) {
730 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
731 TYPED_LETTER_MULTIPLIER);
732 }
733 addWord(word, newWordLength, pairFreq);
734 return true;
735}
736
Jean Chalardbc90c722011-06-20 21:09:04 +0900737#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardbc90c722011-06-20 21:09:04 +0900738// The following functions will be entirely replaced with new implementations.
739void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
740 const int excessivePos, const int transposedPos,int *nextLetters,
741 const int nextLettersSize) {
742 int initialPosition = initialPos;
743 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
744 getWordsRec(count, initialPosition, 0,
745 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
746 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
747 nextLettersSize);
748}
Jean Chalardca5ef282011-06-17 15:36:26 +0900749
Jean Chalardbc90c722011-06-20 21:09:04 +0900750void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
751 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
752 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
753 const int transposedPos, int *nextLetters, const int nextLettersSize) {
754 int siblingPos = pos;
755 for (int i = 0; i < childrenCount; ++i) {
756 int newCount;
757 int newChildPosition;
758 bool newTraverseAllNodes;
759 int newMatchRate;
760 int newInputIndex;
761 int newDiffs;
762 int newSiblingPos;
763 int newOutputIndex;
764 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
765 traverseAllNodes, matchWeight, inputIndex, diffs,
766 skipPos, excessivePos, transposedPos,
767 nextLetters, nextLettersSize,
768 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
769 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
770 siblingPos = newSiblingPos;
satokcdbbea72010-12-08 16:04:16 +0900771
Jean Chalardbc90c722011-06-20 21:09:04 +0900772 if (needsToTraverseChildrenNodes) {
773 getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
774 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
775 nextLetters, nextLettersSize);
satok48e432c2010-12-06 17:38:58 +0900776 }
satok48e432c2010-12-06 17:38:58 +0900777 }
satok48e432c2010-12-06 17:38:58 +0900778}
779
Jean Chalard980d6b62011-06-30 17:02:23 +0900780inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
781 const int inputLength, unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900782 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900783 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900784 int maxFreq = 0;
785 int depth = 0;
786 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900787 bool terminal = false;
788
satokaee09dc2010-12-09 19:21:51 +0900789 mStackChildCount[0] = count;
790 mStackSiblingPos[0] = pos;
791
792 while (depth >= 0) {
793 if (mStackChildCount[depth] > 0) {
794 --mStackChildCount[depth];
795 int firstChildPos;
796 int newFreq;
797 int siblingPos = mStackSiblingPos[depth];
798 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
799 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
800 &siblingPos);
801 mStackSiblingPos[depth] = siblingPos;
802 if (depth == (inputLength - 1)) {
803 // Traverse sibling node
804 if (terminal) {
805 if (newFreq > maxFreq) {
806 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
807 if (DEBUG_DICT && DEBUG_NODE) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700808#ifdef FLAG_DBG
satokaee09dc2010-12-09 19:21:51 +0900809 char s[inputLength + 1];
810 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
811 s[inputLength] = 0;
812 LOGI("New missing space word found: %d > %d (%s), %d, %d",
813 newFreq, maxFreq, s, inputLength, depth);
satok787945b2011-07-14 08:32:57 +0900814#endif
satokaee09dc2010-12-09 19:21:51 +0900815 }
816 maxFreq = newFreq;
817 }
818 }
819 } else if (needsToTraverseChildrenNodes) {
820 // Traverse children nodes
821 ++depth;
822 mStackChildCount[depth] = count;
823 mStackSiblingPos[depth] = firstChildPos;
824 }
825 } else {
826 // Traverse parent node
827 --depth;
satok662fe692010-12-08 17:05:39 +0900828 }
829 }
satokaee09dc2010-12-09 19:21:51 +0900830
831 word[inputLength] = 0;
832 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900833}
834
835inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900836 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
837 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
838 const int inputIndex = startInputIndex + depth;
satok8fbd5522011-02-22 17:28:55 +0900839 const int *currentChars = getInputCharsAt(inputIndex);
satok662fe692010-12-08 17:05:39 +0900840 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900841 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
842 &c, newChildPosition, newTerminal, newFreq);
satokaee09dc2010-12-09 19:21:51 +0900843 const unsigned int inputC = currentChars[0];
Ken Wakasade3070a2011-03-19 09:16:42 +0900844 if (DEBUG_DICT) {
845 assert(inputC <= U_SHORT_MAX);
846 }
Jean Chalardf5f834a2011-02-22 15:12:46 +0900847 const unsigned short baseLowerC = toBaseLowerCase(c);
848 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900849 const bool hasChild = *newChildPosition != 0;
850 if (matched) {
851 word[depth] = c;
852 if (DEBUG_DICT && DEBUG_NODE) {
853 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900854 if (*newTerminal) {
855 LOGI("Terminal %d", *newFreq);
856 }
satok662fe692010-12-08 17:05:39 +0900857 }
satokaee09dc2010-12-09 19:21:51 +0900858 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900859 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900860 return true;
861 } else {
862 return false;
863 }
864 } else {
865 // If this node is not user typed character, this method treats this word as unmatched.
866 // Thus newTerminal shouldn't be true.
867 *newTerminal = false;
868 return false;
satok662fe692010-12-08 17:05:39 +0900869 }
satok662fe692010-12-08 17:05:39 +0900870}
Jean Chalard8124e642011-06-16 22:33:41 +0900871
872// TODO: use uint32_t instead of unsigned short
873bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
874 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900875 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900876 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900877 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900878 }
879}
880
Jean Chalard17e44a72011-06-16 22:51:11 +0900881
882// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900883int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
884 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900885 // returns address of bigram data of that word
886 // return -99 if not found
887
888 int count = Dictionary::getCount(DICT_ROOT, &pos);
889 unsigned short currentChar = (unsigned short) word[offset];
890 for (int j = 0; j < count; j++) {
891 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
892 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
893 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
894 if (c == currentChar) {
895 if (offset == length - 1) {
896 if (terminal) {
897 return (pos+1);
898 }
899 } else {
900 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900901 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900902 if (t > 0) {
903 return t;
904 }
905 }
906 }
907 }
908 if (terminal) {
909 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
910 }
911 // There could be two instances of each alphabet - upper and lower case. So continue
912 // looking ...
913 }
914 return NOT_VALID_WORD;
915}
916
Jean Chalardbc90c722011-06-20 21:09:04 +0900917// The following functions will be modified.
Jean Chalard0584f022011-06-30 19:23:16 +0900918inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
919 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
920 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalardbc90c722011-06-20 21:09:04 +0900921 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
922 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
923 int *nextSiblingPosition, int *nextOutputIndex) {
924 if (DEBUG_DICT) {
925 int inputCount = 0;
926 if (skipPos >= 0) ++inputCount;
927 if (excessivePos >= 0) ++inputCount;
928 if (transposedPos >= 0) ++inputCount;
929 assert(inputCount <= 1);
930 }
931 unsigned short c;
932 int childPosition;
933 bool terminal;
934 int freq;
935 bool isSameAsUserTypedLength = false;
936
Jean Chalard0584f022011-06-30 19:23:16 +0900937 const int pos = initialPos;
938 const int depth = initialDepth;
939 const int traverseAllNodes = initialTraverseAllNodes;
940 const int diffs = initialDiffs;
941
Jean Chalardbc90c722011-06-20 21:09:04 +0900942 const uint8_t flags = 0; // No flags for now
943
944 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
945
946 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
947 &c, &childPosition, &terminal, &freq);
948 *nextOutputIndex = depth + 1;
949
950 const bool needsToTraverseChildrenNodes = childPosition != 0;
951
952 // If we are only doing traverseAllNodes, no need to look at the typed characters.
953 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
954 mWord[depth] = c;
955 if (traverseAllNodes && terminal) {
956 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
957 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
958 }
959 if (!needsToTraverseChildrenNodes) return false;
960 *newTraverseAllNodes = traverseAllNodes;
961 *newMatchRate = matchWeight;
962 *newDiffs = diffs;
963 *newInputIndex = inputIndex;
964 } else {
965 const int *currentChars = getInputCharsAt(inputIndex);
966
967 if (transposedPos >= 0) {
968 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
969 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
970 }
971
972 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
973 transposedPos);
974 if (UNRELATED_CHAR == matchedProximityCharId) return false;
975 mWord[depth] = c;
976 // If inputIndex is greater than mInputLength, that means there is no
977 // proximity chars. So, we don't need to check proximity.
978 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
979 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
980 }
981 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
982 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
983 if (isSameAsUserTypedLength && terminal) {
984 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
985 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
986 }
987 if (!needsToTraverseChildrenNodes) return false;
988 // Start traversing all nodes after the index exceeds the user typed length
989 *newTraverseAllNodes = isSameAsUserTypedLength;
990 *newMatchRate = matchWeight;
991 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
992 *newInputIndex = inputIndex + 1;
993 }
994 // Optimization: Prune out words that are too long compared to how much was typed.
995 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
996 return false;
997 }
998
999 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
1000 // TODO: Check if this can be isSameAsUserTypedLength only.
1001 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
1002 *newTraverseAllNodes = true;
1003 }
1004 // get the count of nodes and increment childAddress.
1005 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
1006 *newChildPosition = childPosition;
1007 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
1008 return needsToTraverseChildrenNodes;
1009}
1010
1011#else // NEW_DICTIONARY_FORMAT
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001012
Jean Chalard1059f272011-06-28 20:45:05 +09001013// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
1014// interface.
1015inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
1016 const int inputLength, unsigned short *word) {
1017 uint16_t inWord[inputLength];
1018
1019 for (int i = 0; i < inputLength; ++i) {
1020 inWord[i] = *getInputCharsAt(startInputIndex + i);
1021 }
1022 return getMostFrequentWordLikeInner(inWord, inputLength, word);
1023}
1024
1025// This function will take the position of a character array within a CharGroup,
1026// and check it actually like-matches the word in inWord starting at startInputIndex,
1027// that is, it matches it with case and accents squashed.
1028// The function returns true if there was a full match, false otherwise.
1029// The function will copy on-the-fly the characters in the CharGroup to outNewWord.
1030// It will also place the end position of the array in outPos; in outInputIndex,
1031// it will place the index of the first char AFTER the match if there was a match,
1032// and the initial position if there was not. It makes sense because if there was
1033// a match we want to continue searching, but if there was not, we want to go to
1034// the next CharGroup.
1035// In and out parameters may point to the same location. This function takes care
1036// not to use any input parameters after it wrote into its outputs.
1037static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
1038 const uint8_t* const root, const int startPos,
1039 const uint16_t* const inWord, const int startInputIndex,
1040 int32_t* outNewWord, int* outInputIndex, int* outPos) {
1041 const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
1042 int pos = startPos;
1043 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1044 int32_t baseChar = toBaseLowerCase(character);
1045 const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);
1046
1047 if (baseChar != wChar) {
1048 *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
1049 *outInputIndex = startInputIndex;
1050 return false;
1051 }
1052 int inputIndex = startInputIndex;
1053 outNewWord[inputIndex] = character;
1054 if (hasMultipleChars) {
1055 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1056 while (NOT_A_CHARACTER != character) {
1057 baseChar = toBaseLowerCase(character);
1058 if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
1059 *outPos = BinaryFormat::skipOtherCharacters(root, pos);
1060 *outInputIndex = startInputIndex;
1061 return false;
1062 }
1063 outNewWord[inputIndex] = character;
1064 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1065 }
1066 }
1067 *outInputIndex = inputIndex + 1;
1068 *outPos = pos;
1069 return true;
1070}
1071
1072// This function is invoked when a word like the word searched for is found.
1073// It will compare the frequency to the max frequency, and if greater, will
1074// copy the word into the output buffer. In output value maxFreq, it will
1075// write the new maximum frequency if it changed.
1076static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,
1077 short unsigned int* outWord, int* maxFreq) {
1078 if (freq > *maxFreq) {
1079 for (int q = 0; q < length; ++q)
1080 outWord[q] = newWord[q];
1081 outWord[length] = 0;
1082 *maxFreq = freq;
1083 }
1084}
1085
1086// Will find the highest frequency of the words like the one passed as an argument,
1087// that is, everything that only differs by case/accents.
1088int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,
1089 const int length, short unsigned int* outWord) {
1090 int32_t newWord[MAX_WORD_LENGTH_INTERNAL];
1091 int depth = 0;
1092 int maxFreq = -1;
1093 const uint8_t* const root = DICT_ROOT;
1094
1095 mStackChildCount[0] = root[0];
1096 mStackInputIndex[0] = 0;
1097 mStackSiblingPos[0] = 1;
1098 while (depth >= 0) {
1099 const int charGroupCount = mStackChildCount[depth];
1100 int pos = mStackSiblingPos[depth];
1101 for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
1102 int inputIndex = mStackInputIndex[depth];
1103 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
1104 // Test whether all chars in this group match with the word we are searching for. If so,
1105 // we want to traverse its children (or if the length match, evaluate its frequency).
1106 // Note that this function will output the position regardless, but will only write
1107 // into inputIndex if there is a match.
1108 const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
1109 inputIndex, newWord, &inputIndex, &pos);
1110 if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
1111 const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
1112 onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
1113 }
1114 pos = BinaryFormat::skipFrequency(flags, pos);
1115 const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
1116 const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
1117 // If we had a match and the word has children, we want to traverse them. We don't have
1118 // to traverse words longer than the one we are searching for, since they will not match
1119 // anyway, so don't traverse unless inputIndex < length.
1120 if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {
1121 // Save position for this depth, to get back to this once children are done
1122 mStackChildCount[depth] = charGroupIndex;
1123 mStackSiblingPos[depth] = siblingPos;
1124 // Prepare stack values for next depth
1125 ++depth;
1126 int childrenPos = childrenNodePos;
1127 mStackChildCount[depth] =
1128 BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
1129 mStackSiblingPos[depth] = childrenPos;
1130 mStackInputIndex[depth] = inputIndex;
1131 pos = childrenPos;
1132 // Go to the next depth level.
1133 ++depth;
1134 break;
1135 } else {
1136 // No match, or no children, or word too long to ever match: go the next sibling.
1137 pos = siblingPos;
1138 }
1139 }
1140 --depth;
1141 }
1142 return maxFreq;
1143}
1144
1145// This function gets the frequency of the exact matching word in the dictionary.
1146// If no match is found, it returns -1.
1147int UnigramDictionary::getFrequency(const uint16_t* const inWord, const int length) const {
1148 int pos = 0;
1149 int wordPos = 0;
1150 const uint8_t* const root = DICT_ROOT;
1151
1152 while (true) {
1153 // If we already traversed the tree further than the word is long, there means
1154 // there was no match (or we would have found it).
1155 if (wordPos > length) return -1;
1156 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
1157 const uint16_t wChar = inWord[wordPos];
1158 while (true) {
1159 // If there are no more character groups in this node, it means we could not
1160 // find a matching character for this depth, therefore there is no match.
1161 if (0 >= charGroupCount) return -1;
1162 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
1163 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1164 if (character == wChar) {
1165 // This is the correct node. Only one character group may start with the same
1166 // char within a node, so either we found our match in this node, or there is
1167 // no match and we can return -1. So we will check all the characters in this
1168 // character group indeed does match.
1169 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
1170 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1171 while (NOT_A_CHARACTER != character) {
1172 ++wordPos;
1173 // If we shoot the length of the word we search for, or if we find a single
1174 // character that does not match, as explained above, it means the word is
1175 // not in the dictionary (by virtue of this chargroup being the only one to
1176 // match the word on the first character, but not matching the whole word).
1177 if (wordPos > length) return -1;
1178 if (inWord[wordPos] != character) return -1;
1179 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1180 }
1181 }
1182 // If we come here we know that so far, we do match. Either we are on a terminal
1183 // and we match the length, in which case we found it, or we traverse children.
1184 // If we don't match the length AND don't have children, then a word in the
1185 // dictionary fully matches a prefix of the searched word but not the full word.
1186 ++wordPos;
1187 if (FLAG_IS_TERMINAL & flags) {
1188 if (wordPos == length) {
1189 return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
1190 }
1191 pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
1192 }
1193 if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags))
1194 return -1;
1195 // We have children and we are still shorter than the word we are searching for, so
1196 // we need to traverse children. Put the pointer on the children position, and
1197 // break
1198 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
1199 break;
1200 } else {
1201 // This chargroup does not match, so skip the remaining part and go to the next.
1202 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
1203 pos = BinaryFormat::skipOtherCharacters(root, pos);
1204 }
1205 pos = BinaryFormat::skipFrequency(flags, pos);
1206 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
1207 }
1208 --charGroupCount;
1209 }
1210 }
1211}
1212
1213bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
1214 return -1 != getFrequency(inWord, length);
1215}
1216
1217int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,
1218 unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
1219 int maxAlternatives) {
1220 // TODO: add implementation.
1221 return 0;
1222}
1223
1224// TODO: remove this function.
1225int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
1226 int length) const {
1227 return -1;
1228}
1229
1230// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
1231// If the return value is false, then the caller should read in the output "nextSiblingPosition"
1232// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
1233// It is worthy to note that when false is returned, the output values other than
1234// nextSiblingPosition are undefined.
1235// If the return value is true, then the caller must proceed to traverse the children of this
1236// node. processCurrentNode will output the information about the children: their count in
1237// newCount, their position in newChildrenPosition, the traverseAllNodes flag in
1238// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
1239// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
1240// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
1241// there aren't any more nodes at this level, it merely returns the address of the first byte after
1242// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
1243// given level, as output into newCount when traversing this level's parent.
Jean Chalard0584f022011-06-30 19:23:16 +09001244inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
1245 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
1246 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard1059f272011-06-28 20:45:05 +09001247 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001248 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
Jean Chalard432789a2011-06-30 17:50:48 +09001249 int *nextSiblingPosition, int *newOutputIndex) {
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001250 if (DEBUG_DICT) {
1251 int inputCount = 0;
1252 if (skipPos >= 0) ++inputCount;
1253 if (excessivePos >= 0) ++inputCount;
1254 if (transposedPos >= 0) ++inputCount;
1255 assert(inputCount <= 1);
1256 }
Jean Chalard0584f022011-06-30 19:23:16 +09001257 int pos = initialPos;
1258 int depth = initialDepth;
1259 int traverseAllNodes = initialTraverseAllNodes;
1260 int diffs = initialDiffs;
1261
Jean Chalard1059f272011-06-28 20:45:05 +09001262 // Flags contain the following information:
1263 // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
1264 // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
1265 // is on the specified number of bytes.
1266 // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
1267 // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
1268 // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
1269 // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
1270 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
1271 const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001272
Jean Chalard1059f272011-06-28 20:45:05 +09001273 // This gets only ONE character from the stream. Next there will be:
1274 // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
1275 // else if FLAG_IS_TERMINAL: the frequency
1276 // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
1277 // Note that you can't have a node that both is not a terminal and has no children.
1278 int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
1279 assert(NOT_A_CHARACTER != c);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001280
Jean Chalard1059f272011-06-28 20:45:05 +09001281 // We are going to loop through each character and make it look like it's a different
1282 // node each time. To do that, we will process characters in this node in order until
1283 // we find the character terminator. This is signalled by getCharCode* returning
1284 // NOT_A_CHARACTER.
1285 // As a special case, if there is only one character in this node, we must not read the
1286 // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.
1287 // This way, each loop run will look like a "virtual node".
1288 do {
1289 // We prefetch the next char. If 'c' is the last char of this node, we will have
1290 // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node
1291 // should behave as a terminal or not and whether we have children.
1292 const int32_t nextc = hasMultipleChars
1293 ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;
1294 const bool isLastChar = (NOT_A_CHARACTER == nextc);
1295 // If there are more chars in this nodes, then this virtual node is not a terminal.
1296 // If we are on the last char, this virtual node is a terminal if this node is.
1297 const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags));
1298 // If there are more chars in this node, then this virtual node has children.
1299 // If we are on the last char, this virtual node has children if this node has.
1300 const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001301
Jean Chalard1059f272011-06-28 20:45:05 +09001302 // This has to be done for each virtual char (this forwards the "inputIndex" which
1303 // is the index in the user-inputted chars, as read by getInputCharsAt.
1304 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
1305 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
1306 mWord[depth] = c;
1307 if (traverseAllNodes && isTerminal) {
1308 // The frequency should be here, because we come here only if this is actually
1309 // a terminal node, and we are on its last char.
1310 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
1311 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1312 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
1313 }
1314 if (!hasChildren) {
1315 // If we don't have children here, that means we finished processing all
1316 // characters of this node (we are on the last virtual node), AND we are in
1317 // traverseAllNodes mode, which means we are searching for *completions*. We
1318 // should skip the frequency if we have a terminal, and report the position
1319 // of the next sibling. We don't have to return other values because we are
1320 // returning false, as in "don't traverse children".
1321 if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos);
1322 *nextSiblingPosition =
1323 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1324 return false;
1325 }
1326 } else {
1327 const int *currentChars = getInputCharsAt(inputIndex);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001328
Jean Chalard1059f272011-06-28 20:45:05 +09001329 if (transposedPos >= 0) {
1330 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
1331 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
1332 }
1333
1334 const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos,
1335 excessivePos, transposedPos);
1336 if (UNRELATED_CHAR == matchedProximityCharId) {
1337 // We found that this is an unrelated character, so we should give up traversing
1338 // this node and its children entirely.
1339 // However we may not be on the last virtual node yet so we skip the remaining
1340 // characters in this node, the frequency if it's there, read the next sibling
1341 // position to output it, then return false.
1342 // We don't have to output other values because we return false, as in
1343 // "don't traverse children".
1344 if (!isLastChar) {
1345 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
1346 }
1347 pos = BinaryFormat::skipFrequency(flags, pos);
1348 *nextSiblingPosition =
1349 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1350 return false;
1351 }
1352 mWord[depth] = c;
1353 // If inputIndex is greater than mInputLength, that means there is no
1354 // proximity chars. So, we don't need to check proximity.
1355 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
1356 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
1357 }
1358 const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
1359 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
1360 if (isSameAsUserTypedLength && isTerminal) {
1361 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
1362 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1363 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
1364 }
1365 // This character matched the typed character (enough to traverse the node at least)
1366 // so we just evaluated it. Now we should evaluate this virtual node's children - that
1367 // is, if it has any. If it has no children, we're done here - so we skip the end of
1368 // the node, output the siblings position, and return false "don't traverse children".
1369 // Note that !hasChildren implies isLastChar, so we know we don't have to skip any
1370 // remaining char in this group for there can't be any.
1371 if (!hasChildren) {
1372 pos = BinaryFormat::skipFrequency(flags, pos);
1373 *nextSiblingPosition =
1374 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1375 return false;
1376 }
1377 // Start traversing all nodes after the index exceeds the user typed length
1378 traverseAllNodes = isSameAsUserTypedLength;
1379 diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
1380 // Finally, we are ready to go to the next character, the next "virtual node".
1381 // We should advance the input index.
1382 // We do this in this branch of the 'if traverseAllNodes' because we are still matching
1383 // characters to input; the other branch is not matching them but searching for
1384 // completions, this is why it does not have to do it.
1385 ++inputIndex;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001386 }
Jean Chalard1059f272011-06-28 20:45:05 +09001387 // Optimization: Prune out words that are too long compared to how much was typed.
1388 if (depth >= maxDepth || diffs > mMaxEditDistance) {
1389 // We are giving up parsing this node and its children. Skip the rest of the node,
1390 // output the sibling position, and return that we don't want to traverse children.
1391 if (!isLastChar) {
1392 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
1393 }
1394 pos = BinaryFormat::skipFrequency(flags, pos);
1395 *nextSiblingPosition =
1396 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1397 return false;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001398 }
1399
Jean Chalard1059f272011-06-28 20:45:05 +09001400 // Prepare for the next character. Promote the prefetched char to current char - the loop
1401 // will take care of prefetching the next. If we finally found our last char, nextc will
1402 // contain NOT_A_CHARACTER.
1403 c = nextc;
1404 // Also, the next char is one "virtual node" depth more than this char.
1405 ++depth;
1406 } while (NOT_A_CHARACTER != c);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001407
1408 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
Jean Chalard1059f272011-06-28 20:45:05 +09001409 // Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength.
1410 if (mInputLength <= *newInputIndex) {
1411 traverseAllNodes = true;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001412 }
Jean Chalard1059f272011-06-28 20:45:05 +09001413
1414 // All the output values that are purely computation by this function are held in local
1415 // variables. Output them to the caller.
1416 *newTraverseAllNodes = traverseAllNodes;
1417 *newMatchRate = matchWeight;
1418 *newDiffs = diffs;
1419 *newInputIndex = inputIndex;
1420 *newOutputIndex = depth;
1421
1422 // Now we finished processing this node, and we want to traverse children. If there are no
1423 // children, we can't come here.
1424 assert(BinaryFormat::hasChildrenInFlags(flags));
1425
1426 // If this node was a terminal it still has the frequency under the pointer (it may have been
1427 // read, but not skipped - see readFrequencyWithoutMovingPointer).
1428 // Next come the children position, then possibly attributes (attributes are bigrams only for
1429 // now, maybe something related to shortcuts in the future).
1430 // Once this is read, we still need to output the number of nodes in the immediate children of
1431 // this node, so we read and output it before returning true, as in "please traverse children".
1432 pos = BinaryFormat::skipFrequency(flags, pos);
1433 int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
1434 *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1435 *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
1436 *newChildrenPosition = childrenPos;
1437 return true;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001438}
1439
Jean Chalardbc90c722011-06-20 21:09:04 +09001440#endif // NEW_DICTIONARY_FORMAT
1441
satok30088252010-12-01 21:22:15 +09001442} // namespace latinime