blob: bccd37a61851144e356050d05fd13aa14f8a2af7 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090024#include "dictionary.h"
25#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090026
Jean Chalard1059f272011-06-28 20:45:05 +090027#ifdef NEW_DICTIONARY_FORMAT
28#include "binary_format.h"
29#endif // NEW_DICTIONARY_FORMAT
30
satok30088252010-12-01 21:22:15 +090031namespace latinime {
32
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090033const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
34 { { 'a', 'e' },
35 { 'o', 'e' },
36 { 'u', 'e' } };
37
Jean Chalard293ece02011-06-16 20:55:16 +090038// TODO: check the header
39UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090040 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090041 const bool isLatestDictVersion)
Jean Chalard1059f272011-06-28 20:45:05 +090042#ifndef NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +090043 : DICT_ROOT(streamStart),
Jean Chalard1059f272011-06-28 20:45:05 +090044#else // NEW_DICTIONARY_FORMAT
45 : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
46#endif // NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +090047 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090048 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
49 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalard1059f272011-06-28 20:45:05 +090050#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090051 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalard1059f272011-06-28 20:45:05 +090052#else // NEW_DICTIONARY_FORMAT
53 // TODO : remove this variable.
54 ROOT_POS(0),
55#endif // NEW_DICTIONARY_FORMAT
satok1d7eaf82011-07-13 10:32:02 +090056 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
Jean Chalarda787dba2011-03-04 12:17:48 +090057 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090058 if (DEBUG_DICT) {
59 LOGI("UnigramDictionary - constructor");
60 }
satok30088252010-12-01 21:22:15 +090061}
62
satok18c28f42010-12-02 18:11:54 +090063UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090064
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090065static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
66 const int MAX_PROXIMITY_CHARS) {
67 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
68}
69
70bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
71
72 // There can't be a digraph if we don't have at least 2 characters to examine
73 if (i + 2 > codesSize) return false;
74
75 // Search for the first char of some digraph
76 int lastDigraphIndex = -1;
77 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
78 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
79 lastDigraphIndex >= 0; --lastDigraphIndex) {
80 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
81 }
82 // No match: return early
83 if (lastDigraphIndex < 0) return false;
84
85 // It's an interesting digraph if the second char matches too.
86 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
87}
88
89// Mostly the same arguments as the non-recursive version, except:
90// codes is the original value. It points to the start of the work buffer, and gets passed as is.
91// codesSize is the size of the user input (thus, it is the size of codesSrc).
92// codesDest is the current point in the work buffer.
93// codesSrc is the current point in the user-input, original, content-unmodified buffer.
94// codesRemain is the remaining size in codesSrc.
satok1d7eaf82011-07-13 10:32:02 +090095void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090096 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
97 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080098 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090099
Jean Chalarda787dba2011-03-04 12:17:48 +0900100 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
101 for (int i = 0; i < codesRemain; ++i) {
102 if (isDigraph(codesSrc, i, codesRemain)) {
103 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900104
Jean Chalarda787dba2011-03-04 12:17:48 +0900105 // Copy the word up to the first char of the digraph, then continue processing
106 // on the remaining part of the word, skipping the second char of the digraph.
107 // In our example, copy "pru" and continue running on "fen"
108 // Make i the index of the second char of the digraph for simplicity. Forgetting
109 // to do that results in an infinite recursion so take care!
110 ++i;
111 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
112 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
113 codesBuffer, codesBufferSize, flags,
114 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
115 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
116 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900117
Jean Chalarda787dba2011-03-04 12:17:48 +0900118 // Copy the second char of the digraph in place, then continue processing on
119 // the remaining part of the word.
120 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
121 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
122 BYTES_IN_ONE_CHAR);
123 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
124 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
125 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
126 outWords, frequencies);
127 return;
128 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900129 }
130 }
131
132 // If we come here, we hit the end of the word: let's check it against the dictionary.
133 // In our example, we'll come here once for "prufen" and then once for "pruefen".
134 // If the word contains several digraphs, we'll come it for the product of them.
135 // eg. if the word is "ueberpruefen" we'll test, in order, against
136 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
137 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
138 if (0 != remainingBytes)
139 memcpy(codesDest, codesSrc, remainingBytes);
140
141 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
142 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
143}
144
satok1d7eaf82011-07-13 10:32:02 +0900145int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900146 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
147 unsigned short *outWords, int *frequencies) {
148
149 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
150 { // Incrementally tune the word and try all possibilities
151 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
152 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900153 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900154 } else { // Normal processing
155 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
156 outWords, frequencies);
157 }
158
satok817e5172011-03-04 06:06:45 -0800159 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900160 // Get the word count
161 int suggestedWordsCount = 0;
162 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
163 suggestedWordsCount++;
164 }
165
166 if (DEBUG_DICT) {
167 LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard980d6b62011-06-30 17:02:23 +0900168 /// Print the returned words
169 for (int j = 0; j < suggestedWordsCount; ++j) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700170#ifdef FLAG_DBG
Jean Chalard980d6b62011-06-30 17:02:23 +0900171 short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
172 char s[MAX_WORD_LENGTH];
173 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
174 LOGI("%s %i", s, mFrequencies[j]);
satok787945b2011-07-14 08:32:57 +0900175#endif
Jean Chalard980d6b62011-06-30 17:02:23 +0900176 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900177 LOGI("Next letters: ");
178 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
179 if (mNextLettersFrequency[k] > 0) {
180 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
181 }
182 }
183 }
satok817e5172011-03-04 06:06:45 -0800184 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900185 PROF_CLOSE;
186 return suggestedWordsCount;
187}
188
satok1d7eaf82011-07-13 10:32:02 +0900189void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900190 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
191 unsigned short *outWords, int *frequencies) {
192
satok61e2f852011-01-05 14:13:07 +0900193 PROF_OPEN;
194 PROF_START(0);
satok1d7eaf82011-07-13 10:32:02 +0900195 initSuggestions(
196 proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900197 if (DEBUG_DICT) assert(codesSize == mInputLength);
198
satoka3d78f62010-12-09 22:08:33 +0900199 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900200 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900201
satok61e2f852011-01-05 14:13:07 +0900202 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900203 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900204 PROF_END(1);
205
206 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900207 // Suggestion with missing character
208 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900209 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900210 if (DEBUG_DICT) {
211 LOGI("--- Suggest missing characters %d", i);
212 }
satok54fe9e02010-12-13 14:42:35 +0900213 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900214 }
215 }
satok61e2f852011-01-05 14:13:07 +0900216 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900217
satok61e2f852011-01-05 14:13:07 +0900218 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900219 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900220 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
221 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900222 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900223 if (DEBUG_DICT) {
224 LOGI("--- Suggest excessive characters %d", i);
225 }
satok54fe9e02010-12-13 14:42:35 +0900226 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900227 }
228 }
satok61e2f852011-01-05 14:13:07 +0900229 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900230
satok61e2f852011-01-05 14:13:07 +0900231 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900232 // Suggestion with transposed characters
233 // Only suggest words that length is mInputLength
234 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
235 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900236 if (DEBUG_DICT) {
237 LOGI("--- Suggest transposed characters %d", i);
238 }
satok54fe9e02010-12-13 14:42:35 +0900239 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900240 }
241 }
satok61e2f852011-01-05 14:13:07 +0900242 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900243
satok61e2f852011-01-05 14:13:07 +0900244 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900245 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900246 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
247 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900248 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900249 if (DEBUG_DICT) {
250 LOGI("--- Suggest missing space characters %d", i);
251 }
satok662fe692010-12-08 17:05:39 +0900252 getMissingSpaceWords(mInputLength, i);
253 }
254 }
satok61e2f852011-01-05 14:13:07 +0900255 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800256
257 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900258 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800259 // The first and last "mistyped spaces" are taken care of by excessive character handling
260 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900261 if (DEBUG_DICT) {
262 LOGI("--- Suggest words with proximity space %d", i);
263 }
satok817e5172011-03-04 06:06:45 -0800264 const int x = xcoordinates[i];
265 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900266 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800267 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
268 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900269 }
satok817e5172011-03-04 06:06:45 -0800270 if (proximityInfo->hasSpaceProximity(x, y)) {
271 getMistypedSpaceWords(mInputLength, i);
272 }
satok817e5172011-03-04 06:06:45 -0800273 }
274 }
275 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900276}
277
satok1d7eaf82011-07-13 10:32:02 +0900278void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
279 const int *ycoordinates, const int *codes, const int codesSize,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900280 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900281 if (DEBUG_DICT) {
282 LOGI("initSuggest");
283 }
satok30088252010-12-01 21:22:15 +0900284 mFrequencies = frequencies;
285 mOutputChars = outWords;
satok30088252010-12-01 21:22:15 +0900286 mInputLength = codesSize;
287 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
satok1d7eaf82011-07-13 10:32:02 +0900288 proximityInfo->setInputParams(codes, codesSize);
289 mProximityInfo = proximityInfo;
satok30088252010-12-01 21:22:15 +0900290}
291
Jean Chalard8124e642011-06-16 22:33:41 +0900292static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900293 if (c < nextLettersSize) {
294 nextLetters[c]++;
295 }
296}
297
satok662fe692010-12-08 17:05:39 +0900298// TODO: We need to optimize addWord by using STL or something
Jean Chalardca5ef282011-06-17 15:36:26 +0900299// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok28bd03b2010-12-03 16:39:16 +0900300bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900301 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900302 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700303#ifdef FLAG_DBG
satok30088252010-12-01 21:22:15 +0900304 char s[length + 1];
305 for (int i = 0; i <= length; i++) s[i] = word[i];
satok662fe692010-12-08 17:05:39 +0900306 LOGI("Found word = %s, freq = %d", s, frequency);
satok787945b2011-07-14 08:32:57 +0900307#endif
satok30088252010-12-01 21:22:15 +0900308 }
satokf5cded12010-12-06 21:28:24 +0900309 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900310 if (DEBUG_DICT) {
311 LOGI("Exceeded max word length.");
312 }
satokf5cded12010-12-06 21:28:24 +0900313 return false;
314 }
satok30088252010-12-01 21:22:15 +0900315
316 // Find the right insertion point
317 int insertAt = 0;
318 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900319 // TODO: How should we sort words with the same frequency?
320 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900321 break;
322 }
323 insertAt++;
324 }
325 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900326 if (DEBUG_DICT) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700327#ifdef FLAG_DBG
satokcdbbea72010-12-08 16:04:16 +0900328 char s[length + 1];
329 for (int i = 0; i <= length; i++) s[i] = word[i];
satokb2e5e592011-04-26 14:50:54 +0900330 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satok787945b2011-07-14 08:32:57 +0900331#endif
satokcdbbea72010-12-08 16:04:16 +0900332 }
satok30088252010-12-01 21:22:15 +0900333 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
334 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
335 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
336 mFrequencies[insertAt] = frequency;
337 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900338 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900339 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900340 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900341 while (length--) {
342 *dest++ = *word++;
343 }
344 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900345 if (DEBUG_DICT) {
346 LOGI("Added word at %d", insertAt);
347 }
satok30088252010-12-01 21:22:15 +0900348 return true;
349 }
350 return false;
351}
352
satok715514d2010-12-02 20:19:59 +0900353static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900354static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900355
satok54fe9e02010-12-13 14:42:35 +0900356void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900357 const int excessivePos, const int transposedPos, int *nextLetters,
358 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900359 if (DEBUG_DICT) {
360 LOGI("getSuggestionCandidates %d", maxDepth);
361 assert(transposedPos + 1 < mInputLength);
362 assert(excessivePos < mInputLength);
363 assert(missingPos < mInputLength);
364 }
satok662fe692010-12-08 17:05:39 +0900365 int rootPosition = ROOT_POS;
Jean Chalard980d6b62011-06-30 17:02:23 +0900366 // Get the number of children of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900367 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900368 int depth = 0;
369
370 mStackChildCount[0] = childCount;
371 mStackTraverseAll[0] = (mInputLength <= 0);
372 mStackNodeFreq[0] = 1;
373 mStackInputIndex[0] = 0;
374 mStackDiffs[0] = 0;
375 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900376 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900377
satok662fe692010-12-08 17:05:39 +0900378 // Depth first search
satokd2997922010-12-07 13:08:39 +0900379 while (depth >= 0) {
380 if (mStackChildCount[depth] > 0) {
381 --mStackChildCount[depth];
382 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900383 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900384 int inputIndex = mStackInputIndex[depth];
385 int diffs = mStackDiffs[depth];
386 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900387 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900388 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900389 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900390 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900391 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900392 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
393 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
394 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900395 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900396 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900397 mStackSiblingPos[depth] = siblingPos;
398 if (needsToTraverseChildrenNodes) {
399 // Goes to child node
400 ++depth;
401 mStackChildCount[depth] = childCount;
402 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900403 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900404 mStackInputIndex[depth] = inputIndex;
405 mStackDiffs[depth] = diffs;
406 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900407 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900408 }
409 } else {
satokcdbbea72010-12-08 16:04:16 +0900410 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900411 --depth;
412 }
413 }
414}
415
satokb2e5e592011-04-26 14:50:54 +0900416static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
417static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
418 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
419}
420
421static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
422inline static void multiplyIntCapped(const int multiplier, int *base) {
423 const int temp = *base;
424 if (temp != S_INT_MAX) {
425 // Branch if multiplier == 2 for the optimization
426 if (multiplier == 2) {
427 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
428 } else {
429 const int tempRetval = temp * multiplier;
430 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
431 }
432 }
433}
434
435inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900436 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900437 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900438 } else {
satokb2e5e592011-04-26 14:50:54 +0900439 int ret = base;
440 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
441 return ret;
442 }
443}
444
445inline static void multiplyRate(const int rate, int *freq) {
446 if (*freq != S_INT_MAX) {
447 if (*freq > 1000000) {
448 *freq /= 100;
449 multiplyIntCapped(rate, freq);
450 } else {
451 multiplyIntCapped(rate, freq);
452 *freq /= 100;
453 }
satokf7425bb2011-01-05 16:37:53 +0900454 }
455}
456
satok4c981d32011-04-19 13:58:42 +0900457inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900458 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
459 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900460 if (firstWordLength == 0 || secondWordLength == 0) {
461 return 0;
462 }
463 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
464 int tempFirstFreq = firstFreq;
465 multiplyRate(firstDemotionRate, &tempFirstFreq);
466
467 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
468 int tempSecondFreq = secondFreq;
469 multiplyRate(secondDemotionRate, &tempSecondFreq);
470
471 const int totalLength = firstWordLength + secondWordLength;
472
473 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
474 // length.
475 int totalFreq = tempFirstFreq + tempSecondFreq;
476
477 // This is a workaround to try offsetting the not-enough-demotion which will be done in
478 // calcNormalizedScore in Utils.java.
479 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
480 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
481 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
482 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
483 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
484
485 // At this moment, totalFreq is calculated by the following formula:
486 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
487 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
488
satokb2e5e592011-04-26 14:50:54 +0900489 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900490
491 // This is another workaround to offset the demotion which will be done in
492 // calcNormalizedScore in Utils.java.
493 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
494 // the same amount because we already have adjusted the synthetic freq of this "missing or
495 // mistyped space" suggestion candidate above in this method.
496 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
497 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
498
satokd8db9f82011-05-18 15:31:04 +0900499 if (isSpaceProximity) {
500 // A word pair with one space proximity correction
501 if (DEBUG_DICT) {
502 LOGI("Found a word pair with space proximity correction.");
503 }
504 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
505 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
506 }
507
satok4c981d32011-04-19 13:58:42 +0900508 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
509 return totalFreq;
510}
511
satok817e5172011-03-04 06:06:45 -0800512bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
513 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900514 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800515}
516
517bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
518 return getSplitTwoWordsSuggestion(
519 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900520 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800521}
522
satok58c49b92011-01-27 03:23:39 +0900523inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900524 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900525 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900526 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900527 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900528 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900529 if (mInputLength >= 2) {
530 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
531 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
532 / (10 * mInputLength
533 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900534 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900535 LOGI("Demotion rate for missing character is %d.", demotionRate);
536 }
satokdc5301e2011-04-11 16:14:45 +0900537 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900538 } else {
539 finalFreq = 0;
540 }
541 }
satokf7425bb2011-01-05 16:37:53 +0900542 if (transposedPos >= 0) multiplyRate(
543 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900544 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900545 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satokd24df432011-07-14 15:43:42 +0900546 if (!mProximityInfo->existsAdjacentProximityChars(inputIndex)) {
satok1d7eaf82011-07-13 10:32:02 +0900547 // If an excessive character is not adjacent to the left char or the right char,
548 // we will demote this word.
satokf7425bb2011-01-05 16:37:53 +0900549 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900550 }
551 }
satok58c49b92011-01-27 03:23:39 +0900552 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900553 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900554 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900555 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900556 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900557 if (DEBUG_DICT) {
558 LOGI("Found full matched word.");
559 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900560 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
561 }
562 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900563 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900564 }
satok9674f652011-04-20 17:15:27 +0900565 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900566 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900567 if (DEBUG_DICT) {
568 LOGI("Found one proximity correction.");
569 }
satokb2e5e592011-04-26 14:50:54 +0900570 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900571 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900572 }
satok9674f652011-04-20 17:15:27 +0900573 if (DEBUG_DICT) {
574 LOGI("calc: %d, %d", depth, sameLength);
575 }
satokb2e5e592011-04-26 14:50:54 +0900576 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900577 return finalFreq;
578}
satoka3d78f62010-12-09 22:08:33 +0900579
satok28bd03b2010-12-03 16:39:16 +0900580inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900581 const int inputIndex, const int skipPos, const int depth) {
satokd24df432011-07-14 15:43:42 +0900582 const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex);
satok28bd03b2010-12-03 16:39:16 +0900583 // Skip the ' or other letter and continue deeper
584 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
585}
586
satok28bd03b2010-12-03 16:39:16 +0900587
Jean Chalardca5ef282011-06-17 15:36:26 +0900588inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900589 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900590 const int inputIndex, const int matchWeight, const int skipPos,
591 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
592 int* nextLetters, const int nextLettersSize) {
593
satok1d7eaf82011-07-13 10:32:02 +0900594 const bool isSameAsTyped = sameLength ? mProximityInfo->sameAsTyped(word, depth + 1) : false;
Jean Chalard980d6b62011-06-30 17:02:23 +0900595 if (isSameAsTyped) return;
Jean Chalardca5ef282011-06-17 15:36:26 +0900596
597 if (depth >= MIN_SUGGEST_DEPTH) {
598 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
599 excessivePos, transposedPos, freq, sameLength);
600 if (!isSameAsTyped)
601 addWord(word, depth + 1, finalFreq);
Jean Chalardca5ef282011-06-17 15:36:26 +0900602 }
603
604 if (sameLength && depth >= mInputLength && skipPos < 0) {
605 registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);
606 }
607}
608
Jean Chalarde6715e32011-06-30 19:47:25 +0900609bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
610 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
611 const int secondWordLength, const bool isSpaceProximity) {
612 if (inputLength >= MAX_WORD_LENGTH) return false;
613 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
614 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
615 return false;
616 const int newWordLength = firstWordLength + secondWordLength + 1;
617 // Allocating variable length array on stack
618 unsigned short word[newWordLength];
619 const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
620 if (DEBUG_DICT) {
621 LOGI("First freq: %d", firstFreq);
622 }
623 if (firstFreq <= 0) return false;
624
625 for (int i = 0; i < firstWordLength; ++i) {
626 word[i] = mWord[i];
627 }
628
629 const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
630 if (DEBUG_DICT) {
631 LOGI("Second freq: %d", secondFreq);
632 }
633 if (secondFreq <= 0) return false;
634
635 word[firstWordLength] = SPACE;
636 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
637 word[i] = mWord[i - firstWordLength - 1];
638 }
639
640 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
641 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
642 if (DEBUG_DICT) {
643 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
644 TYPED_LETTER_MULTIPLIER);
645 }
646 addWord(word, newWordLength, pairFreq);
647 return true;
648}
649
Jean Chalardbc90c722011-06-20 21:09:04 +0900650#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardbc90c722011-06-20 21:09:04 +0900651// The following functions will be entirely replaced with new implementations.
652void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
653 const int excessivePos, const int transposedPos,int *nextLetters,
654 const int nextLettersSize) {
655 int initialPosition = initialPos;
656 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
657 getWordsRec(count, initialPosition, 0,
658 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
659 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
660 nextLettersSize);
661}
Jean Chalardca5ef282011-06-17 15:36:26 +0900662
Jean Chalardbc90c722011-06-20 21:09:04 +0900663void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
664 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
665 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
666 const int transposedPos, int *nextLetters, const int nextLettersSize) {
667 int siblingPos = pos;
668 for (int i = 0; i < childrenCount; ++i) {
669 int newCount;
670 int newChildPosition;
671 bool newTraverseAllNodes;
672 int newMatchRate;
673 int newInputIndex;
674 int newDiffs;
675 int newSiblingPos;
676 int newOutputIndex;
677 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
678 traverseAllNodes, matchWeight, inputIndex, diffs,
679 skipPos, excessivePos, transposedPos,
680 nextLetters, nextLettersSize,
681 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
682 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
683 siblingPos = newSiblingPos;
satokcdbbea72010-12-08 16:04:16 +0900684
Jean Chalardbc90c722011-06-20 21:09:04 +0900685 if (needsToTraverseChildrenNodes) {
686 getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
687 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
688 nextLetters, nextLettersSize);
satok48e432c2010-12-06 17:38:58 +0900689 }
satok48e432c2010-12-06 17:38:58 +0900690 }
satok48e432c2010-12-06 17:38:58 +0900691}
692
Jean Chalard980d6b62011-06-30 17:02:23 +0900693inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
694 const int inputLength, unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900695 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900696 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900697 int maxFreq = 0;
698 int depth = 0;
699 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900700 bool terminal = false;
701
satokaee09dc2010-12-09 19:21:51 +0900702 mStackChildCount[0] = count;
703 mStackSiblingPos[0] = pos;
704
705 while (depth >= 0) {
706 if (mStackChildCount[depth] > 0) {
707 --mStackChildCount[depth];
708 int firstChildPos;
709 int newFreq;
710 int siblingPos = mStackSiblingPos[depth];
711 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
712 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
713 &siblingPos);
714 mStackSiblingPos[depth] = siblingPos;
715 if (depth == (inputLength - 1)) {
716 // Traverse sibling node
717 if (terminal) {
718 if (newFreq > maxFreq) {
719 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
720 if (DEBUG_DICT && DEBUG_NODE) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700721#ifdef FLAG_DBG
satokaee09dc2010-12-09 19:21:51 +0900722 char s[inputLength + 1];
723 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
724 s[inputLength] = 0;
725 LOGI("New missing space word found: %d > %d (%s), %d, %d",
726 newFreq, maxFreq, s, inputLength, depth);
satok787945b2011-07-14 08:32:57 +0900727#endif
satokaee09dc2010-12-09 19:21:51 +0900728 }
729 maxFreq = newFreq;
730 }
731 }
732 } else if (needsToTraverseChildrenNodes) {
733 // Traverse children nodes
734 ++depth;
735 mStackChildCount[depth] = count;
736 mStackSiblingPos[depth] = firstChildPos;
737 }
738 } else {
739 // Traverse parent node
740 --depth;
satok662fe692010-12-08 17:05:39 +0900741 }
742 }
satokaee09dc2010-12-09 19:21:51 +0900743
744 word[inputLength] = 0;
745 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900746}
747
748inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900749 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
750 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
751 const int inputIndex = startInputIndex + depth;
satok662fe692010-12-08 17:05:39 +0900752 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900753 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
754 &c, newChildPosition, newTerminal, newFreq);
satokd24df432011-07-14 15:43:42 +0900755 const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex);
Ken Wakasade3070a2011-03-19 09:16:42 +0900756 if (DEBUG_DICT) {
757 assert(inputC <= U_SHORT_MAX);
758 }
satokd24df432011-07-14 15:43:42 +0900759 const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900760 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900761 const bool hasChild = *newChildPosition != 0;
762 if (matched) {
763 word[depth] = c;
764 if (DEBUG_DICT && DEBUG_NODE) {
765 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900766 if (*newTerminal) {
767 LOGI("Terminal %d", *newFreq);
768 }
satok662fe692010-12-08 17:05:39 +0900769 }
satokaee09dc2010-12-09 19:21:51 +0900770 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900771 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900772 return true;
773 } else {
774 return false;
775 }
776 } else {
777 // If this node is not user typed character, this method treats this word as unmatched.
778 // Thus newTerminal shouldn't be true.
779 *newTerminal = false;
780 return false;
satok662fe692010-12-08 17:05:39 +0900781 }
satok662fe692010-12-08 17:05:39 +0900782}
Jean Chalard8124e642011-06-16 22:33:41 +0900783
784// TODO: use uint32_t instead of unsigned short
785bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
786 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900787 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900788 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900789 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900790 }
791}
792
Jean Chalard17e44a72011-06-16 22:51:11 +0900793
794// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900795int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
796 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900797 // returns address of bigram data of that word
798 // return -99 if not found
799
800 int count = Dictionary::getCount(DICT_ROOT, &pos);
801 unsigned short currentChar = (unsigned short) word[offset];
802 for (int j = 0; j < count; j++) {
803 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
804 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
805 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
806 if (c == currentChar) {
807 if (offset == length - 1) {
808 if (terminal) {
809 return (pos+1);
810 }
811 } else {
812 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900813 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900814 if (t > 0) {
815 return t;
816 }
817 }
818 }
819 }
820 if (terminal) {
821 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
822 }
823 // There could be two instances of each alphabet - upper and lower case. So continue
824 // looking ...
825 }
826 return NOT_VALID_WORD;
827}
828
Jean Chalardbc90c722011-06-20 21:09:04 +0900829// The following functions will be modified.
Jean Chalard0584f022011-06-30 19:23:16 +0900830inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
831 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
832 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalardbc90c722011-06-20 21:09:04 +0900833 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
834 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
835 int *nextSiblingPosition, int *nextOutputIndex) {
836 if (DEBUG_DICT) {
837 int inputCount = 0;
838 if (skipPos >= 0) ++inputCount;
839 if (excessivePos >= 0) ++inputCount;
840 if (transposedPos >= 0) ++inputCount;
841 assert(inputCount <= 1);
842 }
843 unsigned short c;
844 int childPosition;
845 bool terminal;
846 int freq;
847 bool isSameAsUserTypedLength = false;
848
Jean Chalard0584f022011-06-30 19:23:16 +0900849 const int pos = initialPos;
850 const int depth = initialDepth;
851 const int traverseAllNodes = initialTraverseAllNodes;
852 const int diffs = initialDiffs;
853
Jean Chalardbc90c722011-06-20 21:09:04 +0900854 const uint8_t flags = 0; // No flags for now
855
856 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
857
858 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
859 &c, &childPosition, &terminal, &freq);
860 *nextOutputIndex = depth + 1;
861
862 const bool needsToTraverseChildrenNodes = childPosition != 0;
863
864 // If we are only doing traverseAllNodes, no need to look at the typed characters.
865 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
866 mWord[depth] = c;
867 if (traverseAllNodes && terminal) {
868 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
869 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
870 }
871 if (!needsToTraverseChildrenNodes) return false;
872 *newTraverseAllNodes = traverseAllNodes;
873 *newMatchRate = matchWeight;
874 *newDiffs = diffs;
875 *newInputIndex = inputIndex;
876 } else {
satokd24df432011-07-14 15:43:42 +0900877 int inputIndexForProximity = inputIndex;
Jean Chalardbc90c722011-06-20 21:09:04 +0900878
879 if (transposedPos >= 0) {
satokd24df432011-07-14 15:43:42 +0900880 if (inputIndex == transposedPos) ++inputIndexForProximity;
881 if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;
Jean Chalardbc90c722011-06-20 21:09:04 +0900882 }
883
satokd24df432011-07-14 15:43:42 +0900884 ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId(
885 inputIndexForProximity, c, skipPos, excessivePos, transposedPos);
886 if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false;
Jean Chalardbc90c722011-06-20 21:09:04 +0900887 mWord[depth] = c;
888 // If inputIndex is greater than mInputLength, that means there is no
889 // proximity chars. So, we don't need to check proximity.
satokd24df432011-07-14 15:43:42 +0900890 if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
Jean Chalardbc90c722011-06-20 21:09:04 +0900891 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
892 }
893 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
894 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
895 if (isSameAsUserTypedLength && terminal) {
896 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
897 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
898 }
899 if (!needsToTraverseChildrenNodes) return false;
900 // Start traversing all nodes after the index exceeds the user typed length
901 *newTraverseAllNodes = isSameAsUserTypedLength;
902 *newMatchRate = matchWeight;
satokd24df432011-07-14 15:43:42 +0900903 *newDiffs = diffs
904 + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
Jean Chalardbc90c722011-06-20 21:09:04 +0900905 *newInputIndex = inputIndex + 1;
906 }
907 // Optimization: Prune out words that are too long compared to how much was typed.
908 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
909 return false;
910 }
911
912 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
913 // TODO: Check if this can be isSameAsUserTypedLength only.
914 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
915 *newTraverseAllNodes = true;
916 }
917 // get the count of nodes and increment childAddress.
918 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
919 *newChildPosition = childPosition;
920 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
921 return needsToTraverseChildrenNodes;
922}
923
924#else // NEW_DICTIONARY_FORMAT
Jean Chalard85a1d1e2011-06-21 22:23:21 +0900925
Jean Chalard1059f272011-06-28 20:45:05 +0900926// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
927// interface.
928inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
929 const int inputLength, unsigned short *word) {
930 uint16_t inWord[inputLength];
931
932 for (int i = 0; i < inputLength; ++i) {
satokd24df432011-07-14 15:43:42 +0900933 inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i);
Jean Chalard1059f272011-06-28 20:45:05 +0900934 }
935 return getMostFrequentWordLikeInner(inWord, inputLength, word);
936}
937
938// This function will take the position of a character array within a CharGroup,
939// and check it actually like-matches the word in inWord starting at startInputIndex,
940// that is, it matches it with case and accents squashed.
941// The function returns true if there was a full match, false otherwise.
942// The function will copy on-the-fly the characters in the CharGroup to outNewWord.
943// It will also place the end position of the array in outPos; in outInputIndex,
944// it will place the index of the first char AFTER the match if there was a match,
945// and the initial position if there was not. It makes sense because if there was
946// a match we want to continue searching, but if there was not, we want to go to
947// the next CharGroup.
948// In and out parameters may point to the same location. This function takes care
949// not to use any input parameters after it wrote into its outputs.
950static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
951 const uint8_t* const root, const int startPos,
952 const uint16_t* const inWord, const int startInputIndex,
953 int32_t* outNewWord, int* outInputIndex, int* outPos) {
954 const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
955 int pos = startPos;
956 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
satokd24df432011-07-14 15:43:42 +0900957 int32_t baseChar = Dictionary::toBaseLowerCase(character);
958 const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]);
Jean Chalard1059f272011-06-28 20:45:05 +0900959
960 if (baseChar != wChar) {
961 *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
962 *outInputIndex = startInputIndex;
963 return false;
964 }
965 int inputIndex = startInputIndex;
966 outNewWord[inputIndex] = character;
967 if (hasMultipleChars) {
968 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
969 while (NOT_A_CHARACTER != character) {
satokd24df432011-07-14 15:43:42 +0900970 baseChar = Dictionary::toBaseLowerCase(character);
971 if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
Jean Chalard1059f272011-06-28 20:45:05 +0900972 *outPos = BinaryFormat::skipOtherCharacters(root, pos);
973 *outInputIndex = startInputIndex;
974 return false;
975 }
976 outNewWord[inputIndex] = character;
977 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
978 }
979 }
980 *outInputIndex = inputIndex + 1;
981 *outPos = pos;
982 return true;
983}
984
985// This function is invoked when a word like the word searched for is found.
986// It will compare the frequency to the max frequency, and if greater, will
987// copy the word into the output buffer. In output value maxFreq, it will
988// write the new maximum frequency if it changed.
989static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,
990 short unsigned int* outWord, int* maxFreq) {
991 if (freq > *maxFreq) {
992 for (int q = 0; q < length; ++q)
993 outWord[q] = newWord[q];
994 outWord[length] = 0;
995 *maxFreq = freq;
996 }
997}
998
999// Will find the highest frequency of the words like the one passed as an argument,
1000// that is, everything that only differs by case/accents.
1001int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,
1002 const int length, short unsigned int* outWord) {
1003 int32_t newWord[MAX_WORD_LENGTH_INTERNAL];
1004 int depth = 0;
1005 int maxFreq = -1;
1006 const uint8_t* const root = DICT_ROOT;
1007
1008 mStackChildCount[0] = root[0];
1009 mStackInputIndex[0] = 0;
1010 mStackSiblingPos[0] = 1;
1011 while (depth >= 0) {
1012 const int charGroupCount = mStackChildCount[depth];
1013 int pos = mStackSiblingPos[depth];
1014 for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
1015 int inputIndex = mStackInputIndex[depth];
1016 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
1017 // Test whether all chars in this group match with the word we are searching for. If so,
1018 // we want to traverse its children (or if the length match, evaluate its frequency).
1019 // Note that this function will output the position regardless, but will only write
1020 // into inputIndex if there is a match.
1021 const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
1022 inputIndex, newWord, &inputIndex, &pos);
1023 if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
1024 const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
1025 onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
1026 }
1027 pos = BinaryFormat::skipFrequency(flags, pos);
1028 const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
1029 const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
1030 // If we had a match and the word has children, we want to traverse them. We don't have
1031 // to traverse words longer than the one we are searching for, since they will not match
1032 // anyway, so don't traverse unless inputIndex < length.
1033 if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {
1034 // Save position for this depth, to get back to this once children are done
1035 mStackChildCount[depth] = charGroupIndex;
1036 mStackSiblingPos[depth] = siblingPos;
1037 // Prepare stack values for next depth
1038 ++depth;
1039 int childrenPos = childrenNodePos;
1040 mStackChildCount[depth] =
1041 BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
1042 mStackSiblingPos[depth] = childrenPos;
1043 mStackInputIndex[depth] = inputIndex;
1044 pos = childrenPos;
1045 // Go to the next depth level.
1046 ++depth;
1047 break;
1048 } else {
1049 // No match, or no children, or word too long to ever match: go the next sibling.
1050 pos = siblingPos;
1051 }
1052 }
1053 --depth;
1054 }
1055 return maxFreq;
1056}
1057
Jean Chalard1059f272011-06-28 20:45:05 +09001058bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
Jean Chalard6a0e9642011-07-25 18:17:11 +09001059 return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
Jean Chalard1059f272011-06-28 20:45:05 +09001060}
1061
1062// TODO: remove this function.
1063int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
1064 int length) const {
1065 return -1;
1066}
1067
1068// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
1069// If the return value is false, then the caller should read in the output "nextSiblingPosition"
1070// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
1071// It is worthy to note that when false is returned, the output values other than
1072// nextSiblingPosition are undefined.
1073// If the return value is true, then the caller must proceed to traverse the children of this
1074// node. processCurrentNode will output the information about the children: their count in
1075// newCount, their position in newChildrenPosition, the traverseAllNodes flag in
1076// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
1077// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
1078// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
1079// there aren't any more nodes at this level, it merely returns the address of the first byte after
1080// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
1081// given level, as output into newCount when traversing this level's parent.
Jean Chalard0584f022011-06-30 19:23:16 +09001082inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
1083 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
1084 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard1059f272011-06-28 20:45:05 +09001085 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001086 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
Jean Chalard432789a2011-06-30 17:50:48 +09001087 int *nextSiblingPosition, int *newOutputIndex) {
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001088 if (DEBUG_DICT) {
1089 int inputCount = 0;
1090 if (skipPos >= 0) ++inputCount;
1091 if (excessivePos >= 0) ++inputCount;
1092 if (transposedPos >= 0) ++inputCount;
1093 assert(inputCount <= 1);
1094 }
Jean Chalard0584f022011-06-30 19:23:16 +09001095 int pos = initialPos;
1096 int depth = initialDepth;
1097 int traverseAllNodes = initialTraverseAllNodes;
1098 int diffs = initialDiffs;
1099
Jean Chalard1059f272011-06-28 20:45:05 +09001100 // Flags contain the following information:
1101 // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
1102 // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
1103 // is on the specified number of bytes.
1104 // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
1105 // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
1106 // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
1107 // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
1108 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
1109 const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001110
Jean Chalard1059f272011-06-28 20:45:05 +09001111 // This gets only ONE character from the stream. Next there will be:
1112 // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
1113 // else if FLAG_IS_TERMINAL: the frequency
1114 // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
1115 // Note that you can't have a node that both is not a terminal and has no children.
1116 int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
1117 assert(NOT_A_CHARACTER != c);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001118
Jean Chalard1059f272011-06-28 20:45:05 +09001119 // We are going to loop through each character and make it look like it's a different
1120 // node each time. To do that, we will process characters in this node in order until
1121 // we find the character terminator. This is signalled by getCharCode* returning
1122 // NOT_A_CHARACTER.
1123 // As a special case, if there is only one character in this node, we must not read the
1124 // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.
1125 // This way, each loop run will look like a "virtual node".
1126 do {
1127 // We prefetch the next char. If 'c' is the last char of this node, we will have
1128 // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node
1129 // should behave as a terminal or not and whether we have children.
1130 const int32_t nextc = hasMultipleChars
1131 ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;
1132 const bool isLastChar = (NOT_A_CHARACTER == nextc);
1133 // If there are more chars in this nodes, then this virtual node is not a terminal.
1134 // If we are on the last char, this virtual node is a terminal if this node is.
1135 const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags));
1136 // If there are more chars in this node, then this virtual node has children.
1137 // If we are on the last char, this virtual node has children if this node has.
1138 const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001139
Jean Chalard1059f272011-06-28 20:45:05 +09001140 // This has to be done for each virtual char (this forwards the "inputIndex" which
satokd24df432011-07-14 15:43:42 +09001141 // is the index in the user-inputted chars, as read by proximity chars.
Jean Chalard1059f272011-06-28 20:45:05 +09001142 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
1143 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
1144 mWord[depth] = c;
1145 if (traverseAllNodes && isTerminal) {
1146 // The frequency should be here, because we come here only if this is actually
1147 // a terminal node, and we are on its last char.
1148 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
1149 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1150 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
1151 }
1152 if (!hasChildren) {
1153 // If we don't have children here, that means we finished processing all
1154 // characters of this node (we are on the last virtual node), AND we are in
1155 // traverseAllNodes mode, which means we are searching for *completions*. We
1156 // should skip the frequency if we have a terminal, and report the position
1157 // of the next sibling. We don't have to return other values because we are
1158 // returning false, as in "don't traverse children".
1159 if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos);
1160 *nextSiblingPosition =
1161 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1162 return false;
1163 }
1164 } else {
satokd24df432011-07-14 15:43:42 +09001165 int inputIndexForProximity = inputIndex;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001166
Jean Chalard1059f272011-06-28 20:45:05 +09001167 if (transposedPos >= 0) {
satokd24df432011-07-14 15:43:42 +09001168 if (inputIndex == transposedPos) ++inputIndexForProximity;
1169 if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;
Jean Chalard1059f272011-06-28 20:45:05 +09001170 }
1171
satokd24df432011-07-14 15:43:42 +09001172 int matchedProximityCharId = mProximityInfo->getMatchedProximityId(
1173 inputIndexForProximity, c, skipPos, excessivePos, transposedPos);
1174 if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
Jean Chalard1059f272011-06-28 20:45:05 +09001175 // We found that this is an unrelated character, so we should give up traversing
1176 // this node and its children entirely.
1177 // However we may not be on the last virtual node yet so we skip the remaining
1178 // characters in this node, the frequency if it's there, read the next sibling
1179 // position to output it, then return false.
1180 // We don't have to output other values because we return false, as in
1181 // "don't traverse children".
1182 if (!isLastChar) {
1183 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
1184 }
1185 pos = BinaryFormat::skipFrequency(flags, pos);
1186 *nextSiblingPosition =
1187 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1188 return false;
1189 }
1190 mWord[depth] = c;
1191 // If inputIndex is greater than mInputLength, that means there is no
1192 // proximity chars. So, we don't need to check proximity.
satokd24df432011-07-14 15:43:42 +09001193 if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
Jean Chalard1059f272011-06-28 20:45:05 +09001194 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
1195 }
1196 const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
1197 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
1198 if (isSameAsUserTypedLength && isTerminal) {
1199 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
1200 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1201 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
1202 }
1203 // This character matched the typed character (enough to traverse the node at least)
1204 // so we just evaluated it. Now we should evaluate this virtual node's children - that
1205 // is, if it has any. If it has no children, we're done here - so we skip the end of
1206 // the node, output the siblings position, and return false "don't traverse children".
1207 // Note that !hasChildren implies isLastChar, so we know we don't have to skip any
1208 // remaining char in this group for there can't be any.
1209 if (!hasChildren) {
1210 pos = BinaryFormat::skipFrequency(flags, pos);
1211 *nextSiblingPosition =
1212 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1213 return false;
1214 }
1215 // Start traversing all nodes after the index exceeds the user typed length
1216 traverseAllNodes = isSameAsUserTypedLength;
satokd24df432011-07-14 15:43:42 +09001217 diffs = diffs
1218 + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
Jean Chalard1059f272011-06-28 20:45:05 +09001219 // Finally, we are ready to go to the next character, the next "virtual node".
1220 // We should advance the input index.
1221 // We do this in this branch of the 'if traverseAllNodes' because we are still matching
1222 // characters to input; the other branch is not matching them but searching for
1223 // completions, this is why it does not have to do it.
1224 ++inputIndex;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001225 }
Jean Chalard1059f272011-06-28 20:45:05 +09001226 // Optimization: Prune out words that are too long compared to how much was typed.
1227 if (depth >= maxDepth || diffs > mMaxEditDistance) {
1228 // We are giving up parsing this node and its children. Skip the rest of the node,
1229 // output the sibling position, and return that we don't want to traverse children.
1230 if (!isLastChar) {
1231 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
1232 }
1233 pos = BinaryFormat::skipFrequency(flags, pos);
1234 *nextSiblingPosition =
1235 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1236 return false;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001237 }
1238
Jean Chalard1059f272011-06-28 20:45:05 +09001239 // Prepare for the next character. Promote the prefetched char to current char - the loop
1240 // will take care of prefetching the next. If we finally found our last char, nextc will
1241 // contain NOT_A_CHARACTER.
1242 c = nextc;
1243 // Also, the next char is one "virtual node" depth more than this char.
1244 ++depth;
1245 } while (NOT_A_CHARACTER != c);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001246
1247 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
Jean Chalard1059f272011-06-28 20:45:05 +09001248 // Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength.
1249 if (mInputLength <= *newInputIndex) {
1250 traverseAllNodes = true;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001251 }
Jean Chalard1059f272011-06-28 20:45:05 +09001252
1253 // All the output values that are purely computation by this function are held in local
1254 // variables. Output them to the caller.
1255 *newTraverseAllNodes = traverseAllNodes;
1256 *newMatchRate = matchWeight;
1257 *newDiffs = diffs;
1258 *newInputIndex = inputIndex;
1259 *newOutputIndex = depth;
1260
1261 // Now we finished processing this node, and we want to traverse children. If there are no
1262 // children, we can't come here.
1263 assert(BinaryFormat::hasChildrenInFlags(flags));
1264
1265 // If this node was a terminal it still has the frequency under the pointer (it may have been
1266 // read, but not skipped - see readFrequencyWithoutMovingPointer).
1267 // Next come the children position, then possibly attributes (attributes are bigrams only for
1268 // now, maybe something related to shortcuts in the future).
1269 // Once this is read, we still need to output the number of nodes in the immediate children of
1270 // this node, so we read and output it before returning true, as in "please traverse children".
1271 pos = BinaryFormat::skipFrequency(flags, pos);
1272 int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
1273 *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1274 *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
1275 *newChildrenPosition = childrenPos;
1276 return true;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001277}
1278
Jean Chalardbc90c722011-06-20 21:09:04 +09001279#endif // NEW_DICTIONARY_FORMAT
1280
satok30088252010-12-01 21:22:15 +09001281} // namespace latinime