blob: 36233b74141a581756ac2348a0889a740fdc35f0 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "basechars.h"
24#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090025#include "dictionary.h"
26#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090027
28namespace latinime {
29
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090030const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
31 { { 'a', 'e' },
32 { 'o', 'e' },
33 { 'u', 'e' } };
34
Jean Chalard293ece02011-06-16 20:55:16 +090035// TODO: check the header
36UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090037 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090038 const bool isLatestDictVersion)
Jean Chalard293ece02011-06-16 20:55:16 +090039 : DICT_ROOT(streamStart),
40 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090041 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
42 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090043 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalarda787dba2011-03-04 12:17:48 +090044 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
45 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090046 if (DEBUG_DICT) {
47 LOGI("UnigramDictionary - constructor");
48 }
satok30088252010-12-01 21:22:15 +090049}
50
satok18c28f42010-12-02 18:11:54 +090051UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090052
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090053static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
54 const int MAX_PROXIMITY_CHARS) {
55 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
56}
57
58bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
59
60 // There can't be a digraph if we don't have at least 2 characters to examine
61 if (i + 2 > codesSize) return false;
62
63 // Search for the first char of some digraph
64 int lastDigraphIndex = -1;
65 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
66 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
67 lastDigraphIndex >= 0; --lastDigraphIndex) {
68 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
69 }
70 // No match: return early
71 if (lastDigraphIndex < 0) return false;
72
73 // It's an interesting digraph if the second char matches too.
74 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
75}
76
77// Mostly the same arguments as the non-recursive version, except:
78// codes is the original value. It points to the start of the work buffer, and gets passed as is.
79// codesSize is the size of the user input (thus, it is the size of codesSrc).
80// codesDest is the current point in the work buffer.
81// codesSrc is the current point in the user-input, original, content-unmodified buffer.
82// codesRemain is the remaining size in codesSrc.
83void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
84 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
85 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080086 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090087
Jean Chalarda787dba2011-03-04 12:17:48 +090088 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
89 for (int i = 0; i < codesRemain; ++i) {
90 if (isDigraph(codesSrc, i, codesRemain)) {
91 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090092
Jean Chalarda787dba2011-03-04 12:17:48 +090093 // Copy the word up to the first char of the digraph, then continue processing
94 // on the remaining part of the word, skipping the second char of the digraph.
95 // In our example, copy "pru" and continue running on "fen"
96 // Make i the index of the second char of the digraph for simplicity. Forgetting
97 // to do that results in an infinite recursion so take care!
98 ++i;
99 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
100 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
101 codesBuffer, codesBufferSize, flags,
102 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
103 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
104 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900105
Jean Chalarda787dba2011-03-04 12:17:48 +0900106 // Copy the second char of the digraph in place, then continue processing on
107 // the remaining part of the word.
108 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
109 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
110 BYTES_IN_ONE_CHAR);
111 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
112 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
113 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
114 outWords, frequencies);
115 return;
116 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900117 }
118 }
119
120 // If we come here, we hit the end of the word: let's check it against the dictionary.
121 // In our example, we'll come here once for "prufen" and then once for "pruefen".
122 // If the word contains several digraphs, we'll come it for the product of them.
123 // eg. if the word is "ueberpruefen" we'll test, in order, against
124 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
125 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
126 if (0 != remainingBytes)
127 memcpy(codesDest, codesSrc, remainingBytes);
128
129 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
130 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
131}
132
133int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
134 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
135 unsigned short *outWords, int *frequencies) {
136
137 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
138 { // Incrementally tune the word and try all possibilities
139 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
140 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900141 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900142 } else { // Normal processing
143 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
144 outWords, frequencies);
145 }
146
satok817e5172011-03-04 06:06:45 -0800147 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900148 // Get the word count
149 int suggestedWordsCount = 0;
150 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
151 suggestedWordsCount++;
152 }
153
154 if (DEBUG_DICT) {
155 LOGI("Returning %d words", suggestedWordsCount);
156 LOGI("Next letters: ");
157 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
158 if (mNextLettersFrequency[k] > 0) {
159 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
160 }
161 }
162 }
satok817e5172011-03-04 06:06:45 -0800163 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900164 PROF_CLOSE;
165 return suggestedWordsCount;
166}
167
168void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
169 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
170 unsigned short *outWords, int *frequencies) {
171
satok61e2f852011-01-05 14:13:07 +0900172 PROF_OPEN;
173 PROF_START(0);
satok30088252010-12-01 21:22:15 +0900174 initSuggestions(codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900175 if (DEBUG_DICT) assert(codesSize == mInputLength);
176
satoka3d78f62010-12-09 22:08:33 +0900177 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900178 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900179
satok61e2f852011-01-05 14:13:07 +0900180 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900181 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900182 PROF_END(1);
183
184 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900185 // Suggestion with missing character
186 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900187 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900188 if (DEBUG_DICT) {
189 LOGI("--- Suggest missing characters %d", i);
190 }
satok54fe9e02010-12-13 14:42:35 +0900191 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900192 }
193 }
satok61e2f852011-01-05 14:13:07 +0900194 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900195
satok61e2f852011-01-05 14:13:07 +0900196 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900197 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900198 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
199 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900200 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900201 if (DEBUG_DICT) {
202 LOGI("--- Suggest excessive characters %d", i);
203 }
satok54fe9e02010-12-13 14:42:35 +0900204 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900205 }
206 }
satok61e2f852011-01-05 14:13:07 +0900207 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900208
satok61e2f852011-01-05 14:13:07 +0900209 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900210 // Suggestion with transposed characters
211 // Only suggest words that length is mInputLength
212 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
213 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900214 if (DEBUG_DICT) {
215 LOGI("--- Suggest transposed characters %d", i);
216 }
satok54fe9e02010-12-13 14:42:35 +0900217 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900218 }
219 }
satok61e2f852011-01-05 14:13:07 +0900220 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900221
satok61e2f852011-01-05 14:13:07 +0900222 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900223 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900224 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
225 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900226 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900227 if (DEBUG_DICT) {
228 LOGI("--- Suggest missing space characters %d", i);
229 }
satok662fe692010-12-08 17:05:39 +0900230 getMissingSpaceWords(mInputLength, i);
231 }
232 }
satok61e2f852011-01-05 14:13:07 +0900233 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800234
235 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900236 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800237 // The first and last "mistyped spaces" are taken care of by excessive character handling
238 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900239 if (DEBUG_DICT) {
240 LOGI("--- Suggest words with proximity space %d", i);
241 }
satok817e5172011-03-04 06:06:45 -0800242 const int x = xcoordinates[i];
243 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900244 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800245 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
246 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900247 }
satok817e5172011-03-04 06:06:45 -0800248 if (proximityInfo->hasSpaceProximity(x, y)) {
249 getMistypedSpaceWords(mInputLength, i);
250 }
satok817e5172011-03-04 06:06:45 -0800251 }
252 }
253 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900254}
255
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900256void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
257 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900258 if (DEBUG_DICT) {
259 LOGI("initSuggest");
260 }
satok30088252010-12-01 21:22:15 +0900261 mFrequencies = frequencies;
262 mOutputChars = outWords;
263 mInputCodes = codes;
264 mInputLength = codesSize;
265 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
266}
267
Jean Chalard8124e642011-06-16 22:33:41 +0900268static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900269 if (c < nextLettersSize) {
270 nextLetters[c]++;
271 }
272}
273
satok662fe692010-12-08 17:05:39 +0900274// TODO: We need to optimize addWord by using STL or something
Jean Chalardca5ef282011-06-17 15:36:26 +0900275// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok28bd03b2010-12-03 16:39:16 +0900276bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900277 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900278 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
satok30088252010-12-01 21:22:15 +0900279 char s[length + 1];
280 for (int i = 0; i <= length; i++) s[i] = word[i];
satok662fe692010-12-08 17:05:39 +0900281 LOGI("Found word = %s, freq = %d", s, frequency);
satok30088252010-12-01 21:22:15 +0900282 }
satokf5cded12010-12-06 21:28:24 +0900283 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900284 if (DEBUG_DICT) {
285 LOGI("Exceeded max word length.");
286 }
satokf5cded12010-12-06 21:28:24 +0900287 return false;
288 }
satok30088252010-12-01 21:22:15 +0900289
290 // Find the right insertion point
291 int insertAt = 0;
292 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900293 // TODO: How should we sort words with the same frequency?
294 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900295 break;
296 }
297 insertAt++;
298 }
299 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900300 if (DEBUG_DICT) {
301 char s[length + 1];
302 for (int i = 0; i <= length; i++) s[i] = word[i];
satokb2e5e592011-04-26 14:50:54 +0900303 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satokcdbbea72010-12-08 16:04:16 +0900304 }
satok30088252010-12-01 21:22:15 +0900305 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
306 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
307 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
308 mFrequencies[insertAt] = frequency;
309 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900310 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900311 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900312 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900313 while (length--) {
314 *dest++ = *word++;
315 }
316 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900317 if (DEBUG_DICT) {
318 LOGI("Added word at %d", insertAt);
319 }
satok30088252010-12-01 21:22:15 +0900320 return true;
321 }
322 return false;
323}
324
Jean Chalardca5ef282011-06-17 15:36:26 +0900325inline void UnigramDictionary::addWordAlternatesSpellings(const uint8_t* const root, int pos,
326 int depth, int finalFreq) {
327 // TODO: actually add alternates when the format supports it.
328}
329
330static inline bool hasAlternateSpellings(uint8_t flags) {
331 // TODO: when the format supports it, return the actual value.
332 return false;
333}
334
Jean Chalard8124e642011-06-16 22:33:41 +0900335static inline unsigned short toBaseLowerCase(unsigned short c) {
satok30088252010-12-01 21:22:15 +0900336 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
337 c = BASE_CHARS[c];
338 }
339 if (c >='A' && c <= 'Z') {
340 c |= 32;
341 } else if (c > 127) {
342 c = latin_tolower(c);
343 }
344 return c;
345}
346
Jean Chalardca5ef282011-06-17 15:36:26 +0900347bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {
satok30088252010-12-01 21:22:15 +0900348 if (length != mInputLength) {
349 return false;
350 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900351 const int *inputCodes = mInputCodes;
satok30088252010-12-01 21:22:15 +0900352 while (length--) {
353 if ((unsigned int) *inputCodes != (unsigned int) *word) {
354 return false;
355 }
satok662fe692010-12-08 17:05:39 +0900356 inputCodes += MAX_PROXIMITY_CHARS;
satok30088252010-12-01 21:22:15 +0900357 word++;
358 }
359 return true;
360}
361
satok715514d2010-12-02 20:19:59 +0900362static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900363static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900364
satok54fe9e02010-12-13 14:42:35 +0900365void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900366 const int excessivePos, const int transposedPos, int *nextLetters,
367 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900368 if (DEBUG_DICT) {
369 LOGI("getSuggestionCandidates %d", maxDepth);
370 assert(transposedPos + 1 < mInputLength);
371 assert(excessivePos < mInputLength);
372 assert(missingPos < mInputLength);
373 }
satok662fe692010-12-08 17:05:39 +0900374 int rootPosition = ROOT_POS;
satokd2997922010-12-07 13:08:39 +0900375 // Get the number of child of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900376 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900377 int depth = 0;
378
379 mStackChildCount[0] = childCount;
380 mStackTraverseAll[0] = (mInputLength <= 0);
381 mStackNodeFreq[0] = 1;
382 mStackInputIndex[0] = 0;
383 mStackDiffs[0] = 0;
384 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900385 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900386
satok662fe692010-12-08 17:05:39 +0900387 // Depth first search
satokd2997922010-12-07 13:08:39 +0900388 while (depth >= 0) {
389 if (mStackChildCount[depth] > 0) {
390 --mStackChildCount[depth];
391 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900392 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900393 int inputIndex = mStackInputIndex[depth];
394 int diffs = mStackDiffs[depth];
395 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900396 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900397 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900398 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900399 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900400 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900401 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
402 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
403 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900404 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900405 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900406 mStackSiblingPos[depth] = siblingPos;
407 if (needsToTraverseChildrenNodes) {
408 // Goes to child node
409 ++depth;
410 mStackChildCount[depth] = childCount;
411 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900412 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900413 mStackInputIndex[depth] = inputIndex;
414 mStackDiffs[depth] = diffs;
415 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900416 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900417 }
418 } else {
satokcdbbea72010-12-08 16:04:16 +0900419 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900420 --depth;
421 }
422 }
423}
424
satokb2e5e592011-04-26 14:50:54 +0900425static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
426static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
427 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
428}
429
430static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
431inline static void multiplyIntCapped(const int multiplier, int *base) {
432 const int temp = *base;
433 if (temp != S_INT_MAX) {
434 // Branch if multiplier == 2 for the optimization
435 if (multiplier == 2) {
436 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
437 } else {
438 const int tempRetval = temp * multiplier;
439 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
440 }
441 }
442}
443
444inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900445 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900446 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900447 } else {
satokb2e5e592011-04-26 14:50:54 +0900448 int ret = base;
449 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
450 return ret;
451 }
452}
453
454inline static void multiplyRate(const int rate, int *freq) {
455 if (*freq != S_INT_MAX) {
456 if (*freq > 1000000) {
457 *freq /= 100;
458 multiplyIntCapped(rate, freq);
459 } else {
460 multiplyIntCapped(rate, freq);
461 *freq /= 100;
462 }
satokf7425bb2011-01-05 16:37:53 +0900463 }
464}
465
satok4c981d32011-04-19 13:58:42 +0900466inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900467 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
468 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900469 if (firstWordLength == 0 || secondWordLength == 0) {
470 return 0;
471 }
472 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
473 int tempFirstFreq = firstFreq;
474 multiplyRate(firstDemotionRate, &tempFirstFreq);
475
476 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
477 int tempSecondFreq = secondFreq;
478 multiplyRate(secondDemotionRate, &tempSecondFreq);
479
480 const int totalLength = firstWordLength + secondWordLength;
481
482 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
483 // length.
484 int totalFreq = tempFirstFreq + tempSecondFreq;
485
486 // This is a workaround to try offsetting the not-enough-demotion which will be done in
487 // calcNormalizedScore in Utils.java.
488 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
489 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
490 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
491 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
492 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
493
494 // At this moment, totalFreq is calculated by the following formula:
495 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
496 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
497
satokb2e5e592011-04-26 14:50:54 +0900498 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900499
500 // This is another workaround to offset the demotion which will be done in
501 // calcNormalizedScore in Utils.java.
502 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
503 // the same amount because we already have adjusted the synthetic freq of this "missing or
504 // mistyped space" suggestion candidate above in this method.
505 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
506 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
507
satokd8db9f82011-05-18 15:31:04 +0900508 if (isSpaceProximity) {
509 // A word pair with one space proximity correction
510 if (DEBUG_DICT) {
511 LOGI("Found a word pair with space proximity correction.");
512 }
513 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
514 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
515 }
516
satok4c981d32011-04-19 13:58:42 +0900517 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
518 return totalFreq;
519}
520
satok817e5172011-03-04 06:06:45 -0800521bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
522 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900523 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800524}
525
526bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
527 return getSplitTwoWordsSuggestion(
528 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900529 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800530}
531
satok58c49b92011-01-27 03:23:39 +0900532inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900533 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900534 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900535 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900536 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900537 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900538 if (mInputLength >= 2) {
539 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
540 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
541 / (10 * mInputLength
542 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900543 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900544 LOGI("Demotion rate for missing character is %d.", demotionRate);
545 }
satokdc5301e2011-04-11 16:14:45 +0900546 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900547 } else {
548 finalFreq = 0;
549 }
550 }
satokf7425bb2011-01-05 16:37:53 +0900551 if (transposedPos >= 0) multiplyRate(
552 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900553 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900554 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900555 if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
satokf7425bb2011-01-05 16:37:53 +0900556 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900557 }
558 }
satok58c49b92011-01-27 03:23:39 +0900559 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900560 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900561 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900562 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900563 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900564 if (DEBUG_DICT) {
565 LOGI("Found full matched word.");
566 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900567 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
568 }
569 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900570 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900571 }
satok9674f652011-04-20 17:15:27 +0900572 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900573 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900574 if (DEBUG_DICT) {
575 LOGI("Found one proximity correction.");
576 }
satokb2e5e592011-04-26 14:50:54 +0900577 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900578 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900579 }
satok9674f652011-04-20 17:15:27 +0900580 if (DEBUG_DICT) {
581 LOGI("calc: %d, %d", depth, sameLength);
582 }
satokb2e5e592011-04-26 14:50:54 +0900583 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900584 return finalFreq;
585}
satoka3d78f62010-12-09 22:08:33 +0900586
satok28bd03b2010-12-03 16:39:16 +0900587inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900588 const int inputIndex, const int skipPos, const int depth) {
satok8fbd5522011-02-22 17:28:55 +0900589 const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
satok28bd03b2010-12-03 16:39:16 +0900590 // Skip the ' or other letter and continue deeper
591 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
592}
593
satoke07baa62010-12-09 21:55:40 +0900594inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
Jean Chalard07a84062011-03-03 10:22:10 +0900595 const int inputLength) const {
satoke07baa62010-12-09 21:55:40 +0900596 if (inputIndex < 0 || inputIndex >= inputLength) return false;
597 const int currentChar = *getInputCharsAt(inputIndex);
598 const int leftIndex = inputIndex - 1;
599 if (leftIndex >= 0) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900600 const int *leftChars = getInputCharsAt(leftIndex);
satoke07baa62010-12-09 21:55:40 +0900601 int i = 0;
602 while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
603 if (leftChars[i++] == currentChar) return true;
604 }
605 }
606 const int rightIndex = inputIndex + 1;
607 if (rightIndex < inputLength) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900608 const int *rightChars = getInputCharsAt(rightIndex);
satoke07baa62010-12-09 21:55:40 +0900609 int i = 0;
610 while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
611 if (rightChars[i++] == currentChar) return true;
612 }
613 }
614 return false;
615}
616
Jean Chalarda5d58492011-02-18 17:50:58 +0900617// In the following function, c is the current character of the dictionary word
618// currently examined.
619// currentChars is an array containing the keys close to the character the
620// user actually typed at the same position. We want to see if c is in it: if so,
621// then the word contains at that position a character close to what the user
622// typed.
623// What the user typed is actually the first character of the array.
624// Notice : accented characters do not have a proximity list, so they are alone
625// in their list. The non-accented version of the character should be considered
626// "close", but not the other keys close to the non-accented version.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900627inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
628 const int *currentChars, const unsigned short c, const int skipPos,
629 const int excessivePos, const int transposedPos) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900630 const unsigned short baseLowerC = toBaseLowerCase(c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900631
632 // The first char in the array is what user typed. If it matches right away,
633 // that means the user typed that same char for this pos.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900634 if (currentChars[0] == baseLowerC || currentChars[0] == c)
Jean Chalarda5d58492011-02-18 17:50:58 +0900635 return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
636
637 // If one of those is true, we should not check for close characters at all.
638 if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
639 return UNRELATED_CHAR;
640
641 // If the non-accented, lowercased version of that first character matches c,
642 // then we have a non-accented version of the accented character the user
643 // typed. Treat it as a close char.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900644 if (toBaseLowerCase(currentChars[0]) == baseLowerC)
Jean Chalarda5d58492011-02-18 17:50:58 +0900645 return NEAR_PROXIMITY_CHAR;
646
647 // Not an exact nor an accent-alike match: search the list of close keys
648 int j = 1;
satoke07baa62010-12-09 21:55:40 +0900649 while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900650 const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900651 if (matched) return NEAR_PROXIMITY_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900652 ++j;
653 }
Jean Chalarda5d58492011-02-18 17:50:58 +0900654
655 // Was not included, signal this as an unrelated character.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900656 return UNRELATED_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900657}
658
Jean Chalardca5ef282011-06-17 15:36:26 +0900659inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
660 const uint8_t* const root, const uint8_t flags, int pos,
661 const int inputIndex, const int matchWeight, const int skipPos,
662 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
663 int* nextLetters, const int nextLettersSize) {
664
665 const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;
666 const bool hasAlternates = hasAlternateSpellings(flags);
667 if (isSameAsTyped && !hasAlternates) return;
668
669 if (depth >= MIN_SUGGEST_DEPTH) {
670 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
671 excessivePos, transposedPos, freq, sameLength);
672 if (!isSameAsTyped)
673 addWord(word, depth + 1, finalFreq);
674 if (hasAlternates)
675 addWordAlternatesSpellings(DICT_ROOT, pos, flags, finalFreq);
676 }
677
678 if (sameLength && depth >= mInputLength && skipPos < 0) {
679 registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);
680 }
681}
682
Jean Chalardbc90c722011-06-20 21:09:04 +0900683#ifndef NEW_DICTIONARY_FORMAT
684// TODO: Don't forget to bring inline functions back to over where they are used.
satokcdbbea72010-12-08 16:04:16 +0900685
Jean Chalardbc90c722011-06-20 21:09:04 +0900686// The following functions will be entirely replaced with new implementations.
687void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
688 const int excessivePos, const int transposedPos,int *nextLetters,
689 const int nextLettersSize) {
690 int initialPosition = initialPos;
691 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
692 getWordsRec(count, initialPosition, 0,
693 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
694 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
695 nextLettersSize);
696}
Jean Chalardca5ef282011-06-17 15:36:26 +0900697
Jean Chalardbc90c722011-06-20 21:09:04 +0900698void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
699 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
700 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
701 const int transposedPos, int *nextLetters, const int nextLettersSize) {
702 int siblingPos = pos;
703 for (int i = 0; i < childrenCount; ++i) {
704 int newCount;
705 int newChildPosition;
706 bool newTraverseAllNodes;
707 int newMatchRate;
708 int newInputIndex;
709 int newDiffs;
710 int newSiblingPos;
711 int newOutputIndex;
712 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
713 traverseAllNodes, matchWeight, inputIndex, diffs,
714 skipPos, excessivePos, transposedPos,
715 nextLetters, nextLettersSize,
716 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
717 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
718 siblingPos = newSiblingPos;
satokcdbbea72010-12-08 16:04:16 +0900719
Jean Chalardbc90c722011-06-20 21:09:04 +0900720 if (needsToTraverseChildrenNodes) {
721 getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
722 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
723 nextLetters, nextLettersSize);
satok48e432c2010-12-06 17:38:58 +0900724 }
satok48e432c2010-12-06 17:38:58 +0900725 }
satok48e432c2010-12-06 17:38:58 +0900726}
727
satokaee09dc2010-12-09 19:21:51 +0900728inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,
729 unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900730 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900731 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900732 int maxFreq = 0;
733 int depth = 0;
734 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900735 bool terminal = false;
736
satokaee09dc2010-12-09 19:21:51 +0900737 mStackChildCount[0] = count;
738 mStackSiblingPos[0] = pos;
739
740 while (depth >= 0) {
741 if (mStackChildCount[depth] > 0) {
742 --mStackChildCount[depth];
743 int firstChildPos;
744 int newFreq;
745 int siblingPos = mStackSiblingPos[depth];
746 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
747 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
748 &siblingPos);
749 mStackSiblingPos[depth] = siblingPos;
750 if (depth == (inputLength - 1)) {
751 // Traverse sibling node
752 if (terminal) {
753 if (newFreq > maxFreq) {
754 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
755 if (DEBUG_DICT && DEBUG_NODE) {
756 char s[inputLength + 1];
757 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
758 s[inputLength] = 0;
759 LOGI("New missing space word found: %d > %d (%s), %d, %d",
760 newFreq, maxFreq, s, inputLength, depth);
761 }
762 maxFreq = newFreq;
763 }
764 }
765 } else if (needsToTraverseChildrenNodes) {
766 // Traverse children nodes
767 ++depth;
768 mStackChildCount[depth] = count;
769 mStackSiblingPos[depth] = firstChildPos;
770 }
771 } else {
772 // Traverse parent node
773 --depth;
satok662fe692010-12-08 17:05:39 +0900774 }
775 }
satokaee09dc2010-12-09 19:21:51 +0900776
777 word[inputLength] = 0;
778 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900779}
780
781inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900782 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
783 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
784 const int inputIndex = startInputIndex + depth;
satok8fbd5522011-02-22 17:28:55 +0900785 const int *currentChars = getInputCharsAt(inputIndex);
satok662fe692010-12-08 17:05:39 +0900786 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900787 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
788 &c, newChildPosition, newTerminal, newFreq);
satokaee09dc2010-12-09 19:21:51 +0900789 const unsigned int inputC = currentChars[0];
Ken Wakasade3070a2011-03-19 09:16:42 +0900790 if (DEBUG_DICT) {
791 assert(inputC <= U_SHORT_MAX);
792 }
Jean Chalardf5f834a2011-02-22 15:12:46 +0900793 const unsigned short baseLowerC = toBaseLowerCase(c);
794 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900795 const bool hasChild = *newChildPosition != 0;
796 if (matched) {
797 word[depth] = c;
798 if (DEBUG_DICT && DEBUG_NODE) {
799 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900800 if (*newTerminal) {
801 LOGI("Terminal %d", *newFreq);
802 }
satok662fe692010-12-08 17:05:39 +0900803 }
satokaee09dc2010-12-09 19:21:51 +0900804 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900805 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900806 return true;
807 } else {
808 return false;
809 }
810 } else {
811 // If this node is not user typed character, this method treats this word as unmatched.
812 // Thus newTerminal shouldn't be true.
813 *newTerminal = false;
814 return false;
satok662fe692010-12-08 17:05:39 +0900815 }
satok662fe692010-12-08 17:05:39 +0900816}
Jean Chalard8124e642011-06-16 22:33:41 +0900817
818// TODO: use uint32_t instead of unsigned short
819bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
820 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900821 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900822 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900823 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900824 }
825}
826
Jean Chalard17e44a72011-06-16 22:51:11 +0900827
828// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900829int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
830 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900831 // returns address of bigram data of that word
832 // return -99 if not found
833
834 int count = Dictionary::getCount(DICT_ROOT, &pos);
835 unsigned short currentChar = (unsigned short) word[offset];
836 for (int j = 0; j < count; j++) {
837 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
838 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
839 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
840 if (c == currentChar) {
841 if (offset == length - 1) {
842 if (terminal) {
843 return (pos+1);
844 }
845 } else {
846 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900847 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900848 if (t > 0) {
849 return t;
850 }
851 }
852 }
853 }
854 if (terminal) {
855 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
856 }
857 // There could be two instances of each alphabet - upper and lower case. So continue
858 // looking ...
859 }
860 return NOT_VALID_WORD;
861}
862
Jean Chalardbc90c722011-06-20 21:09:04 +0900863
864// The following functions will be modified.
865bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
866 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
867 const int secondWordLength, const bool isSpaceProximity) {
868 if (inputLength >= MAX_WORD_LENGTH) return false;
869 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
870 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
871 return false;
872 const int newWordLength = firstWordLength + secondWordLength + 1;
873 // Allocating variable length array on stack
874 unsigned short word[newWordLength];
875 const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord);
876 if (DEBUG_DICT) {
877 LOGI("First freq: %d", firstFreq);
878 }
879 if (firstFreq <= 0) return false;
880
881 for (int i = 0; i < firstWordLength; ++i) {
882 word[i] = mWord[i];
883 }
884
885 const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord);
886 if (DEBUG_DICT) {
887 LOGI("Second freq: %d", secondFreq);
888 }
889 if (secondFreq <= 0) return false;
890
891 word[firstWordLength] = SPACE;
892 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
893 word[i] = mWord[i - firstWordLength - 1];
894 }
895
896 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
897 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
898 if (DEBUG_DICT) {
899 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
900 TYPED_LETTER_MULTIPLIER);
901 }
902 addWord(word, newWordLength, pairFreq);
903 return true;
904}
905
906inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
907 const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,
908 const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
909 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
910 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
911 int *nextSiblingPosition, int *nextOutputIndex) {
912 if (DEBUG_DICT) {
913 int inputCount = 0;
914 if (skipPos >= 0) ++inputCount;
915 if (excessivePos >= 0) ++inputCount;
916 if (transposedPos >= 0) ++inputCount;
917 assert(inputCount <= 1);
918 }
919 unsigned short c;
920 int childPosition;
921 bool terminal;
922 int freq;
923 bool isSameAsUserTypedLength = false;
924
925 const uint8_t flags = 0; // No flags for now
926
927 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
928
929 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
930 &c, &childPosition, &terminal, &freq);
931 *nextOutputIndex = depth + 1;
932
933 const bool needsToTraverseChildrenNodes = childPosition != 0;
934
935 // If we are only doing traverseAllNodes, no need to look at the typed characters.
936 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
937 mWord[depth] = c;
938 if (traverseAllNodes && terminal) {
939 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
940 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
941 }
942 if (!needsToTraverseChildrenNodes) return false;
943 *newTraverseAllNodes = traverseAllNodes;
944 *newMatchRate = matchWeight;
945 *newDiffs = diffs;
946 *newInputIndex = inputIndex;
947 } else {
948 const int *currentChars = getInputCharsAt(inputIndex);
949
950 if (transposedPos >= 0) {
951 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
952 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
953 }
954
955 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
956 transposedPos);
957 if (UNRELATED_CHAR == matchedProximityCharId) return false;
958 mWord[depth] = c;
959 // If inputIndex is greater than mInputLength, that means there is no
960 // proximity chars. So, we don't need to check proximity.
961 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
962 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
963 }
964 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
965 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
966 if (isSameAsUserTypedLength && terminal) {
967 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
968 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
969 }
970 if (!needsToTraverseChildrenNodes) return false;
971 // Start traversing all nodes after the index exceeds the user typed length
972 *newTraverseAllNodes = isSameAsUserTypedLength;
973 *newMatchRate = matchWeight;
974 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
975 *newInputIndex = inputIndex + 1;
976 }
977 // Optimization: Prune out words that are too long compared to how much was typed.
978 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
979 return false;
980 }
981
982 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
983 // TODO: Check if this can be isSameAsUserTypedLength only.
984 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
985 *newTraverseAllNodes = true;
986 }
987 // get the count of nodes and increment childAddress.
988 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
989 *newChildPosition = childPosition;
990 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
991 return needsToTraverseChildrenNodes;
992}
993
994#else // NEW_DICTIONARY_FORMAT
995#endif // NEW_DICTIONARY_FORMAT
996
satok30088252010-12-01 21:22:15 +0900997} // namespace latinime