blob: 9bdd916f0c88b8f06235c05dfc11a2bc45e8577b [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok48e432c2010-12-06 17:38:58 +090018#include <assert.h>
satok30088252010-12-01 21:22:15 +090019#include <string.h>
20
satoke808e432010-12-02 14:53:24 +090021#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090022
satok30088252010-12-01 21:22:15 +090023#include "basechars.h"
24#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090025#include "dictionary.h"
26#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090027
Jean Chalard1059f272011-06-28 20:45:05 +090028#ifdef NEW_DICTIONARY_FORMAT
29#include "binary_format.h"
30#endif // NEW_DICTIONARY_FORMAT
31
satok30088252010-12-01 21:22:15 +090032namespace latinime {
33
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090034const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
35 { { 'a', 'e' },
36 { 'o', 'e' },
37 { 'u', 'e' } };
38
Jean Chalard293ece02011-06-16 20:55:16 +090039// TODO: check the header
40UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok662fe692010-12-08 17:05:39 +090041 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok18c28f42010-12-02 18:11:54 +090042 const bool isLatestDictVersion)
Jean Chalard1059f272011-06-28 20:45:05 +090043#ifndef NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +090044 : DICT_ROOT(streamStart),
Jean Chalard1059f272011-06-28 20:45:05 +090045#else // NEW_DICTIONARY_FORMAT
46 : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
47#endif // NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +090048 MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok662fe692010-12-08 17:05:39 +090049 MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
50 TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalard1059f272011-06-28 20:45:05 +090051#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090052 ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
Jean Chalard1059f272011-06-28 20:45:05 +090053#else // NEW_DICTIONARY_FORMAT
54 // TODO : remove this variable.
55 ROOT_POS(0),
56#endif // NEW_DICTIONARY_FORMAT
satok1d7eaf82011-07-13 10:32:02 +090057 BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
Jean Chalarda787dba2011-03-04 12:17:48 +090058 MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +090059 if (DEBUG_DICT) {
60 LOGI("UnigramDictionary - constructor");
61 }
satok30088252010-12-01 21:22:15 +090062}
63
satok18c28f42010-12-02 18:11:54 +090064UnigramDictionary::~UnigramDictionary() {}
satok30088252010-12-01 21:22:15 +090065
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090066static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
67 const int MAX_PROXIMITY_CHARS) {
68 return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
69}
70
71bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
72
73 // There can't be a digraph if we don't have at least 2 characters to examine
74 if (i + 2 > codesSize) return false;
75
76 // Search for the first char of some digraph
77 int lastDigraphIndex = -1;
78 const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
79 for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
80 lastDigraphIndex >= 0; --lastDigraphIndex) {
81 if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
82 }
83 // No match: return early
84 if (lastDigraphIndex < 0) return false;
85
86 // It's an interesting digraph if the second char matches too.
87 return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
88}
89
90// Mostly the same arguments as the non-recursive version, except:
91// codes is the original value. It points to the start of the work buffer, and gets passed as is.
92// codesSize is the size of the user input (thus, it is the size of codesSrc).
93// codesDest is the current point in the work buffer.
94// codesSrc is the current point in the user-input, original, content-unmodified buffer.
95// codesRemain is the remaining size in codesSrc.
satok1d7eaf82011-07-13 10:32:02 +090096void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090097 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
98 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080099 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900100
Jean Chalarda787dba2011-03-04 12:17:48 +0900101 if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
102 for (int i = 0; i < codesRemain; ++i) {
103 if (isDigraph(codesSrc, i, codesRemain)) {
104 // Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900105
Jean Chalarda787dba2011-03-04 12:17:48 +0900106 // Copy the word up to the first char of the digraph, then continue processing
107 // on the remaining part of the word, skipping the second char of the digraph.
108 // In our example, copy "pru" and continue running on "fen"
109 // Make i the index of the second char of the digraph for simplicity. Forgetting
110 // to do that results in an infinite recursion so take care!
111 ++i;
112 memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
113 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
114 codesBuffer, codesBufferSize, flags,
115 codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
116 currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
117 frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900118
Jean Chalarda787dba2011-03-04 12:17:48 +0900119 // Copy the second char of the digraph in place, then continue processing on
120 // the remaining part of the word.
121 // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
122 memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
123 BYTES_IN_ONE_CHAR);
124 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
125 codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
126 codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
127 outWords, frequencies);
128 return;
129 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900130 }
131 }
132
133 // If we come here, we hit the end of the word: let's check it against the dictionary.
134 // In our example, we'll come here once for "prufen" and then once for "pruefen".
135 // If the word contains several digraphs, we'll come it for the product of them.
136 // eg. if the word is "ueberpruefen" we'll test, in order, against
137 // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
138 const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
139 if (0 != remainingBytes)
140 memcpy(codesDest, codesSrc, remainingBytes);
141
142 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
143 (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
144}
145
satok1d7eaf82011-07-13 10:32:02 +0900146int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900147 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
148 unsigned short *outWords, int *frequencies) {
149
150 if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
151 { // Incrementally tune the word and try all possibilities
152 int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
153 getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalarda787dba2011-03-04 12:17:48 +0900154 codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900155 } else { // Normal processing
156 getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
157 outWords, frequencies);
158 }
159
satok817e5172011-03-04 06:06:45 -0800160 PROF_START(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900161 // Get the word count
162 int suggestedWordsCount = 0;
163 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
164 suggestedWordsCount++;
165 }
166
167 if (DEBUG_DICT) {
168 LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard980d6b62011-06-30 17:02:23 +0900169 /// Print the returned words
170 for (int j = 0; j < suggestedWordsCount; ++j) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700171#ifdef FLAG_DBG
Jean Chalard980d6b62011-06-30 17:02:23 +0900172 short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
173 char s[MAX_WORD_LENGTH];
174 for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
175 LOGI("%s %i", s, mFrequencies[j]);
satok787945b2011-07-14 08:32:57 +0900176#endif
Jean Chalard980d6b62011-06-30 17:02:23 +0900177 }
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900178 LOGI("Next letters: ");
179 for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
180 if (mNextLettersFrequency[k] > 0) {
181 LOGI("%c = %d,", k, mNextLettersFrequency[k]);
182 }
183 }
184 }
satok817e5172011-03-04 06:06:45 -0800185 PROF_END(20);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900186 PROF_CLOSE;
187 return suggestedWordsCount;
188}
189
satok1d7eaf82011-07-13 10:32:02 +0900190void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900191 const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
192 unsigned short *outWords, int *frequencies) {
193
satok61e2f852011-01-05 14:13:07 +0900194 PROF_OPEN;
195 PROF_START(0);
satok1d7eaf82011-07-13 10:32:02 +0900196 initSuggestions(
197 proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies);
satok54fe9e02010-12-13 14:42:35 +0900198 if (DEBUG_DICT) assert(codesSize == mInputLength);
199
satoka3d78f62010-12-09 22:08:33 +0900200 const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok61e2f852011-01-05 14:13:07 +0900201 PROF_END(0);
satok30088252010-12-01 21:22:15 +0900202
satok61e2f852011-01-05 14:13:07 +0900203 PROF_START(1);
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900204 getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
satok61e2f852011-01-05 14:13:07 +0900205 PROF_END(1);
206
207 PROF_START(2);
satok662fe692010-12-08 17:05:39 +0900208 // Suggestion with missing character
209 if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok30088252010-12-01 21:22:15 +0900210 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900211 if (DEBUG_DICT) {
212 LOGI("--- Suggest missing characters %d", i);
213 }
satok54fe9e02010-12-13 14:42:35 +0900214 getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
satokcdbbea72010-12-08 16:04:16 +0900215 }
216 }
satok61e2f852011-01-05 14:13:07 +0900217 PROF_END(2);
satokcdbbea72010-12-08 16:04:16 +0900218
satok61e2f852011-01-05 14:13:07 +0900219 PROF_START(3);
satok662fe692010-12-08 17:05:39 +0900220 // Suggestion with excessive character
satok54fe9e02010-12-13 14:42:35 +0900221 if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
222 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satokcdbbea72010-12-08 16:04:16 +0900223 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900224 if (DEBUG_DICT) {
225 LOGI("--- Suggest excessive characters %d", i);
226 }
satok54fe9e02010-12-13 14:42:35 +0900227 getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
satok30088252010-12-01 21:22:15 +0900228 }
229 }
satok61e2f852011-01-05 14:13:07 +0900230 PROF_END(3);
satok30088252010-12-01 21:22:15 +0900231
satok61e2f852011-01-05 14:13:07 +0900232 PROF_START(4);
satoka3d78f62010-12-09 22:08:33 +0900233 // Suggestion with transposed characters
234 // Only suggest words that length is mInputLength
235 if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
236 for (int i = 0; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900237 if (DEBUG_DICT) {
238 LOGI("--- Suggest transposed characters %d", i);
239 }
satok54fe9e02010-12-13 14:42:35 +0900240 getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
satoka3d78f62010-12-09 22:08:33 +0900241 }
242 }
satok61e2f852011-01-05 14:13:07 +0900243 PROF_END(4);
satoka3d78f62010-12-09 22:08:33 +0900244
satok61e2f852011-01-05 14:13:07 +0900245 PROF_START(5);
satok662fe692010-12-08 17:05:39 +0900246 // Suggestions with missing space
satok54fe9e02010-12-13 14:42:35 +0900247 if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
248 && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok662fe692010-12-08 17:05:39 +0900249 for (int i = 1; i < codesSize; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900250 if (DEBUG_DICT) {
251 LOGI("--- Suggest missing space characters %d", i);
252 }
satok662fe692010-12-08 17:05:39 +0900253 getMissingSpaceWords(mInputLength, i);
254 }
255 }
satok61e2f852011-01-05 14:13:07 +0900256 PROF_END(5);
satok817e5172011-03-04 06:06:45 -0800257
258 PROF_START(6);
Jean Chalarde93b1f222011-06-01 17:12:25 +0900259 if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok817e5172011-03-04 06:06:45 -0800260 // The first and last "mistyped spaces" are taken care of by excessive character handling
261 for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900262 if (DEBUG_DICT) {
263 LOGI("--- Suggest words with proximity space %d", i);
264 }
satok817e5172011-03-04 06:06:45 -0800265 const int x = xcoordinates[i];
266 const int y = ycoordinates[i];
Ken Wakasade3070a2011-03-19 09:16:42 +0900267 if (DEBUG_PROXIMITY_INFO) {
satok817e5172011-03-04 06:06:45 -0800268 LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
269 i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasade3070a2011-03-19 09:16:42 +0900270 }
satok817e5172011-03-04 06:06:45 -0800271 if (proximityInfo->hasSpaceProximity(x, y)) {
272 getMistypedSpaceWords(mInputLength, i);
273 }
satok817e5172011-03-04 06:06:45 -0800274 }
275 }
276 PROF_END(6);
satok30088252010-12-01 21:22:15 +0900277}
278
satok1d7eaf82011-07-13 10:32:02 +0900279void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
280 const int *ycoordinates, const int *codes, const int codesSize,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900281 unsigned short *outWords, int *frequencies) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900282 if (DEBUG_DICT) {
283 LOGI("initSuggest");
284 }
satok30088252010-12-01 21:22:15 +0900285 mFrequencies = frequencies;
286 mOutputChars = outWords;
satok30088252010-12-01 21:22:15 +0900287 mInputLength = codesSize;
288 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
satok1d7eaf82011-07-13 10:32:02 +0900289 proximityInfo->setInputParams(codes, codesSize);
290 mProximityInfo = proximityInfo;
satok30088252010-12-01 21:22:15 +0900291}
292
Jean Chalard8124e642011-06-16 22:33:41 +0900293static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
satok30088252010-12-01 21:22:15 +0900294 if (c < nextLettersSize) {
295 nextLetters[c]++;
296 }
297}
298
satok662fe692010-12-08 17:05:39 +0900299// TODO: We need to optimize addWord by using STL or something
Jean Chalardca5ef282011-06-17 15:36:26 +0900300// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok28bd03b2010-12-03 16:39:16 +0900301bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok30088252010-12-01 21:22:15 +0900302 word[length] = 0;
satok662fe692010-12-08 17:05:39 +0900303 if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700304#ifdef FLAG_DBG
satok30088252010-12-01 21:22:15 +0900305 char s[length + 1];
306 for (int i = 0; i <= length; i++) s[i] = word[i];
satok662fe692010-12-08 17:05:39 +0900307 LOGI("Found word = %s, freq = %d", s, frequency);
satok787945b2011-07-14 08:32:57 +0900308#endif
satok30088252010-12-01 21:22:15 +0900309 }
satokf5cded12010-12-06 21:28:24 +0900310 if (length > MAX_WORD_LENGTH) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900311 if (DEBUG_DICT) {
312 LOGI("Exceeded max word length.");
313 }
satokf5cded12010-12-06 21:28:24 +0900314 return false;
315 }
satok30088252010-12-01 21:22:15 +0900316
317 // Find the right insertion point
318 int insertAt = 0;
319 while (insertAt < MAX_WORDS) {
Jean Chalard17e44a72011-06-16 22:51:11 +0900320 // TODO: How should we sort words with the same frequency?
321 if (frequency > mFrequencies[insertAt]) {
satok30088252010-12-01 21:22:15 +0900322 break;
323 }
324 insertAt++;
325 }
326 if (insertAt < MAX_WORDS) {
satokcdbbea72010-12-08 16:04:16 +0900327 if (DEBUG_DICT) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700328#ifdef FLAG_DBG
satokcdbbea72010-12-08 16:04:16 +0900329 char s[length + 1];
330 for (int i = 0; i <= length; i++) s[i] = word[i];
satokb2e5e592011-04-26 14:50:54 +0900331 LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satok787945b2011-07-14 08:32:57 +0900332#endif
satokcdbbea72010-12-08 16:04:16 +0900333 }
satok30088252010-12-01 21:22:15 +0900334 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
335 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
336 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
337 mFrequencies[insertAt] = frequency;
338 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
satok715514d2010-12-02 20:19:59 +0900339 (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
satok30088252010-12-01 21:22:15 +0900340 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok715514d2010-12-02 20:19:59 +0900341 unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
satok30088252010-12-01 21:22:15 +0900342 while (length--) {
343 *dest++ = *word++;
344 }
345 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +0900346 if (DEBUG_DICT) {
347 LOGI("Added word at %d", insertAt);
348 }
satok30088252010-12-01 21:22:15 +0900349 return true;
350 }
351 return false;
352}
353
Jean Chalard8124e642011-06-16 22:33:41 +0900354static inline unsigned short toBaseLowerCase(unsigned short c) {
satok30088252010-12-01 21:22:15 +0900355 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
356 c = BASE_CHARS[c];
357 }
358 if (c >='A' && c <= 'Z') {
359 c |= 32;
360 } else if (c > 127) {
361 c = latin_tolower(c);
362 }
363 return c;
364}
365
satok715514d2010-12-02 20:19:59 +0900366static const char QUOTE = '\'';
satok662fe692010-12-08 17:05:39 +0900367static const char SPACE = ' ';
satok30088252010-12-01 21:22:15 +0900368
satok54fe9e02010-12-13 14:42:35 +0900369void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satoka3d78f62010-12-09 22:08:33 +0900370 const int excessivePos, const int transposedPos, int *nextLetters,
371 const int nextLettersSize, const int maxDepth) {
satok54fe9e02010-12-13 14:42:35 +0900372 if (DEBUG_DICT) {
373 LOGI("getSuggestionCandidates %d", maxDepth);
374 assert(transposedPos + 1 < mInputLength);
375 assert(excessivePos < mInputLength);
376 assert(missingPos < mInputLength);
377 }
satok662fe692010-12-08 17:05:39 +0900378 int rootPosition = ROOT_POS;
Jean Chalard980d6b62011-06-30 17:02:23 +0900379 // Get the number of children of root, then increment the position
Jean Chalard293ece02011-06-16 20:55:16 +0900380 int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satokd2997922010-12-07 13:08:39 +0900381 int depth = 0;
382
383 mStackChildCount[0] = childCount;
384 mStackTraverseAll[0] = (mInputLength <= 0);
385 mStackNodeFreq[0] = 1;
386 mStackInputIndex[0] = 0;
387 mStackDiffs[0] = 0;
388 mStackSiblingPos[0] = rootPosition;
Jean Chalard17e44a72011-06-16 22:51:11 +0900389 mStackOutputIndex[0] = 0;
satokd2997922010-12-07 13:08:39 +0900390
satok662fe692010-12-08 17:05:39 +0900391 // Depth first search
satokd2997922010-12-07 13:08:39 +0900392 while (depth >= 0) {
393 if (mStackChildCount[depth] > 0) {
394 --mStackChildCount[depth];
395 bool traverseAllNodes = mStackTraverseAll[depth];
Jean Chalardf5f834a2011-02-22 15:12:46 +0900396 int matchWeight = mStackNodeFreq[depth];
satokd2997922010-12-07 13:08:39 +0900397 int inputIndex = mStackInputIndex[depth];
398 int diffs = mStackDiffs[depth];
399 int siblingPos = mStackSiblingPos[depth];
Jean Chalard17e44a72011-06-16 22:51:11 +0900400 int outputIndex = mStackOutputIndex[depth];
satokd2997922010-12-07 13:08:39 +0900401 int firstChildPos;
satoka3d78f62010-12-09 22:08:33 +0900402 // depth will never be greater than maxDepth because in that case,
satokd2997922010-12-07 13:08:39 +0900403 // needsToTraverseChildrenNodes should be false
Jean Chalard17e44a72011-06-16 22:51:11 +0900404 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900405 maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
406 excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
407 &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
Jean Chalard17e44a72011-06-16 22:51:11 +0900408 &siblingPos, &outputIndex);
satok662fe692010-12-08 17:05:39 +0900409 // Update next sibling pos
satokd2997922010-12-07 13:08:39 +0900410 mStackSiblingPos[depth] = siblingPos;
411 if (needsToTraverseChildrenNodes) {
412 // Goes to child node
413 ++depth;
414 mStackChildCount[depth] = childCount;
415 mStackTraverseAll[depth] = traverseAllNodes;
Jean Chalardf5f834a2011-02-22 15:12:46 +0900416 mStackNodeFreq[depth] = matchWeight;
satokd2997922010-12-07 13:08:39 +0900417 mStackInputIndex[depth] = inputIndex;
418 mStackDiffs[depth] = diffs;
419 mStackSiblingPos[depth] = firstChildPos;
Jean Chalard17e44a72011-06-16 22:51:11 +0900420 mStackOutputIndex[depth] = outputIndex;
satokd2997922010-12-07 13:08:39 +0900421 }
422 } else {
satokcdbbea72010-12-08 16:04:16 +0900423 // Goes to parent sibling node
satokd2997922010-12-07 13:08:39 +0900424 --depth;
425 }
426 }
427}
428
satokb2e5e592011-04-26 14:50:54 +0900429static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
430static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
431 return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
432}
433
434static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
435inline static void multiplyIntCapped(const int multiplier, int *base) {
436 const int temp = *base;
437 if (temp != S_INT_MAX) {
438 // Branch if multiplier == 2 for the optimization
439 if (multiplier == 2) {
440 *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
441 } else {
442 const int tempRetval = temp * multiplier;
443 *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
444 }
445 }
446}
447
448inline static int powerIntCapped(const int base, const int n) {
satok0b6b0a52011-04-27 16:29:27 +0900449 if (base == 2) {
satokb2e5e592011-04-26 14:50:54 +0900450 return n < 31 ? 1 << n : S_INT_MAX;
satokf7425bb2011-01-05 16:37:53 +0900451 } else {
satokb2e5e592011-04-26 14:50:54 +0900452 int ret = base;
453 for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
454 return ret;
455 }
456}
457
458inline static void multiplyRate(const int rate, int *freq) {
459 if (*freq != S_INT_MAX) {
460 if (*freq > 1000000) {
461 *freq /= 100;
462 multiplyIntCapped(rate, freq);
463 } else {
464 multiplyIntCapped(rate, freq);
465 *freq /= 100;
466 }
satokf7425bb2011-01-05 16:37:53 +0900467 }
468}
469
satok4c981d32011-04-19 13:58:42 +0900470inline static int calcFreqForSplitTwoWords(
satokd8db9f82011-05-18 15:31:04 +0900471 const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,
472 const int firstFreq, const int secondFreq, const bool isSpaceProximity) {
satok4c981d32011-04-19 13:58:42 +0900473 if (firstWordLength == 0 || secondWordLength == 0) {
474 return 0;
475 }
476 const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
477 int tempFirstFreq = firstFreq;
478 multiplyRate(firstDemotionRate, &tempFirstFreq);
479
480 const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
481 int tempSecondFreq = secondFreq;
482 multiplyRate(secondDemotionRate, &tempSecondFreq);
483
484 const int totalLength = firstWordLength + secondWordLength;
485
486 // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
487 // length.
488 int totalFreq = tempFirstFreq + tempSecondFreq;
489
490 // This is a workaround to try offsetting the not-enough-demotion which will be done in
491 // calcNormalizedScore in Utils.java.
492 // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
493 // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
494 // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
495 const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
496 multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
497
498 // At this moment, totalFreq is calculated by the following formula:
499 // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
500 // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
501
satokb2e5e592011-04-26 14:50:54 +0900502 multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
satok4c981d32011-04-19 13:58:42 +0900503
504 // This is another workaround to offset the demotion which will be done in
505 // calcNormalizedScore in Utils.java.
506 // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
507 // the same amount because we already have adjusted the synthetic freq of this "missing or
508 // mistyped space" suggestion candidate above in this method.
509 const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
510 multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
511
satokd8db9f82011-05-18 15:31:04 +0900512 if (isSpaceProximity) {
513 // A word pair with one space proximity correction
514 if (DEBUG_DICT) {
515 LOGI("Found a word pair with space proximity correction.");
516 }
517 multiplyIntCapped(typedLetterMultiplier, &totalFreq);
518 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
519 }
520
satok4c981d32011-04-19 13:58:42 +0900521 multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
522 return totalFreq;
523}
524
satok817e5172011-03-04 06:06:45 -0800525bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
526 return getSplitTwoWordsSuggestion(
satokd8db9f82011-05-18 15:31:04 +0900527 inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);
satok817e5172011-03-04 06:06:45 -0800528}
529
530bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
531 return getSplitTwoWordsSuggestion(
532 inputLength, 0, spaceProximityPos, spaceProximityPos + 1,
satokd8db9f82011-05-18 15:31:04 +0900533 inputLength - spaceProximityPos - 1, true);
satok817e5172011-03-04 06:06:45 -0800534}
535
satok58c49b92011-01-27 03:23:39 +0900536inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
Jean Chalardf5f834a2011-02-22 15:12:46 +0900537 const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900538 const int freq, const bool sameLength) const {
satoka3d78f62010-12-09 22:08:33 +0900539 // TODO: Demote by edit distance
Jean Chalardf5f834a2011-02-22 15:12:46 +0900540 int finalFreq = freq * matchWeight;
Jean Chalard07a84062011-03-03 10:22:10 +0900541 if (skipPos >= 0) {
satokdc5301e2011-04-11 16:14:45 +0900542 if (mInputLength >= 2) {
543 const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
544 * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
545 / (10 * mInputLength
546 - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
satok9674f652011-04-20 17:15:27 +0900547 if (DEBUG_DICT_FULL) {
satok72bc17e2011-04-13 17:23:27 +0900548 LOGI("Demotion rate for missing character is %d.", demotionRate);
549 }
satokdc5301e2011-04-11 16:14:45 +0900550 multiplyRate(demotionRate, &finalFreq);
Jean Chalard07a84062011-03-03 10:22:10 +0900551 } else {
552 finalFreq = 0;
553 }
554 }
satokf7425bb2011-01-05 16:37:53 +0900555 if (transposedPos >= 0) multiplyRate(
556 WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900557 if (excessivePos >= 0) {
satokf7425bb2011-01-05 16:37:53 +0900558 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900559 if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
satok1d7eaf82011-07-13 10:32:02 +0900560 // If an excessive character is not adjacent to the left char or the right char,
561 // we will demote this word.
satokf7425bb2011-01-05 16:37:53 +0900562 multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900563 }
564 }
satok58c49b92011-01-27 03:23:39 +0900565 int lengthFreq = TYPED_LETTER_MULTIPLIER;
satokb2e5e592011-04-26 14:50:54 +0900566 multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);
Jean Chalardf5f834a2011-02-22 15:12:46 +0900567 if (lengthFreq == matchWeight) {
satok72bc17e2011-04-13 17:23:27 +0900568 // Full exact match
Jean Chalard8dc754a2011-01-27 14:20:22 +0900569 if (depth > 1) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900570 if (DEBUG_DICT) {
571 LOGI("Found full matched word.");
572 }
Jean Chalard8dc754a2011-01-27 14:20:22 +0900573 multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
574 }
575 if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
Jean Chalarda5d58492011-02-18 17:50:58 +0900576 finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900577 }
satok9674f652011-04-20 17:15:27 +0900578 } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {
satok9d2a3022011-04-14 19:13:34 +0900579 // A word with proximity corrections
satok72bc17e2011-04-13 17:23:27 +0900580 if (DEBUG_DICT) {
581 LOGI("Found one proximity correction.");
582 }
satokb2e5e592011-04-26 14:50:54 +0900583 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);
satok9d2a3022011-04-14 19:13:34 +0900584 multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
satok58c49b92011-01-27 03:23:39 +0900585 }
satok9674f652011-04-20 17:15:27 +0900586 if (DEBUG_DICT) {
587 LOGI("calc: %d, %d", depth, sameLength);
588 }
satokb2e5e592011-04-26 14:50:54 +0900589 if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);
satok54fe9e02010-12-13 14:42:35 +0900590 return finalFreq;
591}
satoka3d78f62010-12-09 22:08:33 +0900592
satok28bd03b2010-12-03 16:39:16 +0900593inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900594 const int inputIndex, const int skipPos, const int depth) {
satok8fbd5522011-02-22 17:28:55 +0900595 const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
satok28bd03b2010-12-03 16:39:16 +0900596 // Skip the ' or other letter and continue deeper
597 return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
598}
599
satoke07baa62010-12-09 21:55:40 +0900600inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
Jean Chalard07a84062011-03-03 10:22:10 +0900601 const int inputLength) const {
satoke07baa62010-12-09 21:55:40 +0900602 if (inputIndex < 0 || inputIndex >= inputLength) return false;
603 const int currentChar = *getInputCharsAt(inputIndex);
604 const int leftIndex = inputIndex - 1;
605 if (leftIndex >= 0) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900606 const int *leftChars = getInputCharsAt(leftIndex);
satoke07baa62010-12-09 21:55:40 +0900607 int i = 0;
608 while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
609 if (leftChars[i++] == currentChar) return true;
610 }
611 }
612 const int rightIndex = inputIndex + 1;
613 if (rightIndex < inputLength) {
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900614 const int *rightChars = getInputCharsAt(rightIndex);
satoke07baa62010-12-09 21:55:40 +0900615 int i = 0;
616 while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
617 if (rightChars[i++] == currentChar) return true;
618 }
619 }
620 return false;
621}
622
Jean Chalarda5d58492011-02-18 17:50:58 +0900623// In the following function, c is the current character of the dictionary word
624// currently examined.
625// currentChars is an array containing the keys close to the character the
626// user actually typed at the same position. We want to see if c is in it: if so,
627// then the word contains at that position a character close to what the user
628// typed.
629// What the user typed is actually the first character of the array.
630// Notice : accented characters do not have a proximity list, so they are alone
631// in their list. The non-accented version of the character should be considered
632// "close", but not the other keys close to the non-accented version.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900633inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
634 const int *currentChars, const unsigned short c, const int skipPos,
635 const int excessivePos, const int transposedPos) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900636 const unsigned short baseLowerC = toBaseLowerCase(c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900637
638 // The first char in the array is what user typed. If it matches right away,
639 // that means the user typed that same char for this pos.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900640 if (currentChars[0] == baseLowerC || currentChars[0] == c)
Jean Chalarda5d58492011-02-18 17:50:58 +0900641 return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
642
643 // If one of those is true, we should not check for close characters at all.
644 if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
645 return UNRELATED_CHAR;
646
647 // If the non-accented, lowercased version of that first character matches c,
648 // then we have a non-accented version of the accented character the user
649 // typed. Treat it as a close char.
Jean Chalardf5f834a2011-02-22 15:12:46 +0900650 if (toBaseLowerCase(currentChars[0]) == baseLowerC)
Jean Chalarda5d58492011-02-18 17:50:58 +0900651 return NEAR_PROXIMITY_CHAR;
652
653 // Not an exact nor an accent-alike match: search the list of close keys
654 int j = 1;
satoke07baa62010-12-09 21:55:40 +0900655 while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
Jean Chalardf5f834a2011-02-22 15:12:46 +0900656 const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
Jean Chalarda5d58492011-02-18 17:50:58 +0900657 if (matched) return NEAR_PROXIMITY_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900658 ++j;
659 }
Jean Chalarda5d58492011-02-18 17:50:58 +0900660
661 // Was not included, signal this as an unrelated character.
Jean Chalard8dc754a2011-01-27 14:20:22 +0900662 return UNRELATED_CHAR;
satok28bd03b2010-12-03 16:39:16 +0900663}
664
Jean Chalardca5ef282011-06-17 15:36:26 +0900665inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900666 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900667 const int inputIndex, const int matchWeight, const int skipPos,
668 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
669 int* nextLetters, const int nextLettersSize) {
670
satok1d7eaf82011-07-13 10:32:02 +0900671 const bool isSameAsTyped = sameLength ? mProximityInfo->sameAsTyped(word, depth + 1) : false;
Jean Chalard980d6b62011-06-30 17:02:23 +0900672 if (isSameAsTyped) return;
Jean Chalardca5ef282011-06-17 15:36:26 +0900673
674 if (depth >= MIN_SUGGEST_DEPTH) {
675 const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
676 excessivePos, transposedPos, freq, sameLength);
677 if (!isSameAsTyped)
678 addWord(word, depth + 1, finalFreq);
Jean Chalardca5ef282011-06-17 15:36:26 +0900679 }
680
681 if (sameLength && depth >= mInputLength && skipPos < 0) {
682 registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);
683 }
684}
685
Jean Chalarde6715e32011-06-30 19:47:25 +0900686bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
687 const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
688 const int secondWordLength, const bool isSpaceProximity) {
689 if (inputLength >= MAX_WORD_LENGTH) return false;
690 if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
691 || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
692 return false;
693 const int newWordLength = firstWordLength + secondWordLength + 1;
694 // Allocating variable length array on stack
695 unsigned short word[newWordLength];
696 const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
697 if (DEBUG_DICT) {
698 LOGI("First freq: %d", firstFreq);
699 }
700 if (firstFreq <= 0) return false;
701
702 for (int i = 0; i < firstWordLength; ++i) {
703 word[i] = mWord[i];
704 }
705
706 const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
707 if (DEBUG_DICT) {
708 LOGI("Second freq: %d", secondFreq);
709 }
710 if (secondFreq <= 0) return false;
711
712 word[firstWordLength] = SPACE;
713 for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
714 word[i] = mWord[i - firstWordLength - 1];
715 }
716
717 int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
718 secondWordLength, firstFreq, secondFreq, isSpaceProximity);
719 if (DEBUG_DICT) {
720 LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
721 TYPED_LETTER_MULTIPLIER);
722 }
723 addWord(word, newWordLength, pairFreq);
724 return true;
725}
726
Jean Chalardbc90c722011-06-20 21:09:04 +0900727#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardbc90c722011-06-20 21:09:04 +0900728// The following functions will be entirely replaced with new implementations.
729void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
730 const int excessivePos, const int transposedPos,int *nextLetters,
731 const int nextLettersSize) {
732 int initialPosition = initialPos;
733 const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
734 getWordsRec(count, initialPosition, 0,
735 min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
736 mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
737 nextLettersSize);
738}
Jean Chalardca5ef282011-06-17 15:36:26 +0900739
Jean Chalardbc90c722011-06-20 21:09:04 +0900740void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
741 const int maxDepth, const bool traverseAllNodes, const int matchWeight,
742 const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
743 const int transposedPos, int *nextLetters, const int nextLettersSize) {
744 int siblingPos = pos;
745 for (int i = 0; i < childrenCount; ++i) {
746 int newCount;
747 int newChildPosition;
748 bool newTraverseAllNodes;
749 int newMatchRate;
750 int newInputIndex;
751 int newDiffs;
752 int newSiblingPos;
753 int newOutputIndex;
754 const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
755 traverseAllNodes, matchWeight, inputIndex, diffs,
756 skipPos, excessivePos, transposedPos,
757 nextLetters, nextLettersSize,
758 &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
759 &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
760 siblingPos = newSiblingPos;
satokcdbbea72010-12-08 16:04:16 +0900761
Jean Chalardbc90c722011-06-20 21:09:04 +0900762 if (needsToTraverseChildrenNodes) {
763 getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
764 newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
765 nextLetters, nextLettersSize);
satok48e432c2010-12-06 17:38:58 +0900766 }
satok48e432c2010-12-06 17:38:58 +0900767 }
satok48e432c2010-12-06 17:38:58 +0900768}
769
Jean Chalard980d6b62011-06-30 17:02:23 +0900770inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
771 const int inputLength, unsigned short *word) {
satok662fe692010-12-08 17:05:39 +0900772 int pos = ROOT_POS;
Jean Chalard293ece02011-06-16 20:55:16 +0900773 int count = Dictionary::getCount(DICT_ROOT, &pos);
satokaee09dc2010-12-09 19:21:51 +0900774 int maxFreq = 0;
775 int depth = 0;
776 unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
satok662fe692010-12-08 17:05:39 +0900777 bool terminal = false;
778
satokaee09dc2010-12-09 19:21:51 +0900779 mStackChildCount[0] = count;
780 mStackSiblingPos[0] = pos;
781
782 while (depth >= 0) {
783 if (mStackChildCount[depth] > 0) {
784 --mStackChildCount[depth];
785 int firstChildPos;
786 int newFreq;
787 int siblingPos = mStackSiblingPos[depth];
788 const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
789 startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
790 &siblingPos);
791 mStackSiblingPos[depth] = siblingPos;
792 if (depth == (inputLength - 1)) {
793 // Traverse sibling node
794 if (terminal) {
795 if (newFreq > maxFreq) {
796 for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
797 if (DEBUG_DICT && DEBUG_NODE) {
Doug Kwance9efbf2011-07-07 22:53:50 -0700798#ifdef FLAG_DBG
satokaee09dc2010-12-09 19:21:51 +0900799 char s[inputLength + 1];
800 for (int i = 0; i < inputLength; ++i) s[i] = word[i];
801 s[inputLength] = 0;
802 LOGI("New missing space word found: %d > %d (%s), %d, %d",
803 newFreq, maxFreq, s, inputLength, depth);
satok787945b2011-07-14 08:32:57 +0900804#endif
satokaee09dc2010-12-09 19:21:51 +0900805 }
806 maxFreq = newFreq;
807 }
808 }
809 } else if (needsToTraverseChildrenNodes) {
810 // Traverse children nodes
811 ++depth;
812 mStackChildCount[depth] = count;
813 mStackSiblingPos[depth] = firstChildPos;
814 }
815 } else {
816 // Traverse parent node
817 --depth;
satok662fe692010-12-08 17:05:39 +0900818 }
819 }
satokaee09dc2010-12-09 19:21:51 +0900820
821 word[inputLength] = 0;
822 return maxFreq;
satok662fe692010-12-08 17:05:39 +0900823}
824
825inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
satokaee09dc2010-12-09 19:21:51 +0900826 const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
827 int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
828 const int inputIndex = startInputIndex + depth;
satok8fbd5522011-02-22 17:28:55 +0900829 const int *currentChars = getInputCharsAt(inputIndex);
satok662fe692010-12-08 17:05:39 +0900830 unsigned short c;
Jean Chalard293ece02011-06-16 20:55:16 +0900831 *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
832 &c, newChildPosition, newTerminal, newFreq);
satokaee09dc2010-12-09 19:21:51 +0900833 const unsigned int inputC = currentChars[0];
Ken Wakasade3070a2011-03-19 09:16:42 +0900834 if (DEBUG_DICT) {
835 assert(inputC <= U_SHORT_MAX);
836 }
Jean Chalardf5f834a2011-02-22 15:12:46 +0900837 const unsigned short baseLowerC = toBaseLowerCase(c);
838 const bool matched = (inputC == baseLowerC || inputC == c);
satokaee09dc2010-12-09 19:21:51 +0900839 const bool hasChild = *newChildPosition != 0;
840 if (matched) {
841 word[depth] = c;
842 if (DEBUG_DICT && DEBUG_NODE) {
843 LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
Ken Wakasade3070a2011-03-19 09:16:42 +0900844 if (*newTerminal) {
845 LOGI("Terminal %d", *newFreq);
846 }
satok662fe692010-12-08 17:05:39 +0900847 }
satokaee09dc2010-12-09 19:21:51 +0900848 if (hasChild) {
Jean Chalard293ece02011-06-16 20:55:16 +0900849 *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
satokaee09dc2010-12-09 19:21:51 +0900850 return true;
851 } else {
852 return false;
853 }
854 } else {
855 // If this node is not user typed character, this method treats this word as unmatched.
856 // Thus newTerminal shouldn't be true.
857 *newTerminal = false;
858 return false;
satok662fe692010-12-08 17:05:39 +0900859 }
satok662fe692010-12-08 17:05:39 +0900860}
Jean Chalard8124e642011-06-16 22:33:41 +0900861
862// TODO: use uint32_t instead of unsigned short
863bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
864 if (IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900865 return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900866 } else {
Jean Chalard581335c2011-06-17 12:45:17 +0900867 return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
Jean Chalard8124e642011-06-16 22:33:41 +0900868 }
869}
870
Jean Chalard17e44a72011-06-16 22:51:11 +0900871
872// Require strict exact match.
Jean Chalard581335c2011-06-17 12:45:17 +0900873int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
874 int length) const {
Jean Chalard8124e642011-06-16 22:33:41 +0900875 // returns address of bigram data of that word
876 // return -99 if not found
877
878 int count = Dictionary::getCount(DICT_ROOT, &pos);
879 unsigned short currentChar = (unsigned short) word[offset];
880 for (int j = 0; j < count; j++) {
881 unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
882 int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
883 int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
884 if (c == currentChar) {
885 if (offset == length - 1) {
886 if (terminal) {
887 return (pos+1);
888 }
889 } else {
890 if (childPos != 0) {
Jean Chalard581335c2011-06-17 12:45:17 +0900891 int t = getBigramPosition(childPos, word, offset + 1, length);
Jean Chalard8124e642011-06-16 22:33:41 +0900892 if (t > 0) {
893 return t;
894 }
895 }
896 }
897 }
898 if (terminal) {
899 Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
900 }
901 // There could be two instances of each alphabet - upper and lower case. So continue
902 // looking ...
903 }
904 return NOT_VALID_WORD;
905}
906
Jean Chalardbc90c722011-06-20 21:09:04 +0900907// The following functions will be modified.
Jean Chalard0584f022011-06-30 19:23:16 +0900908inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
909 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
910 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalardbc90c722011-06-20 21:09:04 +0900911 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
912 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
913 int *nextSiblingPosition, int *nextOutputIndex) {
914 if (DEBUG_DICT) {
915 int inputCount = 0;
916 if (skipPos >= 0) ++inputCount;
917 if (excessivePos >= 0) ++inputCount;
918 if (transposedPos >= 0) ++inputCount;
919 assert(inputCount <= 1);
920 }
921 unsigned short c;
922 int childPosition;
923 bool terminal;
924 int freq;
925 bool isSameAsUserTypedLength = false;
926
Jean Chalard0584f022011-06-30 19:23:16 +0900927 const int pos = initialPos;
928 const int depth = initialDepth;
929 const int traverseAllNodes = initialTraverseAllNodes;
930 const int diffs = initialDiffs;
931
Jean Chalardbc90c722011-06-20 21:09:04 +0900932 const uint8_t flags = 0; // No flags for now
933
934 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
935
936 *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
937 &c, &childPosition, &terminal, &freq);
938 *nextOutputIndex = depth + 1;
939
940 const bool needsToTraverseChildrenNodes = childPosition != 0;
941
942 // If we are only doing traverseAllNodes, no need to look at the typed characters.
943 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
944 mWord[depth] = c;
945 if (traverseAllNodes && terminal) {
946 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
947 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
948 }
949 if (!needsToTraverseChildrenNodes) return false;
950 *newTraverseAllNodes = traverseAllNodes;
951 *newMatchRate = matchWeight;
952 *newDiffs = diffs;
953 *newInputIndex = inputIndex;
954 } else {
955 const int *currentChars = getInputCharsAt(inputIndex);
956
957 if (transposedPos >= 0) {
958 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
959 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
960 }
961
962 int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
963 transposedPos);
964 if (UNRELATED_CHAR == matchedProximityCharId) return false;
965 mWord[depth] = c;
966 // If inputIndex is greater than mInputLength, that means there is no
967 // proximity chars. So, we don't need to check proximity.
968 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
969 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
970 }
971 bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
972 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
973 if (isSameAsUserTypedLength && terminal) {
974 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
975 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
976 }
977 if (!needsToTraverseChildrenNodes) return false;
978 // Start traversing all nodes after the index exceeds the user typed length
979 *newTraverseAllNodes = isSameAsUserTypedLength;
980 *newMatchRate = matchWeight;
981 *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
982 *newInputIndex = inputIndex + 1;
983 }
984 // Optimization: Prune out words that are too long compared to how much was typed.
985 if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
986 return false;
987 }
988
989 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
990 // TODO: Check if this can be isSameAsUserTypedLength only.
991 if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
992 *newTraverseAllNodes = true;
993 }
994 // get the count of nodes and increment childAddress.
995 *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
996 *newChildPosition = childPosition;
997 if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
998 return needsToTraverseChildrenNodes;
999}
1000
1001#else // NEW_DICTIONARY_FORMAT
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001002
Jean Chalard1059f272011-06-28 20:45:05 +09001003// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
1004// interface.
1005inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
1006 const int inputLength, unsigned short *word) {
1007 uint16_t inWord[inputLength];
1008
1009 for (int i = 0; i < inputLength; ++i) {
1010 inWord[i] = *getInputCharsAt(startInputIndex + i);
1011 }
1012 return getMostFrequentWordLikeInner(inWord, inputLength, word);
1013}
1014
1015// This function will take the position of a character array within a CharGroup,
1016// and check it actually like-matches the word in inWord starting at startInputIndex,
1017// that is, it matches it with case and accents squashed.
1018// The function returns true if there was a full match, false otherwise.
1019// The function will copy on-the-fly the characters in the CharGroup to outNewWord.
1020// It will also place the end position of the array in outPos; in outInputIndex,
1021// it will place the index of the first char AFTER the match if there was a match,
1022// and the initial position if there was not. It makes sense because if there was
1023// a match we want to continue searching, but if there was not, we want to go to
1024// the next CharGroup.
1025// In and out parameters may point to the same location. This function takes care
1026// not to use any input parameters after it wrote into its outputs.
1027static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
1028 const uint8_t* const root, const int startPos,
1029 const uint16_t* const inWord, const int startInputIndex,
1030 int32_t* outNewWord, int* outInputIndex, int* outPos) {
1031 const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
1032 int pos = startPos;
1033 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1034 int32_t baseChar = toBaseLowerCase(character);
1035 const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);
1036
1037 if (baseChar != wChar) {
1038 *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
1039 *outInputIndex = startInputIndex;
1040 return false;
1041 }
1042 int inputIndex = startInputIndex;
1043 outNewWord[inputIndex] = character;
1044 if (hasMultipleChars) {
1045 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1046 while (NOT_A_CHARACTER != character) {
1047 baseChar = toBaseLowerCase(character);
1048 if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
1049 *outPos = BinaryFormat::skipOtherCharacters(root, pos);
1050 *outInputIndex = startInputIndex;
1051 return false;
1052 }
1053 outNewWord[inputIndex] = character;
1054 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1055 }
1056 }
1057 *outInputIndex = inputIndex + 1;
1058 *outPos = pos;
1059 return true;
1060}
1061
1062// This function is invoked when a word like the word searched for is found.
1063// It will compare the frequency to the max frequency, and if greater, will
1064// copy the word into the output buffer. In output value maxFreq, it will
1065// write the new maximum frequency if it changed.
1066static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,
1067 short unsigned int* outWord, int* maxFreq) {
1068 if (freq > *maxFreq) {
1069 for (int q = 0; q < length; ++q)
1070 outWord[q] = newWord[q];
1071 outWord[length] = 0;
1072 *maxFreq = freq;
1073 }
1074}
1075
1076// Will find the highest frequency of the words like the one passed as an argument,
1077// that is, everything that only differs by case/accents.
1078int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,
1079 const int length, short unsigned int* outWord) {
1080 int32_t newWord[MAX_WORD_LENGTH_INTERNAL];
1081 int depth = 0;
1082 int maxFreq = -1;
1083 const uint8_t* const root = DICT_ROOT;
1084
1085 mStackChildCount[0] = root[0];
1086 mStackInputIndex[0] = 0;
1087 mStackSiblingPos[0] = 1;
1088 while (depth >= 0) {
1089 const int charGroupCount = mStackChildCount[depth];
1090 int pos = mStackSiblingPos[depth];
1091 for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
1092 int inputIndex = mStackInputIndex[depth];
1093 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
1094 // Test whether all chars in this group match with the word we are searching for. If so,
1095 // we want to traverse its children (or if the length match, evaluate its frequency).
1096 // Note that this function will output the position regardless, but will only write
1097 // into inputIndex if there is a match.
1098 const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
1099 inputIndex, newWord, &inputIndex, &pos);
1100 if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
1101 const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
1102 onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
1103 }
1104 pos = BinaryFormat::skipFrequency(flags, pos);
1105 const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
1106 const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
1107 // If we had a match and the word has children, we want to traverse them. We don't have
1108 // to traverse words longer than the one we are searching for, since they will not match
1109 // anyway, so don't traverse unless inputIndex < length.
1110 if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {
1111 // Save position for this depth, to get back to this once children are done
1112 mStackChildCount[depth] = charGroupIndex;
1113 mStackSiblingPos[depth] = siblingPos;
1114 // Prepare stack values for next depth
1115 ++depth;
1116 int childrenPos = childrenNodePos;
1117 mStackChildCount[depth] =
1118 BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
1119 mStackSiblingPos[depth] = childrenPos;
1120 mStackInputIndex[depth] = inputIndex;
1121 pos = childrenPos;
1122 // Go to the next depth level.
1123 ++depth;
1124 break;
1125 } else {
1126 // No match, or no children, or word too long to ever match: go the next sibling.
1127 pos = siblingPos;
1128 }
1129 }
1130 --depth;
1131 }
1132 return maxFreq;
1133}
1134
1135// This function gets the frequency of the exact matching word in the dictionary.
1136// If no match is found, it returns -1.
1137int UnigramDictionary::getFrequency(const uint16_t* const inWord, const int length) const {
1138 int pos = 0;
1139 int wordPos = 0;
1140 const uint8_t* const root = DICT_ROOT;
1141
1142 while (true) {
1143 // If we already traversed the tree further than the word is long, there means
1144 // there was no match (or we would have found it).
1145 if (wordPos > length) return -1;
1146 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
1147 const uint16_t wChar = inWord[wordPos];
1148 while (true) {
1149 // If there are no more character groups in this node, it means we could not
1150 // find a matching character for this depth, therefore there is no match.
1151 if (0 >= charGroupCount) return -1;
1152 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
1153 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1154 if (character == wChar) {
1155 // This is the correct node. Only one character group may start with the same
1156 // char within a node, so either we found our match in this node, or there is
1157 // no match and we can return -1. So we will check all the characters in this
1158 // character group indeed does match.
1159 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
1160 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1161 while (NOT_A_CHARACTER != character) {
1162 ++wordPos;
1163 // If we shoot the length of the word we search for, or if we find a single
1164 // character that does not match, as explained above, it means the word is
1165 // not in the dictionary (by virtue of this chargroup being the only one to
1166 // match the word on the first character, but not matching the whole word).
1167 if (wordPos > length) return -1;
1168 if (inWord[wordPos] != character) return -1;
1169 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
1170 }
1171 }
1172 // If we come here we know that so far, we do match. Either we are on a terminal
1173 // and we match the length, in which case we found it, or we traverse children.
1174 // If we don't match the length AND don't have children, then a word in the
1175 // dictionary fully matches a prefix of the searched word but not the full word.
1176 ++wordPos;
1177 if (FLAG_IS_TERMINAL & flags) {
1178 if (wordPos == length) {
1179 return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
1180 }
1181 pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
1182 }
1183 if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags))
1184 return -1;
1185 // We have children and we are still shorter than the word we are searching for, so
1186 // we need to traverse children. Put the pointer on the children position, and
1187 // break
1188 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
1189 break;
1190 } else {
1191 // This chargroup does not match, so skip the remaining part and go to the next.
1192 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
1193 pos = BinaryFormat::skipOtherCharacters(root, pos);
1194 }
1195 pos = BinaryFormat::skipFrequency(flags, pos);
1196 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
1197 }
1198 --charGroupCount;
1199 }
1200 }
1201}
1202
1203bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
1204 return -1 != getFrequency(inWord, length);
1205}
1206
1207int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,
1208 unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
1209 int maxAlternatives) {
1210 // TODO: add implementation.
1211 return 0;
1212}
1213
1214// TODO: remove this function.
1215int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
1216 int length) const {
1217 return -1;
1218}
1219
1220// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
1221// If the return value is false, then the caller should read in the output "nextSiblingPosition"
1222// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
1223// It is worthy to note that when false is returned, the output values other than
1224// nextSiblingPosition are undefined.
1225// If the return value is true, then the caller must proceed to traverse the children of this
1226// node. processCurrentNode will output the information about the children: their count in
1227// newCount, their position in newChildrenPosition, the traverseAllNodes flag in
1228// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
1229// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
1230// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
1231// there aren't any more nodes at this level, it merely returns the address of the first byte after
1232// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
1233// given level, as output into newCount when traversing this level's parent.
Jean Chalard0584f022011-06-30 19:23:16 +09001234inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
1235 const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
1236 const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
Jean Chalard1059f272011-06-28 20:45:05 +09001237 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001238 bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
Jean Chalard432789a2011-06-30 17:50:48 +09001239 int *nextSiblingPosition, int *newOutputIndex) {
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001240 if (DEBUG_DICT) {
1241 int inputCount = 0;
1242 if (skipPos >= 0) ++inputCount;
1243 if (excessivePos >= 0) ++inputCount;
1244 if (transposedPos >= 0) ++inputCount;
1245 assert(inputCount <= 1);
1246 }
Jean Chalard0584f022011-06-30 19:23:16 +09001247 int pos = initialPos;
1248 int depth = initialDepth;
1249 int traverseAllNodes = initialTraverseAllNodes;
1250 int diffs = initialDiffs;
1251
Jean Chalard1059f272011-06-28 20:45:05 +09001252 // Flags contain the following information:
1253 // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
1254 // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
1255 // is on the specified number of bytes.
1256 // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
1257 // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
1258 // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
1259 // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
1260 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
1261 const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001262
Jean Chalard1059f272011-06-28 20:45:05 +09001263 // This gets only ONE character from the stream. Next there will be:
1264 // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
1265 // else if FLAG_IS_TERMINAL: the frequency
1266 // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
1267 // Note that you can't have a node that both is not a terminal and has no children.
1268 int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
1269 assert(NOT_A_CHARACTER != c);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001270
Jean Chalard1059f272011-06-28 20:45:05 +09001271 // We are going to loop through each character and make it look like it's a different
1272 // node each time. To do that, we will process characters in this node in order until
1273 // we find the character terminator. This is signalled by getCharCode* returning
1274 // NOT_A_CHARACTER.
1275 // As a special case, if there is only one character in this node, we must not read the
1276 // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.
1277 // This way, each loop run will look like a "virtual node".
1278 do {
1279 // We prefetch the next char. If 'c' is the last char of this node, we will have
1280 // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node
1281 // should behave as a terminal or not and whether we have children.
1282 const int32_t nextc = hasMultipleChars
1283 ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;
1284 const bool isLastChar = (NOT_A_CHARACTER == nextc);
1285 // If there are more chars in this nodes, then this virtual node is not a terminal.
1286 // If we are on the last char, this virtual node is a terminal if this node is.
1287 const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags));
1288 // If there are more chars in this node, then this virtual node has children.
1289 // If we are on the last char, this virtual node has children if this node has.
1290 const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001291
Jean Chalard1059f272011-06-28 20:45:05 +09001292 // This has to be done for each virtual char (this forwards the "inputIndex" which
1293 // is the index in the user-inputted chars, as read by getInputCharsAt.
1294 if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
1295 if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
1296 mWord[depth] = c;
1297 if (traverseAllNodes && isTerminal) {
1298 // The frequency should be here, because we come here only if this is actually
1299 // a terminal node, and we are on its last char.
1300 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
1301 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1302 excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
1303 }
1304 if (!hasChildren) {
1305 // If we don't have children here, that means we finished processing all
1306 // characters of this node (we are on the last virtual node), AND we are in
1307 // traverseAllNodes mode, which means we are searching for *completions*. We
1308 // should skip the frequency if we have a terminal, and report the position
1309 // of the next sibling. We don't have to return other values because we are
1310 // returning false, as in "don't traverse children".
1311 if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos);
1312 *nextSiblingPosition =
1313 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1314 return false;
1315 }
1316 } else {
1317 const int *currentChars = getInputCharsAt(inputIndex);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001318
Jean Chalard1059f272011-06-28 20:45:05 +09001319 if (transposedPos >= 0) {
1320 if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
1321 if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
1322 }
1323
1324 const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos,
1325 excessivePos, transposedPos);
1326 if (UNRELATED_CHAR == matchedProximityCharId) {
1327 // We found that this is an unrelated character, so we should give up traversing
1328 // this node and its children entirely.
1329 // However we may not be on the last virtual node yet so we skip the remaining
1330 // characters in this node, the frequency if it's there, read the next sibling
1331 // position to output it, then return false.
1332 // We don't have to output other values because we return false, as in
1333 // "don't traverse children".
1334 if (!isLastChar) {
1335 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
1336 }
1337 pos = BinaryFormat::skipFrequency(flags, pos);
1338 *nextSiblingPosition =
1339 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1340 return false;
1341 }
1342 mWord[depth] = c;
1343 // If inputIndex is greater than mInputLength, that means there is no
1344 // proximity chars. So, we don't need to check proximity.
1345 if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
1346 multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
1347 }
1348 const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
1349 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
1350 if (isSameAsUserTypedLength && isTerminal) {
1351 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
1352 onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
1353 excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
1354 }
1355 // This character matched the typed character (enough to traverse the node at least)
1356 // so we just evaluated it. Now we should evaluate this virtual node's children - that
1357 // is, if it has any. If it has no children, we're done here - so we skip the end of
1358 // the node, output the siblings position, and return false "don't traverse children".
1359 // Note that !hasChildren implies isLastChar, so we know we don't have to skip any
1360 // remaining char in this group for there can't be any.
1361 if (!hasChildren) {
1362 pos = BinaryFormat::skipFrequency(flags, pos);
1363 *nextSiblingPosition =
1364 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1365 return false;
1366 }
1367 // Start traversing all nodes after the index exceeds the user typed length
1368 traverseAllNodes = isSameAsUserTypedLength;
1369 diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
1370 // Finally, we are ready to go to the next character, the next "virtual node".
1371 // We should advance the input index.
1372 // We do this in this branch of the 'if traverseAllNodes' because we are still matching
1373 // characters to input; the other branch is not matching them but searching for
1374 // completions, this is why it does not have to do it.
1375 ++inputIndex;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001376 }
Jean Chalard1059f272011-06-28 20:45:05 +09001377 // Optimization: Prune out words that are too long compared to how much was typed.
1378 if (depth >= maxDepth || diffs > mMaxEditDistance) {
1379 // We are giving up parsing this node and its children. Skip the rest of the node,
1380 // output the sibling position, and return that we don't want to traverse children.
1381 if (!isLastChar) {
1382 pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
1383 }
1384 pos = BinaryFormat::skipFrequency(flags, pos);
1385 *nextSiblingPosition =
1386 BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1387 return false;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001388 }
1389
Jean Chalard1059f272011-06-28 20:45:05 +09001390 // Prepare for the next character. Promote the prefetched char to current char - the loop
1391 // will take care of prefetching the next. If we finally found our last char, nextc will
1392 // contain NOT_A_CHARACTER.
1393 c = nextc;
1394 // Also, the next char is one "virtual node" depth more than this char.
1395 ++depth;
1396 } while (NOT_A_CHARACTER != c);
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001397
1398 // If inputIndex is greater than mInputLength, that means there are no proximity chars.
Jean Chalard1059f272011-06-28 20:45:05 +09001399 // Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength.
1400 if (mInputLength <= *newInputIndex) {
1401 traverseAllNodes = true;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001402 }
Jean Chalard1059f272011-06-28 20:45:05 +09001403
1404 // All the output values that are purely computation by this function are held in local
1405 // variables. Output them to the caller.
1406 *newTraverseAllNodes = traverseAllNodes;
1407 *newMatchRate = matchWeight;
1408 *newDiffs = diffs;
1409 *newInputIndex = inputIndex;
1410 *newOutputIndex = depth;
1411
1412 // Now we finished processing this node, and we want to traverse children. If there are no
1413 // children, we can't come here.
1414 assert(BinaryFormat::hasChildrenInFlags(flags));
1415
1416 // If this node was a terminal it still has the frequency under the pointer (it may have been
1417 // read, but not skipped - see readFrequencyWithoutMovingPointer).
1418 // Next come the children position, then possibly attributes (attributes are bigrams only for
1419 // now, maybe something related to shortcuts in the future).
1420 // Once this is read, we still need to output the number of nodes in the immediate children of
1421 // this node, so we read and output it before returning true, as in "please traverse children".
1422 pos = BinaryFormat::skipFrequency(flags, pos);
1423 int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
1424 *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
1425 *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
1426 *newChildrenPosition = childrenPos;
1427 return true;
Jean Chalard85a1d1e2011-06-21 22:23:21 +09001428}
1429
Jean Chalardbc90c722011-06-20 21:09:04 +09001430#endif // NEW_DICTIONARY_FORMAT
1431
satok30088252010-12-01 21:22:15 +09001432} // namespace latinime