blob: b20dc11731bb029037471071db9db97bbf439a4e [file] [log] [blame]
The Android Open Source Project923bf412009-03-13 15:11:42 -07001/*
2**
3** Copyright 2009, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
18#include <stdio.h>
19#include <fcntl.h>
20#include <sys/mman.h>
21#include <string.h>
satok15dc33d2010-12-01 15:37:31 +090022
23#ifdef FLAG_DBG
24#define LOG_TAG "LatinIME: dictionary.cpp"
25#include <cutils/log.h>
26#define DEBUG_DICT 1
27#else // FLAG_DBG
Ken Wakasa826269c2010-04-27 10:28:14 +090028#define LOGI
satok15dc33d2010-12-01 15:37:31 +090029#define DEBUG_DICT 0
30#endif // FLAG_DBG
The Android Open Source Project923bf412009-03-13 15:11:42 -070031
32#include "dictionary.h"
33#include "basechars.h"
Ken Wakasa707505e2010-04-21 02:35:47 +090034#include "char_utils.h"
The Android Open Source Project923bf412009-03-13 15:11:42 -070035
Jae Yong Sung937d5ad2010-06-30 20:28:04 -070036#define DICTIONARY_VERSION_MIN 200
37#define DICTIONARY_HEADER_SIZE 2
38#define NOT_VALID_WORD -99
The Android Open Source Project923bf412009-03-13 15:11:42 -070039
satokd4952c82010-12-01 19:09:29 +090040#define SUGGEST_MISSING_CHARACTERS true
41#define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5
42
43
The Android Open Source Project923bf412009-03-13 15:11:42 -070044namespace latinime {
45
46Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
47{
satok15dc33d2010-12-01 15:37:31 +090048 LOGI("Dictionary - constructor");
The Android Open Source Project923bf412009-03-13 15:11:42 -070049 mDict = (unsigned char*) dict;
50 mTypedLetterMultiplier = typedLetterMultiplier;
51 mFullWordMultiplier = fullWordMultiplier;
Jae Yong Sung937d5ad2010-06-30 20:28:04 -070052 getVersionNumber();
The Android Open Source Project923bf412009-03-13 15:11:42 -070053}
54
55Dictionary::~Dictionary()
56{
57}
58
59int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
Amith Yamasani1b62ff12010-02-05 14:07:04 -080060 int maxWordLength, int maxWords, int maxAlternatives, int skipPos,
61 int *nextLetters, int nextLettersSize)
The Android Open Source Project923bf412009-03-13 15:11:42 -070062{
satokd4952c82010-12-01 19:09:29 +090063
64 initSuggestions(codes, codesSize, outWords, frequencies, maxWordLength, maxWords,
65 maxAlternatives);
66
67 int suggestedWordsCount = getSuggestionCandidates(codesSize, maxWords, skipPos, nextLetters,
68 nextLettersSize);
69
70 // If there aren't sufficient suggestions, search for words by allowing wild cards at
71 // the different character positions. This feature is not ready for prime-time as we need
72 // to figure out the best ranking for such words compared to proximity corrections and
73 // completions.
74 if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
75 for (int i = 0; i < codesSize; ++i) {
76 int tempCount = getSuggestionCandidates(codesSize, maxWords, i, NULL, 0);
77 if (tempCount > suggestedWordsCount) {
78 suggestedWordsCount = tempCount;
79 break;
80 }
81 }
82 }
83
84 if (DEBUG_DICT) {
85 LOGI("Returning %d words", suggestedWordsCount);
86 LOGI("Next letters: ");
87 for (int k = 0; k < nextLettersSize; k++) {
88 if (nextLetters[k] > 0) {
89 LOGI("%c = %d,", k, nextLetters[k]);
90 }
91 }
92 LOGI("\n");
93 }
94 return suggestedWordsCount;
95}
96
97void Dictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,
98 int *frequencies, int maxWordLength, int maxWords, int maxAlternatives) {
The Android Open Source Project923bf412009-03-13 15:11:42 -070099 mFrequencies = frequencies;
100 mOutputChars = outWords;
101 mInputCodes = codes;
102 mInputLength = codesSize;
103 mMaxAlternatives = maxAlternatives;
104 mMaxWordLength = maxWordLength;
105 mMaxWords = maxWords;
Amith Yamasani322dc3d2009-07-15 18:30:47 -0700106 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
satokd4952c82010-12-01 19:09:29 +0900107}
The Android Open Source Project923bf412009-03-13 15:11:42 -0700108
satokd4952c82010-12-01 19:09:29 +0900109int Dictionary::getSuggestionCandidates(int inputLength, int maxWords, int skipPos,
110 int *nextLetters, int nextLettersSize) {
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700111 if (checkIfDictVersionIsLatest()) {
satokd4952c82010-12-01 19:09:29 +0900112 getWordsRec(DICTIONARY_HEADER_SIZE, 0, inputLength * 3, false, 1, 0, 0, skipPos,
113 nextLetters, nextLettersSize);
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700114 } else {
satokd4952c82010-12-01 19:09:29 +0900115 getWordsRec(0, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700116 }
The Android Open Source Project923bf412009-03-13 15:11:42 -0700117
Amith Yamasanid0e43ec2009-10-14 16:10:32 -0700118 // Get the word count
satokd4952c82010-12-01 19:09:29 +0900119 int suggestedWordsCount = 0;
120 while (suggestedWordsCount < maxWords && mFrequencies[suggestedWordsCount] > 0) {
121 suggestedWordsCount++;
Amith Yamasani1b62ff12010-02-05 14:07:04 -0800122 }
satokd4952c82010-12-01 19:09:29 +0900123 return suggestedWordsCount;
The Android Open Source Project923bf412009-03-13 15:11:42 -0700124}
125
satokd4952c82010-12-01 19:09:29 +0900126void Dictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
127 if (c < nextLettersSize) {
128 nextLetters[c]++;
Amith Yamasani1b62ff12010-02-05 14:07:04 -0800129 }
130}
131
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700132void
133Dictionary::getVersionNumber()
134{
135 mVersion = (mDict[0] & 0xFF);
136 mBigram = (mDict[1] & 0xFF);
137 LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
138}
139
140// Checks whether it has the latest dictionary or the old dictionary
141bool
142Dictionary::checkIfDictVersionIsLatest()
143{
144 return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 || mBigram == 0);
145}
146
The Android Open Source Project923bf412009-03-13 15:11:42 -0700147unsigned short
148Dictionary::getChar(int *pos)
149{
150 unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
151 // If the code is 255, then actual 16 bit code follows (in big endian)
152 if (ch == 0xFF) {
153 ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
154 (*pos) += 2;
155 }
156 return ch;
157}
158
159int
160Dictionary::getAddress(int *pos)
161{
162 int address = 0;
Amith Yamasanicc3e5c72009-03-31 10:51:17 -0700163 if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
164 *pos += 1;
165 } else {
166 address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
167 address += (mDict[*pos + 1] & 0xFF) << 8;
168 address += (mDict[*pos + 2] & 0xFF);
169 *pos += 3;
170 }
The Android Open Source Project923bf412009-03-13 15:11:42 -0700171 return address;
172}
173
174int
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700175Dictionary::getFreq(int *pos)
176{
177 int freq = mDict[(*pos)++] & 0xFF;
178
179 if (checkIfDictVersionIsLatest()) {
180 // skipping bigram
181 int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ);
182 if (bigramExist > 0) {
183 int nextBigramExist = 1;
184 while (nextBigramExist > 0) {
185 (*pos) += 3;
186 nextBigramExist = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
187 }
188 } else {
189 (*pos)++;
190 }
191 }
192
193 return freq;
194}
195
196int
The Android Open Source Project923bf412009-03-13 15:11:42 -0700197Dictionary::wideStrLen(unsigned short *str)
198{
199 if (!str) return 0;
200 unsigned short *end = str;
201 while (*end)
202 end++;
203 return end - str;
204}
205
206bool
207Dictionary::addWord(unsigned short *word, int length, int frequency)
208{
209 word[length] = 0;
Amith Yamasani322dc3d2009-07-15 18:30:47 -0700210 if (DEBUG_DICT) {
211 char s[length + 1];
212 for (int i = 0; i <= length; i++) s[i] = word[i];
213 LOGI("Found word = %s, freq = %d : \n", s, frequency);
214 }
The Android Open Source Project923bf412009-03-13 15:11:42 -0700215
216 // Find the right insertion point
217 int insertAt = 0;
218 while (insertAt < mMaxWords) {
219 if (frequency > mFrequencies[insertAt]
220 || (mFrequencies[insertAt] == frequency
221 && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
222 break;
223 }
224 insertAt++;
225 }
226 if (insertAt < mMaxWords) {
227 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
228 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
229 (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
230 mFrequencies[insertAt] = frequency;
231 memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
232 (char*) mOutputChars + (insertAt ) * mMaxWordLength * sizeof(short),
233 (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
234 unsigned short *dest = mOutputChars + (insertAt ) * mMaxWordLength;
235 while (length--) {
236 *dest++ = *word++;
237 }
238 *dest = 0; // NULL terminate
The Android Open Source Project923bf412009-03-13 15:11:42 -0700239 if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
240 return true;
241 }
242 return false;
243}
244
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700245bool
246Dictionary::addWordBigram(unsigned short *word, int length, int frequency)
247{
248 word[length] = 0;
249 if (DEBUG_DICT) {
250 char s[length + 1];
251 for (int i = 0; i <= length; i++) s[i] = word[i];
252 LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
253 }
254
255 // Find the right insertion point
256 int insertAt = 0;
257 while (insertAt < mMaxBigrams) {
258 if (frequency > mBigramFreq[insertAt]
259 || (mBigramFreq[insertAt] == frequency
260 && length < wideStrLen(mBigramChars + insertAt * mMaxWordLength))) {
261 break;
262 }
263 insertAt++;
264 }
265 LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
266 if (insertAt < mMaxBigrams) {
267 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
268 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
269 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
270 mBigramFreq[insertAt] = frequency;
271 memmove((char*) mBigramChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
272 (char*) mBigramChars + (insertAt ) * mMaxWordLength * sizeof(short),
273 (mMaxBigrams - insertAt - 1) * sizeof(short) * mMaxWordLength);
274 unsigned short *dest = mBigramChars + (insertAt ) * mMaxWordLength;
275 while (length--) {
276 *dest++ = *word++;
277 }
278 *dest = 0; // NULL terminate
279 if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
280 return true;
281 }
282 return false;
283}
284
The Android Open Source Project923bf412009-03-13 15:11:42 -0700285unsigned short
Amith Yamasanif1150882009-08-07 14:04:24 -0700286Dictionary::toLowerCase(unsigned short c) {
The Android Open Source Project923bf412009-03-13 15:11:42 -0700287 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
288 c = BASE_CHARS[c];
289 }
Amith Yamasanif1150882009-08-07 14:04:24 -0700290 if (c >='A' && c <= 'Z') {
291 c |= 32;
292 } else if (c > 127) {
Ken Wakasa707505e2010-04-21 02:35:47 +0900293 c = latin_tolower(c);
The Android Open Source Project923bf412009-03-13 15:11:42 -0700294 }
295 return c;
296}
297
298bool
299Dictionary::sameAsTyped(unsigned short *word, int length)
300{
301 if (length != mInputLength) {
302 return false;
303 }
304 int *inputCodes = mInputCodes;
305 while (length--) {
306 if ((unsigned int) *inputCodes != (unsigned int) *word) {
307 return false;
308 }
309 inputCodes += mMaxAlternatives;
310 word++;
311 }
312 return true;
313}
314
315static char QUOTE = '\'';
316
317void
Amith Yamasani322dc3d2009-07-15 18:30:47 -0700318Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
satokd4952c82010-12-01 19:09:29 +0900319 int diffs, int skipPos, int *nextLetters, int nextLettersSize)
The Android Open Source Project923bf412009-03-13 15:11:42 -0700320{
321 // Optimization: Prune out words that are too long compared to how much was typed.
322 if (depth > maxDepth) {
323 return;
324 }
Amith Yamasani322dc3d2009-07-15 18:30:47 -0700325 if (diffs > mMaxEditDistance) {
326 return;
327 }
The Android Open Source Project923bf412009-03-13 15:11:42 -0700328 int count = getCount(&pos);
329 int *currentChars = NULL;
330 if (mInputLength <= inputIndex) {
331 completion = true;
332 } else {
333 currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
334 }
335
336 for (int i = 0; i < count; i++) {
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700337 // -- at char
The Android Open Source Project923bf412009-03-13 15:11:42 -0700338 unsigned short c = getChar(&pos);
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700339 // -- at flag/add
Amith Yamasanif1150882009-08-07 14:04:24 -0700340 unsigned short lowerC = toLowerCase(c);
The Android Open Source Project923bf412009-03-13 15:11:42 -0700341 bool terminal = getTerminal(&pos);
342 int childrenAddress = getAddress(&pos);
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700343 // -- after address or flag
Amith Yamasanicc3e5c72009-03-31 10:51:17 -0700344 int freq = 1;
345 if (terminal) freq = getFreq(&pos);
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700346 // -- after add or freq
347
The Android Open Source Project923bf412009-03-13 15:11:42 -0700348 // If we are only doing completions, no need to look at the typed characters.
349 if (completion) {
350 mWord[depth] = c;
351 if (terminal) {
352 addWord(mWord, depth + 1, freq * snr);
satokd4952c82010-12-01 19:09:29 +0900353 if (depth >= mInputLength && skipPos < 0) {
354 registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
Amith Yamasani1b62ff12010-02-05 14:07:04 -0800355 }
The Android Open Source Project923bf412009-03-13 15:11:42 -0700356 }
357 if (childrenAddress != 0) {
satokd4952c82010-12-01 19:09:29 +0900358 getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
359 diffs, skipPos, nextLetters, nextLettersSize);
The Android Open Source Project923bf412009-03-13 15:11:42 -0700360 }
satokd4952c82010-12-01 19:09:29 +0900361 } else if ((c == QUOTE && currentChars[0] != QUOTE) || skipPos == depth) {
Amith Yamasanic3df2d62009-06-04 12:20:45 -0700362 // Skip the ' or other letter and continue deeper
363 mWord[depth] = c;
The Android Open Source Project923bf412009-03-13 15:11:42 -0700364 if (childrenAddress != 0) {
satokd4952c82010-12-01 19:09:29 +0900365 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
366 skipPos, nextLetters, nextLettersSize);
The Android Open Source Project923bf412009-03-13 15:11:42 -0700367 }
368 } else {
369 int j = 0;
370 while (currentChars[j] > 0) {
The Android Open Source Project923bf412009-03-13 15:11:42 -0700371 if (currentChars[j] == lowerC || currentChars[j] == c) {
Amith Yamasani322dc3d2009-07-15 18:30:47 -0700372 int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
The Android Open Source Project923bf412009-03-13 15:11:42 -0700373 mWord[depth] = c;
374 if (mInputLength == inputIndex + 1) {
375 if (terminal) {
376 if (//INCLUDE_TYPED_WORD_IF_VALID ||
377 !sameAsTyped(mWord, depth + 1)) {
Amith Yamasanif51d16a2009-08-10 17:22:39 -0700378 int finalFreq = freq * snr * addedWeight;
satokd4952c82010-12-01 19:09:29 +0900379 if (skipPos < 0) finalFreq *= mFullWordMultiplier;
Amith Yamasanif51d16a2009-08-10 17:22:39 -0700380 addWord(mWord, depth + 1, finalFreq);
The Android Open Source Project923bf412009-03-13 15:11:42 -0700381 }
382 }
383 if (childrenAddress != 0) {
384 getWordsRec(childrenAddress, depth + 1,
Amith Yamasani322dc3d2009-07-15 18:30:47 -0700385 maxDepth, true, snr * addedWeight, inputIndex + 1,
satokd4952c82010-12-01 19:09:29 +0900386 diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
The Android Open Source Project923bf412009-03-13 15:11:42 -0700387 }
388 } else if (childrenAddress != 0) {
389 getWordsRec(childrenAddress, depth + 1, maxDepth,
satokd4952c82010-12-01 19:09:29 +0900390 false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
391 skipPos, nextLetters, nextLettersSize);
The Android Open Source Project923bf412009-03-13 15:11:42 -0700392 }
393 }
394 j++;
satokd4952c82010-12-01 19:09:29 +0900395 if (skipPos >= 0) break;
The Android Open Source Project923bf412009-03-13 15:11:42 -0700396 }
397 }
398 }
399}
400
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700401int
402Dictionary::getBigramAddress(int *pos, bool advance)
The Android Open Source Project923bf412009-03-13 15:11:42 -0700403{
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700404 int address = 0;
405
406 address += (mDict[*pos] & 0x3F) << 16;
407 address += (mDict[*pos + 1] & 0xFF) << 8;
408 address += (mDict[*pos + 2] & 0xFF);
409
410 if (advance) {
411 *pos += 3;
412 }
413
414 return address;
415}
416
417int
418Dictionary::getBigramFreq(int *pos)
419{
420 int freq = mDict[(*pos)++] & FLAG_BIGRAM_FREQ;
421
422 return freq;
423}
424
425
426int
Jae Yong Sung80aa14f2010-07-26 11:43:29 -0700427Dictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
428 unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
429 int maxAlternatives)
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700430{
431 mBigramFreq = bigramFreq;
432 mBigramChars = bigramChars;
Jae Yong Sung80aa14f2010-07-26 11:43:29 -0700433 mInputCodes = codes;
434 mInputLength = codesSize;
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700435 mMaxWordLength = maxWordLength;
436 mMaxBigrams = maxBigrams;
Jae Yong Sung80aa14f2010-07-26 11:43:29 -0700437 mMaxAlternatives = maxAlternatives;
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700438
439 if (mBigram == 1 && checkIfDictVersionIsLatest()) {
440 int pos = isValidWordRec(DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
441 LOGI("Pos -> %d\n", pos);
442 if (pos < 0) {
443 return 0;
444 }
445
446 int bigramCount = 0;
447 int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
448 if (bigramExist > 0) {
449 int nextBigramExist = 1;
Jae Yong Sung80aa14f2010-07-26 11:43:29 -0700450 while (nextBigramExist > 0 && bigramCount < maxBigrams) {
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700451 int bigramAddress = getBigramAddress(&pos, true);
452 int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
453 // search for all bigrams and store them
454 searchForTerminalNode(bigramAddress, frequency);
455 nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
456 bigramCount++;
457 }
458 }
459
460 return bigramCount;
461 }
462 return 0;
463}
464
465void
466Dictionary::searchForTerminalNode(int addressLookingFor, int frequency)
467{
468 // track word with such address and store it in an array
469 unsigned short word[mMaxWordLength];
470
471 int pos;
472 int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
473 bool found = false;
474 char followingChar = ' ';
475 int depth = -1;
476
477 while(!found) {
478 bool followDownAddressSearchStop = false;
479 bool firstAddress = true;
480 bool haveToSearchAll = true;
481
482 if (depth >= 0) {
483 word[depth] = (unsigned short) followingChar;
484 }
485 pos = followDownBranchAddress; // pos start at count
486 int count = mDict[pos] & 0xFF;
487 LOGI("count - %d\n",count);
488 pos++;
489 for (int i = 0; i < count; i++) {
490 // pos at data
491 pos++;
492 // pos now at flag
493 if (!getFirstBitOfByte(&pos)) { // non-terminal
494 if (!followDownAddressSearchStop) {
495 int addr = getBigramAddress(&pos, false);
496 if (addr > addressLookingFor) {
497 followDownAddressSearchStop = true;
498 if (firstAddress) {
499 firstAddress = false;
500 haveToSearchAll = true;
501 } else if (!haveToSearchAll) {
502 break;
503 }
504 } else {
505 followDownBranchAddress = addr;
506 followingChar = (char)(0xFF & mDict[pos-1]);
507 if (firstAddress) {
508 firstAddress = false;
509 haveToSearchAll = false;
510 }
511 }
512 }
513 pos += 3;
514 } else if (getFirstBitOfByte(&pos)) { // terminal
515 if (addressLookingFor == (pos-1)) { // found !!
516 depth++;
517 word[depth] = (0xFF & mDict[pos-1]);
518 found = true;
519 break;
520 }
521 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
522 if (!followDownAddressSearchStop) {
523 int addr = getBigramAddress(&pos, false);
524 if (addr > addressLookingFor) {
525 followDownAddressSearchStop = true;
526 if (firstAddress) {
527 firstAddress = false;
528 haveToSearchAll = true;
529 } else if (!haveToSearchAll) {
530 break;
531 }
532 } else {
533 followDownBranchAddress = addr;
534 followingChar = (char)(0xFF & mDict[pos-1]);
535 if (firstAddress) {
536 firstAddress = false;
537 haveToSearchAll = true;
538 }
539 }
540 }
541 pos += 4;
542 } else { // freq only (2 byte)
543 pos += 2;
544 }
545
546 // skipping bigram
547 int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
548 if (bigramExist > 0) {
549 int nextBigramExist = 1;
550 while (nextBigramExist > 0) {
551 pos += 3;
552 nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
553 }
554 } else {
555 pos++;
556 }
557 }
558 }
559 depth++;
560 if (followDownBranchAddress == 0) {
561 LOGI("ERROR!!! Cannot find bigram!!");
562 break;
563 }
564 }
Jae Yong Sung80aa14f2010-07-26 11:43:29 -0700565 if (checkFirstCharacter(word)) {
566 addWordBigram(word, depth, frequency);
567 }
568}
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700569
Jae Yong Sung80aa14f2010-07-26 11:43:29 -0700570bool
571Dictionary::checkFirstCharacter(unsigned short *word)
572{
573 // Checks whether this word starts with same character or neighboring characters of
574 // what user typed.
575
576 int *inputCodes = mInputCodes;
577 int maxAlt = mMaxAlternatives;
578 while (maxAlt > 0) {
579 if ((unsigned int) *inputCodes == (unsigned int) *word) {
580 return true;
581 }
582 inputCodes++;
583 maxAlt--;
584 }
585 return false;
The Android Open Source Project923bf412009-03-13 15:11:42 -0700586}
587
588bool
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700589Dictionary::isValidWord(unsigned short *word, int length)
590{
591 if (checkIfDictVersionIsLatest()) {
592 return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
593 } else {
594 return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD);
595 }
596}
597
598int
The Android Open Source Project923bf412009-03-13 15:11:42 -0700599Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700600 // returns address of bigram data of that word
601 // return -99 if not found
602
The Android Open Source Project923bf412009-03-13 15:11:42 -0700603 int count = getCount(&pos);
604 unsigned short currentChar = (unsigned short) word[offset];
605 for (int j = 0; j < count; j++) {
606 unsigned short c = getChar(&pos);
607 int terminal = getTerminal(&pos);
608 int childPos = getAddress(&pos);
609 if (c == currentChar) {
610 if (offset == length - 1) {
611 if (terminal) {
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700612 return (pos+1);
The Android Open Source Project923bf412009-03-13 15:11:42 -0700613 }
614 } else {
615 if (childPos != 0) {
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700616 int t = isValidWordRec(childPos, word, offset + 1, length);
617 if (t > 0) {
618 return t;
The Android Open Source Project923bf412009-03-13 15:11:42 -0700619 }
620 }
621 }
622 }
Amith Yamasanicc3e5c72009-03-31 10:51:17 -0700623 if (terminal) {
624 getFreq(&pos);
625 }
The Android Open Source Project923bf412009-03-13 15:11:42 -0700626 // There could be two instances of each alphabet - upper and lower case. So continue
627 // looking ...
628 }
Jae Yong Sung937d5ad2010-06-30 20:28:04 -0700629 return NOT_VALID_WORD;
The Android Open Source Project923bf412009-03-13 15:11:42 -0700630}
631
632
633} // namespace latinime