blob: 5a48a97a8a48f3e904878db1443c20bbc19fe124 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
18#include <stdio.h>
19#include <fcntl.h>
20#include <sys/mman.h>
21#include <string.h>
22
satoke808e432010-12-02 14:53:24 +090023#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok30088252010-12-01 21:22:15 +090024
satok30088252010-12-01 21:22:15 +090025#include "basechars.h"
26#include "char_utils.h"
satoke808e432010-12-02 14:53:24 +090027#include "dictionary.h"
28#include "unigram_dictionary.h"
satok30088252010-12-01 21:22:15 +090029
30namespace latinime {
31
satoke808e432010-12-02 14:53:24 +090032UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
33 int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives,
34 const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary)
35 : DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
36 MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
37 HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary)
satok30088252010-12-01 21:22:15 +090038{
39 LOGI("UnigramDictionary - constructor");
satoke808e432010-12-02 14:53:24 +090040 LOGI("Has Bigram : %d \n", hasBigram);
satok30088252010-12-01 21:22:15 +090041 mTypedLetterMultiplier = typedLetterMultiplier;
42 mFullWordMultiplier = fullWordMultiplier;
satok30088252010-12-01 21:22:15 +090043}
44
45UnigramDictionary::~UnigramDictionary()
46{
47}
48
49int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
50 int *nextLetters, int nextLettersSize)
51{
52
53 initSuggestions(codes, codesSize, outWords, frequencies);
54
55 int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,
56 nextLettersSize);
57
58 // If there aren't sufficient suggestions, search for words by allowing wild cards at
59 // the different character positions. This feature is not ready for prime-time as we need
60 // to figure out the best ranking for such words compared to proximity corrections and
61 // completions.
62 if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
63 for (int i = 0; i < codesSize; ++i) {
64 int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);
65 if (tempCount > suggestedWordsCount) {
66 suggestedWordsCount = tempCount;
67 break;
68 }
69 }
70 }
71
72 if (DEBUG_DICT) {
73 LOGI("Returning %d words", suggestedWordsCount);
74 LOGI("Next letters: ");
75 for (int k = 0; k < nextLettersSize; k++) {
76 if (nextLetters[k] > 0) {
77 LOGI("%c = %d,", k, nextLetters[k]);
78 }
79 }
80 LOGI("\n");
81 }
82 return suggestedWordsCount;
83}
84
85void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,
86 int *frequencies) {
87 mFrequencies = frequencies;
88 mOutputChars = outWords;
89 mInputCodes = codes;
90 mInputLength = codesSize;
91 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
92}
93
94int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
95 int *nextLetters, int nextLettersSize) {
satoke808e432010-12-02 14:53:24 +090096 if (IS_LATEST_DICT_VERSION) {
satok30088252010-12-01 21:22:15 +090097 getWordsRec(DICTIONARY_HEADER_SIZE, 0, inputLength * 3, false, 1, 0, 0, skipPos,
98 nextLetters, nextLettersSize);
99 } else {
100 getWordsRec(0, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
101 }
102
103 // Get the word count
104 int suggestedWordsCount = 0;
105 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
106 suggestedWordsCount++;
107 }
108 return suggestedWordsCount;
109}
110
111void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
112 if (c < nextLettersSize) {
113 nextLetters[c]++;
114 }
115}
116
satok30088252010-12-01 21:22:15 +0900117int
118UnigramDictionary::wideStrLen(unsigned short *str)
119{
120 if (!str) return 0;
121 unsigned short *end = str;
122 while (*end)
123 end++;
124 return end - str;
125}
126
127bool
128UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
129{
130 word[length] = 0;
131 if (DEBUG_DICT) {
132 char s[length + 1];
133 for (int i = 0; i <= length; i++) s[i] = word[i];
134 LOGI("Found word = %s, freq = %d : \n", s, frequency);
135 }
136
137 // Find the right insertion point
138 int insertAt = 0;
139 while (insertAt < MAX_WORDS) {
140 if (frequency > mFrequencies[insertAt]
141 || (mFrequencies[insertAt] == frequency
142 && length < wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
143 break;
144 }
145 insertAt++;
146 }
147 if (insertAt < MAX_WORDS) {
148 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
149 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
150 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
151 mFrequencies[insertAt] = frequency;
152 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
153 (char*) mOutputChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
154 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
155 unsigned short *dest = mOutputChars + (insertAt ) * MAX_WORD_LENGTH;
156 while (length--) {
157 *dest++ = *word++;
158 }
159 *dest = 0; // NULL terminate
160 if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
161 return true;
162 }
163 return false;
164}
165
166bool
167UnigramDictionary::addWordBigram(unsigned short *word, int length, int frequency)
168{
169 word[length] = 0;
170 if (DEBUG_DICT) {
171 char s[length + 1];
172 for (int i = 0; i <= length; i++) s[i] = word[i];
173 LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
174 }
175
176 // Find the right insertion point
177 int insertAt = 0;
178 while (insertAt < mMaxBigrams) {
179 if (frequency > mBigramFreq[insertAt]
180 || (mBigramFreq[insertAt] == frequency
181 && length < wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
182 break;
183 }
184 insertAt++;
185 }
186 LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
187 if (insertAt < mMaxBigrams) {
188 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
189 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
190 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
191 mBigramFreq[insertAt] = frequency;
192 memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
193 (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
194 (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
195 unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH;
196 while (length--) {
197 *dest++ = *word++;
198 }
199 *dest = 0; // NULL terminate
200 if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
201 return true;
202 }
203 return false;
204}
205
206unsigned short
207UnigramDictionary::toLowerCase(unsigned short c) {
208 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
209 c = BASE_CHARS[c];
210 }
211 if (c >='A' && c <= 'Z') {
212 c |= 32;
213 } else if (c > 127) {
214 c = latin_tolower(c);
215 }
216 return c;
217}
218
219bool
220UnigramDictionary::sameAsTyped(unsigned short *word, int length)
221{
222 if (length != mInputLength) {
223 return false;
224 }
225 int *inputCodes = mInputCodes;
226 while (length--) {
227 if ((unsigned int) *inputCodes != (unsigned int) *word) {
228 return false;
229 }
230 inputCodes += MAX_ALTERNATIVES;
231 word++;
232 }
233 return true;
234}
235
236static char QUOTE = '\'';
237
238void
239UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
240 int diffs, int skipPos, int *nextLetters, int nextLettersSize)
241{
242 // Optimization: Prune out words that are too long compared to how much was typed.
243 if (depth > maxDepth) {
244 return;
245 }
246 if (diffs > mMaxEditDistance) {
247 return;
248 }
satoke808e432010-12-02 14:53:24 +0900249 int count = Dictionary::getCount(DICT, &pos);
satok30088252010-12-01 21:22:15 +0900250 int *currentChars = NULL;
251 if (mInputLength <= inputIndex) {
252 completion = true;
253 } else {
254 currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
255 }
256
257 for (int i = 0; i < count; i++) {
258 // -- at char
satoke808e432010-12-02 14:53:24 +0900259 unsigned short c = Dictionary::getChar(DICT, &pos);
satok30088252010-12-01 21:22:15 +0900260 // -- at flag/add
261 unsigned short lowerC = toLowerCase(c);
satoke808e432010-12-02 14:53:24 +0900262 bool terminal = Dictionary::getTerminal(DICT, &pos);
263 int childrenAddress = Dictionary::getAddress(DICT, &pos);
satok30088252010-12-01 21:22:15 +0900264 // -- after address or flag
265 int freq = 1;
satoke808e432010-12-02 14:53:24 +0900266 if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &pos);
satok30088252010-12-01 21:22:15 +0900267 // -- after add or freq
268
269 // If we are only doing completions, no need to look at the typed characters.
270 if (completion) {
271 mWord[depth] = c;
272 if (terminal) {
273 addWord(mWord, depth + 1, freq * snr);
274 if (depth >= mInputLength && skipPos < 0) {
275 registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
276 }
277 }
278 if (childrenAddress != 0) {
279 getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
280 diffs, skipPos, nextLetters, nextLettersSize);
281 }
282 } else if ((c == QUOTE && currentChars[0] != QUOTE) || skipPos == depth) {
283 // Skip the ' or other letter and continue deeper
284 mWord[depth] = c;
285 if (childrenAddress != 0) {
286 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
287 skipPos, nextLetters, nextLettersSize);
288 }
289 } else {
290 int j = 0;
291 while (currentChars[j] > 0) {
292 if (currentChars[j] == lowerC || currentChars[j] == c) {
293 int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
294 mWord[depth] = c;
295 if (mInputLength == inputIndex + 1) {
296 if (terminal) {
297 if (//INCLUDE_TYPED_WORD_IF_VALID ||
298 !sameAsTyped(mWord, depth + 1)) {
299 int finalFreq = freq * snr * addedWeight;
300 if (skipPos < 0) finalFreq *= mFullWordMultiplier;
301 addWord(mWord, depth + 1, finalFreq);
302 }
303 }
304 if (childrenAddress != 0) {
305 getWordsRec(childrenAddress, depth + 1,
306 maxDepth, true, snr * addedWeight, inputIndex + 1,
307 diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
308 }
309 } else if (childrenAddress != 0) {
310 getWordsRec(childrenAddress, depth + 1, maxDepth,
311 false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
312 skipPos, nextLetters, nextLettersSize);
313 }
314 }
315 j++;
316 if (skipPos >= 0) break;
317 }
318 }
319 }
320}
321
322int
323UnigramDictionary::getBigramAddress(int *pos, bool advance)
324{
325 int address = 0;
326
satoke808e432010-12-02 14:53:24 +0900327 address += (DICT[*pos] & 0x3F) << 16;
328 address += (DICT[*pos + 1] & 0xFF) << 8;
329 address += (DICT[*pos + 2] & 0xFF);
satok30088252010-12-01 21:22:15 +0900330
331 if (advance) {
332 *pos += 3;
333 }
334
335 return address;
336}
337
338int
339UnigramDictionary::getBigramFreq(int *pos)
340{
satoke808e432010-12-02 14:53:24 +0900341 int freq = DICT[(*pos)++] & FLAG_BIGRAM_FREQ;
satok30088252010-12-01 21:22:15 +0900342
343 return freq;
344}
345
346
347int
348UnigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
349 unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
350 int maxAlternatives)
351{
352 mBigramFreq = bigramFreq;
353 mBigramChars = bigramChars;
354 mInputCodes = codes;
355 mInputLength = codesSize;
356 mMaxBigrams = maxBigrams;
357
satoke808e432010-12-02 14:53:24 +0900358 if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
359 int pos = mParentDictionary->isValidWordRec(
satok30088252010-12-01 21:22:15 +0900360 DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
361 LOGI("Pos -> %d\n", pos);
362 if (pos < 0) {
363 return 0;
364 }
365
366 int bigramCount = 0;
satoke808e432010-12-02 14:53:24 +0900367 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
satok30088252010-12-01 21:22:15 +0900368 if (bigramExist > 0) {
369 int nextBigramExist = 1;
370 while (nextBigramExist > 0 && bigramCount < maxBigrams) {
371 int bigramAddress = getBigramAddress(&pos, true);
satoke808e432010-12-02 14:53:24 +0900372 int frequency = (FLAG_BIGRAM_FREQ & DICT[pos]);
satok30088252010-12-01 21:22:15 +0900373 // search for all bigrams and store them
374 searchForTerminalNode(bigramAddress, frequency);
satoke808e432010-12-02 14:53:24 +0900375 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
satok30088252010-12-01 21:22:15 +0900376 bigramCount++;
377 }
378 }
379
380 return bigramCount;
381 }
382 return 0;
383}
384
385void
386UnigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency)
387{
388 // track word with such address and store it in an array
389 unsigned short word[MAX_WORD_LENGTH];
390
391 int pos;
392 int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
393 bool found = false;
394 char followingChar = ' ';
395 int depth = -1;
396
397 while(!found) {
398 bool followDownAddressSearchStop = false;
399 bool firstAddress = true;
400 bool haveToSearchAll = true;
401
402 if (depth >= 0) {
403 word[depth] = (unsigned short) followingChar;
404 }
405 pos = followDownBranchAddress; // pos start at count
satoke808e432010-12-02 14:53:24 +0900406 int count = DICT[pos] & 0xFF;
satok30088252010-12-01 21:22:15 +0900407 LOGI("count - %d\n",count);
408 pos++;
409 for (int i = 0; i < count; i++) {
410 // pos at data
411 pos++;
412 // pos now at flag
413 if (!getFirstBitOfByte(&pos)) { // non-terminal
414 if (!followDownAddressSearchStop) {
415 int addr = getBigramAddress(&pos, false);
416 if (addr > addressLookingFor) {
417 followDownAddressSearchStop = true;
418 if (firstAddress) {
419 firstAddress = false;
420 haveToSearchAll = true;
421 } else if (!haveToSearchAll) {
422 break;
423 }
424 } else {
425 followDownBranchAddress = addr;
satoke808e432010-12-02 14:53:24 +0900426 followingChar = (char)(0xFF & DICT[pos-1]);
satok30088252010-12-01 21:22:15 +0900427 if (firstAddress) {
428 firstAddress = false;
429 haveToSearchAll = false;
430 }
431 }
432 }
433 pos += 3;
434 } else if (getFirstBitOfByte(&pos)) { // terminal
435 if (addressLookingFor == (pos-1)) { // found !!
436 depth++;
satoke808e432010-12-02 14:53:24 +0900437 word[depth] = (0xFF & DICT[pos-1]);
satok30088252010-12-01 21:22:15 +0900438 found = true;
439 break;
440 }
441 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
442 if (!followDownAddressSearchStop) {
443 int addr = getBigramAddress(&pos, false);
444 if (addr > addressLookingFor) {
445 followDownAddressSearchStop = true;
446 if (firstAddress) {
447 firstAddress = false;
448 haveToSearchAll = true;
449 } else if (!haveToSearchAll) {
450 break;
451 }
452 } else {
453 followDownBranchAddress = addr;
satoke808e432010-12-02 14:53:24 +0900454 followingChar = (char)(0xFF & DICT[pos-1]);
satok30088252010-12-01 21:22:15 +0900455 if (firstAddress) {
456 firstAddress = false;
457 haveToSearchAll = true;
458 }
459 }
460 }
461 pos += 4;
462 } else { // freq only (2 byte)
463 pos += 2;
464 }
465
466 // skipping bigram
satoke808e432010-12-02 14:53:24 +0900467 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
satok30088252010-12-01 21:22:15 +0900468 if (bigramExist > 0) {
469 int nextBigramExist = 1;
470 while (nextBigramExist > 0) {
471 pos += 3;
satoke808e432010-12-02 14:53:24 +0900472 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
satok30088252010-12-01 21:22:15 +0900473 }
474 } else {
475 pos++;
476 }
477 }
478 }
479 depth++;
480 if (followDownBranchAddress == 0) {
481 LOGI("ERROR!!! Cannot find bigram!!");
482 break;
483 }
484 }
485 if (checkFirstCharacter(word)) {
486 addWordBigram(word, depth, frequency);
487 }
488}
489
490bool
491UnigramDictionary::checkFirstCharacter(unsigned short *word)
492{
493 // Checks whether this word starts with same character or neighboring characters of
494 // what user typed.
495
496 int *inputCodes = mInputCodes;
497 int maxAlt = MAX_ALTERNATIVES;
498 while (maxAlt > 0) {
499 if ((unsigned int) *inputCodes == (unsigned int) *word) {
500 return true;
501 }
502 inputCodes++;
503 maxAlt--;
504 }
505 return false;
506}
507
satok30088252010-12-01 21:22:15 +0900508} // namespace latinime