blob: 88382a94d7d21a3c5077141c6eafe3a067a989cb [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
18#include <stdio.h>
19#include <fcntl.h>
20#include <sys/mman.h>
21#include <string.h>
22
23#ifdef FLAG_DBG
24#define LOG_TAG "LatinIME: dictionary.cpp"
25#include <cutils/log.h>
26#define DEBUG_DICT 1
27#else // FLAG_DBG
28#define LOGI
29#define DEBUG_DICT 0
30#endif // FLAG_DBG
31
32#include "unigram_dictionary.h"
33#include "basechars.h"
34#include "char_utils.h"
35
36#define DICTIONARY_VERSION_MIN 200
37#define DICTIONARY_HEADER_SIZE 2
38#define NOT_VALID_WORD -99
39
40#define SUGGEST_MISSING_CHARACTERS true
41#define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5
42
43
44namespace latinime {
45
46UnigramDictionary::UnigramDictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
47 int maxWordLength, int maxWords, int maxAlternatives, Dictionary *parentDictionary)
48 : MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords), MAX_ALTERNATIVES(maxAlternatives)
49{
50 LOGI("UnigramDictionary - constructor");
51 mDict = (unsigned char*) dict;
52 mTypedLetterMultiplier = typedLetterMultiplier;
53 mFullWordMultiplier = fullWordMultiplier;
54 mParentDictionary = parentDictionary;
55 getVersionNumber();
56}
57
58UnigramDictionary::~UnigramDictionary()
59{
60}
61
62int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
63 int *nextLetters, int nextLettersSize)
64{
65
66 initSuggestions(codes, codesSize, outWords, frequencies);
67
68 int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,
69 nextLettersSize);
70
71 // If there aren't sufficient suggestions, search for words by allowing wild cards at
72 // the different character positions. This feature is not ready for prime-time as we need
73 // to figure out the best ranking for such words compared to proximity corrections and
74 // completions.
75 if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
76 for (int i = 0; i < codesSize; ++i) {
77 int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);
78 if (tempCount > suggestedWordsCount) {
79 suggestedWordsCount = tempCount;
80 break;
81 }
82 }
83 }
84
85 if (DEBUG_DICT) {
86 LOGI("Returning %d words", suggestedWordsCount);
87 LOGI("Next letters: ");
88 for (int k = 0; k < nextLettersSize; k++) {
89 if (nextLetters[k] > 0) {
90 LOGI("%c = %d,", k, nextLetters[k]);
91 }
92 }
93 LOGI("\n");
94 }
95 return suggestedWordsCount;
96}
97
98void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,
99 int *frequencies) {
100 mFrequencies = frequencies;
101 mOutputChars = outWords;
102 mInputCodes = codes;
103 mInputLength = codesSize;
104 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
105}
106
107int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
108 int *nextLetters, int nextLettersSize) {
109 if (checkIfDictVersionIsLatest()) {
110 getWordsRec(DICTIONARY_HEADER_SIZE, 0, inputLength * 3, false, 1, 0, 0, skipPos,
111 nextLetters, nextLettersSize);
112 } else {
113 getWordsRec(0, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
114 }
115
116 // Get the word count
117 int suggestedWordsCount = 0;
118 while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
119 suggestedWordsCount++;
120 }
121 return suggestedWordsCount;
122}
123
124void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
125 if (c < nextLettersSize) {
126 nextLetters[c]++;
127 }
128}
129
130// TODO: Should be const static variable calculate in the constructor
131void
132UnigramDictionary::getVersionNumber()
133{
134 mVersion = (mDict[0] & 0xFF);
135 mBigram = (mDict[1] & 0xFF);
136 LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
137}
138
139// TODO: Should be const static variable calculate in the constructor
140// Checks whether it has the latest dictionary or the old dictionary
141bool
142UnigramDictionary::checkIfDictVersionIsLatest()
143{
144 return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 || mBigram == 0);
145}
146
147unsigned short
148UnigramDictionary::getChar(int *pos)
149{
150 unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
151 // If the code is 255, then actual 16 bit code follows (in big endian)
152 if (ch == 0xFF) {
153 ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
154 (*pos) += 2;
155 }
156 return ch;
157}
158
159int
160UnigramDictionary::getAddress(int *pos)
161{
162 int address = 0;
163 if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
164 *pos += 1;
165 } else {
166 address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
167 address += (mDict[*pos + 1] & 0xFF) << 8;
168 address += (mDict[*pos + 2] & 0xFF);
169 *pos += 3;
170 }
171 return address;
172}
173
174int
175UnigramDictionary::getFreq(int *pos)
176{
177 int freq = mDict[(*pos)++] & 0xFF;
178
179 if (checkIfDictVersionIsLatest()) {
180 // skipping bigram
181 int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ);
182 if (bigramExist > 0) {
183 int nextBigramExist = 1;
184 while (nextBigramExist > 0) {
185 (*pos) += 3;
186 nextBigramExist = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
187 }
188 } else {
189 (*pos)++;
190 }
191 }
192
193 return freq;
194}
195
196int
197UnigramDictionary::wideStrLen(unsigned short *str)
198{
199 if (!str) return 0;
200 unsigned short *end = str;
201 while (*end)
202 end++;
203 return end - str;
204}
205
206bool
207UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
208{
209 word[length] = 0;
210 if (DEBUG_DICT) {
211 char s[length + 1];
212 for (int i = 0; i <= length; i++) s[i] = word[i];
213 LOGI("Found word = %s, freq = %d : \n", s, frequency);
214 }
215
216 // Find the right insertion point
217 int insertAt = 0;
218 while (insertAt < MAX_WORDS) {
219 if (frequency > mFrequencies[insertAt]
220 || (mFrequencies[insertAt] == frequency
221 && length < wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
222 break;
223 }
224 insertAt++;
225 }
226 if (insertAt < MAX_WORDS) {
227 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
228 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
229 (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
230 mFrequencies[insertAt] = frequency;
231 memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
232 (char*) mOutputChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
233 (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
234 unsigned short *dest = mOutputChars + (insertAt ) * MAX_WORD_LENGTH;
235 while (length--) {
236 *dest++ = *word++;
237 }
238 *dest = 0; // NULL terminate
239 if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
240 return true;
241 }
242 return false;
243}
244
245bool
246UnigramDictionary::addWordBigram(unsigned short *word, int length, int frequency)
247{
248 word[length] = 0;
249 if (DEBUG_DICT) {
250 char s[length + 1];
251 for (int i = 0; i <= length; i++) s[i] = word[i];
252 LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
253 }
254
255 // Find the right insertion point
256 int insertAt = 0;
257 while (insertAt < mMaxBigrams) {
258 if (frequency > mBigramFreq[insertAt]
259 || (mBigramFreq[insertAt] == frequency
260 && length < wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
261 break;
262 }
263 insertAt++;
264 }
265 LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
266 if (insertAt < mMaxBigrams) {
267 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
268 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
269 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
270 mBigramFreq[insertAt] = frequency;
271 memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
272 (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
273 (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
274 unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH;
275 while (length--) {
276 *dest++ = *word++;
277 }
278 *dest = 0; // NULL terminate
279 if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
280 return true;
281 }
282 return false;
283}
284
285unsigned short
286UnigramDictionary::toLowerCase(unsigned short c) {
287 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
288 c = BASE_CHARS[c];
289 }
290 if (c >='A' && c <= 'Z') {
291 c |= 32;
292 } else if (c > 127) {
293 c = latin_tolower(c);
294 }
295 return c;
296}
297
298bool
299UnigramDictionary::sameAsTyped(unsigned short *word, int length)
300{
301 if (length != mInputLength) {
302 return false;
303 }
304 int *inputCodes = mInputCodes;
305 while (length--) {
306 if ((unsigned int) *inputCodes != (unsigned int) *word) {
307 return false;
308 }
309 inputCodes += MAX_ALTERNATIVES;
310 word++;
311 }
312 return true;
313}
314
315static char QUOTE = '\'';
316
317void
318UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
319 int diffs, int skipPos, int *nextLetters, int nextLettersSize)
320{
321 // Optimization: Prune out words that are too long compared to how much was typed.
322 if (depth > maxDepth) {
323 return;
324 }
325 if (diffs > mMaxEditDistance) {
326 return;
327 }
328 int count = getCount(&pos);
329 int *currentChars = NULL;
330 if (mInputLength <= inputIndex) {
331 completion = true;
332 } else {
333 currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
334 }
335
336 for (int i = 0; i < count; i++) {
337 // -- at char
338 unsigned short c = getChar(&pos);
339 // -- at flag/add
340 unsigned short lowerC = toLowerCase(c);
341 bool terminal = getTerminal(&pos);
342 int childrenAddress = getAddress(&pos);
343 // -- after address or flag
344 int freq = 1;
345 if (terminal) freq = getFreq(&pos);
346 // -- after add or freq
347
348 // If we are only doing completions, no need to look at the typed characters.
349 if (completion) {
350 mWord[depth] = c;
351 if (terminal) {
352 addWord(mWord, depth + 1, freq * snr);
353 if (depth >= mInputLength && skipPos < 0) {
354 registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
355 }
356 }
357 if (childrenAddress != 0) {
358 getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
359 diffs, skipPos, nextLetters, nextLettersSize);
360 }
361 } else if ((c == QUOTE && currentChars[0] != QUOTE) || skipPos == depth) {
362 // Skip the ' or other letter and continue deeper
363 mWord[depth] = c;
364 if (childrenAddress != 0) {
365 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
366 skipPos, nextLetters, nextLettersSize);
367 }
368 } else {
369 int j = 0;
370 while (currentChars[j] > 0) {
371 if (currentChars[j] == lowerC || currentChars[j] == c) {
372 int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
373 mWord[depth] = c;
374 if (mInputLength == inputIndex + 1) {
375 if (terminal) {
376 if (//INCLUDE_TYPED_WORD_IF_VALID ||
377 !sameAsTyped(mWord, depth + 1)) {
378 int finalFreq = freq * snr * addedWeight;
379 if (skipPos < 0) finalFreq *= mFullWordMultiplier;
380 addWord(mWord, depth + 1, finalFreq);
381 }
382 }
383 if (childrenAddress != 0) {
384 getWordsRec(childrenAddress, depth + 1,
385 maxDepth, true, snr * addedWeight, inputIndex + 1,
386 diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
387 }
388 } else if (childrenAddress != 0) {
389 getWordsRec(childrenAddress, depth + 1, maxDepth,
390 false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
391 skipPos, nextLetters, nextLettersSize);
392 }
393 }
394 j++;
395 if (skipPos >= 0) break;
396 }
397 }
398 }
399}
400
401int
402UnigramDictionary::getBigramAddress(int *pos, bool advance)
403{
404 int address = 0;
405
406 address += (mDict[*pos] & 0x3F) << 16;
407 address += (mDict[*pos + 1] & 0xFF) << 8;
408 address += (mDict[*pos + 2] & 0xFF);
409
410 if (advance) {
411 *pos += 3;
412 }
413
414 return address;
415}
416
417int
418UnigramDictionary::getBigramFreq(int *pos)
419{
420 int freq = mDict[(*pos)++] & FLAG_BIGRAM_FREQ;
421
422 return freq;
423}
424
425
426int
427UnigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
428 unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
429 int maxAlternatives)
430{
431 mBigramFreq = bigramFreq;
432 mBigramChars = bigramChars;
433 mInputCodes = codes;
434 mInputLength = codesSize;
435 mMaxBigrams = maxBigrams;
436
437 if (mBigram == 1 && checkIfDictVersionIsLatest()) {
438 int pos = isValidWordRec(
439 DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
440 LOGI("Pos -> %d\n", pos);
441 if (pos < 0) {
442 return 0;
443 }
444
445 int bigramCount = 0;
446 int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
447 if (bigramExist > 0) {
448 int nextBigramExist = 1;
449 while (nextBigramExist > 0 && bigramCount < maxBigrams) {
450 int bigramAddress = getBigramAddress(&pos, true);
451 int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
452 // search for all bigrams and store them
453 searchForTerminalNode(bigramAddress, frequency);
454 nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
455 bigramCount++;
456 }
457 }
458
459 return bigramCount;
460 }
461 return 0;
462}
463
464void
465UnigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency)
466{
467 // track word with such address and store it in an array
468 unsigned short word[MAX_WORD_LENGTH];
469
470 int pos;
471 int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
472 bool found = false;
473 char followingChar = ' ';
474 int depth = -1;
475
476 while(!found) {
477 bool followDownAddressSearchStop = false;
478 bool firstAddress = true;
479 bool haveToSearchAll = true;
480
481 if (depth >= 0) {
482 word[depth] = (unsigned short) followingChar;
483 }
484 pos = followDownBranchAddress; // pos start at count
485 int count = mDict[pos] & 0xFF;
486 LOGI("count - %d\n",count);
487 pos++;
488 for (int i = 0; i < count; i++) {
489 // pos at data
490 pos++;
491 // pos now at flag
492 if (!getFirstBitOfByte(&pos)) { // non-terminal
493 if (!followDownAddressSearchStop) {
494 int addr = getBigramAddress(&pos, false);
495 if (addr > addressLookingFor) {
496 followDownAddressSearchStop = true;
497 if (firstAddress) {
498 firstAddress = false;
499 haveToSearchAll = true;
500 } else if (!haveToSearchAll) {
501 break;
502 }
503 } else {
504 followDownBranchAddress = addr;
505 followingChar = (char)(0xFF & mDict[pos-1]);
506 if (firstAddress) {
507 firstAddress = false;
508 haveToSearchAll = false;
509 }
510 }
511 }
512 pos += 3;
513 } else if (getFirstBitOfByte(&pos)) { // terminal
514 if (addressLookingFor == (pos-1)) { // found !!
515 depth++;
516 word[depth] = (0xFF & mDict[pos-1]);
517 found = true;
518 break;
519 }
520 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
521 if (!followDownAddressSearchStop) {
522 int addr = getBigramAddress(&pos, false);
523 if (addr > addressLookingFor) {
524 followDownAddressSearchStop = true;
525 if (firstAddress) {
526 firstAddress = false;
527 haveToSearchAll = true;
528 } else if (!haveToSearchAll) {
529 break;
530 }
531 } else {
532 followDownBranchAddress = addr;
533 followingChar = (char)(0xFF & mDict[pos-1]);
534 if (firstAddress) {
535 firstAddress = false;
536 haveToSearchAll = true;
537 }
538 }
539 }
540 pos += 4;
541 } else { // freq only (2 byte)
542 pos += 2;
543 }
544
545 // skipping bigram
546 int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
547 if (bigramExist > 0) {
548 int nextBigramExist = 1;
549 while (nextBigramExist > 0) {
550 pos += 3;
551 nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
552 }
553 } else {
554 pos++;
555 }
556 }
557 }
558 depth++;
559 if (followDownBranchAddress == 0) {
560 LOGI("ERROR!!! Cannot find bigram!!");
561 break;
562 }
563 }
564 if (checkFirstCharacter(word)) {
565 addWordBigram(word, depth, frequency);
566 }
567}
568
569bool
570UnigramDictionary::checkFirstCharacter(unsigned short *word)
571{
572 // Checks whether this word starts with same character or neighboring characters of
573 // what user typed.
574
575 int *inputCodes = mInputCodes;
576 int maxAlt = MAX_ALTERNATIVES;
577 while (maxAlt > 0) {
578 if ((unsigned int) *inputCodes == (unsigned int) *word) {
579 return true;
580 }
581 inputCodes++;
582 maxAlt--;
583 }
584 return false;
585}
586
587// TODO: Move to parent dictionary
588bool
589UnigramDictionary::isValidWord(unsigned short *word, int length)
590{
591 if (checkIfDictVersionIsLatest()) {
592 return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
593 } else {
594 return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD);
595 }
596}
597
598int
599UnigramDictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
600 // returns address of bigram data of that word
601 // return -99 if not found
602
603 int count = getCount(&pos);
604 unsigned short currentChar = (unsigned short) word[offset];
605 for (int j = 0; j < count; j++) {
606 unsigned short c = getChar(&pos);
607 int terminal = getTerminal(&pos);
608 int childPos = getAddress(&pos);
609 if (c == currentChar) {
610 if (offset == length - 1) {
611 if (terminal) {
612 return (pos+1);
613 }
614 } else {
615 if (childPos != 0) {
616 int t = isValidWordRec(childPos, word, offset + 1, length);
617 if (t > 0) {
618 return t;
619 }
620 }
621 }
622 }
623 if (terminal) {
624 getFreq(&pos);
625 }
626 // There could be two instances of each alphabet - upper and lower case. So continue
627 // looking ...
628 }
629 return NOT_VALID_WORD;
630}
631} // namespace latinime