blob: 44dc75e9c54eaa03fdcb05d9e2961a3f142e8e47 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
Ken Wakasa5460ea32012-07-30 16:27:44 +09002 * Copyright (C) 2010, The Android Open Source Project
Ken Wakasa0bbb9172012-07-25 17:51:43 +09003 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
satok30088252010-12-01 21:22:15 +090016
Ken Wakasaf1008c52012-07-31 17:56:40 +090017#include <cstring>
satok18c28f42010-12-02 18:11:54 +090018
satoke808e432010-12-02 14:53:24 +090019#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
20
satok30088252010-12-01 21:22:15 +090021#include "bigram_dictionary.h"
Jean Chalard588e2f22011-07-25 14:03:19 +090022#include "binary_format.h"
Jean Chalard49ba1352012-05-07 20:14:00 +090023#include "bloom_filter.h"
Ken Wakasa3b088a22012-05-16 23:05:32 +090024#include "defines.h"
Jean Chalard49ba1352012-05-07 20:14:00 +090025#include "dictionary.h"
satok30088252010-12-01 21:22:15 +090026
27namespace latinime {
28
Ken Wakasa5db594a2013-01-12 01:18:00 +090029BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) {
Ken Wakasade3070a2011-03-19 09:16:42 +090030 if (DEBUG_DICT) {
satok9fb6f472012-01-13 18:01:22 +090031 AKLOGI("BigramDictionary - constructor");
Ken Wakasade3070a2011-03-19 09:16:42 +090032 }
satok30088252010-12-01 21:22:15 +090033}
34
satok18c28f42010-12-02 18:11:54 +090035BigramDictionary::~BigramDictionary() {
satok30088252010-12-01 21:22:15 +090036}
satok18c28f42010-12-02 18:11:54 +090037
Ken Wakasaf6870cc2013-01-11 18:59:01 +090038void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *bigramFreq,
Ken Wakasa1e614932012-10-29 18:06:22 +090039 int *bigramCodePoints, int *outputTypes) const {
satok18c28f42010-12-02 18:11:54 +090040 word[length] = 0;
41 if (DEBUG_DICT) {
Doug Kwance9efbf2011-07-07 22:53:50 -070042#ifdef FLAG_DBG
satok18c28f42010-12-02 18:11:54 +090043 char s[length + 1];
Ken Wakasa1e614932012-10-29 18:06:22 +090044 for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
satok9fb6f472012-01-13 18:01:22 +090045 AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
satok787945b2011-07-14 08:32:57 +090046#endif
satok18c28f42010-12-02 18:11:54 +090047 }
48
49 // Find the right insertion point
50 int insertAt = 0;
Ken Wakasaf6870cc2013-01-11 18:59:01 +090051 while (insertAt < MAX_RESULTS) {
satokb1ed1d42012-06-14 16:35:23 -070052 if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency
Ken Wakasa1e614932012-10-29 18:06:22 +090053 && length < Dictionary::wideStrLen(
54 bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
satok18c28f42010-12-02 18:11:54 +090055 break;
56 }
57 insertAt++;
58 }
Ken Wakasade3070a2011-03-19 09:16:42 +090059 if (DEBUG_DICT) {
Ken Wakasaf6870cc2013-01-11 18:59:01 +090060 AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS);
Ken Wakasade3070a2011-03-19 09:16:42 +090061 }
Ken Wakasaf6870cc2013-01-11 18:59:01 +090062 if (insertAt >= MAX_RESULTS) {
63 return;
satok18c28f42010-12-02 18:11:54 +090064 }
Ken Wakasaf6870cc2013-01-11 18:59:01 +090065 memmove(bigramFreq + (insertAt + 1),
66 bigramFreq + insertAt,
67 (MAX_RESULTS - insertAt - 1) * sizeof(bigramFreq[0]));
68 bigramFreq[insertAt] = frequency;
69 outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
70 memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
71 bigramCodePoints + insertAt * MAX_WORD_LENGTH,
72 (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH);
73 int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH;
74 while (length--) {
75 *dest++ = *word++;
76 }
77 *dest = 0; // NULL terminate
78 if (DEBUG_DICT) {
79 AKLOGI("Bigram: Added word at %d", insertAt);
80 }
satok18c28f42010-12-02 18:11:54 +090081}
82
Jean Chalard588e2f22011-07-25 14:03:19 +090083/* Parameters :
84 * prevWord: the word before, the one for which we need to look up bigrams.
85 * prevWordLength: its length.
Ken Wakasa5db594a2013-01-12 01:18:00 +090086 * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions.
87 * inputSize: the size of the codes array.
Ken Wakasa1e614932012-10-29 18:06:22 +090088 * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
Jean Chalard588e2f22011-07-25 14:03:19 +090089 * bigramFreq: an array to output frequencies.
Jean Chalard6931df92012-07-12 12:55:48 +090090 * outputTypes: an array to output types.
Jean Chalard588e2f22011-07-25 14:03:19 +090091 * This method returns the number of bigrams this word has, for backward compatibility.
92 * Note: this is not the number of bigrams output in the array, which is the number of
93 * bigrams this word has WHOSE first letter also matches the letter the user typed.
94 * TODO: this may not be a sensible thing to do. It makes sense when the bigrams are
95 * used to match the first letter of the second word, but once the user has typed more
96 * and the bigrams are used to boost unigram result scores, it makes little sense to
97 * reduce their scope to the ones that match the first letter.
98 */
Ken Wakasa5db594a2013-01-12 01:18:00 +090099int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints,
100 int inputSize, int *bigramCodePoints, int *bigramFreq, int *outputTypes) const {
Jean Chalard588e2f22011-07-25 14:03:19 +0900101 // TODO: remove unused arguments, and refrain from storing stuff in members of this class
102 // TODO: have "in" arguments before "out" ones, and make out args explicit in the name
satok18c28f42010-12-02 18:11:54 +0900103
Ken Wakasa5db594a2013-01-12 01:18:00 +0900104 const uint8_t *const root = DICT_ROOT;
Jean Chalarde9a86e22012-06-28 21:01:29 +0900105 int pos = getBigramListPositionForWord(prevWord, prevWordLength,
106 false /* forceLowerCaseSearch */);
Jean Chalard351864b2012-04-24 18:06:51 +0900107 // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
Jean Chalarde9a86e22012-06-28 21:01:29 +0900108 if (0 == pos) {
109 // If no bigrams for this exact word, search again in lower case.
110 pos = getBigramListPositionForWord(prevWord, prevWordLength,
111 true /* forceLowerCaseSearch */);
112 }
113 // If still no bigrams, we really don't have them!
Jean Chalardee396df2012-04-17 16:02:35 +0900114 if (0 == pos) return 0;
Ken Wakasad86d3132012-09-04 20:29:38 +0900115 uint8_t bigramFlags;
Jean Chalard588e2f22011-07-25 14:03:19 +0900116 int bigramCount = 0;
117 do {
118 bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
Ken Wakasa1e614932012-10-29 18:06:22 +0900119 int bigramBuffer[MAX_WORD_LENGTH];
Jean Chalard62cd9192012-05-30 14:46:43 +0900120 int unigramFreq = 0;
Jean Chalard588e2f22011-07-25 14:03:19 +0900121 const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
122 &pos);
123 const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
Jean Chalarde3084592012-05-29 16:22:46 +0900124 bigramBuffer, &unigramFreq);
satok18c28f42010-12-02 18:11:54 +0900125
Ken Wakasa5db594a2013-01-12 01:18:00 +0900126 // inputSize == 0 means we are trying to find bigram predictions.
127 if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
Jean Chalard19560502012-07-31 23:45:32 +0900128 const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
Jean Chalard46fe49f2012-05-29 16:50:25 +0900129 // Due to space constraints, the frequency for bigrams is approximate - the lower the
130 // unigram frequency, the worse the precision. The theoritical maximum error in
131 // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
132 // in very bad cases. This means that sometimes, we'll see some bigrams interverted
133 // here, but it can't get too bad.
Jean Chalarde3084592012-05-29 16:22:46 +0900134 const int frequency =
satokb1ed1d42012-06-14 16:35:23 -0700135 BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp);
Ken Wakasaf6870cc2013-01-11 18:59:01 +0900136 addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramCodePoints,
137 outputTypes);
138 ++bigramCount;
satok18c28f42010-12-02 18:11:54 +0900139 }
Jean Chalard19560502012-07-31 23:45:32 +0900140 } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
Ken Wakasaf6870cc2013-01-11 18:59:01 +0900141 return min(bigramCount, MAX_RESULTS);
satok18c28f42010-12-02 18:11:54 +0900142}
143
Jean Chalardee396df2012-04-17 16:02:35 +0900144// Returns a pointer to the start of the bigram list.
145// If the word is not found or has no bigrams, this function returns 0.
Ken Wakasaaa5a3e82012-12-03 19:54:30 +0900146int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
147 const bool forceLowerCaseSearch) const {
Jean Chalard1ff8dc42012-05-02 16:00:24 +0900148 if (0 >= prevWordLength) return 0;
Ken Wakasa5db594a2013-01-12 01:18:00 +0900149 const uint8_t *const root = DICT_ROOT;
Jean Chalarde9a86e22012-06-28 21:01:29 +0900150 int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
151 forceLowerCaseSearch);
Jean Chalard9c2a96a2012-04-17 11:38:44 +0900152
153 if (NOT_VALID_WORD == pos) return 0;
Ken Wakasad86d3132012-09-04 20:29:38 +0900154 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
Jean Chalard19560502012-07-31 23:45:32 +0900155 if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0;
156 if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
Ken Wakasaf2789812012-09-04 12:49:46 +0900157 BinaryFormat::getCodePointAndForwardPointer(root, &pos);
Jean Chalard9c2a96a2012-04-17 11:38:44 +0900158 } else {
159 pos = BinaryFormat::skipOtherCharacters(root, pos);
160 }
Jean Chalard9c2a96a2012-04-17 11:38:44 +0900161 pos = BinaryFormat::skipFrequency(flags, pos);
Jean Chalard402b0572012-05-29 15:56:30 +0900162 pos = BinaryFormat::skipChildrenPosition(flags, pos);
Jean Chalard9c2a96a2012-04-17 11:38:44 +0900163 pos = BinaryFormat::skipShortcuts(root, flags, pos);
164 return pos;
165}
166
Ken Wakasaaa5a3e82012-12-03 19:54:30 +0900167void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevWord,
satokb1ed1d42012-06-14 16:35:23 -0700168 const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
Jean Chalardf1634c82012-05-02 19:05:27 +0900169 memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
Ken Wakasa5db594a2013-01-12 01:18:00 +0900170 const uint8_t *const root = DICT_ROOT;
Jean Chalarde9a86e22012-06-28 21:01:29 +0900171 int pos = getBigramListPositionForWord(prevWord, prevWordLength,
172 false /* forceLowerCaseSearch */);
173 if (0 == pos) {
174 // If no bigrams for this exact string, search again in lower case.
175 pos = getBigramListPositionForWord(prevWord, prevWordLength,
176 true /* forceLowerCaseSearch */);
177 }
Jean Chalard1ff8dc42012-05-02 16:00:24 +0900178 if (0 == pos) return;
179
Ken Wakasad86d3132012-09-04 20:29:38 +0900180 uint8_t bigramFlags;
Jean Chalard1ff8dc42012-05-02 16:00:24 +0900181 do {
182 bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
Jean Chalard19560502012-07-31 23:45:32 +0900183 const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
Jean Chalard1ff8dc42012-05-02 16:00:24 +0900184 const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
185 &pos);
186 (*map)[bigramPos] = frequency;
Jean Chalardf1634c82012-05-02 19:05:27 +0900187 setInFilter(filter, bigramPos);
Jean Chalard19560502012-07-31 23:45:32 +0900188 } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
Jean Chalard1ff8dc42012-05-02 16:00:24 +0900189}
190
Ken Wakasa5db594a2013-01-12 01:18:00 +0900191bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) const {
satok18c28f42010-12-02 18:11:54 +0900192 // Checks whether this word starts with same character or neighboring characters of
193 // what user typed.
194
satok18c28f42010-12-02 18:11:54 +0900195 int maxAlt = MAX_ALTERNATIVES;
Ken Wakasa5db594a2013-01-12 01:18:00 +0900196 const int firstBaseLowerCodePoint = toBaseLowerCase(*word);
satok18c28f42010-12-02 18:11:54 +0900197 while (maxAlt > 0) {
Ken Wakasa5db594a2013-01-12 01:18:00 +0900198 if (toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) {
satok18c28f42010-12-02 18:11:54 +0900199 return true;
200 }
Ken Wakasa5db594a2013-01-12 01:18:00 +0900201 inputCodePoints++;
satok18c28f42010-12-02 18:11:54 +0900202 maxAlt--;
203 }
204 return false;
205}
206
Ken Wakasaaa5a3e82012-12-03 19:54:30 +0900207bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
satokb1ed1d42012-06-14 16:35:23 -0700208 int length2) const {
Ken Wakasa5db594a2013-01-12 01:18:00 +0900209 const uint8_t *const root = DICT_ROOT;
Jean Chalarde9a86e22012-06-28 21:01:29 +0900210 int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
Tom Ouyang4d289d32012-04-26 23:50:21 -0700211 // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
212 if (0 == pos) return false;
Jean Chalarde9a86e22012-06-28 21:01:29 +0900213 int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2,
214 false /* forceLowerCaseSearch */);
Tom Ouyang4d289d32012-04-26 23:50:21 -0700215 if (NOT_VALID_WORD == nextWordPos) return false;
Ken Wakasad86d3132012-09-04 20:29:38 +0900216 uint8_t bigramFlags;
Tom Ouyang4d289d32012-04-26 23:50:21 -0700217 do {
218 bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
219 const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
220 &pos);
221 if (bigramPos == nextWordPos) {
222 return true;
223 }
Jean Chalard19560502012-07-31 23:45:32 +0900224 } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
Tom Ouyang4d289d32012-04-26 23:50:21 -0700225 return false;
226}
227
satok30088252010-12-01 21:22:15 +0900228// TODO: Move functions related to bigram to here
229} // namespace latinime