blob: d11aee28eea1e810efb67efa6cbe89036d747e5e [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok18c28f42010-12-02 18:11:54 +090018#include <string.h>
19
satoke808e432010-12-02 14:53:24 +090020#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
21
satok30088252010-12-01 21:22:15 +090022#include "bigram_dictionary.h"
satok18c28f42010-12-02 18:11:54 +090023#include "dictionary.h"
satok30088252010-12-01 21:22:15 +090024
25namespace latinime {
26
satok18c28f42010-12-02 18:11:54 +090027BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
28 int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram,
29 Dictionary *parentDictionary)
30 : DICT(dict), MAX_WORD_LENGTH(maxWordLength),
31 MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
32 HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
Ken Wakasade3070a2011-03-19 09:16:42 +090033 if (DEBUG_DICT) {
34 LOGI("BigramDictionary - constructor");
35 LOGI("Has Bigram : %d", hasBigram);
36 }
satok30088252010-12-01 21:22:15 +090037}
38
satok18c28f42010-12-02 18:11:54 +090039BigramDictionary::~BigramDictionary() {
satok30088252010-12-01 21:22:15 +090040}
satok18c28f42010-12-02 18:11:54 +090041
42bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) {
43 word[length] = 0;
44 if (DEBUG_DICT) {
Doug Kwance9efbf2011-07-07 22:53:50 -070045#ifdef FLAG_DBG
satok18c28f42010-12-02 18:11:54 +090046 char s[length + 1];
47 for (int i = 0; i <= length; i++) s[i] = word[i];
Doug Kwance9efbf2011-07-07 22:53:50 -070048#endif
Ken Wakasae90b3332011-01-07 15:01:51 +090049 LOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
satok18c28f42010-12-02 18:11:54 +090050 }
51
52 // Find the right insertion point
53 int insertAt = 0;
54 while (insertAt < mMaxBigrams) {
55 if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency
56 && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
57 break;
58 }
59 insertAt++;
60 }
Ken Wakasade3070a2011-03-19 09:16:42 +090061 if (DEBUG_DICT) {
62 LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
63 }
satok18c28f42010-12-02 18:11:54 +090064 if (insertAt < mMaxBigrams) {
65 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
66 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
67 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
68 mBigramFreq[insertAt] = frequency;
69 memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
70 (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
71 (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
72 unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH;
73 while (length--) {
74 *dest++ = *word++;
75 }
76 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +090077 if (DEBUG_DICT) {
78 LOGI("Bigram: Added word at %d", insertAt);
79 }
satok18c28f42010-12-02 18:11:54 +090080 return true;
81 }
82 return false;
83}
84
85int BigramDictionary::getBigramAddress(int *pos, bool advance) {
86 int address = 0;
87
88 address += (DICT[*pos] & 0x3F) << 16;
89 address += (DICT[*pos + 1] & 0xFF) << 8;
90 address += (DICT[*pos + 2] & 0xFF);
91
92 if (advance) {
93 *pos += 3;
94 }
95
96 return address;
97}
98
99int BigramDictionary::getBigramFreq(int *pos) {
100 int freq = DICT[(*pos)++] & FLAG_BIGRAM_FREQ;
101
102 return freq;
103}
104
105
106int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes,
107 int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength,
108 int maxBigrams, int maxAlternatives) {
109 mBigramFreq = bigramFreq;
110 mBigramChars = bigramChars;
111 mInputCodes = codes;
112 mInputLength = codesSize;
113 mMaxBigrams = maxBigrams;
114
115 if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
Jean Chalard581335c2011-06-17 12:45:17 +0900116 int pos = mParentDictionary->getBigramPosition(prevWord, prevWordLength);
Ken Wakasade3070a2011-03-19 09:16:42 +0900117 if (DEBUG_DICT) {
118 LOGI("Pos -> %d", pos);
119 }
satok18c28f42010-12-02 18:11:54 +0900120 if (pos < 0) {
121 return 0;
122 }
123
124 int bigramCount = 0;
125 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
126 if (bigramExist > 0) {
127 int nextBigramExist = 1;
128 while (nextBigramExist > 0 && bigramCount < maxBigrams) {
129 int bigramAddress = getBigramAddress(&pos, true);
130 int frequency = (FLAG_BIGRAM_FREQ & DICT[pos]);
131 // search for all bigrams and store them
132 searchForTerminalNode(bigramAddress, frequency);
133 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
134 bigramCount++;
135 }
136 }
137
138 return bigramCount;
139 }
140 return 0;
141}
142
143void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency) {
144 // track word with such address and store it in an array
145 unsigned short word[MAX_WORD_LENGTH];
146
147 int pos;
148 int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
149 bool found = false;
150 char followingChar = ' ';
151 int depth = -1;
152
153 while(!found) {
154 bool followDownAddressSearchStop = false;
155 bool firstAddress = true;
156 bool haveToSearchAll = true;
157
satokf5cded12010-12-06 21:28:24 +0900158 if (depth < MAX_WORD_LENGTH && depth >= 0) {
satok18c28f42010-12-02 18:11:54 +0900159 word[depth] = (unsigned short) followingChar;
160 }
161 pos = followDownBranchAddress; // pos start at count
162 int count = DICT[pos] & 0xFF;
Ken Wakasade3070a2011-03-19 09:16:42 +0900163 if (DEBUG_DICT) {
164 LOGI("count - %d",count);
165 }
satok18c28f42010-12-02 18:11:54 +0900166 pos++;
167 for (int i = 0; i < count; i++) {
168 // pos at data
169 pos++;
170 // pos now at flag
171 if (!getFirstBitOfByte(&pos)) { // non-terminal
172 if (!followDownAddressSearchStop) {
173 int addr = getBigramAddress(&pos, false);
174 if (addr > addressLookingFor) {
175 followDownAddressSearchStop = true;
176 if (firstAddress) {
177 firstAddress = false;
178 haveToSearchAll = true;
179 } else if (!haveToSearchAll) {
180 break;
181 }
182 } else {
183 followDownBranchAddress = addr;
184 followingChar = (char)(0xFF & DICT[pos-1]);
185 if (firstAddress) {
186 firstAddress = false;
187 haveToSearchAll = false;
188 }
189 }
190 }
191 pos += 3;
192 } else if (getFirstBitOfByte(&pos)) { // terminal
193 if (addressLookingFor == (pos-1)) { // found !!
194 depth++;
195 word[depth] = (0xFF & DICT[pos-1]);
196 found = true;
197 break;
198 }
199 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
200 if (!followDownAddressSearchStop) {
201 int addr = getBigramAddress(&pos, false);
202 if (addr > addressLookingFor) {
203 followDownAddressSearchStop = true;
204 if (firstAddress) {
205 firstAddress = false;
206 haveToSearchAll = true;
207 } else if (!haveToSearchAll) {
208 break;
209 }
210 } else {
211 followDownBranchAddress = addr;
212 followingChar = (char)(0xFF & DICT[pos-1]);
213 if (firstAddress) {
214 firstAddress = false;
215 haveToSearchAll = true;
216 }
217 }
218 }
219 pos += 4;
220 } else { // freq only (2 byte)
221 pos += 2;
222 }
223
224 // skipping bigram
225 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
226 if (bigramExist > 0) {
227 int nextBigramExist = 1;
228 while (nextBigramExist > 0) {
229 pos += 3;
230 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
231 }
232 } else {
233 pos++;
234 }
235 }
236 }
237 depth++;
238 if (followDownBranchAddress == 0) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900239 if (DEBUG_DICT) {
240 LOGI("ERROR!!! Cannot find bigram!!");
241 }
satok18c28f42010-12-02 18:11:54 +0900242 break;
243 }
244 }
245 if (checkFirstCharacter(word)) {
246 addWordBigram(word, depth, frequency);
247 }
248}
249
250bool BigramDictionary::checkFirstCharacter(unsigned short *word) {
251 // Checks whether this word starts with same character or neighboring characters of
252 // what user typed.
253
254 int *inputCodes = mInputCodes;
255 int maxAlt = MAX_ALTERNATIVES;
256 while (maxAlt > 0) {
257 if ((unsigned int) *inputCodes == (unsigned int) *word) {
258 return true;
259 }
260 inputCodes++;
261 maxAlt--;
262 }
263 return false;
264}
265
satok30088252010-12-01 21:22:15 +0900266// TODO: Move functions related to bigram to here
267} // namespace latinime