blob: 36761b88dd2a147c20a01addf8b9ecfce08cd449 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok18c28f42010-12-02 18:11:54 +090018#include <string.h>
19
satoke808e432010-12-02 14:53:24 +090020#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
21
satok30088252010-12-01 21:22:15 +090022#include "bigram_dictionary.h"
satok18c28f42010-12-02 18:11:54 +090023#include "dictionary.h"
satok30088252010-12-01 21:22:15 +090024
25namespace latinime {
26
satok18c28f42010-12-02 18:11:54 +090027BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
28 int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram,
29 Dictionary *parentDictionary)
30 : DICT(dict), MAX_WORD_LENGTH(maxWordLength),
31 MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
32 HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
Ken Wakasade3070a2011-03-19 09:16:42 +090033 if (DEBUG_DICT) {
34 LOGI("BigramDictionary - constructor");
35 LOGI("Has Bigram : %d", hasBigram);
36 }
satok30088252010-12-01 21:22:15 +090037}
38
satok18c28f42010-12-02 18:11:54 +090039BigramDictionary::~BigramDictionary() {
satok30088252010-12-01 21:22:15 +090040}
satok18c28f42010-12-02 18:11:54 +090041
42bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) {
43 word[length] = 0;
44 if (DEBUG_DICT) {
45 char s[length + 1];
46 for (int i = 0; i <= length; i++) s[i] = word[i];
Ken Wakasae90b3332011-01-07 15:01:51 +090047 LOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
satok18c28f42010-12-02 18:11:54 +090048 }
49
50 // Find the right insertion point
51 int insertAt = 0;
52 while (insertAt < mMaxBigrams) {
53 if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency
54 && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
55 break;
56 }
57 insertAt++;
58 }
Ken Wakasade3070a2011-03-19 09:16:42 +090059 if (DEBUG_DICT) {
60 LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
61 }
satok18c28f42010-12-02 18:11:54 +090062 if (insertAt < mMaxBigrams) {
63 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
64 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
65 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
66 mBigramFreq[insertAt] = frequency;
67 memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
68 (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
69 (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
70 unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH;
71 while (length--) {
72 *dest++ = *word++;
73 }
74 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +090075 if (DEBUG_DICT) {
76 LOGI("Bigram: Added word at %d", insertAt);
77 }
satok18c28f42010-12-02 18:11:54 +090078 return true;
79 }
80 return false;
81}
82
83int BigramDictionary::getBigramAddress(int *pos, bool advance) {
84 int address = 0;
85
86 address += (DICT[*pos] & 0x3F) << 16;
87 address += (DICT[*pos + 1] & 0xFF) << 8;
88 address += (DICT[*pos + 2] & 0xFF);
89
90 if (advance) {
91 *pos += 3;
92 }
93
94 return address;
95}
96
97int BigramDictionary::getBigramFreq(int *pos) {
98 int freq = DICT[(*pos)++] & FLAG_BIGRAM_FREQ;
99
100 return freq;
101}
102
103
104int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes,
105 int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength,
106 int maxBigrams, int maxAlternatives) {
107 mBigramFreq = bigramFreq;
108 mBigramChars = bigramChars;
109 mInputCodes = codes;
110 mInputLength = codesSize;
111 mMaxBigrams = maxBigrams;
112
113 if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
114 int pos = mParentDictionary->isValidWordRec(
115 DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
Ken Wakasade3070a2011-03-19 09:16:42 +0900116 if (DEBUG_DICT) {
117 LOGI("Pos -> %d", pos);
118 }
satok18c28f42010-12-02 18:11:54 +0900119 if (pos < 0) {
120 return 0;
121 }
122
123 int bigramCount = 0;
124 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
125 if (bigramExist > 0) {
126 int nextBigramExist = 1;
127 while (nextBigramExist > 0 && bigramCount < maxBigrams) {
128 int bigramAddress = getBigramAddress(&pos, true);
129 int frequency = (FLAG_BIGRAM_FREQ & DICT[pos]);
130 // search for all bigrams and store them
131 searchForTerminalNode(bigramAddress, frequency);
132 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
133 bigramCount++;
134 }
135 }
136
137 return bigramCount;
138 }
139 return 0;
140}
141
142void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency) {
143 // track word with such address and store it in an array
144 unsigned short word[MAX_WORD_LENGTH];
145
146 int pos;
147 int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
148 bool found = false;
149 char followingChar = ' ';
150 int depth = -1;
151
152 while(!found) {
153 bool followDownAddressSearchStop = false;
154 bool firstAddress = true;
155 bool haveToSearchAll = true;
156
satokf5cded12010-12-06 21:28:24 +0900157 if (depth < MAX_WORD_LENGTH && depth >= 0) {
satok18c28f42010-12-02 18:11:54 +0900158 word[depth] = (unsigned short) followingChar;
159 }
160 pos = followDownBranchAddress; // pos start at count
161 int count = DICT[pos] & 0xFF;
Ken Wakasade3070a2011-03-19 09:16:42 +0900162 if (DEBUG_DICT) {
163 LOGI("count - %d",count);
164 }
satok18c28f42010-12-02 18:11:54 +0900165 pos++;
166 for (int i = 0; i < count; i++) {
167 // pos at data
168 pos++;
169 // pos now at flag
170 if (!getFirstBitOfByte(&pos)) { // non-terminal
171 if (!followDownAddressSearchStop) {
172 int addr = getBigramAddress(&pos, false);
173 if (addr > addressLookingFor) {
174 followDownAddressSearchStop = true;
175 if (firstAddress) {
176 firstAddress = false;
177 haveToSearchAll = true;
178 } else if (!haveToSearchAll) {
179 break;
180 }
181 } else {
182 followDownBranchAddress = addr;
183 followingChar = (char)(0xFF & DICT[pos-1]);
184 if (firstAddress) {
185 firstAddress = false;
186 haveToSearchAll = false;
187 }
188 }
189 }
190 pos += 3;
191 } else if (getFirstBitOfByte(&pos)) { // terminal
192 if (addressLookingFor == (pos-1)) { // found !!
193 depth++;
194 word[depth] = (0xFF & DICT[pos-1]);
195 found = true;
196 break;
197 }
198 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
199 if (!followDownAddressSearchStop) {
200 int addr = getBigramAddress(&pos, false);
201 if (addr > addressLookingFor) {
202 followDownAddressSearchStop = true;
203 if (firstAddress) {
204 firstAddress = false;
205 haveToSearchAll = true;
206 } else if (!haveToSearchAll) {
207 break;
208 }
209 } else {
210 followDownBranchAddress = addr;
211 followingChar = (char)(0xFF & DICT[pos-1]);
212 if (firstAddress) {
213 firstAddress = false;
214 haveToSearchAll = true;
215 }
216 }
217 }
218 pos += 4;
219 } else { // freq only (2 byte)
220 pos += 2;
221 }
222
223 // skipping bigram
224 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
225 if (bigramExist > 0) {
226 int nextBigramExist = 1;
227 while (nextBigramExist > 0) {
228 pos += 3;
229 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
230 }
231 } else {
232 pos++;
233 }
234 }
235 }
236 depth++;
237 if (followDownBranchAddress == 0) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900238 if (DEBUG_DICT) {
239 LOGI("ERROR!!! Cannot find bigram!!");
240 }
satok18c28f42010-12-02 18:11:54 +0900241 break;
242 }
243 }
244 if (checkFirstCharacter(word)) {
245 addWordBigram(word, depth, frequency);
246 }
247}
248
249bool BigramDictionary::checkFirstCharacter(unsigned short *word) {
250 // Checks whether this word starts with same character or neighboring characters of
251 // what user typed.
252
253 int *inputCodes = mInputCodes;
254 int maxAlt = MAX_ALTERNATIVES;
255 while (maxAlt > 0) {
256 if ((unsigned int) *inputCodes == (unsigned int) *word) {
257 return true;
258 }
259 inputCodes++;
260 maxAlt--;
261 }
262 return false;
263}
264
satok30088252010-12-01 21:22:15 +0900265// TODO: Move functions related to bigram to here
266} // namespace latinime