blob: 4bc43653748cb4113565189d9e3cbc5b12eebbad [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok18c28f42010-12-02 18:11:54 +090018#include <string.h>
19
satoke808e432010-12-02 14:53:24 +090020#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
21
satok30088252010-12-01 21:22:15 +090022#include "bigram_dictionary.h"
satok18c28f42010-12-02 18:11:54 +090023#include "dictionary.h"
satok30088252010-12-01 21:22:15 +090024
25namespace latinime {
26
satok18c28f42010-12-02 18:11:54 +090027BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
28 int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram,
29 Dictionary *parentDictionary)
30 : DICT(dict), MAX_WORD_LENGTH(maxWordLength),
31 MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
32 HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
Ken Wakasade3070a2011-03-19 09:16:42 +090033 if (DEBUG_DICT) {
34 LOGI("BigramDictionary - constructor");
35 LOGI("Has Bigram : %d", hasBigram);
36 }
satok30088252010-12-01 21:22:15 +090037}
38
satok18c28f42010-12-02 18:11:54 +090039BigramDictionary::~BigramDictionary() {
satok30088252010-12-01 21:22:15 +090040}
satok18c28f42010-12-02 18:11:54 +090041
42bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) {
43 word[length] = 0;
44 if (DEBUG_DICT) {
45 char s[length + 1];
46 for (int i = 0; i <= length; i++) s[i] = word[i];
Ken Wakasae90b3332011-01-07 15:01:51 +090047 LOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
satok18c28f42010-12-02 18:11:54 +090048 }
49
50 // Find the right insertion point
51 int insertAt = 0;
52 while (insertAt < mMaxBigrams) {
53 if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency
54 && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
55 break;
56 }
57 insertAt++;
58 }
Ken Wakasade3070a2011-03-19 09:16:42 +090059 if (DEBUG_DICT) {
60 LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
61 }
satok18c28f42010-12-02 18:11:54 +090062 if (insertAt < mMaxBigrams) {
63 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
64 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
65 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
66 mBigramFreq[insertAt] = frequency;
67 memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
68 (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
69 (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
70 unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH;
71 while (length--) {
72 *dest++ = *word++;
73 }
74 *dest = 0; // NULL terminate
Ken Wakasade3070a2011-03-19 09:16:42 +090075 if (DEBUG_DICT) {
76 LOGI("Bigram: Added word at %d", insertAt);
77 }
satok18c28f42010-12-02 18:11:54 +090078 return true;
79 }
80 return false;
81}
82
83int BigramDictionary::getBigramAddress(int *pos, bool advance) {
84 int address = 0;
85
86 address += (DICT[*pos] & 0x3F) << 16;
87 address += (DICT[*pos + 1] & 0xFF) << 8;
88 address += (DICT[*pos + 2] & 0xFF);
89
90 if (advance) {
91 *pos += 3;
92 }
93
94 return address;
95}
96
97int BigramDictionary::getBigramFreq(int *pos) {
98 int freq = DICT[(*pos)++] & FLAG_BIGRAM_FREQ;
99
100 return freq;
101}
102
103
104int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes,
105 int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength,
106 int maxBigrams, int maxAlternatives) {
107 mBigramFreq = bigramFreq;
108 mBigramChars = bigramChars;
109 mInputCodes = codes;
110 mInputLength = codesSize;
111 mMaxBigrams = maxBigrams;
112
113 if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
Jean Chalard8124e642011-06-16 22:33:41 +0900114 int pos = mParentDictionary->isValidWord(prevWord, prevWordLength);
Ken Wakasade3070a2011-03-19 09:16:42 +0900115 if (DEBUG_DICT) {
116 LOGI("Pos -> %d", pos);
117 }
satok18c28f42010-12-02 18:11:54 +0900118 if (pos < 0) {
119 return 0;
120 }
121
122 int bigramCount = 0;
123 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
124 if (bigramExist > 0) {
125 int nextBigramExist = 1;
126 while (nextBigramExist > 0 && bigramCount < maxBigrams) {
127 int bigramAddress = getBigramAddress(&pos, true);
128 int frequency = (FLAG_BIGRAM_FREQ & DICT[pos]);
129 // search for all bigrams and store them
130 searchForTerminalNode(bigramAddress, frequency);
131 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
132 bigramCount++;
133 }
134 }
135
136 return bigramCount;
137 }
138 return 0;
139}
140
141void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency) {
142 // track word with such address and store it in an array
143 unsigned short word[MAX_WORD_LENGTH];
144
145 int pos;
146 int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
147 bool found = false;
148 char followingChar = ' ';
149 int depth = -1;
150
151 while(!found) {
152 bool followDownAddressSearchStop = false;
153 bool firstAddress = true;
154 bool haveToSearchAll = true;
155
satokf5cded12010-12-06 21:28:24 +0900156 if (depth < MAX_WORD_LENGTH && depth >= 0) {
satok18c28f42010-12-02 18:11:54 +0900157 word[depth] = (unsigned short) followingChar;
158 }
159 pos = followDownBranchAddress; // pos start at count
160 int count = DICT[pos] & 0xFF;
Ken Wakasade3070a2011-03-19 09:16:42 +0900161 if (DEBUG_DICT) {
162 LOGI("count - %d",count);
163 }
satok18c28f42010-12-02 18:11:54 +0900164 pos++;
165 for (int i = 0; i < count; i++) {
166 // pos at data
167 pos++;
168 // pos now at flag
169 if (!getFirstBitOfByte(&pos)) { // non-terminal
170 if (!followDownAddressSearchStop) {
171 int addr = getBigramAddress(&pos, false);
172 if (addr > addressLookingFor) {
173 followDownAddressSearchStop = true;
174 if (firstAddress) {
175 firstAddress = false;
176 haveToSearchAll = true;
177 } else if (!haveToSearchAll) {
178 break;
179 }
180 } else {
181 followDownBranchAddress = addr;
182 followingChar = (char)(0xFF & DICT[pos-1]);
183 if (firstAddress) {
184 firstAddress = false;
185 haveToSearchAll = false;
186 }
187 }
188 }
189 pos += 3;
190 } else if (getFirstBitOfByte(&pos)) { // terminal
191 if (addressLookingFor == (pos-1)) { // found !!
192 depth++;
193 word[depth] = (0xFF & DICT[pos-1]);
194 found = true;
195 break;
196 }
197 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
198 if (!followDownAddressSearchStop) {
199 int addr = getBigramAddress(&pos, false);
200 if (addr > addressLookingFor) {
201 followDownAddressSearchStop = true;
202 if (firstAddress) {
203 firstAddress = false;
204 haveToSearchAll = true;
205 } else if (!haveToSearchAll) {
206 break;
207 }
208 } else {
209 followDownBranchAddress = addr;
210 followingChar = (char)(0xFF & DICT[pos-1]);
211 if (firstAddress) {
212 firstAddress = false;
213 haveToSearchAll = true;
214 }
215 }
216 }
217 pos += 4;
218 } else { // freq only (2 byte)
219 pos += 2;
220 }
221
222 // skipping bigram
223 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
224 if (bigramExist > 0) {
225 int nextBigramExist = 1;
226 while (nextBigramExist > 0) {
227 pos += 3;
228 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
229 }
230 } else {
231 pos++;
232 }
233 }
234 }
235 depth++;
236 if (followDownBranchAddress == 0) {
Ken Wakasade3070a2011-03-19 09:16:42 +0900237 if (DEBUG_DICT) {
238 LOGI("ERROR!!! Cannot find bigram!!");
239 }
satok18c28f42010-12-02 18:11:54 +0900240 break;
241 }
242 }
243 if (checkFirstCharacter(word)) {
244 addWordBigram(word, depth, frequency);
245 }
246}
247
248bool BigramDictionary::checkFirstCharacter(unsigned short *word) {
249 // Checks whether this word starts with same character or neighboring characters of
250 // what user typed.
251
252 int *inputCodes = mInputCodes;
253 int maxAlt = MAX_ALTERNATIVES;
254 while (maxAlt > 0) {
255 if ((unsigned int) *inputCodes == (unsigned int) *word) {
256 return true;
257 }
258 inputCodes++;
259 maxAlt--;
260 }
261 return false;
262}
263
satok30088252010-12-01 21:22:15 +0900264// TODO: Move functions related to bigram to here
265} // namespace latinime