blob: 095b80531fd88e5af539a08675d96ba3224f2621 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2**
3** Copyright 2010, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
satok18c28f42010-12-02 18:11:54 +090018#include <string.h>
19
satoke808e432010-12-02 14:53:24 +090020#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
21
satok30088252010-12-01 21:22:15 +090022#include "bigram_dictionary.h"
satok18c28f42010-12-02 18:11:54 +090023#include "dictionary.h"
satok30088252010-12-01 21:22:15 +090024
25namespace latinime {
26
satok18c28f42010-12-02 18:11:54 +090027BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
28 int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram,
29 Dictionary *parentDictionary)
30 : DICT(dict), MAX_WORD_LENGTH(maxWordLength),
31 MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
32 HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
33 LOGI("BigramDictionary - constructor");
34 LOGI("Has Bigram : %d \n", hasBigram);
satok30088252010-12-01 21:22:15 +090035}
36
satok18c28f42010-12-02 18:11:54 +090037BigramDictionary::~BigramDictionary() {
satok30088252010-12-01 21:22:15 +090038}
satok18c28f42010-12-02 18:11:54 +090039
40bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) {
41 word[length] = 0;
42 if (DEBUG_DICT) {
43 char s[length + 1];
44 for (int i = 0; i <= length; i++) s[i] = word[i];
45 LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
46 }
47
48 // Find the right insertion point
49 int insertAt = 0;
50 while (insertAt < mMaxBigrams) {
51 if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency
52 && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
53 break;
54 }
55 insertAt++;
56 }
57 LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
58 if (insertAt < mMaxBigrams) {
59 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
60 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
61 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
62 mBigramFreq[insertAt] = frequency;
63 memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
64 (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
65 (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
66 unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH;
67 while (length--) {
68 *dest++ = *word++;
69 }
70 *dest = 0; // NULL terminate
71 if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
72 return true;
73 }
74 return false;
75}
76
77int BigramDictionary::getBigramAddress(int *pos, bool advance) {
78 int address = 0;
79
80 address += (DICT[*pos] & 0x3F) << 16;
81 address += (DICT[*pos + 1] & 0xFF) << 8;
82 address += (DICT[*pos + 2] & 0xFF);
83
84 if (advance) {
85 *pos += 3;
86 }
87
88 return address;
89}
90
91int BigramDictionary::getBigramFreq(int *pos) {
92 int freq = DICT[(*pos)++] & FLAG_BIGRAM_FREQ;
93
94 return freq;
95}
96
97
98int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes,
99 int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength,
100 int maxBigrams, int maxAlternatives) {
101 mBigramFreq = bigramFreq;
102 mBigramChars = bigramChars;
103 mInputCodes = codes;
104 mInputLength = codesSize;
105 mMaxBigrams = maxBigrams;
106
107 if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
108 int pos = mParentDictionary->isValidWordRec(
109 DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
110 LOGI("Pos -> %d\n", pos);
111 if (pos < 0) {
112 return 0;
113 }
114
115 int bigramCount = 0;
116 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
117 if (bigramExist > 0) {
118 int nextBigramExist = 1;
119 while (nextBigramExist > 0 && bigramCount < maxBigrams) {
120 int bigramAddress = getBigramAddress(&pos, true);
121 int frequency = (FLAG_BIGRAM_FREQ & DICT[pos]);
122 // search for all bigrams and store them
123 searchForTerminalNode(bigramAddress, frequency);
124 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
125 bigramCount++;
126 }
127 }
128
129 return bigramCount;
130 }
131 return 0;
132}
133
134void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency) {
135 // track word with such address and store it in an array
136 unsigned short word[MAX_WORD_LENGTH];
137
138 int pos;
139 int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
140 bool found = false;
141 char followingChar = ' ';
142 int depth = -1;
143
144 while(!found) {
145 bool followDownAddressSearchStop = false;
146 bool firstAddress = true;
147 bool haveToSearchAll = true;
148
149 if (depth >= 0) {
150 word[depth] = (unsigned short) followingChar;
151 }
152 pos = followDownBranchAddress; // pos start at count
153 int count = DICT[pos] & 0xFF;
154 LOGI("count - %d\n",count);
155 pos++;
156 for (int i = 0; i < count; i++) {
157 // pos at data
158 pos++;
159 // pos now at flag
160 if (!getFirstBitOfByte(&pos)) { // non-terminal
161 if (!followDownAddressSearchStop) {
162 int addr = getBigramAddress(&pos, false);
163 if (addr > addressLookingFor) {
164 followDownAddressSearchStop = true;
165 if (firstAddress) {
166 firstAddress = false;
167 haveToSearchAll = true;
168 } else if (!haveToSearchAll) {
169 break;
170 }
171 } else {
172 followDownBranchAddress = addr;
173 followingChar = (char)(0xFF & DICT[pos-1]);
174 if (firstAddress) {
175 firstAddress = false;
176 haveToSearchAll = false;
177 }
178 }
179 }
180 pos += 3;
181 } else if (getFirstBitOfByte(&pos)) { // terminal
182 if (addressLookingFor == (pos-1)) { // found !!
183 depth++;
184 word[depth] = (0xFF & DICT[pos-1]);
185 found = true;
186 break;
187 }
188 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
189 if (!followDownAddressSearchStop) {
190 int addr = getBigramAddress(&pos, false);
191 if (addr > addressLookingFor) {
192 followDownAddressSearchStop = true;
193 if (firstAddress) {
194 firstAddress = false;
195 haveToSearchAll = true;
196 } else if (!haveToSearchAll) {
197 break;
198 }
199 } else {
200 followDownBranchAddress = addr;
201 followingChar = (char)(0xFF & DICT[pos-1]);
202 if (firstAddress) {
203 firstAddress = false;
204 haveToSearchAll = true;
205 }
206 }
207 }
208 pos += 4;
209 } else { // freq only (2 byte)
210 pos += 2;
211 }
212
213 // skipping bigram
214 int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
215 if (bigramExist > 0) {
216 int nextBigramExist = 1;
217 while (nextBigramExist > 0) {
218 pos += 3;
219 nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
220 }
221 } else {
222 pos++;
223 }
224 }
225 }
226 depth++;
227 if (followDownBranchAddress == 0) {
228 LOGI("ERROR!!! Cannot find bigram!!");
229 break;
230 }
231 }
232 if (checkFirstCharacter(word)) {
233 addWordBigram(word, depth, frequency);
234 }
235}
236
237bool BigramDictionary::checkFirstCharacter(unsigned short *word) {
238 // Checks whether this word starts with same character or neighboring characters of
239 // what user typed.
240
241 int *inputCodes = mInputCodes;
242 int maxAlt = MAX_ALTERNATIVES;
243 while (maxAlt > 0) {
244 if ((unsigned int) *inputCodes == (unsigned int) *word) {
245 return true;
246 }
247 inputCodes++;
248 maxAlt--;
249 }
250 return false;
251}
252
satok30088252010-12-01 21:22:15 +0900253// TODO: Move functions related to bigram to here
254} // namespace latinime