blob: 6aecb6374972ff36ea570006f5abcaa538b5f26a [file] [log] [blame]
The Android Open Source Project923bf412009-03-13 15:11:42 -07001/*
2**
3** Copyright 2009, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9** http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
18#include <stdio.h>
19#include <fcntl.h>
20#include <sys/mman.h>
21#include <string.h>
22#include <cutils/log.h>
23
24#include <unicode/uchar.h>
25
26//#define USE_ASSET_MANAGER
27
28#ifdef USE_ASSET_MANAGER
29#include <utils/AssetManager.h>
30#include <utils/Asset.h>
31#endif
32
33#include "dictionary.h"
34#include "basechars.h"
35
36#define DEBUG_DICT 0
37
38namespace latinime {
39
40Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
41{
42 mDict = (unsigned char*) dict;
43 mTypedLetterMultiplier = typedLetterMultiplier;
44 mFullWordMultiplier = fullWordMultiplier;
45}
46
47Dictionary::~Dictionary()
48{
49}
50
51int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
52 int maxWordLength, int maxWords, int maxAlternatives)
53{
54 memset(frequencies, 0, maxWords * sizeof(*frequencies));
55 memset(outWords, 0, maxWords * maxWordLength * sizeof(*outWords));
56
57 mFrequencies = frequencies;
58 mOutputChars = outWords;
59 mInputCodes = codes;
60 mInputLength = codesSize;
61 mMaxAlternatives = maxAlternatives;
62 mMaxWordLength = maxWordLength;
63 mMaxWords = maxWords;
64 mWords = 0;
65
66 getWordsRec(0, 0, mInputLength * 3, false, 1, 0);
67
68 if (DEBUG_DICT) LOGI("Returning %d words", mWords);
69 return mWords;
70}
71
72unsigned short
73Dictionary::getChar(int *pos)
74{
75 unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
76 // If the code is 255, then actual 16 bit code follows (in big endian)
77 if (ch == 0xFF) {
78 ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
79 (*pos) += 2;
80 }
81 return ch;
82}
83
84int
85Dictionary::getAddress(int *pos)
86{
87 int address = 0;
88 address += (mDict[*pos] & 0x7F) << 16;
89 address += (mDict[*pos + 1] & 0xFF) << 8;
90 address += (mDict[*pos + 2] & 0xFF);
91 *pos += 3;
92 return address;
93}
94
95int
96Dictionary::wideStrLen(unsigned short *str)
97{
98 if (!str) return 0;
99 unsigned short *end = str;
100 while (*end)
101 end++;
102 return end - str;
103}
104
105bool
106Dictionary::addWord(unsigned short *word, int length, int frequency)
107{
108 word[length] = 0;
109 if (DEBUG_DICT) LOGI("Found word = %s, freq = %d : \n", word, frequency);
110
111 // Find the right insertion point
112 int insertAt = 0;
113 while (insertAt < mMaxWords) {
114 if (frequency > mFrequencies[insertAt]
115 || (mFrequencies[insertAt] == frequency
116 && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
117 break;
118 }
119 insertAt++;
120 }
121 if (insertAt < mMaxWords) {
122 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
123 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
124 (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
125 mFrequencies[insertAt] = frequency;
126 memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
127 (char*) mOutputChars + (insertAt ) * mMaxWordLength * sizeof(short),
128 (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
129 unsigned short *dest = mOutputChars + (insertAt ) * mMaxWordLength;
130 while (length--) {
131 *dest++ = *word++;
132 }
133 *dest = 0; // NULL terminate
134 // Update the word count
135 if (insertAt + 1 > mWords) mWords = insertAt + 1;
136 if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
137 return true;
138 }
139 return false;
140}
141
142unsigned short
143Dictionary::toLowerCase(unsigned short c, const int depth) {
144 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
145 c = BASE_CHARS[c];
146 }
147 if (depth == 0) {
148 if (c >='A' && c <= 'Z') {
149 c |= 32;
150 } else if (c > 127) {
151 c = u_tolower(c);
152 }
153 }
154 return c;
155}
156
157bool
158Dictionary::sameAsTyped(unsigned short *word, int length)
159{
160 if (length != mInputLength) {
161 return false;
162 }
163 int *inputCodes = mInputCodes;
164 while (length--) {
165 if ((unsigned int) *inputCodes != (unsigned int) *word) {
166 return false;
167 }
168 inputCodes += mMaxAlternatives;
169 word++;
170 }
171 return true;
172}
173
174static char QUOTE = '\'';
175
176void
177Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex)
178{
179 // Optimization: Prune out words that are too long compared to how much was typed.
180 if (depth > maxDepth) {
181 return;
182 }
183 int count = getCount(&pos);
184 int *currentChars = NULL;
185 if (mInputLength <= inputIndex) {
186 completion = true;
187 } else {
188 currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
189 }
190
191 for (int i = 0; i < count; i++) {
192 unsigned short c = getChar(&pos);
193 unsigned short lowerC = toLowerCase(c, depth);
194 bool terminal = getTerminal(&pos);
195 int childrenAddress = getAddress(&pos);
196 int freq = getFreq(&pos);
197 // If we are only doing completions, no need to look at the typed characters.
198 if (completion) {
199 mWord[depth] = c;
200 if (terminal) {
201 addWord(mWord, depth + 1, freq * snr);
202 }
203 if (childrenAddress != 0) {
204 getWordsRec(childrenAddress, depth + 1, maxDepth,
205 completion, snr, inputIndex);
206 }
207 } else if (c == QUOTE && currentChars[0] != QUOTE) {
208 // Skip the ' and continue deeper
209 mWord[depth] = QUOTE;
210 if (childrenAddress != 0) {
211 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex);
212 }
213 } else {
214 int j = 0;
215 while (currentChars[j] > 0) {
216 int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
217 if (currentChars[j] == lowerC || currentChars[j] == c) {
218 mWord[depth] = c;
219 if (mInputLength == inputIndex + 1) {
220 if (terminal) {
221 if (//INCLUDE_TYPED_WORD_IF_VALID ||
222 !sameAsTyped(mWord, depth + 1)) {
223 addWord(mWord, depth + 1,
224 (freq * snr * addedWeight * mFullWordMultiplier));
225 }
226 }
227 if (childrenAddress != 0) {
228 getWordsRec(childrenAddress, depth + 1,
229 maxDepth, true, snr * addedWeight, inputIndex + 1);
230 }
231 } else if (childrenAddress != 0) {
232 getWordsRec(childrenAddress, depth + 1, maxDepth,
233 false, snr * addedWeight, inputIndex + 1);
234 }
235 }
236 j++;
237 }
238 }
239 }
240}
241
242bool
243Dictionary::isValidWord(unsigned short *word, int length)
244{
245 return isValidWordRec(0, word, 0, length);
246}
247
248bool
249Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
250 int count = getCount(&pos);
251 unsigned short currentChar = (unsigned short) word[offset];
252 for (int j = 0; j < count; j++) {
253 unsigned short c = getChar(&pos);
254 int terminal = getTerminal(&pos);
255 int childPos = getAddress(&pos);
256 if (c == currentChar) {
257 if (offset == length - 1) {
258 if (terminal) {
259 return true;
260 }
261 } else {
262 if (childPos != 0) {
263 if (isValidWordRec(childPos, word, offset + 1, length)) {
264 return true;
265 }
266 }
267 }
268 }
269 getFreq(&pos);
270 // There could be two instances of each alphabet - upper and lower case. So continue
271 // looking ...
272 }
273 return false;
274}
275
276
277} // namespace latinime