blob: 1d74998f6b4f6d3cb1814f4a9ed3cc2b62e80005 [file] [log] [blame]
Jean Chalard1059f272011-06-28 20:45:05 +09001/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_BINARY_FORMAT_H
18#define LATINIME_BINARY_FORMAT_H
19
Jean Chalardf0a98092011-07-20 18:42:32 +090020#include "unigram_dictionary.h"
21
Jean Chalard1059f272011-06-28 20:45:05 +090022namespace latinime {
23
24class BinaryFormat {
Ken Wakasae12e9b52012-01-06 12:24:38 +090025 private:
Jean Chalard1059f272011-06-28 20:45:05 +090026 const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
27 const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
28 const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
29
Ken Wakasae12e9b52012-01-06 12:24:38 +090030 public:
Jean Chalardf0a98092011-07-20 18:42:32 +090031 const static int UNKNOWN_FORMAT = -1;
32 const static int FORMAT_VERSION_1 = 1;
33 const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
34
35 static int detectFormat(const uint8_t* const dict);
Jean Chalard1059f272011-06-28 20:45:05 +090036 static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
37 static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
38 static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
39 static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
40 static int skipOtherCharacters(const uint8_t* const dict, const int pos);
41 static int skipAttributes(const uint8_t* const dict, const int pos);
42 static int skipChildrenPosition(const uint8_t flags, const int pos);
43 static int skipFrequency(const uint8_t flags, const int pos);
44 static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
45 static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
46 const int pos);
47 static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
48 static bool hasChildrenInFlags(const uint8_t flags);
49 static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
50 int *pos);
Jean Chalard6a0e9642011-07-25 18:17:11 +090051 static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
52 const int length);
Jean Chalard588e2f22011-07-25 14:03:19 +090053 static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
54 uint16_t* outWord);
Jean Chalard1059f272011-06-28 20:45:05 +090055};
56
Jean Chalardf0a98092011-07-20 18:42:32 +090057inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
58 const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
59 if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
60 return UNKNOWN_FORMAT;
61}
62
Jean Chalard1059f272011-06-28 20:45:05 +090063inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
Jean Chalard4c0eca62012-01-16 15:15:53 +090064 const int msb = dict[(*pos)++];
65 if (msb < 0x80) return msb;
66 return ((msb & 0x7F) << 8) | dict[(*pos)++];
Jean Chalard1059f272011-06-28 20:45:05 +090067}
68
69inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
70 return dict[(*pos)++];
71}
72
73inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
74 const int origin = *pos;
75 const int32_t character = dict[origin];
76 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
77 if (character == CHARACTER_ARRAY_TERMINATOR) {
78 *pos = origin + 1;
79 return NOT_A_CHARACTER;
80 } else {
81 *pos = origin + 3;
82 const int32_t char_1 = character << 16;
83 const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
84 return char_2 + dict[origin + 2];
85 }
86 } else {
87 *pos = origin + 1;
88 return character;
89 }
90}
91
92inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
93 const int pos) {
94 return dict[pos];
95}
96
97inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
98 int currentPos = pos;
99 int32_t character = dict[currentPos++];
100 while (CHARACTER_ARRAY_TERMINATOR != character) {
101 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
102 currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
103 }
104 character = dict[currentPos++];
105 }
106 return currentPos;
107}
108
109static inline int attributeAddressSize(const uint8_t flags) {
110 static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
111 return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
112 /* Note: this is a value-dependant optimization of what may probably be
113 more readably written this way:
114 switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
115 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
116 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
117 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
118 default: return 0;
119 }
120 */
121}
122
123inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
124 int currentPos = pos;
125 uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
126 while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
127 currentPos += attributeAddressSize(flags);
128 flags = getFlagsAndForwardPointer(dict, &currentPos);
129 }
130 currentPos += attributeAddressSize(flags);
131 return currentPos;
132}
133
134static inline int childrenAddressSize(const uint8_t flags) {
135 static const int CHILDREN_ADDRESS_SHIFT = 6;
136 return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
137 /* See the note in attributeAddressSize. The same applies here */
138}
139
140inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
141 return pos + childrenAddressSize(flags);
142}
143
144inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
145 return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
146}
147
148inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
149 const int pos) {
Jean Chalarde0e33962011-12-26 20:23:32 +0900150 // This function skips all attributes: shortcuts and bigrams.
151 int newPos = pos;
152 if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
153 newPos = skipAttributes(dict, newPos);
Jean Chalard1059f272011-06-28 20:45:05 +0900154 }
Jean Chalarde0e33962011-12-26 20:23:32 +0900155 if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
156 newPos = skipAttributes(dict, newPos);
157 }
158 return newPos;
Jean Chalard1059f272011-06-28 20:45:05 +0900159}
160
161inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
162 const uint8_t flags, const int pos) {
163 int currentPos = pos;
164 currentPos = skipChildrenPosition(flags, currentPos);
165 currentPos = skipAllAttributes(dict, flags, currentPos);
166 return currentPos;
167}
168
169inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
170 const int pos) {
171 int offset = 0;
172 switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
173 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
174 offset = dict[pos];
175 break;
176 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
177 offset = dict[pos] << 8;
178 offset += dict[pos + 1];
179 break;
180 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
181 offset = dict[pos] << 16;
182 offset += dict[pos + 1] << 8;
183 offset += dict[pos + 2];
184 break;
185 default:
186 // If we come here, it means we asked for the children of a word with
187 // no children.
188 return -1;
189 }
190 return pos + offset;
191}
192
193inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
194 return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
195 != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
196}
197
198inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
199 const uint8_t flags, int *pos) {
200 int offset = 0;
201 const int origin = *pos;
202 switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
203 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
204 offset = dict[origin];
205 *pos = origin + 1;
206 break;
207 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
208 offset = dict[origin] << 8;
209 offset += dict[origin + 1];
210 *pos = origin + 2;
211 break;
212 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
213 offset = dict[origin] << 16;
214 offset += dict[origin + 1] << 8;
215 offset += dict[origin + 2];
216 *pos = origin + 3;
217 break;
218 }
219 if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
220 return origin - offset;
221 } else {
222 return origin + offset;
223 }
224}
225
Jean Chalard6a0e9642011-07-25 18:17:11 +0900226// This function gets the byte position of the last chargroup of the exact matching word in the
227// dictionary. If no match is found, it returns NOT_VALID_WORD.
228inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
229 const uint16_t* const inWord, const int length) {
230 int pos = 0;
231 int wordPos = 0;
232
233 while (true) {
234 // If we already traversed the tree further than the word is long, there means
235 // there was no match (or we would have found it).
236 if (wordPos > length) return NOT_VALID_WORD;
237 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
238 const uint16_t wChar = inWord[wordPos];
239 while (true) {
240 // If there are no more character groups in this node, it means we could not
241 // find a matching character for this depth, therefore there is no match.
242 if (0 >= charGroupCount) return NOT_VALID_WORD;
243 const int charGroupPos = pos;
244 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
245 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
246 if (character == wChar) {
247 // This is the correct node. Only one character group may start with the same
248 // char within a node, so either we found our match in this node, or there is
249 // no match and we can return NOT_VALID_WORD. So we will check all the characters
250 // in this character group indeed does match.
251 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
252 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
253 while (NOT_A_CHARACTER != character) {
254 ++wordPos;
255 // If we shoot the length of the word we search for, or if we find a single
256 // character that does not match, as explained above, it means the word is
257 // not in the dictionary (by virtue of this chargroup being the only one to
258 // match the word on the first character, but not matching the whole word).
259 if (wordPos > length) return NOT_VALID_WORD;
260 if (inWord[wordPos] != character) return NOT_VALID_WORD;
261 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
262 }
263 }
264 // If we come here we know that so far, we do match. Either we are on a terminal
265 // and we match the length, in which case we found it, or we traverse children.
266 // If we don't match the length AND don't have children, then a word in the
267 // dictionary fully matches a prefix of the searched word but not the full word.
268 ++wordPos;
269 if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
270 if (wordPos == length) {
271 return charGroupPos;
272 }
273 pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
274 }
275 if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
276 == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
277 return NOT_VALID_WORD;
278 }
279 // We have children and we are still shorter than the word we are searching for, so
280 // we need to traverse children. Put the pointer on the children position, and
281 // break
282 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
283 break;
284 } else {
285 // This chargroup does not match, so skip the remaining part and go to the next.
286 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
287 pos = BinaryFormat::skipOtherCharacters(root, pos);
288 }
289 pos = BinaryFormat::skipFrequency(flags, pos);
290 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
291 }
292 --charGroupCount;
293 }
294 }
295}
296
Jean Chalard588e2f22011-07-25 14:03:19 +0900297// This function searches for a terminal in the dictionary by its address.
298// Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
299// it is possible to check for this with advantageous complexity. For each node, we search
300// for groups with children and compare the children address with the address we look for.
301// When we shoot the address we look for, it means the word we look for is in the children
302// of the previous group. The only tricky part is the fact that if we arrive at the end of a
303// node with the last group's children address still less than what we are searching for, we
304// must descend the last group's children (for example, if the word we are searching for starts
305// with a z, it's the last group of the root node, so all children addresses will be smaller
306// than the address we look for, and we have to descend the z node).
307/* Parameters :
308 * root: the dictionary buffer
309 * address: the byte position of the last chargroup of the word we are searching for (this is
310 * what is stored as the "bigram address" in each bigram)
311 * outword: an array to write the found word, with MAX_WORD_LENGTH size.
312 * Return value : the length of the word, of 0 if the word was not found.
313 */
314inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
315 const int maxDepth, uint16_t* outWord) {
316 int pos = 0;
317 int wordPos = 0;
318
319 // One iteration of the outer loop iterates through nodes. As stated above, we will only
320 // traverse nodes that are actually a part of the terminal we are searching, so each time
321 // we enter this loop we are one depth level further than last time.
322 // The only reason we count nodes is because we want to reduce the probability of infinite
323 // looping in case there is a bug. Since we know there is an upper bound to the depth we are
324 // supposed to traverse, it does not hurt to count iterations.
325 for (int loopCount = maxDepth; loopCount > 0; --loopCount) {
326 int lastCandidateGroupPos = 0;
327 // Let's loop through char groups in this node searching for either the terminal
328 // or one of its ascendants.
329 for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0;
330 --charGroupCount) {
331 const int startPos = pos;
332 const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
333 const int32_t character = getCharCodeAndForwardPointer(root, &pos);
334 if (address == startPos) {
335 // We found the address. Copy the rest of the word in the buffer and return
336 // the length.
337 outWord[wordPos] = character;
338 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
339 int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
340 // We count chars in order to avoid infinite loops if the file is broken or
341 // if there is some other bug
342 int charCount = maxDepth;
343 while (-1 != nextChar && --charCount > 0) {
344 outWord[++wordPos] = nextChar;
345 nextChar = getCharCodeAndForwardPointer(root, &pos);
346 }
347 }
348 return ++wordPos;
349 }
350 // We need to skip past this char group, so skip any remaining chars after the
351 // first and possibly the frequency.
352 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
353 pos = skipOtherCharacters(root, pos);
354 }
355 pos = skipFrequency(flags, pos);
356
357 // The fact that this group has children is very important. Since we already know
358 // that this group does not match, if it has no children we know it is irrelevant
359 // to what we are searching for.
360 const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
361 (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
362 // We will write in `found' whether we have passed the children address we are
363 // searching for. For example if we search for "beer", the children of b are less
364 // than the address we are searching for and the children of c are greater. When we
365 // come here for c, we realize this is too big, and that we should descend b.
366 bool found;
367 if (hasChildren) {
368 // Here comes the tricky part. First, read the children position.
369 const int childrenPos = readChildrenPosition(root, flags, pos);
370 if (childrenPos > address) {
371 // If the children pos is greater than address, it means the previous chargroup,
372 // which address is stored in lastCandidateGroupPos, was the right one.
373 found = true;
374 } else if (1 >= charGroupCount) {
375 // However if we are on the LAST group of this node, and we have NOT shot the
376 // address we should descend THIS node. So we trick the lastCandidateGroupPos
377 // so that we will descend this node, not the previous one.
378 lastCandidateGroupPos = startPos;
379 found = true;
380 } else {
381 // Else, we should continue looking.
382 found = false;
383 }
384 } else {
385 // Even if we don't have children here, we could still be on the last group of this
386 // node. If this is the case, we should descend the last group that had children,
387 // and their address is already in lastCandidateGroup.
388 found = (1 >= charGroupCount);
389 }
390
391 if (found) {
392 // Okay, we found the group we should descend. Its address is in
393 // the lastCandidateGroupPos variable, so we just re-read it.
394 if (0 != lastCandidateGroupPos) {
395 const uint8_t lastFlags =
396 getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
397 const int32_t lastChar =
398 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
399 // We copy all the characters in this group to the buffer
400 outWord[wordPos] = lastChar;
401 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
402 int32_t nextChar =
403 getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
404 int charCount = maxDepth;
405 while (-1 != nextChar && --charCount > 0) {
406 outWord[++wordPos] = nextChar;
407 nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
408 }
409 }
410 ++wordPos;
411 // Now we only need to branch to the children address. Skip the frequency if
412 // it's there, read pos, and break to resume the search at pos.
413 lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
414 pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
415 break;
416 } else {
417 // Here is a little tricky part: we come here if we found out that all children
418 // addresses in this group are bigger than the address we are searching for.
419 // Should we conclude the word is not in the dictionary? No! It could still be
420 // one of the remaining chargroups in this node, so we have to keep looking in
421 // this node until we find it (or we realize it's not there either, in which
422 // case it's actually not in the dictionary). Pass the end of this group, ready
423 // to start the next one.
424 pos = skipChildrenPosAndAttributes(root, flags, pos);
425 }
426 } else {
427 // If we did not find it, we should record the last children address for the next
428 // iteration.
429 if (hasChildren) lastCandidateGroupPos = startPos;
430 // Now skip the end of this group (children pos and the attributes if any) so that
431 // our pos is after the end of this char group, at the start of the next one.
432 pos = skipChildrenPosAndAttributes(root, flags, pos);
433 }
434
435 }
436 }
437 // If we have looked through all the chargroups and found no match, the address is
438 // not the address of a terminal in this dictionary.
439 return 0;
440}
441
Jean Chalard1059f272011-06-28 20:45:05 +0900442} // namespace latinime
443
444#endif // LATINIME_BINARY_FORMAT_H