blob: a946b1ee3951ca3c68c6539e18d0a7eff17f9760 [file] [log] [blame]
Jean Chalard1059f272011-06-28 20:45:05 +09001/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_BINARY_FORMAT_H
18#define LATINIME_BINARY_FORMAT_H
19
Jean Chalardf0a98092011-07-20 18:42:32 +090020#include "unigram_dictionary.h"
21
Jean Chalard1059f272011-06-28 20:45:05 +090022namespace latinime {
23
24class BinaryFormat {
25private:
26 const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
27 const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
28 const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
29
30public:
Jean Chalardf0a98092011-07-20 18:42:32 +090031 const static int UNKNOWN_FORMAT = -1;
32 const static int FORMAT_VERSION_1 = 1;
33 const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
34
35 static int detectFormat(const uint8_t* const dict);
Jean Chalard1059f272011-06-28 20:45:05 +090036 static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
37 static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
38 static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
39 static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
40 static int skipOtherCharacters(const uint8_t* const dict, const int pos);
41 static int skipAttributes(const uint8_t* const dict, const int pos);
42 static int skipChildrenPosition(const uint8_t flags, const int pos);
43 static int skipFrequency(const uint8_t flags, const int pos);
44 static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
45 static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
46 const int pos);
47 static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
48 static bool hasChildrenInFlags(const uint8_t flags);
49 static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
50 int *pos);
Jean Chalard6a0e9642011-07-25 18:17:11 +090051 static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
52 const int length);
Jean Chalard1059f272011-06-28 20:45:05 +090053};
54
Jean Chalardf0a98092011-07-20 18:42:32 +090055inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
56 const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
57 if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
58 return UNKNOWN_FORMAT;
59}
60
Jean Chalard1059f272011-06-28 20:45:05 +090061inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
62 return dict[(*pos)++];
63}
64
65inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
66 return dict[(*pos)++];
67}
68
69inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
70 const int origin = *pos;
71 const int32_t character = dict[origin];
72 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
73 if (character == CHARACTER_ARRAY_TERMINATOR) {
74 *pos = origin + 1;
75 return NOT_A_CHARACTER;
76 } else {
77 *pos = origin + 3;
78 const int32_t char_1 = character << 16;
79 const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
80 return char_2 + dict[origin + 2];
81 }
82 } else {
83 *pos = origin + 1;
84 return character;
85 }
86}
87
88inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
89 const int pos) {
90 return dict[pos];
91}
92
93inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
94 int currentPos = pos;
95 int32_t character = dict[currentPos++];
96 while (CHARACTER_ARRAY_TERMINATOR != character) {
97 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
98 currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
99 }
100 character = dict[currentPos++];
101 }
102 return currentPos;
103}
104
105static inline int attributeAddressSize(const uint8_t flags) {
106 static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
107 return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
108 /* Note: this is a value-dependant optimization of what may probably be
109 more readably written this way:
110 switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
111 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
112 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
113 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
114 default: return 0;
115 }
116 */
117}
118
119inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
120 int currentPos = pos;
121 uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
122 while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
123 currentPos += attributeAddressSize(flags);
124 flags = getFlagsAndForwardPointer(dict, &currentPos);
125 }
126 currentPos += attributeAddressSize(flags);
127 return currentPos;
128}
129
130static inline int childrenAddressSize(const uint8_t flags) {
131 static const int CHILDREN_ADDRESS_SHIFT = 6;
132 return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
133 /* See the note in attributeAddressSize. The same applies here */
134}
135
136inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
137 return pos + childrenAddressSize(flags);
138}
139
140inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
141 return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
142}
143
144inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
145 const int pos) {
146 // This function skips all attributes. The format makes provision for future extension
147 // with other attributes (notably shortcuts) but for the time being, bigrams are the
148 // only attributes that may be found in a character group, so we only look at bigrams
149 // in this version.
150 if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
151 return skipAttributes(dict, pos);
152 } else {
153 return pos;
154 }
155}
156
157inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
158 const uint8_t flags, const int pos) {
159 int currentPos = pos;
160 currentPos = skipChildrenPosition(flags, currentPos);
161 currentPos = skipAllAttributes(dict, flags, currentPos);
162 return currentPos;
163}
164
165inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
166 const int pos) {
167 int offset = 0;
168 switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
169 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
170 offset = dict[pos];
171 break;
172 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
173 offset = dict[pos] << 8;
174 offset += dict[pos + 1];
175 break;
176 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
177 offset = dict[pos] << 16;
178 offset += dict[pos + 1] << 8;
179 offset += dict[pos + 2];
180 break;
181 default:
182 // If we come here, it means we asked for the children of a word with
183 // no children.
184 return -1;
185 }
186 return pos + offset;
187}
188
189inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
190 return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
191 != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
192}
193
194inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
195 const uint8_t flags, int *pos) {
196 int offset = 0;
197 const int origin = *pos;
198 switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
199 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
200 offset = dict[origin];
201 *pos = origin + 1;
202 break;
203 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
204 offset = dict[origin] << 8;
205 offset += dict[origin + 1];
206 *pos = origin + 2;
207 break;
208 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
209 offset = dict[origin] << 16;
210 offset += dict[origin + 1] << 8;
211 offset += dict[origin + 2];
212 *pos = origin + 3;
213 break;
214 }
215 if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
216 return origin - offset;
217 } else {
218 return origin + offset;
219 }
220}
221
Jean Chalard6a0e9642011-07-25 18:17:11 +0900222// This function gets the byte position of the last chargroup of the exact matching word in the
223// dictionary. If no match is found, it returns NOT_VALID_WORD.
224inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
225 const uint16_t* const inWord, const int length) {
226 int pos = 0;
227 int wordPos = 0;
228
229 while (true) {
230 // If we already traversed the tree further than the word is long, there means
231 // there was no match (or we would have found it).
232 if (wordPos > length) return NOT_VALID_WORD;
233 int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
234 const uint16_t wChar = inWord[wordPos];
235 while (true) {
236 // If there are no more character groups in this node, it means we could not
237 // find a matching character for this depth, therefore there is no match.
238 if (0 >= charGroupCount) return NOT_VALID_WORD;
239 const int charGroupPos = pos;
240 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
241 int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
242 if (character == wChar) {
243 // This is the correct node. Only one character group may start with the same
244 // char within a node, so either we found our match in this node, or there is
245 // no match and we can return NOT_VALID_WORD. So we will check all the characters
246 // in this character group indeed does match.
247 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
248 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
249 while (NOT_A_CHARACTER != character) {
250 ++wordPos;
251 // If we shoot the length of the word we search for, or if we find a single
252 // character that does not match, as explained above, it means the word is
253 // not in the dictionary (by virtue of this chargroup being the only one to
254 // match the word on the first character, but not matching the whole word).
255 if (wordPos > length) return NOT_VALID_WORD;
256 if (inWord[wordPos] != character) return NOT_VALID_WORD;
257 character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
258 }
259 }
260 // If we come here we know that so far, we do match. Either we are on a terminal
261 // and we match the length, in which case we found it, or we traverse children.
262 // If we don't match the length AND don't have children, then a word in the
263 // dictionary fully matches a prefix of the searched word but not the full word.
264 ++wordPos;
265 if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
266 if (wordPos == length) {
267 return charGroupPos;
268 }
269 pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
270 }
271 if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
272 == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
273 return NOT_VALID_WORD;
274 }
275 // We have children and we are still shorter than the word we are searching for, so
276 // we need to traverse children. Put the pointer on the children position, and
277 // break
278 pos = BinaryFormat::readChildrenPosition(root, flags, pos);
279 break;
280 } else {
281 // This chargroup does not match, so skip the remaining part and go to the next.
282 if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
283 pos = BinaryFormat::skipOtherCharacters(root, pos);
284 }
285 pos = BinaryFormat::skipFrequency(flags, pos);
286 pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
287 }
288 --charGroupCount;
289 }
290 }
291}
292
Jean Chalard1059f272011-06-28 20:45:05 +0900293} // namespace latinime
294
295#endif // LATINIME_BINARY_FORMAT_H