Jean Chalard | 1059f27 | 2011-06-28 20:45:05 +0900 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2011 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #ifndef LATINIME_BINARY_FORMAT_H |
| 18 | #define LATINIME_BINARY_FORMAT_H |
| 19 | |
Jean Chalard | f0a9809 | 2011-07-20 18:42:32 +0900 | [diff] [blame] | 20 | #include "unigram_dictionary.h" |
| 21 | |
Jean Chalard | 1059f27 | 2011-06-28 20:45:05 +0900 | [diff] [blame] | 22 | namespace latinime { |
| 23 | |
| 24 | class BinaryFormat { |
| 25 | private: |
| 26 | const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; |
| 27 | const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; |
| 28 | const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; |
| 29 | |
| 30 | public: |
Jean Chalard | f0a9809 | 2011-07-20 18:42:32 +0900 | [diff] [blame] | 31 | const static int UNKNOWN_FORMAT = -1; |
| 32 | const static int FORMAT_VERSION_1 = 1; |
| 33 | const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; |
| 34 | |
| 35 | static int detectFormat(const uint8_t* const dict); |
Jean Chalard | 1059f27 | 2011-06-28 20:45:05 +0900 | [diff] [blame] | 36 | static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); |
| 37 | static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); |
| 38 | static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); |
| 39 | static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos); |
| 40 | static int skipOtherCharacters(const uint8_t* const dict, const int pos); |
| 41 | static int skipAttributes(const uint8_t* const dict, const int pos); |
| 42 | static int skipChildrenPosition(const uint8_t flags, const int pos); |
| 43 | static int skipFrequency(const uint8_t flags, const int pos); |
| 44 | static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); |
| 45 | static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags, |
| 46 | const int pos); |
| 47 | static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos); |
| 48 | static bool hasChildrenInFlags(const uint8_t flags); |
| 49 | static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags, |
| 50 | int *pos); |
Jean Chalard | 6a0e964 | 2011-07-25 18:17:11 +0900 | [diff] [blame] | 51 | static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord, |
| 52 | const int length); |
Jean Chalard | 588e2f2 | 2011-07-25 14:03:19 +0900 | [diff] [blame] | 53 | static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, |
| 54 | uint16_t* outWord); |
Jean Chalard | 1059f27 | 2011-06-28 20:45:05 +0900 | [diff] [blame] | 55 | }; |
| 56 | |
Jean Chalard | f0a9809 | 2011-07-20 18:42:32 +0900 | [diff] [blame] | 57 | inline int BinaryFormat::detectFormat(const uint8_t* const dict) { |
| 58 | const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian |
| 59 | if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1; |
| 60 | return UNKNOWN_FORMAT; |
| 61 | } |
| 62 | |
Jean Chalard | 1059f27 | 2011-06-28 20:45:05 +0900 | [diff] [blame] | 63 | inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { |
| 64 | return dict[(*pos)++]; |
| 65 | } |
| 66 | |
| 67 | inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { |
| 68 | return dict[(*pos)++]; |
| 69 | } |
| 70 | |
| 71 | inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) { |
| 72 | const int origin = *pos; |
| 73 | const int32_t character = dict[origin]; |
| 74 | if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { |
| 75 | if (character == CHARACTER_ARRAY_TERMINATOR) { |
| 76 | *pos = origin + 1; |
| 77 | return NOT_A_CHARACTER; |
| 78 | } else { |
| 79 | *pos = origin + 3; |
| 80 | const int32_t char_1 = character << 16; |
| 81 | const int32_t char_2 = char_1 + (dict[origin + 1] << 8); |
| 82 | return char_2 + dict[origin + 2]; |
| 83 | } |
| 84 | } else { |
| 85 | *pos = origin + 1; |
| 86 | return character; |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict, |
| 91 | const int pos) { |
| 92 | return dict[pos]; |
| 93 | } |
| 94 | |
| 95 | inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) { |
| 96 | int currentPos = pos; |
| 97 | int32_t character = dict[currentPos++]; |
| 98 | while (CHARACTER_ARRAY_TERMINATOR != character) { |
| 99 | if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { |
| 100 | currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE; |
| 101 | } |
| 102 | character = dict[currentPos++]; |
| 103 | } |
| 104 | return currentPos; |
| 105 | } |
| 106 | |
| 107 | static inline int attributeAddressSize(const uint8_t flags) { |
| 108 | static const int ATTRIBUTE_ADDRESS_SHIFT = 4; |
| 109 | return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; |
| 110 | /* Note: this is a value-dependant optimization of what may probably be |
| 111 | more readably written this way: |
| 112 | switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { |
| 113 | case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; |
| 114 | case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; |
| 115 | case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; |
| 116 | default: return 0; |
| 117 | } |
| 118 | */ |
| 119 | } |
| 120 | |
| 121 | inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) { |
| 122 | int currentPos = pos; |
| 123 | uint8_t flags = getFlagsAndForwardPointer(dict, ¤tPos); |
| 124 | while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { |
| 125 | currentPos += attributeAddressSize(flags); |
| 126 | flags = getFlagsAndForwardPointer(dict, ¤tPos); |
| 127 | } |
| 128 | currentPos += attributeAddressSize(flags); |
| 129 | return currentPos; |
| 130 | } |
| 131 | |
| 132 | static inline int childrenAddressSize(const uint8_t flags) { |
| 133 | static const int CHILDREN_ADDRESS_SHIFT = 6; |
| 134 | return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; |
| 135 | /* See the note in attributeAddressSize. The same applies here */ |
| 136 | } |
| 137 | |
| 138 | inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) { |
| 139 | return pos + childrenAddressSize(flags); |
| 140 | } |
| 141 | |
| 142 | inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { |
| 143 | return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; |
| 144 | } |
| 145 | |
| 146 | inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, |
| 147 | const int pos) { |
| 148 | // This function skips all attributes. The format makes provision for future extension |
| 149 | // with other attributes (notably shortcuts) but for the time being, bigrams are the |
| 150 | // only attributes that may be found in a character group, so we only look at bigrams |
| 151 | // in this version. |
| 152 | if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { |
| 153 | return skipAttributes(dict, pos); |
| 154 | } else { |
| 155 | return pos; |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, |
| 160 | const uint8_t flags, const int pos) { |
| 161 | int currentPos = pos; |
| 162 | currentPos = skipChildrenPosition(flags, currentPos); |
| 163 | currentPos = skipAllAttributes(dict, flags, currentPos); |
| 164 | return currentPos; |
| 165 | } |
| 166 | |
| 167 | inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags, |
| 168 | const int pos) { |
| 169 | int offset = 0; |
| 170 | switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { |
| 171 | case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: |
| 172 | offset = dict[pos]; |
| 173 | break; |
| 174 | case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: |
| 175 | offset = dict[pos] << 8; |
| 176 | offset += dict[pos + 1]; |
| 177 | break; |
| 178 | case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: |
| 179 | offset = dict[pos] << 16; |
| 180 | offset += dict[pos + 1] << 8; |
| 181 | offset += dict[pos + 2]; |
| 182 | break; |
| 183 | default: |
| 184 | // If we come here, it means we asked for the children of a word with |
| 185 | // no children. |
| 186 | return -1; |
| 187 | } |
| 188 | return pos + offset; |
| 189 | } |
| 190 | |
| 191 | inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { |
| 192 | return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS |
| 193 | != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); |
| 194 | } |
| 195 | |
| 196 | inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict, |
| 197 | const uint8_t flags, int *pos) { |
| 198 | int offset = 0; |
| 199 | const int origin = *pos; |
| 200 | switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { |
| 201 | case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: |
| 202 | offset = dict[origin]; |
| 203 | *pos = origin + 1; |
| 204 | break; |
| 205 | case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: |
| 206 | offset = dict[origin] << 8; |
| 207 | offset += dict[origin + 1]; |
| 208 | *pos = origin + 2; |
| 209 | break; |
| 210 | case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: |
| 211 | offset = dict[origin] << 16; |
| 212 | offset += dict[origin + 1] << 8; |
| 213 | offset += dict[origin + 2]; |
| 214 | *pos = origin + 3; |
| 215 | break; |
| 216 | } |
| 217 | if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { |
| 218 | return origin - offset; |
| 219 | } else { |
| 220 | return origin + offset; |
| 221 | } |
| 222 | } |
| 223 | |
Jean Chalard | 6a0e964 | 2011-07-25 18:17:11 +0900 | [diff] [blame] | 224 | // This function gets the byte position of the last chargroup of the exact matching word in the |
| 225 | // dictionary. If no match is found, it returns NOT_VALID_WORD. |
| 226 | inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, |
| 227 | const uint16_t* const inWord, const int length) { |
| 228 | int pos = 0; |
| 229 | int wordPos = 0; |
| 230 | |
| 231 | while (true) { |
| 232 | // If we already traversed the tree further than the word is long, there means |
| 233 | // there was no match (or we would have found it). |
| 234 | if (wordPos > length) return NOT_VALID_WORD; |
| 235 | int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); |
| 236 | const uint16_t wChar = inWord[wordPos]; |
| 237 | while (true) { |
| 238 | // If there are no more character groups in this node, it means we could not |
| 239 | // find a matching character for this depth, therefore there is no match. |
| 240 | if (0 >= charGroupCount) return NOT_VALID_WORD; |
| 241 | const int charGroupPos = pos; |
| 242 | const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); |
| 243 | int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); |
| 244 | if (character == wChar) { |
| 245 | // This is the correct node. Only one character group may start with the same |
| 246 | // char within a node, so either we found our match in this node, or there is |
| 247 | // no match and we can return NOT_VALID_WORD. So we will check all the characters |
| 248 | // in this character group indeed does match. |
| 249 | if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { |
| 250 | character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); |
| 251 | while (NOT_A_CHARACTER != character) { |
| 252 | ++wordPos; |
| 253 | // If we shoot the length of the word we search for, or if we find a single |
| 254 | // character that does not match, as explained above, it means the word is |
| 255 | // not in the dictionary (by virtue of this chargroup being the only one to |
| 256 | // match the word on the first character, but not matching the whole word). |
| 257 | if (wordPos > length) return NOT_VALID_WORD; |
| 258 | if (inWord[wordPos] != character) return NOT_VALID_WORD; |
| 259 | character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); |
| 260 | } |
| 261 | } |
| 262 | // If we come here we know that so far, we do match. Either we are on a terminal |
| 263 | // and we match the length, in which case we found it, or we traverse children. |
| 264 | // If we don't match the length AND don't have children, then a word in the |
| 265 | // dictionary fully matches a prefix of the searched word but not the full word. |
| 266 | ++wordPos; |
| 267 | if (UnigramDictionary::FLAG_IS_TERMINAL & flags) { |
| 268 | if (wordPos == length) { |
| 269 | return charGroupPos; |
| 270 | } |
| 271 | pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos); |
| 272 | } |
| 273 | if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS |
| 274 | == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) { |
| 275 | return NOT_VALID_WORD; |
| 276 | } |
| 277 | // We have children and we are still shorter than the word we are searching for, so |
| 278 | // we need to traverse children. Put the pointer on the children position, and |
| 279 | // break |
| 280 | pos = BinaryFormat::readChildrenPosition(root, flags, pos); |
| 281 | break; |
| 282 | } else { |
| 283 | // This chargroup does not match, so skip the remaining part and go to the next. |
| 284 | if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { |
| 285 | pos = BinaryFormat::skipOtherCharacters(root, pos); |
| 286 | } |
| 287 | pos = BinaryFormat::skipFrequency(flags, pos); |
| 288 | pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); |
| 289 | } |
| 290 | --charGroupCount; |
| 291 | } |
| 292 | } |
| 293 | } |
| 294 | |
Jean Chalard | 588e2f2 | 2011-07-25 14:03:19 +0900 | [diff] [blame] | 295 | // This function searches for a terminal in the dictionary by its address. |
| 296 | // Due to the fact that words are ordered in the dictionary in a strict breadth-first order, |
| 297 | // it is possible to check for this with advantageous complexity. For each node, we search |
| 298 | // for groups with children and compare the children address with the address we look for. |
| 299 | // When we shoot the address we look for, it means the word we look for is in the children |
| 300 | // of the previous group. The only tricky part is the fact that if we arrive at the end of a |
| 301 | // node with the last group's children address still less than what we are searching for, we |
| 302 | // must descend the last group's children (for example, if the word we are searching for starts |
| 303 | // with a z, it's the last group of the root node, so all children addresses will be smaller |
| 304 | // than the address we look for, and we have to descend the z node). |
| 305 | /* Parameters : |
| 306 | * root: the dictionary buffer |
| 307 | * address: the byte position of the last chargroup of the word we are searching for (this is |
| 308 | * what is stored as the "bigram address" in each bigram) |
| 309 | * outword: an array to write the found word, with MAX_WORD_LENGTH size. |
| 310 | * Return value : the length of the word, of 0 if the word was not found. |
| 311 | */ |
| 312 | inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, |
| 313 | const int maxDepth, uint16_t* outWord) { |
| 314 | int pos = 0; |
| 315 | int wordPos = 0; |
| 316 | |
| 317 | // One iteration of the outer loop iterates through nodes. As stated above, we will only |
| 318 | // traverse nodes that are actually a part of the terminal we are searching, so each time |
| 319 | // we enter this loop we are one depth level further than last time. |
| 320 | // The only reason we count nodes is because we want to reduce the probability of infinite |
| 321 | // looping in case there is a bug. Since we know there is an upper bound to the depth we are |
| 322 | // supposed to traverse, it does not hurt to count iterations. |
| 323 | for (int loopCount = maxDepth; loopCount > 0; --loopCount) { |
| 324 | int lastCandidateGroupPos = 0; |
| 325 | // Let's loop through char groups in this node searching for either the terminal |
| 326 | // or one of its ascendants. |
| 327 | for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0; |
| 328 | --charGroupCount) { |
| 329 | const int startPos = pos; |
| 330 | const uint8_t flags = getFlagsAndForwardPointer(root, &pos); |
| 331 | const int32_t character = getCharCodeAndForwardPointer(root, &pos); |
| 332 | if (address == startPos) { |
| 333 | // We found the address. Copy the rest of the word in the buffer and return |
| 334 | // the length. |
| 335 | outWord[wordPos] = character; |
| 336 | if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { |
| 337 | int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); |
| 338 | // We count chars in order to avoid infinite loops if the file is broken or |
| 339 | // if there is some other bug |
| 340 | int charCount = maxDepth; |
| 341 | while (-1 != nextChar && --charCount > 0) { |
| 342 | outWord[++wordPos] = nextChar; |
| 343 | nextChar = getCharCodeAndForwardPointer(root, &pos); |
| 344 | } |
| 345 | } |
| 346 | return ++wordPos; |
| 347 | } |
| 348 | // We need to skip past this char group, so skip any remaining chars after the |
| 349 | // first and possibly the frequency. |
| 350 | if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { |
| 351 | pos = skipOtherCharacters(root, pos); |
| 352 | } |
| 353 | pos = skipFrequency(flags, pos); |
| 354 | |
| 355 | // The fact that this group has children is very important. Since we already know |
| 356 | // that this group does not match, if it has no children we know it is irrelevant |
| 357 | // to what we are searching for. |
| 358 | const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != |
| 359 | (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); |
| 360 | // We will write in `found' whether we have passed the children address we are |
| 361 | // searching for. For example if we search for "beer", the children of b are less |
| 362 | // than the address we are searching for and the children of c are greater. When we |
| 363 | // come here for c, we realize this is too big, and that we should descend b. |
| 364 | bool found; |
| 365 | if (hasChildren) { |
| 366 | // Here comes the tricky part. First, read the children position. |
| 367 | const int childrenPos = readChildrenPosition(root, flags, pos); |
| 368 | if (childrenPos > address) { |
| 369 | // If the children pos is greater than address, it means the previous chargroup, |
| 370 | // which address is stored in lastCandidateGroupPos, was the right one. |
| 371 | found = true; |
| 372 | } else if (1 >= charGroupCount) { |
| 373 | // However if we are on the LAST group of this node, and we have NOT shot the |
| 374 | // address we should descend THIS node. So we trick the lastCandidateGroupPos |
| 375 | // so that we will descend this node, not the previous one. |
| 376 | lastCandidateGroupPos = startPos; |
| 377 | found = true; |
| 378 | } else { |
| 379 | // Else, we should continue looking. |
| 380 | found = false; |
| 381 | } |
| 382 | } else { |
| 383 | // Even if we don't have children here, we could still be on the last group of this |
| 384 | // node. If this is the case, we should descend the last group that had children, |
| 385 | // and their address is already in lastCandidateGroup. |
| 386 | found = (1 >= charGroupCount); |
| 387 | } |
| 388 | |
| 389 | if (found) { |
| 390 | // Okay, we found the group we should descend. Its address is in |
| 391 | // the lastCandidateGroupPos variable, so we just re-read it. |
| 392 | if (0 != lastCandidateGroupPos) { |
| 393 | const uint8_t lastFlags = |
| 394 | getFlagsAndForwardPointer(root, &lastCandidateGroupPos); |
| 395 | const int32_t lastChar = |
| 396 | getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); |
| 397 | // We copy all the characters in this group to the buffer |
| 398 | outWord[wordPos] = lastChar; |
| 399 | if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { |
| 400 | int32_t nextChar = |
| 401 | getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); |
| 402 | int charCount = maxDepth; |
| 403 | while (-1 != nextChar && --charCount > 0) { |
| 404 | outWord[++wordPos] = nextChar; |
| 405 | nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); |
| 406 | } |
| 407 | } |
| 408 | ++wordPos; |
| 409 | // Now we only need to branch to the children address. Skip the frequency if |
| 410 | // it's there, read pos, and break to resume the search at pos. |
| 411 | lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); |
| 412 | pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); |
| 413 | break; |
| 414 | } else { |
| 415 | // Here is a little tricky part: we come here if we found out that all children |
| 416 | // addresses in this group are bigger than the address we are searching for. |
| 417 | // Should we conclude the word is not in the dictionary? No! It could still be |
| 418 | // one of the remaining chargroups in this node, so we have to keep looking in |
| 419 | // this node until we find it (or we realize it's not there either, in which |
| 420 | // case it's actually not in the dictionary). Pass the end of this group, ready |
| 421 | // to start the next one. |
| 422 | pos = skipChildrenPosAndAttributes(root, flags, pos); |
| 423 | } |
| 424 | } else { |
| 425 | // If we did not find it, we should record the last children address for the next |
| 426 | // iteration. |
| 427 | if (hasChildren) lastCandidateGroupPos = startPos; |
| 428 | // Now skip the end of this group (children pos and the attributes if any) so that |
| 429 | // our pos is after the end of this char group, at the start of the next one. |
| 430 | pos = skipChildrenPosAndAttributes(root, flags, pos); |
| 431 | } |
| 432 | |
| 433 | } |
| 434 | } |
| 435 | // If we have looked through all the chargroups and found no match, the address is |
| 436 | // not the address of a terminal in this dictionary. |
| 437 | return 0; |
| 438 | } |
| 439 | |
Jean Chalard | 1059f27 | 2011-06-28 20:45:05 +0900 | [diff] [blame] | 440 | } // namespace latinime |
| 441 | |
| 442 | #endif // LATINIME_BINARY_FORMAT_H |