blob: 7deec27d39560b21ab0c0978d60f166e13410df5 [file] [log] [blame]
Jean Chalard1059f272011-06-28 20:45:05 +09001/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_BINARY_FORMAT_H
18#define LATINIME_BINARY_FORMAT_H
19
Jean Chalardf0a98092011-07-20 18:42:32 +090020#include "unigram_dictionary.h"
21
Jean Chalard1059f272011-06-28 20:45:05 +090022namespace latinime {
23
/**
 * Collection of static helpers that decode pieces of a binary dictionary.
 *
 * Every helper receives the dictionary as a byte array and a position in it.
 * Helpers named "...AndForwardPointer" take the position by pointer and advance
 * it past what they read; the others take the position by value and return
 * either the decoded value or the position that follows the skipped data.
 */
class BinaryFormat {
private:
    // A first byte at or above this value is a complete one-byte character.
    static const int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    // Byte value that closes a character array.
    static const int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
    // Number of bytes that follow the first byte of a multi-byte character.
    static const int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;

public:
    // Values returned by detectFormat().
    static const int UNKNOWN_FORMAT = -1;
    static const int FORMAT_VERSION_1 = 1;
    // Big-endian value of the first two bytes of a version 1 dictionary.
    static const uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;

    // Returns FORMAT_VERSION_1 when dict starts with the version 1 magic number,
    // UNKNOWN_FORMAT otherwise.
    static int detectFormat(const uint8_t* const dict);

    // Returns the byte at *pos as the group count and advances *pos by one.
    static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);

    // Returns the byte at *pos as the flags and advances *pos by one.
    static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);

    // Decodes the one-byte or three-byte character at *pos and advances *pos past it.
    // Returns NOT_A_CHARACTER when the byte at *pos is the array terminator.
    static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);

    // Returns the byte at pos as the frequency.
    static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);

    // Returns the position right after the next character array terminator.
    static int skipOtherCharacters(const uint8_t* const dict, const int pos);

    // Returns the position right after the attribute list that starts at pos.
    static int skipAttributes(const uint8_t* const dict, const int pos);

    // Returns pos moved past the children address whose size is encoded in flags.
    static int skipChildrenPosition(const uint8_t flags, const int pos);

    // Returns pos + 1 when flags mark the group as terminal, pos otherwise.
    static int skipFrequency(const uint8_t flags, const int pos);

    // Returns the position after the attributes when flags announce bigrams,
    // pos otherwise.
    static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);

    // Skips the children address, then all attributes, and returns the resulting position.
    static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
            const int pos);

    // Returns pos plus the children offset stored at pos, or -1 when flags do not
    // select a one-, two- or three-byte children address.
    static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);

    // Tells whether flags carry a children address type other than "no address".
    static bool hasChildrenInFlags(const uint8_t flags);

    // Reads the attribute offset stored at *pos, advances *pos past it and returns
    // the address obtained by applying the offset to the initial value of *pos.
    static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
            int *pos);
};
52
Jean Chalardf0a98092011-07-20 18:42:32 +090053inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
54 const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
55 if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
56 return UNKNOWN_FORMAT;
57}
58
Jean Chalard1059f272011-06-28 20:45:05 +090059inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
60 return dict[(*pos)++];
61}
62
63inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
64 return dict[(*pos)++];
65}
66
67inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
68 const int origin = *pos;
69 const int32_t character = dict[origin];
70 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
71 if (character == CHARACTER_ARRAY_TERMINATOR) {
72 *pos = origin + 1;
73 return NOT_A_CHARACTER;
74 } else {
75 *pos = origin + 3;
76 const int32_t char_1 = character << 16;
77 const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
78 return char_2 + dict[origin + 2];
79 }
80 } else {
81 *pos = origin + 1;
82 return character;
83 }
84}
85
86inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
87 const int pos) {
88 return dict[pos];
89}
90
91inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
92 int currentPos = pos;
93 int32_t character = dict[currentPos++];
94 while (CHARACTER_ARRAY_TERMINATOR != character) {
95 if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
96 currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
97 }
98 character = dict[currentPos++];
99 }
100 return currentPos;
101}
102
103static inline int attributeAddressSize(const uint8_t flags) {
104 static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
105 return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
106 /* Note: this is a value-dependant optimization of what may probably be
107 more readably written this way:
108 switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
109 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
110 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
111 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
112 default: return 0;
113 }
114 */
115}
116
117inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
118 int currentPos = pos;
119 uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
120 while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
121 currentPos += attributeAddressSize(flags);
122 flags = getFlagsAndForwardPointer(dict, &currentPos);
123 }
124 currentPos += attributeAddressSize(flags);
125 return currentPos;
126}
127
128static inline int childrenAddressSize(const uint8_t flags) {
129 static const int CHILDREN_ADDRESS_SHIFT = 6;
130 return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
131 /* See the note in attributeAddressSize. The same applies here */
132}
133
134inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
135 return pos + childrenAddressSize(flags);
136}
137
138inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
139 return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
140}
141
142inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
143 const int pos) {
144 // This function skips all attributes. The format makes provision for future extension
145 // with other attributes (notably shortcuts) but for the time being, bigrams are the
146 // only attributes that may be found in a character group, so we only look at bigrams
147 // in this version.
148 if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
149 return skipAttributes(dict, pos);
150 } else {
151 return pos;
152 }
153}
154
155inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
156 const uint8_t flags, const int pos) {
157 int currentPos = pos;
158 currentPos = skipChildrenPosition(flags, currentPos);
159 currentPos = skipAllAttributes(dict, flags, currentPos);
160 return currentPos;
161}
162
163inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
164 const int pos) {
165 int offset = 0;
166 switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
167 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
168 offset = dict[pos];
169 break;
170 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
171 offset = dict[pos] << 8;
172 offset += dict[pos + 1];
173 break;
174 case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
175 offset = dict[pos] << 16;
176 offset += dict[pos + 1] << 8;
177 offset += dict[pos + 2];
178 break;
179 default:
180 // If we come here, it means we asked for the children of a word with
181 // no children.
182 return -1;
183 }
184 return pos + offset;
185}
186
187inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
188 return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
189 != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
190}
191
192inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
193 const uint8_t flags, int *pos) {
194 int offset = 0;
195 const int origin = *pos;
196 switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
197 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
198 offset = dict[origin];
199 *pos = origin + 1;
200 break;
201 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
202 offset = dict[origin] << 8;
203 offset += dict[origin + 1];
204 *pos = origin + 2;
205 break;
206 case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
207 offset = dict[origin] << 16;
208 offset += dict[origin + 1] << 8;
209 offset += dict[origin + 2];
210 *pos = origin + 3;
211 break;
212 }
213 if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
214 return origin - offset;
215 } else {
216 return origin + offset;
217 }
218}
219
220} // namespace latinime
221
222#endif // LATINIME_BINARY_FORMAT_H