Blame - native/src/binary_format.h - android_packages_inputmethods_LatinIME

blob: a946b1ee3951ca3c68c6539e18d0a7eff17f9760 [file] [log] [blame]

Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef LATINIME_BINARY_FORMAT_H
				18	#define LATINIME_BINARY_FORMAT_H
				19
Jean Chalard	f0a9809	2011-07-20 18:42:32 +0900	[diff] [blame]	20	#include "unigram_dictionary.h"
				21
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	22	namespace latinime {
				23
				24	class BinaryFormat {
				25	private:
				26	const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
				27	const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
				28	const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
				29
				30	public:
Jean Chalard	f0a9809	2011-07-20 18:42:32 +0900	[diff] [blame]	31	const static int UNKNOWN_FORMAT = -1;
				32	const static int FORMAT_VERSION_1 = 1;
				33	const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
				34
				35	static int detectFormat(const uint8_t* const dict);
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	36	static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
				37	static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
				38	static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
				39	static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
				40	static int skipOtherCharacters(const uint8_t* const dict, const int pos);
				41	static int skipAttributes(const uint8_t* const dict, const int pos);
				42	static int skipChildrenPosition(const uint8_t flags, const int pos);
				43	static int skipFrequency(const uint8_t flags, const int pos);
				44	static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
				45	static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
				46	const int pos);
				47	static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
				48	static bool hasChildrenInFlags(const uint8_t flags);
				49	static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
				50	int *pos);
Jean Chalard	6a0e964	2011-07-25 18:17:11 +0900	[diff] [blame]	51	static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
				52	const int length);
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	53	};
				54
Jean Chalard	f0a9809	2011-07-20 18:42:32 +0900	[diff] [blame]	55	inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
				56	const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
				57	if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
				58	return UNKNOWN_FORMAT;
				59	}
				60
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	61	inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
				62	return dict[(*pos)++];
				63	}
				64
				65	inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
				66	return dict[(*pos)++];
				67	}
				68
				69	inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
				70	const int origin = *pos;
				71	const int32_t character = dict[origin];
				72	if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
				73	if (character == CHARACTER_ARRAY_TERMINATOR) {
				74	*pos = origin + 1;
				75	return NOT_A_CHARACTER;
				76	} else {
				77	*pos = origin + 3;
				78	const int32_t char_1 = character << 16;
				79	const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
				80	return char_2 + dict[origin + 2];
				81	}
				82	} else {
				83	*pos = origin + 1;
				84	return character;
				85	}
				86	}
				87
				88	inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
				89	const int pos) {
				90	return dict[pos];
				91	}
				92
				93	inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
				94	int currentPos = pos;
				95	int32_t character = dict[currentPos++];
				96	while (CHARACTER_ARRAY_TERMINATOR != character) {
				97	if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
				98	currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
				99	}
				100	character = dict[currentPos++];
				101	}
				102	return currentPos;
				103	}
				104
				105	static inline int attributeAddressSize(const uint8_t flags) {
				106	static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
				107	return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
				108	/* Note: this is a value-dependant optimization of what may probably be
				109	more readably written this way:
				110	switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
				111	case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
				112	case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
				113	case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
				114	default: return 0;
				115	}
				116	*/
				117	}
				118
				119	inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
				120	int currentPos = pos;
				121	uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
				122	while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
				123	currentPos += attributeAddressSize(flags);
				124	flags = getFlagsAndForwardPointer(dict, &currentPos);
				125	}
				126	currentPos += attributeAddressSize(flags);
				127	return currentPos;
				128	}
				129
				130	static inline int childrenAddressSize(const uint8_t flags) {
				131	static const int CHILDREN_ADDRESS_SHIFT = 6;
				132	return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
				133	/* See the note in attributeAddressSize. The same applies here */
				134	}
				135
				136	inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
				137	return pos + childrenAddressSize(flags);
				138	}
				139
				140	inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
				141	return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
				142	}
				143
				144	inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
				145	const int pos) {
				146	// This function skips all attributes. The format makes provision for future extension
				147	// with other attributes (notably shortcuts) but for the time being, bigrams are the
				148	// only attributes that may be found in a character group, so we only look at bigrams
				149	// in this version.
				150	if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
				151	return skipAttributes(dict, pos);
				152	} else {
				153	return pos;
				154	}
				155	}
				156
				157	inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
				158	const uint8_t flags, const int pos) {
				159	int currentPos = pos;
				160	currentPos = skipChildrenPosition(flags, currentPos);
				161	currentPos = skipAllAttributes(dict, flags, currentPos);
				162	return currentPos;
				163	}
				164
				165	inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
				166	const int pos) {
				167	int offset = 0;
				168	switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
				169	case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
				170	offset = dict[pos];
				171	break;
				172	case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
				173	offset = dict[pos] << 8;
				174	offset += dict[pos + 1];
				175	break;
				176	case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
				177	offset = dict[pos] << 16;
				178	offset += dict[pos + 1] << 8;
				179	offset += dict[pos + 2];
				180	break;
				181	default:
				182	// If we come here, it means we asked for the children of a word with
				183	// no children.
				184	return -1;
				185	}
				186	return pos + offset;
				187	}
				188
				189	inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
				190	return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
				191	!= (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
				192	}
				193
				194	inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
				195	const uint8_t flags, int *pos) {
				196	int offset = 0;
				197	const int origin = *pos;
				198	switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
				199	case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
				200	offset = dict[origin];
				201	*pos = origin + 1;
				202	break;
				203	case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
				204	offset = dict[origin] << 8;
				205	offset += dict[origin + 1];
				206	*pos = origin + 2;
				207	break;
				208	case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
				209	offset = dict[origin] << 16;
				210	offset += dict[origin + 1] << 8;
				211	offset += dict[origin + 2];
				212	*pos = origin + 3;
				213	break;
				214	}
				215	if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
				216	return origin - offset;
				217	} else {
				218	return origin + offset;
				219	}
				220	}
				221
Jean Chalard	6a0e964	2011-07-25 18:17:11 +0900	[diff] [blame]	222	// This function gets the byte position of the last chargroup of the exact matching word in the
				223	// dictionary. If no match is found, it returns NOT_VALID_WORD.
				224	inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
				225	const uint16_t* const inWord, const int length) {
				226	int pos = 0;
				227	int wordPos = 0;
				228
				229	while (true) {
				230	// If we already traversed the tree further than the word is long, there means
				231	// there was no match (or we would have found it).
				232	if (wordPos > length) return NOT_VALID_WORD;
				233	int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
				234	const uint16_t wChar = inWord[wordPos];
				235	while (true) {
				236	// If there are no more character groups in this node, it means we could not
				237	// find a matching character for this depth, therefore there is no match.
				238	if (0 >= charGroupCount) return NOT_VALID_WORD;
				239	const int charGroupPos = pos;
				240	const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
				241	int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
				242	if (character == wChar) {
				243	// This is the correct node. Only one character group may start with the same
				244	// char within a node, so either we found our match in this node, or there is
				245	// no match and we can return NOT_VALID_WORD. So we will check all the characters
				246	// in this character group indeed does match.
				247	if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
				248	character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
				249	while (NOT_A_CHARACTER != character) {
				250	++wordPos;
				251	// If we shoot the length of the word we search for, or if we find a single
				252	// character that does not match, as explained above, it means the word is
				253	// not in the dictionary (by virtue of this chargroup being the only one to
				254	// match the word on the first character, but not matching the whole word).
				255	if (wordPos > length) return NOT_VALID_WORD;
				256	if (inWord[wordPos] != character) return NOT_VALID_WORD;
				257	character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
				258	}
				259	}
				260	// If we come here we know that so far, we do match. Either we are on a terminal
				261	// and we match the length, in which case we found it, or we traverse children.
				262	// If we don't match the length AND don't have children, then a word in the
				263	// dictionary fully matches a prefix of the searched word but not the full word.
				264	++wordPos;
				265	if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
				266	if (wordPos == length) {
				267	return charGroupPos;
				268	}
				269	pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
				270	}
				271	if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
				272	== (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
				273	return NOT_VALID_WORD;
				274	}
				275	// We have children and we are still shorter than the word we are searching for, so
				276	// we need to traverse children. Put the pointer on the children position, and
				277	// break
				278	pos = BinaryFormat::readChildrenPosition(root, flags, pos);
				279	break;
				280	} else {
				281	// This chargroup does not match, so skip the remaining part and go to the next.
				282	if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
				283	pos = BinaryFormat::skipOtherCharacters(root, pos);
				284	}
				285	pos = BinaryFormat::skipFrequency(flags, pos);
				286	pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
				287	}
				288	--charGroupCount;
				289	}
				290	}
				291	}
				292
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	293	} // namespace latinime
				294
				295	#endif // LATINIME_BINARY_FORMAT_H