Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

blob: 5dd4c7e219b98b36f9eb73654580d246d485056e [file] [log] [blame]

satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	1	/*
				2	**
				3	** Copyright 2010, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
satok	48e432c	2010-12-06 17:38:58 +0900	[diff] [blame]	18	#include <assert.h>
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	19	#include <string.h>
				20
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	21	#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	22
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	23	#include "char_utils.h"
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	24	#include "dictionary.h"
				25	#include "unigram_dictionary.h"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	26
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	27	#include "binary_format.h"
Jean Chalard	cf9dbbd	2011-12-26 15:16:59 +0900	[diff] [blame^]	28	#include "terminal_attributes.h"
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	29
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	30	namespace latinime {
				31
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	32	const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
				33	{ { 'a', 'e' },
				34	{ 'o', 'e' },
				35	{ 'u', 'e' } };
				36
Jean Chalard	293ece0	2011-06-16 20:55:16 +0900	[diff] [blame]	37	// TODO: check the header
				38	UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	39	int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame]	40	const bool isLatestDictVersion)
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	41	: DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
Jean Chalard	293ece0	2011-06-16 20:55:16 +0900	[diff] [blame]	42	MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	43	MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
				44	TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	45	// TODO : remove this variable.
				46	ROOT_POS(0),
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	47	BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	48	MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	49	if (DEBUG_DICT) {
				50	LOGI("UnigramDictionary - constructor");
				51	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	52	}
				53
satok	2df3060	2011-07-15 13:49:00 +0900	[diff] [blame]	54	UnigramDictionary::~UnigramDictionary() {
satok	2df3060	2011-07-15 13:49:00 +0900	[diff] [blame]	55	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	56
// Returns the number of bytes needed to store codesSize input characters when each
// character occupies MAX_PROXIMITY_CHARS consecutive codes.
//   codes               - only used for the size of one element (never read).
//   codesSize           - number of input characters.
//   MAX_PROXIMITY_CHARS - number of codes stored per input character.
static inline unsigned int getCodesBufferSize(const int *codes, const int codesSize,
        const int MAX_PROXIMITY_CHARS) {
    // sizeof does not evaluate its operand, so codes is not dereferenced at run time.
    return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
}
				61
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	62	// TODO: This needs to take an const unsigned short* and not tinker with its contents
				63	static inline void addWord(
				64	unsigned short word, int length, int frequency, WordsPriorityQueue queue) {
				65	queue->push(frequency, word, length);
				66	}
				67
				68	bool UnigramDictionary::isDigraph(const int *codes, const int i, const int codesSize) const {
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	69
				70	// There can't be a digraph if we don't have at least 2 characters to examine
				71	if (i + 2 > codesSize) return false;
				72
				73	// Search for the first char of some digraph
				74	int lastDigraphIndex = -1;
				75	const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
				76	for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
				77	lastDigraphIndex >= 0; --lastDigraphIndex) {
				78	if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
				79	}
				80	// No match: return early
				81	if (lastDigraphIndex < 0) return false;
				82
				83	// It's an interesting digraph if the second char matches too.
				84	return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
				85	}
				86
				87	// Mostly the same arguments as the non-recursive version, except:
				88	// codes is the original value. It points to the start of the work buffer, and gets passed as is.
				89	// codesSize is the size of the user input (thus, it is the size of codesSrc).
				90	// codesDest is the current point in the work buffer.
				91	// codesSrc is the current point in the user-input, original, content-unmodified buffer.
				92	// codesRemain is the remaining size in codesSrc.
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	93	void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	94	const int xcoordinates, const int ycoordinates, const int *codesBuffer,
				95	const int codesBufferSize, const int flags, const int *codesSrc,
				96	const int codesRemain, const int currentDepth, int codesDest, Correction correction,
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	97	WordsPriorityQueuePool *queuePool) {
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	98
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	99	if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
				100	for (int i = 0; i < codesRemain; ++i) {
				101	if (isDigraph(codesSrc, i, codesRemain)) {
				102	// Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	103
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	104	// Copy the word up to the first char of the digraph, then continue processing
				105	// on the remaining part of the word, skipping the second char of the digraph.
				106	// In our example, copy "pru" and continue running on "fen"
				107	// Make i the index of the second char of the digraph for simplicity. Forgetting
				108	// to do that results in an infinite recursion so take care!
				109	++i;
				110	memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
				111	getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
				112	codesBuffer, codesBufferSize, flags,
				113	codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	114	currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, correction,
				115	queuePool);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	116
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	117	// Copy the second char of the digraph in place, then continue processing on
				118	// the remaining part of the word.
				119	// In our example, after "pru" in the buffer copy the "e", and continue on "fen"
				120	memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
				121	BYTES_IN_ONE_CHAR);
				122	getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	123	codesBuffer, codesBufferSize, flags,
				124	codesSrc + i * MAX_PROXIMITY_CHARS, codesRemain - i, currentDepth + 1,
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	125	codesDest + i * MAX_PROXIMITY_CHARS, correction, queuePool);
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	126	return;
				127	}
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	128	}
				129	}
				130
				131	// If we come here, we hit the end of the word: let's check it against the dictionary.
				132	// In our example, we'll come here once for "prufen" and then once for "pruefen".
				133	// If the word contains several digraphs, we'll come it for the product of them.
				134	// eg. if the word is "ueberpruefen" we'll test, in order, against
				135	// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
				136	const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
				137	if (0 != remainingBytes)
				138	memcpy(codesDest, codesSrc, remainingBytes);
				139
				140	getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	141	(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, flags, correction,
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	142	queuePool);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	143	}
				144
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	145	int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
				146	WordsPriorityQueuePool queuePool, Correction correction, const int *xcoordinates,
				147	const int ycoordinates, const int codes, const int codesSize, const int flags,
				148	unsigned short outWords, int frequencies) {
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	149
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	150	Correction* masterCorrection = correction;
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	151	if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
				152	{ // Incrementally tune the word and try all possibilities
				153	int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
				154	getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	155	codesSize, flags, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	156	} else { // Normal processing
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	157	getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, flags,
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	158	masterCorrection, queuePool);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	159	}
				160
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	161	PROF_START(20);
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	162	const int suggestedWordsCount =
				163	queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	164
				165	if (DEBUG_DICT) {
				166	LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard	980d6b6	2011-06-30 17:02:23 +0900	[diff] [blame]	167	/// Print the returned words
				168	for (int j = 0; j < suggestedWordsCount; ++j) {
Doug Kwan	ce9efbf	2011-07-07 22:53:50 -0700	[diff] [blame]	169	#ifdef FLAG_DBG
satok	16379df	2011-12-12 20:53:22 +0900	[diff] [blame]	170	short unsigned int* w = outWords + j * MAX_WORD_LENGTH;
Jean Chalard	980d6b6	2011-06-30 17:02:23 +0900	[diff] [blame]	171	char s[MAX_WORD_LENGTH];
				172	for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
satok	16379df	2011-12-12 20:53:22 +0900	[diff] [blame]	173	LOGI("%s %i", s, frequencies[j]);
satok	787945b	2011-07-14 08:32:57 +0900	[diff] [blame]	174	#endif
Jean Chalard	980d6b6	2011-06-30 17:02:23 +0900	[diff] [blame]	175	}
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	176	}
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	177	PROF_END(20);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	178	PROF_CLOSE;
				179	return suggestedWordsCount;
				180	}
				181
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	182	void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	183	const int xcoordinates, const int ycoordinates, const int *codes,
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	184	const int inputLength, const int flags, Correction *correction,
				185	WordsPriorityQueuePool *queuePool) {
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	186
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	187	PROF_OPEN;
				188	PROF_START(0);
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	189	// Note: This line is intentionally left blank
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	190	PROF_END(0);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	191
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	192	PROF_START(1);
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	193	const bool useFullEditDistance = USE_FULL_EDIT_DISTANCE & flags;
				194	getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance,
				195	inputLength, correction, queuePool);
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	196	PROF_END(1);
				197
				198	PROF_START(2);
satok	10266c0	2011-08-19 22:05:59 +0900	[diff] [blame]	199	// Note: This line is intentionally left blank
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	200	PROF_END(2);
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	201
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	202	PROF_START(3);
satok	10266c0	2011-08-19 22:05:59 +0900	[diff] [blame]	203	// Note: This line is intentionally left blank
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	204	PROF_END(3);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	205
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	206	PROF_START(4);
satok	10266c0	2011-08-19 22:05:59 +0900	[diff] [blame]	207	// Note: This line is intentionally left blank
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	208	PROF_END(4);
satok	a3d78f6	2010-12-09 22:08:33 +0900	[diff] [blame]	209
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	210	PROF_START(5);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	211	// Suggestions with missing space
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	212	if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	213	&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
				214	for (int i = 1; i < inputLength; ++i) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	215	if (DEBUG_DICT) {
				216	LOGI("--- Suggest missing space characters %d", i);
				217	}
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	218	getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
				219	useFullEditDistance, inputLength, i, correction, queuePool);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	220	}
				221	}
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	222	PROF_END(5);
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	223
				224	PROF_START(6);
Jean Chalard	e93b1f22	2011-06-01 17:12:25 +0900	[diff] [blame]	225	if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	226	// The first and last "mistyped spaces" are taken care of by excessive character handling
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	227	for (int i = 1; i < inputLength - 1; ++i) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	228	if (DEBUG_DICT) {
				229	LOGI("--- Suggest words with proximity space %d", i);
				230	}
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	231	const int x = xcoordinates[i];
				232	const int y = ycoordinates[i];
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	233	if (DEBUG_PROXIMITY_INFO) {
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	234	LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
				235	i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	236	}
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	237	if (proximityInfo->hasSpaceProximity(x, y)) {
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	238	getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
				239	useFullEditDistance, inputLength, i, correction, queuePool);
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	240	}
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	241	}
				242	}
				243	PROF_END(6);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	244	}
				245
Yusuke Nojima	258bfe6	2011-09-28 12:59:43 +0900	[diff] [blame]	246	void UnigramDictionary::initSuggestions(ProximityInfo proximityInfo, const int xCoordinates,
satok	1a6da63	2011-12-16 23:15:06 +0900	[diff] [blame]	247	const int yCoordinates, const int codes, const int inputLength,
				248	WordsPriorityQueue queue, Correction correction) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	249	if (DEBUG_DICT) {
				250	LOGI("initSuggest");
				251	}
satok	1a6da63	2011-12-16 23:15:06 +0900	[diff] [blame]	252	proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates);
				253	if (queue) {
				254	queue->clear();
				255	}
				256	const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
				257	correction->initCorrection(proximityInfo, inputLength, maxDepth);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	258	}
				259
// File-local character constants.
static const char SPACE = ' ';
static const char QUOTE = '\'';
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	262
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	263	void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo,
				264	const int xcoordinates, const int ycoordinates, const int *codes,
				265	const bool useFullEditDistance, const int inputLength, Correction *correction,
				266	WordsPriorityQueuePool *queuePool) {
				267	WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
satok	1a6da63	2011-12-16 23:15:06 +0900	[diff] [blame]	268	initSuggestions(
				269	proximityInfo, xcoordinates, ycoordinates, codes, inputLength, masterQueue, correction);
				270	getSuggestionCandidates(useFullEditDistance, inputLength, correction, masterQueue,
				271	true /* doAutoCompletion */, DEFAULT_MAX_ERRORS);
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	272	}
				273
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	274	void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
satok	1a6da63	2011-12-16 23:15:06 +0900	[diff] [blame]	275	const int inputLength, Correction correction, WordsPriorityQueue queue,
				276	const bool doAutoCompletion, const int maxErrors) {
satok	10266c0	2011-08-19 22:05:59 +0900	[diff] [blame]	277	// TODO: Remove setCorrectionParams
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	278	correction->setCorrectionParams(0, 0, 0,
satok	d03317c	2011-12-14 21:38:11 +0900	[diff] [blame]	279	-1 /* spaceProximityPos /, -1 / missingSpacePos */, useFullEditDistance,
satok	1a6da63	2011-12-16 23:15:06 +0900	[diff] [blame]	280	doAutoCompletion, maxErrors);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	281	int rootPosition = ROOT_POS;
Jean Chalard	980d6b6	2011-06-30 17:02:23 +0900	[diff] [blame]	282	// Get the number of children of root, then increment the position
Jean Chalard	293ece0	2011-06-16 20:55:16 +0900	[diff] [blame]	283	int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	284	int outputIndex = 0;
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	285
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	286	correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0));
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	287
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	288	// Depth first search
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	289	while (outputIndex >= 0) {
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	290	if (correction->initProcessState(outputIndex)) {
				291	int siblingPos = correction->getTreeSiblingPos(outputIndex);
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	292	int firstChildPos;
satok	0f6c8e8	2011-08-03 02:19:44 +0900	[diff] [blame]	293
satok	4e4e74e	2011-08-03 23:27:32 +0900	[diff] [blame]	294	const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	295	correction, &childCount, &firstChildPos, &siblingPos, queue);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	296	// Update next sibling pos
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	297	correction->setTreeSiblingPos(outputIndex, siblingPos);
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	298
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	299	if (needsToTraverseChildrenNodes) {
				300	// Goes to child node
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	301	outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos);
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	302	}
				303	} else {
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	304	// Goes to parent sibling node
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	305	outputIndex = correction->getTreeParentIndex(outputIndex);
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	306	}
				307	}
				308	}
				309
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	310	void UnigramDictionary::getMissingSpaceWords(ProximityInfo proximityInfo, const int xcoordinates,
				311	const int ycoordinates, const int codes, const bool useFullEditDistance,
				312	const int inputLength, const int missingSpacePos, Correction *correction,
				313	WordsPriorityQueuePool* queuePool) {
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	314	getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
				315	useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */,
				316	correction, queuePool);
satok	b2e5e59	2011-04-26 14:50:54 +0900	[diff] [blame]	317	}
				318
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	319	void UnigramDictionary::getMistypedSpaceWords(ProximityInfo proximityInfo, const int xcoordinates,
				320	const int ycoordinates, const int codes, const bool useFullEditDistance,
				321	const int inputLength, const int spaceProximityPos, Correction *correction,
				322	WordsPriorityQueuePool* queuePool) {
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	323	getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
				324	useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos,
				325	correction, queuePool);
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	326	}
satok	a3d78f6	2010-12-09 22:08:33 +0900	[diff] [blame]	327
Jean Chalard	cf9dbbd	2011-12-26 15:16:59 +0900	[diff] [blame^]	328	inline void UnigramDictionary::onTerminal(const int freq,
				329	const TerminalAttributes& terminalAttributes, Correction *correction,
				330	WordsPriorityQueue *queue) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	331	int wordLength;
				332	unsigned short* wordPointer;
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	333	const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
satok	4e4e74e	2011-08-03 23:27:32 +0900	[diff] [blame]	334	if (finalFreq >= 0) {
Jean Chalard	cf9dbbd	2011-12-26 15:16:59 +0900	[diff] [blame^]	335	if (!terminalAttributes.isShortcutOnly()) {
				336	addWord(wordPointer, wordLength, finalFreq, queue);
				337	}
				338	TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator();
				339	while (iterator.hasNextShortcutTarget()) {
				340	// TODO: add the shortcut to the list of suggestions using the
				341	// iterator.getNextShortcutTarget(int, uint16_t*) method
				342	}
Jean Chalard	ca5ef28	2011-06-17 15:36:26 +0900	[diff] [blame]	343	}
				344	}
				345
satok	744dab6	2011-12-15 22:29:05 +0900	[diff] [blame]	346	void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
				347	const int xcoordinates, const int ycoordinates, const int *codes,
				348	const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
				349	const int spaceProximityPos, Correction correction, WordsPriorityQueuePool queuePool) {
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	350	WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
				351
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	352	if (DEBUG_DICT) {
				353	int inputCount = 0;
				354	if (spaceProximityPos >= 0) ++inputCount;
				355	if (missingSpacePos >= 0) ++inputCount;
				356	assert(inputCount <= 1);
				357	}
				358	const bool isSpaceProximity = spaceProximityPos >= 0;
				359	const int firstWordStartPos = 0;
				360	const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
				361	const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
				362	const int secondWordLength = isSpaceProximity
				363	? (inputLength - spaceProximityPos - 1)
				364	: (inputLength - missingSpacePos);
				365
				366	if (inputLength >= MAX_WORD_LENGTH) return;
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	367	if (0 >= firstWordLength \|\| 0 >= secondWordLength \|\| firstWordStartPos >= secondWordStartPos
				368	\|\| firstWordStartPos < 0 \|\| secondWordStartPos + secondWordLength > inputLength)
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	369	return;
				370
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	371	const int newWordLength = firstWordLength + secondWordLength + 1;
satok	1a6da63	2011-12-16 23:15:06 +0900	[diff] [blame]	372
				373
				374	// Space proximity preparation
				375	//WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
				376	//initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue,
				377	//correction);
				378	//getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false,
				379	//MAX_ERRORS_FOR_TWO_WORDS);
				380
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	381	// Allocating variable length array on stack
				382	unsigned short word[newWordLength];
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	383	const int firstFreq = getMostFrequentWordLike(
				384	firstWordStartPos, firstWordLength, proximityInfo, mWord);
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	385	if (DEBUG_DICT) {
				386	LOGI("First freq: %d", firstFreq);
				387	}
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	388	if (firstFreq <= 0) return;
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	389
				390	for (int i = 0; i < firstWordLength; ++i) {
				391	word[i] = mWord[i];
				392	}
				393
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	394	const int secondFreq = getMostFrequentWordLike(
				395	secondWordStartPos, secondWordLength, proximityInfo, mWord);
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	396	if (DEBUG_DICT) {
				397	LOGI("Second freq: %d", secondFreq);
				398	}
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	399	if (secondFreq <= 0) return;
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	400
				401	word[firstWordLength] = SPACE;
				402	for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
				403	word[i] = mWord[i - firstWordLength - 1];
				404	}
				405
satok	1a6da63	2011-12-16 23:15:06 +0900	[diff] [blame]	406	// TODO: Remove initSuggestions and correction->setCorrectionParams
				407	initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength,
				408	0 /* do not clear queue */, correction);
				409
				410	correction->setCorrectionParams(-1 /* skipPos /, -1 / excessivePos */,
				411	-1 /* transposedPos */, spaceProximityPos, missingSpacePos,
				412	useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	413	const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	414	if (DEBUG_DICT) {
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	415	LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	416	}
satok	a7e5a5a	2011-12-15 16:49:12 +0900	[diff] [blame]	417	addWord(word, newWordLength, pairFreq, masterQueue);
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	418	return;
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	419	}
				420
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	421	// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
				422	// interface.
				423	inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	424	const int inputLength, ProximityInfo proximityInfo, unsigned short word) {
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	425	uint16_t inWord[inputLength];
				426
				427	for (int i = 0; i < inputLength; ++i) {
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	428	inWord[i] = (uint16_t)proximityInfo->getPrimaryCharAt(startInputIndex + i);
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	429	}
				430	return getMostFrequentWordLikeInner(inWord, inputLength, word);
				431	}
				432
				433	// This function will take the position of a character array within a CharGroup,
				434	// and check it actually like-matches the word in inWord starting at startInputIndex,
				435	// that is, it matches it with case and accents squashed.
				436	// The function returns true if there was a full match, false otherwise.
				437	// The function will copy on-the-fly the characters in the CharGroup to outNewWord.
				438	// It will also place the end position of the array in outPos; in outInputIndex,
				439	// it will place the index of the first char AFTER the match if there was a match,
				440	// and the initial position if there was not. It makes sense because if there was
				441	// a match we want to continue searching, but if there was not, we want to go to
				442	// the next CharGroup.
				443	// In and out parameters may point to the same location. This function takes care
				444	// not to use any input parameters after it wrote into its outputs.
				445	static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
				446	const uint8_t* const root, const int startPos,
				447	const uint16_t* const inWord, const int startInputIndex,
				448	int32_t* outNewWord, int* outInputIndex, int* outPos) {
				449	const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
				450	int pos = startPos;
				451	int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
Tadashi G. Takaoka	6e3cb27	2011-11-11 14:26:13 +0900	[diff] [blame]	452	int32_t baseChar = toBaseLowerCase(character);
				453	const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	454
				455	if (baseChar != wChar) {
				456	*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
				457	*outInputIndex = startInputIndex;
				458	return false;
				459	}
				460	int inputIndex = startInputIndex;
				461	outNewWord[inputIndex] = character;
				462	if (hasMultipleChars) {
				463	character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
				464	while (NOT_A_CHARACTER != character) {
Tadashi G. Takaoka	6e3cb27	2011-11-11 14:26:13 +0900	[diff] [blame]	465	baseChar = toBaseLowerCase(character);
				466	if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	467	*outPos = BinaryFormat::skipOtherCharacters(root, pos);
				468	*outInputIndex = startInputIndex;
				469	return false;
				470	}
				471	outNewWord[inputIndex] = character;
				472	character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
				473	}
				474	}
				475	*outInputIndex = inputIndex + 1;
				476	*outPos = pos;
				477	return true;
				478	}
				479
				480	// This function is invoked when a word like the word searched for is found.
				481	// It will compare the frequency to the max frequency, and if greater, will
				482	// copy the word into the output buffer. In output value maxFreq, it will
				483	// write the new maximum frequency if it changed.
				484	static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,
				485	short unsigned int* outWord, int* maxFreq) {
				486	if (freq > *maxFreq) {
				487	for (int q = 0; q < length; ++q)
				488	outWord[q] = newWord[q];
				489	outWord[length] = 0;
				490	*maxFreq = freq;
				491	}
				492	}
				493
				494	// Will find the highest frequency of the words like the one passed as an argument,
				495	// that is, everything that only differs by case/accents.
				496	int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,
				497	const int length, short unsigned int* outWord) {
				498	int32_t newWord[MAX_WORD_LENGTH_INTERNAL];
				499	int depth = 0;
				500	int maxFreq = -1;
				501	const uint8_t* const root = DICT_ROOT;
				502
				503	mStackChildCount[0] = root[0];
				504	mStackInputIndex[0] = 0;
				505	mStackSiblingPos[0] = 1;
				506	while (depth >= 0) {
				507	const int charGroupCount = mStackChildCount[depth];
				508	int pos = mStackSiblingPos[depth];
				509	for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
				510	int inputIndex = mStackInputIndex[depth];
				511	const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
				512	// Test whether all chars in this group match with the word we are searching for. If so,
				513	// we want to traverse its children (or if the length match, evaluate its frequency).
				514	// Note that this function will output the position regardless, but will only write
				515	// into inputIndex if there is a match.
				516	const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
				517	inputIndex, newWord, &inputIndex, &pos);
				518	if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
				519	const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
				520	onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
				521	}
				522	pos = BinaryFormat::skipFrequency(flags, pos);
				523	const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
				524	const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
				525	// If we had a match and the word has children, we want to traverse them. We don't have
				526	// to traverse words longer than the one we are searching for, since they will not match
				527	// anyway, so don't traverse unless inputIndex < length.
				528	if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {
				529	// Save position for this depth, to get back to this once children are done
				530	mStackChildCount[depth] = charGroupIndex;
				531	mStackSiblingPos[depth] = siblingPos;
				532	// Prepare stack values for next depth
				533	++depth;
				534	int childrenPos = childrenNodePos;
				535	mStackChildCount[depth] =
				536	BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
				537	mStackSiblingPos[depth] = childrenPos;
				538	mStackInputIndex[depth] = inputIndex;
				539	pos = childrenPos;
				540	// Go to the next depth level.
				541	++depth;
				542	break;
				543	} else {
				544	// No match, or no children, or word too long to ever match: go the next sibling.
				545	pos = siblingPos;
				546	}
				547	}
				548	--depth;
				549	}
				550	return maxFreq;
				551	}
				552
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	553	bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
Jean Chalard	6a0e964	2011-07-25 18:17:11 +0900	[diff] [blame]	554	return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	555	}
				556
				557	// TODO: remove this function.
				558	int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
				559	int length) const {
				560	return -1;
				561	}
				562
				563	// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
				564	// If the return value is false, then the caller should read in the output "nextSiblingPosition"
				565	// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
				566	// It is worthy to note that when false is returned, the output values other than
				567	// nextSiblingPosition are undefined.
				568	// If the return value is true, then the caller must proceed to traverse the children of this
				569	// node. processCurrentNode will output the information about the children: their count in
				570	// newCount, their position in newChildrenPosition, the traverseAllNodes flag in
				571	// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
				572	// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
				573	// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
				574	// there aren't any more nodes at this level, it merely returns the address of the first byte after
				575	// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
				576	// given level, as output into newCount when traversing this level's parent.
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	577	inline bool UnigramDictionary::processCurrentNode(const int initialPos,
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	578	Correction correction, int newCount,
satok	1147c7b	2011-12-14 15:04:58 +0900	[diff] [blame]	579	int newChildrenPosition, int nextSiblingPosition, WordsPriorityQueue *queue) {
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	580	if (DEBUG_DICT) {
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	581	correction->checkState();
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	582	}
Jean Chalard	0584f02	2011-06-30 19:23:16 +0900	[diff] [blame]	583	int pos = initialPos;
Jean Chalard	0584f02	2011-06-30 19:23:16 +0900	[diff] [blame]	584
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	585	// Flags contain the following information:
				586	// - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
				587	// - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
				588	// is on the specified number of bytes.
				589	// - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
				590	// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
				591	// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
				592	// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
				593	const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
				594	const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	595	const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags));
				596
				597	bool needsToInvokeOnTerminal = false;
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	598
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	599	// This gets only ONE character from the stream. Next there will be:
				600	// if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
				601	// else if FLAG_IS_TERMINAL: the frequency
				602	// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
				603	// Note that you can't have a node that both is not a terminal and has no children.
				604	int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
				605	assert(NOT_A_CHARACTER != c);
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	606
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	607	// We are going to loop through each character and make it look like it's a different
				608	// node each time. To do that, we will process characters in this node in order until
				609	// we find the character terminator. This is signalled by getCharCode* returning
				610	// NOT_A_CHARACTER.
				611	// As a special case, if there is only one character in this node, we must not read the
				612	// next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.
				613	// This way, each loop run will look like a "virtual node".
				614	do {
				615	// We prefetch the next char. If 'c' is the last char of this node, we will have
				616	// NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node
				617	// should behave as a terminal or not and whether we have children.
				618	const int32_t nextc = hasMultipleChars
				619	? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;
				620	const bool isLastChar = (NOT_A_CHARACTER == nextc);
				621	// If there are more chars in this nodes, then this virtual node is not a terminal.
				622	// If we are on the last char, this virtual node is a terminal if this node is.
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	623	const bool isTerminal = isLastChar && isTerminalNode;
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	624
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	625	Correction::CorrectionType stateType = correction->processCharAndCalcState(
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	626	c, isTerminal);
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	627	if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL
				628	\|\| stateType == Correction::ON_TERMINAL) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	629	needsToInvokeOnTerminal = true;
satok	d03317c	2011-12-14 21:38:11 +0900	[diff] [blame]	630	} else if (stateType == Correction::UNRELATED \|\| correction->needsToPrune()) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	631	// We found that this is an unrelated character, so we should give up traversing
				632	// this node and its children entirely.
				633	// However we may not be on the last virtual node yet so we skip the remaining
				634	// characters in this node, the frequency if it's there, read the next sibling
				635	// position to output it, then return false.
				636	// We don't have to output other values because we return false, as in
				637	// "don't traverse children".
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	638	if (!isLastChar) {
				639	pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
				640	}
				641	pos = BinaryFormat::skipFrequency(flags, pos);
				642	*nextSiblingPosition =
				643	BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
				644	return false;
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	645	}
				646
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	647	// Prepare for the next character. Promote the prefetched char to current char - the loop
				648	// will take care of prefetching the next. If we finally found our last char, nextc will
				649	// contain NOT_A_CHARACTER.
				650	c = nextc;
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	651	} while (NOT_A_CHARACTER != c);
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	652
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	653	if (isTerminalNode) {
				654	if (needsToInvokeOnTerminal) {
				655	// The frequency should be here, because we come here only if this is actually
				656	// a terminal node, and we are on its last char.
				657	const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
Jean Chalard	cf9dbbd	2011-12-26 15:16:59 +0900	[diff] [blame^]	658	TerminalAttributes terminalAttributes(DICT_ROOT, flags,
				659	BinaryFormat::skipFrequency(flags, pos));
				660	onTerminal(freq, terminalAttributes, correction, queue);
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	661	}
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	662
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	663	// If there are more chars in this node, then this virtual node has children.
				664	// If we are on the last char, this virtual node has children if this node has.
				665	const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
				666
				667	// This character matched the typed character (enough to traverse the node at least)
				668	// so we just evaluated it. Now we should evaluate this virtual node's children - that
				669	// is, if it has any. If it has no children, we're done here - so we skip the end of
				670	// the node, output the siblings position, and return false "don't traverse children".
				671	// Note that !hasChildren implies isLastChar, so we know we don't have to skip any
				672	// remaining char in this group for there can't be any.
				673	if (!hasChildren) {
				674	pos = BinaryFormat::skipFrequency(flags, pos);
				675	*nextSiblingPosition =
				676	BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
				677	return false;
				678	}
				679
				680	// Optimization: Prune out words that are too long compared to how much was typed.
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	681	if (correction->needsToPrune()) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	682	pos = BinaryFormat::skipFrequency(flags, pos);
				683	*nextSiblingPosition =
				684	BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
satok	10266c0	2011-08-19 22:05:59 +0900	[diff] [blame]	685	if (DEBUG_DICT_FULL) {
				686	LOGI("Traversing was pruned.");
				687	}
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	688	return false;
				689	}
				690	}
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	691
				692	// Now we finished processing this node, and we want to traverse children. If there are no
				693	// children, we can't come here.
				694	assert(BinaryFormat::hasChildrenInFlags(flags));
				695
				696	// If this node was a terminal it still has the frequency under the pointer (it may have been
				697	// read, but not skipped - see readFrequencyWithoutMovingPointer).
				698	// Next come the children position, then possibly attributes (attributes are bigrams only for
				699	// now, maybe something related to shortcuts in the future).
				700	// Once this is read, we still need to output the number of nodes in the immediate children of
				701	// this node, so we read and output it before returning true, as in "please traverse children".
				702	pos = BinaryFormat::skipFrequency(flags, pos);
				703	int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
				704	*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
				705	*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
				706	*newChildrenPosition = childrenPos;
				707	return true;
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	708	}
				709
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	710	} // namespace latinime