Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

blob: bbfaea4543c5ba98bf2d5c7a49d8438c8c6dc39d [file] [log] [blame]

satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	1	/*
				2	**
				3	** Copyright 2010, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
satok	48e432c	2010-12-06 17:38:58 +0900	[diff] [blame]	18	#include <assert.h>
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	19	#include <string.h>
				20
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	21	#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	22
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	23	#include "char_utils.h"
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	24	#include "dictionary.h"
				25	#include "unigram_dictionary.h"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	26
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	27	#include "binary_format.h"
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	28
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	29	namespace latinime {
				30
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	31	const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
				32	{ { 'a', 'e' },
				33	{ 'o', 'e' },
				34	{ 'u', 'e' } };
				35
Jean Chalard	293ece0	2011-06-16 20:55:16 +0900	[diff] [blame]	36	// TODO: check the header
				37	UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	38	int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame]	39	const bool isLatestDictVersion)
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	40	: DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
Jean Chalard	293ece0	2011-06-16 20:55:16 +0900	[diff] [blame]	41	MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	42	MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
				43	TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	44	// TODO : remove this variable.
				45	ROOT_POS(0),
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	46	BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	47	MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	48	if (DEBUG_DICT) {
				49	LOGI("UnigramDictionary - constructor");
				50	}
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	51	mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	52	}
				53
satok	2df3060	2011-07-15 13:49:00 +0900	[diff] [blame]	54	UnigramDictionary::~UnigramDictionary() {
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	55	delete mCorrection;
satok	2df3060	2011-07-15 13:49:00 +0900	[diff] [blame]	56	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	57
// Returns the size in bytes of a code buffer holding codesSize input characters, where
// each character occupies MAX_PROXIMITY_CHARS elements of the type codes points to.
// codes is only used for its element type; it is never dereferenced.
static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
        const int MAX_PROXIMITY_CHARS) {
    return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
}
				62
				63	bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
				64
				65	// There can't be a digraph if we don't have at least 2 characters to examine
				66	if (i + 2 > codesSize) return false;
				67
				68	// Search for the first char of some digraph
				69	int lastDigraphIndex = -1;
				70	const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
				71	for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
				72	lastDigraphIndex >= 0; --lastDigraphIndex) {
				73	if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
				74	}
				75	// No match: return early
				76	if (lastDigraphIndex < 0) return false;
				77
				78	// It's an interesting digraph if the second char matches too.
				79	return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
				80	}
				81
				82	// Mostly the same arguments as the non-recursive version, except:
				83	// codes is the original value. It points to the start of the work buffer, and gets passed as is.
				84	// codesSize is the size of the user input (thus, it is the size of codesSrc).
				85	// codesDest is the current point in the work buffer.
				86	// codesSrc is the current point in the user-input, original, content-unmodified buffer.
				87	// codesRemain is the remaining size in codesSrc.
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	88	void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	89	const int xcoordinates, const int ycoordinates, const int *codesBuffer,
				90	const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok	3c4bb77	2011-03-04 22:50:19 -0800	[diff] [blame]	91	const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	92
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	93	if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
				94	for (int i = 0; i < codesRemain; ++i) {
				95	if (isDigraph(codesSrc, i, codesRemain)) {
				96	// Found a digraph. We will try both spellings. eg. the word is "pruefen"
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	97
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	98	// Copy the word up to the first char of the digraph, then continue processing
				99	// on the remaining part of the word, skipping the second char of the digraph.
				100	// In our example, copy "pru" and continue running on "fen"
				101	// Make i the index of the second char of the digraph for simplicity. Forgetting
				102	// to do that results in an infinite recursion so take care!
				103	++i;
				104	memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
				105	getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
				106	codesBuffer, codesBufferSize, flags,
				107	codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
				108	currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
				109	frequencies);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	110
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	111	// Copy the second char of the digraph in place, then continue processing on
				112	// the remaining part of the word.
				113	// In our example, after "pru" in the buffer copy the "e", and continue on "fen"
				114	memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
				115	BYTES_IN_ONE_CHAR);
				116	getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
				117	codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
				118	codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
				119	outWords, frequencies);
				120	return;
				121	}
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	122	}
				123	}
				124
				125	// If we come here, we hit the end of the word: let's check it against the dictionary.
				126	// In our example, we'll come here once for "prufen" and then once for "pruefen".
				127	// If the word contains several digraphs, we'll come it for the product of them.
				128	// eg. if the word is "ueberpruefen" we'll test, in order, against
				129	// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
				130	const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
				131	if (0 != remainingBytes)
				132	memcpy(codesDest, codesSrc, remainingBytes);
				133
				134	getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
				135	(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
				136	}
				137
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	138	int UnigramDictionary::getSuggestions(ProximityInfo proximityInfo, const int xcoordinates,
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	139	const int ycoordinates, const int codes, const int codesSize, const int flags,
				140	unsigned short outWords, int frequencies) {
				141
				142	if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
				143	{ // Incrementally tune the word and try all possibilities
				144	int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
				145	getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
Jean Chalard	a787dba	2011-03-04 12:17:48 +0900	[diff] [blame]	146	codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	147	} else { // Normal processing
				148	getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
				149	outWords, frequencies);
				150	}
				151
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	152	PROF_START(20);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	153	// Get the word count
				154	int suggestedWordsCount = 0;
				155	while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
				156	suggestedWordsCount++;
				157	}
				158
				159	if (DEBUG_DICT) {
				160	LOGI("Returning %d words", suggestedWordsCount);
Jean Chalard	980d6b6	2011-06-30 17:02:23 +0900	[diff] [blame]	161	/// Print the returned words
				162	for (int j = 0; j < suggestedWordsCount; ++j) {
Doug Kwan	ce9efbf	2011-07-07 22:53:50 -0700	[diff] [blame]	163	#ifdef FLAG_DBG
Jean Chalard	980d6b6	2011-06-30 17:02:23 +0900	[diff] [blame]	164	short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
				165	char s[MAX_WORD_LENGTH];
				166	for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
				167	LOGI("%s %i", s, mFrequencies[j]);
satok	787945b	2011-07-14 08:32:57 +0900	[diff] [blame]	168	#endif
Jean Chalard	980d6b6	2011-06-30 17:02:23 +0900	[diff] [blame]	169	}
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	170	}
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	171	PROF_END(20);
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	172	PROF_CLOSE;
				173	return suggestedWordsCount;
				174	}
				175
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	176	void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	177	const int xcoordinates, const int ycoordinates, const int *codes, const int codesSize,
				178	unsigned short outWords, int frequencies) {
				179
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	180	PROF_OPEN;
				181	PROF_START(0);
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	182	initSuggestions(
				183	proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies);
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	184	if (DEBUG_DICT) assert(codesSize == mInputLength);
				185
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	186	const int maxDepth = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	187	mCorrection->initCorrection(mProximityInfo, mInputLength, maxDepth);
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	188	PROF_END(0);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	189
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	190	PROF_START(1);
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	191	getSuggestionCandidates(-1, -1, -1);
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	192	PROF_END(1);
				193
				194	PROF_START(2);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	195	// Suggestion with missing character
				196	if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	197	for (int i = 0; i < codesSize; ++i) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	198	if (DEBUG_DICT) {
				199	LOGI("--- Suggest missing characters %d", i);
				200	}
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	201	getSuggestionCandidates(i, -1, -1);
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	202	}
				203	}
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	204	PROF_END(2);
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	205
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	206	PROF_START(3);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	207	// Suggestion with excessive character
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	208	if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
				209	&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	210	for (int i = 0; i < codesSize; ++i) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	211	if (DEBUG_DICT) {
				212	LOGI("--- Suggest excessive characters %d", i);
				213	}
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	214	getSuggestionCandidates(-1, i, -1);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	215	}
				216	}
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	217	PROF_END(3);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	218
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	219	PROF_START(4);
satok	a3d78f6	2010-12-09 22:08:33 +0900	[diff] [blame]	220	// Suggestion with transposed characters
				221	// Only suggest words that length is mInputLength
				222	if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
				223	for (int i = 0; i < codesSize; ++i) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	224	if (DEBUG_DICT) {
				225	LOGI("--- Suggest transposed characters %d", i);
				226	}
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	227	getSuggestionCandidates(-1, -1, i);
satok	a3d78f6	2010-12-09 22:08:33 +0900	[diff] [blame]	228	}
				229	}
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	230	PROF_END(4);
satok	a3d78f6	2010-12-09 22:08:33 +0900	[diff] [blame]	231
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	232	PROF_START(5);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	233	// Suggestions with missing space
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	234	if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
				235	&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	236	for (int i = 1; i < codesSize; ++i) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	237	if (DEBUG_DICT) {
				238	LOGI("--- Suggest missing space characters %d", i);
				239	}
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	240	getMissingSpaceWords(mInputLength, i, mCorrection);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	241	}
				242	}
satok	61e2f85	2011-01-05 14:13:07 +0900	[diff] [blame]	243	PROF_END(5);
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	244
				245	PROF_START(6);
Jean Chalard	e93b1f22	2011-06-01 17:12:25 +0900	[diff] [blame]	246	if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	247	// The first and last "mistyped spaces" are taken care of by excessive character handling
				248	for (int i = 1; i < codesSize - 1; ++i) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	249	if (DEBUG_DICT) {
				250	LOGI("--- Suggest words with proximity space %d", i);
				251	}
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	252	const int x = xcoordinates[i];
				253	const int y = ycoordinates[i];
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	254	if (DEBUG_PROXIMITY_INFO) {
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	255	LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
				256	i, x, y, proximityInfo->hasSpaceProximity(x, y));
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	257	}
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	258	if (proximityInfo->hasSpaceProximity(x, y)) {
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	259	getMistypedSpaceWords(mInputLength, i, mCorrection);
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	260	}
satok	817e517	2011-03-04 06:06:45 -0800	[diff] [blame]	261	}
				262	}
				263	PROF_END(6);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	264	}
				265
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	266	void UnigramDictionary::initSuggestions(ProximityInfo proximityInfo, const int xcoordinates,
				267	const int ycoordinates, const int codes, const int codesSize,
Jean Chalard	c2bbc6a	2011-02-25 17:56:53 +0900	[diff] [blame]	268	unsigned short outWords, int frequencies) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	269	if (DEBUG_DICT) {
				270	LOGI("initSuggest");
				271	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	272	mFrequencies = frequencies;
				273	mOutputChars = outWords;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	274	mInputLength = codesSize;
satok	1d7eaf8	2011-07-13 10:32:02 +0900	[diff] [blame]	275	proximityInfo->setInputParams(codes, codesSize);
				276	mProximityInfo = proximityInfo;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	277	}
				278
// Increments the counter for character c in nextLetters; characters that do not index
// into the nextLettersSize-element table are ignored.
static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) return;
    ++nextLetters[c];
}
				284
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	285	// TODO: We need to optimize addWord by using STL or something
Jean Chalard	ca5ef28	2011-06-17 15:36:26 +0900	[diff] [blame]	286	// TODO: This needs to take an const unsigned short* and not tinker with its contents
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	287	bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	288	word[length] = 0;
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	289	if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
Doug Kwan	ce9efbf	2011-07-07 22:53:50 -0700	[diff] [blame]	290	#ifdef FLAG_DBG
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	291	char s[length + 1];
				292	for (int i = 0; i <= length; i++) s[i] = word[i];
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	293	LOGI("Found word = %s, freq = %d", s, frequency);
satok	787945b	2011-07-14 08:32:57 +0900	[diff] [blame]	294	#endif
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	295	}
satok	f5cded1	2010-12-06 21:28:24 +0900	[diff] [blame]	296	if (length > MAX_WORD_LENGTH) {
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	297	if (DEBUG_DICT) {
				298	LOGI("Exceeded max word length.");
				299	}
satok	f5cded1	2010-12-06 21:28:24 +0900	[diff] [blame]	300	return false;
				301	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	302
				303	// Find the right insertion point
				304	int insertAt = 0;
				305	while (insertAt < MAX_WORDS) {
Jean Chalard	17e44a7	2011-06-16 22:51:11 +0900	[diff] [blame]	306	// TODO: How should we sort words with the same frequency?
				307	if (frequency > mFrequencies[insertAt]) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	308	break;
				309	}
				310	insertAt++;
				311	}
				312	if (insertAt < MAX_WORDS) {
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	313	if (DEBUG_DICT) {
Doug Kwan	ce9efbf	2011-07-07 22:53:50 -0700	[diff] [blame]	314	#ifdef FLAG_DBG
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	315	char s[length + 1];
				316	for (int i = 0; i <= length; i++) s[i] = word[i];
satok	b2e5e59	2011-04-26 14:50:54 +0900	[diff] [blame]	317	LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
satok	787945b	2011-07-14 08:32:57 +0900	[diff] [blame]	318	#endif
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	319	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	320	memmove((char) mFrequencies + (insertAt + 1) sizeof(mFrequencies[0]),
				321	(char) mFrequencies + insertAt sizeof(mFrequencies[0]),
				322	(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
				323	mFrequencies[insertAt] = frequency;
				324	memmove((char) mOutputChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
satok	715514d	2010-12-02 20:19:59 +0900	[diff] [blame]	325	(char) mOutputChars + insertAt MAX_WORD_LENGTH * sizeof(short),
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	326	(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok	715514d	2010-12-02 20:19:59 +0900	[diff] [blame]	327	unsigned short dest = mOutputChars + insertAt MAX_WORD_LENGTH;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	328	while (length--) {
				329	dest++ = word++;
				330	}
				331	*dest = 0; // NULL terminate
Ken Wakasa	de3070a	2011-03-19 09:16:42 +0900	[diff] [blame]	332	if (DEBUG_DICT) {
				333	LOGI("Added word at %d", insertAt);
				334	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	335	return true;
				336	}
				337	return false;
				338	}
				339
// Apostrophe, compared against dictionary and typed characters in needsToSkipCurrentNode().
static const char QUOTE = '\'';
// Separator written between the two halves in getSplitTwoWordsSuggestion().
static const char SPACE = ' ';
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	342
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	343	void UnigramDictionary::getSuggestionCandidates(const int skipPos,
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	344	const int excessivePos, const int transposedPos) {
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	345	if (DEBUG_DICT) {
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	346	assert(transposedPos + 1 < mInputLength);
				347	assert(excessivePos < mInputLength);
				348	assert(missingPos < mInputLength);
				349	}
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	350	mCorrection->setCorrectionParams(skipPos, excessivePos, transposedPos,
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	351	-1 /* spaceProximityPos /, -1 / missingSpacePos */);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	352	int rootPosition = ROOT_POS;
Jean Chalard	980d6b6	2011-06-30 17:02:23 +0900	[diff] [blame]	353	// Get the number of children of root, then increment the position
Jean Chalard	293ece0	2011-06-16 20:55:16 +0900	[diff] [blame]	354	int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	355	int outputIndex = 0;
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	356
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	357	mCorrection->initCorrectionState(rootPosition, childCount, (mInputLength <= 0));
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	358
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	359	// Depth first search
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	360	while (outputIndex >= 0) {
				361	if (mCorrection->initProcessState(outputIndex)) {
				362	int siblingPos = mCorrection->getTreeSiblingPos(outputIndex);
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	363	int firstChildPos;
satok	0f6c8e8	2011-08-03 02:19:44 +0900	[diff] [blame]	364
satok	4e4e74e	2011-08-03 23:27:32 +0900	[diff] [blame]	365	const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	366	mCorrection, &childCount, &firstChildPos, &siblingPos);
satok	662fe69	2010-12-08 17:05:39 +0900	[diff] [blame]	367	// Update next sibling pos
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	368	mCorrection->setTreeSiblingPos(outputIndex, siblingPos);
				369
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	370	if (needsToTraverseChildrenNodes) {
				371	// Goes to child node
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	372	outputIndex = mCorrection->goDownTree(outputIndex, childCount, firstChildPos);
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	373	}
				374	} else {
satok	cdbbea7	2010-12-08 16:04:16 +0900	[diff] [blame]	375	// Goes to parent sibling node
satok	208268d	2011-08-10 15:44:08 +0900	[diff] [blame]	376	outputIndex = mCorrection->getTreeParentIndex(outputIndex);
satok	d299792	2010-12-07 13:08:39 +0900	[diff] [blame]	377	}
				378	}
				379	}
				380
static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
// Multiplies *base by multiplier in place, capping the result at S_INT_MAX.
// A value already equal to S_INT_MAX is left untouched. A product smaller than the
// original value (e.g. multiplier <= 0 with a positive base) also yields S_INT_MAX.
inline static void multiplyIntCapped(const int multiplier, int *base) {
    const int temp = *base;
    if (temp == S_INT_MAX) return;

    // Branch if multiplier == 2 for the optimization
    if (multiplier == 2) {
        *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
        return;
    }

    // Compute the product in 64 bits: detecting overflow after an int multiply is
    // undefined behaviour and misses wrapped results that land above the old value.
    const int64_t product = static_cast<int64_t>(temp) * multiplier;
    *base = (product >= temp && product <= S_INT_MAX) ? static_cast<int>(product) : S_INT_MAX;
}
				394
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	395	void UnigramDictionary::getMissingSpaceWords(
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	396	const int inputLength, const int missingSpacePos, Correction *correction) {
				397	correction->setCorrectionParams(-1 /* skipPos /, -1 / excessivePos */,
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	398	-1 /* transposedPos /, -1 / spaceProximityPos */, missingSpacePos);
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	399	getSplitTwoWordsSuggestion(inputLength, correction);
satok	b2e5e59	2011-04-26 14:50:54 +0900	[diff] [blame]	400	}
				401
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	402	void UnigramDictionary::getMistypedSpaceWords(
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	403	const int inputLength, const int spaceProximityPos, Correction *correction) {
				404	correction->setCorrectionParams(-1 /* skipPos /, -1 / excessivePos */,
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	405	-1 /* transposedPos /, spaceProximityPos, -1 / missingSpacePos */);
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	406	getSplitTwoWordsSuggestion(inputLength, correction);
satok	54fe9e0	2010-12-13 14:42:35 +0900	[diff] [blame]	407	}
satok	a3d78f6	2010-12-09 22:08:33 +0900	[diff] [blame]	408
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	409	inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	410	const int inputIndex, const int skipPos, const int depth) {
satok	d24df43	2011-07-14 15:43:42 +0900	[diff] [blame]	411	const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex);
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	412	// Skip the ' or other letter and continue deeper
				413	return (c == QUOTE && userTypedChar != QUOTE) \|\| skipPos == depth;
				414	}
				415
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	416	inline void UnigramDictionary::onTerminal(const int freq, Correction *correction) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	417	int wordLength;
				418	unsigned short* wordPointer;
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	419	const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
satok	4e4e74e	2011-08-03 23:27:32 +0900	[diff] [blame]	420	if (finalFreq >= 0) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	421	addWord(wordPointer, wordLength, finalFreq);
Jean Chalard	ca5ef28	2011-06-17 15:36:26 +0900	[diff] [blame]	422	}
				423	}
				424
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	425	void UnigramDictionary::getSplitTwoWordsSuggestion(
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	426	const int inputLength, Correction* correction) {
				427	const int spaceProximityPos = correction->getSpaceProximityPos();
				428	const int missingSpacePos = correction->getMissingSpacePos();
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	429	if (DEBUG_DICT) {
				430	int inputCount = 0;
				431	if (spaceProximityPos >= 0) ++inputCount;
				432	if (missingSpacePos >= 0) ++inputCount;
				433	assert(inputCount <= 1);
				434	}
				435	const bool isSpaceProximity = spaceProximityPos >= 0;
				436	const int firstWordStartPos = 0;
				437	const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
				438	const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
				439	const int secondWordLength = isSpaceProximity
				440	? (inputLength - spaceProximityPos - 1)
				441	: (inputLength - missingSpacePos);
				442
				443	if (inputLength >= MAX_WORD_LENGTH) return;
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	444	if (0 >= firstWordLength \|\| 0 >= secondWordLength \|\| firstWordStartPos >= secondWordStartPos
				445	\|\| firstWordStartPos < 0 \|\| secondWordStartPos + secondWordLength > inputLength)
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	446	return;
				447
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	448	const int newWordLength = firstWordLength + secondWordLength + 1;
				449	// Allocating variable length array on stack
				450	unsigned short word[newWordLength];
				451	const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
				452	if (DEBUG_DICT) {
				453	LOGI("First freq: %d", firstFreq);
				454	}
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	455	if (firstFreq <= 0) return;
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	456
				457	for (int i = 0; i < firstWordLength; ++i) {
				458	word[i] = mWord[i];
				459	}
				460
				461	const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
				462	if (DEBUG_DICT) {
				463	LOGI("Second freq: %d", secondFreq);
				464	}
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	465	if (secondFreq <= 0) return;
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	466
				467	word[firstWordLength] = SPACE;
				468	for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
				469	word[i] = mWord[i - firstWordLength - 1];
				470	}
				471
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	472	const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq);
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	473	if (DEBUG_DICT) {
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	474	LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	475	}
				476	addWord(word, newWordLength, pairFreq);
satok	612c6e4	2011-08-01 19:35:27 +0900	[diff] [blame]	477	return;
Jean Chalard	e6715e3	2011-06-30 19:47:25 +0900	[diff] [blame]	478	}
				479
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	480	// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
				481	// interface.
				482	inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
				483	const int inputLength, unsigned short *word) {
				484	uint16_t inWord[inputLength];
				485
				486	for (int i = 0; i < inputLength; ++i) {
satok	d24df43	2011-07-14 15:43:42 +0900	[diff] [blame]	487	inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i);
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	488	}
				489	return getMostFrequentWordLikeInner(inWord, inputLength, word);
				490	}
				491
				492	// This function will take the position of a character array within a CharGroup,
				493	// and check it actually like-matches the word in inWord starting at startInputIndex,
				494	// that is, it matches it with case and accents squashed.
				495	// The function returns true if there was a full match, false otherwise.
				496	// The function will copy on-the-fly the characters in the CharGroup to outNewWord.
				497	// It will also place the end position of the array in outPos; in outInputIndex,
				498	// it will place the index of the first char AFTER the match if there was a match,
				499	// and the initial position if there was not. It makes sense because if there was
				500	// a match we want to continue searching, but if there was not, we want to go to
				501	// the next CharGroup.
				502	// In and out parameters may point to the same location. This function takes care
				503	// not to use any input parameters after it wrote into its outputs.
				504	static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
				505	const uint8_t* const root, const int startPos,
				506	const uint16_t* const inWord, const int startInputIndex,
				507	int32_t* outNewWord, int* outInputIndex, int* outPos) {
				508	const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
				509	int pos = startPos;
				510	int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
satok	d24df43	2011-07-14 15:43:42 +0900	[diff] [blame]	511	int32_t baseChar = Dictionary::toBaseLowerCase(character);
				512	const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]);
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	513
				514	if (baseChar != wChar) {
				515	*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
				516	*outInputIndex = startInputIndex;
				517	return false;
				518	}
				519	int inputIndex = startInputIndex;
				520	outNewWord[inputIndex] = character;
				521	if (hasMultipleChars) {
				522	character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
				523	while (NOT_A_CHARACTER != character) {
satok	d24df43	2011-07-14 15:43:42 +0900	[diff] [blame]	524	baseChar = Dictionary::toBaseLowerCase(character);
				525	if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	526	*outPos = BinaryFormat::skipOtherCharacters(root, pos);
				527	*outInputIndex = startInputIndex;
				528	return false;
				529	}
				530	outNewWord[inputIndex] = character;
				531	character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
				532	}
				533	}
				534	*outInputIndex = inputIndex + 1;
				535	*outPos = pos;
				536	return true;
				537	}
				538
				539	// This function is invoked when a word like the word searched for is found.
				540	// It will compare the frequency to the max frequency, and if greater, will
				541	// copy the word into the output buffer. In output value maxFreq, it will
				542	// write the new maximum frequency if it changed.
				543	static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,
				544	short unsigned int* outWord, int* maxFreq) {
				545	if (freq > *maxFreq) {
				546	for (int q = 0; q < length; ++q)
				547	outWord[q] = newWord[q];
				548	outWord[length] = 0;
				549	*maxFreq = freq;
				550	}
				551	}
				552
				553	// Will find the highest frequency of the words like the one passed as an argument,
				554	// that is, everything that only differs by case/accents.
				555	int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,
				556	const int length, short unsigned int* outWord) {
				557	int32_t newWord[MAX_WORD_LENGTH_INTERNAL];
				558	int depth = 0;
				559	int maxFreq = -1;
				560	const uint8_t* const root = DICT_ROOT;
				561
				562	mStackChildCount[0] = root[0];
				563	mStackInputIndex[0] = 0;
				564	mStackSiblingPos[0] = 1;
				565	while (depth >= 0) {
				566	const int charGroupCount = mStackChildCount[depth];
				567	int pos = mStackSiblingPos[depth];
				568	for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
				569	int inputIndex = mStackInputIndex[depth];
				570	const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
				571	// Test whether all chars in this group match with the word we are searching for. If so,
				572	// we want to traverse its children (or if the length match, evaluate its frequency).
				573	// Note that this function will output the position regardless, but will only write
				574	// into inputIndex if there is a match.
				575	const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
				576	inputIndex, newWord, &inputIndex, &pos);
				577	if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
				578	const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
				579	onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
				580	}
				581	pos = BinaryFormat::skipFrequency(flags, pos);
				582	const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
				583	const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
				584	// If we had a match and the word has children, we want to traverse them. We don't have
				585	// to traverse words longer than the one we are searching for, since they will not match
				586	// anyway, so don't traverse unless inputIndex < length.
				587	if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {
				588	// Save position for this depth, to get back to this once children are done
				589	mStackChildCount[depth] = charGroupIndex;
				590	mStackSiblingPos[depth] = siblingPos;
				591	// Prepare stack values for next depth
				592	++depth;
				593	int childrenPos = childrenNodePos;
				594	mStackChildCount[depth] =
				595	BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
				596	mStackSiblingPos[depth] = childrenPos;
				597	mStackInputIndex[depth] = inputIndex;
				598	pos = childrenPos;
				599	// Go to the next depth level.
				600	++depth;
				601	break;
				602	} else {
				603	// No match, or no children, or word too long to ever match: go the next sibling.
				604	pos = siblingPos;
				605	}
				606	}
				607	--depth;
				608	}
				609	return maxFreq;
				610	}
				611
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	612	bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
Jean Chalard	6a0e964	2011-07-25 18:17:11 +0900	[diff] [blame]	613	return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	614	}
				615
				616	// TODO: remove this function.
				617	int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
				618	int length) const {
				619	return -1;
				620	}
				621
// processCurrentNode returns a boolean telling whether to traverse children nodes or not.
// If the return value is false, then the caller should read the output "nextSiblingPosition"
// to find out the address of the next sibling node and pass it to a new call of
// processCurrentNode. It is worth noting that when false is returned, the output values other
// than nextSiblingPosition are undefined.
// If the return value is true, then the caller must proceed to traverse the children of this
// node. processCurrentNode will output the information about the children: their count in
// newCount and their position in newChildrenPosition, as well as the sibling position in
// nextSiblingPosition. The characters of the node are fed to the correction object rather
// than being reported through output parameters.
// Please also note the following caveat: processCurrentNode does not know when
// there aren't any more nodes at this level, it merely returns the address of the first byte
// after the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes
// at any given level, as output into newCount when traversing this level's parent.
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	636	inline bool UnigramDictionary::processCurrentNode(const int initialPos,
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	637	Correction correction, int newCount,
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	638	int newChildrenPosition, int nextSiblingPosition) {
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	639	if (DEBUG_DICT) {
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	640	correction->checkState();
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	641	}
Jean Chalard	0584f02	2011-06-30 19:23:16 +0900	[diff] [blame]	642	int pos = initialPos;
Jean Chalard	0584f02	2011-06-30 19:23:16 +0900	[diff] [blame]	643
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	644	// Flags contain the following information:
				645	// - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
				646	// - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
				647	// is on the specified number of bytes.
				648	// - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
				649	// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
				650	// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
				651	// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
				652	const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
				653	const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	654	const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags));
				655
				656	bool needsToInvokeOnTerminal = false;
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	657
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	658	// This gets only ONE character from the stream. Next there will be:
				659	// if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
				660	// else if FLAG_IS_TERMINAL: the frequency
				661	// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
				662	// Note that you can't have a node that both is not a terminal and has no children.
				663	int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
				664	assert(NOT_A_CHARACTER != c);
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	665
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	666	// We are going to loop through each character and make it look like it's a different
				667	// node each time. To do that, we will process characters in this node in order until
				668	// we find the character terminator. This is signalled by getCharCode* returning
				669	// NOT_A_CHARACTER.
				670	// As a special case, if there is only one character in this node, we must not read the
				671	// next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.
				672	// This way, each loop run will look like a "virtual node".
				673	do {
				674	// We prefetch the next char. If 'c' is the last char of this node, we will have
				675	// NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node
				676	// should behave as a terminal or not and whether we have children.
				677	const int32_t nextc = hasMultipleChars
				678	? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;
				679	const bool isLastChar = (NOT_A_CHARACTER == nextc);
				680	// If there are more chars in this nodes, then this virtual node is not a terminal.
				681	// If we are on the last char, this virtual node is a terminal if this node is.
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	682	const bool isTerminal = isLastChar && isTerminalNode;
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	683
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	684	Correction::CorrectionType stateType = correction->processCharAndCalcState(
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	685	c, isTerminal);
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	686	if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL
				687	\|\| stateType == Correction::ON_TERMINAL) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	688	needsToInvokeOnTerminal = true;
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	689	} else if (stateType == Correction::UNRELATED) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	690	// We found that this is an unrelated character, so we should give up traversing
				691	// this node and its children entirely.
				692	// However we may not be on the last virtual node yet so we skip the remaining
				693	// characters in this node, the frequency if it's there, read the next sibling
				694	// position to output it, then return false.
				695	// We don't have to output other values because we return false, as in
				696	// "don't traverse children".
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	697	if (!isLastChar) {
				698	pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
				699	}
				700	pos = BinaryFormat::skipFrequency(flags, pos);
				701	*nextSiblingPosition =
				702	BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
				703	return false;
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	704	}
				705
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	706	// Prepare for the next character. Promote the prefetched char to current char - the loop
				707	// will take care of prefetching the next. If we finally found our last char, nextc will
				708	// contain NOT_A_CHARACTER.
				709	c = nextc;
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	710	} while (NOT_A_CHARACTER != c);
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	711
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	712	if (isTerminalNode) {
				713	if (needsToInvokeOnTerminal) {
				714	// The frequency should be here, because we come here only if this is actually
				715	// a terminal node, and we are on its last char.
				716	const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	717	onTerminal(freq, mCorrection);
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	718	}
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	719
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	720	// If there are more chars in this node, then this virtual node has children.
				721	// If we are on the last char, this virtual node has children if this node has.
				722	const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
				723
				724	// This character matched the typed character (enough to traverse the node at least)
				725	// so we just evaluated it. Now we should evaluate this virtual node's children - that
				726	// is, if it has any. If it has no children, we're done here - so we skip the end of
				727	// the node, output the siblings position, and return false "don't traverse children".
				728	// Note that !hasChildren implies isLastChar, so we know we don't have to skip any
				729	// remaining char in this group for there can't be any.
				730	if (!hasChildren) {
				731	pos = BinaryFormat::skipFrequency(flags, pos);
				732	*nextSiblingPosition =
				733	BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
				734	return false;
				735	}
				736
				737	// Optimization: Prune out words that are too long compared to how much was typed.
satok	cfca3c6	2011-08-10 14:30:10 +0900	[diff] [blame]	738	if (correction->needsToPrune()) {
satok	8876b75	2011-08-04 18:31:57 +0900	[diff] [blame]	739	pos = BinaryFormat::skipFrequency(flags, pos);
				740	*nextSiblingPosition =
				741	BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
				742	return false;
				743	}
				744	}
Jean Chalard	1059f27	2011-06-28 20:45:05 +0900	[diff] [blame]	745
				746	// Now we finished processing this node, and we want to traverse children. If there are no
				747	// children, we can't come here.
				748	assert(BinaryFormat::hasChildrenInFlags(flags));
				749
				750	// If this node was a terminal it still has the frequency under the pointer (it may have been
				751	// read, but not skipped - see readFrequencyWithoutMovingPointer).
				752	// Next come the children position, then possibly attributes (attributes are bigrams only for
				753	// now, maybe something related to shortcuts in the future).
				754	// Once this is read, we still need to output the number of nodes in the immediate children of
				755	// this node, so we read and output it before returning true, as in "please traverse children".
				756	pos = BinaryFormat::skipFrequency(flags, pos);
				757	int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
				758	*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
				759	*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
				760	*newChildrenPosition = childrenPos;
				761	return true;
Jean Chalard	85a1d1e	2011-06-21 22:23:21 +0900	[diff] [blame]	762	}
				763
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	764	} // namespace latinime