Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

blob: 707f1e6fb0d66a14da4014363357134f49ba1293 [file] [log] [blame]

satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	1	/*
				2	**
				3	** Copyright 2010, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
satok	48e432c	2010-12-06 17:38:58 +0900	[diff] [blame]	18	#include <assert.h>
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	19	#include <fcntl.h>
satok	f5cded1	2010-12-06 21:28:24 +0900	[diff] [blame^]	20	#include <stdio.h>
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	21	#include <string.h>
				22
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	23	#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	24
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	25	#include "basechars.h"
				26	#include "char_utils.h"
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	27	#include "dictionary.h"
				28	#include "unigram_dictionary.h"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	29
				30	namespace latinime {
				31
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	32	UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
				33	int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives,
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame]	34	const bool isLatestDictVersion)
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	35	: DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
				36	MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame]	37	TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	38	LOGI("UnigramDictionary - constructor");
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	39	}
				40
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame]	41	UnigramDictionary::~UnigramDictionary() {}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	42
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame]	43	int UnigramDictionary::getSuggestions(int codes, int codesSize, unsigned short outWords,
				44	int frequencies, int nextLetters, int nextLettersSize)
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	45	{
				46
				47	initSuggestions(codes, codesSize, outWords, frequencies);
				48
				49	int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,
				50	nextLettersSize);
				51
				52	// If there aren't sufficient suggestions, search for words by allowing wild cards at
				53	// the different character positions. This feature is not ready for prime-time as we need
				54	// to figure out the best ranking for such words compared to proximity corrections and
				55	// completions.
				56	if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
				57	for (int i = 0; i < codesSize; ++i) {
				58	int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);
				59	if (tempCount > suggestedWordsCount) {
				60	suggestedWordsCount = tempCount;
				61	break;
				62	}
				63	}
				64	}
				65
				66	if (DEBUG_DICT) {
				67	LOGI("Returning %d words", suggestedWordsCount);
				68	LOGI("Next letters: ");
				69	for (int k = 0; k < nextLettersSize; k++) {
				70	if (nextLetters[k] > 0) {
				71	LOGI("%c = %d,", k, nextLetters[k]);
				72	}
				73	}
				74	LOGI("\n");
				75	}
				76	return suggestedWordsCount;
				77	}
				78
				79	void UnigramDictionary::initSuggestions(int codes, int codesSize, unsigned short outWords,
				80	int *frequencies) {
satok	f5cded1	2010-12-06 21:28:24 +0900	[diff] [blame^]	81	if (DEBUG_DICT) LOGI("initSuggest");
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	82	mFrequencies = frequencies;
				83	mOutputChars = outWords;
				84	mInputCodes = codes;
				85	mInputLength = codesSize;
				86	mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
				87	}
				88
				89	int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
				90	int *nextLetters, int nextLettersSize) {
satok	f5cded1	2010-12-06 21:28:24 +0900	[diff] [blame^]	91	if (DEBUG_DICT) LOGI("getSuggestionCandidates");
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	92	int initialPos = 0;
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	93	if (IS_LATEST_DICT_VERSION) {
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	94	initialPos = DICTIONARY_HEADER_SIZE;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	95	}
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	96	getWords(initialPos, inputLength, skipPos, nextLetters, nextLettersSize);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	97
				98	// Get the word count
				99	int suggestedWordsCount = 0;
				100	while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
				101	suggestedWordsCount++;
				102	}
				103	return suggestedWordsCount;
				104	}
				105
satok	715514d	2010-12-02 20:19:59 +0900	[diff] [blame]	106	void UnigramDictionary::registerNextLetter(
				107	unsigned short c, int *nextLetters, int nextLettersSize) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	108	if (c < nextLettersSize) {
				109	nextLetters[c]++;
				110	}
				111	}
				112
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	113	bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	114	word[length] = 0;
				115	if (DEBUG_DICT) {
				116	char s[length + 1];
				117	for (int i = 0; i <= length; i++) s[i] = word[i];
				118	LOGI("Found word = %s, freq = %d : \n", s, frequency);
				119	}
satok	f5cded1	2010-12-06 21:28:24 +0900	[diff] [blame^]	120	if (length > MAX_WORD_LENGTH) {
				121	if (DEBUG_DICT) LOGI("Exceeded max word length.");
				122	return false;
				123	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	124
				125	// Find the right insertion point
				126	int insertAt = 0;
				127	while (insertAt < MAX_WORDS) {
satok	715514d	2010-12-02 20:19:59 +0900	[diff] [blame]	128	if (frequency > mFrequencies[insertAt] \|\| (mFrequencies[insertAt] == frequency
				129	&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	130	break;
				131	}
				132	insertAt++;
				133	}
				134	if (insertAt < MAX_WORDS) {
				135	memmove((char) mFrequencies + (insertAt + 1) sizeof(mFrequencies[0]),
				136	(char) mFrequencies + insertAt sizeof(mFrequencies[0]),
				137	(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
				138	mFrequencies[insertAt] = frequency;
				139	memmove((char) mOutputChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
satok	715514d	2010-12-02 20:19:59 +0900	[diff] [blame]	140	(char) mOutputChars + insertAt MAX_WORD_LENGTH * sizeof(short),
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	141	(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
satok	715514d	2010-12-02 20:19:59 +0900	[diff] [blame]	142	unsigned short dest = mOutputChars + insertAt MAX_WORD_LENGTH;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	143	while (length--) {
				144	dest++ = word++;
				145	}
				146	*dest = 0; // NULL terminate
				147	if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
				148	return true;
				149	}
				150	return false;
				151	}
				152
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	153	unsigned short UnigramDictionary::toLowerCase(unsigned short c) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	154	if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
				155	c = BASE_CHARS[c];
				156	}
				157	if (c >='A' && c <= 'Z') {
				158	c \|= 32;
				159	} else if (c > 127) {
				160	c = latin_tolower(c);
				161	}
				162	return c;
				163	}
				164
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	165	bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	166	if (length != mInputLength) {
				167	return false;
				168	}
				169	int *inputCodes = mInputCodes;
				170	while (length--) {
				171	if ((unsigned int) inputCodes != (unsigned int) word) {
				172	return false;
				173	}
				174	inputCodes += MAX_ALTERNATIVES;
				175	word++;
				176	}
				177	return true;
				178	}
				179
// The apostrophe character; needsToSkipCurrentNode() compares dictionary and typed
// characters against it.
static const char QUOTE = '\'';
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	181
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	182	void UnigramDictionary::getWords(const int initialPos, const int inputLength, const int skipPos,
				183	int *nextLetters, const int nextLettersSize) {
				184	int initialPosition = initialPos;
				185	const int count = Dictionary::getCount(DICT, &initialPosition);
satok	f5cded1	2010-12-06 21:28:24 +0900	[diff] [blame^]	186	getWordsRec(count, initialPosition, 0,
				187	min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	188	mInputLength <= 0, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
				189	}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	190
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	191	// snr : frequency?
				192	void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
				193	const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
				194	const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize) {
satok	48e432c	2010-12-06 17:38:58 +0900	[diff] [blame]	195	int siblingPos = pos;
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	196	for (int i = 0; i < childrenCount; ++i) {
satok	48e432c	2010-12-06 17:38:58 +0900	[diff] [blame]	197	int newCount;
				198	int newChildPosition;
				199	int newDepth;
				200	bool newTraverseAllNodes;
				201	int newSnr;
				202	int newInputIndex;
				203	int newDiffs;
				204	int newSiblingPos;
				205	const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
				206	traverseAllNodes, snr, inputIndex, diffs, skipPos, nextLetters, nextLettersSize,
				207	&newCount, &newChildPosition, &newDepth, &newTraverseAllNodes, &newSnr,
				208	&newInputIndex, &newDiffs, &newSiblingPos);
				209	siblingPos = newSiblingPos;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	210
satok	48e432c	2010-12-06 17:38:58 +0900	[diff] [blame]	211	if (needsToTraverseChildrenNodes) {
				212	getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	213	newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	214	}
				215	}
				216	}
				217
satok	715514d	2010-12-02 20:19:59 +0900	[diff] [blame]	218	inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
				219	unsigned short *word, const int inputLength, const int depth, const int snr,
				220	int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
				221	addWord(word, depth + 1, freq * snr);
				222	if (depth >= inputLength && skipPos < 0) {
				223	registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
				224	}
				225	}
				226
				227	inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
				228	unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,
				229	const int addedWeight) {
				230	if (!sameAsTyped(word, depth + 1)) {
				231	int finalFreq = freq * snr * addedWeight;
				232	// Proximity collection will promote a word of the same length as
				233	// what user typed.
				234	if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
				235	addWord(word, depth + 1, finalFreq);
				236	}
				237	}
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	238
				239	inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	240	const int inputIndex, const int skipPos, const int depth) {
				241	const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_ALTERNATIVES))[0];
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	242	// Skip the ' or other letter and continue deeper
				243	return (c == QUOTE && userTypedChar != QUOTE) \|\| skipPos == depth;
				244	}
				245
				246	inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
satok	48e432c	2010-12-06 17:38:58 +0900	[diff] [blame]	247	const unsigned short c, const int skipPos) {
				248	const unsigned short lowerC = toLowerCase(c);
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	249	int j = 0;
				250	while (currentChars[j] > 0) {
satok	6831926	2010-12-03 19:38:08 +0900	[diff] [blame]	251	const bool matched = (currentChars[j] == lowerC \|\| currentChars[j] == c);
satok	28bd03b	2010-12-03 16:39:16 +0900	[diff] [blame]	252	// If skipPos is defined, not to search proximity collections.
				253	// First char is what user typed.
				254	if (matched) {
				255	return j;
				256	} else if (skipPos >= 0) {
				257	return -1;
				258	}
				259	++j;
				260	}
				261	return -1;
				262	}
				263
satok	48e432c	2010-12-06 17:38:58 +0900	[diff] [blame]	264	inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
				265	const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
				266	const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize,
				267	int newCount, int newChildPosition, int newDepth, bool newTraverseAllNodes,
				268	int newSnr, intnewInputIndex, int newDiffs, int nextSiblingPosition) {
				269	unsigned short c;
				270	int childPosition;
				271	bool terminal;
				272	int freq;
				273	*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
				274	&childPosition, &terminal, &freq);
				275
				276	const bool needsToTraverseChildrenNodes = childPosition != 0;
				277
				278	// If we are only doing traverseAllNodes, no need to look at the typed characters.
				279	if (traverseAllNodes \|\| needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
				280	mWord[depth] = c;
				281	if (traverseAllNodes && terminal) {
				282	onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
				283	snr, nextLetters, nextLettersSize, skipPos, freq);
				284	}
				285	if (!needsToTraverseChildrenNodes) return false;
				286	*newTraverseAllNodes = traverseAllNodes;
				287	*newSnr = snr;
				288	*newDiffs = diffs;
				289	*newInputIndex = inputIndex;
				290	*newDepth = depth + 1;
				291	} else {
				292	int currentChars = mInputCodes + (inputIndex MAX_ALTERNATIVES);
				293	int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
				294	if (matchedProximityCharId < 0) return false;
				295	mWord[depth] = c;
				296	// If inputIndex is greater than mInputLength, that means there is no
				297	// proximity chars. So, we don't need to check proximity.
				298	const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;
				299	const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
				300	if (isSameAsUserTypedLength && terminal) {
				301	onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
				302	skipPos, freq, addedWeight);
				303	}
				304	if (!needsToTraverseChildrenNodes) return false;
				305	// Start traversing all nodes after the index exceeds the user typed length
				306	*newTraverseAllNodes = isSameAsUserTypedLength;
				307	newSnr = snr addedWeight;
				308	*newDiffs = diffs + (matchedProximityCharId > 0);
				309	*newInputIndex = inputIndex + 1;
				310	*newDepth = depth + 1;
				311	}
				312	// Optimization: Prune out words that are too long compared to how much was typed.
				313	if (newDepth > maxDepth \|\| newDiffs > mMaxEditDistance) {
				314	return false;
				315	}
				316
				317	// If inputIndex is greater than mInputLength, that means there are no proximity chars.
				318	if (mInputLength <= *newInputIndex) {
				319	*newTraverseAllNodes = true;
				320	}
				321	// get the count of nodes and increment childAddress.
				322	*newCount = Dictionary::getCount(DICT, &childPosition);
				323	*newChildPosition = childPosition;
				324	if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
				325	return needsToTraverseChildrenNodes;
				326	}
				327
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	328	} // namespace latinime