Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

blob: d0c903e81ee35e25409b0847452b044ed7e555ab [file] [log] [blame]

satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	1	/*
				2	**
				3	** Copyright 2010, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
				18	#include <stdio.h>
				19	#include <fcntl.h>
				20	#include <sys/mman.h>
				21	#include <string.h>
				22
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	23	#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	24
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	25	#include "basechars.h"
				26	#include "char_utils.h"
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	27	#include "dictionary.h"
				28	#include "unigram_dictionary.h"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	29
				30	namespace latinime {
				31
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	32	UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
				33	int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives,
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame^]	34	const bool isLatestDictVersion)
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	35	: DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
				36	MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame^]	37	TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	38	LOGI("UnigramDictionary - constructor");
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	39	}
				40
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame^]	41	UnigramDictionary::~UnigramDictionary() {}
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	42
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame^]	43	int UnigramDictionary::getSuggestions(int codes, int codesSize, unsigned short outWords,
				44	int frequencies, int nextLetters, int nextLettersSize)
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	45	{
				46
				47	initSuggestions(codes, codesSize, outWords, frequencies);
				48
				49	int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,
				50	nextLettersSize);
				51
				52	// If there aren't sufficient suggestions, search for words by allowing wild cards at
				53	// the different character positions. This feature is not ready for prime-time as we need
				54	// to figure out the best ranking for such words compared to proximity corrections and
				55	// completions.
				56	if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
				57	for (int i = 0; i < codesSize; ++i) {
				58	int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);
				59	if (tempCount > suggestedWordsCount) {
				60	suggestedWordsCount = tempCount;
				61	break;
				62	}
				63	}
				64	}
				65
				66	if (DEBUG_DICT) {
				67	LOGI("Returning %d words", suggestedWordsCount);
				68	LOGI("Next letters: ");
				69	for (int k = 0; k < nextLettersSize; k++) {
				70	if (nextLetters[k] > 0) {
				71	LOGI("%c = %d,", k, nextLetters[k]);
				72	}
				73	}
				74	LOGI("\n");
				75	}
				76	return suggestedWordsCount;
				77	}
				78
				79	void UnigramDictionary::initSuggestions(int codes, int codesSize, unsigned short outWords,
				80	int *frequencies) {
				81	mFrequencies = frequencies;
				82	mOutputChars = outWords;
				83	mInputCodes = codes;
				84	mInputLength = codesSize;
				85	mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
				86	}
				87
				88	int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
				89	int *nextLetters, int nextLettersSize) {
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	90	if (IS_LATEST_DICT_VERSION) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	91	getWordsRec(DICTIONARY_HEADER_SIZE, 0, inputLength * 3, false, 1, 0, 0, skipPos,
				92	nextLetters, nextLettersSize);
				93	} else {
				94	getWordsRec(0, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
				95	}
				96
				97	// Get the word count
				98	int suggestedWordsCount = 0;
				99	while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
				100	suggestedWordsCount++;
				101	}
				102	return suggestedWordsCount;
				103	}
				104
				105	void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
				106	if (c < nextLettersSize) {
				107	nextLetters[c]++;
				108	}
				109	}
				110
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	111	bool
				112	UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
				113	{
				114	word[length] = 0;
				115	if (DEBUG_DICT) {
				116	char s[length + 1];
				117	for (int i = 0; i <= length; i++) s[i] = word[i];
				118	LOGI("Found word = %s, freq = %d : \n", s, frequency);
				119	}
				120
				121	// Find the right insertion point
				122	int insertAt = 0;
				123	while (insertAt < MAX_WORDS) {
				124	if (frequency > mFrequencies[insertAt]
				125	\|\| (mFrequencies[insertAt] == frequency
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame^]	126	&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	127	break;
				128	}
				129	insertAt++;
				130	}
				131	if (insertAt < MAX_WORDS) {
				132	memmove((char) mFrequencies + (insertAt + 1) sizeof(mFrequencies[0]),
				133	(char) mFrequencies + insertAt sizeof(mFrequencies[0]),
				134	(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
				135	mFrequencies[insertAt] = frequency;
				136	memmove((char) mOutputChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
				137	(char) mOutputChars + (insertAt ) MAX_WORD_LENGTH * sizeof(short),
				138	(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
				139	unsigned short dest = mOutputChars + (insertAt ) MAX_WORD_LENGTH;
				140	while (length--) {
				141	dest++ = word++;
				142	}
				143	*dest = 0; // NULL terminate
				144	if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
				145	return true;
				146	}
				147	return false;
				148	}
				149
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	150	unsigned short
				151	UnigramDictionary::toLowerCase(unsigned short c) {
				152	if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
				153	c = BASE_CHARS[c];
				154	}
				155	if (c >='A' && c <= 'Z') {
				156	c \|= 32;
				157	} else if (c > 127) {
				158	c = latin_tolower(c);
				159	}
				160	return c;
				161	}
				162
				163	bool
				164	UnigramDictionary::sameAsTyped(unsigned short *word, int length)
				165	{
				166	if (length != mInputLength) {
				167	return false;
				168	}
				169	int *inputCodes = mInputCodes;
				170	while (length--) {
				171	if ((unsigned int) inputCodes != (unsigned int) word) {
				172	return false;
				173	}
				174	inputCodes += MAX_ALTERNATIVES;
				175	word++;
				176	}
				177	return true;
				178	}
				179
				180	static char QUOTE = '\'';
				181
				182	void
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame^]	183	UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr,
				184	int inputIndex, int diffs, int skipPos, int *nextLetters, int nextLettersSize)
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	185	{
				186	// Optimization: Prune out words that are too long compared to how much was typed.
				187	if (depth > maxDepth) {
				188	return;
				189	}
				190	if (diffs > mMaxEditDistance) {
				191	return;
				192	}
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	193	int count = Dictionary::getCount(DICT, &pos);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	194	int *currentChars = NULL;
				195	if (mInputLength <= inputIndex) {
				196	completion = true;
				197	} else {
				198	currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
				199	}
				200
				201	for (int i = 0; i < count; i++) {
				202	// -- at char
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	203	unsigned short c = Dictionary::getChar(DICT, &pos);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	204	// -- at flag/add
				205	unsigned short lowerC = toLowerCase(c);
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	206	bool terminal = Dictionary::getTerminal(DICT, &pos);
				207	int childrenAddress = Dictionary::getAddress(DICT, &pos);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	208	// -- after address or flag
				209	int freq = 1;
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	210	if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &pos);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	211	// -- after add or freq
				212
				213	// If we are only doing completions, no need to look at the typed characters.
				214	if (completion) {
				215	mWord[depth] = c;
				216	if (terminal) {
				217	addWord(mWord, depth + 1, freq * snr);
				218	if (depth >= mInputLength && skipPos < 0) {
				219	registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
				220	}
				221	}
				222	if (childrenAddress != 0) {
				223	getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
				224	diffs, skipPos, nextLetters, nextLettersSize);
				225	}
				226	} else if ((c == QUOTE && currentChars[0] != QUOTE) \|\| skipPos == depth) {
				227	// Skip the ' or other letter and continue deeper
				228	mWord[depth] = c;
				229	if (childrenAddress != 0) {
				230	getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
				231	skipPos, nextLetters, nextLettersSize);
				232	}
				233	} else {
				234	int j = 0;
				235	while (currentChars[j] > 0) {
				236	if (currentChars[j] == lowerC \|\| currentChars[j] == c) {
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame^]	237	int addedWeight = j == 0 ? TYPED_LETTER_MULTIPLIER : 1;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	238	mWord[depth] = c;
				239	if (mInputLength == inputIndex + 1) {
				240	if (terminal) {
				241	if (//INCLUDE_TYPED_WORD_IF_VALID \|\|
				242	!sameAsTyped(mWord, depth + 1)) {
				243	int finalFreq = freq * snr * addedWeight;
satok	18c28f4	2010-12-02 18:11:54 +0900	[diff] [blame^]	244	if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	245	addWord(mWord, depth + 1, finalFreq);
				246	}
				247	}
				248	if (childrenAddress != 0) {
				249	getWordsRec(childrenAddress, depth + 1,
				250	maxDepth, true, snr * addedWeight, inputIndex + 1,
				251	diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
				252	}
				253	} else if (childrenAddress != 0) {
				254	getWordsRec(childrenAddress, depth + 1, maxDepth,
				255	false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
				256	skipPos, nextLetters, nextLettersSize);
				257	}
				258	}
				259	j++;
				260	if (skipPos >= 0) break;
				261	}
				262	}
				263	}
				264	}
				265
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	266	} // namespace latinime