Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

blob: 5a48a97a8a48f3e904878db1443c20bbc19fe124 [file] [log] [blame]

satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	1	/*
				2	**
				3	** Copyright 2010, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
				18	#include <stdio.h>
				19	#include <fcntl.h>
				20	#include <sys/mman.h>
				21	#include <string.h>
				22
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	23	#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	24
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	25	#include "basechars.h"
				26	#include "char_utils.h"
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	27	#include "dictionary.h"
				28	#include "unigram_dictionary.h"
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	29
				30	namespace latinime {
				31
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	32	UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
				33	int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives,
				34	const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary)
				35	: DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
				36	MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
				37	HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary)
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	38	{
				39	LOGI("UnigramDictionary - constructor");
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	40	LOGI("Has Bigram : %d \n", hasBigram);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	41	mTypedLetterMultiplier = typedLetterMultiplier;
				42	mFullWordMultiplier = fullWordMultiplier;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	43	}
				44
				45	UnigramDictionary::~UnigramDictionary()
				46	{
				47	}
				48
				49	int UnigramDictionary::getSuggestions(int codes, int codesSize, unsigned short outWords, int *frequencies,
				50	int *nextLetters, int nextLettersSize)
				51	{
				52
				53	initSuggestions(codes, codesSize, outWords, frequencies);
				54
				55	int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,
				56	nextLettersSize);
				57
				58	// If there aren't sufficient suggestions, search for words by allowing wild cards at
				59	// the different character positions. This feature is not ready for prime-time as we need
				60	// to figure out the best ranking for such words compared to proximity corrections and
				61	// completions.
				62	if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
				63	for (int i = 0; i < codesSize; ++i) {
				64	int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);
				65	if (tempCount > suggestedWordsCount) {
				66	suggestedWordsCount = tempCount;
				67	break;
				68	}
				69	}
				70	}
				71
				72	if (DEBUG_DICT) {
				73	LOGI("Returning %d words", suggestedWordsCount);
				74	LOGI("Next letters: ");
				75	for (int k = 0; k < nextLettersSize; k++) {
				76	if (nextLetters[k] > 0) {
				77	LOGI("%c = %d,", k, nextLetters[k]);
				78	}
				79	}
				80	LOGI("\n");
				81	}
				82	return suggestedWordsCount;
				83	}
				84
				85	void UnigramDictionary::initSuggestions(int codes, int codesSize, unsigned short outWords,
				86	int *frequencies) {
				87	mFrequencies = frequencies;
				88	mOutputChars = outWords;
				89	mInputCodes = codes;
				90	mInputLength = codesSize;
				91	mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
				92	}
				93
				94	int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
				95	int *nextLetters, int nextLettersSize) {
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	96	if (IS_LATEST_DICT_VERSION) {
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	97	getWordsRec(DICTIONARY_HEADER_SIZE, 0, inputLength * 3, false, 1, 0, 0, skipPos,
				98	nextLetters, nextLettersSize);
				99	} else {
				100	getWordsRec(0, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
				101	}
				102
				103	// Get the word count
				104	int suggestedWordsCount = 0;
				105	while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
				106	suggestedWordsCount++;
				107	}
				108	return suggestedWordsCount;
				109	}
				110
				111	void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
				112	if (c < nextLettersSize) {
				113	nextLetters[c]++;
				114	}
				115	}
				116
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	117	int
				118	UnigramDictionary::wideStrLen(unsigned short *str)
				119	{
				120	if (!str) return 0;
				121	unsigned short *end = str;
				122	while (*end)
				123	end++;
				124	return end - str;
				125	}
				126
				127	bool
				128	UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
				129	{
				130	word[length] = 0;
				131	if (DEBUG_DICT) {
				132	char s[length + 1];
				133	for (int i = 0; i <= length; i++) s[i] = word[i];
				134	LOGI("Found word = %s, freq = %d : \n", s, frequency);
				135	}
				136
				137	// Find the right insertion point
				138	int insertAt = 0;
				139	while (insertAt < MAX_WORDS) {
				140	if (frequency > mFrequencies[insertAt]
				141	\|\| (mFrequencies[insertAt] == frequency
				142	&& length < wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
				143	break;
				144	}
				145	insertAt++;
				146	}
				147	if (insertAt < MAX_WORDS) {
				148	memmove((char) mFrequencies + (insertAt + 1) sizeof(mFrequencies[0]),
				149	(char) mFrequencies + insertAt sizeof(mFrequencies[0]),
				150	(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
				151	mFrequencies[insertAt] = frequency;
				152	memmove((char) mOutputChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
				153	(char) mOutputChars + (insertAt ) MAX_WORD_LENGTH * sizeof(short),
				154	(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
				155	unsigned short dest = mOutputChars + (insertAt ) MAX_WORD_LENGTH;
				156	while (length--) {
				157	dest++ = word++;
				158	}
				159	*dest = 0; // NULL terminate
				160	if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
				161	return true;
				162	}
				163	return false;
				164	}
				165
				166	bool
				167	UnigramDictionary::addWordBigram(unsigned short *word, int length, int frequency)
				168	{
				169	word[length] = 0;
				170	if (DEBUG_DICT) {
				171	char s[length + 1];
				172	for (int i = 0; i <= length; i++) s[i] = word[i];
				173	LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
				174	}
				175
				176	// Find the right insertion point
				177	int insertAt = 0;
				178	while (insertAt < mMaxBigrams) {
				179	if (frequency > mBigramFreq[insertAt]
				180	\|\| (mBigramFreq[insertAt] == frequency
				181	&& length < wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
				182	break;
				183	}
				184	insertAt++;
				185	}
				186	LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
				187	if (insertAt < mMaxBigrams) {
				188	memmove((char) mBigramFreq + (insertAt + 1) sizeof(mBigramFreq[0]),
				189	(char) mBigramFreq + insertAt sizeof(mBigramFreq[0]),
				190	(mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
				191	mBigramFreq[insertAt] = frequency;
				192	memmove((char) mBigramChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
				193	(char) mBigramChars + (insertAt ) MAX_WORD_LENGTH * sizeof(short),
				194	(mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
				195	unsigned short dest = mBigramChars + (insertAt ) MAX_WORD_LENGTH;
				196	while (length--) {
				197	dest++ = word++;
				198	}
				199	*dest = 0; // NULL terminate
				200	if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
				201	return true;
				202	}
				203	return false;
				204	}
				205
				206	unsigned short
				207	UnigramDictionary::toLowerCase(unsigned short c) {
				208	if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
				209	c = BASE_CHARS[c];
				210	}
				211	if (c >='A' && c <= 'Z') {
				212	c \|= 32;
				213	} else if (c > 127) {
				214	c = latin_tolower(c);
				215	}
				216	return c;
				217	}
				218
				219	bool
				220	UnigramDictionary::sameAsTyped(unsigned short *word, int length)
				221	{
				222	if (length != mInputLength) {
				223	return false;
				224	}
				225	int *inputCodes = mInputCodes;
				226	while (length--) {
				227	if ((unsigned int) inputCodes != (unsigned int) word) {
				228	return false;
				229	}
				230	inputCodes += MAX_ALTERNATIVES;
				231	word++;
				232	}
				233	return true;
				234	}
				235
				236	static char QUOTE = '\'';
				237
				238	void
				239	UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
				240	int diffs, int skipPos, int *nextLetters, int nextLettersSize)
				241	{
				242	// Optimization: Prune out words that are too long compared to how much was typed.
				243	if (depth > maxDepth) {
				244	return;
				245	}
				246	if (diffs > mMaxEditDistance) {
				247	return;
				248	}
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	249	int count = Dictionary::getCount(DICT, &pos);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	250	int *currentChars = NULL;
				251	if (mInputLength <= inputIndex) {
				252	completion = true;
				253	} else {
				254	currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
				255	}
				256
				257	for (int i = 0; i < count; i++) {
				258	// -- at char
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	259	unsigned short c = Dictionary::getChar(DICT, &pos);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	260	// -- at flag/add
				261	unsigned short lowerC = toLowerCase(c);
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	262	bool terminal = Dictionary::getTerminal(DICT, &pos);
				263	int childrenAddress = Dictionary::getAddress(DICT, &pos);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	264	// -- after address or flag
				265	int freq = 1;
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	266	if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &pos);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	267	// -- after add or freq
				268
				269	// If we are only doing completions, no need to look at the typed characters.
				270	if (completion) {
				271	mWord[depth] = c;
				272	if (terminal) {
				273	addWord(mWord, depth + 1, freq * snr);
				274	if (depth >= mInputLength && skipPos < 0) {
				275	registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
				276	}
				277	}
				278	if (childrenAddress != 0) {
				279	getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
				280	diffs, skipPos, nextLetters, nextLettersSize);
				281	}
				282	} else if ((c == QUOTE && currentChars[0] != QUOTE) \|\| skipPos == depth) {
				283	// Skip the ' or other letter and continue deeper
				284	mWord[depth] = c;
				285	if (childrenAddress != 0) {
				286	getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
				287	skipPos, nextLetters, nextLettersSize);
				288	}
				289	} else {
				290	int j = 0;
				291	while (currentChars[j] > 0) {
				292	if (currentChars[j] == lowerC \|\| currentChars[j] == c) {
				293	int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
				294	mWord[depth] = c;
				295	if (mInputLength == inputIndex + 1) {
				296	if (terminal) {
				297	if (//INCLUDE_TYPED_WORD_IF_VALID \|\|
				298	!sameAsTyped(mWord, depth + 1)) {
				299	int finalFreq = freq * snr * addedWeight;
				300	if (skipPos < 0) finalFreq *= mFullWordMultiplier;
				301	addWord(mWord, depth + 1, finalFreq);
				302	}
				303	}
				304	if (childrenAddress != 0) {
				305	getWordsRec(childrenAddress, depth + 1,
				306	maxDepth, true, snr * addedWeight, inputIndex + 1,
				307	diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
				308	}
				309	} else if (childrenAddress != 0) {
				310	getWordsRec(childrenAddress, depth + 1, maxDepth,
				311	false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
				312	skipPos, nextLetters, nextLettersSize);
				313	}
				314	}
				315	j++;
				316	if (skipPos >= 0) break;
				317	}
				318	}
				319	}
				320	}
				321
				322	int
				323	UnigramDictionary::getBigramAddress(int *pos, bool advance)
				324	{
				325	int address = 0;
				326
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	327	address += (DICT[*pos] & 0x3F) << 16;
				328	address += (DICT[*pos + 1] & 0xFF) << 8;
				329	address += (DICT[*pos + 2] & 0xFF);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	330
				331	if (advance) {
				332	*pos += 3;
				333	}
				334
				335	return address;
				336	}
				337
				338	int
				339	UnigramDictionary::getBigramFreq(int *pos)
				340	{
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	341	int freq = DICT[(*pos)++] & FLAG_BIGRAM_FREQ;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	342
				343	return freq;
				344	}
				345
				346
				347	int
				348	UnigramDictionary::getBigrams(unsigned short prevWord, int prevWordLength, int codes, int codesSize,
				349	unsigned short bigramChars, int bigramFreq, int maxWordLength, int maxBigrams,
				350	int maxAlternatives)
				351	{
				352	mBigramFreq = bigramFreq;
				353	mBigramChars = bigramChars;
				354	mInputCodes = codes;
				355	mInputLength = codesSize;
				356	mMaxBigrams = maxBigrams;
				357
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	358	if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
				359	int pos = mParentDictionary->isValidWordRec(
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	360	DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
				361	LOGI("Pos -> %d\n", pos);
				362	if (pos < 0) {
				363	return 0;
				364	}
				365
				366	int bigramCount = 0;
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	367	int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	368	if (bigramExist > 0) {
				369	int nextBigramExist = 1;
				370	while (nextBigramExist > 0 && bigramCount < maxBigrams) {
				371	int bigramAddress = getBigramAddress(&pos, true);
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	372	int frequency = (FLAG_BIGRAM_FREQ & DICT[pos]);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	373	// search for all bigrams and store them
				374	searchForTerminalNode(bigramAddress, frequency);
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	375	nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	376	bigramCount++;
				377	}
				378	}
				379
				380	return bigramCount;
				381	}
				382	return 0;
				383	}
				384
				385	void
				386	UnigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency)
				387	{
				388	// track word with such address and store it in an array
				389	unsigned short word[MAX_WORD_LENGTH];
				390
				391	int pos;
				392	int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
				393	bool found = false;
				394	char followingChar = ' ';
				395	int depth = -1;
				396
				397	while(!found) {
				398	bool followDownAddressSearchStop = false;
				399	bool firstAddress = true;
				400	bool haveToSearchAll = true;
				401
				402	if (depth >= 0) {
				403	word[depth] = (unsigned short) followingChar;
				404	}
				405	pos = followDownBranchAddress; // pos start at count
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	406	int count = DICT[pos] & 0xFF;
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	407	LOGI("count - %d\n",count);
				408	pos++;
				409	for (int i = 0; i < count; i++) {
				410	// pos at data
				411	pos++;
				412	// pos now at flag
				413	if (!getFirstBitOfByte(&pos)) { // non-terminal
				414	if (!followDownAddressSearchStop) {
				415	int addr = getBigramAddress(&pos, false);
				416	if (addr > addressLookingFor) {
				417	followDownAddressSearchStop = true;
				418	if (firstAddress) {
				419	firstAddress = false;
				420	haveToSearchAll = true;
				421	} else if (!haveToSearchAll) {
				422	break;
				423	}
				424	} else {
				425	followDownBranchAddress = addr;
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	426	followingChar = (char)(0xFF & DICT[pos-1]);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	427	if (firstAddress) {
				428	firstAddress = false;
				429	haveToSearchAll = false;
				430	}
				431	}
				432	}
				433	pos += 3;
				434	} else if (getFirstBitOfByte(&pos)) { // terminal
				435	if (addressLookingFor == (pos-1)) { // found !!
				436	depth++;
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	437	word[depth] = (0xFF & DICT[pos-1]);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	438	found = true;
				439	break;
				440	}
				441	if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
				442	if (!followDownAddressSearchStop) {
				443	int addr = getBigramAddress(&pos, false);
				444	if (addr > addressLookingFor) {
				445	followDownAddressSearchStop = true;
				446	if (firstAddress) {
				447	firstAddress = false;
				448	haveToSearchAll = true;
				449	} else if (!haveToSearchAll) {
				450	break;
				451	}
				452	} else {
				453	followDownBranchAddress = addr;
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	454	followingChar = (char)(0xFF & DICT[pos-1]);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	455	if (firstAddress) {
				456	firstAddress = false;
				457	haveToSearchAll = true;
				458	}
				459	}
				460	}
				461	pos += 4;
				462	} else { // freq only (2 byte)
				463	pos += 2;
				464	}
				465
				466	// skipping bigram
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	467	int bigramExist = (DICT[pos] & FLAG_BIGRAM_READ);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	468	if (bigramExist > 0) {
				469	int nextBigramExist = 1;
				470	while (nextBigramExist > 0) {
				471	pos += 3;
satok	e808e43	2010-12-02 14:53:24 +0900	[diff] [blame]	472	nextBigramExist = (DICT[pos++] & FLAG_BIGRAM_CONTINUED);
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	473	}
				474	} else {
				475	pos++;
				476	}
				477	}
				478	}
				479	depth++;
				480	if (followDownBranchAddress == 0) {
				481	LOGI("ERROR!!! Cannot find bigram!!");
				482	break;
				483	}
				484	}
				485	if (checkFirstCharacter(word)) {
				486	addWordBigram(word, depth, frequency);
				487	}
				488	}
				489
				490	bool
				491	UnigramDictionary::checkFirstCharacter(unsigned short *word)
				492	{
				493	// Checks whether this word starts with same character or neighboring characters of
				494	// what user typed.
				495
				496	int *inputCodes = mInputCodes;
				497	int maxAlt = MAX_ALTERNATIVES;
				498	while (maxAlt > 0) {
				499	if ((unsigned int) inputCodes == (unsigned int) word) {
				500	return true;
				501	}
				502	inputCodes++;
				503	maxAlt--;
				504	}
				505	return false;
				506	}
				507
satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame]	508	} // namespace latinime