Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

blob: 88382a94d7d21a3c5077141c6eafe3a067a989cb [file] [log] [blame]

satok	3008825	2010-12-01 21:22:15 +0900	[diff] [blame^]	1	/*
				2	**
				3	** Copyright 2010, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
				18	#include <stdio.h>
				19	#include <fcntl.h>
				20	#include <sys/mman.h>
				21	#include <string.h>
				22
				23	#ifdef FLAG_DBG
				24	#define LOG_TAG "LatinIME: dictionary.cpp"
				25	#include <cutils/log.h>
				26	#define DEBUG_DICT 1
				27	#else // FLAG_DBG
				28	#define LOGI
				29	#define DEBUG_DICT 0
				30	#endif // FLAG_DBG
				31
				32	#include "unigram_dictionary.h"
				33	#include "basechars.h"
				34	#include "char_utils.h"
				35
				36	#define DICTIONARY_VERSION_MIN 200
				37	#define DICTIONARY_HEADER_SIZE 2
				38	#define NOT_VALID_WORD -99
				39
				40	#define SUGGEST_MISSING_CHARACTERS true
				41	#define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5
				42
				43
				44	namespace latinime {
				45
				46	UnigramDictionary::UnigramDictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
				47	int maxWordLength, int maxWords, int maxAlternatives, Dictionary *parentDictionary)
				48	: MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords), MAX_ALTERNATIVES(maxAlternatives)
				49	{
				50	LOGI("UnigramDictionary - constructor");
				51	mDict = (unsigned char*) dict;
				52	mTypedLetterMultiplier = typedLetterMultiplier;
				53	mFullWordMultiplier = fullWordMultiplier;
				54	mParentDictionary = parentDictionary;
				55	getVersionNumber();
				56	}
				57
				58	UnigramDictionary::~UnigramDictionary()
				59	{
				60	}
				61
				62	int UnigramDictionary::getSuggestions(int codes, int codesSize, unsigned short outWords, int *frequencies,
				63	int *nextLetters, int nextLettersSize)
				64	{
				65
				66	initSuggestions(codes, codesSize, outWords, frequencies);
				67
				68	int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,
				69	nextLettersSize);
				70
				71	// If there aren't sufficient suggestions, search for words by allowing wild cards at
				72	// the different character positions. This feature is not ready for prime-time as we need
				73	// to figure out the best ranking for such words compared to proximity corrections and
				74	// completions.
				75	if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
				76	for (int i = 0; i < codesSize; ++i) {
				77	int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);
				78	if (tempCount > suggestedWordsCount) {
				79	suggestedWordsCount = tempCount;
				80	break;
				81	}
				82	}
				83	}
				84
				85	if (DEBUG_DICT) {
				86	LOGI("Returning %d words", suggestedWordsCount);
				87	LOGI("Next letters: ");
				88	for (int k = 0; k < nextLettersSize; k++) {
				89	if (nextLetters[k] > 0) {
				90	LOGI("%c = %d,", k, nextLetters[k]);
				91	}
				92	}
				93	LOGI("\n");
				94	}
				95	return suggestedWordsCount;
				96	}
				97
				98	void UnigramDictionary::initSuggestions(int codes, int codesSize, unsigned short outWords,
				99	int *frequencies) {
				100	mFrequencies = frequencies;
				101	mOutputChars = outWords;
				102	mInputCodes = codes;
				103	mInputLength = codesSize;
				104	mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
				105	}
				106
				107	int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
				108	int *nextLetters, int nextLettersSize) {
				109	if (checkIfDictVersionIsLatest()) {
				110	getWordsRec(DICTIONARY_HEADER_SIZE, 0, inputLength * 3, false, 1, 0, 0, skipPos,
				111	nextLetters, nextLettersSize);
				112	} else {
				113	getWordsRec(0, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
				114	}
				115
				116	// Get the word count
				117	int suggestedWordsCount = 0;
				118	while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
				119	suggestedWordsCount++;
				120	}
				121	return suggestedWordsCount;
				122	}
				123
				124	void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
				125	if (c < nextLettersSize) {
				126	nextLetters[c]++;
				127	}
				128	}
				129
				130	// TODO: Should be const static variable calculate in the constructor
				131	void
				132	UnigramDictionary::getVersionNumber()
				133	{
				134	mVersion = (mDict[0] & 0xFF);
				135	mBigram = (mDict[1] & 0xFF);
				136	LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
				137	}
				138
				139	// TODO: Should be const static variable calculate in the constructor
				140	// Checks whether it has the latest dictionary or the old dictionary
				141	bool
				142	UnigramDictionary::checkIfDictVersionIsLatest()
				143	{
				144	return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 \|\| mBigram == 0);
				145	}
				146
				147	unsigned short
				148	UnigramDictionary::getChar(int *pos)
				149	{
				150	unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
				151	// If the code is 255, then actual 16 bit code follows (in big endian)
				152	if (ch == 0xFF) {
				153	ch = ((mDict[pos] & 0xFF) << 8) \| (mDict[pos + 1] & 0xFF);
				154	(*pos) += 2;
				155	}
				156	return ch;
				157	}
				158
				159	int
				160	UnigramDictionary::getAddress(int *pos)
				161	{
				162	int address = 0;
				163	if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
				164	*pos += 1;
				165	} else {
				166	address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
				167	address += (mDict[*pos + 1] & 0xFF) << 8;
				168	address += (mDict[*pos + 2] & 0xFF);
				169	*pos += 3;
				170	}
				171	return address;
				172	}
				173
				174	int
				175	UnigramDictionary::getFreq(int *pos)
				176	{
				177	int freq = mDict[(*pos)++] & 0xFF;
				178
				179	if (checkIfDictVersionIsLatest()) {
				180	// skipping bigram
				181	int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ);
				182	if (bigramExist > 0) {
				183	int nextBigramExist = 1;
				184	while (nextBigramExist > 0) {
				185	(*pos) += 3;
				186	nextBigramExist = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
				187	}
				188	} else {
				189	(*pos)++;
				190	}
				191	}
				192
				193	return freq;
				194	}
				195
				196	int
				197	UnigramDictionary::wideStrLen(unsigned short *str)
				198	{
				199	if (!str) return 0;
				200	unsigned short *end = str;
				201	while (*end)
				202	end++;
				203	return end - str;
				204	}
				205
				206	bool
				207	UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
				208	{
				209	word[length] = 0;
				210	if (DEBUG_DICT) {
				211	char s[length + 1];
				212	for (int i = 0; i <= length; i++) s[i] = word[i];
				213	LOGI("Found word = %s, freq = %d : \n", s, frequency);
				214	}
				215
				216	// Find the right insertion point
				217	int insertAt = 0;
				218	while (insertAt < MAX_WORDS) {
				219	if (frequency > mFrequencies[insertAt]
				220	\|\| (mFrequencies[insertAt] == frequency
				221	&& length < wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
				222	break;
				223	}
				224	insertAt++;
				225	}
				226	if (insertAt < MAX_WORDS) {
				227	memmove((char) mFrequencies + (insertAt + 1) sizeof(mFrequencies[0]),
				228	(char) mFrequencies + insertAt sizeof(mFrequencies[0]),
				229	(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
				230	mFrequencies[insertAt] = frequency;
				231	memmove((char) mOutputChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
				232	(char) mOutputChars + (insertAt ) MAX_WORD_LENGTH * sizeof(short),
				233	(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
				234	unsigned short dest = mOutputChars + (insertAt ) MAX_WORD_LENGTH;
				235	while (length--) {
				236	dest++ = word++;
				237	}
				238	*dest = 0; // NULL terminate
				239	if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
				240	return true;
				241	}
				242	return false;
				243	}
				244
				245	bool
				246	UnigramDictionary::addWordBigram(unsigned short *word, int length, int frequency)
				247	{
				248	word[length] = 0;
				249	if (DEBUG_DICT) {
				250	char s[length + 1];
				251	for (int i = 0; i <= length; i++) s[i] = word[i];
				252	LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
				253	}
				254
				255	// Find the right insertion point
				256	int insertAt = 0;
				257	while (insertAt < mMaxBigrams) {
				258	if (frequency > mBigramFreq[insertAt]
				259	\|\| (mBigramFreq[insertAt] == frequency
				260	&& length < wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
				261	break;
				262	}
				263	insertAt++;
				264	}
				265	LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
				266	if (insertAt < mMaxBigrams) {
				267	memmove((char) mBigramFreq + (insertAt + 1) sizeof(mBigramFreq[0]),
				268	(char) mBigramFreq + insertAt sizeof(mBigramFreq[0]),
				269	(mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
				270	mBigramFreq[insertAt] = frequency;
				271	memmove((char) mBigramChars + (insertAt + 1) MAX_WORD_LENGTH * sizeof(short),
				272	(char) mBigramChars + (insertAt ) MAX_WORD_LENGTH * sizeof(short),
				273	(mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
				274	unsigned short dest = mBigramChars + (insertAt ) MAX_WORD_LENGTH;
				275	while (length--) {
				276	dest++ = word++;
				277	}
				278	*dest = 0; // NULL terminate
				279	if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
				280	return true;
				281	}
				282	return false;
				283	}
				284
				285	unsigned short
				286	UnigramDictionary::toLowerCase(unsigned short c) {
				287	if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
				288	c = BASE_CHARS[c];
				289	}
				290	if (c >='A' && c <= 'Z') {
				291	c \|= 32;
				292	} else if (c > 127) {
				293	c = latin_tolower(c);
				294	}
				295	return c;
				296	}
				297
				298	bool
				299	UnigramDictionary::sameAsTyped(unsigned short *word, int length)
				300	{
				301	if (length != mInputLength) {
				302	return false;
				303	}
				304	int *inputCodes = mInputCodes;
				305	while (length--) {
				306	if ((unsigned int) inputCodes != (unsigned int) word) {
				307	return false;
				308	}
				309	inputCodes += MAX_ALTERNATIVES;
				310	word++;
				311	}
				312	return true;
				313	}
				314
				315	static char QUOTE = '\'';
				316
				317	void
				318	UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
				319	int diffs, int skipPos, int *nextLetters, int nextLettersSize)
				320	{
				321	// Optimization: Prune out words that are too long compared to how much was typed.
				322	if (depth > maxDepth) {
				323	return;
				324	}
				325	if (diffs > mMaxEditDistance) {
				326	return;
				327	}
				328	int count = getCount(&pos);
				329	int *currentChars = NULL;
				330	if (mInputLength <= inputIndex) {
				331	completion = true;
				332	} else {
				333	currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
				334	}
				335
				336	for (int i = 0; i < count; i++) {
				337	// -- at char
				338	unsigned short c = getChar(&pos);
				339	// -- at flag/add
				340	unsigned short lowerC = toLowerCase(c);
				341	bool terminal = getTerminal(&pos);
				342	int childrenAddress = getAddress(&pos);
				343	// -- after address or flag
				344	int freq = 1;
				345	if (terminal) freq = getFreq(&pos);
				346	// -- after add or freq
				347
				348	// If we are only doing completions, no need to look at the typed characters.
				349	if (completion) {
				350	mWord[depth] = c;
				351	if (terminal) {
				352	addWord(mWord, depth + 1, freq * snr);
				353	if (depth >= mInputLength && skipPos < 0) {
				354	registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
				355	}
				356	}
				357	if (childrenAddress != 0) {
				358	getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
				359	diffs, skipPos, nextLetters, nextLettersSize);
				360	}
				361	} else if ((c == QUOTE && currentChars[0] != QUOTE) \|\| skipPos == depth) {
				362	// Skip the ' or other letter and continue deeper
				363	mWord[depth] = c;
				364	if (childrenAddress != 0) {
				365	getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
				366	skipPos, nextLetters, nextLettersSize);
				367	}
				368	} else {
				369	int j = 0;
				370	while (currentChars[j] > 0) {
				371	if (currentChars[j] == lowerC \|\| currentChars[j] == c) {
				372	int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
				373	mWord[depth] = c;
				374	if (mInputLength == inputIndex + 1) {
				375	if (terminal) {
				376	if (//INCLUDE_TYPED_WORD_IF_VALID \|\|
				377	!sameAsTyped(mWord, depth + 1)) {
				378	int finalFreq = freq * snr * addedWeight;
				379	if (skipPos < 0) finalFreq *= mFullWordMultiplier;
				380	addWord(mWord, depth + 1, finalFreq);
				381	}
				382	}
				383	if (childrenAddress != 0) {
				384	getWordsRec(childrenAddress, depth + 1,
				385	maxDepth, true, snr * addedWeight, inputIndex + 1,
				386	diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
				387	}
				388	} else if (childrenAddress != 0) {
				389	getWordsRec(childrenAddress, depth + 1, maxDepth,
				390	false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
				391	skipPos, nextLetters, nextLettersSize);
				392	}
				393	}
				394	j++;
				395	if (skipPos >= 0) break;
				396	}
				397	}
				398	}
				399	}
				400
				401	int
				402	UnigramDictionary::getBigramAddress(int *pos, bool advance)
				403	{
				404	int address = 0;
				405
				406	address += (mDict[*pos] & 0x3F) << 16;
				407	address += (mDict[*pos + 1] & 0xFF) << 8;
				408	address += (mDict[*pos + 2] & 0xFF);
				409
				410	if (advance) {
				411	*pos += 3;
				412	}
				413
				414	return address;
				415	}
				416
				417	int
				418	UnigramDictionary::getBigramFreq(int *pos)
				419	{
				420	int freq = mDict[(*pos)++] & FLAG_BIGRAM_FREQ;
				421
				422	return freq;
				423	}
				424
				425
				426	int
				427	UnigramDictionary::getBigrams(unsigned short prevWord, int prevWordLength, int codes, int codesSize,
				428	unsigned short bigramChars, int bigramFreq, int maxWordLength, int maxBigrams,
				429	int maxAlternatives)
				430	{
				431	mBigramFreq = bigramFreq;
				432	mBigramChars = bigramChars;
				433	mInputCodes = codes;
				434	mInputLength = codesSize;
				435	mMaxBigrams = maxBigrams;
				436
				437	if (mBigram == 1 && checkIfDictVersionIsLatest()) {
				438	int pos = isValidWordRec(
				439	DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
				440	LOGI("Pos -> %d\n", pos);
				441	if (pos < 0) {
				442	return 0;
				443	}
				444
				445	int bigramCount = 0;
				446	int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
				447	if (bigramExist > 0) {
				448	int nextBigramExist = 1;
				449	while (nextBigramExist > 0 && bigramCount < maxBigrams) {
				450	int bigramAddress = getBigramAddress(&pos, true);
				451	int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
				452	// search for all bigrams and store them
				453	searchForTerminalNode(bigramAddress, frequency);
				454	nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
				455	bigramCount++;
				456	}
				457	}
				458
				459	return bigramCount;
				460	}
				461	return 0;
				462	}
				463
				464	void
				465	UnigramDictionary::searchForTerminalNode(int addressLookingFor, int frequency)
				466	{
				467	// track word with such address and store it in an array
				468	unsigned short word[MAX_WORD_LENGTH];
				469
				470	int pos;
				471	int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
				472	bool found = false;
				473	char followingChar = ' ';
				474	int depth = -1;
				475
				476	while(!found) {
				477	bool followDownAddressSearchStop = false;
				478	bool firstAddress = true;
				479	bool haveToSearchAll = true;
				480
				481	if (depth >= 0) {
				482	word[depth] = (unsigned short) followingChar;
				483	}
				484	pos = followDownBranchAddress; // pos start at count
				485	int count = mDict[pos] & 0xFF;
				486	LOGI("count - %d\n",count);
				487	pos++;
				488	for (int i = 0; i < count; i++) {
				489	// pos at data
				490	pos++;
				491	// pos now at flag
				492	if (!getFirstBitOfByte(&pos)) { // non-terminal
				493	if (!followDownAddressSearchStop) {
				494	int addr = getBigramAddress(&pos, false);
				495	if (addr > addressLookingFor) {
				496	followDownAddressSearchStop = true;
				497	if (firstAddress) {
				498	firstAddress = false;
				499	haveToSearchAll = true;
				500	} else if (!haveToSearchAll) {
				501	break;
				502	}
				503	} else {
				504	followDownBranchAddress = addr;
				505	followingChar = (char)(0xFF & mDict[pos-1]);
				506	if (firstAddress) {
				507	firstAddress = false;
				508	haveToSearchAll = false;
				509	}
				510	}
				511	}
				512	pos += 3;
				513	} else if (getFirstBitOfByte(&pos)) { // terminal
				514	if (addressLookingFor == (pos-1)) { // found !!
				515	depth++;
				516	word[depth] = (0xFF & mDict[pos-1]);
				517	found = true;
				518	break;
				519	}
				520	if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
				521	if (!followDownAddressSearchStop) {
				522	int addr = getBigramAddress(&pos, false);
				523	if (addr > addressLookingFor) {
				524	followDownAddressSearchStop = true;
				525	if (firstAddress) {
				526	firstAddress = false;
				527	haveToSearchAll = true;
				528	} else if (!haveToSearchAll) {
				529	break;
				530	}
				531	} else {
				532	followDownBranchAddress = addr;
				533	followingChar = (char)(0xFF & mDict[pos-1]);
				534	if (firstAddress) {
				535	firstAddress = false;
				536	haveToSearchAll = true;
				537	}
				538	}
				539	}
				540	pos += 4;
				541	} else { // freq only (2 byte)
				542	pos += 2;
				543	}
				544
				545	// skipping bigram
				546	int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
				547	if (bigramExist > 0) {
				548	int nextBigramExist = 1;
				549	while (nextBigramExist > 0) {
				550	pos += 3;
				551	nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
				552	}
				553	} else {
				554	pos++;
				555	}
				556	}
				557	}
				558	depth++;
				559	if (followDownBranchAddress == 0) {
				560	LOGI("ERROR!!! Cannot find bigram!!");
				561	break;
				562	}
				563	}
				564	if (checkFirstCharacter(word)) {
				565	addWordBigram(word, depth, frequency);
				566	}
				567	}
				568
				569	bool
				570	UnigramDictionary::checkFirstCharacter(unsigned short *word)
				571	{
				572	// Checks whether this word starts with same character or neighboring characters of
				573	// what user typed.
				574
				575	int *inputCodes = mInputCodes;
				576	int maxAlt = MAX_ALTERNATIVES;
				577	while (maxAlt > 0) {
				578	if ((unsigned int) inputCodes == (unsigned int) word) {
				579	return true;
				580	}
				581	inputCodes++;
				582	maxAlt--;
				583	}
				584	return false;
				585	}
				586
				587	// TODO: Move to parent dictionary
				588	bool
				589	UnigramDictionary::isValidWord(unsigned short *word, int length)
				590	{
				591	if (checkIfDictVersionIsLatest()) {
				592	return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
				593	} else {
				594	return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD);
				595	}
				596	}
				597
				598	int
				599	UnigramDictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
				600	// returns address of bigram data of that word
				601	// return -99 if not found
				602
				603	int count = getCount(&pos);
				604	unsigned short currentChar = (unsigned short) word[offset];
				605	for (int j = 0; j < count; j++) {
				606	unsigned short c = getChar(&pos);
				607	int terminal = getTerminal(&pos);
				608	int childPos = getAddress(&pos);
				609	if (c == currentChar) {
				610	if (offset == length - 1) {
				611	if (terminal) {
				612	return (pos+1);
				613	}
				614	} else {
				615	if (childPos != 0) {
				616	int t = isValidWordRec(childPos, word, offset + 1, length);
				617	if (t > 0) {
				618	return t;
				619	}
				620	}
				621	}
				622	}
				623	if (terminal) {
				624	getFreq(&pos);
				625	}
				626	// There could be two instances of each alphabet - upper and lower case. So continue
				627	// looking ...
				628	}
				629	return NOT_VALID_WORD;
				630	}
				631	} // namespace latinime