Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

*/

#include <stdio.h>

#include <fcntl.h>

#include <sys/mman.h>

21

#include <string.h>

22

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

32

// Constructs a unigram dictionary.
//
// Each argument is stored in the like-named member through the initializer list; the
// constructor body only emits an informational log line.
//
//   dict                  -> DICT
//   typedLetterMultiplier -> TYPED_LETTER_MULTIPLIER
//   fullWordMultiplier    -> FULL_WORD_MULTIPLIER
//   maxWordLength         -> MAX_WORD_LENGTH
//   maxWords              -> MAX_WORDS
//   maxAlternatives       -> MAX_ALTERNATIVES
//   isLatestDictVersion   -> IS_LATEST_DICT_VERSION
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives,
        const bool isLatestDictVersion)
    : DICT(dict),
      MAX_WORD_LENGTH(maxWordLength),
      MAX_WORDS(maxWords),
      MAX_ALTERNATIVES(maxAlternatives),
      IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier),
      FULL_WORD_MULTIPLIER(fullWordMultiplier) {
    LOGI("UnigramDictionary - constructor");
}

40

satok

2010-12-02 18:11:54 +0900

[diff] [blame]

41

// Destructor. The body is empty: no cleanup is performed here.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

42

satok

2010-12-02 18:11:54 +0900

[diff] [blame]

43

int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,

44

int *frequencies, int *nextLetters, int nextLettersSize)

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

45

{

46

47

initSuggestions(codes, codesSize, outWords, frequencies);

48

49

int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,

50

nextLettersSize);

51

52

// If there aren't sufficient suggestions, search for words by allowing wild cards at

53

// the different character positions. This feature is not ready for prime-time as we need

54

// to figure out the best ranking for such words compared to proximity corrections and

55

// completions.

56

if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {

57

for (int i = 0; i < codesSize; ++i) {

58

int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);

59

if (tempCount > suggestedWordsCount) {

60

suggestedWordsCount = tempCount;

break;

}

}

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

68

LOGI("Next letters: ");

69

for (int k = 0; k < nextLettersSize; k++) {

70

if (nextLetters[k] > 0) {

71

LOGI("%c = %d,", k, nextLetters[k]);

}

}

LOGI("\n");

}

return suggestedWordsCount;

77

}

78

79

void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,

80

int *frequencies) {

81

mFrequencies = frequencies;

82

mOutputChars = outWords;

83

mInputCodes = codes;

84

mInputLength = codesSize;

85

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

86

}

87

88

int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,

89

int *nextLetters, int nextLettersSize) {

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

90

if (IS_LATEST_DICT_VERSION) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

91

getWordsRec(DICTIONARY_HEADER_SIZE, 0, inputLength * 3, false, 1, 0, 0, skipPos,

92

nextLetters, nextLettersSize);

93

} else {

94

getWordsRec(0, 0, inputLength * 3, false, 1, 0, 0, skipPos, nextLetters, nextLettersSize);

95

}

96

97

// Get the word count

98

int suggestedWordsCount = 0;

99

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

100

suggestedWordsCount++;

101

}

102

return suggestedWordsCount;

103

}

104

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

105

// Increments the counter for character c in nextLetters. Characters whose code is not
// below nextLettersSize are ignored, so a size of 0 turns the call into a no-op.
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    nextLetters[c] += 1;
}

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

112

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word[length] = 0;

if (DEBUG_DICT) {

char s[length + 1];

for (int i = 0; i <= length; i++) s[i] = word[i];

117

LOGI("Found word = %s, freq = %d : \n", s, frequency);

118

}

119

120

// Find the right insertion point

121

int insertAt = 0;

122

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

123

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

124

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

130

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

131

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

132

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

133

mFrequencies[insertAt] = frequency;

134

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

135

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

136

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

137

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

142

if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);

return true;

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

148

unsigned short UnigramDictionary::toLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

149

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

150

c = BASE_CHARS[c];

151

}

152

if (c >='A' && c <= 'Z') {

153

c |= 32;

154

} else if (c > 127) {

155

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

160

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

161

if (length != mInputLength) {

162

return false;

163

}

164

int *inputCodes = mInputCodes;

165

while (length--) {

166

if ((unsigned int) *inputCodes != (unsigned int) *word) {

167

return false;

168

}

169

inputCodes += MAX_ALTERNATIVES;

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

175

// The apostrophe (single quote) character. NOTE(review): no use of this constant is
// visible in this part of the file - confirm where it is referenced.
static const char QUOTE = '\'';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

176

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

177

// snr : frequency?

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

178

void UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool traverseAllNodes,

179

int snr, int inputIndex, int diffs, int skipPos, int *nextLetters, int nextLettersSize) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

180

// Optimization: Prune out words that are too long compared to how much was typed.

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

181

if (depth > maxDepth || diffs > mMaxEditDistance) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

182

return;

183

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

184

// get the count of nodes and increment pos.

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

185

int count = Dictionary::getCount(DICT, &pos);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

186

int *currentChars = NULL;

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

187

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

188

if (mInputLength <= inputIndex) {

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

189

traverseAllNodes = true;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

190

} else {

191

currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);

192

}

193

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

194

for (int i = 0; i < count; ++i) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

195

// -- at char

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

196

const unsigned short c = Dictionary::getChar(DICT, &pos);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

197

// -- at flag/add

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

198

const unsigned short lowerC = toLowerCase(c);

199

const bool terminal = Dictionary::getTerminal(DICT, &pos);

200

const int childrenAddress = Dictionary::getAddress(DICT, &pos);

201

int matchedProximityCharId = -1;

202

const bool needsToTraverseNextNode = childrenAddress != 0;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

203

// -- after address or flag

204

int freq = 1;

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

205

// If terminal, increment pos

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

206

if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &pos);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

207

// -- after add or freq

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

208

bool newTraverseAllNodes = traverseAllNodes;

209

int newSnr = snr;

210

int newDiffs = diffs;

211

int newInputIndex = inputIndex;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

212

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

213

// If we are only doing traverseAllNodes, no need to look at the typed characters.

214

if (traverseAllNodes || needsToSkipCurrentNode(c, currentChars[0], skipPos, depth)) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

215

mWord[depth] = c;

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

216

if (traverseAllNodes && terminal) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

217

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,

218

snr, nextLetters, nextLettersSize, skipPos, freq);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

219

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

220

} else {

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

221

matchedProximityCharId = getMatchedProximityId(currentChars, lowerC, c, skipPos);

222

if (matchedProximityCharId < 0) continue;

223

mWord[depth] = c;

224

// If inputIndex is greater than mInputLength, that means there is no

225

// proximity chars. So, we don't need to check proximity.

226

const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;

227

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;

228

if (isSameAsUserTypedLength && terminal) {

229

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,

230

skipPos, freq, addedWeight);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

231

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame^]

232

if (!needsToTraverseNextNode) continue;

233

// Start traversing all nodes after the index exceeds the user typed length

234

newTraverseAllNodes = isSameAsUserTypedLength;

235

newSnr *= addedWeight;

236

newDiffs += (matchedProximityCharId > 0);

237

++newInputIndex;

238

}

239

if (needsToTraverseNextNode) {

240

getWordsRec(childrenAddress, depth + 1, maxDepth, newTraverseAllNodes,

241

newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

246

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

247

unsigned short *word, const int inputLength, const int depth, const int snr,

248

int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {

249

addWord(word, depth + 1, freq * snr);

250

if (depth >= inputLength && skipPos < 0) {

251

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

256

unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,

257

const int addedWeight) {

258

if (!sameAsTyped(word, depth + 1)) {

259

int finalFreq = freq * snr * addedWeight;

260

// Proximity collection will promote a word of the same length as

261

// what user typed.

262

if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;

263

addWord(word, depth + 1, finalFreq);

264

}

265

}

satok