Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

*/

#include <stdio.h>

#include <fcntl.h>

#include <sys/mman.h>

21

#include <string.h>

22

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

32

/**
 * Constructs a unigram dictionary over the binary dictionary data `dict`.
 *
 * Each argument initialises the constant member of the matching name:
 * the dictionary data pointer, the scoring multipliers, the output buffer
 * geometry (max word length / max word count), the number of alternative
 * codes per typed position, and the dictionary-format flag.
 */
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives,
        const bool isLatestDictVersion)
    : DICT(dict),
      MAX_WORD_LENGTH(maxWordLength),
      MAX_WORDS(maxWords),
      MAX_ALTERNATIVES(maxAlternatives),
      IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier),
      FULL_WORD_MULTIPLIER(fullWordMultiplier) {
    LOGI("UnigramDictionary - constructor");
}

40

satok

2010-12-02 18:11:54 +0900

[diff] [blame]

41

// The destructor body is empty: it performs no work of its own.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

42

satok

2010-12-02 18:11:54 +0900

[diff] [blame]

43

int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,

44

int *frequencies, int *nextLetters, int nextLettersSize)

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

45

{

46

47

initSuggestions(codes, codesSize, outWords, frequencies);

48

49

int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,

50

nextLettersSize);

51

52

// If there aren't sufficient suggestions, search for words by allowing wild cards at

53

// the different character positions. This feature is not ready for prime-time as we need

54

// to figure out the best ranking for such words compared to proximity corrections and

55

// completions.

56

if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {

57

for (int i = 0; i < codesSize; ++i) {

58

int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);

59

if (tempCount > suggestedWordsCount) {

60

suggestedWordsCount = tempCount;

break;

}

}

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

68

LOGI("Next letters: ");

69

for (int k = 0; k < nextLettersSize; k++) {

70

if (nextLetters[k] > 0) {

71

LOGI("%c = %d,", k, nextLetters[k]);

}

}

LOGI("\n");

}

return suggestedWordsCount;

77

}

78

79

void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,

80

int *frequencies) {

81

mFrequencies = frequencies;

82

mOutputChars = outWords;

83

mInputCodes = codes;

84

mInputLength = codesSize;

85

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

86

}

87

88

int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,

89

int *nextLetters, int nextLettersSize) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

90

int initialPos = 0;

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

91

if (IS_LATEST_DICT_VERSION) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

92

initialPos = DICTIONARY_HEADER_SIZE;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

93

}

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

94

getWords(initialPos, inputLength, skipPos, nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

95

96

// Get the word count

97

int suggestedWordsCount = 0;

98

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

99

suggestedWordsCount++;

100

}

101

return suggestedWordsCount;

102

}

103

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

104

/**
 * Increments the counter for character `c` in `nextLetters`.
 * Characters whose code is not below `nextLettersSize` are ignored, so the
 * table is never indexed out of range (and is never touched when the size is 0).
 */
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

111

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word[length] = 0;

if (DEBUG_DICT) {

char s[length + 1];

for (int i = 0; i <= length; i++) s[i] = word[i];

116

LOGI("Found word = %s, freq = %d : \n", s, frequency);

117

}

118

119

// Find the right insertion point

120

int insertAt = 0;

121

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

122

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

123

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

129

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

130

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

131

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

132

mFrequencies[insertAt] = frequency;

133

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

134

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

135

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

136

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

141

if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);

return true;

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

147

unsigned short UnigramDictionary::toLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

148

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

149

c = BASE_CHARS[c];

150

}

151

if (c >='A' && c <= 'Z') {

152

c |= 32;

153

} else if (c > 127) {

154

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

159

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

160

if (length != mInputLength) {

161

return false;

162

}

163

int *inputCodes = mInputCodes;

164

while (length--) {

165

if ((unsigned int) *inputCodes != (unsigned int) *word) {

166

return false;

167

}

168

inputCodes += MAX_ALTERNATIVES;

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

174

// The apostrophe character, compared against dictionary and typed characters
// in needsToSkipCurrentNode().
static const char QUOTE = '\'';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

175

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

176

void UnigramDictionary::getWords(const int initialPos, const int inputLength, const int skipPos,

177

int *nextLetters, const int nextLettersSize) {

178

int initialPosition = initialPos;

179

const int count = Dictionary::getCount(DICT, &initialPosition);

180

getWordsRec(count, initialPosition, 0, inputLength * MAX_DEPTH_MULTIPLIER,

181

mInputLength <= 0, 1, 0, 0, skipPos, nextLetters, nextLettersSize);

182

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

183

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

184

// snr : frequency?

185

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

186

const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,

187

const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize) {

188

int position = pos;

189

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

190

for (int i = 0; i < childrenCount; ++i) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

191

// -- at char

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

192

const unsigned short c = Dictionary::getChar(DICT, &position);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

193

// -- at flag/add

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

194

const unsigned short lowerC = toLowerCase(c);

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

195

const bool terminal = Dictionary::getTerminal(DICT, &position);

196

int childrenPosition = Dictionary::getAddress(DICT, &position);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

197

int matchedProximityCharId = -1;

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

198

const bool needsToTraverseNextNode = childrenPosition != 0;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

199

// -- after address or flag

200

int freq = 1;

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

201

// If terminal, increment pos

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

202

if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &position);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

203

// -- after add or freq

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

204

bool newTraverseAllNodes = traverseAllNodes;

205

int newSnr = snr;

206

int newDiffs = diffs;

207

int newInputIndex = inputIndex;

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

208

const int newDepth = depth + 1;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

209

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

210

// If we are only doing traverseAllNodes, no need to look at the typed characters.

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

211

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

212

mWord[depth] = c;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

213

if (traverseAllNodes && terminal) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

214

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,

215

snr, nextLetters, nextLettersSize, skipPos, freq);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

216

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

217

} else {

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

218

int *currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

219

matchedProximityCharId = getMatchedProximityId(currentChars, lowerC, c, skipPos);

220

if (matchedProximityCharId < 0) continue;

221

mWord[depth] = c;

222

// If inputIndex is greater than mInputLength, that means there is no

223

// proximity chars. So, we don't need to check proximity.

224

const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;

225

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;

226

if (isSameAsUserTypedLength && terminal) {

227

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,

228

skipPos, freq, addedWeight);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

229

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

230

if (!needsToTraverseNextNode) continue;

231

// Start traversing all nodes after the index exceeds the user typed length

232

newTraverseAllNodes = isSameAsUserTypedLength;

233

newSnr *= addedWeight;

234

newDiffs += (matchedProximityCharId > 0);

235

++newInputIndex;

236

}

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

237

// Optimization: Prune out words that are too long compared to how much was typed.

238

if (newDepth > maxDepth || newDiffs > mMaxEditDistance) {

239

continue;

240

}

241

if (mInputLength <= newInputIndex) {

242

newTraverseAllNodes = true;

243

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

244

if (needsToTraverseNextNode) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

245

// get the count of nodes and increment childAddress.

246

const int count = Dictionary::getCount(DICT, &childrenPosition);

247

getWordsRec(count, childrenPosition, newDepth, maxDepth, newTraverseAllNodes,

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

248

newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

253

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

254

unsigned short *word, const int inputLength, const int depth, const int snr,

255

int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {

256

addWord(word, depth + 1, freq * snr);

257

if (depth >= inputLength && skipPos < 0) {

258

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

263

unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,

264

const int addedWeight) {

265

if (!sameAsTyped(word, depth + 1)) {

266

int finalFreq = freq * snr * addedWeight;

267

// Proximity collection will promote a word of the same length as

268

// what user typed.

269

if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;

270

addWord(word, depth + 1, finalFreq);

271

}

272

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

273

274

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

275

const int inputIndex, const int skipPos, const int depth) {

276

const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_ALTERNATIVES))[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

277

// Skip the ' or other letter and continue deeper

278

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

279

}

280

281

inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,

282

const unsigned short lowerC, const unsigned short c, const int skipPos) {

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

283

int j = 0;

284

while (currentChars[j] > 0) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame^]

285

const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);

satok