Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <fcntl.h>

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

20

#include <stdio.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

21

#include <string.h>

22

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

32

/**
 * Stores the dictionary pointer and the tuning limits supplied by the caller.
 *
 * Nothing is read from |dict| here. ROOT_POS becomes DICTIONARY_HEADER_SIZE when
 * isLatestDictVersion is true and 0 otherwise.
 */
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
        const bool isLatestDictVersion)
    : DICT(dict),
      MAX_WORD_LENGTH(maxWordLength),
      MAX_WORDS(maxWords),
      MAX_PROXIMITY_CHARS(maxProximityChars),
      IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier),
      FULL_WORD_MULTIPLIER(fullWordMultiplier),
      ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
    LOGI("UnigramDictionary - constructor");
}

41

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

42

// The destructor body is intentionally empty; no cleanup is performed here.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

43

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

44

int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,

45

int *frequencies, int *nextLetters, int nextLettersSize)

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

46

{

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

47

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

48

getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

49

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

50

// Suggestion with missing character

51

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

52

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

53

if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

54

getSuggestionCandidates(codesSize, i, -1, NULL, 0);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

58

// Suggestion with excessive character

59

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

60

for (int i = 0; i < codesSize; ++i) {

61

if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

62

getSuggestionCandidates(codesSize, -1, i, NULL, 0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

66

// Suggestions with missing space

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

67

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

68

for (int i = 1; i < codesSize; ++i) {

69

if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);

70

getMissingSpaceWords(mInputLength, i);

}

}

// Get the word count

75

int suggestedWordsCount = 0;

76

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

77

suggestedWordsCount++;

78

}

79

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

80

if (DEBUG_DICT) {

81

LOGI("Returning %d words", suggestedWordsCount);

82

LOGI("Next letters: ");

83

for (int k = 0; k < nextLettersSize; k++) {

84

if (nextLetters[k] > 0) {

85

LOGI("%c = %d,", k, nextLetters[k]);

}

}

LOGI("\n");

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

90

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

91

return suggestedWordsCount;

92

}

93

94

void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,

95

int *frequencies) {

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

96

if (DEBUG_DICT) LOGI("initSuggest");

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

97

mFrequencies = frequencies;

98

mOutputChars = outWords;

99

mInputCodes = codes;

100

mInputLength = codesSize;

101

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

102

}

103

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

104

/**
 * Bumps the counter for character |c| in |nextLetters|.
 * Characters at or beyond |nextLettersSize| are ignored, so a size of 0 makes this a no-op.
 */
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

111

// TODO: We need to optimize addWord by using STL or something

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

112

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

113

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

114

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

115

char s[length + 1];

116

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

117

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

118

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

119

if (length > MAX_WORD_LENGTH) {

120

if (DEBUG_DICT) LOGI("Exceeded max word length.");

121

return false;

122

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

123

124

// Find the right insertion point

125

int insertAt = 0;

126

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

127

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

128

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

134

if (DEBUG_DICT) {

135

char s[length + 1];

136

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

137

LOGI("Added word = %s, freq = %d", s, frequency);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

138

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

139

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

140

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

141

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

142

mFrequencies[insertAt] = frequency;

143

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

144

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

145

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

146

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

151

if (DEBUG_DICT) LOGI("Added word at %d", insertAt);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

157

unsigned short UnigramDictionary::toLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

158

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

159

c = BASE_CHARS[c];

160

}

161

if (c >='A' && c <= 'Z') {

162

c |= 32;

163

} else if (c > 127) {

164

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

169

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

170

if (length != mInputLength) {

171

return false;

172

}

173

int *inputCodes = mInputCodes;

174

while (length--) {

175

if ((unsigned int) *inputCodes != (unsigned int) *word) {

176

return false;

177

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

178

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

184

// Apostrophe. needsToSkipCurrentNode() steps over a dictionary node holding this character
// when the typed character at that position is not an apostrophe.
static const char QUOTE = '\'';

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

185

// Separator that getMissingSpaceWords() places between the two words of a split suggestion.
static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

186

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

187

void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

188

const int excessivePos, int *nextLetters, const int nextLettersSize) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

189

if (DEBUG_DICT) LOGI("getSuggestionCandidates");

190

int rootPosition = ROOT_POS;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

191

const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

192

// Get the number of child of root, then increment the position

193

int childCount = Dictionary::getCount(DICT, &rootPosition);

194

int depth = 0;

195

196

mStackChildCount[0] = childCount;

197

mStackTraverseAll[0] = (mInputLength <= 0);

198

mStackNodeFreq[0] = 1;

199

mStackInputIndex[0] = 0;

200

mStackDiffs[0] = 0;

201

mStackSiblingPos[0] = rootPosition;

202

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

203

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

204

while (depth >= 0) {

205

if (mStackChildCount[depth] > 0) {

206

--mStackChildCount[depth];

207

bool traverseAllNodes = mStackTraverseAll[depth];

208

int snr = mStackNodeFreq[depth];

209

int inputIndex = mStackInputIndex[depth];

210

int diffs = mStackDiffs[depth];

211

int siblingPos = mStackSiblingPos[depth];

212

int firstChildPos;

213

// depth will never be greater than MAX_DEPTH because in that case,

214

// needsToTraverseChildrenNodes should be false

215

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

216

MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,

217

nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,

218

&snr, &inputIndex, &diffs, &siblingPos);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

219

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

220

mStackSiblingPos[depth] = siblingPos;

221

if (needsToTraverseChildrenNodes) {

222

// Goes to child node

223

++depth;

224

mStackChildCount[depth] = childCount;

225

mStackTraverseAll[depth] = traverseAllNodes;

226

mStackNodeFreq[depth] = snr;

227

mStackInputIndex[depth] = inputIndex;

228

mStackDiffs[depth] = diffs;

229

mStackSiblingPos[depth] = firstChildPos;

230

}

231

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

232

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

238

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

239

if (missingSpacePos <= 0 || missingSpacePos >= inputLength

240

|| inputLength >= MAX_WORD_LENGTH) return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

241

const int newWordLength = inputLength + 1;

242

// Allocating variable length array on stack

243

unsigned short word[newWordLength];

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

244

const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);

245

if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);

246

if (firstFreq <= 0) return false;

247

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

248

for (int i = 0; i < missingSpacePos; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

249

word[i] = mWord[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

250

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

251

252

const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);

253

if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);

254

if (secondFreq <= 0) return false;

255

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

256

word[missingSpacePos] = SPACE;

257

for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

258

word[i] = mWord[i - missingSpacePos - 1];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

259

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

260

261

int pairFreq = ((firstFreq + secondFreq) / 2);

262

for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

263

addWord(word, newWordLength, pairFreq);

return true;

}

// Keep this for comparing spec to new getWords

268

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

269

const int excessivePos, int *nextLetters, const int nextLettersSize) {

270

int initialPosition = initialPos;

271

const int count = Dictionary::getCount(DICT, &initialPosition);

272

getWordsRec(count, initialPosition, 0,

273

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

274

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);

275

}

276

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

277

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

278

const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

279

const int diffs, const int skipPos, const int excessivePos, int *nextLetters,

280

const int nextLettersSize) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

281

int siblingPos = pos;

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

282

for (int i = 0; i < childrenCount; ++i) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

283

int newCount;

284

int newChildPosition;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

285

const int newDepth = depth + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

286

bool newTraverseAllNodes;

int newSnr;

int newInputIndex;

int newDiffs;

int newSiblingPos;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

292

traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,

293

nextLettersSize,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

294

&newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

295

&newInputIndex, &newDiffs, &newSiblingPos);

296

siblingPos = newSiblingPos;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

297

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

298

if (needsToTraverseChildrenNodes) {

299

getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

300

newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,

301

nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

306

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

307

unsigned short *word, const int inputLength, const int depth, const int snr,

308

int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

309

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

310

if (depth >= inputLength && skipPos < 0) {

311

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

316

unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,

317

const int addedWeight) {

318

if (!sameAsTyped(word, depth + 1)) {

319

int finalFreq = freq * snr * addedWeight;

320

// Proximity collection will promote a word of the same length as

321

// what user typed.

322

if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

323

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

324

}

325

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

326

327

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

328

const int inputIndex, const int skipPos, const int depth) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

329

const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS))[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

330

// Skip the ' or other letter and continue deeper

331

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

332

}

333

334

inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

335

const unsigned short c, const int skipPos) {

336

const unsigned short lowerC = toLowerCase(c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

337

int j = 0;

338

while (currentChars[j] > 0) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

339

const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

340

// If skipPos is defined, not to search proximity collections.

341

// First char is what user typed.

342

if (matched) {

343

return j;

344

} else if (skipPos >= 0) {

return -1;

}

++j;

}

return -1;

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

352

inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

353

const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,

354

const int diffs, const int skipPos, const int excessivePos, int *nextLetters,

355

const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

356

int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

357

if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

unsigned short c;

int childPosition;

bool terminal;

int freq;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

362

363

if (excessivePos == depth) ++inputIndex;

364

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

365

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,

366

&childPosition, &terminal, &freq);

367

368

const bool needsToTraverseChildrenNodes = childPosition != 0;

369

370

// If we are only doing traverseAllNodes, no need to look at the typed characters.

371

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

372

mWord[depth] = c;

373

if (traverseAllNodes && terminal) {

374

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,

375

snr, nextLetters, nextLettersSize, skipPos, freq);

376

}

377

if (!needsToTraverseChildrenNodes) return false;

378

*newTraverseAllNodes = traverseAllNodes;

379

*newSnr = snr;

380

*newDiffs = diffs;

381

*newInputIndex = inputIndex;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

382

} else {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

383

int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

384

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);

385

if (matchedProximityCharId < 0) return false;

386

mWord[depth] = c;

387

// If inputIndex is greater than mInputLength, that means there is no

388

// proximity chars. So, we don't need to check proximity.

389

const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;

390

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;

391

if (isSameAsUserTypedLength && terminal) {

392

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,

393

skipPos, freq, addedWeight);

394

}

395

if (!needsToTraverseChildrenNodes) return false;

396

// Start traversing all nodes after the index exceeds the user typed length

397

*newTraverseAllNodes = isSameAsUserTypedLength;

398

*newSnr = snr * addedWeight;

399

*newDiffs = diffs + (matchedProximityCharId > 0);

400

*newInputIndex = inputIndex + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

401

}

402

// Optimization: Prune out words that are too long compared to how much was typed.

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

403

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

408

if (mInputLength <= *newInputIndex) {

409

*newTraverseAllNodes = true;

410

}

411

// get the count of nodes and increment childAddress.

412

*newCount = Dictionary::getCount(DICT, &childPosition);

413

*newChildPosition = childPosition;

414

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

415

return needsToTraverseChildrenNodes;

416

}

417

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

418

inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,

419

unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

420

int pos = ROOT_POS;

421

int count = Dictionary::getCount(DICT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

422

int maxFreq = 0;

423

int depth = 0;

424

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

425

bool terminal = false;

426

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

427

mStackChildCount[0] = count;

428

mStackSiblingPos[0] = pos;

429

430

while (depth >= 0) {

431

if (mStackChildCount[depth] > 0) {

432

--mStackChildCount[depth];

433

int firstChildPos;

434

int newFreq;

435

int siblingPos = mStackSiblingPos[depth];

436

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

437

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

438

&siblingPos);

439

mStackSiblingPos[depth] = siblingPos;

440

if (depth == (inputLength - 1)) {

441

// Traverse sibling node

442

if (terminal) {

443

if (newFreq > maxFreq) {

444

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

445

if (DEBUG_DICT && DEBUG_NODE) {

446

char s[inputLength + 1];

447

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

448

s[inputLength] = 0;

449

LOGI("New missing space word found: %d > %d (%s), %d, %d",

450

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

456

// Traverse children nodes

457

++depth;

458

mStackChildCount[depth] = count;

459

mStackSiblingPos[depth] = firstChildPos;

460

}

461

} else {

462

// Traverse parent node

463

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

464

}

465

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

466

467

word[inputLength] = 0;

468

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

469

}

470

471

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

472

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

473

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

474

const int inputIndex = startInputIndex + depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

475

const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

476

unsigned short c;

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

477

*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,

478

newChildPosition, newTerminal, newFreq);

479

const unsigned int inputC = currentChars[0];

480

if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);

481

const unsigned short lowerC = toLowerCase(c);

482

const bool matched = (inputC == lowerC || inputC == c);

483

const bool hasChild = *newChildPosition != 0;

484

if (matched) {

485

word[depth] = c;

486

if (DEBUG_DICT && DEBUG_NODE) {

487

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

488

if (*newTerminal) LOGI("Terminal %d", *newFreq);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

489

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

490

if (hasChild) {

491

*newCount = Dictionary::getCount(DICT, newChildPosition);

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

498

// Thus newTerminal shouldn't be true.

499

*newTerminal = false;

500

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

501

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

502

}

satok