Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <fcntl.h>

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

20

#include <stdio.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

21

#include <string.h>

22

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

32

// Wraps the raw dictionary bytes `dict` and records the tuning parameters used
// while searching it.
//
// ROOT_POS is DICTIONARY_HEADER_SIZE when `isLatestDictVersion` is true and 0
// otherwise; every other argument is stored verbatim in the matching constant
// member.
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
        const bool isLatestDictVersion)
    : DICT(dict),
      MAX_WORD_LENGTH(maxWordLength),
      MAX_WORDS(maxWords),
      MAX_PROXIMITY_CHARS(maxProximityChars),
      IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier),
      FULL_WORD_MULTIPLIER(fullWordMultiplier),
      ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
    LOGI("UnigramDictionary - constructor");
}

41

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

42

// The destructor performs no cleanup.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

43

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

44

int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,

45

int *frequencies, int *nextLetters, int nextLettersSize)

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

46

{

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

47

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

48

getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

49

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

50

// Suggestion with missing character

51

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

52

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

53

if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

54

getSuggestionCandidates(codesSize, i, -1, NULL, 0);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

58

// Suggestion with excessive character

59

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

60

for (int i = 0; i < codesSize; ++i) {

61

if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

62

getSuggestionCandidates(codesSize, -1, i, NULL, 0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

66

// Suggestions with missing space

67

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER) {

68

for (int i = 1; i < codesSize; ++i) {

69

if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);

70

getMissingSpaceWords(mInputLength, i);

}

}

// Get the word count

75

int suggestedWordsCount = 0;

76

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

77

suggestedWordsCount++;

78

}

79

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

80

if (DEBUG_DICT) {

81

LOGI("Returning %d words", suggestedWordsCount);

82

LOGI("Next letters: ");

83

for (int k = 0; k < nextLettersSize; k++) {

84

if (nextLetters[k] > 0) {

85

LOGI("%c = %d,", k, nextLetters[k]);

}

}

LOGI("\n");

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

90

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

91

return suggestedWordsCount;

92

}

93

94

void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,

95

int *frequencies) {

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

96

if (DEBUG_DICT) LOGI("initSuggest");

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

97

mFrequencies = frequencies;

98

mOutputChars = outWords;

99

mInputCodes = codes;

100

mInputLength = codesSize;

101

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

102

}

103

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

104

// Bumps the histogram counter for character `c`. Characters whose code is not
// below `nextLettersSize` are ignored, so a size of 0 makes this a no-op.
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

111

// TODO: We need to optimize addWord by using STL or something

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

112

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

113

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

114

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

115

char s[length + 1];

116

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

117

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

118

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

119

if (length > MAX_WORD_LENGTH) {

120

if (DEBUG_DICT) LOGI("Exceeded max word length.");

121

return false;

122

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

123

124

// Find the right insertion point

125

int insertAt = 0;

126

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

127

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

128

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

134

if (DEBUG_DICT) {

135

char s[length + 1];

136

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

137

LOGI("Added word = %s, freq = %d", s, frequency);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

138

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

139

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

140

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

141

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

142

mFrequencies[insertAt] = frequency;

143

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

144

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

145

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

146

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

151

if (DEBUG_DICT) LOGI("Added word at %d", insertAt);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

157

unsigned short UnigramDictionary::toLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

158

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

159

c = BASE_CHARS[c];

160

}

161

if (c >='A' && c <= 'Z') {

162

c |= 32;

163

} else if (c > 127) {

164

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

169

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

170

if (length != mInputLength) {

171

return false;

172

}

173

int *inputCodes = mInputCodes;

174

while (length--) {

175

if ((unsigned int) *inputCodes != (unsigned int) *word) {

176

return false;

177

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

178

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

184

// Apostrophe; dictionary nodes holding it may be skipped when the user did not type one.
static const char QUOTE = '\'';
// Separator inserted between the two halves of a missing-space suggestion.
static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

186

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

187

void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

188

const int excessivePos, int *nextLetters, const int nextLettersSize) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

189

if (DEBUG_DICT) LOGI("getSuggestionCandidates");

190

int rootPosition = ROOT_POS;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

191

const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

192

// Get the number of child of root, then increment the position

193

int childCount = Dictionary::getCount(DICT, &rootPosition);

194

int depth = 0;

195

196

mStackChildCount[0] = childCount;

197

mStackTraverseAll[0] = (mInputLength <= 0);

198

mStackNodeFreq[0] = 1;

199

mStackInputIndex[0] = 0;

200

mStackDiffs[0] = 0;

201

mStackSiblingPos[0] = rootPosition;

202

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

203

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

204

while (depth >= 0) {

205

if (mStackChildCount[depth] > 0) {

206

--mStackChildCount[depth];

207

bool traverseAllNodes = mStackTraverseAll[depth];

208

int snr = mStackNodeFreq[depth];

209

int inputIndex = mStackInputIndex[depth];

210

int diffs = mStackDiffs[depth];

211

int siblingPos = mStackSiblingPos[depth];

212

int firstChildPos;

213

// depth will never be greater than MAX_DEPTH because in that case,

214

// needsToTraverseChildrenNodes should be false

215

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

216

MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,

217

nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,

218

&snr, &inputIndex, &diffs, &siblingPos);

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

219

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

220

mStackSiblingPos[depth] = siblingPos;

221

if (needsToTraverseChildrenNodes) {

222

// Goes to child node

223

++depth;

224

mStackChildCount[depth] = childCount;

225

mStackTraverseAll[depth] = traverseAllNodes;

226

mStackNodeFreq[depth] = snr;

227

mStackInputIndex[depth] = inputIndex;

228

mStackDiffs[depth] = diffs;

229

mStackSiblingPos[depth] = firstChildPos;

230

}

231

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

232

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

238

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

239

if (missingSpacePos <= 0 || missingSpacePos >= inputLength) return false;

240

const int firstFreq = getWordFreq(0, missingSpacePos);

241

const int secondFreq = getWordFreq(missingSpacePos, inputLength - missingSpacePos);

242

if (DEBUG_DICT) LOGI("First freq: %d, Second freq: %d", firstFreq, secondFreq);

243

244

if (firstFreq <= 0 || secondFreq <= 0) return false;

245

int pairFreq = (firstFreq + secondFreq) / 2;

246

for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;

247

const int newWordLength = inputLength + 1;

248

// Allocating variable length array on stack

249

unsigned short word[newWordLength];

250

int j = 0;

251

for (int i = 0; i < missingSpacePos; ++i) {

252

// Down-casting

253

if (DEBUG_DICT) {

254

assert((*(mInputCodes + i * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);

255

}

256

word[i] = (unsigned short) *(mInputCodes + i * MAX_PROXIMITY_CHARS);

257

}

258

word[missingSpacePos] = SPACE;

259

for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {

260

// Down-casting

261

if (DEBUG_DICT) {

262

assert((*(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);

263

}

264

word[i] = (unsigned short) *(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS);

265

}

266

addWord(word, newWordLength, pairFreq);

return true;

}

// Keep this for comparing spec to new getWords

271

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

272

const int excessivePos, int *nextLetters, const int nextLettersSize) {

273

int initialPosition = initialPos;

274

const int count = Dictionary::getCount(DICT, &initialPosition);

275

getWordsRec(count, initialPosition, 0,

276

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

277

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);

278

}

279

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

280

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

281

const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

282

const int diffs, const int skipPos, const int excessivePos, int *nextLetters,

283

const int nextLettersSize) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

284

int siblingPos = pos;

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

285

for (int i = 0; i < childrenCount; ++i) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

286

int newCount;

287

int newChildPosition;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

288

const int newDepth = depth + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

289

bool newTraverseAllNodes;

int newSnr;

int newInputIndex;

int newDiffs;

int newSiblingPos;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

295

traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,

296

nextLettersSize,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

297

&newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

298

&newInputIndex, &newDiffs, &newSiblingPos);

299

siblingPos = newSiblingPos;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

300

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

301

if (needsToTraverseChildrenNodes) {

302

getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

303

newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,

304

nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

309

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

310

unsigned short *word, const int inputLength, const int depth, const int snr,

311

int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

312

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

313

if (depth >= inputLength && skipPos < 0) {

314

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

319

unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,

320

const int addedWeight) {

321

if (!sameAsTyped(word, depth + 1)) {

322

int finalFreq = freq * snr * addedWeight;

323

// Proximity collection will promote a word of the same length as

324

// what user typed.

325

if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

326

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

327

}

328

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

329

330

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

331

const int inputIndex, const int skipPos, const int depth) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

332

const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS))[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

333

// Skip the ' or other letter and continue deeper

334

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

335

}

336

337

inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

338

const unsigned short c, const int skipPos) {

339

const unsigned short lowerC = toLowerCase(c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

340

int j = 0;

341

while (currentChars[j] > 0) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

342

const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

343

// If skipPos is defined, not to search proximity collections.

344

// First char is what user typed.

345

if (matched) {

346

return j;

347

} else if (skipPos >= 0) {

return -1;

}

++j;

}

return -1;

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

355

inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

356

const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,

357

const int diffs, const int skipPos, const int excessivePos, int *nextLetters,

358

const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

359

int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

360

if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

unsigned short c;

int childPosition;

bool terminal;

int freq;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

365

366

if (excessivePos == depth) ++inputIndex;

367

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

368

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,

369

&childPosition, &terminal, &freq);

370

371

const bool needsToTraverseChildrenNodes = childPosition != 0;

372

373

// If we are only doing traverseAllNodes, no need to look at the typed characters.

374

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

375

mWord[depth] = c;

376

if (traverseAllNodes && terminal) {

377

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,

378

snr, nextLetters, nextLettersSize, skipPos, freq);

379

}

380

if (!needsToTraverseChildrenNodes) return false;

381

*newTraverseAllNodes = traverseAllNodes;

382

*newSnr = snr;

383

*newDiffs = diffs;

384

*newInputIndex = inputIndex;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

385

} else {

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

386

int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

387

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);

388

if (matchedProximityCharId < 0) return false;

389

mWord[depth] = c;

390

// If inputIndex is greater than mInputLength, that means there is no

391

// proximity chars. So, we don't need to check proximity.

392

const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;

393

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;

394

if (isSameAsUserTypedLength && terminal) {

395

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,

396

skipPos, freq, addedWeight);

397

}

398

if (!needsToTraverseChildrenNodes) return false;

399

// Start traversing all nodes after the index exceeds the user typed length

400

*newTraverseAllNodes = isSameAsUserTypedLength;

401

*newSnr = snr * addedWeight;

402

*newDiffs = diffs + (matchedProximityCharId > 0);

403

*newInputIndex = inputIndex + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

404

}

405

// Optimization: Prune out words that are too long compared to how much was typed.

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

406

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

411

if (mInputLength <= *newInputIndex) {

412

*newTraverseAllNodes = true;

413

}

414

// get the count of nodes and increment childAddress.

415

*newCount = Dictionary::getCount(DICT, &childPosition);

416

*newChildPosition = childPosition;

417

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

418

return needsToTraverseChildrenNodes;

419

}

420

satok

2010-12-08 17:05:39 +0900

[diff] [blame^]

421

inline int UnigramDictionary::getWordFreq(const int startInputIndex, const int inputLength) {

422

int pos = ROOT_POS;

423

int count = Dictionary::getCount(DICT, &pos);

424

int freq = 0;

425

bool terminal = false;

426

427

for (int i = 0; i < inputLength; ++i) {

428

bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(pos, count,

429

startInputIndex + i, &pos, &count, &terminal, &freq);

430

if (!needsToTraverseChildrenNodes && (i < inputLength - 1)) {

return 0;

}

}

if (terminal) {

return freq;

} else {

return 0;

}

}

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

442

const int count, const int inputIndex, int *newChildPosition, int *newCount,

443

bool *newTerminal, int *newFreq) {

444

const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

445

int pos = firstChildPos;

446

unsigned short c;

447

for (int i = 0; i < count; ++i) {

448

pos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,

449

newChildPosition, newTerminal, newFreq);

450

const unsigned int inputC = currentChars[0];

451

const unsigned short lowerC = toLowerCase(c);

452

const bool matched = (inputC == lowerC || inputC == c);

453

const bool hasChild = *newChildPosition != 0;

454

if (matched) {

455

if (hasChild) {

456

*newCount = Dictionary::getCount(DICT, newChildPosition);

return true;

} else {

return false;

}

}

}

return false;

}

satok