Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <fcntl.h>

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

20

#include <stdio.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

21

#include <string.h>

22

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

32

// Records the dictionary data pointer and the tuning parameters used by the lookup
// methods. The root node position is DICTIONARY_HEADER_SIZE when isLatestDictVersion is
// true and 0 otherwise.
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
        const bool isLatestDictVersion)
    : DICT(dict),
      MAX_WORD_LENGTH(maxWordLength),
      MAX_WORDS(maxWords),
      MAX_PROXIMITY_CHARS(maxProximityChars),
      IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier),
      FULL_WORD_MULTIPLIER(fullWordMultiplier),
      ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
    LOGI("UnigramDictionary - constructor");
}

41

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

42

// No cleanup is performed here.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

43

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

44

int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,

45

int *frequencies, int *nextLetters, int nextLettersSize)

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

46

{

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

47

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

48

getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

49

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

50

// Suggestion with missing character

51

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

52

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

53

if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

54

getSuggestionCandidates(codesSize, i, -1, NULL, 0);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

58

// Suggestion with excessive character

59

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

60

for (int i = 0; i < codesSize; ++i) {

satok

e07baa6

2010-12-09 21:55:40 +0900

[diff] [blame^]

61

if (existsAdjacentProximityChars(i, codesSize)) {

62

if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);

63

getSuggestionCandidates(codesSize, -1, i, NULL, 0);

64

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

68

// Suggestions with missing space

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

69

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

70

for (int i = 1; i < codesSize; ++i) {

71

if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);

72

getMissingSpaceWords(mInputLength, i);

}

}

// Get the word count

77

int suggestedWordsCount = 0;

78

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

79

suggestedWordsCount++;

80

}

81

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

82

if (DEBUG_DICT) {

83

LOGI("Returning %d words", suggestedWordsCount);

84

LOGI("Next letters: ");

85

for (int k = 0; k < nextLettersSize; k++) {

86

if (nextLetters[k] > 0) {

87

LOGI("%c = %d,", k, nextLetters[k]);

}

}

LOGI("\n");

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

92

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

93

return suggestedWordsCount;

94

}

95

96

void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,

97

int *frequencies) {

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

98

if (DEBUG_DICT) LOGI("initSuggest");

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

99

mFrequencies = frequencies;

100

mOutputChars = outWords;

101

mInputCodes = codes;

102

mInputLength = codesSize;

103

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

104

}

105

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

106

// Increments the counter for character c in nextLetters. Characters at or beyond
// nextLettersSize are ignored.
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

113

// TODO: We need to optimize addWord by using STL or something

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

114

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

115

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

116

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

117

char s[length + 1];

118

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

119

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

120

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

121

if (length > MAX_WORD_LENGTH) {

122

if (DEBUG_DICT) LOGI("Exceeded max word length.");

123

return false;

124

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

125

126

// Find the right insertion point

127

int insertAt = 0;

128

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

129

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

130

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

136

if (DEBUG_DICT) {

137

char s[length + 1];

138

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

139

LOGI("Added word = %s, freq = %d", s, frequency);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

140

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

141

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

142

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

143

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

144

mFrequencies[insertAt] = frequency;

145

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

146

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

147

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

148

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

153

if (DEBUG_DICT) LOGI("Added word at %d", insertAt);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

159

unsigned short UnigramDictionary::toLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

160

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

161

c = BASE_CHARS[c];

162

}

163

if (c >='A' && c <= 'Z') {

164

c |= 32;

165

} else if (c > 127) {

166

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

171

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

172

if (length != mInputLength) {

173

return false;

174

}

175

int *inputCodes = mInputCodes;

176

while (length--) {

177

if ((unsigned int) *inputCodes != (unsigned int) *word) {

178

return false;

179

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

180

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

186

// Apostrophe. needsToSkipCurrentNode() steps over a dictionary node holding this
// character when the user did not type it at that position.
static const char QUOTE = '\'';

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

187

// Separator inserted between the two words built by getMissingSpaceWords().
static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

188

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

189

void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

190

const int excessivePos, int *nextLetters, const int nextLettersSize) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

191

if (DEBUG_DICT) LOGI("getSuggestionCandidates");

192

int rootPosition = ROOT_POS;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

193

const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

194

// Get the number of child of root, then increment the position

195

int childCount = Dictionary::getCount(DICT, &rootPosition);

196

int depth = 0;

197

198

mStackChildCount[0] = childCount;

199

mStackTraverseAll[0] = (mInputLength <= 0);

200

mStackNodeFreq[0] = 1;

201

mStackInputIndex[0] = 0;

202

mStackDiffs[0] = 0;

203

mStackSiblingPos[0] = rootPosition;

204

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

205

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

206

while (depth >= 0) {

207

if (mStackChildCount[depth] > 0) {

208

--mStackChildCount[depth];

209

bool traverseAllNodes = mStackTraverseAll[depth];

210

int snr = mStackNodeFreq[depth];

211

int inputIndex = mStackInputIndex[depth];

212

int diffs = mStackDiffs[depth];

213

int siblingPos = mStackSiblingPos[depth];

214

int firstChildPos;

215

// depth will never be greater than MAX_DEPTH because in that case,

216

// needsToTraverseChildrenNodes should be false

217

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

218

MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,

219

nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,

220

&snr, &inputIndex, &diffs, &siblingPos);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

221

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

222

mStackSiblingPos[depth] = siblingPos;

223

if (needsToTraverseChildrenNodes) {

224

// Goes to child node

225

++depth;

226

mStackChildCount[depth] = childCount;

227

mStackTraverseAll[depth] = traverseAllNodes;

228

mStackNodeFreq[depth] = snr;

229

mStackInputIndex[depth] = inputIndex;

230

mStackDiffs[depth] = diffs;

231

mStackSiblingPos[depth] = firstChildPos;

232

}

233

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

234

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

240

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

241

if (missingSpacePos <= 0 || missingSpacePos >= inputLength

242

|| inputLength >= MAX_WORD_LENGTH) return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

243

const int newWordLength = inputLength + 1;

244

// Allocating variable length array on stack

245

unsigned short word[newWordLength];

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

246

const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);

247

if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);

248

if (firstFreq <= 0) return false;

249

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

250

for (int i = 0; i < missingSpacePos; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

251

word[i] = mWord[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

252

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

253

254

const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);

255

if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);

256

if (secondFreq <= 0) return false;

257

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

258

word[missingSpacePos] = SPACE;

259

for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

260

word[i] = mWord[i - missingSpacePos - 1];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

261

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

262

263

int pairFreq = ((firstFreq + secondFreq) / 2);

264

for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

265

addWord(word, newWordLength, pairFreq);

return true;

}

// Keep this for comparing spec to new getWords

270

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

271

const int excessivePos, int *nextLetters, const int nextLettersSize) {

272

int initialPosition = initialPos;

273

const int count = Dictionary::getCount(DICT, &initialPosition);

274

getWordsRec(count, initialPosition, 0,

275

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

276

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);

277

}

278

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

279

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

280

const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

281

const int diffs, const int skipPos, const int excessivePos, int *nextLetters,

282

const int nextLettersSize) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

283

int siblingPos = pos;

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

284

for (int i = 0; i < childrenCount; ++i) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

285

int newCount;

286

int newChildPosition;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

287

const int newDepth = depth + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

288

bool newTraverseAllNodes;

int newSnr;

int newInputIndex;

int newDiffs;

int newSiblingPos;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

294

traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,

295

nextLettersSize,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

296

&newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

297

&newInputIndex, &newDiffs, &newSiblingPos);

298

siblingPos = newSiblingPos;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

299

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

300

if (needsToTraverseChildrenNodes) {

301

getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

302

newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,

303

nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

308

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

309

unsigned short *word, const int inputLength, const int depth, const int snr,

310

int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

311

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

312

if (depth >= inputLength && skipPos < 0) {

313

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

318

unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,

319

const int addedWeight) {

320

if (!sameAsTyped(word, depth + 1)) {

321

int finalFreq = freq * snr * addedWeight;

322

// Proximity collection will promote a word of the same length as

323

// what user typed.

324

if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

325

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

326

}

327

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

328

329

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

330

const int inputIndex, const int skipPos, const int depth) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

331

const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS))[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

332

// Skip the ' or other letter and continue deeper

333

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

334

}

335

satok

e07baa6

2010-12-09 21:55:40 +0900

[diff] [blame^]

336

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

337

const int inputLength) {

338

if (inputIndex < 0 || inputIndex >= inputLength) return false;

339

const int currentChar = *getInputCharsAt(inputIndex);

340

const int leftIndex = inputIndex - 1;

341

if (leftIndex >= 0) {

342

int *leftChars = getInputCharsAt(leftIndex);

343

int i = 0;

344

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

345

if (leftChars[i++] == currentChar) return true;

346

}

347

}

348

const int rightIndex = inputIndex + 1;

349

if (rightIndex < inputLength) {

350

int *rightChars = getInputCharsAt(rightIndex);

351

int i = 0;

352

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

353

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

359

inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

360

const unsigned short c, const int skipPos) {

361

const unsigned short lowerC = toLowerCase(c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

362

int j = 0;

satok

e07baa6

2010-12-09 21:55:40 +0900

[diff] [blame^]

363

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

364

const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

365

// If skipPos is defined, not to search proximity collections.

366

// First char is what user typed.

367

if (matched) {

368

return j;

369

} else if (skipPos >= 0) {

return -1;

}

++j;

}

return -1;

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

377

inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

378

const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,

379

const int diffs, const int skipPos, const int excessivePos, int *nextLetters,

380

const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

381

int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

382

if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

unsigned short c;

int childPosition;

bool terminal;

int freq;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

387

388

if (excessivePos == depth) ++inputIndex;

389

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

390

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,

391

&childPosition, &terminal, &freq);

392

393

const bool needsToTraverseChildrenNodes = childPosition != 0;

394

395

// If we are only doing traverseAllNodes, no need to look at the typed characters.

396

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

397

mWord[depth] = c;

398

if (traverseAllNodes && terminal) {

399

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,

400

snr, nextLetters, nextLettersSize, skipPos, freq);

401

}

402

if (!needsToTraverseChildrenNodes) return false;

403

*newTraverseAllNodes = traverseAllNodes;

404

*newSnr = snr;

405

*newDiffs = diffs;

406

*newInputIndex = inputIndex;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

407

} else {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

408

int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

409

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);

410

if (matchedProximityCharId < 0) return false;

411

mWord[depth] = c;

412

// If inputIndex is greater than mInputLength, that means there is no

413

// proximity chars. So, we don't need to check proximity.

414

const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;

415

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;

416

if (isSameAsUserTypedLength && terminal) {

417

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,

418

skipPos, freq, addedWeight);

419

}

420

if (!needsToTraverseChildrenNodes) return false;

421

// Start traversing all nodes after the index exceeds the user typed length

422

*newTraverseAllNodes = isSameAsUserTypedLength;

423

*newSnr = snr * addedWeight;

424

*newDiffs = diffs + (matchedProximityCharId > 0);

425

*newInputIndex = inputIndex + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

426

}

427

// Optimization: Prune out words that are too long compared to how much was typed.

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

428

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

433

if (mInputLength <= *newInputIndex) {

434

*newTraverseAllNodes = true;

435

}

436

// get the count of nodes and increment childAddress.

437

*newCount = Dictionary::getCount(DICT, &childPosition);

438

*newChildPosition = childPosition;

439

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

440

return needsToTraverseChildrenNodes;

441

}

442

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

443

inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,

444

unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

445

int pos = ROOT_POS;

446

int count = Dictionary::getCount(DICT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

447

int maxFreq = 0;

448

int depth = 0;

449

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

450

bool terminal = false;

451

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

452

mStackChildCount[0] = count;

453

mStackSiblingPos[0] = pos;

454

455

while (depth >= 0) {

456

if (mStackChildCount[depth] > 0) {

457

--mStackChildCount[depth];

458

int firstChildPos;

459

int newFreq;

460

int siblingPos = mStackSiblingPos[depth];

461

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

462

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

463

&siblingPos);

464

mStackSiblingPos[depth] = siblingPos;

465

if (depth == (inputLength - 1)) {

466

// Traverse sibling node

467

if (terminal) {

468

if (newFreq > maxFreq) {

469

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

470

if (DEBUG_DICT && DEBUG_NODE) {

471

char s[inputLength + 1];

472

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

473

s[inputLength] = 0;

474

LOGI("New missing space word found: %d > %d (%s), %d, %d",

475

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

481

// Traverse children nodes

482

++depth;

483

mStackChildCount[depth] = count;

484

mStackSiblingPos[depth] = firstChildPos;

485

}

486

} else {

487

// Traverse parent node

488

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

489

}

490

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

491

492

word[inputLength] = 0;

493

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

494

}

495

496

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

497

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

498

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

499

const int inputIndex = startInputIndex + depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

500

const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

501

unsigned short c;

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

502

*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,

503

newChildPosition, newTerminal, newFreq);

504

const unsigned int inputC = currentChars[0];

505

if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);

506

const unsigned short lowerC = toLowerCase(c);

507

const bool matched = (inputC == lowerC || inputC == c);

508

const bool hasChild = *newChildPosition != 0;

509

if (matched) {

510

word[depth] = c;

511

if (DEBUG_DICT && DEBUG_NODE) {

512

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

513

if (*newTerminal) LOGI("Terminal %d", *newFreq);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

514

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

515

if (hasChild) {

516

*newCount = Dictionary::getCount(DICT, newChildPosition);

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

523

// Thus newTerminal shouldn't be true.

524

*newTerminal = false;

525

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

526

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

527

}

satok