Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <fcntl.h>

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

20

#include <stdio.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

21

#include <string.h>

22

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

32

// Wraps the binary dictionary blob `dict` and records the tuning parameters
// used while searching it.
//
// dict                  - pointer to the dictionary data; stored, not copied.
// typedLetterMultiplier - score multiplier applied per exactly-typed letter.
// fullWordMultiplier    - score multiplier for words as long as the input.
// maxWordLength         - stride (in chars) of one slot in the output buffer.
// maxWords              - number of slots in the output buffer.
// maxProximityChars     - number of codes stored per typed position.
// isLatestDictVersion   - selects the node format; when true the root starts
//                         after DICTIONARY_HEADER_SIZE, otherwise at offset 0.
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
        const bool isLatestDictVersion)
    : DICT(dict),
      MAX_WORD_LENGTH(maxWordLength),
      MAX_WORDS(maxWords),
      MAX_PROXIMITY_CHARS(maxProximityChars),
      IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier),
      FULL_WORD_MULTIPLIER(fullWordMultiplier),
      ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
    if (DEBUG_DICT) {
        LOGI("UnigramDictionary - constructor");
    }
}

41

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

42

// Nothing to release here: the destructor body is intentionally empty.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

43

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

44

int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,

45

int *frequencies, int *nextLetters, int nextLettersSize)

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

46

{

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

47

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

48

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

49

getSuggestionCandidates(codesSize, -1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

50

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

51

// Suggestion with missing character

52

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

53

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

54

if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

55

getSuggestionCandidates(codesSize, i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

59

// Suggestion with excessive character

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

60

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

61

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

62

if (existsAdjacentProximityChars(i, codesSize)) {

63

if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

64

getSuggestionCandidates(codesSize, -1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

65

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

69

// Suggestion with transposed characters

70

// Only suggest words that length is mInputLength

71

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

72

for (int i = 0; i < codesSize; ++i) {

73

if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);

74

getSuggestionCandidates(codesSize, -1, -1, i, NULL, 0, mInputLength - 1);

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

78

// Suggestions with missing space

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

79

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

80

for (int i = 1; i < codesSize; ++i) {

81

if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);

82

getMissingSpaceWords(mInputLength, i);

}

}

// Get the word count

87

int suggestedWordsCount = 0;

88

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

89

suggestedWordsCount++;

90

}

91

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

92

if (DEBUG_DICT) {

93

LOGI("Returning %d words", suggestedWordsCount);

94

LOGI("Next letters: ");

95

for (int k = 0; k < nextLettersSize; k++) {

96

if (nextLetters[k] > 0) {

97

LOGI("%c = %d,", k, nextLetters[k]);

}

}

LOGI("\n");

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

102

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

103

return suggestedWordsCount;

104

}

105

106

void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,

107

int *frequencies) {

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

108

if (DEBUG_DICT) LOGI("initSuggest");

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

109

mFrequencies = frequencies;

110

mOutputChars = outWords;

111

mInputCodes = codes;

112

mInputLength = codesSize;

113

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

114

}

115

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

116

// Bumps the counter for character `c` in `nextLetters`. Characters whose code
// is not below `nextLettersSize` are ignored, so a size of 0 disables counting.
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (!(c < nextLettersSize)) {
        return;
    }
    ++nextLetters[c];
}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

123

// TODO: We need to optimize addWord by using STL or something

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

124

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

125

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

126

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

127

char s[length + 1];

128

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

129

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

130

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

131

if (length > MAX_WORD_LENGTH) {

132

if (DEBUG_DICT) LOGI("Exceeded max word length.");

133

return false;

134

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

135

136

// Find the right insertion point

137

int insertAt = 0;

138

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

139

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

140

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

146

if (DEBUG_DICT) {

147

char s[length + 1];

148

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

149

LOGI("Added word = %s, freq = %d", s, frequency);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

150

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

151

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

152

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

153

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

154

mFrequencies[insertAt] = frequency;

155

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

156

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

157

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

158

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

163

if (DEBUG_DICT) LOGI("Added word at %d", insertAt);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

169

unsigned short UnigramDictionary::toLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

170

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

171

c = BASE_CHARS[c];

172

}

173

if (c >='A' && c <= 'Z') {

174

c |= 32;

175

} else if (c > 127) {

176

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

181

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

182

if (length != mInputLength) {

183

return false;

184

}

185

int *inputCodes = mInputCodes;

186

while (length--) {

187

if ((unsigned int) *inputCodes != (unsigned int) *word) {

188

return false;

189

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

190

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

196

// Apostrophe: dictionary nodes holding it may be skipped when the user did not type it.
static const char QUOTE = '\'';
// Separator inserted between the two halves of a missing-space suggestion.
static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

198

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

199

void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

200

const int excessivePos, const int transposedPos, int *nextLetters,

201

const int nextLettersSize, const int maxDepth) {

202

if (DEBUG_DICT) LOGI("getSuggestionCandidates %d", maxDepth);

203

if (DEBUG_DICT) assert(transposedPos + 1 < inputLength);

204

if (DEBUG_DICT) assert(excessivePos < inputLength);

205

if (DEBUG_DICT) assert(missingPos < inputLength);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

206

int rootPosition = ROOT_POS;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

207

// Get the number of child of root, then increment the position

208

int childCount = Dictionary::getCount(DICT, &rootPosition);

209

int depth = 0;

210

211

mStackChildCount[0] = childCount;

212

mStackTraverseAll[0] = (mInputLength <= 0);

213

mStackNodeFreq[0] = 1;

214

mStackInputIndex[0] = 0;

215

mStackDiffs[0] = 0;

216

mStackSiblingPos[0] = rootPosition;

217

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

218

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

219

while (depth >= 0) {

220

if (mStackChildCount[depth] > 0) {

221

--mStackChildCount[depth];

222

bool traverseAllNodes = mStackTraverseAll[depth];

223

int snr = mStackNodeFreq[depth];

224

int inputIndex = mStackInputIndex[depth];

225

int diffs = mStackDiffs[depth];

226

int siblingPos = mStackSiblingPos[depth];

227

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

228

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

229

// needsToTraverseChildrenNodes should be false

230

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

231

maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,

232

transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,

233

&traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

234

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

235

mStackSiblingPos[depth] = siblingPos;

236

if (needsToTraverseChildrenNodes) {

237

// Goes to child node

238

++depth;

239

mStackChildCount[depth] = childCount;

240

mStackTraverseAll[depth] = traverseAllNodes;

241

mStackNodeFreq[depth] = snr;

242

mStackInputIndex[depth] = inputIndex;

243

mStackDiffs[depth] = diffs;

244

mStackSiblingPos[depth] = firstChildPos;

245

}

246

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

247

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

253

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

254

if (missingSpacePos <= 0 || missingSpacePos >= inputLength

255

|| inputLength >= MAX_WORD_LENGTH) return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

256

const int newWordLength = inputLength + 1;

257

// Allocating variable length array on stack

258

unsigned short word[newWordLength];

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

259

const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);

260

if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);

261

if (firstFreq <= 0) return false;

262

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

263

for (int i = 0; i < missingSpacePos; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

264

word[i] = mWord[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

265

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

266

267

const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

268

if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

269

if (secondFreq <= 0) return false;

270

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

271

word[missingSpacePos] = SPACE;

272

for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

273

word[i] = mWord[i - missingSpacePos - 1];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

274

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

275

276

int pairFreq = ((firstFreq + secondFreq) / 2);

277

for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

278

pairFreq = pairFreq * WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE / 100;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

279

addWord(word, newWordLength, pairFreq);

return true;

}

// Keep this for comparing spec to new getWords

284

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

285

const int excessivePos, const int transposedPos,int *nextLetters,

286

const int nextLettersSize) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

287

int initialPosition = initialPos;

288

const int count = Dictionary::getCount(DICT, &initialPosition);

289

getWordsRec(count, initialPosition, 0,

290

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

291

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

292

nextLettersSize);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

293

}

294

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

295

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

296

const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

297

const int diffs, const int skipPos, const int excessivePos, const int transposedPos,

298

int *nextLetters, const int nextLettersSize) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

299

int siblingPos = pos;

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

300

for (int i = 0; i < childrenCount; ++i) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

301

int newCount;

302

int newChildPosition;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

303

const int newDepth = depth + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

304

bool newTraverseAllNodes;

int newSnr;

int newInputIndex;

int newDiffs;

int newSiblingPos;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

310

traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,

311

nextLetters, nextLettersSize,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

312

&newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

313

&newInputIndex, &newDiffs, &newSiblingPos);

314

siblingPos = newSiblingPos;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

315

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

316

if (needsToTraverseChildrenNodes) {

317

getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

318

newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

319

nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

324

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

325

unsigned short *word, const int inputLength, const int depth, const int snr,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

326

int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,

327

const int transposedPos, const int freq) {

328

int finalFreq = freq * snr;

329

// TODO: Demote by edit distance

330

if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;

331

if (excessivePos >= 0) finalFreq = finalFreq

332

* WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;

333

if (transposedPos >= 0) finalFreq = finalFreq

334

* WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;

335

336

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

337

if (depth >= inputLength && skipPos < 0) {

338

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

343

unsigned short *word, const int depth, const int snr, const int skipPos,

344

const int excessivePos, const int transposedPos, const int freq, const int addedWeight) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

345

if (!sameAsTyped(word, depth + 1)) {

346

int finalFreq = freq * snr * addedWeight;

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

347

// TODO: Demote by edit distance

348

if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;

349

if (excessivePos >= 0) finalFreq = finalFreq

350

* WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;

351

if (transposedPos >= 0) finalFreq = finalFreq

352

* WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;

353

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

354

// Proximity collection will promote a word of the same length as

355

// what user typed.

356

if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

357

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

358

}

359

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

360

361

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

362

const int inputIndex, const int skipPos, const int depth) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

363

const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS))[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

364

// Skip the ' or other letter and continue deeper

365

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

366

}

367

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

368

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

369

const int inputLength) {

370

if (inputIndex < 0 || inputIndex >= inputLength) return false;

371

const int currentChar = *getInputCharsAt(inputIndex);

372

const int leftIndex = inputIndex - 1;

373

if (leftIndex >= 0) {

374

int *leftChars = getInputCharsAt(leftIndex);

375

int i = 0;

376

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

377

if (leftChars[i++] == currentChar) return true;

378

}

379

}

380

const int rightIndex = inputIndex + 1;

381

if (rightIndex < inputLength) {

382

int *rightChars = getInputCharsAt(rightIndex);

383

int i = 0;

384

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

385

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

391

inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

392

const unsigned short c, const int skipPos, const int excessivePos,

393

const int transposedPos) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

394

const unsigned short lowerC = toLowerCase(c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

395

int j = 0;

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

396

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

397

const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

398

// If skipPos is defined, not to search proximity collections.

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

399

// First char is what user typed.

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

400

if (matched) {

401

return j;

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

402

} else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {

403

// Not to check proximity characters

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

return -1;

}

++j;

}

return -1;

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

411

inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

412

const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

413

const int diffs, const int skipPos, const int excessivePos, const int transposedPos,

414

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

415

bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,

416

int *nextSiblingPosition) {

417

if (DEBUG_DICT) {

418

int inputCount = 0;

419

if (skipPos >= 0) ++inputCount;

420

if (excessivePos >= 0) ++inputCount;

421

if (transposedPos >= 0) ++inputCount;

422

assert(inputCount <= 1);

423

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

unsigned short c;

int childPosition;

bool terminal;

int freq;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

428

429

if (excessivePos == depth) ++inputIndex;

430

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

431

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,

432

&childPosition, &terminal, &freq);

433

434

const bool needsToTraverseChildrenNodes = childPosition != 0;

435

436

// If we are only doing traverseAllNodes, no need to look at the typed characters.

437

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

438

mWord[depth] = c;

439

if (traverseAllNodes && terminal) {

440

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

441

snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

442

}

443

if (!needsToTraverseChildrenNodes) return false;

444

*newTraverseAllNodes = traverseAllNodes;

445

*newSnr = snr;

446

*newDiffs = diffs;

447

*newInputIndex = inputIndex;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

448

} else {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

449

int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

450

451

if (transposedPos >= 0) {

452

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

453

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

454

}

455

456

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

457

transposedPos);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

458

if (matchedProximityCharId < 0) return false;

459

mWord[depth] = c;

460

// If inputIndex is greater than mInputLength, that means there is no

461

// proximity chars. So, we don't need to check proximity.

462

const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;

463

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;

464

if (isSameAsUserTypedLength && terminal) {

465

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

466

skipPos, excessivePos, transposedPos, freq, addedWeight);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

467

}

468

if (!needsToTraverseChildrenNodes) return false;

469

// Start traversing all nodes after the index exceeds the user typed length

470

*newTraverseAllNodes = isSameAsUserTypedLength;

471

*newSnr = snr * addedWeight;

satok

2010-12-09 22:08:33 +0900

[diff] [blame^]

472

*newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

473

*newInputIndex = inputIndex + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

474

}

475

// Optimization: Prune out words that are too long compared to how much was typed.

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

476

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

481

if (mInputLength <= *newInputIndex) {

482

*newTraverseAllNodes = true;

483

}

484

// get the count of nodes and increment childAddress.

485

*newCount = Dictionary::getCount(DICT, &childPosition);

486

*newChildPosition = childPosition;

487

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

488

return needsToTraverseChildrenNodes;

489

}

490

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

491

inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,

492

unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

493

int pos = ROOT_POS;

494

int count = Dictionary::getCount(DICT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

495

int maxFreq = 0;

496

int depth = 0;

497

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

498

bool terminal = false;

499

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

500

mStackChildCount[0] = count;

501

mStackSiblingPos[0] = pos;

502

503

while (depth >= 0) {

504

if (mStackChildCount[depth] > 0) {

505

--mStackChildCount[depth];

506

int firstChildPos;

507

int newFreq;

508

int siblingPos = mStackSiblingPos[depth];

509

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

510

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

511

&siblingPos);

512

mStackSiblingPos[depth] = siblingPos;

513

if (depth == (inputLength - 1)) {

514

// Traverse sibling node

515

if (terminal) {

516

if (newFreq > maxFreq) {

517

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

518

if (DEBUG_DICT && DEBUG_NODE) {

519

char s[inputLength + 1];

520

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

521

s[inputLength] = 0;

522

LOGI("New missing space word found: %d > %d (%s), %d, %d",

523

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

529

// Traverse children nodes

530

++depth;

531

mStackChildCount[depth] = count;

532

mStackSiblingPos[depth] = firstChildPos;

533

}

534

} else {

535

// Traverse parent node

536

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

537

}

538

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

539

540

word[inputLength] = 0;

541

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

542

}

543

544

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

545

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

546

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

547

const int inputIndex = startInputIndex + depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

548

const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

549

unsigned short c;

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

550

*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,

551

newChildPosition, newTerminal, newFreq);

552

const unsigned int inputC = currentChars[0];

553

if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);

554

const unsigned short lowerC = toLowerCase(c);

555

const bool matched = (inputC == lowerC || inputC == c);

556

const bool hasChild = *newChildPosition != 0;

557

if (matched) {

558

word[depth] = c;

559

if (DEBUG_DICT && DEBUG_NODE) {

560

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

561

if (*newTerminal) LOGI("Terminal %d", *newFreq);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

562

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

563

if (hasChild) {

564

*newCount = Dictionary::getCount(DICT, newChildPosition);

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

571

// Thus newTerminal shouldn't be true.

572

*newTerminal = false;

573

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

574

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

575

}

satok