Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <fcntl.h>

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

20

#include <stdio.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

21

#include <string.h>

22

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

32

// Creates a unigram dictionary over the binary dictionary buffer `dict`.
//
// All arguments are stored verbatim in the corresponding constant members.
// ROOT_POS, the offset at which traversal starts, is DICTIONARY_HEADER_SIZE when
// `isLatestDictVersion` is true and 0 otherwise.
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
        const bool isLatestDictVersion)
    : DICT(dict),
      MAX_WORD_LENGTH(maxWordLength),
      MAX_WORDS(maxWords),
      MAX_PROXIMITY_CHARS(maxProximityChars),
      IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier),
      FULL_WORD_MULTIPLIER(fullWordMultiplier),
      ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
    if (DEBUG_DICT) {
        LOGI("UnigramDictionary - constructor");
    }
}

41

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

42

// Nothing to release: the destructor body is intentionally empty.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

43

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

44

int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,

45

int *frequencies, int *nextLetters, int nextLettersSize)

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

46

{

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

47

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

48

if (DEBUG_DICT) assert(codesSize == mInputLength);

49

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

50

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

51

getSuggestionCandidates(-1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

52

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

53

// Suggestion with missing character

54

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

55

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

56

if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

57

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

61

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

62

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

63

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

64

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

65

if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);

66

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

70

// Suggestion with transposed characters

71

// Only suggest words that length is mInputLength

72

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

73

for (int i = 0; i < codesSize; ++i) {

74

if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

75

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

79

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

80

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

81

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

82

for (int i = 1; i < codesSize; ++i) {

83

if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);

84

getMissingSpaceWords(mInputLength, i);

}

}

// Get the word count

89

int suggestedWordsCount = 0;

90

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

91

suggestedWordsCount++;

92

}

93

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

94

if (DEBUG_DICT) {

95

LOGI("Returning %d words", suggestedWordsCount);

96

LOGI("Next letters: ");

97

for (int k = 0; k < nextLettersSize; k++) {

98

if (nextLetters[k] > 0) {

99

LOGI("%c = %d,", k, nextLetters[k]);

}

}

LOGI("\n");

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

104

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

105

return suggestedWordsCount;

106

}

107

108

void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,

109

int *frequencies) {

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

110

if (DEBUG_DICT) LOGI("initSuggest");

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

111

mFrequencies = frequencies;

112

mOutputChars = outWords;

113

mInputCodes = codes;

114

mInputLength = codesSize;

115

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

116

}

117

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

118

// Bumps the histogram entry for character `c`. Characters whose code does not fit in
// the histogram (c >= nextLettersSize) are ignored.
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

125

// TODO: We need to optimize addWord by using STL or something

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

126

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

127

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

128

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

129

char s[length + 1];

130

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

131

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

132

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

133

if (length > MAX_WORD_LENGTH) {

134

if (DEBUG_DICT) LOGI("Exceeded max word length.");

135

return false;

136

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

137

138

// Find the right insertion point

139

int insertAt = 0;

140

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

141

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

142

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

148

if (DEBUG_DICT) {

149

char s[length + 1];

150

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

151

LOGI("Added word = %s, freq = %d", s, frequency);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

152

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

153

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

154

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

155

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

156

mFrequencies[insertAt] = frequency;

157

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

158

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

159

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

160

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

165

if (DEBUG_DICT) LOGI("Added word at %d", insertAt);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

171

unsigned short UnigramDictionary::toLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

172

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

173

c = BASE_CHARS[c];

174

}

175

if (c >='A' && c <= 'Z') {

176

c |= 32;

177

} else if (c > 127) {

178

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

183

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

184

if (length != mInputLength) {

185

return false;

186

}

187

int *inputCodes = mInputCodes;

188

while (length--) {

189

if ((unsigned int) *inputCodes != (unsigned int) *word) {

190

return false;

191

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

192

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

198

// Apostrophe: dictionary nodes holding it may be skipped when the user did not type it.
static const char QUOTE = '\'';
// Separator inserted between the two words of a missing-space suggestion.
static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

200

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

201

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

202

const int excessivePos, const int transposedPos, int *nextLetters,

203

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

204

if (DEBUG_DICT) {

205

LOGI("getSuggestionCandidates %d", maxDepth);

206

assert(transposedPos + 1 < mInputLength);

207

assert(excessivePos < mInputLength);

208

assert(missingPos < mInputLength);

209

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

210

int rootPosition = ROOT_POS;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

211

// Get the number of child of root, then increment the position

212

int childCount = Dictionary::getCount(DICT, &rootPosition);

213

int depth = 0;

214

215

mStackChildCount[0] = childCount;

216

mStackTraverseAll[0] = (mInputLength <= 0);

217

mStackNodeFreq[0] = 1;

218

mStackInputIndex[0] = 0;

219

mStackDiffs[0] = 0;

220

mStackSiblingPos[0] = rootPosition;

221

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

222

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

223

while (depth >= 0) {

224

if (mStackChildCount[depth] > 0) {

225

--mStackChildCount[depth];

226

bool traverseAllNodes = mStackTraverseAll[depth];

227

int snr = mStackNodeFreq[depth];

228

int inputIndex = mStackInputIndex[depth];

229

int diffs = mStackDiffs[depth];

230

int siblingPos = mStackSiblingPos[depth];

231

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

232

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

233

// needsToTraverseChildrenNodes should be false

234

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

235

maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,

236

transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,

237

&traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

238

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

239

mStackSiblingPos[depth] = siblingPos;

240

if (needsToTraverseChildrenNodes) {

241

// Goes to child node

242

++depth;

243

mStackChildCount[depth] = childCount;

244

mStackTraverseAll[depth] = traverseAllNodes;

245

mStackNodeFreq[depth] = snr;

246

mStackInputIndex[depth] = inputIndex;

247

mStackDiffs[depth] = diffs;

248

mStackSiblingPos[depth] = firstChildPos;

249

}

250

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

251

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

257

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

258

if (missingSpacePos <= 0 || missingSpacePos >= inputLength

259

|| inputLength >= MAX_WORD_LENGTH) return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

260

const int newWordLength = inputLength + 1;

261

// Allocating variable length array on stack

262

unsigned short word[newWordLength];

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

263

const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);

264

if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);

265

if (firstFreq <= 0) return false;

266

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

267

for (int i = 0; i < missingSpacePos; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

268

word[i] = mWord[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

269

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

270

271

const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

272

if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

273

if (secondFreq <= 0) return false;

274

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

275

word[missingSpacePos] = SPACE;

276

for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

277

word[i] = mWord[i - missingSpacePos - 1];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

278

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

279

280

int pairFreq = ((firstFreq + secondFreq) / 2);

281

for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

282

pairFreq = pairFreq * WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE / 100;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

283

addWord(word, newWordLength, pairFreq);

return true;

}

// Keep this for comparing spec to new getWords

288

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

289

const int excessivePos, const int transposedPos,int *nextLetters,

290

const int nextLettersSize) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

291

int initialPosition = initialPos;

292

const int count = Dictionary::getCount(DICT, &initialPosition);

293

getWordsRec(count, initialPosition, 0,

294

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

295

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

296

nextLettersSize);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

297

}

298

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

299

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

300

const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

301

const int diffs, const int skipPos, const int excessivePos, const int transposedPos,

302

int *nextLetters, const int nextLettersSize) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

303

int siblingPos = pos;

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

304

for (int i = 0; i < childrenCount; ++i) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

305

int newCount;

306

int newChildPosition;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

307

const int newDepth = depth + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

308

bool newTraverseAllNodes;

int newSnr;

int newInputIndex;

int newDiffs;

int newSiblingPos;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

314

traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,

315

nextLetters, nextLettersSize,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

316

&newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

317

&newInputIndex, &newDiffs, &newSiblingPos);

318

siblingPos = newSiblingPos;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

319

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

320

if (needsToTraverseChildrenNodes) {

321

getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

322

newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

323

nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

328

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int snr,

329

const int skipPos, const int excessivePos, const int transposedPos, const int freq,

330

const bool sameLength) {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

331

// TODO: Demote by edit distance

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

332

int finalFreq = freq * snr;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

333

if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

334

if (transposedPos >= 0) finalFreq = finalFreq

335

* WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

336

if (excessivePos >= 0) {

337

finalFreq = finalFreq * WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;

338

if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {

339

finalFreq = finalFreq

340

* WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE / 100;

341

}

342

}

343

if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;

344

return finalFreq;

345

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

346

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

347

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

348

unsigned short *word, const int inputIndex, const int depth, const int snr,

349

int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,

350

const int transposedPos, const int freq) {

351

const int finalFreq = calculateFinalFreq(inputIndex, snr, skipPos, excessivePos, transposedPos,

352

freq, false);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

353

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

354

if (depth >= mInputLength && skipPos < 0) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

355

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

360

unsigned short *word, const int inputIndex, const int depth, const int snr,

361

const int skipPos, const int excessivePos, const int transposedPos, const int freq,

362

const int addedWeight) {

363

if (sameAsTyped(word, depth + 1)) return;

364

const int finalFreq = calculateFinalFreq(inputIndex, snr * addedWeight, skipPos,

365

excessivePos, transposedPos, freq, true);

366

// Proximity collection will promote a word of the same length as what user typed.

367

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

368

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

369

370

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

371

const int inputIndex, const int skipPos, const int depth) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

372

const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS))[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

373

// Skip the ' or other letter and continue deeper

374

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

375

}

376

satok

e07baa6

2010-12-09 21:55:40 +0900

[diff] [blame]

377

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

378

const int inputLength) {

379

if (inputIndex < 0 || inputIndex >= inputLength) return false;

380

const int currentChar = *getInputCharsAt(inputIndex);

381

const int leftIndex = inputIndex - 1;

382

if (leftIndex >= 0) {

383

int *leftChars = getInputCharsAt(leftIndex);

384

int i = 0;

385

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

386

if (leftChars[i++] == currentChar) return true;

387

}

388

}

389

const int rightIndex = inputIndex + 1;

390

if (rightIndex < inputLength) {

391

int *rightChars = getInputCharsAt(rightIndex);

392

int i = 0;

393

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

394

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

400

inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

401

const unsigned short c, const int skipPos, const int excessivePos,

402

const int transposedPos) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

403

const unsigned short lowerC = toLowerCase(c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

404

int j = 0;

satok

e07baa6

2010-12-09 21:55:40 +0900

[diff] [blame]

405

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

satok

2010-12-03 19:38:08 +0900

[diff] [blame]

406

const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

407

// If skipPos is defined, not to search proximity collections.

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

408

// First char is what user typed.

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

409

if (matched) {

410

return j;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

411

} else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {

412

// Not to check proximity characters

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

return -1;

}

++j;

}

return -1;

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

420

inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

421

const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

422

const int diffs, const int skipPos, const int excessivePos, const int transposedPos,

423

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

424

bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,

425

int *nextSiblingPosition) {

426

if (DEBUG_DICT) {

427

int inputCount = 0;

428

if (skipPos >= 0) ++inputCount;

429

if (excessivePos >= 0) ++inputCount;

430

if (transposedPos >= 0) ++inputCount;

431

assert(inputCount <= 1);

432

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

unsigned short c;

int childPosition;

bool terminal;

int freq;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

437

438

if (excessivePos == depth) ++inputIndex;

439

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

440

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,

441

&childPosition, &terminal, &freq);

442

443

const bool needsToTraverseChildrenNodes = childPosition != 0;

444

445

// If we are only doing traverseAllNodes, no need to look at the typed characters.

446

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

447

mWord[depth] = c;

448

if (traverseAllNodes && terminal) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

449

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, inputIndex, depth,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

450

snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

451

}

452

if (!needsToTraverseChildrenNodes) return false;

453

*newTraverseAllNodes = traverseAllNodes;

454

*newSnr = snr;

455

*newDiffs = diffs;

456

*newInputIndex = inputIndex;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

457

} else {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

458

int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

459

460

if (transposedPos >= 0) {

461

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

462

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

463

}

464

465

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

466

transposedPos);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

467

if (matchedProximityCharId < 0) return false;

468

mWord[depth] = c;

469

// If inputIndex is greater than mInputLength, that means there is no

470

// proximity chars. So, we don't need to check proximity.

471

const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;

472

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;

473

if (isSameAsUserTypedLength && terminal) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

474

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, snr,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

475

skipPos, excessivePos, transposedPos, freq, addedWeight);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

476

}

477

if (!needsToTraverseChildrenNodes) return false;

478

// Start traversing all nodes after the index exceeds the user typed length

479

*newTraverseAllNodes = isSameAsUserTypedLength;

480

*newSnr = snr * addedWeight;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

481

*newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

482

*newInputIndex = inputIndex + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

483

}

484

// Optimization: Prune out words that are too long compared to how much was typed.

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

485

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

490

if (mInputLength <= *newInputIndex) {

491

*newTraverseAllNodes = true;

492

}

493

// get the count of nodes and increment childAddress.

494

*newCount = Dictionary::getCount(DICT, &childPosition);

495

*newChildPosition = childPosition;

496

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

497

return needsToTraverseChildrenNodes;

498

}

499

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

500

inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,

501

unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

502

int pos = ROOT_POS;

503

int count = Dictionary::getCount(DICT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

504

int maxFreq = 0;

505

int depth = 0;

506

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

507

bool terminal = false;

508

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

509

mStackChildCount[0] = count;

510

mStackSiblingPos[0] = pos;

511

512

while (depth >= 0) {

513

if (mStackChildCount[depth] > 0) {

514

--mStackChildCount[depth];

515

int firstChildPos;

516

int newFreq;

517

int siblingPos = mStackSiblingPos[depth];

518

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

519

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

520

&siblingPos);

521

mStackSiblingPos[depth] = siblingPos;

522

if (depth == (inputLength - 1)) {

523

// Traverse sibling node

524

if (terminal) {

525

if (newFreq > maxFreq) {

526

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

527

if (DEBUG_DICT && DEBUG_NODE) {

528

char s[inputLength + 1];

529

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

530

s[inputLength] = 0;

531

LOGI("New missing space word found: %d > %d (%s), %d, %d",

532

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

538

// Traverse children nodes

539

++depth;

540

mStackChildCount[depth] = count;

541

mStackSiblingPos[depth] = firstChildPos;

542

}

543

} else {

544

// Traverse parent node

545

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

546

}

547

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

548

549

word[inputLength] = 0;

550

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

551

}

552

553

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

554

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

555

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

556

const int inputIndex = startInputIndex + depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

557

const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

558

unsigned short c;

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

559

*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,

560

newChildPosition, newTerminal, newFreq);

561

const unsigned int inputC = currentChars[0];

562

if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);

563

const unsigned short lowerC = toLowerCase(c);

564

const bool matched = (inputC == lowerC || inputC == c);

565

const bool hasChild = *newChildPosition != 0;

566

if (matched) {

567

word[depth] = c;

568

if (DEBUG_DICT && DEBUG_NODE) {

569

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

570

if (*newTerminal) LOGI("Terminal %d", *newFreq);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

571

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

572

if (hasChild) {

573

*newCount = Dictionary::getCount(DICT, newChildPosition);

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

580

// Thus newTerminal shouldn't be true.

581

*newTerminal = false;

582

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

583

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

584

}

satok