Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <fcntl.h>

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

20

#include <stdio.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

21

#include <string.h>

22

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

32

// Table of two-character sequences checked by isDigraph(): each entry is
// { first char, second char }.
const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = {
    { 'a', 'e' },
    { 'o', 'e' },
    { 'u', 'e' }
};

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

37

// Stores the dictionary pointer and the tuning parameters in the object's constants.
//   dict                  - raw dictionary data, kept as DICT.
//   typedLetterMultiplier - kept as TYPED_LETTER_MULTIPLIER.
//   fullWordMultiplier    - kept as FULL_WORD_MULTIPLIER.
//   maxWordLength         - kept as MAX_WORD_LENGTH.
//   maxWords              - kept as MAX_WORDS.
//   maxProximityChars     - kept as MAX_PROXIMITY_CHARS.
//   isLatestDictVersion   - when true the root node is read after DICTIONARY_HEADER_SIZE
//                           bytes, otherwise at offset 0.
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
        const bool isLatestDictVersion)
    : DICT(dict),
      MAX_WORD_LENGTH(maxWordLength),
      MAX_WORDS(maxWords),
      MAX_PROXIMITY_CHARS(maxProximityChars),
      IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier),
      FULL_WORD_MULTIPLIER(fullWordMultiplier),
      ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
      // Size in bytes of the proximity list belonging to one typed character.
      BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
      MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH)
{
    if (DEBUG_DICT) {
        LOGI("UnigramDictionary - constructor");
    }
}

48

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

49

// The destructor performs no cleanup of its own.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

50

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

51

// Returns the size, in BYTES, of a codes buffer holding codesSize characters with
// MAX_PROXIMITY_CHARS entries each. |codes| is only used to obtain the element size.
static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
        const int MAX_PROXIMITY_CHARS) {
    const size_t bytesPerCode = sizeof(codes[0]);
    const size_t bytesPerChar = bytesPerCode * MAX_PROXIMITY_CHARS;
    return bytesPerChar * codesSize;
}

55

56

bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {

57

58

// There can't be a digraph if we don't have at least 2 characters to examine

59

if (i + 2 > codesSize) return false;

60

61

// Search for the first char of some digraph

62

int lastDigraphIndex = -1;

63

const int thisChar = codes[i * MAX_PROXIMITY_CHARS];

64

for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;

65

lastDigraphIndex >= 0; --lastDigraphIndex) {

66

if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;

67

}

68

// No match: return early

69

if (lastDigraphIndex < 0) return false;

70

71

// It's an interesting digraph if the second char matches too.

72

return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];

73

}

74

75

// Mostly the same arguments as the non-recursive version, except:

76

// codes is the original value. It points to the start of the work buffer, and gets passed as is.

77

// codesSize is the size of the user input (thus, it is the size of codesSrc).

78

// codesDest is the current point in the work buffer.

79

// codesSrc is the current point in the user-input, original, content-unmodified buffer.

80

// codesRemain is the remaining size in codesSrc.

81

void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,

82

const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,

83

const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

84

int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

85

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

86

if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {

87

for (int i = 0; i < codesRemain; ++i) {

88

if (isDigraph(codesSrc, i, codesRemain)) {

89

// Found a digraph. We will try both spellings. eg. the word is "pruefen"

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

90

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

91

// Copy the word up to the first char of the digraph, then continue processing

92

// on the remaining part of the word, skipping the second char of the digraph.

93

// In our example, copy "pru" and continue running on "fen"

94

// Make i the index of the second char of the digraph for simplicity. Forgetting

95

// to do that results in an infinite recursion so take care!

96

++i;

97

memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);

98

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

99

codesBuffer, codesBufferSize, flags,

100

codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,

101

currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,

102

frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

103

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

104

// Copy the second char of the digraph in place, then continue processing on

105

// the remaining part of the word.

106

// In our example, after "pru" in the buffer copy the "e", and continue on "fen"

107

memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,

108

BYTES_IN_ONE_CHAR);

109

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

110

codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,

111

codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,

112

outWords, frequencies);

113

return;

114

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

}

}

// If we come here, we hit the end of the word: let's check it against the dictionary.

119

// In our example, we'll come here once for "prufen" and then once for "pruefen".

120

// If the word contains several digraphs, we'll come it for the product of them.

121

// eg. if the word is "ueberpruefen" we'll test, in order, against

122

// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".

123

const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;

124

if (0 != remainingBytes)

125

memcpy(codesDest, codesSrc, remainingBytes);

126

127

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

128

(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);

129

}

130

131

int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,

132

const int *ycoordinates, const int *codes, const int codesSize, const int flags,

133

unsigned short *outWords, int *frequencies) {

134

135

if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)

136

{ // Incrementally tune the word and try all possibilities

137

int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];

138

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

139

codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

140

} else { // Normal processing

141

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,

142

outWords, frequencies);

}

PROF_START(6);

// Get the word count

147

int suggestedWordsCount = 0;

148

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

149

suggestedWordsCount++;

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

154

LOGI("Next letters: ");

155

for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {

156

if (mNextLettersFrequency[k] > 0) {

157

LOGI("%c = %d,", k, mNextLettersFrequency[k]);

}

}

}

PROF_END(6);

PROF_CLOSE;

return suggestedWordsCount;

164

}

165

166

void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,

167

const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,

168

unsigned short *outWords, int *frequencies) {

169

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

170

PROF_OPEN;

171

PROF_START(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

172

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

173

if (DEBUG_DICT) assert(codesSize == mInputLength);

174

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

175

if (DEBUG_PROXIMITY_INFO) {

176

for (int i = 0; i < codesSize; ++i) {

177

LOGI("Input[%d] x = %d, y = %d", i, xcoordinates[i], ycoordinates[i]);

}

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

181

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

182

PROF_END(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

183

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

184

PROF_START(1);

Tadashi G. Takaoka

887f11e

2011-02-10 20:53:58 +0900

[diff] [blame]

185

getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

186

PROF_END(1);

187

188

PROF_START(2);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

189

// Suggestion with missing character

190

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

191

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

192

if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

193

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

194

}

195

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

196

PROF_END(2);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

197

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

198

PROF_START(3);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

199

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

200

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

201

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

202

for (int i = 0; i < codesSize; ++i) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

203

if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);

204

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

205

}

206

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

207

PROF_END(3);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

208

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

209

PROF_START(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

210

// Suggestion with transposed characters

211

// Only suggest words that length is mInputLength

212

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

213

for (int i = 0; i < codesSize; ++i) {

214

if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

215

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

216

}

217

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

218

PROF_END(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

219

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

220

PROF_START(5);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

221

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

222

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

223

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

224

for (int i = 1; i < codesSize; ++i) {

225

if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);

226

getMissingSpaceWords(mInputLength, i);

227

}

228

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

229

PROF_END(5);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

230

}

231

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

232

void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,

233

unsigned short *outWords, int *frequencies) {

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

234

if (DEBUG_DICT) LOGI("initSuggest");

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

235

mFrequencies = frequencies;

236

mOutputChars = outWords;

237

mInputCodes = codes;

238

mInputLength = codesSize;

239

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

240

}

241

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

242

// Increments the counter of character |c| in |nextLetters|. Characters whose code does
// not fit in the table (c >= nextLettersSize) are ignored; with nextLettersSize == 0
// the table is never touched.
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

249

// TODO: We need to optimize addWord by using STL or something

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

250

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

251

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

252

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

253

char s[length + 1];

254

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

255

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

256

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

257

if (length > MAX_WORD_LENGTH) {

258

if (DEBUG_DICT) LOGI("Exceeded max word length.");

259

return false;

260

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

261

262

// Find the right insertion point

263

int insertAt = 0;

264

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

265

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

266

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

272

if (DEBUG_DICT) {

273

char s[length + 1];

274

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

275

LOGI("Added word = %s, freq = %d", s, frequency);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

276

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

277

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

278

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

279

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

280

mFrequencies[insertAt] = frequency;

281

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

282

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

283

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

284

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

289

if (DEBUG_DICT) LOGI("Added word at %d", insertAt);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

295

unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

296

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

297

c = BASE_CHARS[c];

298

}

299

if (c >='A' && c <= 'Z') {

300

c |= 32;

301

} else if (c > 127) {

302

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

307

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

308

if (length != mInputLength) {

309

return false;

310

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

311

const int *inputCodes = mInputCodes;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

312

while (length--) {

313

if ((unsigned int) *inputCodes != (unsigned int) *word) {

314

return false;

315

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

316

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

322

// Apostrophe character; see needsToSkipCurrentNode().
static const char QUOTE = '\'';
// Separator inserted between the two words built by getMissingSpaceWords().
static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

324

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

325

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

326

const int excessivePos, const int transposedPos, int *nextLetters,

327

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

328

if (DEBUG_DICT) {

329

LOGI("getSuggestionCandidates %d", maxDepth);

330

assert(transposedPos + 1 < mInputLength);

331

assert(excessivePos < mInputLength);

332

assert(missingPos < mInputLength);

333

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

334

int rootPosition = ROOT_POS;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

335

// Get the number of child of root, then increment the position

336

int childCount = Dictionary::getCount(DICT, &rootPosition);

337

int depth = 0;

338

339

mStackChildCount[0] = childCount;

340

mStackTraverseAll[0] = (mInputLength <= 0);

341

mStackNodeFreq[0] = 1;

342

mStackInputIndex[0] = 0;

343

mStackDiffs[0] = 0;

344

mStackSiblingPos[0] = rootPosition;

345

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

346

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

347

while (depth >= 0) {

348

if (mStackChildCount[depth] > 0) {

349

--mStackChildCount[depth];

350

bool traverseAllNodes = mStackTraverseAll[depth];

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

351

int matchWeight = mStackNodeFreq[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

352

int inputIndex = mStackInputIndex[depth];

353

int diffs = mStackDiffs[depth];

354

int siblingPos = mStackSiblingPos[depth];

355

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

356

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

357

// needsToTraverseChildrenNodes should be false

358

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

359

maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,

360

excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,

361

&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,

362

&siblingPos);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

363

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

364

mStackSiblingPos[depth] = siblingPos;

365

if (needsToTraverseChildrenNodes) {

366

// Goes to child node

367

++depth;

368

mStackChildCount[depth] = childCount;

369

mStackTraverseAll[depth] = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

370

mStackNodeFreq[depth] = matchWeight;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

371

mStackInputIndex[depth] = inputIndex;

372

mStackDiffs[depth] = diffs;

373

mStackSiblingPos[depth] = firstChildPos;

374

}

375

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

376

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

382

// Scales *freq by |rate| percent, in place.
//   rate - percentage to apply (100 leaves the value unchanged).
//   freq - value to scale; receives the result.
// For rates above 1000000 the division by 100 is applied to *freq first, as before.
// The product is computed in 64 bits and saturated to the 32-bit int range, so a large
// *freq no longer overflows the intermediate multiplication.
inline static void multiplyRate(const int rate, int *freq) {
    // Bounds of a 32-bit int, spelled out so that no extra header is needed.
    const long long kIntMax = 2147483647LL;
    const long long kIntMin = -kIntMax - 1;

    long long scaled;
    if (rate > 1000000) {
        scaled = static_cast<long long>(*freq / 100) * rate;
    } else {
        scaled = static_cast<long long>(*freq) * rate / 100;
    }

    if (scaled > kIntMax) {
        scaled = kIntMax;
    } else if (scaled < kIntMin) {
        scaled = kIntMin;
    }
    *freq = static_cast<int>(scaled);
}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

390

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

391

if (missingSpacePos <= 0 || missingSpacePos >= inputLength

392

|| inputLength >= MAX_WORD_LENGTH) return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

393

const int newWordLength = inputLength + 1;

394

// Allocating variable length array on stack

395

unsigned short word[newWordLength];

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

396

const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);

397

if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);

398

if (firstFreq <= 0) return false;

399

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

400

for (int i = 0; i < missingSpacePos; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

401

word[i] = mWord[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

402

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

403

404

const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

405

if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

406

if (secondFreq <= 0) return false;

407

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

408

word[missingSpacePos] = SPACE;

409

for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

410

word[i] = mWord[i - missingSpacePos - 1];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

411

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

412

413

int pairFreq = ((firstFreq + secondFreq) / 2);

414

for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

415

multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &pairFreq);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

416

addWord(word, newWordLength, pairFreq);

return true;

}

// Keep this for comparing spec to new getWords

421

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

422

const int excessivePos, const int transposedPos,int *nextLetters,

423

const int nextLettersSize) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

424

int initialPosition = initialPos;

425

const int count = Dictionary::getCount(DICT, &initialPosition);

426

getWordsRec(count, initialPosition, 0,

427

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

428

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

429

nextLettersSize);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

430

}

431

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

432

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

433

const int maxDepth, const bool traverseAllNodes, const int matchWeight,

434

const int inputIndex, const int diffs, const int skipPos, const int excessivePos,

435

const int transposedPos, int *nextLetters, const int nextLettersSize) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

436

int siblingPos = pos;

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

437

for (int i = 0; i < childrenCount; ++i) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

438

int newCount;

439

int newChildPosition;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

440

const int newDepth = depth + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

441

bool newTraverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

442

int newMatchRate;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

int newInputIndex;

int newDiffs;

int newSiblingPos;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

447

traverseAllNodes, matchWeight, inputIndex, diffs,

448

skipPos, excessivePos, transposedPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

449

nextLetters, nextLettersSize,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

450

&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

451

&newInputIndex, &newDiffs, &newSiblingPos);

452

siblingPos = newSiblingPos;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

453

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

454

if (needsToTraverseChildrenNodes) {

455

getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

456

newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

457

nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

462

static const int TWO_31ST_DIV_255 = ((1 << 31) - 1) / 255;

463

static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {

464

return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);

465

}

satok

2011-01-27 03:23:39 +0900

[diff] [blame]

466

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

467

const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,

satok

2011-01-27 03:23:39 +0900

[diff] [blame]

468

const int freq, const bool sameLength) {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

469

// TODO: Demote by edit distance

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

470

int finalFreq = freq * matchWeight;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

471

if (skipPos >= 0) multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE, &finalFreq);

472

if (transposedPos >= 0) multiplyRate(

473

WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

474

if (excessivePos >= 0) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

475

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

476

if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

477

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

478

}

479

}

satok

2011-01-27 03:23:39 +0900

[diff] [blame]

480

int lengthFreq = TYPED_LETTER_MULTIPLIER;

481

for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

482

if (lengthFreq == matchWeight) {

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

483

if (depth > 1) {

484

if (DEBUG_DICT) LOGI("Found full matched word.");

485

multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);

486

}

487

if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

488

finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

489

}

satok

2011-01-27 03:23:39 +0900

[diff] [blame]

490

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

491

if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;

492

return finalFreq;

493

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

494

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

495

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

496

unsigned short *word, const int inputIndex, const int depth, const int matchWeight,

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

497

int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,

498

const int transposedPos, const int freq) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

499

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, excessivePos,

satok

2011-01-27 03:23:39 +0900

[diff] [blame]

500

transposedPos, freq, false);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

501

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

502

if (depth >= mInputLength && skipPos < 0) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

503

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

508

unsigned short *word, const int inputIndex, const int depth, const int matchWeight,

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

509

const int skipPos, const int excessivePos, const int transposedPos, const int freq) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

510

if (sameAsTyped(word, depth + 1)) return;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

511

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

512

excessivePos, transposedPos, freq, true);

513

// Proximity collection will promote a word of the same length as what user typed.

514

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

515

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

516

517

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

518

const int inputIndex, const int skipPos, const int depth) {

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

519

const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

520

// Skip the ' or other letter and continue deeper

521

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

522

}

523

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

524

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

525

const int inputLength) {

526

if (inputIndex < 0 || inputIndex >= inputLength) return false;

527

const int currentChar = *getInputCharsAt(inputIndex);

528

const int leftIndex = inputIndex - 1;

529

if (leftIndex >= 0) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

530

const int *leftChars = getInputCharsAt(leftIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

531

int i = 0;

532

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

533

if (leftChars[i++] == currentChar) return true;

534

}

535

}

536

const int rightIndex = inputIndex + 1;

537

if (rightIndex < inputLength) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

538

const int *rightChars = getInputCharsAt(rightIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

539

int i = 0;

540

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

541

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

547

548

// In the following function, c is the current character of the dictionary word

549

// currently examined.

550

// currentChars is an array containing the keys close to the character the

551

// user actually typed at the same position. We want to see if c is in it: if so,

552

// then the word contains at that position a character close to what the user

553

// typed.

554

// What the user typed is actually the first character of the array.

555

// Notice : accented characters do not have a proximity list, so they are alone

556

// in their list. The non-accented version of the character should be considered

557

// "close", but not the other keys close to the non-accented version.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

558

inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(

559

const int *currentChars, const unsigned short c, const int skipPos,

560

const int excessivePos, const int transposedPos) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

561

const unsigned short baseLowerC = toBaseLowerCase(c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

562

563

// The first char in the array is what user typed. If it matches right away,

564

// that means the user typed that same char for this pos.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

565

if (currentChars[0] == baseLowerC || currentChars[0] == c)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

566

return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;

567

568

// If one of those is true, we should not check for close characters at all.

569

if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)

570

return UNRELATED_CHAR;

571

572

// If the non-accented, lowercased version of that first character matches c,

573

// then we have a non-accented version of the accented character the user

574

// typed. Treat it as a close char.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

575

if (toBaseLowerCase(currentChars[0]) == baseLowerC)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

576

return NEAR_PROXIMITY_CHAR;

577

578

// Not an exact nor an accent-alike match: search the list of close keys

579

int j = 1;

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

580

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

581

const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

582

if (matched) return NEAR_PROXIMITY_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

583

++j;

584

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

585

586

// Was not included, signal this as an unrelated character.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

587

return UNRELATED_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

588

}

589

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

590

inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

591

const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

592

const int diffs, const int skipPos, const int excessivePos, const int transposedPos,

593

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

594

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

595

int *nextSiblingPosition) {

596

if (DEBUG_DICT) {

597

int inputCount = 0;

598

if (skipPos >= 0) ++inputCount;

599

if (excessivePos >= 0) ++inputCount;

600

if (transposedPos >= 0) ++inputCount;

601

assert(inputCount <= 1);

602

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

unsigned short c;

int childPosition;

bool terminal;

int freq;

satok

2011-01-27 16:25:16 +0900

[diff] [blame]

607

bool isSameAsUserTypedLength = false;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

608

satok

2011-01-27 16:25:16 +0900

[diff] [blame]

609

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

610

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

611

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,

612

&childPosition, &terminal, &freq);

613

614

const bool needsToTraverseChildrenNodes = childPosition != 0;

615

616

// If we are only doing traverseAllNodes, no need to look at the typed characters.

617

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

618

mWord[depth] = c;

619

if (traverseAllNodes && terminal) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

620

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, inputIndex, depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

621

matchWeight, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos,

622

freq);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

623

}

624

if (!needsToTraverseChildrenNodes) return false;

625

*newTraverseAllNodes = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

626

*newMatchRate = matchWeight;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

627

*newDiffs = diffs;

628

*newInputIndex = inputIndex;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

629

} else {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

630

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

631

632

if (transposedPos >= 0) {

633

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

634

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

635

}

636

637

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

638

transposedPos);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

639

if (UNRELATED_CHAR == matchedProximityCharId) return false;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

640

mWord[depth] = c;

641

// If inputIndex is greater than mInputLength, that means there is no

642

// proximity chars. So, we don't need to check proximity.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

643

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

644

matchWeight = matchWeight * TYPED_LETTER_MULTIPLIER;

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

645

}

satok

2011-01-27 16:25:16 +0900

[diff] [blame]

646

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

647

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

648

if (isSameAsUserTypedLength && terminal) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

649

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, matchWeight,

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

650

skipPos, excessivePos, transposedPos, freq);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

651

}

652

if (!needsToTraverseChildrenNodes) return false;

653

// Start traversing all nodes after the index exceeds the user typed length

654

*newTraverseAllNodes = isSameAsUserTypedLength;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

655

*newMatchRate = matchWeight;

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

656

*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

657

*newInputIndex = inputIndex + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

658

}

659

// Optimization: Prune out words that are too long compared to how much was typed.

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

660

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

satok

2011-01-27 16:25:16 +0900

[diff] [blame]

665

// TODO: Check if this can be isSameAsUserTypedLength only.

666

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

667

*newTraverseAllNodes = true;

668

}

669

// get the count of nodes and increment childAddress.

670

*newCount = Dictionary::getCount(DICT, &childPosition);

671

*newChildPosition = childPosition;

672

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

673

return needsToTraverseChildrenNodes;

674

}

675

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

676

inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,

677

unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

678

int pos = ROOT_POS;

679

int count = Dictionary::getCount(DICT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

680

int maxFreq = 0;

681

int depth = 0;

682

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

683

bool terminal = false;

684

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

685

mStackChildCount[0] = count;

686

mStackSiblingPos[0] = pos;

687

688

while (depth >= 0) {

689

if (mStackChildCount[depth] > 0) {

690

--mStackChildCount[depth];

691

int firstChildPos;

692

int newFreq;

693

int siblingPos = mStackSiblingPos[depth];

694

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

695

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

696

&siblingPos);

697

mStackSiblingPos[depth] = siblingPos;

698

if (depth == (inputLength - 1)) {

699

// Traverse sibling node

700

if (terminal) {

701

if (newFreq > maxFreq) {

702

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

703

if (DEBUG_DICT && DEBUG_NODE) {

704

char s[inputLength + 1];

705

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

706

s[inputLength] = 0;

707

LOGI("New missing space word found: %d > %d (%s), %d, %d",

708

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

714

// Traverse children nodes

715

++depth;

716

mStackChildCount[depth] = count;

717

mStackSiblingPos[depth] = firstChildPos;

718

}

719

} else {

720

// Traverse parent node

721

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

722

}

723

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

724

725

word[inputLength] = 0;

726

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

727

}

728

729

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

730

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

731

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

732

const int inputIndex = startInputIndex + depth;

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

733

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

734

unsigned short c;

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

735

*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,

736

newChildPosition, newTerminal, newFreq);

737

const unsigned int inputC = currentChars[0];

738

if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

739

const unsigned short baseLowerC = toBaseLowerCase(c);

740

const bool matched = (inputC == baseLowerC || inputC == c);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

741

const bool hasChild = *newChildPosition != 0;

742

if (matched) {

743

word[depth] = c;

744

if (DEBUG_DICT && DEBUG_NODE) {

745

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

746

if (*newTerminal) LOGI("Terminal %d", *newFreq);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

747

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

748

if (hasChild) {

749

*newCount = Dictionary::getCount(DICT, newChildPosition);

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

756

// Thus newTerminal shouldn't be true.

757

*newTerminal = false;

758

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

759

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

760

}

satok