Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <fcntl.h>

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

20

#include <stdio.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

21

#include <string.h>

22

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

23

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

24

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

25

#include "basechars.h"

26

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

27

#include "dictionary.h"

28

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

32

// Table of two-character sequences that isDigraph() recognises. Each entry is
// { first char, second char }; going by the table's name these are the "ae" / "oe" / "ue"
// spellings used in place of German umlauts.
const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = {
    { 'a', 'e' },
    { 'o', 'e' },
    { 'u', 'e' }
};

satok

2010-12-02 14:53:24 +0900

[diff] [blame]

37

// Stores the dictionary pointer and the tuning parameters; nothing is allocated or copied.
//
// dict                  - dictionary data; only the pointer is kept.
// typedLetterMultiplier - stored as TYPED_LETTER_MULTIPLIER.
// fullWordMultiplier    - stored as FULL_WORD_MULTIPLIER.
// maxWordLength         - stored as MAX_WORD_LENGTH.
// maxWords              - stored as MAX_WORDS.
// maxProximityChars     - stored as MAX_PROXIMITY_CHARS; also determines BYTES_IN_ONE_CHAR.
// isLatestDictVersion   - when true the root node is read after DICTIONARY_HEADER_SIZE bytes,
//                         otherwise from offset 0.
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
        const bool isLatestDictVersion)
    : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
    MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
    TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
    ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
    // Computed from the constructor argument rather than from the MAX_PROXIMITY_CHARS member:
    // members are initialised in declaration order, so reading another member here would only
    // be safe if the header happened to declare it first.
    BYTES_IN_ONE_CHAR(maxProximityChars * sizeof(*mInputCodes)),
    MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
    if (DEBUG_DICT) {
        LOGI("UnigramDictionary - constructor");
    }
}

50

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

51

// Empty on purpose: the constructor only stores the values it is handed, so there is nothing
// to release here.
UnigramDictionary::~UnigramDictionary() {
}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

52

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

53

// Returns the size IN BYTES of a buffer holding codesSize input characters, where every
// character occupies MAX_PROXIMITY_CHARS elements of the type codes points to.
// codes is only used for its element type; it is never dereferenced.
static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
        const int MAX_PROXIMITY_CHARS) {
    const unsigned int bytesPerChar = sizeof(codes[0]) * MAX_PROXIMITY_CHARS;
    return bytesPerChar * codesSize;
}

57

58

bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {

59

60

// There can't be a digraph if we don't have at least 2 characters to examine

61

if (i + 2 > codesSize) return false;

62

63

// Search for the first char of some digraph

64

int lastDigraphIndex = -1;

65

const int thisChar = codes[i * MAX_PROXIMITY_CHARS];

66

for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;

67

lastDigraphIndex >= 0; --lastDigraphIndex) {

68

if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;

69

}

70

// No match: return early

71

if (lastDigraphIndex < 0) return false;

72

73

// It's an interesting digraph if the second char matches too.

74

return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];

75

}

76

77

// Mostly the same arguments as the non-recursive version, except:

78

// codes is the original value. It points to the start of the work buffer, and gets passed as is.

79

// codesSize is the size of the user input (thus, it is the size of codesSrc).

80

// codesDest is the current point in the work buffer.

81

// codesSrc is the current point in the user-input, original, content-unmodified buffer.

82

// codesRemain is the remaining size in codesSrc.

83

void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,

84

const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,

85

const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,

satok

3c4bb77

2011-03-04 22:50:19 -0800

[diff] [blame]

86

const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

87

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

88

if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {

89

for (int i = 0; i < codesRemain; ++i) {

90

if (isDigraph(codesSrc, i, codesRemain)) {

91

// Found a digraph. We will try both spellings. eg. the word is "pruefen"

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

92

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

93

// Copy the word up to the first char of the digraph, then continue processing

94

// on the remaining part of the word, skipping the second char of the digraph.

95

// In our example, copy "pru" and continue running on "fen"

96

// Make i the index of the second char of the digraph for simplicity. Forgetting

97

// to do that results in an infinite recursion so take care!

98

++i;

99

memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);

100

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

101

codesBuffer, codesBufferSize, flags,

102

codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,

103

currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,

104

frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

105

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

106

// Copy the second char of the digraph in place, then continue processing on

107

// the remaining part of the word.

108

// In our example, after "pru" in the buffer copy the "e", and continue on "fen"

109

memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,

110

BYTES_IN_ONE_CHAR);

111

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

112

codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,

113

codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,

114

outWords, frequencies);

115

return;

116

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

}

}

// If we come here, we hit the end of the word: let's check it against the dictionary.

121

// In our example, we'll come here once for "prufen" and then once for "pruefen".

122

// If the word contains several digraphs, we'll come it for the product of them.

123

// eg. if the word is "ueberpruefen" we'll test, in order, against

124

// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".

125

const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;

126

if (0 != remainingBytes)

127

memcpy(codesDest, codesSrc, remainingBytes);

128

129

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

130

(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);

131

}

132

133

int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,

134

const int *ycoordinates, const int *codes, const int codesSize, const int flags,

135

unsigned short *outWords, int *frequencies) {

136

137

if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)

138

{ // Incrementally tune the word and try all possibilities

139

int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];

140

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

141

codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

142

} else { // Normal processing

143

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,

144

outWords, frequencies);

145

}

146

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

147

PROF_START(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

148

// Get the word count

149

int suggestedWordsCount = 0;

150

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

151

suggestedWordsCount++;

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

156

LOGI("Next letters: ");

157

for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {

158

if (mNextLettersFrequency[k] > 0) {

159

LOGI("%c = %d,", k, mNextLettersFrequency[k]);

160

}

161

}

162

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

163

PROF_END(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

164

PROF_CLOSE;

165

return suggestedWordsCount;

166

}

167

168

void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,

169

const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,

170

unsigned short *outWords, int *frequencies) {

171

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

172

PROF_OPEN;

173

PROF_START(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

174

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

175

if (DEBUG_DICT) assert(codesSize == mInputLength);

176

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

177

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

178

PROF_END(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

179

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

180

PROF_START(1);

Tadashi G. Takaoka

887f11e

2011-02-10 20:53:58 +0900

[diff] [blame]

181

getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

182

PROF_END(1);

183

184

PROF_START(2);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

185

// Suggestion with missing character

186

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

187

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

188

if (DEBUG_DICT) {

189

LOGI("--- Suggest missing characters %d", i);

190

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

191

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

192

}

193

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

194

PROF_END(2);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

195

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

196

PROF_START(3);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

197

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

198

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

199

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

200

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

201

if (DEBUG_DICT) {

202

LOGI("--- Suggest excessive characters %d", i);

203

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

204

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

205

}

206

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

207

PROF_END(3);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

208

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

209

PROF_START(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

210

// Suggestion with transposed characters

211

// Only suggest words that length is mInputLength

212

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

213

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

214

if (DEBUG_DICT) {

215

LOGI("--- Suggest transposed characters %d", i);

216

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

217

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

218

}

219

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

220

PROF_END(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

221

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

222

PROF_START(5);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

223

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

224

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

225

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

226

for (int i = 1; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

227

if (DEBUG_DICT) {

228

LOGI("--- Suggest missing space characters %d", i);

229

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

230

getMissingSpaceWords(mInputLength, i);

231

}

232

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

233

PROF_END(5);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

234

235

PROF_START(6);

236

if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY) {

237

// The first and last "mistyped spaces" are taken care of by excessive character handling

238

for (int i = 1; i < codesSize - 1; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

239

if (DEBUG_DICT) {

240

LOGI("--- Suggest words with proximity space %d", i);

241

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

242

const int x = xcoordinates[i];

243

const int y = ycoordinates[i];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

244

if (DEBUG_PROXIMITY_INFO) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

245

LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",

246

i, x, y, proximityInfo->hasSpaceProximity(x, y));

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

247

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

248

if (proximityInfo->hasSpaceProximity(x, y)) {

249

getMistypedSpaceWords(mInputLength, i);

250

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

251

}

252

}

253

PROF_END(6);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

254

}

255

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

256

void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,

257

unsigned short *outWords, int *frequencies) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

258

if (DEBUG_DICT) {

259

LOGI("initSuggest");

260

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

261

mFrequencies = frequencies;

262

mOutputChars = outWords;

263

mInputCodes = codes;

264

mInputLength = codesSize;

265

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

266

}

267

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

268

// Bumps the counter for character c in nextLetters. Characters that do not fit in the table
// (c >= nextLettersSize) are ignored, which also means nextLetters is never touched when
// nextLettersSize is 0.
void UnigramDictionary::registerNextLetter(
        unsigned short c, int *nextLetters, int nextLettersSize) {
    if (c >= nextLettersSize) {
        return;
    }
    ++nextLetters[c];
}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

275

// TODO: We need to optimize addWord by using STL or something

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

276

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

277

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

278

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

279

char s[length + 1];

280

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

281

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

282

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

283

if (length > MAX_WORD_LENGTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

284

if (DEBUG_DICT) {

285

LOGI("Exceeded max word length.");

286

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

287

return false;

288

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

289

290

// Find the right insertion point

291

int insertAt = 0;

292

while (insertAt < MAX_WORDS) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

293

if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency

294

&& length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

300

if (DEBUG_DICT) {

301

char s[length + 1];

302

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

303

LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

304

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

305

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

306

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

307

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

308

mFrequencies[insertAt] = frequency;

309

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

310

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

311

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

312

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

317

if (DEBUG_DICT) {

318

LOGI("Added word at %d", insertAt);

319

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

325

unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

326

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

327

c = BASE_CHARS[c];

328

}

329

if (c >='A' && c <= 'Z') {

330

c |= 32;

331

} else if (c > 127) {

332

c = latin_tolower(c);

}

return c;

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

337

bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

338

if (length != mInputLength) {

339

return false;

340

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

341

const int *inputCodes = mInputCodes;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

342

while (length--) {

343

if ((unsigned int) *inputCodes != (unsigned int) *word) {

344

return false;

345

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

346

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

352

// Apostrophe character.
static const char QUOTE = '\'';
// Separator placed between the two halves of a split-word suggestion.
static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

354

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

355

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

356

const int excessivePos, const int transposedPos, int *nextLetters,

357

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

358

if (DEBUG_DICT) {

359

LOGI("getSuggestionCandidates %d", maxDepth);

360

assert(transposedPos + 1 < mInputLength);

361

assert(excessivePos < mInputLength);

362

assert(missingPos < mInputLength);

363

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

364

int rootPosition = ROOT_POS;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

365

// Get the number of child of root, then increment the position

366

int childCount = Dictionary::getCount(DICT, &rootPosition);

367

int depth = 0;

368

369

mStackChildCount[0] = childCount;

370

mStackTraverseAll[0] = (mInputLength <= 0);

371

mStackNodeFreq[0] = 1;

372

mStackInputIndex[0] = 0;

373

mStackDiffs[0] = 0;

374

mStackSiblingPos[0] = rootPosition;

375

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

376

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

377

while (depth >= 0) {

378

if (mStackChildCount[depth] > 0) {

379

--mStackChildCount[depth];

380

bool traverseAllNodes = mStackTraverseAll[depth];

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

381

int matchWeight = mStackNodeFreq[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

382

int inputIndex = mStackInputIndex[depth];

383

int diffs = mStackDiffs[depth];

384

int siblingPos = mStackSiblingPos[depth];

385

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

386

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

387

// needsToTraverseChildrenNodes should be false

388

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

389

maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,

390

excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,

391

&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,

392

&siblingPos);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

393

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

394

mStackSiblingPos[depth] = siblingPos;

395

if (needsToTraverseChildrenNodes) {

396

// Goes to child node

397

++depth;

398

mStackChildCount[depth] = childCount;

399

mStackTraverseAll[depth] = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

400

mStackNodeFreq[depth] = matchWeight;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

401

mStackInputIndex[depth] = inputIndex;

402

mStackDiffs[depth] = diffs;

403

mStackSiblingPos[depth] = firstChildPos;

404

}

405

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

406

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

412

// Smallest value whose product with 255 is treated as reaching the cap.
static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;

// Returns 255 * num, or S_INT_MAX when num is TWO_31ST_DIV_255 or larger.
static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
    if (num >= TWO_31ST_DIV_255) {
        return S_INT_MAX;
    }
    return 255 * num;
}

416

417

// Largest value that can be doubled without exceeding S_INT_MAX.
static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;

// Multiplies *base by multiplier in place, saturating at S_INT_MAX.
//
// - A *base that already equals S_INT_MAX is left untouched.
// - The result is S_INT_MAX whenever the exact product is larger than S_INT_MAX, and also
//   whenever the product is smaller than the original *base (for instance with a multiplier
//   of zero or a negative multiplier), which is what the previous overflow test did as well.
//
// The product is computed in 64 bits: multiplying two ints first and checking afterwards is
// undefined behaviour on overflow, and a wrapped result can still be larger than the original
// value, so that check let some overflows through.
inline static void multiplyIntCapped(const int multiplier, int *base) {
    const int temp = *base;
    if (temp == S_INT_MAX) {
        return;
    }
    // Branch if multiplier == 2 for the optimization
    if (multiplier == 2) {
        *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
        return;
    }
    const long long product = static_cast<long long>(temp) * multiplier;
    const bool fits = product >= temp && product <= S_INT_MAX;
    *base = fits ? static_cast<int>(product) : S_INT_MAX;
}

inline static int powerIntCapped(const int base, const int n) {

satok

0b6b0a5

2011-04-27 16:29:27 +0900

[diff] [blame]

432

if (base == 2) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

433

return n < 31 ? 1 << n : S_INT_MAX;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

434

} else {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

435

int ret = base;

436

for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);

return ret;

}

}

inline static void multiplyRate(const int rate, int *freq) {

442

if (*freq != S_INT_MAX) {

443

if (*freq > 1000000) {

444

*freq /= 100;

445

multiplyIntCapped(rate, freq);

446

} else {

447

multiplyIntCapped(rate, freq);

448

*freq /= 100;

449

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

}

}

satok

2011-04-19 13:58:42 +0900

[diff] [blame]

453

inline static int calcFreqForSplitTwoWords(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

454

const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,

455

const int firstFreq, const int secondFreq, const bool isSpaceProximity) {

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

456

if (firstWordLength == 0 || secondWordLength == 0) {

457

return 0;

458

}

459

const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);

460

int tempFirstFreq = firstFreq;

461

multiplyRate(firstDemotionRate, &tempFirstFreq);

462

463

const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);

464

int tempSecondFreq = secondFreq;

465

multiplyRate(secondDemotionRate, &tempSecondFreq);

466

467

const int totalLength = firstWordLength + secondWordLength;

468

469

// Promote pairFreq with multiplying by 2, because the word length is the same as the typed

470

// length.

471

int totalFreq = tempFirstFreq + tempSecondFreq;

472

473

// This is a workaround to try offsetting the not-enough-demotion which will be done in

474

// calcNormalizedScore in Utils.java.

475

// In calcNormalizedScore the score will be demoted by (1 - 1 / length)

476

// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by

477

// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))

478

const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);

479

multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

480

481

// At this moment, totalFreq is calculated by the following formula:

482

// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))

483

// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

484

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

485

multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

486

487

// This is another workaround to offset the demotion which will be done in

488

// calcNormalizedScore in Utils.java.

489

// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote

490

// the same amount because we already have adjusted the synthetic freq of this "missing or

491

// mistyped space" suggestion candidate above in this method.

492

const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);

493

multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

494

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

495

if (isSpaceProximity) {

496

// A word pair with one space proximity correction

497

if (DEBUG_DICT) {

498

LOGI("Found a word pair with space proximity correction.");

499

}

500

multiplyIntCapped(typedLetterMultiplier, &totalFreq);

501

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);

502

}

503

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

504

multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

return totalFreq;

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

508

bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,

509

const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

510

const int secondWordLength, const bool isSpaceProximity) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

511

if (inputLength >= MAX_WORD_LENGTH) return false;

512

if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos

satok

3c4bb77

2011-03-04 22:50:19 -0800

[diff] [blame]

513

|| firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

514

return false;

515

const int newWordLength = firstWordLength + secondWordLength + 1;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

516

// Allocating variable length array on stack

517

unsigned short word[newWordLength];

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

518

const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

519

if (DEBUG_DICT) {

520

LOGI("First freq: %d", firstFreq);

521

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

522

if (firstFreq <= 0) return false;

523

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

524

for (int i = 0; i < firstWordLength; ++i) {

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

525

word[i] = mWord[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

526

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

527

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

528

const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

529

if (DEBUG_DICT) {

530

LOGI("Second freq: %d", secondFreq);

531

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

532

if (secondFreq <= 0) return false;

533

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

534

word[firstWordLength] = SPACE;

535

for (int i = (firstWordLength + 1); i < newWordLength; ++i) {

536

word[i] = mWord[i - firstWordLength - 1];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

537

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

538

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

539

int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,

540

secondWordLength, firstFreq, secondFreq, isSpaceProximity);

satok

a4374d2

2011-04-18 11:40:22 +0900

[diff] [blame]

541

if (DEBUG_DICT) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

542

LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,

satok

a4374d2

2011-04-18 11:40:22 +0900

[diff] [blame]

543

TYPED_LETTER_MULTIPLIER);

544

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

545

addWord(word, newWordLength, pairFreq);

return true;

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

549

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

550

return getSplitTwoWordsSuggestion(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

551

inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

552

}

553

554

bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {

555

return getSplitTwoWordsSuggestion(

556

inputLength, 0, spaceProximityPos, spaceProximityPos + 1,

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

557

inputLength - spaceProximityPos - 1, true);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

558

}

559

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

560

// Keep this for comparing spec to new getWords

561

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

562

const int excessivePos, const int transposedPos,int *nextLetters,

563

const int nextLettersSize) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

564

int initialPosition = initialPos;

565

const int count = Dictionary::getCount(DICT, &initialPosition);

566

getWordsRec(count, initialPosition, 0,

567

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

568

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

569

nextLettersSize);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

570

}

571

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

572

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

573

const int maxDepth, const bool traverseAllNodes, const int matchWeight,

574

const int inputIndex, const int diffs, const int skipPos, const int excessivePos,

575

const int transposedPos, int *nextLetters, const int nextLettersSize) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

576

int siblingPos = pos;

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

577

for (int i = 0; i < childrenCount; ++i) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

578

int newCount;

579

int newChildPosition;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

580

const int newDepth = depth + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

581

bool newTraverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

582

int newMatchRate;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

int newInputIndex;

int newDiffs;

int newSiblingPos;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

587

traverseAllNodes, matchWeight, inputIndex, diffs,

588

skipPos, excessivePos, transposedPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

589

nextLetters, nextLettersSize,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

590

&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

591

&newInputIndex, &newDiffs, &newSiblingPos);

592

siblingPos = newSiblingPos;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

593

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

594

if (needsToTraverseChildrenNodes) {

595

getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

596

newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

597

nextLetters, nextLettersSize);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

}

}

}

satok

2011-01-27 03:23:39 +0900

[diff] [blame]

602

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

603

const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

604

const int freq, const bool sameLength) const {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

605

// TODO: Demote by edit distance

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

606

int finalFreq = freq * matchWeight;

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

607

if (skipPos >= 0) {

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

608

if (mInputLength >= 2) {

609

const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE

610

* (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)

611

/ (10 * mInputLength

612

- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

613

if (DEBUG_DICT_FULL) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

614

LOGI("Demotion rate for missing character is %d.", demotionRate);

615

}

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

616

multiplyRate(demotionRate, &finalFreq);

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

} else {

finalFreq = 0;

}

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

621

if (transposedPos >= 0) multiplyRate(

622

WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

623

if (excessivePos >= 0) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

624

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

625

if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

626

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

627

}

628

}

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

629

int lengthFreq = TYPED_LETTER_MULTIPLIER;

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

630

multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

631

if (lengthFreq == matchWeight) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

632

// Full exact match

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

633

if (depth > 1) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

634

if (DEBUG_DICT) {

635

LOGI("Found full matched word.");

636

}

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

637

multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);

638

}

639

if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

640

finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

641

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

642

} else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

643

// A word with proximity corrections

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

644

if (DEBUG_DICT) {

645

LOGI("Found one proximity correction.");

646

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

647

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

648

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

649

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

650

if (DEBUG_DICT) {

651

LOGI("calc: %d, %d", depth, sameLength);

652

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

653

if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

654

return finalFreq;

655

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

656

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

657

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

658

unsigned short *word, const int inputIndex, const int depth, const int matchWeight,

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

659

int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,

660

const int transposedPos, const int freq) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

661

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, excessivePos,

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

662

transposedPos, freq, false);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

663

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

664

if (depth >= mInputLength && skipPos < 0) {

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

665

registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);

}

}

inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

670

unsigned short *word, const int inputIndex, const int depth, const int matchWeight,

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

671

const int skipPos, const int excessivePos, const int transposedPos, const int freq) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

672

if (sameAsTyped(word, depth + 1)) return;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

673

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

674

excessivePos, transposedPos, freq, true);

675

// Proximity collection will promote a word of the same length as what user typed.

676

if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

677

}

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

678

679

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

680

const int inputIndex, const int skipPos, const int depth) {

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

681

const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

682

// Skip the ' or other letter and continue deeper

683

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

684

}

685

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

686

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

687

const int inputLength) const {

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

688

if (inputIndex < 0 || inputIndex >= inputLength) return false;

689

const int currentChar = *getInputCharsAt(inputIndex);

690

const int leftIndex = inputIndex - 1;

691

if (leftIndex >= 0) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

692

const int *leftChars = getInputCharsAt(leftIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

693

int i = 0;

694

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

695

if (leftChars[i++] == currentChar) return true;

696

}

697

}

698

const int rightIndex = inputIndex + 1;

699

if (rightIndex < inputLength) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

700

const int *rightChars = getInputCharsAt(rightIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

701

int i = 0;

702

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

703

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

709

710

// In the following function, c is the current character of the dictionary word

711

// currently examined.

712

// currentChars is an array containing the keys close to the character the

713

// user actually typed at the same position. We want to see if c is in it: if so,

714

// then the word contains at that position a character close to what the user

715

// typed.

716

// What the user typed is actually the first character of the array.

717

// Notice : accented characters do not have a proximity list, so they are alone

718

// in their list. The non-accented version of the character should be considered

719

// "close", but not the other keys close to the non-accented version.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

720

inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(

721

const int *currentChars, const unsigned short c, const int skipPos,

722

const int excessivePos, const int transposedPos) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

723

const unsigned short baseLowerC = toBaseLowerCase(c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

724

725

// The first char in the array is what user typed. If it matches right away,

726

// that means the user typed that same char for this pos.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

727

if (currentChars[0] == baseLowerC || currentChars[0] == c)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

728

return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;

729

730

// If one of those is true, we should not check for close characters at all.

731

if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)

732

return UNRELATED_CHAR;

733

734

// If the non-accented, lowercased version of that first character matches c,

735

// then we have a non-accented version of the accented character the user

736

// typed. Treat it as a close char.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

737

if (toBaseLowerCase(currentChars[0]) == baseLowerC)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

738

return NEAR_PROXIMITY_CHAR;

739

740

// Not an exact nor an accent-alike match: search the list of close keys

741

int j = 1;

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

742

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

743

const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

744

if (matched) return NEAR_PROXIMITY_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

745

++j;

746

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

747

748

// Was not included, signal this as an unrelated character.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

749

return UNRELATED_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

750

}

751

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

752

inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

753

const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

754

const int diffs, const int skipPos, const int excessivePos, const int transposedPos,

755

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

756

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

757

int *nextSiblingPosition) {

758

if (DEBUG_DICT) {

759

int inputCount = 0;

760

if (skipPos >= 0) ++inputCount;

761

if (excessivePos >= 0) ++inputCount;

762

if (transposedPos >= 0) ++inputCount;

763

assert(inputCount <= 1);

764

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

unsigned short c;

int childPosition;

bool terminal;

int freq;

satok

2011-01-27 16:25:16 +0900

[diff] [blame]

769

bool isSameAsUserTypedLength = false;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

770

satok

2011-01-27 16:25:16 +0900

[diff] [blame]

771

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

772

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

773

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,

774

&childPosition, &terminal, &freq);

775

776

const bool needsToTraverseChildrenNodes = childPosition != 0;

777

778

// If we are only doing traverseAllNodes, no need to look at the typed characters.

779

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

780

mWord[depth] = c;

781

if (traverseAllNodes && terminal) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

782

onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, inputIndex, depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

783

matchWeight, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos,

784

freq);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

785

}

786

if (!needsToTraverseChildrenNodes) return false;

787

*newTraverseAllNodes = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

788

*newMatchRate = matchWeight;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

789

*newDiffs = diffs;

790

*newInputIndex = inputIndex;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

791

} else {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

792

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

793

794

if (transposedPos >= 0) {

795

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

796

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

797

}

798

799

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

800

transposedPos);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

801

if (UNRELATED_CHAR == matchedProximityCharId) return false;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

802

mWord[depth] = c;

803

// If inputIndex is greater than mInputLength, that means there is no

804

// proximity chars. So, we don't need to check proximity.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

805

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

806

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

807

}

satok

2011-01-27 16:25:16 +0900

[diff] [blame]

808

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

809

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

810

if (isSameAsUserTypedLength && terminal) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

811

onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, matchWeight,

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

812

skipPos, excessivePos, transposedPos, freq);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

813

}

814

if (!needsToTraverseChildrenNodes) return false;

815

// Start traversing all nodes after the index exceeds the user typed length

816

*newTraverseAllNodes = isSameAsUserTypedLength;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

817

*newMatchRate = matchWeight;

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

818

*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

819

*newInputIndex = inputIndex + 1;

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

820

}

821

// Optimization: Prune out words that are too long compared to how much was typed.

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

822

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

satok

2011-01-27 16:25:16 +0900

[diff] [blame]

827

// TODO: Check if this can be isSameAsUserTypedLength only.

828

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

829

*newTraverseAllNodes = true;

830

}

831

// get the count of nodes and increment childAddress.

832

*newCount = Dictionary::getCount(DICT, &childPosition);

833

*newChildPosition = childPosition;

834

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

835

return needsToTraverseChildrenNodes;

836

}

837

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

838

inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,

839

unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

840

int pos = ROOT_POS;

841

int count = Dictionary::getCount(DICT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

842

int maxFreq = 0;

843

int depth = 0;

844

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

845

bool terminal = false;

846

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

847

mStackChildCount[0] = count;

848

mStackSiblingPos[0] = pos;

849

850

while (depth >= 0) {

851

if (mStackChildCount[depth] > 0) {

852

--mStackChildCount[depth];

853

int firstChildPos;

854

int newFreq;

855

int siblingPos = mStackSiblingPos[depth];

856

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

857

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

858

&siblingPos);

859

mStackSiblingPos[depth] = siblingPos;

860

if (depth == (inputLength - 1)) {

861

// Traverse sibling node

862

if (terminal) {

863

if (newFreq > maxFreq) {

864

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

865

if (DEBUG_DICT && DEBUG_NODE) {

866

char s[inputLength + 1];

867

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

868

s[inputLength] = 0;

869

LOGI("New missing space word found: %d > %d (%s), %d, %d",

870

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

876

// Traverse children nodes

877

++depth;

878

mStackChildCount[depth] = count;

879

mStackSiblingPos[depth] = firstChildPos;

880

}

881

} else {

882

// Traverse parent node

883

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

884

}

885

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

886

887

word[inputLength] = 0;

888

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

889

}

890

891

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

892

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

893

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

894

const int inputIndex = startInputIndex + depth;

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

895

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

896

unsigned short c;

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

897

*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,

898

newChildPosition, newTerminal, newFreq);

899

const unsigned int inputC = currentChars[0];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

900

if (DEBUG_DICT) {

901

assert(inputC <= U_SHORT_MAX);

902

}

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

903

const unsigned short baseLowerC = toBaseLowerCase(c);

904

const bool matched = (inputC == baseLowerC || inputC == c);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

905

const bool hasChild = *newChildPosition != 0;

906

if (matched) {

907

word[depth] = c;

908

if (DEBUG_DICT && DEBUG_NODE) {

909

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

910

if (*newTerminal) {

911

LOGI("Terminal %d", *newFreq);

912

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

913

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

914

if (hasChild) {

915

*newCount = Dictionary::getCount(DICT, newChildPosition);

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

922

// Thus newTerminal shouldn't be true.

923

*newTerminal = false;

924

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

925

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

926

}

satok