Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <string.h>

20

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

21

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

22

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

23

#include "basechars.h"

24

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

25

#include "dictionary.h"

26

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

30

const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =

{ { 'a', 'e' },

{ 'o', 'e' },

{ 'u', 'e' } };

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

35

// TODO: check the header

36

UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

37

int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

38

const bool isLatestDictVersion)

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

39

: DICT_ROOT(streamStart),

40

MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

41

MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),

42

TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

43

ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

44

BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),

45

MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

46

if (DEBUG_DICT) {

47

LOGI("UnigramDictionary - constructor");

48

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

49

}

50

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

51

UnigramDictionary::~UnigramDictionary() {}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

52

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

53

static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,

54

const int MAX_PROXIMITY_CHARS) {

55

return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;

56

}

57

58

bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {

59

60

// There can't be a digraph if we don't have at least 2 characters to examine

61

if (i + 2 > codesSize) return false;

62

63

// Search for the first char of some digraph

64

int lastDigraphIndex = -1;

65

const int thisChar = codes[i * MAX_PROXIMITY_CHARS];

66

for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;

67

lastDigraphIndex >= 0; --lastDigraphIndex) {

68

if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;

69

}

70

// No match: return early

71

if (lastDigraphIndex < 0) return false;

72

73

// It's an interesting digraph if the second char matches too.

74

return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];

75

}

76

77

// Mostly the same arguments as the non-recursive version, except:

78

// codes is the original value. It points to the start of the work buffer, and gets passed as is.

79

// codesSize is the size of the user input (thus, it is the size of codesSrc).

80

// codesDest is the current point in the work buffer.

81

// codesSrc is the current point in the user-input, original, content-unmodified buffer.

82

// codesRemain is the remaining size in codesSrc.

83

void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,

84

const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,

85

const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,

satok

3c4bb77

2011-03-04 22:50:19 -0800

[diff] [blame]

86

const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

87

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

88

if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {

89

for (int i = 0; i < codesRemain; ++i) {

90

if (isDigraph(codesSrc, i, codesRemain)) {

91

// Found a digraph. We will try both spellings. eg. the word is "pruefen"

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

92

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

93

// Copy the word up to the first char of the digraph, then continue processing

94

// on the remaining part of the word, skipping the second char of the digraph.

95

// In our example, copy "pru" and continue running on "fen"

96

// Make i the index of the second char of the digraph for simplicity. Forgetting

97

// to do that results in an infinite recursion so take care!

98

++i;

99

memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);

100

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

101

codesBuffer, codesBufferSize, flags,

102

codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,

103

currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,

104

frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

105

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

106

// Copy the second char of the digraph in place, then continue processing on

107

// the remaining part of the word.

108

// In our example, after "pru" in the buffer copy the "e", and continue on "fen"

109

memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,

110

BYTES_IN_ONE_CHAR);

111

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

112

codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,

113

codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,

114

outWords, frequencies);

115

return;

116

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

}

}

// If we come here, we hit the end of the word: let's check it against the dictionary.

121

// In our example, we'll come here once for "prufen" and then once for "pruefen".

122

// If the word contains several digraphs, we'll come it for the product of them.

123

// eg. if the word is "ueberpruefen" we'll test, in order, against

124

// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".

125

const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;

126

if (0 != remainingBytes)

127

memcpy(codesDest, codesSrc, remainingBytes);

128

129

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

130

(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);

131

}

132

133

int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,

134

const int *ycoordinates, const int *codes, const int codesSize, const int flags,

135

unsigned short *outWords, int *frequencies) {

136

137

if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)

138

{ // Incrementally tune the word and try all possibilities

139

int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];

140

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

141

codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

142

} else { // Normal processing

143

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,

144

outWords, frequencies);

145

}

146

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

147

PROF_START(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

148

// Get the word count

149

int suggestedWordsCount = 0;

150

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

151

suggestedWordsCount++;

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

156

/// Print the returned words

157

for (int j = 0; j < suggestedWordsCount; ++j) {

158

short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;

159

char s[MAX_WORD_LENGTH];

160

for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];

161

LOGI("%s %i", s, mFrequencies[j]);

162

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

163

LOGI("Next letters: ");

164

for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {

165

if (mNextLettersFrequency[k] > 0) {

166

LOGI("%c = %d,", k, mNextLettersFrequency[k]);

167

}

168

}

169

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

170

PROF_END(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

171

PROF_CLOSE;

172

return suggestedWordsCount;

173

}

174

175

void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,

176

const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,

177

unsigned short *outWords, int *frequencies) {

178

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

179

PROF_OPEN;

180

PROF_START(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

181

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

182

if (DEBUG_DICT) assert(codesSize == mInputLength);

183

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

184

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

185

PROF_END(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

186

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

187

PROF_START(1);

Tadashi G. Takaoka

887f11e

2011-02-10 20:53:58 +0900

[diff] [blame]

188

getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

189

PROF_END(1);

190

191

PROF_START(2);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

192

// Suggestion with missing character

193

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

194

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

195

if (DEBUG_DICT) {

196

LOGI("--- Suggest missing characters %d", i);

197

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

198

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

199

}

200

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

201

PROF_END(2);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

202

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

203

PROF_START(3);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

204

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

205

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

206

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

207

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

208

if (DEBUG_DICT) {

209

LOGI("--- Suggest excessive characters %d", i);

210

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

211

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

212

}

213

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

214

PROF_END(3);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

215

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

216

PROF_START(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

217

// Suggestion with transposed characters

218

// Only suggest words that length is mInputLength

219

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

220

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

221

if (DEBUG_DICT) {

222

LOGI("--- Suggest transposed characters %d", i);

223

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

224

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

225

}

226

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

227

PROF_END(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

228

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

229

PROF_START(5);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

230

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

231

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

232

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

233

for (int i = 1; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

234

if (DEBUG_DICT) {

235

LOGI("--- Suggest missing space characters %d", i);

236

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

237

getMissingSpaceWords(mInputLength, i);

238

}

239

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

240

PROF_END(5);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

241

242

PROF_START(6);

Jean Chalard

e93b1f22

2011-06-01 17:12:25 +0900

[diff] [blame]

243

if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

244

// The first and last "mistyped spaces" are taken care of by excessive character handling

245

for (int i = 1; i < codesSize - 1; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

246

if (DEBUG_DICT) {

247

LOGI("--- Suggest words with proximity space %d", i);

248

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

249

const int x = xcoordinates[i];

250

const int y = ycoordinates[i];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

251

if (DEBUG_PROXIMITY_INFO) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

252

LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",

253

i, x, y, proximityInfo->hasSpaceProximity(x, y));

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

254

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

255

if (proximityInfo->hasSpaceProximity(x, y)) {

256

getMistypedSpaceWords(mInputLength, i);

257

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

258

}

259

}

260

PROF_END(6);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

261

}

262

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

263

void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,

264

unsigned short *outWords, int *frequencies) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

265

if (DEBUG_DICT) {

266

LOGI("initSuggest");

267

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

268

mFrequencies = frequencies;

269

mOutputChars = outWords;

270

mInputCodes = codes;

271

mInputLength = codesSize;

272

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

273

}

274

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

275

static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

276

if (c < nextLettersSize) {

nextLetters[c]++;

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

281

// TODO: We need to optimize addWord by using STL or something

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

282

// TODO: This needs to take an const unsigned short* and not tinker with its contents

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

283

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

284

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

285

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

286

char s[length + 1];

287

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

288

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

289

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

290

if (length > MAX_WORD_LENGTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

291

if (DEBUG_DICT) {

292

LOGI("Exceeded max word length.");

293

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

294

return false;

295

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

296

297

// Find the right insertion point

298

int insertAt = 0;

299

while (insertAt < MAX_WORDS) {

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

300

// TODO: How should we sort words with the same frequency?

301

if (frequency > mFrequencies[insertAt]) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

307

if (DEBUG_DICT) {

308

char s[length + 1];

309

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

310

LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

311

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

312

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

313

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

314

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

315

mFrequencies[insertAt] = frequency;

316

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

317

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

318

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

319

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

324

if (DEBUG_DICT) {

325

LOGI("Added word at %d", insertAt);

326

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

332

static inline unsigned short toBaseLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

333

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

334

c = BASE_CHARS[c];

335

}

336

if (c >='A' && c <= 'Z') {

337

c |= 32;

338

} else if (c > 127) {

339

c = latin_tolower(c);

}

return c;

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

344

bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

345

if (length != mInputLength) {

346

return false;

347

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

348

const int *inputCodes = mInputCodes;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

349

while (length--) {

350

if ((unsigned int) *inputCodes != (unsigned int) *word) {

351

return false;

352

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

353

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

359

static const char QUOTE = '\'';

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

360

static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

361

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

362

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

363

const int excessivePos, const int transposedPos, int *nextLetters,

364

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

365

if (DEBUG_DICT) {

366

LOGI("getSuggestionCandidates %d", maxDepth);

367

assert(transposedPos + 1 < mInputLength);

368

assert(excessivePos < mInputLength);

369

assert(missingPos < mInputLength);

370

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

371

int rootPosition = ROOT_POS;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

372

// Get the number of children of root, then increment the position

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

373

int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

374

int depth = 0;

375

376

mStackChildCount[0] = childCount;

377

mStackTraverseAll[0] = (mInputLength <= 0);

378

mStackNodeFreq[0] = 1;

379

mStackInputIndex[0] = 0;

380

mStackDiffs[0] = 0;

381

mStackSiblingPos[0] = rootPosition;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

382

mStackOutputIndex[0] = 0;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

383

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

384

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

385

while (depth >= 0) {

386

if (mStackChildCount[depth] > 0) {

387

--mStackChildCount[depth];

388

bool traverseAllNodes = mStackTraverseAll[depth];

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

389

int matchWeight = mStackNodeFreq[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

390

int inputIndex = mStackInputIndex[depth];

391

int diffs = mStackDiffs[depth];

392

int siblingPos = mStackSiblingPos[depth];

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

393

int outputIndex = mStackOutputIndex[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

394

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

395

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

396

// needsToTraverseChildrenNodes should be false

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

397

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

398

maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,

399

excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,

400

&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

401

&siblingPos, &outputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

402

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

403

mStackSiblingPos[depth] = siblingPos;

404

if (needsToTraverseChildrenNodes) {

405

// Goes to child node

406

++depth;

407

mStackChildCount[depth] = childCount;

408

mStackTraverseAll[depth] = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

409

mStackNodeFreq[depth] = matchWeight;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

410

mStackInputIndex[depth] = inputIndex;

411

mStackDiffs[depth] = diffs;

412

mStackSiblingPos[depth] = firstChildPos;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

413

mStackOutputIndex[depth] = outputIndex;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

414

}

415

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

416

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

422

static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;

423

static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {

424

return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);

425

}

426

427

static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;

428

inline static void multiplyIntCapped(const int multiplier, int *base) {

429

const int temp = *base;

430

if (temp != S_INT_MAX) {

431

// Branch if multiplier == 2 for the optimization

432

if (multiplier == 2) {

433

*base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;

434

} else {

435

const int tempRetval = temp * multiplier;

436

*base = tempRetval >= temp ? tempRetval : S_INT_MAX;

}

}

}

inline static int powerIntCapped(const int base, const int n) {

satok

0b6b0a5

2011-04-27 16:29:27 +0900

[diff] [blame]

442

if (base == 2) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

443

return n < 31 ? 1 << n : S_INT_MAX;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

444

} else {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

445

int ret = base;

446

for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);

return ret;

}

}

inline static void multiplyRate(const int rate, int *freq) {

452

if (*freq != S_INT_MAX) {

453

if (*freq > 1000000) {

454

*freq /= 100;

455

multiplyIntCapped(rate, freq);

456

} else {

457

multiplyIntCapped(rate, freq);

458

*freq /= 100;

459

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

}

}

satok

2011-04-19 13:58:42 +0900

[diff] [blame]

463

inline static int calcFreqForSplitTwoWords(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

464

const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,

465

const int firstFreq, const int secondFreq, const bool isSpaceProximity) {

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

466

if (firstWordLength == 0 || secondWordLength == 0) {

467

return 0;

468

}

469

const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);

470

int tempFirstFreq = firstFreq;

471

multiplyRate(firstDemotionRate, &tempFirstFreq);

472

473

const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);

474

int tempSecondFreq = secondFreq;

475

multiplyRate(secondDemotionRate, &tempSecondFreq);

476

477

const int totalLength = firstWordLength + secondWordLength;

478

479

// Promote pairFreq with multiplying by 2, because the word length is the same as the typed

480

// length.

481

int totalFreq = tempFirstFreq + tempSecondFreq;

482

483

// This is a workaround to try offsetting the not-enough-demotion which will be done in

484

// calcNormalizedScore in Utils.java.

485

// In calcNormalizedScore the score will be demoted by (1 - 1 / length)

486

// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by

487

// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))

488

const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);

489

multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

490

491

// At this moment, totalFreq is calculated by the following formula:

492

// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))

493

// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

494

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

495

multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

496

497

// This is another workaround to offset the demotion which will be done in

498

// calcNormalizedScore in Utils.java.

499

// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote

500

// the same amount because we already have adjusted the synthetic freq of this "missing or

501

// mistyped space" suggestion candidate above in this method.

502

const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);

503

multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

504

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

505

if (isSpaceProximity) {

506

// A word pair with one space proximity correction

507

if (DEBUG_DICT) {

508

LOGI("Found a word pair with space proximity correction.");

509

}

510

multiplyIntCapped(typedLetterMultiplier, &totalFreq);

511

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);

512

}

513

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

514

multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

return totalFreq;

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

518

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

519

return getSplitTwoWordsSuggestion(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

520

inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

521

}

522

523

bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {

524

return getSplitTwoWordsSuggestion(

525

inputLength, 0, spaceProximityPos, spaceProximityPos + 1,

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

526

inputLength - spaceProximityPos - 1, true);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

527

}

528

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

529

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

530

const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

531

const int freq, const bool sameLength) const {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

532

// TODO: Demote by edit distance

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

533

int finalFreq = freq * matchWeight;

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

534

if (skipPos >= 0) {

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

535

if (mInputLength >= 2) {

536

const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE

537

* (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)

538

/ (10 * mInputLength

539

- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

540

if (DEBUG_DICT_FULL) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

541

LOGI("Demotion rate for missing character is %d.", demotionRate);

542

}

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

543

multiplyRate(demotionRate, &finalFreq);

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

} else {

finalFreq = 0;

}

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

548

if (transposedPos >= 0) multiplyRate(

549

WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

550

if (excessivePos >= 0) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

551

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

552

if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

553

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

554

}

555

}

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

556

int lengthFreq = TYPED_LETTER_MULTIPLIER;

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

557

multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

558

if (lengthFreq == matchWeight) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

559

// Full exact match

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

560

if (depth > 1) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

561

if (DEBUG_DICT) {

562

LOGI("Found full matched word.");

563

}

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

564

multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);

565

}

566

if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

567

finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

568

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

569

} else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

570

// A word with proximity corrections

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

571

if (DEBUG_DICT) {

572

LOGI("Found one proximity correction.");

573

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

574

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

575

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

576

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

577

if (DEBUG_DICT) {

578

LOGI("calc: %d, %d", depth, sameLength);

579

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

580

if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

581

return finalFreq;

582

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

583

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

584

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

585

const int inputIndex, const int skipPos, const int depth) {

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

586

const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

587

// Skip the ' or other letter and continue deeper

588

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

589

}

590

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

591

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

592

const int inputLength) const {

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

593

if (inputIndex < 0 || inputIndex >= inputLength) return false;

594

const int currentChar = *getInputCharsAt(inputIndex);

595

const int leftIndex = inputIndex - 1;

596

if (leftIndex >= 0) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

597

const int *leftChars = getInputCharsAt(leftIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

598

int i = 0;

599

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

600

if (leftChars[i++] == currentChar) return true;

601

}

602

}

603

const int rightIndex = inputIndex + 1;

604

if (rightIndex < inputLength) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

605

const int *rightChars = getInputCharsAt(rightIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

606

int i = 0;

607

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

608

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

614

// In the following function, c is the current character of the dictionary word

615

// currently examined.

616

// currentChars is an array containing the keys close to the character the

617

// user actually typed at the same position. We want to see if c is in it: if so,

618

// then the word contains at that position a character close to what the user

619

// typed.

620

// What the user typed is actually the first character of the array.

621

// Notice : accented characters do not have a proximity list, so they are alone

622

// in their list. The non-accented version of the character should be considered

623

// "close", but not the other keys close to the non-accented version.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

624

inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(

625

const int *currentChars, const unsigned short c, const int skipPos,

626

const int excessivePos, const int transposedPos) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

627

const unsigned short baseLowerC = toBaseLowerCase(c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

628

629

// The first char in the array is what user typed. If it matches right away,

630

// that means the user typed that same char for this pos.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

631

if (currentChars[0] == baseLowerC || currentChars[0] == c)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

632

return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;

633

634

// If one of those is true, we should not check for close characters at all.

635

if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)

636

return UNRELATED_CHAR;

637

638

// If the non-accented, lowercased version of that first character matches c,

639

// then we have a non-accented version of the accented character the user

640

// typed. Treat it as a close char.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

641

if (toBaseLowerCase(currentChars[0]) == baseLowerC)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

642

return NEAR_PROXIMITY_CHAR;

643

644

// Not an exact nor an accent-alike match: search the list of close keys

645

int j = 1;

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

646

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

647

const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

648

if (matched) return NEAR_PROXIMITY_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

649

++j;

650

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

651

652

// Was not included, signal this as an unrelated character.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

653

return UNRELATED_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

654

}

655

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

656

inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

657

const uint8_t* const root, const uint8_t flags, const int pos,

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

658

const int inputIndex, const int matchWeight, const int skipPos,

659

const int excessivePos, const int transposedPos, const int freq, const bool sameLength,

660

int* nextLetters, const int nextLettersSize) {

661

662

const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

663

if (isSameAsTyped) return;

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

664

665

if (depth >= MIN_SUGGEST_DEPTH) {

666

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,

667

excessivePos, transposedPos, freq, sameLength);

668

if (!isSameAsTyped)

669

addWord(word, depth + 1, finalFreq);

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

670

}

671

672

if (sameLength && depth >= mInputLength && skipPos < 0) {

673

registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);

}

}

Jean Chalard

2011-06-30 19:47:25 +0900

[diff] [blame]

677

bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,

678

const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,

679

const int secondWordLength, const bool isSpaceProximity) {

680

if (inputLength >= MAX_WORD_LENGTH) return false;

681

if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos

682

|| firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)

683

return false;

684

const int newWordLength = firstWordLength + secondWordLength + 1;

685

// Allocating variable length array on stack

686

unsigned short word[newWordLength];

687

const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);

688

if (DEBUG_DICT) {

689

LOGI("First freq: %d", firstFreq);

690

}

691

if (firstFreq <= 0) return false;

692

693

for (int i = 0; i < firstWordLength; ++i) {

word[i] = mWord[i];

}

const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);

698

if (DEBUG_DICT) {

699

LOGI("Second freq: %d", secondFreq);

700

}

701

if (secondFreq <= 0) return false;

702

703

word[firstWordLength] = SPACE;

704

for (int i = (firstWordLength + 1); i < newWordLength; ++i) {

705

word[i] = mWord[i - firstWordLength - 1];

706

}

707

708

int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,

709

secondWordLength, firstFreq, secondFreq, isSpaceProximity);

710

if (DEBUG_DICT) {

711

LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,

712

TYPED_LETTER_MULTIPLIER);

713

}

714

addWord(word, newWordLength, pairFreq);

return true;

}

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

718

#ifndef NEW_DICTIONARY_FORMAT

719

// TODO: Don't forget to bring inline functions back to over where they are used.

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

720

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

721

// The following functions will be entirely replaced with new implementations.

722

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

723

const int excessivePos, const int transposedPos,int *nextLetters,

724

const int nextLettersSize) {

725

int initialPosition = initialPos;

726

const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);

727

getWordsRec(count, initialPosition, 0,

728

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

729

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

730

nextLettersSize);

731

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

732

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

733

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

734

const int maxDepth, const bool traverseAllNodes, const int matchWeight,

735

const int inputIndex, const int diffs, const int skipPos, const int excessivePos,

736

const int transposedPos, int *nextLetters, const int nextLettersSize) {

737

int siblingPos = pos;

738

for (int i = 0; i < childrenCount; ++i) {

739

int newCount;

740

int newChildPosition;

741

bool newTraverseAllNodes;

int newMatchRate;

int newInputIndex;

int newDiffs;

int newSiblingPos;

int newOutputIndex;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

748

traverseAllNodes, matchWeight, inputIndex, diffs,

749

skipPos, excessivePos, transposedPos,

750

nextLetters, nextLettersSize,

751

&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,

752

&newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);

753

siblingPos = newSiblingPos;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

754

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

755

if (needsToTraverseChildrenNodes) {

756

getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,

757

newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

758

nextLetters, nextLettersSize);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

759

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

760

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

761

}

762

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

763

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

764

const int inputLength, unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

765

int pos = ROOT_POS;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

766

int count = Dictionary::getCount(DICT_ROOT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

767

int maxFreq = 0;

768

int depth = 0;

769

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

770

bool terminal = false;

771

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

772

mStackChildCount[0] = count;

773

mStackSiblingPos[0] = pos;

774

775

while (depth >= 0) {

776

if (mStackChildCount[depth] > 0) {

777

--mStackChildCount[depth];

778

int firstChildPos;

779

int newFreq;

780

int siblingPos = mStackSiblingPos[depth];

781

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

782

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

783

&siblingPos);

784

mStackSiblingPos[depth] = siblingPos;

785

if (depth == (inputLength - 1)) {

786

// Traverse sibling node

787

if (terminal) {

788

if (newFreq > maxFreq) {

789

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

790

if (DEBUG_DICT && DEBUG_NODE) {

791

char s[inputLength + 1];

792

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

793

s[inputLength] = 0;

794

LOGI("New missing space word found: %d > %d (%s), %d, %d",

795

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

801

// Traverse children nodes

802

++depth;

803

mStackChildCount[depth] = count;

804

mStackSiblingPos[depth] = firstChildPos;

805

}

806

} else {

807

// Traverse parent node

808

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

809

}

810

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

811

812

word[inputLength] = 0;

813

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

814

}

815

816

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

817

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

818

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

819

const int inputIndex = startInputIndex + depth;

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

820

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

821

unsigned short c;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

822

*siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,

823

&c, newChildPosition, newTerminal, newFreq);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

824

const unsigned int inputC = currentChars[0];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

825

if (DEBUG_DICT) {

826

assert(inputC <= U_SHORT_MAX);

827

}

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

828

const unsigned short baseLowerC = toBaseLowerCase(c);

829

const bool matched = (inputC == baseLowerC || inputC == c);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

830

const bool hasChild = *newChildPosition != 0;

831

if (matched) {

832

word[depth] = c;

833

if (DEBUG_DICT && DEBUG_NODE) {

834

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

835

if (*newTerminal) {

836

LOGI("Terminal %d", *newFreq);

837

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

838

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

839

if (hasChild) {

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

840

*newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

847

// Thus newTerminal shouldn't be true.

848

*newTerminal = false;

849

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

850

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

851

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

852

853

// TODO: use uint32_t instead of unsigned short

854

bool UnigramDictionary::isValidWord(unsigned short *word, int length) {

855

if (IS_LATEST_DICT_VERSION) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

856

return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

857

} else {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

858

return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

}

}

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

862

863

// Require strict exact match.

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

864

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

865

int length) const {

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

866

// returns address of bigram data of that word

867

// return -99 if not found

868

869

int count = Dictionary::getCount(DICT_ROOT, &pos);

870

unsigned short currentChar = (unsigned short) word[offset];

871

for (int j = 0; j < count; j++) {

872

unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);

873

int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);

874

int childPos = Dictionary::getAddress(DICT_ROOT, &pos);

875

if (c == currentChar) {

876

if (offset == length - 1) {

if (terminal) {

return (pos+1);

}

} else {

if (childPos != 0) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

882

int t = getBigramPosition(childPos, word, offset + 1, length);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

if (t > 0) {

return t;

}

}

}

}

if (terminal) {

Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);

891

}

892

// There could be two instances of each alphabet - upper and lower case. So continue

893

// looking ...

894

}

895

return NOT_VALID_WORD;

896

}

897

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

898

// The following functions will be modified.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

899

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

900

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

901

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

902

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

903

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

904

int *nextSiblingPosition, int *nextOutputIndex) {

905

if (DEBUG_DICT) {

906

int inputCount = 0;

907

if (skipPos >= 0) ++inputCount;

908

if (excessivePos >= 0) ++inputCount;

909

if (transposedPos >= 0) ++inputCount;

910

assert(inputCount <= 1);

}

unsigned short c;

int childPosition;

bool terminal;

int freq;

bool isSameAsUserTypedLength = false;

917

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

918

const int pos = initialPos;

919

const int depth = initialDepth;

920

const int traverseAllNodes = initialTraverseAllNodes;

921

const int diffs = initialDiffs;

922

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

923

const uint8_t flags = 0; // No flags for now

924

925

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

926

927

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,

928

&c, &childPosition, &terminal, &freq);

929

*nextOutputIndex = depth + 1;

930

931

const bool needsToTraverseChildrenNodes = childPosition != 0;

932

933

// If we are only doing traverseAllNodes, no need to look at the typed characters.

934

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

935

mWord[depth] = c;

936

if (traverseAllNodes && terminal) {

937

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

938

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

939

}

940

if (!needsToTraverseChildrenNodes) return false;

941

*newTraverseAllNodes = traverseAllNodes;

942

*newMatchRate = matchWeight;

943

*newDiffs = diffs;

944

*newInputIndex = inputIndex;

945

} else {

946

const int *currentChars = getInputCharsAt(inputIndex);

947

948

if (transposedPos >= 0) {

949

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

950

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

951

}

952

953

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

954

transposedPos);

955

if (UNRELATED_CHAR == matchedProximityCharId) return false;

956

mWord[depth] = c;

957

// If inputIndex is greater than mInputLength, that means there is no

958

// proximity chars. So, we don't need to check proximity.

959

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

960

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

961

}

962

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

963

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

964

if (isSameAsUserTypedLength && terminal) {

965

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

966

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

967

}

968

if (!needsToTraverseChildrenNodes) return false;

969

// Start traversing all nodes after the index exceeds the user typed length

970

*newTraverseAllNodes = isSameAsUserTypedLength;

971

*newMatchRate = matchWeight;

972

*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

973

*newInputIndex = inputIndex + 1;

974

}

975

// Optimization: Prune out words that are too long compared to how much was typed.

976

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

981

// TODO: Check if this can be isSameAsUserTypedLength only.

982

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

983

*newTraverseAllNodes = true;

984

}

985

// get the count of nodes and increment childAddress.

986

*newCount = Dictionary::getCount(DICT_ROOT, &childPosition);

987

*newChildPosition = childPosition;

988

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

989

return needsToTraverseChildrenNodes;

990

}

991

992

#else // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

993

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

994

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

995

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

996

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

997

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

998

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

Jean Chalard

432789a

2011-06-30 17:50:48 +0900

[diff] [blame]

999

int *nextSiblingPosition, int *newOutputIndex) {

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1000

if (DEBUG_DICT) {

1001

int inputCount = 0;

1002

if (skipPos >= 0) ++inputCount;

1003

if (excessivePos >= 0) ++inputCount;

1004

if (transposedPos >= 0) ++inputCount;

1005

assert(inputCount <= 1);

}

unsigned short c;

int childPosition;

bool terminal;

int freq;

bool isSameAsUserTypedLength = false;

1012

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1013

int pos = initialPos;

1014

int depth = initialDepth;

1015

int traverseAllNodes = initialTraverseAllNodes;

1016

int diffs = initialDiffs;

1017

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1018

const uint8_t flags = 0; // No flags for now

1019

1020

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

1021

1022

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,

1023

&c, &childPosition, &terminal, &freq);

Jean Chalard

432789a

2011-06-30 17:50:48 +0900

[diff] [blame]

1024

*newOutputIndex = depth + 1;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1025

1026

const bool needsToTraverseChildrenNodes = childPosition != 0;

1027

1028

// If we are only doing traverseAllNodes, no need to look at the typed characters.

1029

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

1030

mWord[depth] = c;

1031

if (traverseAllNodes && terminal) {

1032

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1033

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

1034

}

1035

if (!needsToTraverseChildrenNodes) return false;

1036

*newTraverseAllNodes = traverseAllNodes;

1037

*newMatchRate = matchWeight;

1038

*newDiffs = diffs;

1039

*newInputIndex = inputIndex;

1040

} else {

1041

const int *currentChars = getInputCharsAt(inputIndex);

1042

1043

if (transposedPos >= 0) {

1044

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

1045

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

1046

}

1047

1048

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

1049

transposedPos);

1050

if (UNRELATED_CHAR == matchedProximityCharId) return false;

1051

mWord[depth] = c;

1052

// If inputIndex is greater than mInputLength, that means there is no

1053

// proximity chars. So, we don't need to check proximity.

1054

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

1055

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

1056

}

1057

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

1058

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

1059

if (isSameAsUserTypedLength && terminal) {

1060

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1061

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

1062

}

1063

if (!needsToTraverseChildrenNodes) return false;

1064

// Start traversing all nodes after the index exceeds the user typed length

1065

*newTraverseAllNodes = isSameAsUserTypedLength;

1066

*newMatchRate = matchWeight;

1067

*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

1068

*newInputIndex = inputIndex + 1;

1069

}

1070

// Optimization: Prune out words that are too long compared to how much was typed.

1071

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

1076

// TODO: Check if this can be isSameAsUserTypedLength only.

1077

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

1078

*newTraverseAllNodes = true;

1079

}

1080

// get the count of nodes and increment childAddress.

1081

*newCount = Dictionary::getCount(DICT_ROOT, &childPosition);

1082

*newChildPosition = childPosition;

1083

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

1084

return needsToTraverseChildrenNodes;

1085

}

1086

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

1087

#endif // NEW_DICTIONARY_FORMAT

1088

satok