Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <string.h>

20

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

21

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

22

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

23

#include "basechars.h"

24

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

25

#include "dictionary.h"

26

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

namespace latinime {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

30

const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =

{ { 'a', 'e' },

{ 'o', 'e' },

{ 'u', 'e' } };

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

35

// TODO: check the header

36

UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

37

int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

38

const bool isLatestDictVersion)

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

39

: DICT_ROOT(streamStart),

40

MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

41

MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),

42

TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

43

ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

44

BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),

45

MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

46

if (DEBUG_DICT) {

47

LOGI("UnigramDictionary - constructor");

48

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

49

}

50

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

51

UnigramDictionary::~UnigramDictionary() {}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

52

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

53

static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,

54

const int MAX_PROXIMITY_CHARS) {

55

return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;

56

}

57

58

bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {

59

60

// There can't be a digraph if we don't have at least 2 characters to examine

61

if (i + 2 > codesSize) return false;

62

63

// Search for the first char of some digraph

64

int lastDigraphIndex = -1;

65

const int thisChar = codes[i * MAX_PROXIMITY_CHARS];

66

for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;

67

lastDigraphIndex >= 0; --lastDigraphIndex) {

68

if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;

69

}

70

// No match: return early

71

if (lastDigraphIndex < 0) return false;

72

73

// It's an interesting digraph if the second char matches too.

74

return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];

75

}

76

77

// Mostly the same arguments as the non-recursive version, except:

78

// codes is the original value. It points to the start of the work buffer, and gets passed as is.

79

// codesSize is the size of the user input (thus, it is the size of codesSrc).

80

// codesDest is the current point in the work buffer.

81

// codesSrc is the current point in the user-input, original, content-unmodified buffer.

82

// codesRemain is the remaining size in codesSrc.

83

void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,

84

const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,

85

const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,

satok

3c4bb77

2011-03-04 22:50:19 -0800

[diff] [blame]

86

const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

87

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

88

if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {

89

for (int i = 0; i < codesRemain; ++i) {

90

if (isDigraph(codesSrc, i, codesRemain)) {

91

// Found a digraph. We will try both spellings. eg. the word is "pruefen"

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

92

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

93

// Copy the word up to the first char of the digraph, then continue processing

94

// on the remaining part of the word, skipping the second char of the digraph.

95

// In our example, copy "pru" and continue running on "fen"

96

// Make i the index of the second char of the digraph for simplicity. Forgetting

97

// to do that results in an infinite recursion so take care!

98

++i;

99

memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);

100

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

101

codesBuffer, codesBufferSize, flags,

102

codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,

103

currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,

104

frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

105

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

106

// Copy the second char of the digraph in place, then continue processing on

107

// the remaining part of the word.

108

// In our example, after "pru" in the buffer copy the "e", and continue on "fen"

109

memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,

110

BYTES_IN_ONE_CHAR);

111

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

112

codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,

113

codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,

114

outWords, frequencies);

115

return;

116

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

}

}

// If we come here, we hit the end of the word: let's check it against the dictionary.

121

// In our example, we'll come here once for "prufen" and then once for "pruefen".

122

// If the word contains several digraphs, we'll come it for the product of them.

123

// eg. if the word is "ueberpruefen" we'll test, in order, against

124

// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".

125

const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;

126

if (0 != remainingBytes)

127

memcpy(codesDest, codesSrc, remainingBytes);

128

129

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

130

(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);

131

}

132

133

int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,

134

const int *ycoordinates, const int *codes, const int codesSize, const int flags,

135

unsigned short *outWords, int *frequencies) {

136

137

if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)

138

{ // Incrementally tune the word and try all possibilities

139

int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];

140

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

141

codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

142

} else { // Normal processing

143

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,

144

outWords, frequencies);

145

}

146

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

147

PROF_START(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

148

// Get the word count

149

int suggestedWordsCount = 0;

150

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

151

suggestedWordsCount++;

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

156

LOGI("Next letters: ");

157

for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {

158

if (mNextLettersFrequency[k] > 0) {

159

LOGI("%c = %d,", k, mNextLettersFrequency[k]);

160

}

161

}

162

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

163

PROF_END(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

164

PROF_CLOSE;

165

return suggestedWordsCount;

166

}

167

168

void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,

169

const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,

170

unsigned short *outWords, int *frequencies) {

171

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

172

PROF_OPEN;

173

PROF_START(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

174

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

175

if (DEBUG_DICT) assert(codesSize == mInputLength);

176

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

177

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

178

PROF_END(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

179

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

180

PROF_START(1);

Tadashi G. Takaoka

887f11e

2011-02-10 20:53:58 +0900

[diff] [blame]

181

getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

182

PROF_END(1);

183

184

PROF_START(2);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

185

// Suggestion with missing character

186

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

187

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

188

if (DEBUG_DICT) {

189

LOGI("--- Suggest missing characters %d", i);

190

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

191

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

192

}

193

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

194

PROF_END(2);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

195

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

196

PROF_START(3);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

197

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

198

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

199

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

200

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

201

if (DEBUG_DICT) {

202

LOGI("--- Suggest excessive characters %d", i);

203

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

204

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

205

}

206

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

207

PROF_END(3);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

208

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

209

PROF_START(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

210

// Suggestion with transposed characters

211

// Only suggest words that length is mInputLength

212

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

213

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

214

if (DEBUG_DICT) {

215

LOGI("--- Suggest transposed characters %d", i);

216

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

217

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

218

}

219

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

220

PROF_END(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

221

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

222

PROF_START(5);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

223

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

224

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

225

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

226

for (int i = 1; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

227

if (DEBUG_DICT) {

228

LOGI("--- Suggest missing space characters %d", i);

229

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

230

getMissingSpaceWords(mInputLength, i);

231

}

232

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

233

PROF_END(5);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

234

235

PROF_START(6);

Jean Chalard

e93b1f22

2011-06-01 17:12:25 +0900

[diff] [blame]

236

if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

237

// The first and last "mistyped spaces" are taken care of by excessive character handling

238

for (int i = 1; i < codesSize - 1; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

239

if (DEBUG_DICT) {

240

LOGI("--- Suggest words with proximity space %d", i);

241

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

242

const int x = xcoordinates[i];

243

const int y = ycoordinates[i];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

244

if (DEBUG_PROXIMITY_INFO) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

245

LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",

246

i, x, y, proximityInfo->hasSpaceProximity(x, y));

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

247

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

248

if (proximityInfo->hasSpaceProximity(x, y)) {

249

getMistypedSpaceWords(mInputLength, i);

250

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

251

}

252

}

253

PROF_END(6);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

254

}

255

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

256

void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,

257

unsigned short *outWords, int *frequencies) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

258

if (DEBUG_DICT) {

259

LOGI("initSuggest");

260

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

261

mFrequencies = frequencies;

262

mOutputChars = outWords;

263

mInputCodes = codes;

264

mInputLength = codesSize;

265

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

266

}

267

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

268

static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

269

if (c < nextLettersSize) {

nextLetters[c]++;

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

274

// TODO: We need to optimize addWord by using STL or something

Jean Chalard

ca5ef28

2011-06-17 15:36:26 +0900

[diff] [blame]

275

// TODO: This needs to take an const unsigned short* and not tinker with its contents

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

276

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

277

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

278

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

279

char s[length + 1];

280

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

281

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

282

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

283

if (length > MAX_WORD_LENGTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

284

if (DEBUG_DICT) {

285

LOGI("Exceeded max word length.");

286

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

287

return false;

288

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

289

290

// Find the right insertion point

291

int insertAt = 0;

292

while (insertAt < MAX_WORDS) {

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

293

// TODO: How should we sort words with the same frequency?

294

if (frequency > mFrequencies[insertAt]) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

300

if (DEBUG_DICT) {

301

char s[length + 1];

302

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

303

LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

304

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

305

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

306

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

307

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

308

mFrequencies[insertAt] = frequency;

309

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

310

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

311

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

312

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

317

if (DEBUG_DICT) {

318

LOGI("Added word at %d", insertAt);

319

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

325

inline void UnigramDictionary::addWordAlternatesSpellings(const uint8_t* const root, int pos,

326

int depth, int finalFreq) {

327

// TODO: actually add alternates when the format supports it.

328

}

329

330

static inline bool hasAlternateSpellings(uint8_t flags) {

331

// TODO: when the format supports it, return the actual value.

return false;

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

335

static inline unsigned short toBaseLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

336

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

337

c = BASE_CHARS[c];

338

}

339

if (c >='A' && c <= 'Z') {

340

c |= 32;

341

} else if (c > 127) {

342

c = latin_tolower(c);

}

return c;

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

347

bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

348

if (length != mInputLength) {

349

return false;

350

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

351

const int *inputCodes = mInputCodes;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

352

while (length--) {

353

if ((unsigned int) *inputCodes != (unsigned int) *word) {

354

return false;

355

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

356

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

362

static const char QUOTE = '\'';

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

363

static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

364

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

365

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

366

const int excessivePos, const int transposedPos, int *nextLetters,

367

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

368

if (DEBUG_DICT) {

369

LOGI("getSuggestionCandidates %d", maxDepth);

370

assert(transposedPos + 1 < mInputLength);

371

assert(excessivePos < mInputLength);

372

assert(missingPos < mInputLength);

373

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

374

int rootPosition = ROOT_POS;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

375

// Get the number of child of root, then increment the position

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

376

int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

377

int depth = 0;

378

379

mStackChildCount[0] = childCount;

380

mStackTraverseAll[0] = (mInputLength <= 0);

381

mStackNodeFreq[0] = 1;

382

mStackInputIndex[0] = 0;

383

mStackDiffs[0] = 0;

384

mStackSiblingPos[0] = rootPosition;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

385

mStackOutputIndex[0] = 0;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

386

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

387

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

388

while (depth >= 0) {

389

if (mStackChildCount[depth] > 0) {

390

--mStackChildCount[depth];

391

bool traverseAllNodes = mStackTraverseAll[depth];

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

392

int matchWeight = mStackNodeFreq[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

393

int inputIndex = mStackInputIndex[depth];

394

int diffs = mStackDiffs[depth];

395

int siblingPos = mStackSiblingPos[depth];

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

396

int outputIndex = mStackOutputIndex[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

397

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

398

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

399

// needsToTraverseChildrenNodes should be false

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

400

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

401

maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,

402

excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,

403

&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

404

&siblingPos, &outputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

405

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

406

mStackSiblingPos[depth] = siblingPos;

407

if (needsToTraverseChildrenNodes) {

408

// Goes to child node

409

++depth;

410

mStackChildCount[depth] = childCount;

411

mStackTraverseAll[depth] = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

412

mStackNodeFreq[depth] = matchWeight;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

413

mStackInputIndex[depth] = inputIndex;

414

mStackDiffs[depth] = diffs;

415

mStackSiblingPos[depth] = firstChildPos;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

416

mStackOutputIndex[depth] = outputIndex;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

417

}

418

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

419

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

425

static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;

426

static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {

427

return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);

428

}

429

430

static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;

431

inline static void multiplyIntCapped(const int multiplier, int *base) {

432

const int temp = *base;

433

if (temp != S_INT_MAX) {

434

// Branch if multiplier == 2 for the optimization

435

if (multiplier == 2) {

436

*base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;

437

} else {

438

const int tempRetval = temp * multiplier;

439

*base = tempRetval >= temp ? tempRetval : S_INT_MAX;

}

}

}

inline static int powerIntCapped(const int base, const int n) {

satok

0b6b0a5

2011-04-27 16:29:27 +0900

[diff] [blame]

445

if (base == 2) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

446

return n < 31 ? 1 << n : S_INT_MAX;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

447

} else {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

448

int ret = base;

449

for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);

return ret;

}

}

inline static void multiplyRate(const int rate, int *freq) {

455

if (*freq != S_INT_MAX) {

456

if (*freq > 1000000) {

457

*freq /= 100;

458

multiplyIntCapped(rate, freq);

459

} else {

460

multiplyIntCapped(rate, freq);

461

*freq /= 100;

462

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

}

}

satok

2011-04-19 13:58:42 +0900

[diff] [blame]

466

inline static int calcFreqForSplitTwoWords(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

467

const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,

468

const int firstFreq, const int secondFreq, const bool isSpaceProximity) {

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

469

if (firstWordLength == 0 || secondWordLength == 0) {

470

return 0;

471

}

472

const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);

473

int tempFirstFreq = firstFreq;

474

multiplyRate(firstDemotionRate, &tempFirstFreq);

475

476

const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);

477

int tempSecondFreq = secondFreq;

478

multiplyRate(secondDemotionRate, &tempSecondFreq);

479

480

const int totalLength = firstWordLength + secondWordLength;

481

482

// Promote pairFreq with multiplying by 2, because the word length is the same as the typed

483

// length.

484

int totalFreq = tempFirstFreq + tempSecondFreq;

485

486

// This is a workaround to try offsetting the not-enough-demotion which will be done in

487

// calcNormalizedScore in Utils.java.

488

// In calcNormalizedScore the score will be demoted by (1 - 1 / length)

489

// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by

490

// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))

491

const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);

492

multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

493

494

// At this moment, totalFreq is calculated by the following formula:

495

// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))

496

// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

497

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

498

multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

499

500

// This is another workaround to offset the demotion which will be done in

501

// calcNormalizedScore in Utils.java.

502

// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote

503

// the same amount because we already have adjusted the synthetic freq of this "missing or

504

// mistyped space" suggestion candidate above in this method.

505

const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);

506

multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

507

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

508

if (isSpaceProximity) {

509

// A word pair with one space proximity correction

510

if (DEBUG_DICT) {

511

LOGI("Found a word pair with space proximity correction.");

512

}

513

multiplyIntCapped(typedLetterMultiplier, &totalFreq);

514

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);

515

}

516

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

517

multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

return totalFreq;

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

521

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

522

return getSplitTwoWordsSuggestion(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

523

inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

524

}

525

526

bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {

527

return getSplitTwoWordsSuggestion(

528

inputLength, 0, spaceProximityPos, spaceProximityPos + 1,

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

529

inputLength - spaceProximityPos - 1, true);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

530

}

531

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

532

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

533

const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

534

const int freq, const bool sameLength) const {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

535

// TODO: Demote by edit distance

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

536

int finalFreq = freq * matchWeight;

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

537

if (skipPos >= 0) {

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

538

if (mInputLength >= 2) {

539

const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE

540

* (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)

541

/ (10 * mInputLength

542

- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

543

if (DEBUG_DICT_FULL) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

544

LOGI("Demotion rate for missing character is %d.", demotionRate);

545

}

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

546

multiplyRate(demotionRate, &finalFreq);

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

} else {

finalFreq = 0;

}

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

551

if (transposedPos >= 0) multiplyRate(

552

WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

553

if (excessivePos >= 0) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

554

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

555

if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

556

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

557

}

558

}

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

559

int lengthFreq = TYPED_LETTER_MULTIPLIER;

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

560

multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

561

if (lengthFreq == matchWeight) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

562

// Full exact match

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

563

if (depth > 1) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

564

if (DEBUG_DICT) {

565

LOGI("Found full matched word.");

566

}

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

567

multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);

568

}

569

if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

570

finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

571

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

572

} else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

573

// A word with proximity corrections

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

574

if (DEBUG_DICT) {

575

LOGI("Found one proximity correction.");

576

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

577

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

578

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

579

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

580

if (DEBUG_DICT) {

581

LOGI("calc: %d, %d", depth, sameLength);

582

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

583

if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

584

return finalFreq;

585

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

586

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

587

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

588

const int inputIndex, const int skipPos, const int depth) {

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

589

const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

590

// Skip the ' or other letter and continue deeper

591

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

592

}

593

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

594

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

595

const int inputLength) const {

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

596

if (inputIndex < 0 || inputIndex >= inputLength) return false;

597

const int currentChar = *getInputCharsAt(inputIndex);

598

const int leftIndex = inputIndex - 1;

599

if (leftIndex >= 0) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

600

const int *leftChars = getInputCharsAt(leftIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

601

int i = 0;

602

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

603

if (leftChars[i++] == currentChar) return true;

604

}

605

}

606

const int rightIndex = inputIndex + 1;

607

if (rightIndex < inputLength) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

608

const int *rightChars = getInputCharsAt(rightIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

609

int i = 0;

610

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

611

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

617

// In the following function, c is the current character of the dictionary word

618

// currently examined.

619

// currentChars is an array containing the keys close to the character the

620

// user actually typed at the same position. We want to see if c is in it: if so,

621

// then the word contains at that position a character close to what the user

622

// typed.

623

// What the user typed is actually the first character of the array.

624

// Notice : accented characters do not have a proximity list, so they are alone

625

// in their list. The non-accented version of the character should be considered

626

// "close", but not the other keys close to the non-accented version.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

627

inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(

628

const int *currentChars, const unsigned short c, const int skipPos,

629

const int excessivePos, const int transposedPos) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

630

const unsigned short baseLowerC = toBaseLowerCase(c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

631

632

// The first char in the array is what user typed. If it matches right away,

633

// that means the user typed that same char for this pos.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

634

if (currentChars[0] == baseLowerC || currentChars[0] == c)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

635

return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;

636

637

// If one of those is true, we should not check for close characters at all.

638

if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)

639

return UNRELATED_CHAR;

640

641

// If the non-accented, lowercased version of that first character matches c,

642

// then we have a non-accented version of the accented character the user

643

// typed. Treat it as a close char.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

644

if (toBaseLowerCase(currentChars[0]) == baseLowerC)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

645

return NEAR_PROXIMITY_CHAR;

646

647

// Not an exact nor an accent-alike match: search the list of close keys

648

int j = 1;

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

649

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

650

const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

651

if (matched) return NEAR_PROXIMITY_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

652

++j;

653

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

654

655

// Was not included, signal this as an unrelated character.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

656

return UNRELATED_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

657

}

658

Jean Chalard

ca5ef28

2011-06-17 15:36:26 +0900

[diff] [blame]

659

inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,

660

const uint8_t* const root, const uint8_t flags, int pos,

661

const int inputIndex, const int matchWeight, const int skipPos,

662

const int excessivePos, const int transposedPos, const int freq, const bool sameLength,

663

int* nextLetters, const int nextLettersSize) {

664

665

const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;

666

const bool hasAlternates = hasAlternateSpellings(flags);

667

if (isSameAsTyped && !hasAlternates) return;

668

669

if (depth >= MIN_SUGGEST_DEPTH) {

670

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,

671

excessivePos, transposedPos, freq, sameLength);

672

if (!isSameAsTyped)

673

addWord(word, depth + 1, finalFreq);

674

if (hasAlternates)

675

addWordAlternatesSpellings(DICT_ROOT, pos, flags, finalFreq);

676

}

677

678

if (sameLength && depth >= mInputLength && skipPos < 0) {

679

registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);

}

}

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

683

#ifndef NEW_DICTIONARY_FORMAT

684

// TODO: Don't forget to bring inline functions back to over where they are used.

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

685

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

686

// The following functions will be entirely replaced with new implementations.

687

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

688

const int excessivePos, const int transposedPos,int *nextLetters,

689

const int nextLettersSize) {

690

int initialPosition = initialPos;

691

const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);

692

getWordsRec(count, initialPosition, 0,

693

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

694

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

695

nextLettersSize);

696

}

Jean Chalard

ca5ef28

2011-06-17 15:36:26 +0900

[diff] [blame]

697

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

698

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

699

const int maxDepth, const bool traverseAllNodes, const int matchWeight,

700

const int inputIndex, const int diffs, const int skipPos, const int excessivePos,

701

const int transposedPos, int *nextLetters, const int nextLettersSize) {

702

int siblingPos = pos;

703

for (int i = 0; i < childrenCount; ++i) {

704

int newCount;

705

int newChildPosition;

706

bool newTraverseAllNodes;

int newMatchRate;

int newInputIndex;

int newDiffs;

int newSiblingPos;

int newOutputIndex;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

713

traverseAllNodes, matchWeight, inputIndex, diffs,

714

skipPos, excessivePos, transposedPos,

715

nextLetters, nextLettersSize,

716

&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,

717

&newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);

718

siblingPos = newSiblingPos;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

719

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

720

if (needsToTraverseChildrenNodes) {

721

getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,

722

newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

723

nextLetters, nextLettersSize);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

724

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

725

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

726

}

727

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

728

inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,

729

unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

730

int pos = ROOT_POS;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

731

int count = Dictionary::getCount(DICT_ROOT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

732

int maxFreq = 0;

733

int depth = 0;

734

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

735

bool terminal = false;

736

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

737

mStackChildCount[0] = count;

738

mStackSiblingPos[0] = pos;

739

740

while (depth >= 0) {

741

if (mStackChildCount[depth] > 0) {

742

--mStackChildCount[depth];

743

int firstChildPos;

744

int newFreq;

745

int siblingPos = mStackSiblingPos[depth];

746

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

747

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

748

&siblingPos);

749

mStackSiblingPos[depth] = siblingPos;

750

if (depth == (inputLength - 1)) {

751

// Traverse sibling node

752

if (terminal) {

753

if (newFreq > maxFreq) {

754

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

755

if (DEBUG_DICT && DEBUG_NODE) {

756

char s[inputLength + 1];

757

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

758

s[inputLength] = 0;

759

LOGI("New missing space word found: %d > %d (%s), %d, %d",

760

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

766

// Traverse children nodes

767

++depth;

768

mStackChildCount[depth] = count;

769

mStackSiblingPos[depth] = firstChildPos;

770

}

771

} else {

772

// Traverse parent node

773

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

774

}

775

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

776

777

word[inputLength] = 0;

778

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

779

}

780

781

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

782

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

783

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

784

const int inputIndex = startInputIndex + depth;

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

785

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

786

unsigned short c;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

787

*siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,

788

&c, newChildPosition, newTerminal, newFreq);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

789

const unsigned int inputC = currentChars[0];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

790

if (DEBUG_DICT) {

791

assert(inputC <= U_SHORT_MAX);

792

}

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

793

const unsigned short baseLowerC = toBaseLowerCase(c);

794

const bool matched = (inputC == baseLowerC || inputC == c);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

795

const bool hasChild = *newChildPosition != 0;

796

if (matched) {

797

word[depth] = c;

798

if (DEBUG_DICT && DEBUG_NODE) {

799

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

800

if (*newTerminal) {

801

LOGI("Terminal %d", *newFreq);

802

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

803

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

804

if (hasChild) {

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

805

*newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

812

// Thus newTerminal shouldn't be true.

813

*newTerminal = false;

814

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

815

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

816

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

817

818

// TODO: use uint32_t instead of unsigned short

819

bool UnigramDictionary::isValidWord(unsigned short *word, int length) {

820

if (IS_LATEST_DICT_VERSION) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

821

return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

822

} else {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

823

return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

}

}

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

827

828

// Require strict exact match.

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

829

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

830

int length) const {

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

831

// returns address of bigram data of that word

832

// return -99 if not found

833

834

int count = Dictionary::getCount(DICT_ROOT, &pos);

835

unsigned short currentChar = (unsigned short) word[offset];

836

for (int j = 0; j < count; j++) {

837

unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);

838

int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);

839

int childPos = Dictionary::getAddress(DICT_ROOT, &pos);

840

if (c == currentChar) {

841

if (offset == length - 1) {

if (terminal) {

return (pos+1);

}

} else {

if (childPos != 0) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

847

int t = getBigramPosition(childPos, word, offset + 1, length);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

if (t > 0) {

return t;

}

}

}

}

if (terminal) {

Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);

856

}

857

// There could be two instances of each alphabet - upper and lower case. So continue

858

// looking ...

859

}

860

return NOT_VALID_WORD;

861

}

862

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

863

864

// The following functions will be modified.

865

bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,

866

const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,

867

const int secondWordLength, const bool isSpaceProximity) {

868

if (inputLength >= MAX_WORD_LENGTH) return false;

869

if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos

870

|| firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)

871

return false;

872

const int newWordLength = firstWordLength + secondWordLength + 1;

873

// Allocating variable length array on stack

874

unsigned short word[newWordLength];

875

const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord);

876

if (DEBUG_DICT) {

877

LOGI("First freq: %d", firstFreq);

878

}

879

if (firstFreq <= 0) return false;

880

881

for (int i = 0; i < firstWordLength; ++i) {

word[i] = mWord[i];

}

const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord);

886

if (DEBUG_DICT) {

887

LOGI("Second freq: %d", secondFreq);

888

}

889

if (secondFreq <= 0) return false;

890

891

word[firstWordLength] = SPACE;

892

for (int i = (firstWordLength + 1); i < newWordLength; ++i) {

893

word[i] = mWord[i - firstWordLength - 1];

894

}

895

896

int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,

897

secondWordLength, firstFreq, secondFreq, isSpaceProximity);

898

if (DEBUG_DICT) {

899

LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,

900

TYPED_LETTER_MULTIPLIER);

901

}

902

addWord(word, newWordLength, pairFreq);

return true;

}

inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,

907

const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,

908

const int diffs, const int skipPos, const int excessivePos, const int transposedPos,

909

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

910

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

911

int *nextSiblingPosition, int *nextOutputIndex) {

912

if (DEBUG_DICT) {

913

int inputCount = 0;

914

if (skipPos >= 0) ++inputCount;

915

if (excessivePos >= 0) ++inputCount;

916

if (transposedPos >= 0) ++inputCount;

917

assert(inputCount <= 1);

}

unsigned short c;

int childPosition;

bool terminal;

int freq;

bool isSameAsUserTypedLength = false;

924

925

const uint8_t flags = 0; // No flags for now

926

927

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

928

929

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,

930

&c, &childPosition, &terminal, &freq);

931

*nextOutputIndex = depth + 1;

932

933

const bool needsToTraverseChildrenNodes = childPosition != 0;

934

935

// If we are only doing traverseAllNodes, no need to look at the typed characters.

936

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

937

mWord[depth] = c;

938

if (traverseAllNodes && terminal) {

939

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

940

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

941

}

942

if (!needsToTraverseChildrenNodes) return false;

943

*newTraverseAllNodes = traverseAllNodes;

944

*newMatchRate = matchWeight;

945

*newDiffs = diffs;

946

*newInputIndex = inputIndex;

947

} else {

948

const int *currentChars = getInputCharsAt(inputIndex);

949

950

if (transposedPos >= 0) {

951

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

952

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

953

}

954

955

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

956

transposedPos);

957

if (UNRELATED_CHAR == matchedProximityCharId) return false;

958

mWord[depth] = c;

959

// If inputIndex is greater than mInputLength, that means there is no

960

// proximity chars. So, we don't need to check proximity.

961

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

962

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

963

}

964

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

965

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

966

if (isSameAsUserTypedLength && terminal) {

967

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

968

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

969

}

970

if (!needsToTraverseChildrenNodes) return false;

971

// Start traversing all nodes after the index exceeds the user typed length

972

*newTraverseAllNodes = isSameAsUserTypedLength;

973

*newMatchRate = matchWeight;

974

*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

975

*newInputIndex = inputIndex + 1;

976

}

977

// Optimization: Prune out words that are too long compared to how much was typed.

978

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

983

// TODO: Check if this can be isSameAsUserTypedLength only.

984

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

985

*newTraverseAllNodes = true;

986

}

987

// get the count of nodes and increment childAddress.

988

*newCount = Dictionary::getCount(DICT_ROOT, &childPosition);

989

*newChildPosition = childPosition;

990

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

991

return needsToTraverseChildrenNodes;

992

}

993

994

#else // NEW_DICTIONARY_FORMAT

995

#endif // NEW_DICTIONARY_FORMAT

996

satok