Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <string.h>

20

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

21

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

22

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

23

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

24

#include "dictionary.h"

25

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

26

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

27

#ifdef NEW_DICTIONARY_FORMAT

28

#include "binary_format.h"

29

#endif // NEW_DICTIONARY_FORMAT

30

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

31

namespace latinime {

32

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

33

const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =

{ { 'a', 'e' },

{ 'o', 'e' },

{ 'u', 'e' } };

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

38

// TODO: check the header

39

UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

40

int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

41

const bool isLatestDictVersion)

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

42

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

43

: DICT_ROOT(streamStart),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

44

#else // NEW_DICTIONARY_FORMAT

45

: DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),

46

#endif // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

47

MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

48

MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),

49

TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

50

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

51

ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

52

#else // NEW_DICTIONARY_FORMAT

53

// TODO : remove this variable.

54

ROOT_POS(0),

55

#endif // NEW_DICTIONARY_FORMAT

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

56

BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

57

MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

58

if (DEBUG_DICT) {

59

LOGI("UnigramDictionary - constructor");

60

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

61

}

62

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

63

UnigramDictionary::~UnigramDictionary() {}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

64

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

65

static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,

66

const int MAX_PROXIMITY_CHARS) {

67

return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;

68

}

69

70

bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {

71

72

// There can't be a digraph if we don't have at least 2 characters to examine

73

if (i + 2 > codesSize) return false;

74

75

// Search for the first char of some digraph

76

int lastDigraphIndex = -1;

77

const int thisChar = codes[i * MAX_PROXIMITY_CHARS];

78

for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;

79

lastDigraphIndex >= 0; --lastDigraphIndex) {

80

if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;

81

}

82

// No match: return early

83

if (lastDigraphIndex < 0) return false;

84

85

// It's an interesting digraph if the second char matches too.

86

return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];

87

}

88

89

// Mostly the same arguments as the non-recursive version, except:

90

// codes is the original value. It points to the start of the work buffer, and gets passed as is.

91

// codesSize is the size of the user input (thus, it is the size of codesSrc).

92

// codesDest is the current point in the work buffer.

93

// codesSrc is the current point in the user-input, original, content-unmodified buffer.

94

// codesRemain is the remaining size in codesSrc.

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

95

void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

96

const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,

97

const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,

satok

3c4bb77

2011-03-04 22:50:19 -0800

[diff] [blame]

98

const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

99

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

100

if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {

101

for (int i = 0; i < codesRemain; ++i) {

102

if (isDigraph(codesSrc, i, codesRemain)) {

103

// Found a digraph. We will try both spellings. eg. the word is "pruefen"

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

104

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

105

// Copy the word up to the first char of the digraph, then continue processing

106

// on the remaining part of the word, skipping the second char of the digraph.

107

// In our example, copy "pru" and continue running on "fen"

108

// Make i the index of the second char of the digraph for simplicity. Forgetting

109

// to do that results in an infinite recursion so take care!

110

++i;

111

memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);

112

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

113

codesBuffer, codesBufferSize, flags,

114

codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,

115

currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,

116

frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

117

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

118

// Copy the second char of the digraph in place, then continue processing on

119

// the remaining part of the word.

120

// In our example, after "pru" in the buffer copy the "e", and continue on "fen"

121

memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,

122

BYTES_IN_ONE_CHAR);

123

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

124

codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,

125

codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,

126

outWords, frequencies);

127

return;

128

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

}

}

// If we come here, we hit the end of the word: let's check it against the dictionary.

133

// In our example, we'll come here once for "prufen" and then once for "pruefen".

134

// If the word contains several digraphs, we'll come it for the product of them.

135

// eg. if the word is "ueberpruefen" we'll test, in order, against

136

// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".

137

const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;

138

if (0 != remainingBytes)

139

memcpy(codesDest, codesSrc, remainingBytes);

140

141

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

142

(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);

143

}

144

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

145

int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

146

const int *ycoordinates, const int *codes, const int codesSize, const int flags,

147

unsigned short *outWords, int *frequencies) {

148

149

if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)

150

{ // Incrementally tune the word and try all possibilities

151

int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];

152

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

153

codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

154

} else { // Normal processing

155

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,

156

outWords, frequencies);

157

}

158

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

159

PROF_START(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

160

// Get the word count

161

int suggestedWordsCount = 0;

162

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

163

suggestedWordsCount++;

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

168

/// Print the returned words

169

for (int j = 0; j < suggestedWordsCount; ++j) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

170

#ifdef FLAG_DBG

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

171

short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;

172

char s[MAX_WORD_LENGTH];

173

for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];

174

LOGI("%s %i", s, mFrequencies[j]);

satok

2011-07-14 08:32:57 +0900

[diff] [blame]

175

#endif

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

176

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

177

LOGI("Next letters: ");

178

for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {

179

if (mNextLettersFrequency[k] > 0) {

180

LOGI("%c = %d,", k, mNextLettersFrequency[k]);

181

}

182

}

183

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

184

PROF_END(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

185

PROF_CLOSE;

186

return suggestedWordsCount;

187

}

188

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

189

void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

190

const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,

191

unsigned short *outWords, int *frequencies) {

192

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

193

PROF_OPEN;

194

PROF_START(0);

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

195

initSuggestions(

196

proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

197

if (DEBUG_DICT) assert(codesSize == mInputLength);

198

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

199

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

200

PROF_END(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

201

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

202

PROF_START(1);

Tadashi G. Takaoka

887f11e

2011-02-10 20:53:58 +0900

[diff] [blame]

203

getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

204

PROF_END(1);

205

206

PROF_START(2);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

207

// Suggestion with missing character

208

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

209

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

210

if (DEBUG_DICT) {

211

LOGI("--- Suggest missing characters %d", i);

212

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

213

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

214

}

215

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

216

PROF_END(2);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

217

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

218

PROF_START(3);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

219

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

220

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

221

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

222

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

223

if (DEBUG_DICT) {

224

LOGI("--- Suggest excessive characters %d", i);

225

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

226

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

227

}

228

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

229

PROF_END(3);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

230

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

231

PROF_START(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

232

// Suggestion with transposed characters

233

// Only suggest words that length is mInputLength

234

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

235

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

236

if (DEBUG_DICT) {

237

LOGI("--- Suggest transposed characters %d", i);

238

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

239

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

240

}

241

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

242

PROF_END(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

243

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

244

PROF_START(5);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

245

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

246

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

247

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

248

for (int i = 1; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

249

if (DEBUG_DICT) {

250

LOGI("--- Suggest missing space characters %d", i);

251

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

252

getMissingSpaceWords(mInputLength, i);

253

}

254

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

255

PROF_END(5);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

256

257

PROF_START(6);

Jean Chalard

e93b1f22

2011-06-01 17:12:25 +0900

[diff] [blame]

258

if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

259

// The first and last "mistyped spaces" are taken care of by excessive character handling

260

for (int i = 1; i < codesSize - 1; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

261

if (DEBUG_DICT) {

262

LOGI("--- Suggest words with proximity space %d", i);

263

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

264

const int x = xcoordinates[i];

265

const int y = ycoordinates[i];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

266

if (DEBUG_PROXIMITY_INFO) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

267

LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",

268

i, x, y, proximityInfo->hasSpaceProximity(x, y));

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

269

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

270

if (proximityInfo->hasSpaceProximity(x, y)) {

271

getMistypedSpaceWords(mInputLength, i);

272

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

273

}

274

}

275

PROF_END(6);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

276

}

277

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

278

void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,

279

const int *ycoordinates, const int *codes, const int codesSize,

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

280

unsigned short *outWords, int *frequencies) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

281

if (DEBUG_DICT) {

282

LOGI("initSuggest");

283

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

284

mFrequencies = frequencies;

285

mOutputChars = outWords;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

286

mInputLength = codesSize;

287

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

288

proximityInfo->setInputParams(codes, codesSize);

289

mProximityInfo = proximityInfo;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

290

}

291

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

292

static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

293

if (c < nextLettersSize) {

nextLetters[c]++;

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

298

// TODO: We need to optimize addWord by using STL or something

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

299

// TODO: This needs to take an const unsigned short* and not tinker with its contents

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

300

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

301

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

302

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

303

#ifdef FLAG_DBG

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

304

char s[length + 1];

305

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

306

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2011-07-14 08:32:57 +0900

[diff] [blame]

307

#endif

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

308

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

309

if (length > MAX_WORD_LENGTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

310

if (DEBUG_DICT) {

311

LOGI("Exceeded max word length.");

312

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

313

return false;

314

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

315

316

// Find the right insertion point

317

int insertAt = 0;

318

while (insertAt < MAX_WORDS) {

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

319

// TODO: How should we sort words with the same frequency?

320

if (frequency > mFrequencies[insertAt]) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

326

if (DEBUG_DICT) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

327

#ifdef FLAG_DBG

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

328

char s[length + 1];

329

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

330

LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);

satok

2011-07-14 08:32:57 +0900

[diff] [blame]

331

#endif

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

332

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

333

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

334

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

335

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

336

mFrequencies[insertAt] = frequency;

337

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

338

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

339

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

340

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

345

if (DEBUG_DICT) {

346

LOGI("Added word at %d", insertAt);

347

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

353

static const char QUOTE = '\'';

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

354

static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

355

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

356

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

357

const int excessivePos, const int transposedPos, int *nextLetters,

358

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

359

if (DEBUG_DICT) {

360

LOGI("getSuggestionCandidates %d", maxDepth);

361

assert(transposedPos + 1 < mInputLength);

362

assert(excessivePos < mInputLength);

363

assert(missingPos < mInputLength);

364

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

365

int rootPosition = ROOT_POS;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

366

// Get the number of children of root, then increment the position

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

367

int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

368

int depth = 0;

369

370

mStackChildCount[0] = childCount;

371

mStackTraverseAll[0] = (mInputLength <= 0);

372

mStackNodeFreq[0] = 1;

373

mStackInputIndex[0] = 0;

374

mStackDiffs[0] = 0;

375

mStackSiblingPos[0] = rootPosition;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

376

mStackOutputIndex[0] = 0;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

377

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

378

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

379

while (depth >= 0) {

380

if (mStackChildCount[depth] > 0) {

381

--mStackChildCount[depth];

382

bool traverseAllNodes = mStackTraverseAll[depth];

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

383

int matchWeight = mStackNodeFreq[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

384

int inputIndex = mStackInputIndex[depth];

385

int diffs = mStackDiffs[depth];

386

int siblingPos = mStackSiblingPos[depth];

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

387

int outputIndex = mStackOutputIndex[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

388

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

389

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

390

// needsToTraverseChildrenNodes should be false

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

391

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

392

maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,

393

excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,

394

&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

395

&siblingPos, &outputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

396

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

397

mStackSiblingPos[depth] = siblingPos;

398

if (needsToTraverseChildrenNodes) {

399

// Goes to child node

400

++depth;

401

mStackChildCount[depth] = childCount;

402

mStackTraverseAll[depth] = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

403

mStackNodeFreq[depth] = matchWeight;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

404

mStackInputIndex[depth] = inputIndex;

405

mStackDiffs[depth] = diffs;

406

mStackSiblingPos[depth] = firstChildPos;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

407

mStackOutputIndex[depth] = outputIndex;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

408

}

409

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

410

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

416

static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;

417

static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {

418

return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);

419

}

420

421

static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;

422

inline static void multiplyIntCapped(const int multiplier, int *base) {

423

const int temp = *base;

424

if (temp != S_INT_MAX) {

425

// Branch if multiplier == 2 for the optimization

426

if (multiplier == 2) {

427

*base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;

428

} else {

429

const int tempRetval = temp * multiplier;

430

*base = tempRetval >= temp ? tempRetval : S_INT_MAX;

}

}

}

inline static int powerIntCapped(const int base, const int n) {

satok

0b6b0a5

2011-04-27 16:29:27 +0900

[diff] [blame]

436

if (base == 2) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

437

return n < 31 ? 1 << n : S_INT_MAX;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

438

} else {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

439

int ret = base;

440

for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);

return ret;

}

}

inline static void multiplyRate(const int rate, int *freq) {

446

if (*freq != S_INT_MAX) {

447

if (*freq > 1000000) {

448

*freq /= 100;

449

multiplyIntCapped(rate, freq);

450

} else {

451

multiplyIntCapped(rate, freq);

452

*freq /= 100;

453

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

}

}

satok

2011-04-19 13:58:42 +0900

[diff] [blame]

457

inline static int calcFreqForSplitTwoWords(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

458

const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,

459

const int firstFreq, const int secondFreq, const bool isSpaceProximity) {

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

460

if (firstWordLength == 0 || secondWordLength == 0) {

461

return 0;

462

}

463

const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);

464

int tempFirstFreq = firstFreq;

465

multiplyRate(firstDemotionRate, &tempFirstFreq);

466

467

const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);

468

int tempSecondFreq = secondFreq;

469

multiplyRate(secondDemotionRate, &tempSecondFreq);

470

471

const int totalLength = firstWordLength + secondWordLength;

472

473

// Promote pairFreq with multiplying by 2, because the word length is the same as the typed

474

// length.

475

int totalFreq = tempFirstFreq + tempSecondFreq;

476

477

// This is a workaround to try offsetting the not-enough-demotion which will be done in

478

// calcNormalizedScore in Utils.java.

479

// In calcNormalizedScore the score will be demoted by (1 - 1 / length)

480

// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by

481

// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))

482

const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);

483

multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

484

485

// At this moment, totalFreq is calculated by the following formula:

486

// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))

487

// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

488

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

489

multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

490

491

// This is another workaround to offset the demotion which will be done in

492

// calcNormalizedScore in Utils.java.

493

// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote

494

// the same amount because we already have adjusted the synthetic freq of this "missing or

495

// mistyped space" suggestion candidate above in this method.

496

const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);

497

multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

498

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

499

if (isSpaceProximity) {

500

// A word pair with one space proximity correction

501

if (DEBUG_DICT) {

502

LOGI("Found a word pair with space proximity correction.");

503

}

504

multiplyIntCapped(typedLetterMultiplier, &totalFreq);

505

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);

506

}

507

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

508

multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

return totalFreq;

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

512

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

513

return getSplitTwoWordsSuggestion(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

514

inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

515

}

516

517

bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {

518

return getSplitTwoWordsSuggestion(

519

inputLength, 0, spaceProximityPos, spaceProximityPos + 1,

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

520

inputLength - spaceProximityPos - 1, true);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

521

}

522

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

523

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

524

const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

07a8406

2011-03-03 10:22:10 +0900

[diff] [blame]

525

const int freq, const bool sameLength) const {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

526

// TODO: Demote by edit distance

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

527

int finalFreq = freq * matchWeight;

Jean Chalard

07a8406

2011-03-03 10:22:10 +0900

[diff] [blame]

528

if (skipPos >= 0) {

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

529

if (mInputLength >= 2) {

530

const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE

531

* (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)

532

/ (10 * mInputLength

533

- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

534

if (DEBUG_DICT_FULL) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

535

LOGI("Demotion rate for missing character is %d.", demotionRate);

536

}

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

537

multiplyRate(demotionRate, &finalFreq);

Jean Chalard

07a8406

2011-03-03 10:22:10 +0900

[diff] [blame]

} else {

finalFreq = 0;

}

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

542

if (transposedPos >= 0) multiplyRate(

543

WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

544

if (excessivePos >= 0) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

545

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

546

if (!mProximityInfo->existsAdjacentProximityChars(inputIndex)) {

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

547

// If an excessive character is not adjacent to the left char or the right char,

548

// we will demote this word.

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

549

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

550

}

551

}

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

552

int lengthFreq = TYPED_LETTER_MULTIPLIER;

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

553

multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

554

if (lengthFreq == matchWeight) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

555

// Full exact match

Jean Chalard

8dc754a

2011-01-27 14:20:22 +0900

[diff] [blame]

556

if (depth > 1) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

557

if (DEBUG_DICT) {

558

LOGI("Found full matched word.");

559

}

Jean Chalard

8dc754a

2011-01-27 14:20:22 +0900

[diff] [blame]

560

multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);

561

}

562

if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {

Jean Chalard

a5d5849

2011-02-18 17:50:58 +0900

[diff] [blame]

563

finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);

Jean Chalard

8dc754a

2011-01-27 14:20:22 +0900

[diff] [blame]

564

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

565

} else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

566

// A word with proximity corrections

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

567

if (DEBUG_DICT) {

568

LOGI("Found one proximity correction.");

569

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

570

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

571

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

572

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

573

if (DEBUG_DICT) {

574

LOGI("calc: %d, %d", depth, sameLength);

575

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

576

if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

577

return finalFreq;

578

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

579

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

580

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

581

const int inputIndex, const int skipPos, const int depth) {

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

582

const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex);

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

583

// Skip the ' or other letter and continue deeper

584

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

585

}

586

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

587

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

588

inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

589

const uint8_t* const root, const uint8_t flags, const int pos,

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

590

const int inputIndex, const int matchWeight, const int skipPos,

591

const int excessivePos, const int transposedPos, const int freq, const bool sameLength,

592

int* nextLetters, const int nextLettersSize) {

593

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

594

const bool isSameAsTyped = sameLength ? mProximityInfo->sameAsTyped(word, depth + 1) : false;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

595

if (isSameAsTyped) return;

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

596

597

if (depth >= MIN_SUGGEST_DEPTH) {

598

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,

599

excessivePos, transposedPos, freq, sameLength);

600

if (!isSameAsTyped)

601

addWord(word, depth + 1, finalFreq);

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

602

}

603

604

if (sameLength && depth >= mInputLength && skipPos < 0) {

605

registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);

}

}

Jean Chalard

2011-06-30 19:47:25 +0900

[diff] [blame]

609

bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,

610

const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,

611

const int secondWordLength, const bool isSpaceProximity) {

612

if (inputLength >= MAX_WORD_LENGTH) return false;

613

if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos

614

|| firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)

615

return false;

616

const int newWordLength = firstWordLength + secondWordLength + 1;

617

// Allocating variable length array on stack

618

unsigned short word[newWordLength];

619

const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);

620

if (DEBUG_DICT) {

621

LOGI("First freq: %d", firstFreq);

622

}

623

if (firstFreq <= 0) return false;

624

625

for (int i = 0; i < firstWordLength; ++i) {

word[i] = mWord[i];

}

const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);

630

if (DEBUG_DICT) {

631

LOGI("Second freq: %d", secondFreq);

632

}

633

if (secondFreq <= 0) return false;

634

635

word[firstWordLength] = SPACE;

636

for (int i = (firstWordLength + 1); i < newWordLength; ++i) {

637

word[i] = mWord[i - firstWordLength - 1];

638

}

639

640

int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,

641

secondWordLength, firstFreq, secondFreq, isSpaceProximity);

642

if (DEBUG_DICT) {

643

LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,

644

TYPED_LETTER_MULTIPLIER);

645

}

646

addWord(word, newWordLength, pairFreq);

return true;

}

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

650

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

651

// The following functions will be entirely replaced with new implementations.

652

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

653

const int excessivePos, const int transposedPos,int *nextLetters,

654

const int nextLettersSize) {

655

int initialPosition = initialPos;

656

const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);

657

getWordsRec(count, initialPosition, 0,

658

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

659

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

660

nextLettersSize);

661

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

662

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

663

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

664

const int maxDepth, const bool traverseAllNodes, const int matchWeight,

665

const int inputIndex, const int diffs, const int skipPos, const int excessivePos,

666

const int transposedPos, int *nextLetters, const int nextLettersSize) {

667

int siblingPos = pos;

668

for (int i = 0; i < childrenCount; ++i) {

669

int newCount;

670

int newChildPosition;

671

bool newTraverseAllNodes;

int newMatchRate;

int newInputIndex;

int newDiffs;

int newSiblingPos;

int newOutputIndex;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

678

traverseAllNodes, matchWeight, inputIndex, diffs,

679

skipPos, excessivePos, transposedPos,

680

nextLetters, nextLettersSize,

681

&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,

682

&newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);

683

siblingPos = newSiblingPos;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

684

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

685

if (needsToTraverseChildrenNodes) {

686

getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,

687

newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

688

nextLetters, nextLettersSize);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

689

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

690

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

691

}

692

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

693

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

694

const int inputLength, unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

695

int pos = ROOT_POS;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

696

int count = Dictionary::getCount(DICT_ROOT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

697

int maxFreq = 0;

698

int depth = 0;

699

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

700

bool terminal = false;

701

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

702

mStackChildCount[0] = count;

703

mStackSiblingPos[0] = pos;

704

705

while (depth >= 0) {

706

if (mStackChildCount[depth] > 0) {

707

--mStackChildCount[depth];

708

int firstChildPos;

709

int newFreq;

710

int siblingPos = mStackSiblingPos[depth];

711

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

712

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

713

&siblingPos);

714

mStackSiblingPos[depth] = siblingPos;

715

if (depth == (inputLength - 1)) {

716

// Traverse sibling node

717

if (terminal) {

718

if (newFreq > maxFreq) {

719

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

720

if (DEBUG_DICT && DEBUG_NODE) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

721

#ifdef FLAG_DBG

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

722

char s[inputLength + 1];

723

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

724

s[inputLength] = 0;

725

LOGI("New missing space word found: %d > %d (%s), %d, %d",

726

newFreq, maxFreq, s, inputLength, depth);

satok

2011-07-14 08:32:57 +0900

[diff] [blame]

727

#endif

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

733

// Traverse children nodes

734

++depth;

735

mStackChildCount[depth] = count;

736

mStackSiblingPos[depth] = firstChildPos;

737

}

738

} else {

739

// Traverse parent node

740

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

741

}

742

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

743

744

word[inputLength] = 0;

745

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

746

}

747

748

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

749

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

750

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

751

const int inputIndex = startInputIndex + depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

752

unsigned short c;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

753

*siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,

754

&c, newChildPosition, newTerminal, newFreq);

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

755

const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

756

if (DEBUG_DICT) {

757

assert(inputC <= U_SHORT_MAX);

758

}

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

759

const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

760

const bool matched = (inputC == baseLowerC || inputC == c);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

761

const bool hasChild = *newChildPosition != 0;

762

if (matched) {

763

word[depth] = c;

764

if (DEBUG_DICT && DEBUG_NODE) {

765

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

766

if (*newTerminal) {

767

LOGI("Terminal %d", *newFreq);

768

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

769

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

770

if (hasChild) {

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

771

*newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

778

// Thus newTerminal shouldn't be true.

779

*newTerminal = false;

780

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

781

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

782

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

783

784

// TODO: use uint32_t instead of unsigned short

785

bool UnigramDictionary::isValidWord(unsigned short *word, int length) {

786

if (IS_LATEST_DICT_VERSION) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

787

return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

788

} else {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

789

return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

}

}

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

793

794

// Require strict exact match.

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

795

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

796

int length) const {

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

797

// returns address of bigram data of that word

798

// return -99 if not found

799

800

int count = Dictionary::getCount(DICT_ROOT, &pos);

801

unsigned short currentChar = (unsigned short) word[offset];

802

for (int j = 0; j < count; j++) {

803

unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);

804

int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);

805

int childPos = Dictionary::getAddress(DICT_ROOT, &pos);

806

if (c == currentChar) {

807

if (offset == length - 1) {

if (terminal) {

return (pos+1);

}

} else {

if (childPos != 0) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

813

int t = getBigramPosition(childPos, word, offset + 1, length);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

if (t > 0) {

return t;

}

}

}

}

if (terminal) {

Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);

822

}

823

// There could be two instances of each alphabet - upper and lower case. So continue

824

// looking ...

825

}

826

return NOT_VALID_WORD;

827

}

828

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

829

// The following functions will be modified.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

830

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

831

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

832

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

833

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

834

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

835

int *nextSiblingPosition, int *nextOutputIndex) {

836

if (DEBUG_DICT) {

837

int inputCount = 0;

838

if (skipPos >= 0) ++inputCount;

839

if (excessivePos >= 0) ++inputCount;

840

if (transposedPos >= 0) ++inputCount;

841

assert(inputCount <= 1);

}

unsigned short c;

int childPosition;

bool terminal;

int freq;

bool isSameAsUserTypedLength = false;

848

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

849

const int pos = initialPos;

850

const int depth = initialDepth;

851

const int traverseAllNodes = initialTraverseAllNodes;

852

const int diffs = initialDiffs;

853

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

854

const uint8_t flags = 0; // No flags for now

855

856

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

857

858

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,

859

&c, &childPosition, &terminal, &freq);

860

*nextOutputIndex = depth + 1;

861

862

const bool needsToTraverseChildrenNodes = childPosition != 0;

863

864

// If we are only doing traverseAllNodes, no need to look at the typed characters.

865

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

866

mWord[depth] = c;

867

if (traverseAllNodes && terminal) {

868

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

869

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

870

}

871

if (!needsToTraverseChildrenNodes) return false;

872

*newTraverseAllNodes = traverseAllNodes;

873

*newMatchRate = matchWeight;

874

*newDiffs = diffs;

875

*newInputIndex = inputIndex;

876

} else {

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

877

int inputIndexForProximity = inputIndex;

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

878

879

if (transposedPos >= 0) {

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

880

if (inputIndex == transposedPos) ++inputIndexForProximity;

881

if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

882

}

883

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

884

ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId(

885

inputIndexForProximity, c, skipPos, excessivePos, transposedPos);

886

if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false;

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

887

mWord[depth] = c;

888

// If inputIndex is greater than mInputLength, that means there is no

889

// proximity chars. So, we don't need to check proximity.

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

890

if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

891

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

892

}

893

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

894

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

895

if (isSameAsUserTypedLength && terminal) {

896

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

897

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

898

}

899

if (!needsToTraverseChildrenNodes) return false;

900

// Start traversing all nodes after the index exceeds the user typed length

901

*newTraverseAllNodes = isSameAsUserTypedLength;

902

*newMatchRate = matchWeight;

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

903

*newDiffs = diffs

904

+ ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

905

*newInputIndex = inputIndex + 1;

906

}

907

// Optimization: Prune out words that are too long compared to how much was typed.

908

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

913

// TODO: Check if this can be isSameAsUserTypedLength only.

914

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

915

*newTraverseAllNodes = true;

916

}

917

// get the count of nodes and increment childAddress.

918

*newCount = Dictionary::getCount(DICT_ROOT, &childPosition);

919

*newChildPosition = childPosition;

920

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

921

return needsToTraverseChildrenNodes;

922

}

923

924

#else // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

925

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

926

// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous

927

// interface.

928

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

929

const int inputLength, unsigned short *word) {

930

uint16_t inWord[inputLength];

931

932

for (int i = 0; i < inputLength; ++i) {

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

933

inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i);

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

934

}

935

return getMostFrequentWordLikeInner(inWord, inputLength, word);

936

}

937

938

// This function will take the position of a character array within a CharGroup,

939

// and check it actually like-matches the word in inWord starting at startInputIndex,

940

// that is, it matches it with case and accents squashed.

941

// The function returns true if there was a full match, false otherwise.

942

// The function will copy on-the-fly the characters in the CharGroup to outNewWord.

943

// It will also place the end position of the array in outPos; in outInputIndex,

944

// it will place the index of the first char AFTER the match if there was a match,

945

// and the initial position if there was not. It makes sense because if there was

946

// a match we want to continue searching, but if there was not, we want to go to

947

// the next CharGroup.

948

// In and out parameters may point to the same location. This function takes care

949

// not to use any input parameters after it wrote into its outputs.

950

static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,

951

const uint8_t* const root, const int startPos,

952

const uint16_t* const inWord, const int startInputIndex,

953

int32_t* outNewWord, int* outInputIndex, int* outPos) {

954

const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));

955

int pos = startPos;

956

int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

957

int32_t baseChar = Dictionary::toBaseLowerCase(character);

958

const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]);

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

959

960

if (baseChar != wChar) {

961

*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;

962

*outInputIndex = startInputIndex;

963

return false;

964

}

965

int inputIndex = startInputIndex;

966

outNewWord[inputIndex] = character;

967

if (hasMultipleChars) {

968

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

969

while (NOT_A_CHARACTER != character) {

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

970

baseChar = Dictionary::toBaseLowerCase(character);

971

if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) {

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

972

*outPos = BinaryFormat::skipOtherCharacters(root, pos);

973

*outInputIndex = startInputIndex;

974

return false;

975

}

976

outNewWord[inputIndex] = character;

977

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

978

}

979

}

980

*outInputIndex = inputIndex + 1;

*outPos = pos;

return true;

}

// This function is invoked when a word like the word searched for is found.

986

// It will compare the frequency to the max frequency, and if greater, will

987

// copy the word into the output buffer. In output value maxFreq, it will

988

// write the new maximum frequency if it changed.

989

static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,

990

short unsigned int* outWord, int* maxFreq) {

991

if (freq > *maxFreq) {

992

for (int q = 0; q < length; ++q)

993

outWord[q] = newWord[q];

outWord[length] = 0;

*maxFreq = freq;

}

}

// Will find the highest frequency of the words like the one passed as an argument,

1000

// that is, everything that only differs by case/accents.

1001

int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,

1002

const int length, short unsigned int* outWord) {

1003

int32_t newWord[MAX_WORD_LENGTH_INTERNAL];

1004

int depth = 0;

1005

int maxFreq = -1;

1006

const uint8_t* const root = DICT_ROOT;

1007

1008

mStackChildCount[0] = root[0];

1009

mStackInputIndex[0] = 0;

1010

mStackSiblingPos[0] = 1;

1011

while (depth >= 0) {

1012

const int charGroupCount = mStackChildCount[depth];

1013

int pos = mStackSiblingPos[depth];

1014

for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {

1015

int inputIndex = mStackInputIndex[depth];

1016

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);

1017

// Test whether all chars in this group match with the word we are searching for. If so,

1018

// we want to traverse its children (or if the length match, evaluate its frequency).

1019

// Note that this function will output the position regardless, but will only write

1020

// into inputIndex if there is a match.

1021

const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,

1022

inputIndex, newWord, &inputIndex, &pos);

1023

if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {

1024

const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);

1025

onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);

1026

}

1027

pos = BinaryFormat::skipFrequency(flags, pos);

1028

const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);

1029

const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);

1030

// If we had a match and the word has children, we want to traverse them. We don't have

1031

// to traverse words longer than the one we are searching for, since they will not match

1032

// anyway, so don't traverse unless inputIndex < length.

1033

if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {

1034

// Save position for this depth, to get back to this once children are done

1035

mStackChildCount[depth] = charGroupIndex;

1036

mStackSiblingPos[depth] = siblingPos;

1037

// Prepare stack values for next depth

1038

++depth;

1039

int childrenPos = childrenNodePos;

1040

mStackChildCount[depth] =

1041

BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);

1042

mStackSiblingPos[depth] = childrenPos;

1043

mStackInputIndex[depth] = inputIndex;

1044

pos = childrenPos;

1045

// Go to the next depth level.

++depth;

break;

} else {

// No match, or no children, or word too long to ever match: go the next sibling.

pos = siblingPos;

}

}

--depth;

}

return maxFreq;

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1058

bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {

Jean Chalard

6a0e964

2011-07-25 18:17:11 +0900

[diff] [blame^]

1059

return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1060

}

1061

1062

// TODO: remove this function.

1063

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

int length) const {

return -1;

}

// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.

1069

// If the return value is false, then the caller should read in the output "nextSiblingPosition"

1070

// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.

1071

// It is worthy to note that when false is returned, the output values other than

1072

// nextSiblingPosition are undefined.

1073

// If the return value is true, then the caller must proceed to traverse the children of this

1074

// node. processCurrentNode will output the information about the children: their count in

1075

// newCount, their position in newChildrenPosition, the traverseAllNodes flag in

1076

// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the

1077

// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into

1078

// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when

1079

// there aren't any more nodes at this level, it merely returns the address of the first byte after

1080

// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any

1081

// given level, as output into newCount when traversing this level's parent.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1082

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

1083

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

1084

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1085

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1086

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

Jean Chalard

432789a

2011-06-30 17:50:48 +0900

[diff] [blame]

1087

int *nextSiblingPosition, int *newOutputIndex) {

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1088

if (DEBUG_DICT) {

1089

int inputCount = 0;

1090

if (skipPos >= 0) ++inputCount;

1091

if (excessivePos >= 0) ++inputCount;

1092

if (transposedPos >= 0) ++inputCount;

1093

assert(inputCount <= 1);

1094

}

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1095

int pos = initialPos;

1096

int depth = initialDepth;

1097

int traverseAllNodes = initialTraverseAllNodes;

1098

int diffs = initialDiffs;

1099

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1100

// Flags contain the following information:

1101

// - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:

1102

// - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address

1103

// is on the specified number of bytes.

1104

// - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.

1105

// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.

1106

// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)

1107

// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not

1108

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);

1109

const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1110

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1111

// This gets only ONE character from the stream. Next there will be:

1112

// if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node

1113

// else if FLAG_IS_TERMINAL: the frequency

1114

// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address

1115

// Note that you can't have a node that both is not a terminal and has no children.

1116

int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);

1117

assert(NOT_A_CHARACTER != c);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1118

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1119

// We are going to loop through each character and make it look like it's a different

1120

// node each time. To do that, we will process characters in this node in order until

1121

// we find the character terminator. This is signalled by getCharCode* returning

1122

// NOT_A_CHARACTER.

1123

// As a special case, if there is only one character in this node, we must not read the

1124

// next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.

1125

// This way, each loop run will look like a "virtual node".

1126

do {

1127

// We prefetch the next char. If 'c' is the last char of this node, we will have

1128

// NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node

1129

// should behave as a terminal or not and whether we have children.

1130

const int32_t nextc = hasMultipleChars

1131

? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;

1132

const bool isLastChar = (NOT_A_CHARACTER == nextc);

1133

// If there are more chars in this nodes, then this virtual node is not a terminal.

1134

// If we are on the last char, this virtual node is a terminal if this node is.

1135

const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags));

1136

// If there are more chars in this node, then this virtual node has children.

1137

// If we are on the last char, this virtual node has children if this node has.

1138

const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1139

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1140

// This has to be done for each virtual char (this forwards the "inputIndex" which

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

1141

// is the index in the user-inputted chars, as read by proximity chars.

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1142

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

1143

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

1144

mWord[depth] = c;

1145

if (traverseAllNodes && isTerminal) {

1146

// The frequency should be here, because we come here only if this is actually

1147

// a terminal node, and we are on its last char.

1148

const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);

1149

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1150

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

1151

}

1152

if (!hasChildren) {

1153

// If we don't have children here, that means we finished processing all

1154

// characters of this node (we are on the last virtual node), AND we are in

1155

// traverseAllNodes mode, which means we are searching for *completions*. We

1156

// should skip the frequency if we have a terminal, and report the position

1157

// of the next sibling. We don't have to return other values because we are

1158

// returning false, as in "don't traverse children".

1159

if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos);

1160

*nextSiblingPosition =

1161

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1162

return false;

1163

}

1164

} else {

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

1165

int inputIndexForProximity = inputIndex;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1166

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1167

if (transposedPos >= 0) {

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

1168

if (inputIndex == transposedPos) ++inputIndexForProximity;

1169

if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1170

}

1171

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

1172

int matchedProximityCharId = mProximityInfo->getMatchedProximityId(

1173

inputIndexForProximity, c, skipPos, excessivePos, transposedPos);

1174

if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1175

// We found that this is an unrelated character, so we should give up traversing

1176

// this node and its children entirely.

1177

// However we may not be on the last virtual node yet so we skip the remaining

1178

// characters in this node, the frequency if it's there, read the next sibling

1179

// position to output it, then return false.

1180

// We don't have to output other values because we return false, as in

1181

// "don't traverse children".

1182

if (!isLastChar) {

1183

pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);

1184

}

1185

pos = BinaryFormat::skipFrequency(flags, pos);

1186

*nextSiblingPosition =

1187

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

return false;

}

mWord[depth] = c;

// If inputIndex is greater than mInputLength, that means there is no

1192

// proximity chars. So, we don't need to check proximity.

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

1193

if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1194

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

1195

}

1196

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

1197

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

1198

if (isSameAsUserTypedLength && isTerminal) {

1199

const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);

1200

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1201

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

1202

}

1203

// This character matched the typed character (enough to traverse the node at least)

1204

// so we just evaluated it. Now we should evaluate this virtual node's children - that

1205

// is, if it has any. If it has no children, we're done here - so we skip the end of

1206

// the node, output the siblings position, and return false "don't traverse children".

1207

// Note that !hasChildren implies isLastChar, so we know we don't have to skip any

1208

// remaining char in this group for there can't be any.

1209

if (!hasChildren) {

1210

pos = BinaryFormat::skipFrequency(flags, pos);

1211

*nextSiblingPosition =

1212

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1213

return false;

1214

}

1215

// Start traversing all nodes after the index exceeds the user typed length

1216

traverseAllNodes = isSameAsUserTypedLength;

satok

2011-07-14 15:43:42 +0900

[diff] [blame]

1217

diffs = diffs

1218

+ ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1219

// Finally, we are ready to go to the next character, the next "virtual node".

1220

// We should advance the input index.

1221

// We do this in this branch of the 'if traverseAllNodes' because we are still matching

1222

// characters to input; the other branch is not matching them but searching for

1223

// completions, this is why it does not have to do it.

1224

++inputIndex;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1225

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1226

// Optimization: Prune out words that are too long compared to how much was typed.

1227

if (depth >= maxDepth || diffs > mMaxEditDistance) {

1228

// We are giving up parsing this node and its children. Skip the rest of the node,

1229

// output the sibling position, and return that we don't want to traverse children.

1230

if (!isLastChar) {

1231

pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);

1232

}

1233

pos = BinaryFormat::skipFrequency(flags, pos);

1234

*nextSiblingPosition =

1235

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1236

return false;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1237

}

1238

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1239

// Prepare for the next character. Promote the prefetched char to current char - the loop

1240

// will take care of prefetching the next. If we finally found our last char, nextc will

1241

// contain NOT_A_CHARACTER.

1242

c = nextc;

1243

// Also, the next char is one "virtual node" depth more than this char.

1244

++depth;

1245

} while (NOT_A_CHARACTER != c);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1246

1247

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1248

// Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength.

1249

if (mInputLength <= *newInputIndex) {

1250

traverseAllNodes = true;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1251

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1252

1253

// All the output values that are purely computation by this function are held in local

1254

// variables. Output them to the caller.

1255

*newTraverseAllNodes = traverseAllNodes;

1256

*newMatchRate = matchWeight;

1257

*newDiffs = diffs;

1258

*newInputIndex = inputIndex;

1259

*newOutputIndex = depth;

1260

1261

// Now we finished processing this node, and we want to traverse children. If there are no

1262

// children, we can't come here.

1263

assert(BinaryFormat::hasChildrenInFlags(flags));

1264

1265

// If this node was a terminal it still has the frequency under the pointer (it may have been

1266

// read, but not skipped - see readFrequencyWithoutMovingPointer).

1267

// Next come the children position, then possibly attributes (attributes are bigrams only for

1268

// now, maybe something related to shortcuts in the future).

1269

// Once this is read, we still need to output the number of nodes in the immediate children of

1270

// this node, so we read and output it before returning true, as in "please traverse children".

1271

pos = BinaryFormat::skipFrequency(flags, pos);

1272

int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);

1273

*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1274

*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);

1275

*newChildrenPosition = childrenPos;

1276

return true;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1277

}

1278

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

1279

#endif // NEW_DICTIONARY_FORMAT

1280

satok