Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <string.h>

20

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

21

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

22

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

23

#include "basechars.h"

24

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

25

#include "dictionary.h"

26

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

27

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

28

#ifdef NEW_DICTIONARY_FORMAT

29

#include "binary_format.h"

30

#endif // NEW_DICTIONARY_FORMAT

31

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

32

namespace latinime {

33

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

34

const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =

{ { 'a', 'e' },

{ 'o', 'e' },

{ 'u', 'e' } };

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

39

// TODO: check the header

40

UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

41

int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

42

const bool isLatestDictVersion)

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

43

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

44

: DICT_ROOT(streamStart),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

45

#else // NEW_DICTIONARY_FORMAT

46

: DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),

47

#endif // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

48

MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

49

MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),

50

TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

51

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

52

ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

53

#else // NEW_DICTIONARY_FORMAT

54

// TODO : remove this variable.

55

ROOT_POS(0),

56

#endif // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

57

BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),

58

MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

59

if (DEBUG_DICT) {

60

LOGI("UnigramDictionary - constructor");

61

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

62

}

63

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

64

UnigramDictionary::~UnigramDictionary() {}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

65

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

66

static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,

67

const int MAX_PROXIMITY_CHARS) {

68

return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;

69

}

70

71

bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {

72

73

// There can't be a digraph if we don't have at least 2 characters to examine

74

if (i + 2 > codesSize) return false;

75

76

// Search for the first char of some digraph

77

int lastDigraphIndex = -1;

78

const int thisChar = codes[i * MAX_PROXIMITY_CHARS];

79

for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;

80

lastDigraphIndex >= 0; --lastDigraphIndex) {

81

if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;

82

}

83

// No match: return early

84

if (lastDigraphIndex < 0) return false;

85

86

// It's an interesting digraph if the second char matches too.

87

return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];

88

}

89

90

// Mostly the same arguments as the non-recursive version, except:

91

// codes is the original value. It points to the start of the work buffer, and gets passed as is.

92

// codesSize is the size of the user input (thus, it is the size of codesSrc).

93

// codesDest is the current point in the work buffer.

94

// codesSrc is the current point in the user-input, original, content-unmodified buffer.

95

// codesRemain is the remaining size in codesSrc.

96

void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,

97

const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,

98

const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,

satok

3c4bb77

2011-03-04 22:50:19 -0800

[diff] [blame]

99

const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

100

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

101

if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {

102

for (int i = 0; i < codesRemain; ++i) {

103

if (isDigraph(codesSrc, i, codesRemain)) {

104

// Found a digraph. We will try both spellings. eg. the word is "pruefen"

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

105

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

106

// Copy the word up to the first char of the digraph, then continue processing

107

// on the remaining part of the word, skipping the second char of the digraph.

108

// In our example, copy "pru" and continue running on "fen"

109

// Make i the index of the second char of the digraph for simplicity. Forgetting

110

// to do that results in an infinite recursion so take care!

111

++i;

112

memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);

113

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

114

codesBuffer, codesBufferSize, flags,

115

codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,

116

currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,

117

frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

118

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

119

// Copy the second char of the digraph in place, then continue processing on

120

// the remaining part of the word.

121

// In our example, after "pru" in the buffer copy the "e", and continue on "fen"

122

memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,

123

BYTES_IN_ONE_CHAR);

124

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

125

codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,

126

codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,

127

outWords, frequencies);

128

return;

129

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

}

}

// If we come here, we hit the end of the word: let's check it against the dictionary.

134

// In our example, we'll come here once for "prufen" and then once for "pruefen".

135

// If the word contains several digraphs, we'll come it for the product of them.

136

// eg. if the word is "ueberpruefen" we'll test, in order, against

137

// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".

138

const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;

139

if (0 != remainingBytes)

140

memcpy(codesDest, codesSrc, remainingBytes);

141

142

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

143

(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);

144

}

145

146

int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,

147

const int *ycoordinates, const int *codes, const int codesSize, const int flags,

148

unsigned short *outWords, int *frequencies) {

149

150

if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)

151

{ // Incrementally tune the word and try all possibilities

152

int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];

153

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

154

codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

155

} else { // Normal processing

156

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,

157

outWords, frequencies);

158

}

159

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

160

PROF_START(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

161

// Get the word count

162

int suggestedWordsCount = 0;

163

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

164

suggestedWordsCount++;

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

169

/// Print the returned words

170

for (int j = 0; j < suggestedWordsCount; ++j) {

171

short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;

172

char s[MAX_WORD_LENGTH];

173

for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];

174

LOGI("%s %i", s, mFrequencies[j]);

175

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

176

LOGI("Next letters: ");

177

for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {

178

if (mNextLettersFrequency[k] > 0) {

179

LOGI("%c = %d,", k, mNextLettersFrequency[k]);

180

}

181

}

182

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

183

PROF_END(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

184

PROF_CLOSE;

185

return suggestedWordsCount;

186

}

187

188

void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,

189

const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,

190

unsigned short *outWords, int *frequencies) {

191

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

192

PROF_OPEN;

193

PROF_START(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

194

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

195

if (DEBUG_DICT) assert(codesSize == mInputLength);

196

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

197

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

198

PROF_END(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

199

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

200

PROF_START(1);

Tadashi G. Takaoka

887f11e

2011-02-10 20:53:58 +0900

[diff] [blame]

201

getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

202

PROF_END(1);

203

204

PROF_START(2);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

205

// Suggestion with missing character

206

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

207

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

208

if (DEBUG_DICT) {

209

LOGI("--- Suggest missing characters %d", i);

210

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

211

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

212

}

213

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

214

PROF_END(2);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

215

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

216

PROF_START(3);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

217

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

218

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

219

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

220

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

221

if (DEBUG_DICT) {

222

LOGI("--- Suggest excessive characters %d", i);

223

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

224

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

225

}

226

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

227

PROF_END(3);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

228

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

229

PROF_START(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

230

// Suggestion with transposed characters

231

// Only suggest words that length is mInputLength

232

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

233

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

234

if (DEBUG_DICT) {

235

LOGI("--- Suggest transposed characters %d", i);

236

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

237

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

238

}

239

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

240

PROF_END(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

241

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

242

PROF_START(5);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

243

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

244

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

245

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

246

for (int i = 1; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

247

if (DEBUG_DICT) {

248

LOGI("--- Suggest missing space characters %d", i);

249

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

250

getMissingSpaceWords(mInputLength, i);

251

}

252

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

253

PROF_END(5);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

254

255

PROF_START(6);

Jean Chalard

e93b1f22

2011-06-01 17:12:25 +0900

[diff] [blame]

256

if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

257

// The first and last "mistyped spaces" are taken care of by excessive character handling

258

for (int i = 1; i < codesSize - 1; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

259

if (DEBUG_DICT) {

260

LOGI("--- Suggest words with proximity space %d", i);

261

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

262

const int x = xcoordinates[i];

263

const int y = ycoordinates[i];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

264

if (DEBUG_PROXIMITY_INFO) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

265

LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",

266

i, x, y, proximityInfo->hasSpaceProximity(x, y));

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

267

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

268

if (proximityInfo->hasSpaceProximity(x, y)) {

269

getMistypedSpaceWords(mInputLength, i);

270

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

271

}

272

}

273

PROF_END(6);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

274

}

275

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

276

void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,

277

unsigned short *outWords, int *frequencies) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

278

if (DEBUG_DICT) {

279

LOGI("initSuggest");

280

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

281

mFrequencies = frequencies;

282

mOutputChars = outWords;

283

mInputCodes = codes;

284

mInputLength = codesSize;

285

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

286

}

287

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

288

static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

289

if (c < nextLettersSize) {

nextLetters[c]++;

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

294

// TODO: We need to optimize addWord by using STL or something

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

295

// TODO: This needs to take an const unsigned short* and not tinker with its contents

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

296

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

297

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

298

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

299

char s[length + 1];

300

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

301

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

302

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

303

if (length > MAX_WORD_LENGTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

304

if (DEBUG_DICT) {

305

LOGI("Exceeded max word length.");

306

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

307

return false;

308

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

309

310

// Find the right insertion point

311

int insertAt = 0;

312

while (insertAt < MAX_WORDS) {

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

313

// TODO: How should we sort words with the same frequency?

314

if (frequency > mFrequencies[insertAt]) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

320

if (DEBUG_DICT) {

321

char s[length + 1];

322

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

323

LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

324

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

325

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

326

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

327

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

328

mFrequencies[insertAt] = frequency;

329

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

330

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

331

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

332

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

337

if (DEBUG_DICT) {

338

LOGI("Added word at %d", insertAt);

339

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

345

static inline unsigned short toBaseLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

346

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

347

c = BASE_CHARS[c];

348

}

349

if (c >='A' && c <= 'Z') {

350

c |= 32;

351

} else if (c > 127) {

352

c = latin_tolower(c);

}

return c;

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

357

bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

358

if (length != mInputLength) {

359

return false;

360

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

361

const int *inputCodes = mInputCodes;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

362

while (length--) {

363

if ((unsigned int) *inputCodes != (unsigned int) *word) {

364

return false;

365

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

366

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

372

static const char QUOTE = '\'';

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

373

static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

374

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

375

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

376

const int excessivePos, const int transposedPos, int *nextLetters,

377

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

378

if (DEBUG_DICT) {

379

LOGI("getSuggestionCandidates %d", maxDepth);

380

assert(transposedPos + 1 < mInputLength);

381

assert(excessivePos < mInputLength);

382

assert(missingPos < mInputLength);

383

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

384

int rootPosition = ROOT_POS;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

385

// Get the number of children of root, then increment the position

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

386

int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

387

int depth = 0;

388

389

mStackChildCount[0] = childCount;

390

mStackTraverseAll[0] = (mInputLength <= 0);

391

mStackNodeFreq[0] = 1;

392

mStackInputIndex[0] = 0;

393

mStackDiffs[0] = 0;

394

mStackSiblingPos[0] = rootPosition;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

395

mStackOutputIndex[0] = 0;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

396

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

397

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

398

while (depth >= 0) {

399

if (mStackChildCount[depth] > 0) {

400

--mStackChildCount[depth];

401

bool traverseAllNodes = mStackTraverseAll[depth];

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

402

int matchWeight = mStackNodeFreq[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

403

int inputIndex = mStackInputIndex[depth];

404

int diffs = mStackDiffs[depth];

405

int siblingPos = mStackSiblingPos[depth];

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

406

int outputIndex = mStackOutputIndex[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

407

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

408

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

409

// needsToTraverseChildrenNodes should be false

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

410

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

411

maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,

412

excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,

413

&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

414

&siblingPos, &outputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

415

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

416

mStackSiblingPos[depth] = siblingPos;

417

if (needsToTraverseChildrenNodes) {

418

// Goes to child node

419

++depth;

420

mStackChildCount[depth] = childCount;

421

mStackTraverseAll[depth] = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

422

mStackNodeFreq[depth] = matchWeight;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

423

mStackInputIndex[depth] = inputIndex;

424

mStackDiffs[depth] = diffs;

425

mStackSiblingPos[depth] = firstChildPos;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

426

mStackOutputIndex[depth] = outputIndex;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

427

}

428

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

429

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

435

static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;

436

static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {

437

return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);

438

}

439

440

static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;

441

inline static void multiplyIntCapped(const int multiplier, int *base) {

442

const int temp = *base;

443

if (temp != S_INT_MAX) {

444

// Branch if multiplier == 2 for the optimization

445

if (multiplier == 2) {

446

*base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;

447

} else {

448

const int tempRetval = temp * multiplier;

449

*base = tempRetval >= temp ? tempRetval : S_INT_MAX;

}

}

}

inline static int powerIntCapped(const int base, const int n) {

satok

0b6b0a5

2011-04-27 16:29:27 +0900

[diff] [blame]

455

if (base == 2) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

456

return n < 31 ? 1 << n : S_INT_MAX;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

457

} else {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

458

int ret = base;

459

for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);

return ret;

}

}

inline static void multiplyRate(const int rate, int *freq) {

465

if (*freq != S_INT_MAX) {

466

if (*freq > 1000000) {

467

*freq /= 100;

468

multiplyIntCapped(rate, freq);

469

} else {

470

multiplyIntCapped(rate, freq);

471

*freq /= 100;

472

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

}

}

satok

2011-04-19 13:58:42 +0900

[diff] [blame]

476

inline static int calcFreqForSplitTwoWords(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

477

const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,

478

const int firstFreq, const int secondFreq, const bool isSpaceProximity) {

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

479

if (firstWordLength == 0 || secondWordLength == 0) {

480

return 0;

481

}

482

const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);

483

int tempFirstFreq = firstFreq;

484

multiplyRate(firstDemotionRate, &tempFirstFreq);

485

486

const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);

487

int tempSecondFreq = secondFreq;

488

multiplyRate(secondDemotionRate, &tempSecondFreq);

489

490

const int totalLength = firstWordLength + secondWordLength;

491

492

// Promote pairFreq with multiplying by 2, because the word length is the same as the typed

493

// length.

494

int totalFreq = tempFirstFreq + tempSecondFreq;

495

496

// This is a workaround to try offsetting the not-enough-demotion which will be done in

497

// calcNormalizedScore in Utils.java.

498

// In calcNormalizedScore the score will be demoted by (1 - 1 / length)

499

// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by

500

// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))

501

const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);

502

multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

503

504

// At this moment, totalFreq is calculated by the following formula:

505

// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))

506

// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

507

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

508

multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

509

510

// This is another workaround to offset the demotion which will be done in

511

// calcNormalizedScore in Utils.java.

512

// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote

513

// the same amount because we already have adjusted the synthetic freq of this "missing or

514

// mistyped space" suggestion candidate above in this method.

515

const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);

516

multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

517

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

518

if (isSpaceProximity) {

519

// A word pair with one space proximity correction

520

if (DEBUG_DICT) {

521

LOGI("Found a word pair with space proximity correction.");

522

}

523

multiplyIntCapped(typedLetterMultiplier, &totalFreq);

524

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);

525

}

526

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

527

multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

return totalFreq;

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

531

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

532

return getSplitTwoWordsSuggestion(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

533

inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

534

}

535

536

bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {

537

return getSplitTwoWordsSuggestion(

538

inputLength, 0, spaceProximityPos, spaceProximityPos + 1,

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

539

inputLength - spaceProximityPos - 1, true);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

540

}

541

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

542

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

543

const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

544

const int freq, const bool sameLength) const {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

545

// TODO: Demote by edit distance

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

546

int finalFreq = freq * matchWeight;

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

547

if (skipPos >= 0) {

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

548

if (mInputLength >= 2) {

549

const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE

550

* (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)

551

/ (10 * mInputLength

552

- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

553

if (DEBUG_DICT_FULL) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

554

LOGI("Demotion rate for missing character is %d.", demotionRate);

555

}

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

556

multiplyRate(demotionRate, &finalFreq);

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

} else {

finalFreq = 0;

}

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

561

if (transposedPos >= 0) multiplyRate(

562

WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

563

if (excessivePos >= 0) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

564

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

565

if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

566

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

567

}

568

}

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

569

int lengthFreq = TYPED_LETTER_MULTIPLIER;

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

570

multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

571

if (lengthFreq == matchWeight) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

572

// Full exact match

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

573

if (depth > 1) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

574

if (DEBUG_DICT) {

575

LOGI("Found full matched word.");

576

}

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

577

multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);

578

}

579

if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

580

finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

581

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

582

} else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

583

// A word with proximity corrections

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

584

if (DEBUG_DICT) {

585

LOGI("Found one proximity correction.");

586

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

587

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

588

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

589

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

590

if (DEBUG_DICT) {

591

LOGI("calc: %d, %d", depth, sameLength);

592

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

593

if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

594

return finalFreq;

595

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

596

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

597

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

598

const int inputIndex, const int skipPos, const int depth) {

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

599

const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

600

// Skip the ' or other letter and continue deeper

601

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

602

}

603

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

604

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

605

const int inputLength) const {

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

606

if (inputIndex < 0 || inputIndex >= inputLength) return false;

607

const int currentChar = *getInputCharsAt(inputIndex);

608

const int leftIndex = inputIndex - 1;

609

if (leftIndex >= 0) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

610

const int *leftChars = getInputCharsAt(leftIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

611

int i = 0;

612

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

613

if (leftChars[i++] == currentChar) return true;

614

}

615

}

616

const int rightIndex = inputIndex + 1;

617

if (rightIndex < inputLength) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

618

const int *rightChars = getInputCharsAt(rightIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

619

int i = 0;

620

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

621

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

627

// In the following function, c is the current character of the dictionary word

628

// currently examined.

629

// currentChars is an array containing the keys close to the character the

630

// user actually typed at the same position. We want to see if c is in it: if so,

631

// then the word contains at that position a character close to what the user

632

// typed.

633

// What the user typed is actually the first character of the array.

634

// Notice : accented characters do not have a proximity list, so they are alone

635

// in their list. The non-accented version of the character should be considered

636

// "close", but not the other keys close to the non-accented version.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

637

inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(

638

const int *currentChars, const unsigned short c, const int skipPos,

639

const int excessivePos, const int transposedPos) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

640

const unsigned short baseLowerC = toBaseLowerCase(c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

641

642

// The first char in the array is what user typed. If it matches right away,

643

// that means the user typed that same char for this pos.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

644

if (currentChars[0] == baseLowerC || currentChars[0] == c)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

645

return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;

646

647

// If one of those is true, we should not check for close characters at all.

648

if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)

649

return UNRELATED_CHAR;

650

651

// If the non-accented, lowercased version of that first character matches c,

652

// then we have a non-accented version of the accented character the user

653

// typed. Treat it as a close char.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

654

if (toBaseLowerCase(currentChars[0]) == baseLowerC)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

655

return NEAR_PROXIMITY_CHAR;

656

657

// Not an exact nor an accent-alike match: search the list of close keys

658

int j = 1;

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

659

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

660

const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

661

if (matched) return NEAR_PROXIMITY_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

662

++j;

663

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

664

665

// Was not included, signal this as an unrelated character.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

666

return UNRELATED_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

667

}

668

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

669

inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

670

const uint8_t* const root, const uint8_t flags, const int pos,

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

671

const int inputIndex, const int matchWeight, const int skipPos,

672

const int excessivePos, const int transposedPos, const int freq, const bool sameLength,

673

int* nextLetters, const int nextLettersSize) {

674

675

const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

676

if (isSameAsTyped) return;

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

677

678

if (depth >= MIN_SUGGEST_DEPTH) {

679

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,

680

excessivePos, transposedPos, freq, sameLength);

681

if (!isSameAsTyped)

682

addWord(word, depth + 1, finalFreq);

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

683

}

684

685

if (sameLength && depth >= mInputLength && skipPos < 0) {

686

registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);

}

}

Jean Chalard

2011-06-30 19:47:25 +0900

[diff] [blame]

690

bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,

691

const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,

692

const int secondWordLength, const bool isSpaceProximity) {

693

if (inputLength >= MAX_WORD_LENGTH) return false;

694

if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos

695

|| firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)

696

return false;

697

const int newWordLength = firstWordLength + secondWordLength + 1;

698

// Allocating variable length array on stack

699

unsigned short word[newWordLength];

700

const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);

701

if (DEBUG_DICT) {

702

LOGI("First freq: %d", firstFreq);

703

}

704

if (firstFreq <= 0) return false;

705

706

for (int i = 0; i < firstWordLength; ++i) {

word[i] = mWord[i];

}

const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);

711

if (DEBUG_DICT) {

712

LOGI("Second freq: %d", secondFreq);

713

}

714

if (secondFreq <= 0) return false;

715

716

word[firstWordLength] = SPACE;

717

for (int i = (firstWordLength + 1); i < newWordLength; ++i) {

718

word[i] = mWord[i - firstWordLength - 1];

719

}

720

721

int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,

722

secondWordLength, firstFreq, secondFreq, isSpaceProximity);

723

if (DEBUG_DICT) {

724

LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,

725

TYPED_LETTER_MULTIPLIER);

726

}

727

addWord(word, newWordLength, pairFreq);

return true;

}

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

731

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

732

// The following functions will be entirely replaced with new implementations.

733

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

734

const int excessivePos, const int transposedPos,int *nextLetters,

735

const int nextLettersSize) {

736

int initialPosition = initialPos;

737

const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);

738

getWordsRec(count, initialPosition, 0,

739

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

740

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

741

nextLettersSize);

742

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

743

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

744

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

745

const int maxDepth, const bool traverseAllNodes, const int matchWeight,

746

const int inputIndex, const int diffs, const int skipPos, const int excessivePos,

747

const int transposedPos, int *nextLetters, const int nextLettersSize) {

748

int siblingPos = pos;

749

for (int i = 0; i < childrenCount; ++i) {

750

int newCount;

751

int newChildPosition;

752

bool newTraverseAllNodes;

int newMatchRate;

int newInputIndex;

int newDiffs;

int newSiblingPos;

int newOutputIndex;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

759

traverseAllNodes, matchWeight, inputIndex, diffs,

760

skipPos, excessivePos, transposedPos,

761

nextLetters, nextLettersSize,

762

&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,

763

&newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);

764

siblingPos = newSiblingPos;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

765

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

766

if (needsToTraverseChildrenNodes) {

767

getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,

768

newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

769

nextLetters, nextLettersSize);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

770

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

771

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

772

}

773

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

774

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

775

const int inputLength, unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

776

int pos = ROOT_POS;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

777

int count = Dictionary::getCount(DICT_ROOT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

778

int maxFreq = 0;

779

int depth = 0;

780

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

781

bool terminal = false;

782

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

783

mStackChildCount[0] = count;

784

mStackSiblingPos[0] = pos;

785

786

while (depth >= 0) {

787

if (mStackChildCount[depth] > 0) {

788

--mStackChildCount[depth];

789

int firstChildPos;

790

int newFreq;

791

int siblingPos = mStackSiblingPos[depth];

792

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

793

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

794

&siblingPos);

795

mStackSiblingPos[depth] = siblingPos;

796

if (depth == (inputLength - 1)) {

797

// Traverse sibling node

798

if (terminal) {

799

if (newFreq > maxFreq) {

800

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

801

if (DEBUG_DICT && DEBUG_NODE) {

802

char s[inputLength + 1];

803

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

804

s[inputLength] = 0;

805

LOGI("New missing space word found: %d > %d (%s), %d, %d",

806

newFreq, maxFreq, s, inputLength, depth);

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

812

// Traverse children nodes

813

++depth;

814

mStackChildCount[depth] = count;

815

mStackSiblingPos[depth] = firstChildPos;

816

}

817

} else {

818

// Traverse parent node

819

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

820

}

821

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

822

823

word[inputLength] = 0;

824

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

825

}

826

827

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

828

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

829

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

830

const int inputIndex = startInputIndex + depth;

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

831

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

832

unsigned short c;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

833

*siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,

834

&c, newChildPosition, newTerminal, newFreq);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

835

const unsigned int inputC = currentChars[0];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

836

if (DEBUG_DICT) {

837

assert(inputC <= U_SHORT_MAX);

838

}

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

839

const unsigned short baseLowerC = toBaseLowerCase(c);

840

const bool matched = (inputC == baseLowerC || inputC == c);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

841

const bool hasChild = *newChildPosition != 0;

842

if (matched) {

843

word[depth] = c;

844

if (DEBUG_DICT && DEBUG_NODE) {

845

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

846

if (*newTerminal) {

847

LOGI("Terminal %d", *newFreq);

848

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

849

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

850

if (hasChild) {

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

851

*newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

858

// Thus newTerminal shouldn't be true.

859

*newTerminal = false;

860

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

861

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

862

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

863

864

// TODO: use uint32_t instead of unsigned short

865

bool UnigramDictionary::isValidWord(unsigned short *word, int length) {

866

if (IS_LATEST_DICT_VERSION) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

867

return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

868

} else {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

869

return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

}

}

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

873

874

// Require strict exact match.

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

875

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

876

int length) const {

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

877

// returns address of bigram data of that word

878

// return -99 if not found

879

880

int count = Dictionary::getCount(DICT_ROOT, &pos);

881

unsigned short currentChar = (unsigned short) word[offset];

882

for (int j = 0; j < count; j++) {

883

unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);

884

int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);

885

int childPos = Dictionary::getAddress(DICT_ROOT, &pos);

886

if (c == currentChar) {

887

if (offset == length - 1) {

if (terminal) {

return (pos+1);

}

} else {

if (childPos != 0) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

893

int t = getBigramPosition(childPos, word, offset + 1, length);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

if (t > 0) {

return t;

}

}

}

}

if (terminal) {

Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);

902

}

903

// There could be two instances of each alphabet - upper and lower case. So continue

904

// looking ...

905

}

906

return NOT_VALID_WORD;

907

}

908

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

909

// The following functions will be modified.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

910

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

911

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

912

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

913

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

914

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

915

int *nextSiblingPosition, int *nextOutputIndex) {

916

if (DEBUG_DICT) {

917

int inputCount = 0;

918

if (skipPos >= 0) ++inputCount;

919

if (excessivePos >= 0) ++inputCount;

920

if (transposedPos >= 0) ++inputCount;

921

assert(inputCount <= 1);

}

unsigned short c;

int childPosition;

bool terminal;

int freq;

bool isSameAsUserTypedLength = false;

928

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

929

const int pos = initialPos;

930

const int depth = initialDepth;

931

const int traverseAllNodes = initialTraverseAllNodes;

932

const int diffs = initialDiffs;

933

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

934

const uint8_t flags = 0; // No flags for now

935

936

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

937

938

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,

939

&c, &childPosition, &terminal, &freq);

940

*nextOutputIndex = depth + 1;

941

942

const bool needsToTraverseChildrenNodes = childPosition != 0;

943

944

// If we are only doing traverseAllNodes, no need to look at the typed characters.

945

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

946

mWord[depth] = c;

947

if (traverseAllNodes && terminal) {

948

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

949

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

950

}

951

if (!needsToTraverseChildrenNodes) return false;

952

*newTraverseAllNodes = traverseAllNodes;

953

*newMatchRate = matchWeight;

954

*newDiffs = diffs;

955

*newInputIndex = inputIndex;

956

} else {

957

const int *currentChars = getInputCharsAt(inputIndex);

958

959

if (transposedPos >= 0) {

960

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

961

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

962

}

963

964

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

965

transposedPos);

966

if (UNRELATED_CHAR == matchedProximityCharId) return false;

967

mWord[depth] = c;

968

// If inputIndex is greater than mInputLength, that means there is no

969

// proximity chars. So, we don't need to check proximity.

970

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

971

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

972

}

973

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

974

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

975

if (isSameAsUserTypedLength && terminal) {

976

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

977

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

978

}

979

if (!needsToTraverseChildrenNodes) return false;

980

// Start traversing all nodes after the index exceeds the user typed length

981

*newTraverseAllNodes = isSameAsUserTypedLength;

982

*newMatchRate = matchWeight;

983

*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

984

*newInputIndex = inputIndex + 1;

985

}

986

// Optimization: Prune out words that are too long compared to how much was typed.

987

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

992

// TODO: Check if this can be isSameAsUserTypedLength only.

993

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

994

*newTraverseAllNodes = true;

995

}

996

// get the count of nodes and increment childAddress.

997

*newCount = Dictionary::getCount(DICT_ROOT, &childPosition);

998

*newChildPosition = childPosition;

999

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

1000

return needsToTraverseChildrenNodes;

1001

}

1002

1003

#else // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1004

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1005

// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous

1006

// interface.

1007

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

1008

const int inputLength, unsigned short *word) {

1009

uint16_t inWord[inputLength];

1010

1011

for (int i = 0; i < inputLength; ++i) {

1012

inWord[i] = *getInputCharsAt(startInputIndex + i);

1013

}

1014

return getMostFrequentWordLikeInner(inWord, inputLength, word);

1015

}

1016

1017

// This function will take the position of a character array within a CharGroup,

1018

// and check it actually like-matches the word in inWord starting at startInputIndex,

1019

// that is, it matches it with case and accents squashed.

1020

// The function returns true if there was a full match, false otherwise.

1021

// The function will copy on-the-fly the characters in the CharGroup to outNewWord.

1022

// It will also place the end position of the array in outPos; in outInputIndex,

1023

// it will place the index of the first char AFTER the match if there was a match,

1024

// and the initial position if there was not. It makes sense because if there was

1025

// a match we want to continue searching, but if there was not, we want to go to

1026

// the next CharGroup.

1027

// In and out parameters may point to the same location. This function takes care

1028

// not to use any input parameters after it wrote into its outputs.

1029

static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,

1030

const uint8_t* const root, const int startPos,

1031

const uint16_t* const inWord, const int startInputIndex,

1032

int32_t* outNewWord, int* outInputIndex, int* outPos) {

1033

const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));

1034

int pos = startPos;

1035

int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1036

int32_t baseChar = toBaseLowerCase(character);

1037

const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);

1038

1039

if (baseChar != wChar) {

1040

*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;

1041

*outInputIndex = startInputIndex;

1042

return false;

1043

}

1044

int inputIndex = startInputIndex;

1045

outNewWord[inputIndex] = character;

1046

if (hasMultipleChars) {

1047

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1048

while (NOT_A_CHARACTER != character) {

1049

baseChar = toBaseLowerCase(character);

1050

if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {

1051

*outPos = BinaryFormat::skipOtherCharacters(root, pos);

1052

*outInputIndex = startInputIndex;

1053

return false;

1054

}

1055

outNewWord[inputIndex] = character;

1056

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1057

}

1058

}

1059

*outInputIndex = inputIndex + 1;

*outPos = pos;

return true;

}

// This function is invoked when a word like the word searched for is found.

1065

// It will compare the frequency to the max frequency, and if greater, will

1066

// copy the word into the output buffer. In output value maxFreq, it will

1067

// write the new maximum frequency if it changed.

1068

static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,

1069

short unsigned int* outWord, int* maxFreq) {

1070

if (freq > *maxFreq) {

1071

for (int q = 0; q < length; ++q)

1072

outWord[q] = newWord[q];

outWord[length] = 0;

*maxFreq = freq;

}

}

// Will find the highest frequency of the words like the one passed as an argument,

1079

// that is, everything that only differs by case/accents.

1080

int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,

1081

const int length, short unsigned int* outWord) {

1082

int32_t newWord[MAX_WORD_LENGTH_INTERNAL];

1083

int depth = 0;

1084

int maxFreq = -1;

1085

const uint8_t* const root = DICT_ROOT;

1086

1087

mStackChildCount[0] = root[0];

1088

mStackInputIndex[0] = 0;

1089

mStackSiblingPos[0] = 1;

1090

while (depth >= 0) {

1091

const int charGroupCount = mStackChildCount[depth];

1092

int pos = mStackSiblingPos[depth];

1093

for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {

1094

int inputIndex = mStackInputIndex[depth];

1095

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);

1096

// Test whether all chars in this group match with the word we are searching for. If so,

1097

// we want to traverse its children (or if the length match, evaluate its frequency).

1098

// Note that this function will output the position regardless, but will only write

1099

// into inputIndex if there is a match.

1100

const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,

1101

inputIndex, newWord, &inputIndex, &pos);

1102

if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {

1103

const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);

1104

onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);

1105

}

1106

pos = BinaryFormat::skipFrequency(flags, pos);

1107

const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);

1108

const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);

1109

// If we had a match and the word has children, we want to traverse them. We don't have

1110

// to traverse words longer than the one we are searching for, since they will not match

1111

// anyway, so don't traverse unless inputIndex < length.

1112

if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {

1113

// Save position for this depth, to get back to this once children are done

1114

mStackChildCount[depth] = charGroupIndex;

1115

mStackSiblingPos[depth] = siblingPos;

1116

// Prepare stack values for next depth

1117

++depth;

1118

int childrenPos = childrenNodePos;

1119

mStackChildCount[depth] =

1120

BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);

1121

mStackSiblingPos[depth] = childrenPos;

1122

mStackInputIndex[depth] = inputIndex;

1123

pos = childrenPos;

1124

// Go to the next depth level.

++depth;

break;

} else {

// No match, or no children, or word too long to ever match: go the next sibling.

pos = siblingPos;

}

}

--depth;

}

return maxFreq;

}

// This function gets the frequency of the exact matching word in the dictionary.

1138

// If no match is found, it returns -1.

1139

int UnigramDictionary::getFrequency(const uint16_t* const inWord, const int length) const {

1140

int pos = 0;

1141

int wordPos = 0;

1142

const uint8_t* const root = DICT_ROOT;

1143

1144

while (true) {

1145

// If we already traversed the tree further than the word is long, there means

1146

// there was no match (or we would have found it).

1147

if (wordPos > length) return -1;

1148

int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);

1149

const uint16_t wChar = inWord[wordPos];

1150

while (true) {

1151

// If there are no more character groups in this node, it means we could not

1152

// find a matching character for this depth, therefore there is no match.

1153

if (0 >= charGroupCount) return -1;

1154

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);

1155

int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1156

if (character == wChar) {

1157

// This is the correct node. Only one character group may start with the same

1158

// char within a node, so either we found our match in this node, or there is

1159

// no match and we can return -1. So we will check all the characters in this

1160

// character group indeed does match.

1161

if (FLAG_HAS_MULTIPLE_CHARS & flags) {

1162

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1163

while (NOT_A_CHARACTER != character) {

1164

++wordPos;

1165

// If we shoot the length of the word we search for, or if we find a single

1166

// character that does not match, as explained above, it means the word is

1167

// not in the dictionary (by virtue of this chargroup being the only one to

1168

// match the word on the first character, but not matching the whole word).

1169

if (wordPos > length) return -1;

1170

if (inWord[wordPos] != character) return -1;

1171

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1172

}

1173

}

1174

// If we come here we know that so far, we do match. Either we are on a terminal

1175

// and we match the length, in which case we found it, or we traverse children.

1176

// If we don't match the length AND don't have children, then a word in the

1177

// dictionary fully matches a prefix of the searched word but not the full word.

1178

++wordPos;

1179

if (FLAG_IS_TERMINAL & flags) {

1180

if (wordPos == length) {

1181

return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);

1182

}

1183

pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);

1184

}

1185

if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags))

1186

return -1;

1187

// We have children and we are still shorter than the word we are searching for, so

1188

// we need to traverse children. Put the pointer on the children position, and

1189

// break

1190

pos = BinaryFormat::readChildrenPosition(root, flags, pos);

1191

break;

1192

} else {

1193

// This chargroup does not match, so skip the remaining part and go to the next.

1194

if (FLAG_HAS_MULTIPLE_CHARS & flags) {

1195

pos = BinaryFormat::skipOtherCharacters(root, pos);

1196

}

1197

pos = BinaryFormat::skipFrequency(flags, pos);

1198

pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);

}

--charGroupCount;

}

}

}

bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {

1206

return -1 != getFrequency(inWord, length);

1207

}

1208

1209

int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,

1210

unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,

1211

int maxAlternatives) {

1212

// TODO: add implementation.

return 0;

}

// TODO: remove this function.

1217

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

int length) const {

return -1;

}

// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.

1223

// If the return value is false, then the caller should read in the output "nextSiblingPosition"

1224

// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.

1225

// It is worthy to note that when false is returned, the output values other than

1226

// nextSiblingPosition are undefined.

1227

// If the return value is true, then the caller must proceed to traverse the children of this

1228

// node. processCurrentNode will output the information about the children: their count in

1229

// newCount, their position in newChildrenPosition, the traverseAllNodes flag in

1230

// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the

1231

// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into

1232

// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when

1233

// there aren't any more nodes at this level, it merely returns the address of the first byte after

1234

// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any

1235

// given level, as output into newCount when traversing this level's parent.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1236

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

1237

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

1238

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1239

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1240

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

Jean Chalard

432789a

2011-06-30 17:50:48 +0900

[diff] [blame]

1241

int *nextSiblingPosition, int *newOutputIndex) {

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1242

if (DEBUG_DICT) {

1243

int inputCount = 0;

1244

if (skipPos >= 0) ++inputCount;

1245

if (excessivePos >= 0) ++inputCount;

1246

if (transposedPos >= 0) ++inputCount;

1247

assert(inputCount <= 1);

1248

}

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1249

int pos = initialPos;

1250

int depth = initialDepth;

1251

int traverseAllNodes = initialTraverseAllNodes;

1252

int diffs = initialDiffs;

1253

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1254

// Flags contain the following information:

1255

// - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:

1256

// - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address

1257

// is on the specified number of bytes.

1258

// - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.

1259

// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.

1260

// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)

1261

// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not

1262

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);

1263

const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1264

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1265

// This gets only ONE character from the stream. Next there will be:

1266

// if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node

1267

// else if FLAG_IS_TERMINAL: the frequency

1268

// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address

1269

// Note that you can't have a node that both is not a terminal and has no children.

1270

int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);

1271

assert(NOT_A_CHARACTER != c);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1272

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1273

// We are going to loop through each character and make it look like it's a different

1274

// node each time. To do that, we will process characters in this node in order until

1275

// we find the character terminator. This is signalled by getCharCode* returning

1276

// NOT_A_CHARACTER.

1277

// As a special case, if there is only one character in this node, we must not read the

1278

// next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.

1279

// This way, each loop run will look like a "virtual node".

1280

do {

1281

// We prefetch the next char. If 'c' is the last char of this node, we will have

1282

// NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node

1283

// should behave as a terminal or not and whether we have children.

1284

const int32_t nextc = hasMultipleChars

1285

? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;

1286

const bool isLastChar = (NOT_A_CHARACTER == nextc);

1287

// If there are more chars in this nodes, then this virtual node is not a terminal.

1288

// If we are on the last char, this virtual node is a terminal if this node is.

1289

const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags));

1290

// If there are more chars in this node, then this virtual node has children.

1291

// If we are on the last char, this virtual node has children if this node has.

1292

const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1293

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1294

// This has to be done for each virtual char (this forwards the "inputIndex" which

1295

// is the index in the user-inputted chars, as read by getInputCharsAt.

1296

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

1297

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

1298

mWord[depth] = c;

1299

if (traverseAllNodes && isTerminal) {

1300

// The frequency should be here, because we come here only if this is actually

1301

// a terminal node, and we are on its last char.

1302

const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);

1303

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1304

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

1305

}

1306

if (!hasChildren) {

1307

// If we don't have children here, that means we finished processing all

1308

// characters of this node (we are on the last virtual node), AND we are in

1309

// traverseAllNodes mode, which means we are searching for *completions*. We

1310

// should skip the frequency if we have a terminal, and report the position

1311

// of the next sibling. We don't have to return other values because we are

1312

// returning false, as in "don't traverse children".

1313

if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos);

1314

*nextSiblingPosition =

1315

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

return false;

}

} else {

const int *currentChars = getInputCharsAt(inputIndex);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1320

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1321

if (transposedPos >= 0) {

1322

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

1323

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

1324

}

1325

1326

const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos,

1327

excessivePos, transposedPos);

1328

if (UNRELATED_CHAR == matchedProximityCharId) {

1329

// We found that this is an unrelated character, so we should give up traversing

1330

// this node and its children entirely.

1331

// However we may not be on the last virtual node yet so we skip the remaining

1332

// characters in this node, the frequency if it's there, read the next sibling

1333

// position to output it, then return false.

1334

// We don't have to output other values because we return false, as in

1335

// "don't traverse children".

1336

if (!isLastChar) {

1337

pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);

1338

}

1339

pos = BinaryFormat::skipFrequency(flags, pos);

1340

*nextSiblingPosition =

1341

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

return false;

}

mWord[depth] = c;

// If inputIndex is greater than mInputLength, that means there is no

1346

// proximity chars. So, we don't need to check proximity.

1347

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

1348

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

1349

}

1350

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

1351

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

1352

if (isSameAsUserTypedLength && isTerminal) {

1353

const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);

1354

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1355

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

1356

}

1357

// This character matched the typed character (enough to traverse the node at least)

1358

// so we just evaluated it. Now we should evaluate this virtual node's children - that

1359

// is, if it has any. If it has no children, we're done here - so we skip the end of

1360

// the node, output the siblings position, and return false "don't traverse children".

1361

// Note that !hasChildren implies isLastChar, so we know we don't have to skip any

1362

// remaining char in this group for there can't be any.

1363

if (!hasChildren) {

1364

pos = BinaryFormat::skipFrequency(flags, pos);

1365

*nextSiblingPosition =

1366

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1367

return false;

1368

}

1369

// Start traversing all nodes after the index exceeds the user typed length

1370

traverseAllNodes = isSameAsUserTypedLength;

1371

diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

1372

// Finally, we are ready to go to the next character, the next "virtual node".

1373

// We should advance the input index.

1374

// We do this in this branch of the 'if traverseAllNodes' because we are still matching

1375

// characters to input; the other branch is not matching them but searching for

1376

// completions, this is why it does not have to do it.

1377

++inputIndex;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1378

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1379

// Optimization: Prune out words that are too long compared to how much was typed.

1380

if (depth >= maxDepth || diffs > mMaxEditDistance) {

1381

// We are giving up parsing this node and its children. Skip the rest of the node,

1382

// output the sibling position, and return that we don't want to traverse children.

1383

if (!isLastChar) {

1384

pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);

1385

}

1386

pos = BinaryFormat::skipFrequency(flags, pos);

1387

*nextSiblingPosition =

1388

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1389

return false;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1390

}

1391

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1392

// Prepare for the next character. Promote the prefetched char to current char - the loop

1393

// will take care of prefetching the next. If we finally found our last char, nextc will

1394

// contain NOT_A_CHARACTER.

1395

c = nextc;

1396

// Also, the next char is one "virtual node" depth more than this char.

1397

++depth;

1398

} while (NOT_A_CHARACTER != c);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1399

1400

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1401

// Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength.

1402

if (mInputLength <= *newInputIndex) {

1403

traverseAllNodes = true;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1404

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame^]

1405

1406

// All the output values that are purely computation by this function are held in local

1407

// variables. Output them to the caller.

1408

*newTraverseAllNodes = traverseAllNodes;

1409

*newMatchRate = matchWeight;

1410

*newDiffs = diffs;

1411

*newInputIndex = inputIndex;

1412

*newOutputIndex = depth;

1413

1414

// Now we finished processing this node, and we want to traverse children. If there are no

1415

// children, we can't come here.

1416

assert(BinaryFormat::hasChildrenInFlags(flags));

1417

1418

// If this node was a terminal it still has the frequency under the pointer (it may have been

1419

// read, but not skipped - see readFrequencyWithoutMovingPointer).

1420

// Next come the children position, then possibly attributes (attributes are bigrams only for

1421

// now, maybe something related to shortcuts in the future).

1422

// Once this is read, we still need to output the number of nodes in the immediate children of

1423

// this node, so we read and output it before returning true, as in "please traverse children".

1424

pos = BinaryFormat::skipFrequency(flags, pos);

1425

int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);

1426

*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1427

*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);

1428

*newChildrenPosition = childrenPos;

1429

return true;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1430

}

1431

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

1432

#endif // NEW_DICTIONARY_FORMAT

1433

satok