Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <string.h>

20

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

21

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

22

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

23

#include "basechars.h"

24

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

25

#include "dictionary.h"

26

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

27

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

28

#ifdef NEW_DICTIONARY_FORMAT

29

#include "binary_format.h"

30

#endif // NEW_DICTIONARY_FORMAT

31

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

32

namespace latinime {

33

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

34

const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =

{ { 'a', 'e' },

{ 'o', 'e' },

{ 'u', 'e' } };

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

39

// TODO: check the header

40

UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

41

int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

42

const bool isLatestDictVersion)

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

43

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

44

: DICT_ROOT(streamStart),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

45

#else // NEW_DICTIONARY_FORMAT

46

: DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),

47

#endif // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

48

MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

49

MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),

50

TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

51

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

52

ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

53

#else // NEW_DICTIONARY_FORMAT

54

// TODO : remove this variable.

55

ROOT_POS(0),

56

#endif // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

57

BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),

58

MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

59

if (DEBUG_DICT) {

60

LOGI("UnigramDictionary - constructor");

61

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

62

}

63

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

64

UnigramDictionary::~UnigramDictionary() {}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

65

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

66

static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,

67

const int MAX_PROXIMITY_CHARS) {

68

return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;

69

}

70

71

bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {

72

73

// There can't be a digraph if we don't have at least 2 characters to examine

74

if (i + 2 > codesSize) return false;

75

76

// Search for the first char of some digraph

77

int lastDigraphIndex = -1;

78

const int thisChar = codes[i * MAX_PROXIMITY_CHARS];

79

for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;

80

lastDigraphIndex >= 0; --lastDigraphIndex) {

81

if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;

82

}

83

// No match: return early

84

if (lastDigraphIndex < 0) return false;

85

86

// It's an interesting digraph if the second char matches too.

87

return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];

88

}

89

90

// Mostly the same arguments as the non-recursive version, except:

91

// codes is the original value. It points to the start of the work buffer, and gets passed as is.

92

// codesSize is the size of the user input (thus, it is the size of codesSrc).

93

// codesDest is the current point in the work buffer.

94

// codesSrc is the current point in the user-input, original, content-unmodified buffer.

95

// codesRemain is the remaining size in codesSrc.

96

void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,

97

const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,

98

const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,

satok

3c4bb77

2011-03-04 22:50:19 -0800

[diff] [blame]

99

const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

100

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

101

if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {

102

for (int i = 0; i < codesRemain; ++i) {

103

if (isDigraph(codesSrc, i, codesRemain)) {

104

// Found a digraph. We will try both spellings. eg. the word is "pruefen"

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

105

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

106

// Copy the word up to the first char of the digraph, then continue processing

107

// on the remaining part of the word, skipping the second char of the digraph.

108

// In our example, copy "pru" and continue running on "fen"

109

// Make i the index of the second char of the digraph for simplicity. Forgetting

110

// to do that results in an infinite recursion so take care!

111

++i;

112

memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);

113

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

114

codesBuffer, codesBufferSize, flags,

115

codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,

116

currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,

117

frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

118

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

119

// Copy the second char of the digraph in place, then continue processing on

120

// the remaining part of the word.

121

// In our example, after "pru" in the buffer copy the "e", and continue on "fen"

122

memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,

123

BYTES_IN_ONE_CHAR);

124

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

125

codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,

126

codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,

127

outWords, frequencies);

128

return;

129

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

}

}

// If we come here, we hit the end of the word: let's check it against the dictionary.

134

// In our example, we'll come here once for "prufen" and then once for "pruefen".

135

// If the word contains several digraphs, we'll come it for the product of them.

136

// eg. if the word is "ueberpruefen" we'll test, in order, against

137

// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".

138

const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;

139

if (0 != remainingBytes)

140

memcpy(codesDest, codesSrc, remainingBytes);

141

142

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

143

(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);

144

}

145

146

int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,

147

const int *ycoordinates, const int *codes, const int codesSize, const int flags,

148

unsigned short *outWords, int *frequencies) {

149

150

if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)

151

{ // Incrementally tune the word and try all possibilities

152

int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];

153

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

154

codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

155

} else { // Normal processing

156

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,

157

outWords, frequencies);

158

}

159

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

160

PROF_START(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

161

// Get the word count

162

int suggestedWordsCount = 0;

163

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

164

suggestedWordsCount++;

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

169

/// Print the returned words

170

for (int j = 0; j < suggestedWordsCount; ++j) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

171

#ifdef FLAG_DBG

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

172

short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;

173

char s[MAX_WORD_LENGTH];

174

for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];

175

LOGI("%s %i", s, mFrequencies[j]);

satok

2011-07-14 08:32:57 +0900

[diff] [blame^]

176

#endif

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

177

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

178

LOGI("Next letters: ");

179

for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {

180

if (mNextLettersFrequency[k] > 0) {

181

LOGI("%c = %d,", k, mNextLettersFrequency[k]);

182

}

183

}

184

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

185

PROF_END(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

186

PROF_CLOSE;

187

return suggestedWordsCount;

188

}

189

190

void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,

191

const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,

192

unsigned short *outWords, int *frequencies) {

193

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

194

PROF_OPEN;

195

PROF_START(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

196

initSuggestions(codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

197

if (DEBUG_DICT) assert(codesSize == mInputLength);

198

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

199

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

200

PROF_END(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

201

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

202

PROF_START(1);

Tadashi G. Takaoka

887f11e

2011-02-10 20:53:58 +0900

[diff] [blame]

203

getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

204

PROF_END(1);

205

206

PROF_START(2);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

207

// Suggestion with missing character

208

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

209

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

210

if (DEBUG_DICT) {

211

LOGI("--- Suggest missing characters %d", i);

212

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

213

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

214

}

215

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

216

PROF_END(2);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

217

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

218

PROF_START(3);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

219

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

220

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

221

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

222

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

223

if (DEBUG_DICT) {

224

LOGI("--- Suggest excessive characters %d", i);

225

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

226

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

227

}

228

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

229

PROF_END(3);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

230

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

231

PROF_START(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

232

// Suggestion with transposed characters

233

// Only suggest words that length is mInputLength

234

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

235

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

236

if (DEBUG_DICT) {

237

LOGI("--- Suggest transposed characters %d", i);

238

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

239

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

240

}

241

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

242

PROF_END(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

243

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

244

PROF_START(5);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

245

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

246

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

247

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

248

for (int i = 1; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

249

if (DEBUG_DICT) {

250

LOGI("--- Suggest missing space characters %d", i);

251

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

252

getMissingSpaceWords(mInputLength, i);

253

}

254

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

255

PROF_END(5);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

256

257

PROF_START(6);

Jean Chalard

e93b1f22

2011-06-01 17:12:25 +0900

[diff] [blame]

258

if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

259

// The first and last "mistyped spaces" are taken care of by excessive character handling

260

for (int i = 1; i < codesSize - 1; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

261

if (DEBUG_DICT) {

262

LOGI("--- Suggest words with proximity space %d", i);

263

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

264

const int x = xcoordinates[i];

265

const int y = ycoordinates[i];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

266

if (DEBUG_PROXIMITY_INFO) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

267

LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",

268

i, x, y, proximityInfo->hasSpaceProximity(x, y));

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

269

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

270

if (proximityInfo->hasSpaceProximity(x, y)) {

271

getMistypedSpaceWords(mInputLength, i);

272

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

273

}

274

}

275

PROF_END(6);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

276

}

277

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

278

void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,

279

unsigned short *outWords, int *frequencies) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

280

if (DEBUG_DICT) {

281

LOGI("initSuggest");

282

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

283

mFrequencies = frequencies;

284

mOutputChars = outWords;

285

mInputCodes = codes;

286

mInputLength = codesSize;

287

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

288

}

289

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

290

static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

291

if (c < nextLettersSize) {

nextLetters[c]++;

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

296

// TODO: We need to optimize addWord by using STL or something

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

297

// TODO: This needs to take an const unsigned short* and not tinker with its contents

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

298

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

299

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

300

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

301

#ifdef FLAG_DBG

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

302

char s[length + 1];

303

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

304

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2011-07-14 08:32:57 +0900

[diff] [blame^]

305

#endif

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

306

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

307

if (length > MAX_WORD_LENGTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

308

if (DEBUG_DICT) {

309

LOGI("Exceeded max word length.");

310

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

311

return false;

312

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

313

314

// Find the right insertion point

315

int insertAt = 0;

316

while (insertAt < MAX_WORDS) {

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

317

// TODO: How should we sort words with the same frequency?

318

if (frequency > mFrequencies[insertAt]) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

324

if (DEBUG_DICT) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

325

#ifdef FLAG_DBG

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

326

char s[length + 1];

327

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

328

LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);

satok

2011-07-14 08:32:57 +0900

[diff] [blame^]

329

#endif

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

330

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

331

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

332

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

333

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

334

mFrequencies[insertAt] = frequency;

335

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

336

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

337

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

338

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

343

if (DEBUG_DICT) {

344

LOGI("Added word at %d", insertAt);

345

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

351

static inline unsigned short toBaseLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

352

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

353

c = BASE_CHARS[c];

354

}

355

if (c >='A' && c <= 'Z') {

356

c |= 32;

357

} else if (c > 127) {

358

c = latin_tolower(c);

}

return c;

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

363

bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

364

if (length != mInputLength) {

365

return false;

366

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

367

const int *inputCodes = mInputCodes;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

368

while (length--) {

369

if ((unsigned int) *inputCodes != (unsigned int) *word) {

370

return false;

371

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

372

inputCodes += MAX_PROXIMITY_CHARS;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

word++;

}

return true;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

378

static const char QUOTE = '\'';

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

379

static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

380

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

381

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

382

const int excessivePos, const int transposedPos, int *nextLetters,

383

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

384

if (DEBUG_DICT) {

385

LOGI("getSuggestionCandidates %d", maxDepth);

386

assert(transposedPos + 1 < mInputLength);

387

assert(excessivePos < mInputLength);

388

assert(missingPos < mInputLength);

389

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

390

int rootPosition = ROOT_POS;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

391

// Get the number of children of root, then increment the position

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

392

int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

393

int depth = 0;

394

395

mStackChildCount[0] = childCount;

396

mStackTraverseAll[0] = (mInputLength <= 0);

397

mStackNodeFreq[0] = 1;

398

mStackInputIndex[0] = 0;

399

mStackDiffs[0] = 0;

400

mStackSiblingPos[0] = rootPosition;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

401

mStackOutputIndex[0] = 0;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

402

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

403

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

404

while (depth >= 0) {

405

if (mStackChildCount[depth] > 0) {

406

--mStackChildCount[depth];

407

bool traverseAllNodes = mStackTraverseAll[depth];

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

408

int matchWeight = mStackNodeFreq[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

409

int inputIndex = mStackInputIndex[depth];

410

int diffs = mStackDiffs[depth];

411

int siblingPos = mStackSiblingPos[depth];

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

412

int outputIndex = mStackOutputIndex[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

413

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

414

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

415

// needsToTraverseChildrenNodes should be false

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

416

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

417

maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,

418

excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,

419

&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

420

&siblingPos, &outputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

421

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

422

mStackSiblingPos[depth] = siblingPos;

423

if (needsToTraverseChildrenNodes) {

424

// Goes to child node

425

++depth;

426

mStackChildCount[depth] = childCount;

427

mStackTraverseAll[depth] = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

428

mStackNodeFreq[depth] = matchWeight;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

429

mStackInputIndex[depth] = inputIndex;

430

mStackDiffs[depth] = diffs;

431

mStackSiblingPos[depth] = firstChildPos;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

432

mStackOutputIndex[depth] = outputIndex;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

433

}

434

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

435

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

441

static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;

442

static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {

443

return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);

444

}

445

446

static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;

447

inline static void multiplyIntCapped(const int multiplier, int *base) {

448

const int temp = *base;

449

if (temp != S_INT_MAX) {

450

// Branch if multiplier == 2 for the optimization

451

if (multiplier == 2) {

452

*base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;

453

} else {

454

const int tempRetval = temp * multiplier;

455

*base = tempRetval >= temp ? tempRetval : S_INT_MAX;

}

}

}

inline static int powerIntCapped(const int base, const int n) {

satok

0b6b0a5

2011-04-27 16:29:27 +0900

[diff] [blame]

461

if (base == 2) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

462

return n < 31 ? 1 << n : S_INT_MAX;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

463

} else {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

464

int ret = base;

465

for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);

return ret;

}

}

inline static void multiplyRate(const int rate, int *freq) {

471

if (*freq != S_INT_MAX) {

472

if (*freq > 1000000) {

473

*freq /= 100;

474

multiplyIntCapped(rate, freq);

475

} else {

476

multiplyIntCapped(rate, freq);

477

*freq /= 100;

478

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

}

}

satok

2011-04-19 13:58:42 +0900

[diff] [blame]

482

inline static int calcFreqForSplitTwoWords(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

483

const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,

484

const int firstFreq, const int secondFreq, const bool isSpaceProximity) {

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

485

if (firstWordLength == 0 || secondWordLength == 0) {

486

return 0;

487

}

488

const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);

489

int tempFirstFreq = firstFreq;

490

multiplyRate(firstDemotionRate, &tempFirstFreq);

491

492

const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);

493

int tempSecondFreq = secondFreq;

494

multiplyRate(secondDemotionRate, &tempSecondFreq);

495

496

const int totalLength = firstWordLength + secondWordLength;

497

498

// Promote pairFreq with multiplying by 2, because the word length is the same as the typed

499

// length.

500

int totalFreq = tempFirstFreq + tempSecondFreq;

501

502

// This is a workaround to try offsetting the not-enough-demotion which will be done in

503

// calcNormalizedScore in Utils.java.

504

// In calcNormalizedScore the score will be demoted by (1 - 1 / length)

505

// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by

506

// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))

507

const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);

508

multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

509

510

// At this moment, totalFreq is calculated by the following formula:

511

// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))

512

// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

513

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

514

multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

515

516

// This is another workaround to offset the demotion which will be done in

517

// calcNormalizedScore in Utils.java.

518

// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote

519

// the same amount because we already have adjusted the synthetic freq of this "missing or

520

// mistyped space" suggestion candidate above in this method.

521

const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);

522

multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

523

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

524

if (isSpaceProximity) {

525

// A word pair with one space proximity correction

526

if (DEBUG_DICT) {

527

LOGI("Found a word pair with space proximity correction.");

528

}

529

multiplyIntCapped(typedLetterMultiplier, &totalFreq);

530

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);

531

}

532

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

533

multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

return totalFreq;

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

537

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

538

return getSplitTwoWordsSuggestion(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

539

inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

540

}

541

542

bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {

543

return getSplitTwoWordsSuggestion(

544

inputLength, 0, spaceProximityPos, spaceProximityPos + 1,

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

545

inputLength - spaceProximityPos - 1, true);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

546

}

547

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

548

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

549

const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

550

const int freq, const bool sameLength) const {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

551

// TODO: Demote by edit distance

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

552

int finalFreq = freq * matchWeight;

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

553

if (skipPos >= 0) {

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

554

if (mInputLength >= 2) {

555

const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE

556

* (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)

557

/ (10 * mInputLength

558

- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

559

if (DEBUG_DICT_FULL) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

560

LOGI("Demotion rate for missing character is %d.", demotionRate);

561

}

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

562

multiplyRate(demotionRate, &finalFreq);

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

} else {

finalFreq = 0;

}

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

567

if (transposedPos >= 0) multiplyRate(

568

WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

569

if (excessivePos >= 0) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

570

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

571

if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

572

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

573

}

574

}

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

575

int lengthFreq = TYPED_LETTER_MULTIPLIER;

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

576

multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

577

if (lengthFreq == matchWeight) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

578

// Full exact match

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

579

if (depth > 1) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

580

if (DEBUG_DICT) {

581

LOGI("Found full matched word.");

582

}

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

583

multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);

584

}

585

if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

586

finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

587

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

588

} else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

589

// A word with proximity corrections

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

590

if (DEBUG_DICT) {

591

LOGI("Found one proximity correction.");

592

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

593

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

594

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

595

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

596

if (DEBUG_DICT) {

597

LOGI("calc: %d, %d", depth, sameLength);

598

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

599

if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

600

return finalFreq;

601

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

602

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

603

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

604

const int inputIndex, const int skipPos, const int depth) {

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

605

const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

606

// Skip the ' or other letter and continue deeper

607

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

608

}

609

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

610

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

611

const int inputLength) const {

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

612

if (inputIndex < 0 || inputIndex >= inputLength) return false;

613

const int currentChar = *getInputCharsAt(inputIndex);

614

const int leftIndex = inputIndex - 1;

615

if (leftIndex >= 0) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

616

const int *leftChars = getInputCharsAt(leftIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

617

int i = 0;

618

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

619

if (leftChars[i++] == currentChar) return true;

620

}

621

}

622

const int rightIndex = inputIndex + 1;

623

if (rightIndex < inputLength) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

624

const int *rightChars = getInputCharsAt(rightIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

625

int i = 0;

626

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

627

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

633

// In the following function, c is the current character of the dictionary word

634

// currently examined.

635

// currentChars is an array containing the keys close to the character the

636

// user actually typed at the same position. We want to see if c is in it: if so,

637

// then the word contains at that position a character close to what the user

638

// typed.

639

// What the user typed is actually the first character of the array.

640

// Notice : accented characters do not have a proximity list, so they are alone

641

// in their list. The non-accented version of the character should be considered

642

// "close", but not the other keys close to the non-accented version.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

643

inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(

644

const int *currentChars, const unsigned short c, const int skipPos,

645

const int excessivePos, const int transposedPos) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

646

const unsigned short baseLowerC = toBaseLowerCase(c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

647

648

// The first char in the array is what user typed. If it matches right away,

649

// that means the user typed that same char for this pos.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

650

if (currentChars[0] == baseLowerC || currentChars[0] == c)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

651

return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;

652

653

// If one of those is true, we should not check for close characters at all.

654

if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)

655

return UNRELATED_CHAR;

656

657

// If the non-accented, lowercased version of that first character matches c,

658

// then we have a non-accented version of the accented character the user

659

// typed. Treat it as a close char.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

660

if (toBaseLowerCase(currentChars[0]) == baseLowerC)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

661

return NEAR_PROXIMITY_CHAR;

662

663

// Not an exact nor an accent-alike match: search the list of close keys

664

int j = 1;

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

665

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

666

const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

667

if (matched) return NEAR_PROXIMITY_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

668

++j;

669

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

670

671

// Was not included, signal this as an unrelated character.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

672

return UNRELATED_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

673

}

674

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

675

inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

676

const uint8_t* const root, const uint8_t flags, const int pos,

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

677

const int inputIndex, const int matchWeight, const int skipPos,

678

const int excessivePos, const int transposedPos, const int freq, const bool sameLength,

679

int* nextLetters, const int nextLettersSize) {

680

681

const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

682

if (isSameAsTyped) return;

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

683

684

if (depth >= MIN_SUGGEST_DEPTH) {

685

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,

686

excessivePos, transposedPos, freq, sameLength);

687

if (!isSameAsTyped)

688

addWord(word, depth + 1, finalFreq);

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

689

}

690

691

if (sameLength && depth >= mInputLength && skipPos < 0) {

692

registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);

}

}

Jean Chalard

2011-06-30 19:47:25 +0900

[diff] [blame]

696

bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,

697

const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,

698

const int secondWordLength, const bool isSpaceProximity) {

699

if (inputLength >= MAX_WORD_LENGTH) return false;

700

if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos

701

|| firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)

702

return false;

703

const int newWordLength = firstWordLength + secondWordLength + 1;

704

// Allocating variable length array on stack

705

unsigned short word[newWordLength];

706

const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);

707

if (DEBUG_DICT) {

708

LOGI("First freq: %d", firstFreq);

709

}

710

if (firstFreq <= 0) return false;

711

712

for (int i = 0; i < firstWordLength; ++i) {

word[i] = mWord[i];

}

const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);

717

if (DEBUG_DICT) {

718

LOGI("Second freq: %d", secondFreq);

719

}

720

if (secondFreq <= 0) return false;

721

722

word[firstWordLength] = SPACE;

723

for (int i = (firstWordLength + 1); i < newWordLength; ++i) {

724

word[i] = mWord[i - firstWordLength - 1];

725

}

726

727

int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,

728

secondWordLength, firstFreq, secondFreq, isSpaceProximity);

729

if (DEBUG_DICT) {

730

LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,

731

TYPED_LETTER_MULTIPLIER);

732

}

733

addWord(word, newWordLength, pairFreq);

return true;

}

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

737

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

738

// The following functions will be entirely replaced with new implementations.

739

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

740

const int excessivePos, const int transposedPos,int *nextLetters,

741

const int nextLettersSize) {

742

int initialPosition = initialPos;

743

const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);

744

getWordsRec(count, initialPosition, 0,

745

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

746

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

747

nextLettersSize);

748

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

749

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

750

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

751

const int maxDepth, const bool traverseAllNodes, const int matchWeight,

752

const int inputIndex, const int diffs, const int skipPos, const int excessivePos,

753

const int transposedPos, int *nextLetters, const int nextLettersSize) {

754

int siblingPos = pos;

755

for (int i = 0; i < childrenCount; ++i) {

756

int newCount;

757

int newChildPosition;

758

bool newTraverseAllNodes;

int newMatchRate;

int newInputIndex;

int newDiffs;

int newSiblingPos;

int newOutputIndex;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

765

traverseAllNodes, matchWeight, inputIndex, diffs,

766

skipPos, excessivePos, transposedPos,

767

nextLetters, nextLettersSize,

768

&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,

769

&newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);

770

siblingPos = newSiblingPos;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

771

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

772

if (needsToTraverseChildrenNodes) {

773

getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,

774

newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

775

nextLetters, nextLettersSize);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

776

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

777

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

778

}

779

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

780

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

781

const int inputLength, unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

782

int pos = ROOT_POS;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

783

int count = Dictionary::getCount(DICT_ROOT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

784

int maxFreq = 0;

785

int depth = 0;

786

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

787

bool terminal = false;

788

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

789

mStackChildCount[0] = count;

790

mStackSiblingPos[0] = pos;

791

792

while (depth >= 0) {

793

if (mStackChildCount[depth] > 0) {

794

--mStackChildCount[depth];

795

int firstChildPos;

796

int newFreq;

797

int siblingPos = mStackSiblingPos[depth];

798

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

799

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

800

&siblingPos);

801

mStackSiblingPos[depth] = siblingPos;

802

if (depth == (inputLength - 1)) {

803

// Traverse sibling node

804

if (terminal) {

805

if (newFreq > maxFreq) {

806

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

807

if (DEBUG_DICT && DEBUG_NODE) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

808

#ifdef FLAG_DBG

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

809

char s[inputLength + 1];

810

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

811

s[inputLength] = 0;

812

LOGI("New missing space word found: %d > %d (%s), %d, %d",

813

newFreq, maxFreq, s, inputLength, depth);

satok

2011-07-14 08:32:57 +0900

[diff] [blame^]

814

#endif

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

820

// Traverse children nodes

821

++depth;

822

mStackChildCount[depth] = count;

823

mStackSiblingPos[depth] = firstChildPos;

824

}

825

} else {

826

// Traverse parent node

827

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

828

}

829

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

830

831

word[inputLength] = 0;

832

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

833

}

834

835

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

836

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

837

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

838

const int inputIndex = startInputIndex + depth;

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

839

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

840

unsigned short c;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

841

*siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,

842

&c, newChildPosition, newTerminal, newFreq);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

843

const unsigned int inputC = currentChars[0];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

844

if (DEBUG_DICT) {

845

assert(inputC <= U_SHORT_MAX);

846

}

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

847

const unsigned short baseLowerC = toBaseLowerCase(c);

848

const bool matched = (inputC == baseLowerC || inputC == c);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

849

const bool hasChild = *newChildPosition != 0;

850

if (matched) {

851

word[depth] = c;

852

if (DEBUG_DICT && DEBUG_NODE) {

853

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

854

if (*newTerminal) {

855

LOGI("Terminal %d", *newFreq);

856

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

857

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

858

if (hasChild) {

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

859

*newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

866

// Thus newTerminal shouldn't be true.

867

*newTerminal = false;

868

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

869

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

870

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

871

872

// TODO: use uint32_t instead of unsigned short

873

bool UnigramDictionary::isValidWord(unsigned short *word, int length) {

874

if (IS_LATEST_DICT_VERSION) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

875

return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

876

} else {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

877

return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

}

}

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

881

882

// Require strict exact match.

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

883

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

884

int length) const {

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

885

// returns address of bigram data of that word

886

// return -99 if not found

887

888

int count = Dictionary::getCount(DICT_ROOT, &pos);

889

unsigned short currentChar = (unsigned short) word[offset];

890

for (int j = 0; j < count; j++) {

891

unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);

892

int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);

893

int childPos = Dictionary::getAddress(DICT_ROOT, &pos);

894

if (c == currentChar) {

895

if (offset == length - 1) {

if (terminal) {

return (pos+1);

}

} else {

if (childPos != 0) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

901

int t = getBigramPosition(childPos, word, offset + 1, length);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

if (t > 0) {

return t;

}

}

}

}

if (terminal) {

Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);

910

}

911

// There could be two instances of each alphabet - upper and lower case. So continue

912

// looking ...

913

}

914

return NOT_VALID_WORD;

915

}

916

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

917

// The following functions will be modified.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

918

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

919

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

920

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

921

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

922

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

923

int *nextSiblingPosition, int *nextOutputIndex) {

924

if (DEBUG_DICT) {

925

int inputCount = 0;

926

if (skipPos >= 0) ++inputCount;

927

if (excessivePos >= 0) ++inputCount;

928

if (transposedPos >= 0) ++inputCount;

929

assert(inputCount <= 1);

}

unsigned short c;

int childPosition;

bool terminal;

int freq;

bool isSameAsUserTypedLength = false;

936

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

937

const int pos = initialPos;

938

const int depth = initialDepth;

939

const int traverseAllNodes = initialTraverseAllNodes;

940

const int diffs = initialDiffs;

941

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

942

const uint8_t flags = 0; // No flags for now

943

944

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

945

946

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,

947

&c, &childPosition, &terminal, &freq);

948

*nextOutputIndex = depth + 1;

949

950

const bool needsToTraverseChildrenNodes = childPosition != 0;

951

952

// If we are only doing traverseAllNodes, no need to look at the typed characters.

953

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

954

mWord[depth] = c;

955

if (traverseAllNodes && terminal) {

956

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

957

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

958

}

959

if (!needsToTraverseChildrenNodes) return false;

960

*newTraverseAllNodes = traverseAllNodes;

961

*newMatchRate = matchWeight;

962

*newDiffs = diffs;

963

*newInputIndex = inputIndex;

964

} else {

965

const int *currentChars = getInputCharsAt(inputIndex);

966

967

if (transposedPos >= 0) {

968

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

969

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

970

}

971

972

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

973

transposedPos);

974

if (UNRELATED_CHAR == matchedProximityCharId) return false;

975

mWord[depth] = c;

976

// If inputIndex is greater than mInputLength, that means there is no

977

// proximity chars. So, we don't need to check proximity.

978

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

979

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

980

}

981

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

982

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

983

if (isSameAsUserTypedLength && terminal) {

984

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

985

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

986

}

987

if (!needsToTraverseChildrenNodes) return false;

988

// Start traversing all nodes after the index exceeds the user typed length

989

*newTraverseAllNodes = isSameAsUserTypedLength;

990

*newMatchRate = matchWeight;

991

*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

992

*newInputIndex = inputIndex + 1;

993

}

994

// Optimization: Prune out words that are too long compared to how much was typed.

995

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

1000

// TODO: Check if this can be isSameAsUserTypedLength only.

1001

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

1002

*newTraverseAllNodes = true;

1003

}

1004

// get the count of nodes and increment childAddress.

1005

*newCount = Dictionary::getCount(DICT_ROOT, &childPosition);

1006

*newChildPosition = childPosition;

1007

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

1008

return needsToTraverseChildrenNodes;

1009

}

1010

1011

#else // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1012

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1013

// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous

1014

// interface.

1015

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

1016

const int inputLength, unsigned short *word) {

1017

uint16_t inWord[inputLength];

1018

1019

for (int i = 0; i < inputLength; ++i) {

1020

inWord[i] = *getInputCharsAt(startInputIndex + i);

1021

}

1022

return getMostFrequentWordLikeInner(inWord, inputLength, word);

1023

}

1024

1025

// This function will take the position of a character array within a CharGroup,

1026

// and check it actually like-matches the word in inWord starting at startInputIndex,

1027

// that is, it matches it with case and accents squashed.

1028

// The function returns true if there was a full match, false otherwise.

1029

// The function will copy on-the-fly the characters in the CharGroup to outNewWord.

1030

// It will also place the end position of the array in outPos; in outInputIndex,

1031

// it will place the index of the first char AFTER the match if there was a match,

1032

// and the initial position if there was not. It makes sense because if there was

1033

// a match we want to continue searching, but if there was not, we want to go to

1034

// the next CharGroup.

1035

// In and out parameters may point to the same location. This function takes care

1036

// not to use any input parameters after it wrote into its outputs.

1037

static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,

1038

const uint8_t* const root, const int startPos,

1039

const uint16_t* const inWord, const int startInputIndex,

1040

int32_t* outNewWord, int* outInputIndex, int* outPos) {

1041

const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));

1042

int pos = startPos;

1043

int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1044

int32_t baseChar = toBaseLowerCase(character);

1045

const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);

1046

1047

if (baseChar != wChar) {

1048

*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;

1049

*outInputIndex = startInputIndex;

1050

return false;

1051

}

1052

int inputIndex = startInputIndex;

1053

outNewWord[inputIndex] = character;

1054

if (hasMultipleChars) {

1055

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1056

while (NOT_A_CHARACTER != character) {

1057

baseChar = toBaseLowerCase(character);

1058

if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {

1059

*outPos = BinaryFormat::skipOtherCharacters(root, pos);

1060

*outInputIndex = startInputIndex;

1061

return false;

1062

}

1063

outNewWord[inputIndex] = character;

1064

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1065

}

1066

}

1067

*outInputIndex = inputIndex + 1;

*outPos = pos;

return true;

}

// This function is invoked when a word like the word searched for is found.

1073

// It will compare the frequency to the max frequency, and if greater, will

1074

// copy the word into the output buffer. In output value maxFreq, it will

1075

// write the new maximum frequency if it changed.

1076

static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,

1077

short unsigned int* outWord, int* maxFreq) {

1078

if (freq > *maxFreq) {

1079

for (int q = 0; q < length; ++q)

1080

outWord[q] = newWord[q];

outWord[length] = 0;

*maxFreq = freq;

}

}

// Will find the highest frequency of the words like the one passed as an argument,

1087

// that is, everything that only differs by case/accents.

1088

int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,

1089

const int length, short unsigned int* outWord) {

1090

int32_t newWord[MAX_WORD_LENGTH_INTERNAL];

1091

int depth = 0;

1092

int maxFreq = -1;

1093

const uint8_t* const root = DICT_ROOT;

1094

1095

mStackChildCount[0] = root[0];

1096

mStackInputIndex[0] = 0;

1097

mStackSiblingPos[0] = 1;

1098

while (depth >= 0) {

1099

const int charGroupCount = mStackChildCount[depth];

1100

int pos = mStackSiblingPos[depth];

1101

for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {

1102

int inputIndex = mStackInputIndex[depth];

1103

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);

1104

// Test whether all chars in this group match with the word we are searching for. If so,

1105

// we want to traverse its children (or if the length match, evaluate its frequency).

1106

// Note that this function will output the position regardless, but will only write

1107

// into inputIndex if there is a match.

1108

const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,

1109

inputIndex, newWord, &inputIndex, &pos);

1110

if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {

1111

const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);

1112

onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);

1113

}

1114

pos = BinaryFormat::skipFrequency(flags, pos);

1115

const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);

1116

const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);

1117

// If we had a match and the word has children, we want to traverse them. We don't have

1118

// to traverse words longer than the one we are searching for, since they will not match

1119

// anyway, so don't traverse unless inputIndex < length.

1120

if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {

1121

// Save position for this depth, to get back to this once children are done

1122

mStackChildCount[depth] = charGroupIndex;

1123

mStackSiblingPos[depth] = siblingPos;

1124

// Prepare stack values for next depth

1125

++depth;

1126

int childrenPos = childrenNodePos;

1127

mStackChildCount[depth] =

1128

BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);

1129

mStackSiblingPos[depth] = childrenPos;

1130

mStackInputIndex[depth] = inputIndex;

1131

pos = childrenPos;

1132

// Go to the next depth level.

++depth;

break;

} else {

// No match, or no children, or word too long to ever match: go the next sibling.

pos = siblingPos;

}

}

--depth;

}

return maxFreq;

}

// This function gets the frequency of the exact matching word in the dictionary.

1146

// If no match is found, it returns -1.

1147

int UnigramDictionary::getFrequency(const uint16_t* const inWord, const int length) const {

1148

int pos = 0;

1149

int wordPos = 0;

1150

const uint8_t* const root = DICT_ROOT;

1151

1152

while (true) {

1153

// If we already traversed the tree further than the word is long, there means

1154

// there was no match (or we would have found it).

1155

if (wordPos > length) return -1;

1156

int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);

1157

const uint16_t wChar = inWord[wordPos];

1158

while (true) {

1159

// If there are no more character groups in this node, it means we could not

1160

// find a matching character for this depth, therefore there is no match.

1161

if (0 >= charGroupCount) return -1;

1162

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);

1163

int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1164

if (character == wChar) {

1165

// This is the correct node. Only one character group may start with the same

1166

// char within a node, so either we found our match in this node, or there is

1167

// no match and we can return -1. So we will check all the characters in this

1168

// character group indeed does match.

1169

if (FLAG_HAS_MULTIPLE_CHARS & flags) {

1170

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1171

while (NOT_A_CHARACTER != character) {

1172

++wordPos;

1173

// If we shoot the length of the word we search for, or if we find a single

1174

// character that does not match, as explained above, it means the word is

1175

// not in the dictionary (by virtue of this chargroup being the only one to

1176

// match the word on the first character, but not matching the whole word).

1177

if (wordPos > length) return -1;

1178

if (inWord[wordPos] != character) return -1;

1179

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1180

}

1181

}

1182

// If we come here we know that so far, we do match. Either we are on a terminal

1183

// and we match the length, in which case we found it, or we traverse children.

1184

// If we don't match the length AND don't have children, then a word in the

1185

// dictionary fully matches a prefix of the searched word but not the full word.

1186

++wordPos;

1187

if (FLAG_IS_TERMINAL & flags) {

1188

if (wordPos == length) {

1189

return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);

1190

}

1191

pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);

1192

}

1193

if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags))

1194

return -1;

1195

// We have children and we are still shorter than the word we are searching for, so

1196

// we need to traverse children. Put the pointer on the children position, and

1197

// break

1198

pos = BinaryFormat::readChildrenPosition(root, flags, pos);

1199

break;

1200

} else {

1201

// This chargroup does not match, so skip the remaining part and go to the next.

1202

if (FLAG_HAS_MULTIPLE_CHARS & flags) {

1203

pos = BinaryFormat::skipOtherCharacters(root, pos);

1204

}

1205

pos = BinaryFormat::skipFrequency(flags, pos);

1206

pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);

}

--charGroupCount;

}

}

}

bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {

1214

return -1 != getFrequency(inWord, length);

1215

}

1216

1217

int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,

1218

unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,

1219

int maxAlternatives) {

1220

// TODO: add implementation.

return 0;

}

// TODO: remove this function.

1225

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

int length) const {

return -1;

}

// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.

1231

// If the return value is false, then the caller should read in the output "nextSiblingPosition"

1232

// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.

1233

// It is worthy to note that when false is returned, the output values other than

1234

// nextSiblingPosition are undefined.

1235

// If the return value is true, then the caller must proceed to traverse the children of this

1236

// node. processCurrentNode will output the information about the children: their count in

1237

// newCount, their position in newChildrenPosition, the traverseAllNodes flag in

1238

// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the

1239

// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into

1240

// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when

1241

// there aren't any more nodes at this level, it merely returns the address of the first byte after

1242

// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any

1243

// given level, as output into newCount when traversing this level's parent.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1244

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

1245

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

1246

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1247

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1248

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

Jean Chalard

432789a

2011-06-30 17:50:48 +0900

[diff] [blame]

1249

int *nextSiblingPosition, int *newOutputIndex) {

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1250

if (DEBUG_DICT) {

1251

int inputCount = 0;

1252

if (skipPos >= 0) ++inputCount;

1253

if (excessivePos >= 0) ++inputCount;

1254

if (transposedPos >= 0) ++inputCount;

1255

assert(inputCount <= 1);

1256

}

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1257

int pos = initialPos;

1258

int depth = initialDepth;

1259

int traverseAllNodes = initialTraverseAllNodes;

1260

int diffs = initialDiffs;

1261

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1262

// Flags contain the following information:

1263

// - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:

1264

// - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address

1265

// is on the specified number of bytes.

1266

// - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.

1267

// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.

1268

// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)

1269

// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not

1270

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);

1271

const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1272

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1273

// This gets only ONE character from the stream. Next there will be:

1274

// if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node

1275

// else if FLAG_IS_TERMINAL: the frequency

1276

// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address

1277

// Note that you can't have a node that both is not a terminal and has no children.

1278

int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);

1279

assert(NOT_A_CHARACTER != c);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1280

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1281

// We are going to loop through each character and make it look like it's a different

1282

// node each time. To do that, we will process characters in this node in order until

1283

// we find the character terminator. This is signalled by getCharCode* returning

1284

// NOT_A_CHARACTER.

1285

// As a special case, if there is only one character in this node, we must not read the

1286

// next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.

1287

// This way, each loop run will look like a "virtual node".

1288

do {

1289

// We prefetch the next char. If 'c' is the last char of this node, we will have

1290

// NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node

1291

// should behave as a terminal or not and whether we have children.

1292

const int32_t nextc = hasMultipleChars

1293

? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;

1294

const bool isLastChar = (NOT_A_CHARACTER == nextc);

1295

// If there are more chars in this nodes, then this virtual node is not a terminal.

1296

// If we are on the last char, this virtual node is a terminal if this node is.

1297

const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags));

1298

// If there are more chars in this node, then this virtual node has children.

1299

// If we are on the last char, this virtual node has children if this node has.

1300

const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1301

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1302

// This has to be done for each virtual char (this forwards the "inputIndex" which

1303

// is the index in the user-inputted chars, as read by getInputCharsAt.

1304

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

1305

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

1306

mWord[depth] = c;

1307

if (traverseAllNodes && isTerminal) {

1308

// The frequency should be here, because we come here only if this is actually

1309

// a terminal node, and we are on its last char.

1310

const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);

1311

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1312

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

1313

}

1314

if (!hasChildren) {

1315

// If we don't have children here, that means we finished processing all

1316

// characters of this node (we are on the last virtual node), AND we are in

1317

// traverseAllNodes mode, which means we are searching for *completions*. We

1318

// should skip the frequency if we have a terminal, and report the position

1319

// of the next sibling. We don't have to return other values because we are

1320

// returning false, as in "don't traverse children".

1321

if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos);

1322

*nextSiblingPosition =

1323

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

return false;

}

} else {

const int *currentChars = getInputCharsAt(inputIndex);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1328

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1329

if (transposedPos >= 0) {

1330

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

1331

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

1332

}

1333

1334

const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos,

1335

excessivePos, transposedPos);

1336

if (UNRELATED_CHAR == matchedProximityCharId) {

1337

// We found that this is an unrelated character, so we should give up traversing

1338

// this node and its children entirely.

1339

// However we may not be on the last virtual node yet so we skip the remaining

1340

// characters in this node, the frequency if it's there, read the next sibling

1341

// position to output it, then return false.

1342

// We don't have to output other values because we return false, as in

1343

// "don't traverse children".

1344

if (!isLastChar) {

1345

pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);

1346

}

1347

pos = BinaryFormat::skipFrequency(flags, pos);

1348

*nextSiblingPosition =

1349

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

return false;

}

mWord[depth] = c;

// If inputIndex is greater than mInputLength, that means there is no

1354

// proximity chars. So, we don't need to check proximity.

1355

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

1356

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

1357

}

1358

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

1359

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

1360

if (isSameAsUserTypedLength && isTerminal) {

1361

const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);

1362

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1363

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

1364

}

1365

// This character matched the typed character (enough to traverse the node at least)

1366

// so we just evaluated it. Now we should evaluate this virtual node's children - that

1367

// is, if it has any. If it has no children, we're done here - so we skip the end of

1368

// the node, output the siblings position, and return false "don't traverse children".

1369

// Note that !hasChildren implies isLastChar, so we know we don't have to skip any

1370

// remaining char in this group for there can't be any.

1371

if (!hasChildren) {

1372

pos = BinaryFormat::skipFrequency(flags, pos);

1373

*nextSiblingPosition =

1374

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1375

return false;

1376

}

1377

// Start traversing all nodes after the index exceeds the user typed length

1378

traverseAllNodes = isSameAsUserTypedLength;

1379

diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

1380

// Finally, we are ready to go to the next character, the next "virtual node".

1381

// We should advance the input index.

1382

// We do this in this branch of the 'if traverseAllNodes' because we are still matching

1383

// characters to input; the other branch is not matching them but searching for

1384

// completions, this is why it does not have to do it.

1385

++inputIndex;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1386

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1387

// Optimization: Prune out words that are too long compared to how much was typed.

1388

if (depth >= maxDepth || diffs > mMaxEditDistance) {

1389

// We are giving up parsing this node and its children. Skip the rest of the node,

1390

// output the sibling position, and return that we don't want to traverse children.

1391

if (!isLastChar) {

1392

pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);

1393

}

1394

pos = BinaryFormat::skipFrequency(flags, pos);

1395

*nextSiblingPosition =

1396

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1397

return false;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1398

}

1399

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1400

// Prepare for the next character. Promote the prefetched char to current char - the loop

1401

// will take care of prefetching the next. If we finally found our last char, nextc will

1402

// contain NOT_A_CHARACTER.

1403

c = nextc;

1404

// Also, the next char is one "virtual node" depth more than this char.

1405

++depth;

1406

} while (NOT_A_CHARACTER != c);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1407

1408

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1409

// Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength.

1410

if (mInputLength <= *newInputIndex) {

1411

traverseAllNodes = true;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1412

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1413

1414

// All the output values that are purely computation by this function are held in local

1415

// variables. Output them to the caller.

1416

*newTraverseAllNodes = traverseAllNodes;

1417

*newMatchRate = matchWeight;

1418

*newDiffs = diffs;

1419

*newInputIndex = inputIndex;

1420

*newOutputIndex = depth;

1421

1422

// Now we finished processing this node, and we want to traverse children. If there are no

1423

// children, we can't come here.

1424

assert(BinaryFormat::hasChildrenInFlags(flags));

1425

1426

// If this node was a terminal it still has the frequency under the pointer (it may have been

1427

// read, but not skipped - see readFrequencyWithoutMovingPointer).

1428

// Next come the children position, then possibly attributes (attributes are bigrams only for

1429

// now, maybe something related to shortcuts in the future).

1430

// Once this is read, we still need to output the number of nodes in the immediate children of

1431

// this node, so we read and output it before returning true, as in "please traverse children".

1432

pos = BinaryFormat::skipFrequency(flags, pos);

1433

int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);

1434

*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1435

*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);

1436

*newChildrenPosition = childrenPos;

1437

return true;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1438

}

1439

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

1440

#endif // NEW_DICTIONARY_FORMAT

1441

satok