Blame - native/src/unigram_dictionary.cpp - android_packages_inputmethods_LatinIME

2010-12-01 21:22:15 +0900

[diff] [blame]

/*

**

**

** Licensed under the Apache License, Version 2.0 (the "License");

6

** you may not use this file except in compliance with the License.

7

** You may obtain a copy of the License at

8

**

9

** http://www.apache.org/licenses/LICENSE-2.0

10

**

11

** Unless required by applicable law or agreed to in writing, software

12

** distributed under the License is distributed on an "AS IS" BASIS,

13

** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

** See the License for the specific language governing permissions and

15

** limitations under the License.

16

*/

17

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

18

#include <assert.h>

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

19

#include <string.h>

20

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

21

#define LOG_TAG "LatinIME: unigram_dictionary.cpp"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

22

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

23

#include "basechars.h"

24

#include "char_utils.h"

satok

e808e43

2010-12-02 14:53:24 +0900

[diff] [blame]

25

#include "dictionary.h"

26

#include "unigram_dictionary.h"

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

27

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

28

#ifdef NEW_DICTIONARY_FORMAT

29

#include "binary_format.h"

30

#endif // NEW_DICTIONARY_FORMAT

31

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

32

namespace latinime {

33

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

34

const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =

{ { 'a', 'e' },

{ 'o', 'e' },

{ 'u', 'e' } };

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

39

// TODO: check the header

40

UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

41

int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

42

const bool isLatestDictVersion)

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

43

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

44

: DICT_ROOT(streamStart),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

45

#else // NEW_DICTIONARY_FORMAT

46

: DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),

47

#endif // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

48

MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

49

MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),

50

TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

51

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

52

ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

53

#else // NEW_DICTIONARY_FORMAT

54

// TODO : remove this variable.

55

ROOT_POS(0),

56

#endif // NEW_DICTIONARY_FORMAT

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

57

BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

58

MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

59

if (DEBUG_DICT) {

60

LOGI("UnigramDictionary - constructor");

61

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

62

}

63

satok

18c28f4

2010-12-02 18:11:54 +0900

[diff] [blame]

64

UnigramDictionary::~UnigramDictionary() {}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

65

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

66

static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,

67

const int MAX_PROXIMITY_CHARS) {

68

return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;

69

}

70

71

bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {

72

73

// There can't be a digraph if we don't have at least 2 characters to examine

74

if (i + 2 > codesSize) return false;

75

76

// Search for the first char of some digraph

77

int lastDigraphIndex = -1;

78

const int thisChar = codes[i * MAX_PROXIMITY_CHARS];

79

for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;

80

lastDigraphIndex >= 0; --lastDigraphIndex) {

81

if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;

82

}

83

// No match: return early

84

if (lastDigraphIndex < 0) return false;

85

86

// It's an interesting digraph if the second char matches too.

87

return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];

88

}

89

90

// Mostly the same arguments as the non-recursive version, except:

91

// codes is the original value. It points to the start of the work buffer, and gets passed as is.

92

// codesSize is the size of the user input (thus, it is the size of codesSrc).

93

// codesDest is the current point in the work buffer.

94

// codesSrc is the current point in the user-input, original, content-unmodified buffer.

95

// codesRemain is the remaining size in codesSrc.

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

96

void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

97

const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,

98

const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,

satok

3c4bb77

2011-03-04 22:50:19 -0800

[diff] [blame]

99

const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

100

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

101

if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {

102

for (int i = 0; i < codesRemain; ++i) {

103

if (isDigraph(codesSrc, i, codesRemain)) {

104

// Found a digraph. We will try both spellings. eg. the word is "pruefen"

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

105

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

106

// Copy the word up to the first char of the digraph, then continue processing

107

// on the remaining part of the word, skipping the second char of the digraph.

108

// In our example, copy "pru" and continue running on "fen"

109

// Make i the index of the second char of the digraph for simplicity. Forgetting

110

// to do that results in an infinite recursion so take care!

111

++i;

112

memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);

113

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

114

codesBuffer, codesBufferSize, flags,

115

codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,

116

currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,

117

frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

118

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

119

// Copy the second char of the digraph in place, then continue processing on

120

// the remaining part of the word.

121

// In our example, after "pru" in the buffer copy the "e", and continue on "fen"

122

memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,

123

BYTES_IN_ONE_CHAR);

124

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,

125

codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,

126

codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,

127

outWords, frequencies);

128

return;

129

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

}

}

// If we come here, we hit the end of the word: let's check it against the dictionary.

134

// In our example, we'll come here once for "prufen" and then once for "pruefen".

135

// If the word contains several digraphs, we'll come it for the product of them.

136

// eg. if the word is "ueberpruefen" we'll test, in order, against

137

// "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".

138

const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;

139

if (0 != remainingBytes)

140

memcpy(codesDest, codesSrc, remainingBytes);

141

142

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

143

(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);

144

}

145

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

146

int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

147

const int *ycoordinates, const int *codes, const int codesSize, const int flags,

148

unsigned short *outWords, int *frequencies) {

149

150

if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)

151

{ // Incrementally tune the word and try all possibilities

152

int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];

153

getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,

Jean Chalard

2011-03-04 12:17:48 +0900

[diff] [blame]

154

codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

155

} else { // Normal processing

156

getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,

157

outWords, frequencies);

158

}

159

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

160

PROF_START(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

161

// Get the word count

162

int suggestedWordsCount = 0;

163

while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {

164

suggestedWordsCount++;

}

if (DEBUG_DICT) {

LOGI("Returning %d words", suggestedWordsCount);

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

169

/// Print the returned words

170

for (int j = 0; j < suggestedWordsCount; ++j) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

171

#ifdef FLAG_DBG

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

172

short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;

173

char s[MAX_WORD_LENGTH];

174

for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];

175

LOGI("%s %i", s, mFrequencies[j]);

satok

2011-07-14 08:32:57 +0900

[diff] [blame]

176

#endif

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

177

}

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

178

LOGI("Next letters: ");

179

for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {

180

if (mNextLettersFrequency[k] > 0) {

181

LOGI("%c = %d,", k, mNextLettersFrequency[k]);

182

}

183

}

184

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

185

PROF_END(20);

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

186

PROF_CLOSE;

187

return suggestedWordsCount;

188

}

189

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

190

void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

191

const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,

192

unsigned short *outWords, int *frequencies) {

193

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

194

PROF_OPEN;

195

PROF_START(0);

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

196

initSuggestions(

197

proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

198

if (DEBUG_DICT) assert(codesSize == mInputLength);

199

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

200

const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

201

PROF_END(0);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

202

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

203

PROF_START(1);

Tadashi G. Takaoka

887f11e

2011-02-10 20:53:58 +0900

[diff] [blame]

204

getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

205

PROF_END(1);

206

207

PROF_START(2);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

208

// Suggestion with missing character

209

if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

210

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

211

if (DEBUG_DICT) {

212

LOGI("--- Suggest missing characters %d", i);

213

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

214

getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

215

}

216

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

217

PROF_END(2);

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

218

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

219

PROF_START(3);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

220

// Suggestion with excessive character

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

221

if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER

222

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

223

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

224

if (DEBUG_DICT) {

225

LOGI("--- Suggest excessive characters %d", i);

226

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

227

getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

228

}

229

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

230

PROF_END(3);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

231

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

232

PROF_START(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

233

// Suggestion with transposed characters

234

// Only suggest words that length is mInputLength

235

if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {

236

for (int i = 0; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

237

if (DEBUG_DICT) {

238

LOGI("--- Suggest transposed characters %d", i);

239

}

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

240

getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

241

}

242

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

243

PROF_END(4);

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

244

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

245

PROF_START(5);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

246

// Suggestions with missing space

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

247

if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER

248

&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

249

for (int i = 1; i < codesSize; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

250

if (DEBUG_DICT) {

251

LOGI("--- Suggest missing space characters %d", i);

252

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

253

getMissingSpaceWords(mInputLength, i);

254

}

255

}

satok

2011-01-05 14:13:07 +0900

[diff] [blame]

256

PROF_END(5);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

257

258

PROF_START(6);

Jean Chalard

e93b1f22

2011-06-01 17:12:25 +0900

[diff] [blame]

259

if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

260

// The first and last "mistyped spaces" are taken care of by excessive character handling

261

for (int i = 1; i < codesSize - 1; ++i) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

262

if (DEBUG_DICT) {

263

LOGI("--- Suggest words with proximity space %d", i);

264

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

265

const int x = xcoordinates[i];

266

const int y = ycoordinates[i];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

267

if (DEBUG_PROXIMITY_INFO) {

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

268

LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",

269

i, x, y, proximityInfo->hasSpaceProximity(x, y));

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

270

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

271

if (proximityInfo->hasSpaceProximity(x, y)) {

272

getMistypedSpaceWords(mInputLength, i);

273

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

274

}

275

}

276

PROF_END(6);

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

277

}

278

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

279

void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,

280

const int *ycoordinates, const int *codes, const int codesSize,

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

281

unsigned short *outWords, int *frequencies) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

282

if (DEBUG_DICT) {

283

LOGI("initSuggest");

284

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

285

mFrequencies = frequencies;

286

mOutputChars = outWords;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

287

mInputLength = codesSize;

288

mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

289

proximityInfo->setInputParams(codes, codesSize);

290

mProximityInfo = proximityInfo;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

291

}

292

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

293

static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

294

if (c < nextLettersSize) {

nextLetters[c]++;

}

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

299

// TODO: We need to optimize addWord by using STL or something

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

300

// TODO: This needs to take an const unsigned short* and not tinker with its contents

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

301

bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

302

word[length] = 0;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

303

if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

304

#ifdef FLAG_DBG

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

305

char s[length + 1];

306

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

307

LOGI("Found word = %s, freq = %d", s, frequency);

satok

2011-07-14 08:32:57 +0900

[diff] [blame]

308

#endif

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

309

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

310

if (length > MAX_WORD_LENGTH) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

311

if (DEBUG_DICT) {

312

LOGI("Exceeded max word length.");

313

}

satok

f5cded1

2010-12-06 21:28:24 +0900

[diff] [blame]

314

return false;

315

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

316

317

// Find the right insertion point

318

int insertAt = 0;

319

while (insertAt < MAX_WORDS) {

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

320

// TODO: How should we sort words with the same frequency?

321

if (frequency > mFrequencies[insertAt]) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

break;

}

insertAt++;

}

if (insertAt < MAX_WORDS) {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

327

if (DEBUG_DICT) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

328

#ifdef FLAG_DBG

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

329

char s[length + 1];

330

for (int i = 0; i <= length; i++) s[i] = word[i];

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

331

LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);

satok

2011-07-14 08:32:57 +0900

[diff] [blame]

332

#endif

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

333

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

334

memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),

335

(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),

336

(MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));

337

mFrequencies[insertAt] = frequency;

338

memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

339

(char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

340

(MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);

satok

715514d

2010-12-02 20:19:59 +0900

[diff] [blame]

341

unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

while (length--) {

*dest++ = *word++;

}

*dest = 0; // NULL terminate

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

346

if (DEBUG_DICT) {

347

LOGI("Added word at %d", insertAt);

348

}

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

return true;

}

return false;

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

354

static inline unsigned short toBaseLowerCase(unsigned short c) {

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

355

if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {

356

c = BASE_CHARS[c];

357

}

358

if (c >='A' && c <= 'Z') {

359

c |= 32;

360

} else if (c > 127) {

361

c = latin_tolower(c);

}

return c;

}

satok

2010-12-02 20:19:59 +0900

[diff] [blame]

366

static const char QUOTE = '\'';

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

367

static const char SPACE = ' ';

satok

2010-12-01 21:22:15 +0900

[diff] [blame]

368

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

369

void UnigramDictionary::getSuggestionCandidates(const int skipPos,

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

370

const int excessivePos, const int transposedPos, int *nextLetters,

371

const int nextLettersSize, const int maxDepth) {

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

372

if (DEBUG_DICT) {

373

LOGI("getSuggestionCandidates %d", maxDepth);

374

assert(transposedPos + 1 < mInputLength);

375

assert(excessivePos < mInputLength);

376

assert(missingPos < mInputLength);

377

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

378

int rootPosition = ROOT_POS;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

379

// Get the number of children of root, then increment the position

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

380

int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

381

int depth = 0;

382

383

mStackChildCount[0] = childCount;

384

mStackTraverseAll[0] = (mInputLength <= 0);

385

mStackNodeFreq[0] = 1;

386

mStackInputIndex[0] = 0;

387

mStackDiffs[0] = 0;

388

mStackSiblingPos[0] = rootPosition;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

389

mStackOutputIndex[0] = 0;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

390

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

391

// Depth first search

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

392

while (depth >= 0) {

393

if (mStackChildCount[depth] > 0) {

394

--mStackChildCount[depth];

395

bool traverseAllNodes = mStackTraverseAll[depth];

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

396

int matchWeight = mStackNodeFreq[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

397

int inputIndex = mStackInputIndex[depth];

398

int diffs = mStackDiffs[depth];

399

int siblingPos = mStackSiblingPos[depth];

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

400

int outputIndex = mStackOutputIndex[depth];

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

401

int firstChildPos;

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

402

// depth will never be greater than maxDepth because in that case,

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

403

// needsToTraverseChildrenNodes should be false

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

404

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

405

maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,

406

excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,

407

&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

408

&siblingPos, &outputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

409

// Update next sibling pos

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

410

mStackSiblingPos[depth] = siblingPos;

411

if (needsToTraverseChildrenNodes) {

412

// Goes to child node

413

++depth;

414

mStackChildCount[depth] = childCount;

415

mStackTraverseAll[depth] = traverseAllNodes;

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

416

mStackNodeFreq[depth] = matchWeight;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

417

mStackInputIndex[depth] = inputIndex;

418

mStackDiffs[depth] = diffs;

419

mStackSiblingPos[depth] = firstChildPos;

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

420

mStackOutputIndex[depth] = outputIndex;

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

421

}

422

} else {

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

423

// Goes to parent sibling node

satok

2010-12-07 13:08:39 +0900

[diff] [blame]

--depth;

}

}

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

429

static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;

430

static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {

431

return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);

432

}

433

434

static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;

435

inline static void multiplyIntCapped(const int multiplier, int *base) {

436

const int temp = *base;

437

if (temp != S_INT_MAX) {

438

// Branch if multiplier == 2 for the optimization

439

if (multiplier == 2) {

440

*base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;

441

} else {

442

const int tempRetval = temp * multiplier;

443

*base = tempRetval >= temp ? tempRetval : S_INT_MAX;

}

}

}

inline static int powerIntCapped(const int base, const int n) {

satok

0b6b0a5

2011-04-27 16:29:27 +0900

[diff] [blame]

449

if (base == 2) {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

450

return n < 31 ? 1 << n : S_INT_MAX;

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

451

} else {

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

452

int ret = base;

453

for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);

return ret;

}

}

inline static void multiplyRate(const int rate, int *freq) {

459

if (*freq != S_INT_MAX) {

460

if (*freq > 1000000) {

461

*freq /= 100;

462

multiplyIntCapped(rate, freq);

463

} else {

464

multiplyIntCapped(rate, freq);

465

*freq /= 100;

466

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

}

}

satok

2011-04-19 13:58:42 +0900

[diff] [blame]

470

inline static int calcFreqForSplitTwoWords(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

471

const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength,

472

const int firstFreq, const int secondFreq, const bool isSpaceProximity) {

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

473

if (firstWordLength == 0 || secondWordLength == 0) {

474

return 0;

475

}

476

const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);

477

int tempFirstFreq = firstFreq;

478

multiplyRate(firstDemotionRate, &tempFirstFreq);

479

480

const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);

481

int tempSecondFreq = secondFreq;

482

multiplyRate(secondDemotionRate, &tempSecondFreq);

483

484

const int totalLength = firstWordLength + secondWordLength;

485

486

// Promote pairFreq with multiplying by 2, because the word length is the same as the typed

487

// length.

488

int totalFreq = tempFirstFreq + tempSecondFreq;

489

490

// This is a workaround to try offsetting the not-enough-demotion which will be done in

491

// calcNormalizedScore in Utils.java.

492

// In calcNormalizedScore the score will be demoted by (1 - 1 / length)

493

// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by

494

// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))

495

const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);

496

multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

497

498

// At this moment, totalFreq is calculated by the following formula:

499

// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))

500

// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

501

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

502

multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

503

504

// This is another workaround to offset the demotion which will be done in

505

// calcNormalizedScore in Utils.java.

506

// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote

507

// the same amount because we already have adjusted the synthetic freq of this "missing or

508

// mistyped space" suggestion candidate above in this method.

509

const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);

510

multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

511

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

512

if (isSpaceProximity) {

513

// A word pair with one space proximity correction

514

if (DEBUG_DICT) {

515

LOGI("Found a word pair with space proximity correction.");

516

}

517

multiplyIntCapped(typedLetterMultiplier, &totalFreq);

518

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);

519

}

520

satok

4c981d3

2011-04-19 13:58:42 +0900

[diff] [blame]

521

multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

return totalFreq;

}

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

525

bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {

526

return getSplitTwoWordsSuggestion(

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

527

inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

528

}

529

530

bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {

531

return getSplitTwoWordsSuggestion(

532

inputLength, 0, spaceProximityPos, spaceProximityPos + 1,

satok

2011-05-18 15:31:04 +0900

[diff] [blame]

533

inputLength - spaceProximityPos - 1, true);

satok

2011-03-04 06:06:45 -0800

[diff] [blame]

534

}

535

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

536

inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

537

const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

538

const int freq, const bool sameLength) const {

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

539

// TODO: Demote by edit distance

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

540

int finalFreq = freq * matchWeight;

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

541

if (skipPos >= 0) {

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

542

if (mInputLength >= 2) {

543

const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE

544

* (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)

545

/ (10 * mInputLength

546

- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

547

if (DEBUG_DICT_FULL) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

548

LOGI("Demotion rate for missing character is %d.", demotionRate);

549

}

satok

dc5301e

2011-04-11 16:14:45 +0900

[diff] [blame]

550

multiplyRate(demotionRate, &finalFreq);

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

} else {

finalFreq = 0;

}

}

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

555

if (transposedPos >= 0) multiplyRate(

556

WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

557

if (excessivePos >= 0) {

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

558

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

559

if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

560

// If an excessive character is not adjacent to the left char or the right char,

561

// we will demote this word.

satok

2011-01-05 16:37:53 +0900

[diff] [blame]

562

multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

563

}

564

}

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

565

int lengthFreq = TYPED_LETTER_MULTIPLIER;

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

566

multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq);

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

567

if (lengthFreq == matchWeight) {

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

568

// Full exact match

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

569

if (depth > 1) {

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

570

if (DEBUG_DICT) {

571

LOGI("Found full matched word.");

572

}

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

573

multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);

574

}

575

if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

576

finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

577

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

578

} else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) {

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

579

// A word with proximity corrections

satok

72bc17e

2011-04-13 17:23:27 +0900

[diff] [blame]

580

if (DEBUG_DICT) {

581

LOGI("Found one proximity correction.");

582

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

583

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq);

satok

9d2a302

2011-04-14 19:13:34 +0900

[diff] [blame]

584

multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);

satok

58c49b9

2011-01-27 03:23:39 +0900

[diff] [blame]

585

}

satok

9674f65

2011-04-20 17:15:27 +0900

[diff] [blame]

586

if (DEBUG_DICT) {

587

LOGI("calc: %d, %d", depth, sameLength);

588

}

satok

2011-04-26 14:50:54 +0900

[diff] [blame]

589

if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq);

satok

2010-12-13 14:42:35 +0900

[diff] [blame]

590

return finalFreq;

591

}

satok

2010-12-09 22:08:33 +0900

[diff] [blame]

592

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

593

inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,

satok

6831926

2010-12-03 19:38:08 +0900

[diff] [blame]

594

const int inputIndex, const int skipPos, const int depth) {

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

595

const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

596

// Skip the ' or other letter and continue deeper

597

return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;

598

}

599

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

600

inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,

Jean Chalard

2011-03-03 10:22:10 +0900

[diff] [blame]

601

const int inputLength) const {

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

602

if (inputIndex < 0 || inputIndex >= inputLength) return false;

603

const int currentChar = *getInputCharsAt(inputIndex);

604

const int leftIndex = inputIndex - 1;

605

if (leftIndex >= 0) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

606

const int *leftChars = getInputCharsAt(leftIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

607

int i = 0;

608

while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

609

if (leftChars[i++] == currentChar) return true;

610

}

611

}

612

const int rightIndex = inputIndex + 1;

613

if (rightIndex < inputLength) {

Jean Chalard

2011-02-25 17:56:53 +0900

[diff] [blame]

614

const int *rightChars = getInputCharsAt(rightIndex);

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

615

int i = 0;

616

while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {

617

if (rightChars[i++] == currentChar) return true;

}

}

return false;

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

623

// In the following function, c is the current character of the dictionary word

624

// currently examined.

625

// currentChars is an array containing the keys close to the character the

626

// user actually typed at the same position. We want to see if c is in it: if so,

627

// then the word contains at that position a character close to what the user

628

// typed.

629

// What the user typed is actually the first character of the array.

630

// Notice : accented characters do not have a proximity list, so they are alone

631

// in their list. The non-accented version of the character should be considered

632

// "close", but not the other keys close to the non-accented version.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

633

inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(

634

const int *currentChars, const unsigned short c, const int skipPos,

635

const int excessivePos, const int transposedPos) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

636

const unsigned short baseLowerC = toBaseLowerCase(c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

637

638

// The first char in the array is what user typed. If it matches right away,

639

// that means the user typed that same char for this pos.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

640

if (currentChars[0] == baseLowerC || currentChars[0] == c)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

641

return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;

642

643

// If one of those is true, we should not check for close characters at all.

644

if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)

645

return UNRELATED_CHAR;

646

647

// If the non-accented, lowercased version of that first character matches c,

648

// then we have a non-accented version of the accented character the user

649

// typed. Treat it as a close char.

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

650

if (toBaseLowerCase(currentChars[0]) == baseLowerC)

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

651

return NEAR_PROXIMITY_CHAR;

652

653

// Not an exact nor an accent-alike match: search the list of close keys

654

int j = 1;

satok

2010-12-09 21:55:40 +0900

[diff] [blame]

655

while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

656

const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

657

if (matched) return NEAR_PROXIMITY_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

658

++j;

659

}

Jean Chalard

2011-02-18 17:50:58 +0900

[diff] [blame]

660

661

// Was not included, signal this as an unrelated character.

Jean Chalard

2011-01-27 14:20:22 +0900

[diff] [blame]

662

return UNRELATED_CHAR;

satok

2010-12-03 16:39:16 +0900

[diff] [blame]

663

}

664

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

665

inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

666

const uint8_t* const root, const uint8_t flags, const int pos,

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

667

const int inputIndex, const int matchWeight, const int skipPos,

668

const int excessivePos, const int transposedPos, const int freq, const bool sameLength,

669

int* nextLetters, const int nextLettersSize) {

670

satok

2011-07-13 10:32:02 +0900

[diff] [blame]

671

const bool isSameAsTyped = sameLength ? mProximityInfo->sameAsTyped(word, depth + 1) : false;

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

672

if (isSameAsTyped) return;

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

673

674

if (depth >= MIN_SUGGEST_DEPTH) {

675

const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,

676

excessivePos, transposedPos, freq, sameLength);

677

if (!isSameAsTyped)

678

addWord(word, depth + 1, finalFreq);

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

679

}

680

681

if (sameLength && depth >= mInputLength && skipPos < 0) {

682

registerNextLetter(word[mInputLength], nextLetters, nextLettersSize);

}

}

Jean Chalard

2011-06-30 19:47:25 +0900

[diff] [blame]

686

bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,

687

const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,

688

const int secondWordLength, const bool isSpaceProximity) {

689

if (inputLength >= MAX_WORD_LENGTH) return false;

690

if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos

691

|| firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)

692

return false;

693

const int newWordLength = firstWordLength + secondWordLength + 1;

694

// Allocating variable length array on stack

695

unsigned short word[newWordLength];

696

const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);

697

if (DEBUG_DICT) {

698

LOGI("First freq: %d", firstFreq);

699

}

700

if (firstFreq <= 0) return false;

701

702

for (int i = 0; i < firstWordLength; ++i) {

word[i] = mWord[i];

}

const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);

707

if (DEBUG_DICT) {

708

LOGI("Second freq: %d", secondFreq);

709

}

710

if (secondFreq <= 0) return false;

711

712

word[firstWordLength] = SPACE;

713

for (int i = (firstWordLength + 1); i < newWordLength; ++i) {

714

word[i] = mWord[i - firstWordLength - 1];

715

}

716

717

int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,

718

secondWordLength, firstFreq, secondFreq, isSpaceProximity);

719

if (DEBUG_DICT) {

720

LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,

721

TYPED_LETTER_MULTIPLIER);

722

}

723

addWord(word, newWordLength, pairFreq);

return true;

}

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

727

#ifndef NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

728

// The following functions will be entirely replaced with new implementations.

729

void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,

730

const int excessivePos, const int transposedPos,int *nextLetters,

731

const int nextLettersSize) {

732

int initialPosition = initialPos;

733

const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);

734

getWordsRec(count, initialPosition, 0,

735

min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),

736

mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,

737

nextLettersSize);

738

}

Jean Chalard

2011-06-17 15:36:26 +0900

[diff] [blame]

739

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

740

void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,

741

const int maxDepth, const bool traverseAllNodes, const int matchWeight,

742

const int inputIndex, const int diffs, const int skipPos, const int excessivePos,

743

const int transposedPos, int *nextLetters, const int nextLettersSize) {

744

int siblingPos = pos;

745

for (int i = 0; i < childrenCount; ++i) {

746

int newCount;

747

int newChildPosition;

748

bool newTraverseAllNodes;

int newMatchRate;

int newInputIndex;

int newDiffs;

int newSiblingPos;

int newOutputIndex;

const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,

755

traverseAllNodes, matchWeight, inputIndex, diffs,

756

skipPos, excessivePos, transposedPos,

757

nextLetters, nextLettersSize,

758

&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,

759

&newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);

760

siblingPos = newSiblingPos;

satok

2010-12-08 16:04:16 +0900

[diff] [blame]

761

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

762

if (needsToTraverseChildrenNodes) {

763

getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,

764

newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,

765

nextLetters, nextLettersSize);

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

766

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

767

}

satok

2010-12-06 17:38:58 +0900

[diff] [blame]

768

}

769

Jean Chalard

2011-06-30 17:02:23 +0900

[diff] [blame]

770

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

771

const int inputLength, unsigned short *word) {

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

772

int pos = ROOT_POS;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

773

int count = Dictionary::getCount(DICT_ROOT, &pos);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

774

int maxFreq = 0;

775

int depth = 0;

776

unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

777

bool terminal = false;

778

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

779

mStackChildCount[0] = count;

780

mStackSiblingPos[0] = pos;

781

782

while (depth >= 0) {

783

if (mStackChildCount[depth] > 0) {

784

--mStackChildCount[depth];

785

int firstChildPos;

786

int newFreq;

787

int siblingPos = mStackSiblingPos[depth];

788

const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,

789

startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,

790

&siblingPos);

791

mStackSiblingPos[depth] = siblingPos;

792

if (depth == (inputLength - 1)) {

793

// Traverse sibling node

794

if (terminal) {

795

if (newFreq > maxFreq) {

796

for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];

797

if (DEBUG_DICT && DEBUG_NODE) {

Doug Kwan

2011-07-07 22:53:50 -0700

[diff] [blame]

798

#ifdef FLAG_DBG

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

799

char s[inputLength + 1];

800

for (int i = 0; i < inputLength; ++i) s[i] = word[i];

801

s[inputLength] = 0;

802

LOGI("New missing space word found: %d > %d (%s), %d, %d",

803

newFreq, maxFreq, s, inputLength, depth);

satok

2011-07-14 08:32:57 +0900

[diff] [blame]

804

#endif

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

}

maxFreq = newFreq;

}

}

} else if (needsToTraverseChildrenNodes) {

810

// Traverse children nodes

811

++depth;

812

mStackChildCount[depth] = count;

813

mStackSiblingPos[depth] = firstChildPos;

814

}

815

} else {

816

// Traverse parent node

817

--depth;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

818

}

819

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

820

821

word[inputLength] = 0;

822

return maxFreq;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

823

}

824

825

inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

826

const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,

827

int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {

828

const int inputIndex = startInputIndex + depth;

satok

8fbd552

2011-02-22 17:28:55 +0900

[diff] [blame]

829

const int *currentChars = getInputCharsAt(inputIndex);

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

830

unsigned short c;

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

831

*siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,

832

&c, newChildPosition, newTerminal, newFreq);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

833

const unsigned int inputC = currentChars[0];

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

834

if (DEBUG_DICT) {

835

assert(inputC <= U_SHORT_MAX);

836

}

Jean Chalard

2011-02-22 15:12:46 +0900

[diff] [blame]

837

const unsigned short baseLowerC = toBaseLowerCase(c);

838

const bool matched = (inputC == baseLowerC || inputC == c);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

839

const bool hasChild = *newChildPosition != 0;

840

if (matched) {

841

word[depth] = c;

842

if (DEBUG_DICT && DEBUG_NODE) {

843

LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);

Ken Wakasa

2011-03-19 09:16:42 +0900

[diff] [blame]

844

if (*newTerminal) {

845

LOGI("Terminal %d", *newFreq);

846

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

847

}

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

848

if (hasChild) {

Jean Chalard

2011-06-16 20:55:16 +0900

[diff] [blame]

849

*newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);

satok

2010-12-09 19:21:51 +0900

[diff] [blame]

return true;

} else {

return false;

}

} else {

// If this node is not user typed character, this method treats this word as unmatched.

856

// Thus newTerminal shouldn't be true.

857

*newTerminal = false;

858

return false;

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

859

}

satok

2010-12-08 17:05:39 +0900

[diff] [blame]

860

}

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

861

862

// TODO: use uint32_t instead of unsigned short

863

bool UnigramDictionary::isValidWord(unsigned short *word, int length) {

864

if (IS_LATEST_DICT_VERSION) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

865

return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

866

} else {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

867

return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

}

}

Jean Chalard

2011-06-16 22:51:11 +0900

[diff] [blame]

871

872

// Require strict exact match.

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

873

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

874

int length) const {

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

875

// returns address of bigram data of that word

876

// return -99 if not found

877

878

int count = Dictionary::getCount(DICT_ROOT, &pos);

879

unsigned short currentChar = (unsigned short) word[offset];

880

for (int j = 0; j < count; j++) {

881

unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);

882

int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);

883

int childPos = Dictionary::getAddress(DICT_ROOT, &pos);

884

if (c == currentChar) {

885

if (offset == length - 1) {

if (terminal) {

return (pos+1);

}

} else {

if (childPos != 0) {

Jean Chalard

2011-06-17 12:45:17 +0900

[diff] [blame]

891

int t = getBigramPosition(childPos, word, offset + 1, length);

Jean Chalard

2011-06-16 22:33:41 +0900

[diff] [blame]

if (t > 0) {

return t;

}

}

}

}

if (terminal) {

Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);

900

}

901

// There could be two instances of each alphabet - upper and lower case. So continue

902

// looking ...

903

}

904

return NOT_VALID_WORD;

905

}

906

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

907

// The following functions will be modified.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

908

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

909

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

910

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

911

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,

912

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

913

int *nextSiblingPosition, int *nextOutputIndex) {

914

if (DEBUG_DICT) {

915

int inputCount = 0;

916

if (skipPos >= 0) ++inputCount;

917

if (excessivePos >= 0) ++inputCount;

918

if (transposedPos >= 0) ++inputCount;

919

assert(inputCount <= 1);

}

unsigned short c;

int childPosition;

bool terminal;

int freq;

bool isSameAsUserTypedLength = false;

926

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

927

const int pos = initialPos;

928

const int depth = initialDepth;

929

const int traverseAllNodes = initialTraverseAllNodes;

930

const int diffs = initialDiffs;

931

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

932

const uint8_t flags = 0; // No flags for now

933

934

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

935

936

*nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,

937

&c, &childPosition, &terminal, &freq);

938

*nextOutputIndex = depth + 1;

939

940

const bool needsToTraverseChildrenNodes = childPosition != 0;

941

942

// If we are only doing traverseAllNodes, no need to look at the typed characters.

943

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

944

mWord[depth] = c;

945

if (traverseAllNodes && terminal) {

946

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

947

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

948

}

949

if (!needsToTraverseChildrenNodes) return false;

950

*newTraverseAllNodes = traverseAllNodes;

951

*newMatchRate = matchWeight;

952

*newDiffs = diffs;

953

*newInputIndex = inputIndex;

954

} else {

955

const int *currentChars = getInputCharsAt(inputIndex);

956

957

if (transposedPos >= 0) {

958

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

959

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

960

}

961

962

int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,

963

transposedPos);

964

if (UNRELATED_CHAR == matchedProximityCharId) return false;

965

mWord[depth] = c;

966

// If inputIndex is greater than mInputLength, that means there is no

967

// proximity chars. So, we don't need to check proximity.

968

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

969

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

970

}

971

bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

972

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

973

if (isSameAsUserTypedLength && terminal) {

974

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

975

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

976

}

977

if (!needsToTraverseChildrenNodes) return false;

978

// Start traversing all nodes after the index exceeds the user typed length

979

*newTraverseAllNodes = isSameAsUserTypedLength;

980

*newMatchRate = matchWeight;

981

*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

982

*newInputIndex = inputIndex + 1;

983

}

984

// Optimization: Prune out words that are too long compared to how much was typed.

985

if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {

return false;

}

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

990

// TODO: Check if this can be isSameAsUserTypedLength only.

991

if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {

992

*newTraverseAllNodes = true;

993

}

994

// get the count of nodes and increment childAddress.

995

*newCount = Dictionary::getCount(DICT_ROOT, &childPosition);

996

*newChildPosition = childPosition;

997

if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);

998

return needsToTraverseChildrenNodes;

999

}

1000

1001

#else // NEW_DICTIONARY_FORMAT

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1002

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1003

// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous

1004

// interface.

1005

inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

1006

const int inputLength, unsigned short *word) {

1007

uint16_t inWord[inputLength];

1008

1009

for (int i = 0; i < inputLength; ++i) {

1010

inWord[i] = *getInputCharsAt(startInputIndex + i);

1011

}

1012

return getMostFrequentWordLikeInner(inWord, inputLength, word);

1013

}

1014

1015

// This function will take the position of a character array within a CharGroup,

1016

// and check it actually like-matches the word in inWord starting at startInputIndex,

1017

// that is, it matches it with case and accents squashed.

1018

// The function returns true if there was a full match, false otherwise.

1019

// The function will copy on-the-fly the characters in the CharGroup to outNewWord.

1020

// It will also place the end position of the array in outPos; in outInputIndex,

1021

// it will place the index of the first char AFTER the match if there was a match,

1022

// and the initial position if there was not. It makes sense because if there was

1023

// a match we want to continue searching, but if there was not, we want to go to

1024

// the next CharGroup.

1025

// In and out parameters may point to the same location. This function takes care

1026

// not to use any input parameters after it wrote into its outputs.

1027

static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,

1028

const uint8_t* const root, const int startPos,

1029

const uint16_t* const inWord, const int startInputIndex,

1030

int32_t* outNewWord, int* outInputIndex, int* outPos) {

1031

const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));

1032

int pos = startPos;

1033

int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1034

int32_t baseChar = toBaseLowerCase(character);

1035

const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);

1036

1037

if (baseChar != wChar) {

1038

*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;

1039

*outInputIndex = startInputIndex;

1040

return false;

1041

}

1042

int inputIndex = startInputIndex;

1043

outNewWord[inputIndex] = character;

1044

if (hasMultipleChars) {

1045

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1046

while (NOT_A_CHARACTER != character) {

1047

baseChar = toBaseLowerCase(character);

1048

if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {

1049

*outPos = BinaryFormat::skipOtherCharacters(root, pos);

1050

*outInputIndex = startInputIndex;

1051

return false;

1052

}

1053

outNewWord[inputIndex] = character;

1054

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1055

}

1056

}

1057

*outInputIndex = inputIndex + 1;

*outPos = pos;

return true;

}

// This function is invoked when a word like the word searched for is found.

1063

// It will compare the frequency to the max frequency, and if greater, will

1064

// copy the word into the output buffer. In output value maxFreq, it will

1065

// write the new maximum frequency if it changed.

1066

static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,

1067

short unsigned int* outWord, int* maxFreq) {

1068

if (freq > *maxFreq) {

1069

for (int q = 0; q < length; ++q)

1070

outWord[q] = newWord[q];

outWord[length] = 0;

*maxFreq = freq;

}

}

// Will find the highest frequency of the words like the one passed as an argument,

1077

// that is, everything that only differs by case/accents.

1078

int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,

1079

const int length, short unsigned int* outWord) {

1080

int32_t newWord[MAX_WORD_LENGTH_INTERNAL];

1081

int depth = 0;

1082

int maxFreq = -1;

1083

const uint8_t* const root = DICT_ROOT;

1084

1085

mStackChildCount[0] = root[0];

1086

mStackInputIndex[0] = 0;

1087

mStackSiblingPos[0] = 1;

1088

while (depth >= 0) {

1089

const int charGroupCount = mStackChildCount[depth];

1090

int pos = mStackSiblingPos[depth];

1091

for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {

1092

int inputIndex = mStackInputIndex[depth];

1093

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);

1094

// Test whether all chars in this group match with the word we are searching for. If so,

1095

// we want to traverse its children (or if the length match, evaluate its frequency).

1096

// Note that this function will output the position regardless, but will only write

1097

// into inputIndex if there is a match.

1098

const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,

1099

inputIndex, newWord, &inputIndex, &pos);

1100

if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {

1101

const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);

1102

onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);

1103

}

1104

pos = BinaryFormat::skipFrequency(flags, pos);

1105

const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);

1106

const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);

1107

// If we had a match and the word has children, we want to traverse them. We don't have

1108

// to traverse words longer than the one we are searching for, since they will not match

1109

// anyway, so don't traverse unless inputIndex < length.

1110

if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {

1111

// Save position for this depth, to get back to this once children are done

1112

mStackChildCount[depth] = charGroupIndex;

1113

mStackSiblingPos[depth] = siblingPos;

1114

// Prepare stack values for next depth

1115

++depth;

1116

int childrenPos = childrenNodePos;

1117

mStackChildCount[depth] =

1118

BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);

1119

mStackSiblingPos[depth] = childrenPos;

1120

mStackInputIndex[depth] = inputIndex;

1121

pos = childrenPos;

1122

// Go to the next depth level.

++depth;

break;

} else {

// No match, or no children, or word too long to ever match: go the next sibling.

pos = siblingPos;

}

}

--depth;

}

return maxFreq;

}

// This function gets the frequency of the exact matching word in the dictionary.

1136

// If no match is found, it returns -1.

1137

int UnigramDictionary::getFrequency(const uint16_t* const inWord, const int length) const {

1138

int pos = 0;

1139

int wordPos = 0;

1140

const uint8_t* const root = DICT_ROOT;

1141

1142

while (true) {

1143

// If we already traversed the tree further than the word is long, there means

1144

// there was no match (or we would have found it).

1145

if (wordPos > length) return -1;

1146

int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);

1147

const uint16_t wChar = inWord[wordPos];

1148

while (true) {

1149

// If there are no more character groups in this node, it means we could not

1150

// find a matching character for this depth, therefore there is no match.

1151

if (0 >= charGroupCount) return -1;

1152

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);

1153

int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1154

if (character == wChar) {

1155

// This is the correct node. Only one character group may start with the same

1156

// char within a node, so either we found our match in this node, or there is

1157

// no match and we can return -1. So we will check all the characters in this

1158

// character group indeed does match.

1159

if (FLAG_HAS_MULTIPLE_CHARS & flags) {

1160

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1161

while (NOT_A_CHARACTER != character) {

1162

++wordPos;

1163

// If we shoot the length of the word we search for, or if we find a single

1164

// character that does not match, as explained above, it means the word is

1165

// not in the dictionary (by virtue of this chargroup being the only one to

1166

// match the word on the first character, but not matching the whole word).

1167

if (wordPos > length) return -1;

1168

if (inWord[wordPos] != character) return -1;

1169

character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);

1170

}

1171

}

1172

// If we come here we know that so far, we do match. Either we are on a terminal

1173

// and we match the length, in which case we found it, or we traverse children.

1174

// If we don't match the length AND don't have children, then a word in the

1175

// dictionary fully matches a prefix of the searched word but not the full word.

1176

++wordPos;

1177

if (FLAG_IS_TERMINAL & flags) {

1178

if (wordPos == length) {

1179

return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);

1180

}

1181

pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);

1182

}

1183

if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags))

1184

return -1;

1185

// We have children and we are still shorter than the word we are searching for, so

1186

// we need to traverse children. Put the pointer on the children position, and

1187

// break

1188

pos = BinaryFormat::readChildrenPosition(root, flags, pos);

1189

break;

1190

} else {

1191

// This chargroup does not match, so skip the remaining part and go to the next.

1192

if (FLAG_HAS_MULTIPLE_CHARS & flags) {

1193

pos = BinaryFormat::skipOtherCharacters(root, pos);

1194

}

1195

pos = BinaryFormat::skipFrequency(flags, pos);

1196

pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);

}

--charGroupCount;

}

}

}

bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {

1204

return -1 != getFrequency(inWord, length);

1205

}

1206

1207

int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,

1208

unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,

1209

int maxAlternatives) {

1210

// TODO: add implementation.

return 0;

}

// TODO: remove this function.

1215

int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,

int length) const {

return -1;

}

// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.

1221

// If the return value is false, then the caller should read in the output "nextSiblingPosition"

1222

// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.

1223

// It is worthy to note that when false is returned, the output values other than

1224

// nextSiblingPosition are undefined.

1225

// If the return value is true, then the caller must proceed to traverse the children of this

1226

// node. processCurrentNode will output the information about the children: their count in

1227

// newCount, their position in newChildrenPosition, the traverseAllNodes flag in

1228

// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the

1229

// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into

1230

// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when

1231

// there aren't any more nodes at this level, it merely returns the address of the first byte after

1232

// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any

1233

// given level, as output into newCount when traversing this level's parent.

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1234

inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,

1235

const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,

1236

const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1237

int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1238

bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,

Jean Chalard

432789a

2011-06-30 17:50:48 +0900

[diff] [blame]

1239

int *nextSiblingPosition, int *newOutputIndex) {

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1240

if (DEBUG_DICT) {

1241

int inputCount = 0;

1242

if (skipPos >= 0) ++inputCount;

1243

if (excessivePos >= 0) ++inputCount;

1244

if (transposedPos >= 0) ++inputCount;

1245

assert(inputCount <= 1);

1246

}

Jean Chalard

2011-06-30 19:23:16 +0900

[diff] [blame]

1247

int pos = initialPos;

1248

int depth = initialDepth;

1249

int traverseAllNodes = initialTraverseAllNodes;

1250

int diffs = initialDiffs;

1251

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1252

// Flags contain the following information:

1253

// - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:

1254

// - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address

1255

// is on the specified number of bytes.

1256

// - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.

1257

// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.

1258

// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)

1259

// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not

1260

const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);

1261

const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1262

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1263

// This gets only ONE character from the stream. Next there will be:

1264

// if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node

1265

// else if FLAG_IS_TERMINAL: the frequency

1266

// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address

1267

// Note that you can't have a node that both is not a terminal and has no children.

1268

int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);

1269

assert(NOT_A_CHARACTER != c);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1270

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1271

// We are going to loop through each character and make it look like it's a different

1272

// node each time. To do that, we will process characters in this node in order until

1273

// we find the character terminator. This is signalled by getCharCode* returning

1274

// NOT_A_CHARACTER.

1275

// As a special case, if there is only one character in this node, we must not read the

1276

// next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.

1277

// This way, each loop run will look like a "virtual node".

1278

do {

1279

// We prefetch the next char. If 'c' is the last char of this node, we will have

1280

// NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node

1281

// should behave as a terminal or not and whether we have children.

1282

const int32_t nextc = hasMultipleChars

1283

? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;

1284

const bool isLastChar = (NOT_A_CHARACTER == nextc);

1285

// If there are more chars in this nodes, then this virtual node is not a terminal.

1286

// If we are on the last char, this virtual node is a terminal if this node is.

1287

const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags));

1288

// If there are more chars in this node, then this virtual node has children.

1289

// If we are on the last char, this virtual node has children if this node has.

1290

const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1291

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1292

// This has to be done for each virtual char (this forwards the "inputIndex" which

1293

// is the index in the user-inputted chars, as read by getInputCharsAt.

1294

if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

1295

if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {

1296

mWord[depth] = c;

1297

if (traverseAllNodes && isTerminal) {

1298

// The frequency should be here, because we come here only if this is actually

1299

// a terminal node, and we are on its last char.

1300

const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);

1301

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1302

excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);

1303

}

1304

if (!hasChildren) {

1305

// If we don't have children here, that means we finished processing all

1306

// characters of this node (we are on the last virtual node), AND we are in

1307

// traverseAllNodes mode, which means we are searching for *completions*. We

1308

// should skip the frequency if we have a terminal, and report the position

1309

// of the next sibling. We don't have to return other values because we are

1310

// returning false, as in "don't traverse children".

1311

if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos);

1312

*nextSiblingPosition =

1313

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

return false;

}

} else {

const int *currentChars = getInputCharsAt(inputIndex);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1318

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1319

if (transposedPos >= 0) {

1320

if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;

1321

if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;

1322

}

1323

1324

const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos,

1325

excessivePos, transposedPos);

1326

if (UNRELATED_CHAR == matchedProximityCharId) {

1327

// We found that this is an unrelated character, so we should give up traversing

1328

// this node and its children entirely.

1329

// However we may not be on the last virtual node yet so we skip the remaining

1330

// characters in this node, the frequency if it's there, read the next sibling

1331

// position to output it, then return false.

1332

// We don't have to output other values because we return false, as in

1333

// "don't traverse children".

1334

if (!isLastChar) {

1335

pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);

1336

}

1337

pos = BinaryFormat::skipFrequency(flags, pos);

1338

*nextSiblingPosition =

1339

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

return false;

}

mWord[depth] = c;

// If inputIndex is greater than mInputLength, that means there is no

1344

// proximity chars. So, we don't need to check proximity.

1345

if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {

1346

multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);

1347

}

1348

const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1

1349

|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);

1350

if (isSameAsUserTypedLength && isTerminal) {

1351

const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);

1352

onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,

1353

excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);

1354

}

1355

// This character matched the typed character (enough to traverse the node at least)

1356

// so we just evaluated it. Now we should evaluate this virtual node's children - that

1357

// is, if it has any. If it has no children, we're done here - so we skip the end of

1358

// the node, output the siblings position, and return false "don't traverse children".

1359

// Note that !hasChildren implies isLastChar, so we know we don't have to skip any

1360

// remaining char in this group for there can't be any.

1361

if (!hasChildren) {

1362

pos = BinaryFormat::skipFrequency(flags, pos);

1363

*nextSiblingPosition =

1364

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1365

return false;

1366

}

1367

// Start traversing all nodes after the index exceeds the user typed length

1368

traverseAllNodes = isSameAsUserTypedLength;

1369

diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);

1370

// Finally, we are ready to go to the next character, the next "virtual node".

1371

// We should advance the input index.

1372

// We do this in this branch of the 'if traverseAllNodes' because we are still matching

1373

// characters to input; the other branch is not matching them but searching for

1374

// completions, this is why it does not have to do it.

1375

++inputIndex;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1376

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1377

// Optimization: Prune out words that are too long compared to how much was typed.

1378

if (depth >= maxDepth || diffs > mMaxEditDistance) {

1379

// We are giving up parsing this node and its children. Skip the rest of the node,

1380

// output the sibling position, and return that we don't want to traverse children.

1381

if (!isLastChar) {

1382

pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);

1383

}

1384

pos = BinaryFormat::skipFrequency(flags, pos);

1385

*nextSiblingPosition =

1386

BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1387

return false;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1388

}

1389

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1390

// Prepare for the next character. Promote the prefetched char to current char - the loop

1391

// will take care of prefetching the next. If we finally found our last char, nextc will

1392

// contain NOT_A_CHARACTER.

1393

c = nextc;

1394

// Also, the next char is one "virtual node" depth more than this char.

1395

++depth;

1396

} while (NOT_A_CHARACTER != c);

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1397

1398

// If inputIndex is greater than mInputLength, that means there are no proximity chars.

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1399

// Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength.

1400

if (mInputLength <= *newInputIndex) {

1401

traverseAllNodes = true;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1402

}

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

1403

1404

// All the output values that are purely computation by this function are held in local

1405

// variables. Output them to the caller.

1406

*newTraverseAllNodes = traverseAllNodes;

1407

*newMatchRate = matchWeight;

1408

*newDiffs = diffs;

1409

*newInputIndex = inputIndex;

1410

*newOutputIndex = depth;

1411

1412

// Now we finished processing this node, and we want to traverse children. If there are no

1413

// children, we can't come here.

1414

assert(BinaryFormat::hasChildrenInFlags(flags));

1415

1416

// If this node was a terminal it still has the frequency under the pointer (it may have been

1417

// read, but not skipped - see readFrequencyWithoutMovingPointer).

1418

// Next come the children position, then possibly attributes (attributes are bigrams only for

1419

// now, maybe something related to shortcuts in the future).

1420

// Once this is read, we still need to output the number of nodes in the immediate children of

1421

// this node, so we read and output it before returning true, as in "please traverse children".

1422

pos = BinaryFormat::skipFrequency(flags, pos);

1423

int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);

1424

*nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);

1425

*newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);

1426

*newChildrenPosition = childrenPos;

1427

return true;

Jean Chalard

2011-06-21 22:23:21 +0900

[diff] [blame]

1428

}

1429

Jean Chalard

2011-06-20 21:09:04 +0900

[diff] [blame]

1430

#endif // NEW_DICTIONARY_FORMAT

1431

satok