blob: 70702a95fe87ca8681fcf9826616a015d7f315a1 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2 * Copyright (C) 2010 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_UNIGRAM_DICTIONARY_H
18#define LATINIME_UNIGRAM_DICTIONARY_H
19
Jean Chalard293ece02011-06-16 20:55:16 +090020#include <stdint.h>
satoke808e432010-12-02 14:53:24 +090021#include "defines.h"
satok8fbd5522011-02-22 17:28:55 +090022#include "proximity_info.h"
satoke808e432010-12-02 14:53:24 +090023
Jean Chalard293ece02011-06-16 20:55:16 +090024#ifndef NULL
25#define NULL 0
26#endif
27
satok30088252010-12-01 21:22:15 +090028namespace latinime {
29
satok30088252010-12-01 21:22:15 +090030class UnigramDictionary {
Jean Chalard8dc754a2011-01-27 14:20:22 +090031
32 typedef enum { // Used as a return value for character comparison
33 SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR, // Same char, possibly with different case or accent
34 NEAR_PROXIMITY_CHAR, // It is a char located nearby on the keyboard
35 UNRELATED_CHAR // It is an unrelated char
36 } ProximityType;
37
satok30088252010-12-01 21:22:15 +090038public:
Jean Chalard1059f272011-06-28 20:45:05 +090039#ifdef NEW_DICTIONARY_FORMAT
40
41 // Mask and flags for children address type selection.
42 static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
43 static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
44 static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
45 static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
46 static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
47
48 // Flag for single/multiple char group
49 static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
50
51 // Flag for terminal groups
52 static const int FLAG_IS_TERMINAL = 0x10;
53
54 // Flag for bigram presence
55 static const int FLAG_HAS_BIGRAMS = 0x04;
56
57 // Attribute (bigram/shortcut) related flags:
58 // Flag for presence of more attributes
59 static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
60 // Flag for sign of offset. If this flag is set, the offset value must be negated.
61 static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
62
63 // Mask for attribute frequency, stored on 4 bits inside the flags byte.
64 static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
65
66 // Mask and flags for attribute address type selection.
67 static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
68 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
69 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
70 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
71#endif // NEW_DICTIONARY_FORMAT
72
Jean Chalard293ece02011-06-16 20:55:16 +090073 UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
74 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
75 const bool isLatestDictVersion);
Jean Chalard1059f272011-06-28 20:45:05 +090076#ifndef NEW_DICTIONARY_FORMAT
Jean Chalard8124e642011-06-16 22:33:41 +090077 bool isValidWord(unsigned short *word, int length);
Jean Chalard1059f272011-06-28 20:45:05 +090078#else // NEW_DICTIONARY_FORMAT
79 bool isValidWord(const uint16_t* const inWord, const int length) const;
80 int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
81 unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
82 int maxAlternatives);
83#endif // NEW_DICTIONARY_FORMAT
Jean Chalard581335c2011-06-17 12:45:17 +090084 int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
satok1d7eaf82011-07-13 10:32:02 +090085 int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090086 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
87 unsigned short *outWords, int *frequencies);
satok30088252010-12-01 21:22:15 +090088 ~UnigramDictionary();
89
90private:
satok1d7eaf82011-07-13 10:32:02 +090091 void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090092 const int *ycoordinates, const int *codes, const int codesSize,
93 unsigned short *outWords, int *frequencies);
94 bool isDigraph(const int* codes, const int i, const int codesSize) const;
satok1d7eaf82011-07-13 10:32:02 +090095 void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090096 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
97 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080098 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
satok1d7eaf82011-07-13 10:32:02 +090099 void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
100 const int *ycoordinates, const int *codes, const int codesSize,
101 unsigned short *outWords, int *frequencies);
satok54fe9e02010-12-13 14:42:35 +0900102 void getSuggestionCandidates(const int skipPos, const int excessivePos,
satoka3d78f62010-12-09 22:08:33 +0900103 const int transposedPos, int *nextLetters, const int nextLettersSize,
104 const int maxDepth);
satok30088252010-12-01 21:22:15 +0900105 bool addWord(unsigned short *word, int length, int frequency);
satok817e5172011-03-04 06:06:45 -0800106 bool getSplitTwoWordsSuggestion(const int inputLength,
107 const int firstWordStartPos, const int firstWordLength,
satokd8db9f82011-05-18 15:31:04 +0900108 const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity);
satok662fe692010-12-08 17:05:39 +0900109 bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
satok817e5172011-03-04 06:06:45 -0800110 bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos);
satok58c49b92011-01-27 03:23:39 +0900111 int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900112 const int excessivePos, const int transposedPos, const int freq,
113 const bool sameLength) const;
Jean Chalardca5ef282011-06-17 15:36:26 +0900114 void onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900115 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900116 const int inputIndex, const int matchWeight, const int skipPos,
117 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
118 int *nextLetters, const int nextLettersSize);
satok28bd03b2010-12-03 16:39:16 +0900119 bool needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900120 const int inputIndex, const int skipPos, const int depth);
Jean Chalard8dc754a2011-01-27 14:20:22 +0900121 ProximityType getMatchedProximityId(const int *currentChars, const unsigned short c,
122 const int skipPos, const int excessivePos, const int transposedPos);
satok662fe692010-12-08 17:05:39 +0900123 // Process a node by considering proximity, missing and excessive character
Jean Chalard0584f022011-06-30 19:23:16 +0900124 bool processCurrentNode(const int initialPos, const int initialDepth,
125 const int maxDepth, const bool initialTraverseAllNodes, const int snr, int inputIndex,
126 const int initialDiffs, const int skipPos, const int excessivePos,
127 const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount,
128 int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex,
129 int *newDiffs, int *nextSiblingPosition, int *nextOutputIndex);
Jean Chalardffefdb62011-06-30 17:15:32 +0900130 bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const;
Jean Chalard432789a2011-06-30 17:50:48 +0900131 inline const int* getInputCharsAt(const int index) const {
satok1d7eaf82011-07-13 10:32:02 +0900132 return mProximityInfo->getProximityCharsAt(index);
Jean Chalard432789a2011-06-30 17:50:48 +0900133 }
Jean Chalardbb15e772011-06-30 20:14:38 +0900134 int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
135 unsigned short *word);
Jean Chalard1059f272011-06-28 20:45:05 +0900136#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardffefdb62011-06-30 17:15:32 +0900137 void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
138 const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
139 const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
140 const int nextLettersSize);
141 // Keep getWordsOld for comparing performance between getWords and getWordsOld
142 void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
143 const int excessivePos, const int transposedPos, int *nextLetters,
144 const int nextLettersSize);
satok662fe692010-12-08 17:05:39 +0900145 // Process a node by considering missing space
satokaee09dc2010-12-09 19:21:51 +0900146 bool processCurrentNodeForExactMatch(const int firstChildPos,
147 const int startInputIndex, const int depth, unsigned short *word,
148 int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
Jean Chalard1059f272011-06-28 20:45:05 +0900149#else // NEW_DICTIONARY_FORMAT
150 int getFrequency(const uint16_t* const inWord, const int length) const;
151 int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
152 short unsigned int* outWord);
153#endif // NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +0900154
155 const uint8_t* const DICT_ROOT;
satok30088252010-12-01 21:22:15 +0900156 const int MAX_WORD_LENGTH;
Ken Wakasae90b3332011-01-07 15:01:51 +0900157 const int MAX_WORDS;
satok662fe692010-12-08 17:05:39 +0900158 const int MAX_PROXIMITY_CHARS;
satoke808e432010-12-02 14:53:24 +0900159 const bool IS_LATEST_DICT_VERSION;
satok18c28f42010-12-02 18:11:54 +0900160 const int TYPED_LETTER_MULTIPLIER;
161 const int FULL_WORD_MULTIPLIER;
Ken Wakasae90b3332011-01-07 15:01:51 +0900162 const int ROOT_POS;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900163 const unsigned int BYTES_IN_ONE_CHAR;
satok3c4bb772011-03-04 22:50:19 -0800164 const int MAX_UMLAUT_SEARCH_DEPTH;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900165
166 // Flags for special processing
167 // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
168 // or something very bad (like, the apocalypse) will happen.
169 // Please update both at the same time.
170 enum {
171 REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
172 };
173 static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
satok30088252010-12-01 21:22:15 +0900174
satok30088252010-12-01 21:22:15 +0900175 int *mFrequencies;
satok30088252010-12-01 21:22:15 +0900176 unsigned short *mOutputChars;
satok1d7eaf82011-07-13 10:32:02 +0900177 const ProximityInfo *mProximityInfo;
satok30088252010-12-01 21:22:15 +0900178 int mInputLength;
satok715514d2010-12-02 20:19:59 +0900179 // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
180 unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
satok30088252010-12-01 21:22:15 +0900181 int mMaxEditDistance;
satokd2997922010-12-07 13:08:39 +0900182
183 int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];
184 bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL];
185 int mStackNodeFreq[MAX_WORD_LENGTH_INTERNAL];
186 int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
187 int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
188 int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
Jean Chalard17e44a72011-06-16 22:51:11 +0900189 int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900190 int mNextLettersFrequency[NEXT_LETTERS_SIZE];
satok30088252010-12-01 21:22:15 +0900191};
192
Ken Wakasace9e52a2011-06-18 13:09:55 +0900193} // namespace latinime
satok30088252010-12-01 21:22:15 +0900194
195#endif // LATINIME_UNIGRAM_DICTIONARY_H