blob: 55771eeb848328681d7cf1b4ea2ce93d6243d23b [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2 * Copyright (C) 2010 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_UNIGRAM_DICTIONARY_H
18#define LATINIME_UNIGRAM_DICTIONARY_H
19
Jean Chalard293ece02011-06-16 20:55:16 +090020#include <stdint.h>
satoke808e432010-12-02 14:53:24 +090021#include "defines.h"
satok8fbd5522011-02-22 17:28:55 +090022#include "proximity_info.h"
satoke808e432010-12-02 14:53:24 +090023
Jean Chalard293ece02011-06-16 20:55:16 +090024#ifndef NULL
25#define NULL 0
26#endif
27
satok30088252010-12-01 21:22:15 +090028namespace latinime {
29
satok30088252010-12-01 21:22:15 +090030class UnigramDictionary {
Jean Chalard8dc754a2011-01-27 14:20:22 +090031
satok30088252010-12-01 21:22:15 +090032public:
Jean Chalard1059f272011-06-28 20:45:05 +090033#ifdef NEW_DICTIONARY_FORMAT
34
35 // Mask and flags for children address type selection.
36 static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
37 static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
38 static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
39 static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
40 static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
41
42 // Flag for single/multiple char group
43 static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
44
45 // Flag for terminal groups
46 static const int FLAG_IS_TERMINAL = 0x10;
47
48 // Flag for bigram presence
49 static const int FLAG_HAS_BIGRAMS = 0x04;
50
51 // Attribute (bigram/shortcut) related flags:
52 // Flag for presence of more attributes
53 static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
54 // Flag for sign of offset. If this flag is set, the offset value must be negated.
55 static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
56
57 // Mask for attribute frequency, stored on 4 bits inside the flags byte.
58 static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
59
60 // Mask and flags for attribute address type selection.
61 static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
62 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
63 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
64 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
65#endif // NEW_DICTIONARY_FORMAT
66
Jean Chalard293ece02011-06-16 20:55:16 +090067 UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
68 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
69 const bool isLatestDictVersion);
Jean Chalard1059f272011-06-28 20:45:05 +090070#ifndef NEW_DICTIONARY_FORMAT
Jean Chalard8124e642011-06-16 22:33:41 +090071 bool isValidWord(unsigned short *word, int length);
Jean Chalard1059f272011-06-28 20:45:05 +090072#else // NEW_DICTIONARY_FORMAT
73 bool isValidWord(const uint16_t* const inWord, const int length) const;
74 int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
75 unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
76 int maxAlternatives);
77#endif // NEW_DICTIONARY_FORMAT
Jean Chalard581335c2011-06-17 12:45:17 +090078 int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
satok1d7eaf82011-07-13 10:32:02 +090079 int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090080 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
81 unsigned short *outWords, int *frequencies);
satok30088252010-12-01 21:22:15 +090082 ~UnigramDictionary();
83
84private:
satok1d7eaf82011-07-13 10:32:02 +090085 void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090086 const int *ycoordinates, const int *codes, const int codesSize,
87 unsigned short *outWords, int *frequencies);
88 bool isDigraph(const int* codes, const int i, const int codesSize) const;
satok1d7eaf82011-07-13 10:32:02 +090089 void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090090 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
91 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080092 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
satok1d7eaf82011-07-13 10:32:02 +090093 void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
94 const int *ycoordinates, const int *codes, const int codesSize,
95 unsigned short *outWords, int *frequencies);
satok54fe9e02010-12-13 14:42:35 +090096 void getSuggestionCandidates(const int skipPos, const int excessivePos,
satoka3d78f62010-12-09 22:08:33 +090097 const int transposedPos, int *nextLetters, const int nextLettersSize,
98 const int maxDepth);
satok30088252010-12-01 21:22:15 +090099 bool addWord(unsigned short *word, int length, int frequency);
satok817e5172011-03-04 06:06:45 -0800100 bool getSplitTwoWordsSuggestion(const int inputLength,
101 const int firstWordStartPos, const int firstWordLength,
satokd8db9f82011-05-18 15:31:04 +0900102 const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity);
satok662fe692010-12-08 17:05:39 +0900103 bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
satok817e5172011-03-04 06:06:45 -0800104 bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos);
satok58c49b92011-01-27 03:23:39 +0900105 int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900106 const int excessivePos, const int transposedPos, const int freq,
107 const bool sameLength) const;
Jean Chalardca5ef282011-06-17 15:36:26 +0900108 void onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900109 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900110 const int inputIndex, const int matchWeight, const int skipPos,
111 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
112 int *nextLetters, const int nextLettersSize);
satok28bd03b2010-12-03 16:39:16 +0900113 bool needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900114 const int inputIndex, const int skipPos, const int depth);
satok662fe692010-12-08 17:05:39 +0900115 // Process a node by considering proximity, missing and excessive character
Jean Chalard0584f022011-06-30 19:23:16 +0900116 bool processCurrentNode(const int initialPos, const int initialDepth,
117 const int maxDepth, const bool initialTraverseAllNodes, const int snr, int inputIndex,
118 const int initialDiffs, const int skipPos, const int excessivePos,
119 const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount,
120 int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex,
121 int *newDiffs, int *nextSiblingPosition, int *nextOutputIndex);
Jean Chalardbb15e772011-06-30 20:14:38 +0900122 int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
123 unsigned short *word);
Jean Chalard1059f272011-06-28 20:45:05 +0900124#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardffefdb62011-06-30 17:15:32 +0900125 void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
126 const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
127 const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
128 const int nextLettersSize);
129 // Keep getWordsOld for comparing performance between getWords and getWordsOld
130 void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
131 const int excessivePos, const int transposedPos, int *nextLetters,
132 const int nextLettersSize);
satok662fe692010-12-08 17:05:39 +0900133 // Process a node by considering missing space
satokaee09dc2010-12-09 19:21:51 +0900134 bool processCurrentNodeForExactMatch(const int firstChildPos,
135 const int startInputIndex, const int depth, unsigned short *word,
136 int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
Jean Chalard1059f272011-06-28 20:45:05 +0900137#else // NEW_DICTIONARY_FORMAT
Jean Chalard1059f272011-06-28 20:45:05 +0900138 int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
139 short unsigned int* outWord);
140#endif // NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +0900141
142 const uint8_t* const DICT_ROOT;
satok30088252010-12-01 21:22:15 +0900143 const int MAX_WORD_LENGTH;
Ken Wakasae90b3332011-01-07 15:01:51 +0900144 const int MAX_WORDS;
satok662fe692010-12-08 17:05:39 +0900145 const int MAX_PROXIMITY_CHARS;
satoke808e432010-12-02 14:53:24 +0900146 const bool IS_LATEST_DICT_VERSION;
satok18c28f42010-12-02 18:11:54 +0900147 const int TYPED_LETTER_MULTIPLIER;
148 const int FULL_WORD_MULTIPLIER;
Ken Wakasae90b3332011-01-07 15:01:51 +0900149 const int ROOT_POS;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900150 const unsigned int BYTES_IN_ONE_CHAR;
satok3c4bb772011-03-04 22:50:19 -0800151 const int MAX_UMLAUT_SEARCH_DEPTH;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900152
153 // Flags for special processing
154 // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
155 // or something very bad (like, the apocalypse) will happen.
156 // Please update both at the same time.
157 enum {
158 REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
159 };
160 static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
satok30088252010-12-01 21:22:15 +0900161
satok30088252010-12-01 21:22:15 +0900162 int *mFrequencies;
satok30088252010-12-01 21:22:15 +0900163 unsigned short *mOutputChars;
satok1d7eaf82011-07-13 10:32:02 +0900164 const ProximityInfo *mProximityInfo;
satok30088252010-12-01 21:22:15 +0900165 int mInputLength;
satok715514d2010-12-02 20:19:59 +0900166 // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
167 unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
satok30088252010-12-01 21:22:15 +0900168 int mMaxEditDistance;
satokd2997922010-12-07 13:08:39 +0900169
170 int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];
171 bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL];
172 int mStackNodeFreq[MAX_WORD_LENGTH_INTERNAL];
173 int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
174 int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
175 int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
Jean Chalard17e44a72011-06-16 22:51:11 +0900176 int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900177 int mNextLettersFrequency[NEXT_LETTERS_SIZE];
satok30088252010-12-01 21:22:15 +0900178};
Ken Wakasace9e52a2011-06-18 13:09:55 +0900179} // namespace latinime
satok30088252010-12-01 21:22:15 +0900180
181#endif // LATINIME_UNIGRAM_DICTIONARY_H