blob: 97198ef1328ecb1243149af42e78457c7629f701 [file] [log] [blame]
satok30088252010-12-01 21:22:15 +09001/*
2 * Copyright (C) 2010 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_UNIGRAM_DICTIONARY_H
18#define LATINIME_UNIGRAM_DICTIONARY_H
19
Jean Chalard293ece02011-06-16 20:55:16 +090020#include <stdint.h>
satoke808e432010-12-02 14:53:24 +090021#include "defines.h"
satok8fbd5522011-02-22 17:28:55 +090022#include "proximity_info.h"
satoke808e432010-12-02 14:53:24 +090023
Jean Chalard293ece02011-06-16 20:55:16 +090024#ifndef NULL
25#define NULL 0
26#endif
27
satok30088252010-12-01 21:22:15 +090028namespace latinime {
29
// UnigramDictionary: interface to a unigram dictionary read from a byte stream.
// The public surface offers a word validity check (isValidWord), a bigram position
// lookup (getBigramPosition) and suggestion generation (getSuggestions).
// Only declarations are visible in this header; notes below that go beyond the
// signatures are inferred from names and must be confirmed against the implementation.
// NOTE(review): this listing carries blame-view residue (author/commit/date/line number)
// fused onto each line; that residue is not part of the original header text.
satok30088252010-12-01 21:22:15 +090030class UnigramDictionary {
Jean Chalard8dc754a2011-01-27 14:20:22 +090031
satok30088252010-12-01 21:22:15 +090032public:
// The format constants below are declared only when NEW_DICTIONARY_FORMAT is defined.
Jean Chalard1059f272011-06-28 20:45:05 +090033#ifdef NEW_DICTIONARY_FORMAT
34
35 // Mask and flags for children address type selection.
// The mask covers the top two bits (0xC0); the four FLAG_GROUP_ADDRESS_TYPE_* values
// (0x00, 0x40, 0x80, 0xC0) are the four possible values of those two bits.
36 static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
37 static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
38 static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
39 static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
40 static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
41
// The three single-bit flags below (0x20, 0x10, 0x04) do not overlap the address-type
// mask above; presumably they live in the same group flags byte - TODO confirm.
42 // Flag for single/multiple char group
43 static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
44
45 // Flag for terminal groups
46 static const int FLAG_IS_TERMINAL = 0x10;
47
48 // Flag for bigram presence
49 static const int FLAG_HAS_BIGRAMS = 0x04;
50
51 // Attribute (bigram/shortcut) related flags:
52 // Flag for presence of more attributes
53 static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
54 // Flag for sign of offset. If this flag is set, the offset value must be negated.
55 static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
56
57 // Mask for attribute frequency, stored on 4 bits inside the flags byte.
58 static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
59
60 // Mask and flags for attribute address type selection.
// The mask covers bits 0x30; only the three non-zero values are given names here.
// Together with 0x80, 0x40 and the 0x0F frequency mask, these cover all eight bits.
61 static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
62 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
63 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
64 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
65#endif // NEW_DICTIONARY_FORMAT
66
// Constructor. streamStart is presumably the first byte of the dictionary data, and the
// remaining arguments presumably seed the like-named const members declared further
// down (TYPED_LETTER_MULTIPLIER, FULL_WORD_MULTIPLIER, MAX_WORD_LENGTH, MAX_WORDS,
// MAX_PROXIMITY_CHARS, IS_LATEST_DICT_VERSION) - TODO confirm in the implementation.
// NOTE(review): the parameter name "typedLetterMultipler" is misspelled (Multiplier).
Jean Chalard293ece02011-06-16 20:55:16 +090067 UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
68 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
69 const bool isLatestDictVersion);
// isValidWord has one declaration per dictionary format. The legacy form takes a
// mutable unsigned short buffer and is a non-const member; the new-format form takes a
// const uint16_t buffer and is a const member. Both take the word length and return bool.
Jean Chalard1059f272011-06-28 20:45:05 +090070#ifndef NEW_DICTIONARY_FORMAT
Jean Chalard8124e642011-06-16 22:33:41 +090071 bool isValidWord(unsigned short *word, int length);
Jean Chalard1059f272011-06-28 20:45:05 +090072#else // NEW_DICTIONARY_FORMAT
73 bool isValidWord(const uint16_t* const inWord, const int length) const;
Jean Chalard1059f272011-06-28 20:45:05 +090074#endif // NEW_DICTIONARY_FORMAT
// Const member returning an int; from the name, presumably the position of the bigram
// data for `word`. The meaning of pos/offset and of the failure value is not visible here.
Jean Chalard581335c2011-06-17 12:45:17 +090075 int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
// Public suggestion entry point. Takes proximity info, x/y coordinate arrays, the input
// codes with their count, and a flags word (see REQUIRES_GERMAN_UMLAUT_PROCESSING below).
// outWords and frequencies are non-const pointers, presumably caller-provided output
// buffers; the returned int is presumably the number of suggestions - TODO confirm.
satok1d7eaf82011-07-13 10:32:02 +090076 int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090077 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
78 unsigned short *outWords, int *frequencies);
satok30088252010-12-01 21:22:15 +090079 ~UnigramDictionary();
80
81private:
// Suggestion helper with the same parameters as getSuggestions minus `flags`.
satok1d7eaf82011-07-13 10:32:02 +090082 void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090083 const int *ycoordinates, const int *codes, const int codesSize,
84 unsigned short *outWords, int *frequencies);
// Const predicate on position i of codes; presumably tests against GERMAN_UMLAUT_DIGRAPHS
// declared below - TODO confirm.
85 bool isDigraph(const int* codes, const int i, const int codesSize) const;
// The "Rec" suffix and the currentDepth parameter suggest a recursive expansion of
// digraphs from codesSrc into codesDest - TODO confirm against the implementation.
satok1d7eaf82011-07-13 10:32:02 +090086 void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090087 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
88 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080089 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
// Presumably stores the inputs and output buffers into the m* members declared at the
// end of the class before a search starts - TODO confirm.
satok1d7eaf82011-07-13 10:32:02 +090090 void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
91 const int *ycoordinates, const int *codes, const int codesSize,
92 unsigned short *outWords, int *frequencies);
// skipPos / excessivePos / transposedPos recur across the helpers below; by their names
// they identify the input position of a skipped, extra or transposed character for the
// correction being tried. The "not used" sentinel value is not visible in this header.
satok54fe9e02010-12-13 14:42:35 +090093 void getSuggestionCandidates(const int skipPos, const int excessivePos,
satoka3d78f62010-12-09 22:08:33 +090094 const int transposedPos, int *nextLetters, const int nextLettersSize,
95 const int maxDepth);
// Returns bool; presumably whether the word was accepted into the output - TODO confirm.
satok30088252010-12-01 21:22:15 +090096 bool addWord(unsigned short *word, int length, int frequency);
// Two-word (space correction) helpers: the general split routine, followed by the
// missing-space and mistyped-space variants. All three return bool.
satok817e5172011-03-04 06:06:45 -080097 bool getSplitTwoWordsSuggestion(const int inputLength,
98 const int firstWordStartPos, const int firstWordLength,
satokd8db9f82011-05-18 15:31:04 +090099 const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity);
satok662fe692010-12-08 17:05:39 +0900100 bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
satok817e5172011-03-04 06:06:45 -0800101 bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos);
// Const member that derives an int score from the base `freq` and the match/correction
// parameters; the formula itself is not visible here.
satok58c49b92011-01-27 03:23:39 +0900102 int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
Jean Chalard07a84062011-03-03 10:22:10 +0900103 const int excessivePos, const int transposedPos, const int freq,
104 const bool sameLength) const;
// Presumably invoked when the traversal reaches a terminal (complete word) node.
// nextLetters is a non-const int array of nextLettersSize entries and may be written.
Jean Chalardca5ef282011-06-17 15:36:26 +0900105 void onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +0900106 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +0900107 const int inputIndex, const int matchWeight, const int skipPos,
108 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
109 int *nextLetters, const int nextLettersSize);
// Predicate on character c at the given input index and depth; skip criteria not visible.
satok28bd03b2010-12-03 16:39:16 +0900110 bool needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +0900111 const int inputIndex, const int skipPos, const int depth);
satok662fe692010-12-08 17:05:39 +0900112 // Process a node by considering proximity, missing and excessive character
// The leading parameters are passed by value (inputs). The trailing non-const pointers
// (newCount ... nextOutputIndex) presumably receive the state for the next traversal
// step; the meaning of the bool result is not visible here - TODO confirm.
Jean Chalard0584f022011-06-30 19:23:16 +0900113 bool processCurrentNode(const int initialPos, const int initialDepth,
114 const int maxDepth, const bool initialTraverseAllNodes, const int snr, int inputIndex,
115 const int initialDiffs, const int skipPos, const int excessivePos,
116 const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount,
117 int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex,
118 int *newDiffs, int *nextSiblingPosition, int *nextOutputIndex);
// Returns an int (presumably a frequency) and takes a non-const `word` buffer,
// presumably filled with the matching word - TODO confirm.
Jean Chalardbb15e772011-06-30 20:14:38 +0900119 int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
120 unsigned short *word);
// Format-specific private helpers: the first group is declared only for the legacy
// format, the #else branch only when NEW_DICTIONARY_FORMAT is defined.
Jean Chalard1059f272011-06-28 20:45:05 +0900121#ifndef NEW_DICTIONARY_FORMAT
Jean Chalardffefdb62011-06-30 17:15:32 +0900122 void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
123 const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
124 const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
125 const int nextLettersSize);
126 // Keep getWordsOld for comparing performance between getWords and getWordsOld
127 void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
128 const int excessivePos, const int transposedPos, int *nextLetters,
129 const int nextLettersSize);
satok662fe692010-12-08 17:05:39 +0900130 // Process a node by considering missing space
satokaee09dc2010-12-09 19:21:51 +0900131 bool processCurrentNodeForExactMatch(const int firstChildPos,
132 const int startInputIndex, const int depth, unsigned short *word,
133 int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
Jean Chalard1059f272011-06-28 20:45:05 +0900134#else // NEW_DICTIONARY_FORMAT
Jean Chalard1059f272011-06-28 20:45:05 +0900135 int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
136 short unsigned int* outWord);
137#endif // NEW_DICTIONARY_FORMAT
Jean Chalard293ece02011-06-16 20:55:16 +0900138
// Immutable configuration. Every member in this group is const, so each one has to be
// set in the constructor's initializer list and the class cannot be copy-assigned.
// DICT_ROOT is a const pointer to const bytes; the class declares a destructor, but
// whether it owns this memory is not visible here.
139 const uint8_t* const DICT_ROOT;
satok30088252010-12-01 21:22:15 +0900140 const int MAX_WORD_LENGTH;
Ken Wakasae90b3332011-01-07 15:01:51 +0900141 const int MAX_WORDS;
satok662fe692010-12-08 17:05:39 +0900142 const int MAX_PROXIMITY_CHARS;
satoke808e432010-12-02 14:53:24 +0900143 const bool IS_LATEST_DICT_VERSION;
satok18c28f42010-12-02 18:11:54 +0900144 const int TYPED_LETTER_MULTIPLIER;
145 const int FULL_WORD_MULTIPLIER;
Ken Wakasae90b3332011-01-07 15:01:51 +0900146 const int ROOT_POS;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900147 const unsigned int BYTES_IN_ONE_CHAR;
satok3c4bb772011-03-04 22:50:19 -0800148 const int MAX_UMLAUT_SEARCH_DEPTH;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900149
150 // Flags for special processing
151 // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
152 // or something very bad (like, the apocalypse) will happen.
153 // Please update both at the same time.
154 enum {
155 REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
156 };
// Table of (first, second) code pairs, declared here as an array of unspecified size;
// its definition and length live outside this header.
157 static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
satok30088252010-12-01 21:22:15 +0900158
// Mutable working state. mFrequencies and mOutputChars have the same types as the
// frequencies/outWords parameters above and presumably alias the caller's buffers for
// the duration of a search - TODO confirm.
satok30088252010-12-01 21:22:15 +0900159 int *mFrequencies;
satok30088252010-12-01 21:22:15 +0900160 unsigned short *mOutputChars;
satok1d7eaf82011-07-13 10:32:02 +0900161 const ProximityInfo *mProximityInfo;
satok30088252010-12-01 21:22:15 +0900162 int mInputLength;
satok715514d2010-12-02 20:19:59 +0900163 // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
164 unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
satok30088252010-12-01 21:22:15 +0900165 int mMaxEditDistance;
satokd2997922010-12-07 13:08:39 +0900166
// Parallel arrays, each with MAX_WORD_LENGTH_INTERNAL entries. The mStack prefix
// suggests one slot per traversal depth (an explicit stack) - TODO confirm.
167 int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];
168 bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL];
169 int mStackNodeFreq[MAX_WORD_LENGTH_INTERNAL];
170 int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
171 int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
172 int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
Jean Chalard17e44a72011-06-16 22:51:11 +0900173 int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
// Fixed-size int array of NEXT_LETTERS_SIZE entries (constant defined outside this file).
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900174 int mNextLettersFrequency[NEXT_LETTERS_SIZE];
satok30088252010-12-01 21:22:15 +0900175};
Ken Wakasace9e52a2011-06-18 13:09:55 +0900176} // namespace latinime
satok30088252010-12-01 21:22:15 +0900177
178#endif // LATINIME_UNIGRAM_DICTIONARY_H