blob: 1aeace63caf3c84d0887ae774869e42f560cb0a4 [file] [log] [blame]
/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#ifndef LATINIME_UNIGRAM_DICTIONARY_H
18#define LATINIME_UNIGRAM_DICTIONARY_H
19
Jean Chalard293ece02011-06-16 20:55:16 +090020#include <stdint.h>
satoke808e432010-12-02 14:53:24 +090021#include "defines.h"
satok8fbd5522011-02-22 17:28:55 +090022#include "proximity_info.h"
satoke808e432010-12-02 14:53:24 +090023
Jean Chalard293ece02011-06-16 20:55:16 +090024#ifndef NULL
25#define NULL 0
26#endif
27
satok30088252010-12-01 21:22:15 +090028namespace latinime {
29
satok30088252010-12-01 21:22:15 +090030class UnigramDictionary {
Jean Chalard8dc754a2011-01-27 14:20:22 +090031
32 typedef enum { // Used as a return value for character comparison
33 SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR, // Same char, possibly with different case or accent
34 NEAR_PROXIMITY_CHAR, // It is a char located nearby on the keyboard
35 UNRELATED_CHAR // It is an unrelated char
36 } ProximityType;
37
satok30088252010-12-01 21:22:15 +090038public:
Jean Chalard293ece02011-06-16 20:55:16 +090039 UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
40 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
41 const bool isLatestDictVersion);
Jean Chalard8124e642011-06-16 22:33:41 +090042 bool isValidWord(unsigned short *word, int length);
Jean Chalard581335c2011-06-17 12:45:17 +090043 int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090044 int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
45 const int *ycoordinates, const int *codes, const int codesSize, const int flags,
46 unsigned short *outWords, int *frequencies);
satok30088252010-12-01 21:22:15 +090047 ~UnigramDictionary();
48
49private:
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090050 void getWordSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
51 const int *ycoordinates, const int *codes, const int codesSize,
52 unsigned short *outWords, int *frequencies);
53 bool isDigraph(const int* codes, const int i, const int codesSize) const;
54 void getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
55 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
56 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
satok3c4bb772011-03-04 22:50:19 -080057 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
Jean Chalardc2bbc6a2011-02-25 17:56:53 +090058 void initSuggestions(const int *codes, const int codesSize, unsigned short *outWords,
59 int *frequencies);
satok54fe9e02010-12-13 14:42:35 +090060 void getSuggestionCandidates(const int skipPos, const int excessivePos,
satoka3d78f62010-12-09 22:08:33 +090061 const int transposedPos, int *nextLetters, const int nextLettersSize,
62 const int maxDepth);
Jean Chalardca5ef282011-06-17 15:36:26 +090063 bool sameAsTyped(const unsigned short *word, int length) const;
satok30088252010-12-01 21:22:15 +090064 bool addWord(unsigned short *word, int length, int frequency);
satok817e5172011-03-04 06:06:45 -080065 bool getSplitTwoWordsSuggestion(const int inputLength,
66 const int firstWordStartPos, const int firstWordLength,
satokd8db9f82011-05-18 15:31:04 +090067 const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity);
satok662fe692010-12-08 17:05:39 +090068 bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
satok817e5172011-03-04 06:06:45 -080069 bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos);
satok58c49b92011-01-27 03:23:39 +090070 int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
Jean Chalard07a84062011-03-03 10:22:10 +090071 const int excessivePos, const int transposedPos, const int freq,
72 const bool sameLength) const;
Jean Chalardca5ef282011-06-17 15:36:26 +090073 void onTerminal(unsigned short int* word, const int depth,
Jean Chalard980d6b62011-06-30 17:02:23 +090074 const uint8_t* const root, const uint8_t flags, const int pos,
Jean Chalardca5ef282011-06-17 15:36:26 +090075 const int inputIndex, const int matchWeight, const int skipPos,
76 const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
77 int *nextLetters, const int nextLettersSize);
satok28bd03b2010-12-03 16:39:16 +090078 bool needsToSkipCurrentNode(const unsigned short c,
satok68319262010-12-03 19:38:08 +090079 const int inputIndex, const int skipPos, const int depth);
Jean Chalard8dc754a2011-01-27 14:20:22 +090080 ProximityType getMatchedProximityId(const int *currentChars, const unsigned short c,
81 const int skipPos, const int excessivePos, const int transposedPos);
satok662fe692010-12-08 17:05:39 +090082 // Process a node by considering proximity, missing and excessive character
satok48e432c2010-12-06 17:38:58 +090083 bool processCurrentNode(const int pos, const int depth,
satokcdbbea72010-12-08 16:04:16 +090084 const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
satoka3d78f62010-12-09 22:08:33 +090085 const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
86 int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
satokcdbbea72010-12-08 16:04:16 +090087 bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
Jean Chalard17e44a72011-06-16 22:51:11 +090088 int *nextSiblingPosition, int *nextOutputIndex);
Jean Chalardffefdb62011-06-30 17:15:32 +090089 bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const;
90 void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
91 const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
92 const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
93 const int nextLettersSize);
94 // Keep getWordsOld for comparing performance between getWords and getWordsOld
95 void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
96 const int excessivePos, const int transposedPos, int *nextLetters,
97 const int nextLettersSize);
Jean Chalard980d6b62011-06-30 17:02:23 +090098 int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
99 unsigned short *word);
satok662fe692010-12-08 17:05:39 +0900100 // Process a node by considering missing space
satokaee09dc2010-12-09 19:21:51 +0900101 bool processCurrentNodeForExactMatch(const int firstChildPos,
102 const int startInputIndex, const int depth, unsigned short *word,
103 int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
Jean Chalard07a84062011-03-03 10:22:10 +0900104 inline const int* getInputCharsAt(const int index) const {
satok8fbd5522011-02-22 17:28:55 +0900105 return mInputCodes + (index * MAX_PROXIMITY_CHARS);
106 }
Jean Chalard293ece02011-06-16 20:55:16 +0900107
108 const uint8_t* const DICT_ROOT;
satok30088252010-12-01 21:22:15 +0900109 const int MAX_WORD_LENGTH;
Ken Wakasae90b3332011-01-07 15:01:51 +0900110 const int MAX_WORDS;
satok662fe692010-12-08 17:05:39 +0900111 const int MAX_PROXIMITY_CHARS;
satoke808e432010-12-02 14:53:24 +0900112 const bool IS_LATEST_DICT_VERSION;
satok18c28f42010-12-02 18:11:54 +0900113 const int TYPED_LETTER_MULTIPLIER;
114 const int FULL_WORD_MULTIPLIER;
Ken Wakasae90b3332011-01-07 15:01:51 +0900115 const int ROOT_POS;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900116 const unsigned int BYTES_IN_ONE_CHAR;
satok3c4bb772011-03-04 22:50:19 -0800117 const int MAX_UMLAUT_SEARCH_DEPTH;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900118
119 // Flags for special processing
120 // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
121 // or something very bad (like, the apocalypse) will happen.
122 // Please update both at the same time.
123 enum {
124 REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
125 };
126 static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
satok30088252010-12-01 21:22:15 +0900127
satok30088252010-12-01 21:22:15 +0900128 int *mFrequencies;
satok30088252010-12-01 21:22:15 +0900129 unsigned short *mOutputChars;
Jean Chalardc2bbc6a2011-02-25 17:56:53 +0900130 const int *mInputCodes;
satok30088252010-12-01 21:22:15 +0900131 int mInputLength;
satok715514d2010-12-02 20:19:59 +0900132 // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
133 unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
satok30088252010-12-01 21:22:15 +0900134 int mMaxEditDistance;
satokd2997922010-12-07 13:08:39 +0900135
136 int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];
137 bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL];
138 int mStackNodeFreq[MAX_WORD_LENGTH_INTERNAL];
139 int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
140 int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
141 int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
Jean Chalard17e44a72011-06-16 22:51:11 +0900142 int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
Tadashi G. Takaoka887f11e2011-02-10 20:53:58 +0900143 int mNextLettersFrequency[NEXT_LETTERS_SIZE];
satok30088252010-12-01 21:22:15 +0900144};
145
Ken Wakasace9e52a2011-06-18 13:09:55 +0900146} // namespace latinime
satok30088252010-12-01 21:22:15 +0900147
148#endif // LATINIME_UNIGRAM_DICTIONARY_H