// native/src/unigram_dictionary.h - android_packages_inputmethods_LatinIME - Gitiles

 /*
  * Copyright (C) 2010 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef LATINIME_UNIGRAM_DICTIONARY_H
 #define LATINIME_UNIGRAM_DICTIONARY_H

 #include <stdint.h>
 #include "correction.h"
 #include "correction_state.h"
 #include "defines.h"
 #include "proximity_info.h"

 #ifndef NULL
 #define NULL 0
 #endif

 namespace latinime {

 // Unigram (single-word) dictionary interface for the native LatinIME code.
 // This header only declares the class; the notes below restate what the declarations
 // and their accompanying comments say, not the behaviour of the implementation file.
 class UnigramDictionary {
 public:
     // Children address type: the mask, followed by the values it can select.
     static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
     static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
     static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
     static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
     static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;

     // Distinguishes a multiple-char group from a single-char group.
     static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
     // Marks a terminal group.
     static const int FLAG_IS_TERMINAL = 0x10;
     // Marks that bigrams are present.
     static const int FLAG_HAS_BIGRAMS = 0x04;

     // Attribute (bigram/shortcut) flags follow.
     // More attributes come after the current one.
     static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
     // Sign of the offset: when set, the offset value must be negated.
     static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
     // The attribute frequency lives on 4 bits inside the flags byte.
     static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
     // Attribute address type: the mask, followed by the values it can select.
     static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;

     // Construction / destruction. The constructor arguments presumably seed the
     // same-named const members declared further down (confirm in the .cpp).
     UnigramDictionary(
             const uint8_t *const streamStart,
             int typedLetterMultipler,
             int fullWordMultiplier,
             int maxWordLength,
             int maxWords,
             int maxProximityChars,
             const bool isLatestDictVersion);
     virtual ~UnigramDictionary();

     // Read-only queries (both are const member functions).
     bool isValidWord(const uint16_t *const inWord, const int length) const;
     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;

     // Public suggestion entry point. outWords and frequencies are non-const pointers,
     // so they look like caller-provided output buffers (verify against the caller).
     int getSuggestions(
             ProximityInfo *proximityInfo,
             const int *xcoordinates, const int *ycoordinates,
             const int *codes, const int codesSize,
             const int flags,
             unsigned short *outWords, int *frequencies);

 private:
     // Suggestion pipeline helpers; same argument set as getSuggestions() minus flags.
     void getWordSuggestions(
             ProximityInfo *proximityInfo,
             const int *xcoordinates, const int *ycoordinates,
             const int *codes, const int codesSize,
             unsigned short *outWords, int *frequencies);
     void initSuggestions(
             ProximityInfo *proximityInfo,
             const int *xcoordinates, const int *ycoordinates,
             const int *codes, const int codesSize,
             unsigned short *outWords, int *frequencies);
     void getSuggestionCandidates();

     // Digraph handling (see GERMAN_UMLAUT_DIGRAPHS below).
     bool isDigraph(const int *codes, const int i, const int codesSize) const;
     void getWordWithDigraphSuggestionsRec(
             ProximityInfo *proximityInfo,
             const int *xcoordinates, const int *ycoordinates,
             const int *codesBuffer, const int codesBufferSize,
             const int flags,
             const int *codesSrc, const int codesRemain,
             const int currentDepth,
             int *codesDest,
             unsigned short *outWords, int *frequencies);

     bool addWord(unsigned short *word, int length, int frequency);

     // Two-word (space related) suggestions.
     void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction);
     void getMissingSpaceWords(
             const int inputLength, const int missingSpacePos, Correction *correction);
     void getMistypedSpaceWords(
             const int inputLength, const int spaceProximityPos, Correction *correction);

     // Node traversal helpers.
     void onTerminal(const int freq, Correction *correction);
     bool needsToSkipCurrentNode(
             const unsigned short c, const int inputIndex, const int skipPos, const int depth);
     // Handles one node, taking proximity, missing and excessive characters into account.
     bool processCurrentNode(
             const int initialPos,
             Correction *correction,
             int *newCount, int *newChildPosition, int *nextSiblingPosition);

     int getMostFrequentWordLike(
             const int startInputIndex, const int inputLength, unsigned short *word);
     int getMostFrequentWordLikeInner(
             const uint16_t *const inWord, const int length, unsigned short *outWord);

     // Immutable per-instance configuration. Declaration order is significant for
     // constructor initialisation, so it is kept as-is.
     const uint8_t *const DICT_ROOT;
     const int MAX_WORD_LENGTH;
     const int MAX_WORDS;
     const int MAX_PROXIMITY_CHARS;
     const bool IS_LATEST_DICT_VERSION;
     const int TYPED_LETTER_MULTIPLIER;
     const int FULL_WORD_MULTIPLIER;
     const int ROOT_POS;
     const unsigned int BYTES_IN_ONE_CHAR;
     const int MAX_UMLAUT_SEARCH_DEPTH;

     // Special-processing flags.
     // These *must* stay identical to BinaryDictionary.Flags.ALL_FLAGS in
     // BinaryDictionary.java, otherwise something very bad (like, the apocalypse) will
     // happen. Always change both sides together.
     enum { REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1 };

     // A (first, second) pair of ints; the digraph table itself is defined elsewhere.
     struct digraph_t {
         int first;
         int second;
     };
     static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];

     // Mutable working state.
     int *mFrequencies;
     unsigned short *mOutputChars;
     ProximityInfo *mProximityInfo;
     Correction *mCorrection;
     int mInputLength;
     // MAX_WORD_LENGTH_INTERNAL has to be larger than MAX_WORD_LENGTH.
     unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];

     int mStackChildCount[MAX_WORD_LENGTH_INTERNAL]; // TODO: remove
     int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL]; // TODO: remove
     int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL]; // TODO: remove
 };
 } // namespace latinime

 #endif // LATINIME_UNIGRAM_DICTIONARY_H
	/*
	* Copyright (C) 2010 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef LATINIME_UNIGRAM_DICTIONARY_H
	#define LATINIME_UNIGRAM_DICTIONARY_H

	#include <stdint.h>
	#include "correction.h"
	#include "correction_state.h"
	#include "defines.h"
	#include "proximity_info.h"

	#ifndef NULL
	#define NULL 0
	#endif

	namespace latinime {

	// Declaration of the unigram dictionary class: flag/mask constants, the public query
	// entry points, private suggestion helpers and the working state they share.
	// Only declarations are visible in this header; see the implementation file for behaviour.
	class UnigramDictionary {

	public:

	    // Mask and flags for children address type selection.
	    static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
	    static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
	    static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
	    static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
	    static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;

	    // Flag for single/multiple char group
	    static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;

	    // Flag for terminal groups
	    static const int FLAG_IS_TERMINAL = 0x10;

	    // Flag for bigram presence
	    static const int FLAG_HAS_BIGRAMS = 0x04;

	    // Attribute (bigram/shortcut) related flags:
	    // Flag for presence of more attributes
	    static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
	    // Flag for sign of offset. If this flag is set, the offset value must be negated.
	    static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;

	    // Mask for attribute frequency, stored on 4 bits inside the flags byte.
	    static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;

	    // Mask and flags for attribute address type selection.
	    static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
	    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
	    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
	    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;

	    // NOTE(review): the constructor arguments presumably initialise the same-named
	    // const members below -- confirm in the implementation file.
	    UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
	            int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
	            const bool isLatestDictVersion);
	    bool isValidWord(const uint16_t* const inWord, const int length) const;
	    int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
	    // Public suggestion entry point. The coordinate, code and output arguments are
	    // pointers (arrays/buffers), matching the pointer-typed members mFrequencies,
	    // mOutputChars and mProximityInfo that hold them.
	    int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
	            const int *ycoordinates, const int *codes, const int codesSize, const int flags,
	            unsigned short *outWords, int *frequencies);
	    virtual ~UnigramDictionary();

	private:

	    // Same argument set as getSuggestions() without the flags argument.
	    void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
	            const int *ycoordinates, const int *codes, const int codesSize,
	            unsigned short *outWords, int *frequencies);
	    bool isDigraph(const int* codes, const int i, const int codesSize) const;
	    void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
	        const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
	        const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
	        const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
	    void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
	            const int *ycoordinates, const int *codes, const int codesSize,
	            unsigned short *outWords, int *frequencies);
	    void getSuggestionCandidates();
	    bool addWord(unsigned short *word, int length, int frequency);
	    void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction);
	    void getMissingSpaceWords(
	            const int inputLength, const int missingSpacePos, Correction *correction);
	    void getMistypedSpaceWords(
	            const int inputLength, const int spaceProximityPos, Correction *correction);
	    void onTerminal(const int freq, Correction *correction);
	    bool needsToSkipCurrentNode(const unsigned short c,
	            const int inputIndex, const int skipPos, const int depth);
	    // Process a node by considering proximity, missing and excessive character.
	    // newCount, newChildPosition and nextSiblingPosition are non-const pointers,
	    // i.e. out-parameters by the look of the signature (confirm in the implementation).
	    bool processCurrentNode(const int initialPos,
	            Correction *correction, int *newCount,
	            int *newChildPosition, int *nextSiblingPosition);
	    int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
	            unsigned short *word);
	    int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
	            short unsigned int* outWord);

	    // Immutable per-instance configuration.
	    const uint8_t* const DICT_ROOT;
	    const int MAX_WORD_LENGTH;
	    const int MAX_WORDS;
	    const int MAX_PROXIMITY_CHARS;
	    const bool IS_LATEST_DICT_VERSION;
	    const int TYPED_LETTER_MULTIPLIER;
	    const int FULL_WORD_MULTIPLIER;
	    const int ROOT_POS;
	    const unsigned int BYTES_IN_ONE_CHAR;
	    const int MAX_UMLAUT_SEARCH_DEPTH;

	    // Flags for special processing
	    // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
	    // or something very bad (like, the apocalypse) will happen.
	    // Please update both at the same time.
	    enum {
	        REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
	    };
	    // Table of (first, second) int pairs; its contents are defined outside this header.
	    static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];

	    // Mutable working state.
	    int *mFrequencies;
	    unsigned short *mOutputChars;
	    ProximityInfo *mProximityInfo;
	    Correction *mCorrection;
	    int mInputLength;
	    // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
	    unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];

	    int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
	    int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
	    int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
	};
	} // namespace latinime

	#endif // LATINIME_UNIGRAM_DICTIONARY_H