Move flags belonging to BinaryFormat to the right place.
These masks and flags are constants that are an integral part
of the format. They belong in BinaryFormat and have no business
being in UnigramDictionary.
This change depends on I6751dda4; without it, the build breaks.
Bug: 6429243
Change-Id: Ic1c842b3245f7fdc25aa8d1459c5bb07b262e265
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index bc34e4e..2201711 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -126,7 +126,7 @@
// codesSize == 0 means we are trying to find bigram predictions.
if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) {
- const int bigramFreqTemp = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+ const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
// Due to space constraints, the frequency for bigrams is approximate - the lower the
// unigram frequency, the worse the precision. The theoritical maximum error in
// resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
@@ -139,7 +139,7 @@
++bigramCount;
}
}
- } while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
+ } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
return bigramCount;
}
@@ -154,8 +154,8 @@
if (NOT_VALID_WORD == pos) return 0;
const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
- if (0 == (flags & UnigramDictionary::FLAG_HAS_BIGRAMS)) return 0;
- if (0 == (flags & UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS)) {
+ if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0;
+ if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
} else {
pos = BinaryFormat::skipOtherCharacters(root, pos);
@@ -182,12 +182,12 @@
int bigramFlags;
do {
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
- const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+ const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
&pos);
(*map)[bigramPos] = frequency;
setInFilter(filter, bigramPos);
- } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
+ } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
}
bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const {
@@ -223,7 +223,7 @@
if (bigramPos == nextWordPos) {
return true;
}
- } while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
+ } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
return false;
}
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 4155ef4..fdc13bf 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -18,13 +18,47 @@
#define LATINIME_BINARY_FORMAT_H
#include <limits>
+#include <map>
#include "bloom_filter.h"
#include "char_utils.h"
-#include "unigram_dictionary.h"
namespace latinime {
class BinaryFormat {
+ public:
+ // Mask and flags for children address type selection.
+ static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
+ static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
+ static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
+ static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
+ static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
+
+ // Flag for single/multiple char group
+ static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
+
+ // Flag for terminal groups
+ static const int FLAG_IS_TERMINAL = 0x10;
+
+ // Flag for shortcut targets presence
+ static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
+ // Flag for bigram presence
+ static const int FLAG_HAS_BIGRAMS = 0x04;
+
+ // Attribute (bigram/shortcut) related flags:
+ // Flag for presence of more attributes
+ static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
+ // Flag for sign of offset. If this flag is set, the offset value must be negated.
+ static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
+
+ // Mask for attribute frequency, stored on 4 bits inside the flags byte.
+ static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
+
+ // Mask and flags for attribute address type selection.
+ static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
@@ -174,13 +208,13 @@
static inline int attributeAddressSize(const uint8_t flags) {
static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
- return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
+ return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
/* Note: this is a value-dependant optimization of what may probably be
more readably written this way:
- switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
+ switch (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3;
default: return 0;
}
*/
@@ -189,7 +223,7 @@
static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
int currentPos = pos;
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
- while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
+ while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
currentPos += attributeAddressSize(flags);
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
}
@@ -199,7 +233,7 @@
static inline int childrenAddressSize(const uint8_t flags) {
static const int CHILDREN_ADDRESS_SHIFT = 6;
- return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
+ return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
/* See the note in attributeAddressSize. The same applies here */
}
@@ -212,12 +246,12 @@
}
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
- return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
+ return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}
inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
const int pos) {
- if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
+ if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
return pos + shortcutByteSize(dict, pos);
} else {
return pos;
@@ -226,7 +260,7 @@
inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
const int pos) {
- if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
+ if (FLAG_HAS_BIGRAMS & flags) {
return skipExistingBigrams(dict, pos);
} else {
return pos;
@@ -253,15 +287,15 @@
inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
const int pos) {
int offset = 0;
- switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
+ switch (MASK_GROUP_ADDRESS_TYPE & flags) {
+ case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
offset = dict[pos];
break;
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
+ case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
offset = dict[pos] << 8;
offset += dict[pos + 1];
break;
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
+ case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
offset = dict[pos] << 16;
offset += dict[pos + 1] << 8;
offset += dict[pos + 2];
@@ -275,32 +309,31 @@
}
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
- return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
- != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
+ return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
}
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
const uint8_t flags, int *pos) {
int offset = 0;
const int origin = *pos;
- switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
+ switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
offset = dict[origin];
*pos = origin + 1;
break;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
offset = dict[origin] << 8;
offset += dict[origin + 1];
*pos = origin + 2;
break;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
offset = dict[origin] << 16;
offset += dict[origin + 1] << 8;
offset += dict[origin + 2];
*pos = origin + 3;
break;
}
- if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
+ if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
return origin - offset;
} else {
return origin + offset;
@@ -332,7 +365,7 @@
// char within a node, so either we found our match in this node, or there is
// no match and we can return NOT_VALID_WORD. So we will check all the characters
// in this character group indeed does match.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
while (NOT_A_CHARACTER != character) {
++wordPos;
@@ -350,14 +383,13 @@
// If we don't match the length AND don't have children, then a word in the
// dictionary fully matches a prefix of the searched word but not the full word.
++wordPos;
- if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
+ if (FLAG_IS_TERMINAL & flags) {
if (wordPos == length) {
return charGroupPos;
}
- pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
+ pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
}
- if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
- == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
+ if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
return NOT_VALID_WORD;
}
// We have children and we are still shorter than the word we are searching for, so
@@ -367,7 +399,7 @@
break;
} else {
// This chargroup does not match, so skip the remaining part and go to the next.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
pos = BinaryFormat::skipOtherCharacters(root, pos);
}
pos = BinaryFormat::skipFrequency(flags, pos);
@@ -420,7 +452,7 @@
// We found the address. Copy the rest of the word in the buffer and return
// the length.
outWord[wordPos] = character;
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
// We count chars in order to avoid infinite loops if the file is broken or
// if there is some other bug
@@ -435,7 +467,7 @@
}
// We need to skip past this char group, so skip any remaining chars after the
// first and possibly the frequency.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
pos = skipOtherCharacters(root, pos);
}
pos = skipFrequency(flags, pos);
@@ -443,8 +475,8 @@
// The fact that this group has children is very important. Since we already know
// that this group does not match, if it has no children we know it is irrelevant
// to what we are searching for.
- const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
- (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
+ const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
+ (MASK_GROUP_ADDRESS_TYPE & flags));
// We will write in `found' whether we have passed the children address we are
// searching for. For example if we search for "beer", the children of b are less
// than the address we are searching for and the children of c are greater. When we
@@ -484,7 +516,7 @@
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
// We copy all the characters in this group to the buffer
outWord[wordPos] = lastChar;
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
int32_t nextChar =
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
int charCount = maxDepth;
diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h
index 755635f..d633645 100644
--- a/native/jni/src/terminal_attributes.h
+++ b/native/jni/src/terminal_attributes.h
@@ -17,7 +17,7 @@
#ifndef LATINIME_TERMINAL_ATTRIBUTES_H
#define LATINIME_TERMINAL_ATTRIBUTES_H
-#include "unigram_dictionary.h"
+#include "binary_format.h"
namespace latinime {
@@ -36,7 +36,7 @@
public:
ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) : mDict(dict),
mPos(pos) {
- mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS));
+ mHasNextShortcutTarget = (0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS));
}
inline bool hasNextShortcutTarget() const {
@@ -49,7 +49,7 @@
inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord) {
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
mHasNextShortcutTarget =
- 0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
+ 0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT);
unsigned int i;
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index b2fc870..efcf512 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -707,7 +707,7 @@
const uint8_t *const root, const int startPos,
const uint16_t *const inWord, const int startInputIndex,
int32_t *outNewWord, int *outInputIndex, int *outPos) {
- const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
+ const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
int pos = startPos;
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
int32_t baseChar = toBaseLowerCase(character);
@@ -780,7 +780,7 @@
// into inputIndex if there is a match.
const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
inputIndex, newWord, &inputIndex, &pos);
- if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
+ if (isAlike && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
}
@@ -823,7 +823,7 @@
return NOT_A_PROBABILITY;
}
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
- const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
+ const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
if (hasMultipleChars) {
pos = BinaryFormat::skipOtherCharacters(root, pos);
} else {
@@ -871,8 +871,8 @@
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
- const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
- const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags));
+ const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
+ const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
bool needsToInvokeOnTerminal = false;
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index ac17f50..6083f01 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -32,39 +32,6 @@
typedef struct { int first; int second; int replacement; } digraph_t;
public:
- // Mask and flags for children address type selection.
- static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
- static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
- static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
- static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
- static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
-
- // Flag for single/multiple char group
- static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
-
- // Flag for terminal groups
- static const int FLAG_IS_TERMINAL = 0x10;
-
- // Flag for shortcut targets presence
- static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
- // Flag for bigram presence
- static const int FLAG_HAS_BIGRAMS = 0x04;
-
- // Attribute (bigram/shortcut) related flags:
- // Flag for presence of more attributes
- static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
- // Flag for sign of offset. If this flag is set, the offset value must be negated.
- static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
-
- // Mask for attribute frequency, stored on 4 bits inside the flags byte.
- static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
-
- // Mask and flags for attribute address type selection.
- static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
- static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
- static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
- static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
-
// Error tolerances
static const int DEFAULT_MAX_ERRORS = 2;
static const int MAX_ERRORS_FOR_TWO_WORDS = 1;