Blame - native/src/binary_format.h - android_packages_inputmethods_LatinIME

2011-06-28 20:45:05 +0900

[diff] [blame]

/*

*

* Licensed under the Apache License, Version 2.0 (the "License");

5

* you may not use this file except in compliance with the License.

6

* You may obtain a copy of the License at

7

*

8

* http://www.apache.org/licenses/LICENSE-2.0

9

*

10

* Unless required by applicable law or agreed to in writing, software

11

* distributed under the License is distributed on an "AS IS" BASIS,

12

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13

* See the License for the specific language governing permissions and

14

* limitations under the License.

15

*/

16

17

#ifndef LATINIME_BINARY_FORMAT_H

18

#define LATINIME_BINARY_FORMAT_H

19

Jean Chalard

2012-02-27 19:48:47 +0900

[diff] [blame^]

20

#include <limits>

Jean Chalard

2011-07-20 18:42:32 +0900

[diff] [blame]

21

#include "unigram_dictionary.h"

22

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

23

namespace latinime {

24

25

// Collection of static helpers for reading the binary dictionary format.
// Every member is static; the class only serves as a scope for the constants and functions.
class BinaryFormat {
 private:
    // Bytes below this value are not stand-alone one-byte characters: they are either the
    // character array terminator or the first byte of a multi-byte character.
    static const int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    // Byte value that ends a character array.
    static const int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
    // Number of extra bytes that follow the first byte of a multi-byte character.
    static const int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;

 public:
    // Value returned by detectFormat() when the magic number is not recognized.
    static const int UNKNOWN_FORMAT = -1;
    // Originally, format version 1 had a 16-bit magic number, then the version number `01',
    // then options that must be 0. Hence the first 32 bits of the format are always as follows,
    // and it's okay to consider them a magic number as a whole.
    static const uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
    static const unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
    // The versions of Latin IME that only handle format version 1 only test for the magic
    // number, so we had to change it so that version 2 files would be rejected by older
    // implementations. On this occasion, we made the magic number 32 bits long.
    static const uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;

    // Header inspection.
    static int detectFormat(const uint8_t* const dict);
    static unsigned int getHeaderSize(const uint8_t* const dict);

    // Readers that advance *pos past what they read.
    static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
    static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
    static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);

    // Readers and skippers that take a position by value and return a value or a new position.
    static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
    static int skipOtherCharacters(const uint8_t* const dict, const int pos);
    static int skipAttributes(const uint8_t* const dict, const int pos);
    static int skipChildrenPosition(const uint8_t flags, const int pos);
    static int skipFrequency(const uint8_t flags, const int pos);
    static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
    static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
            const int pos);
    static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
            const int pos);
    static bool hasChildrenInFlags(const uint8_t flags);
    static int getAttributeAddressAndForwardPointer(const uint8_t* const dict,
            const uint8_t flags, int* pos);

    // Word lookup helpers.
    static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
            const int length);
    static int getWordAtAddress(const uint8_t* const root, const int address,
            const int maxDepth, uint16_t* outWord);
};

65

Jean Chalard

2011-07-20 18:42:32 +0900

[diff] [blame]

66

inline int BinaryFormat::detectFormat(const uint8_t* const dict) {

Jean Chalard

2012-02-27 19:48:47 +0900

[diff] [blame^]

67

// The magic number is stored big-endian.

68

const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];

69

switch (magicNumber) {

70

case FORMAT_VERSION_1_MAGIC_NUMBER:

71

// Format 1 header is exactly 5 bytes long and looks like:

72

// Magic number (2 bytes) 0x78 0xB1

73

// Version number (1 byte) 0x01

74

// Options (2 bytes) must be 0x00 0x00

75

return 1;

76

case FORMAT_VERSION_2_MAGIC_NUMBER:

77

// Format 2 header is as follows:

78

// Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE

79

// Version number (2 bytes) 0x00 0x02

80

// Options (2 bytes) must be 0x00 0x00

81

// Header size (4 bytes) : integer, big endian

82

return (dict[4] << 8) + dict[5];

83

default:

84

return UNKNOWN_FORMAT;

}

}

inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {

89

switch (detectFormat(dict)) {

90

case 1:

91

return FORMAT_VERSION_1_HEADER_SIZE;

92

case 2:

93

// See the format of the header in the comment in detectFormat() above

94

return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11];

95

default:

96

return std::numeric_limits<unsigned int>::max();

97

}

Jean Chalard

2011-07-20 18:42:32 +0900

[diff] [blame]

98

}

99

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

100

// Reads a group count at *pos and advances *pos past it.
// The count takes one byte when that byte is below 0x80. Otherwise the low 7 bits of the first
// byte are the high part and the following byte is the low part of a 15-bit value.
inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
    const int firstByte = dict[*pos];
    ++(*pos);
    if (firstByte >= 0x80) {
        // Two-byte encoding: drop the marker bit and append the second byte.
        const int secondByte = dict[*pos];
        ++(*pos);
        return ((firstByte & 0x7F) << 8) | secondByte;
    }
    return firstByte;
}

105

106

// Returns the flags byte found at *pos and moves *pos one byte forward.
inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
    const uint8_t flags = dict[*pos];
    *pos += 1;
    return flags;
}

109

110

inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {

111

const int origin = *pos;

112

const int32_t character = dict[origin];

113

if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {

114

if (character == CHARACTER_ARRAY_TERMINATOR) {

115

*pos = origin + 1;

116

return NOT_A_CHARACTER;

117

} else {

118

*pos = origin + 3;

119

const int32_t char_1 = character << 16;

120

const int32_t char_2 = char_1 + (dict[origin + 1] << 8);

121

return char_2 + dict[origin + 2];

}

} else {

*pos = origin + 1;

return character;

}

}

// Returns the byte stored at position pos, widened to int. Nothing is advanced: pos is taken
// by value.
inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
        const int pos) {
    const int frequency = dict[pos];
    return frequency;
}

inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {

135

int currentPos = pos;

136

int32_t character = dict[currentPos++];

137

while (CHARACTER_ARRAY_TERMINATOR != character) {

138

if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {

139

currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;

140

}

141

character = dict[currentPos++];

}

return currentPos;

}

static inline int attributeAddressSize(const uint8_t flags) {

147

static const int ATTRIBUTE_ADDRESS_SHIFT = 4;

148

return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;

149

/* Note: this is a value-dependant optimization of what may probably be

150

more readably written this way:

151

switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {

152

case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;

153

case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;

154

case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;

default: return 0;

}

*/

}

inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {

161

int currentPos = pos;

162

uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);

163

while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {

164

currentPos += attributeAddressSize(flags);

165

flags = getFlagsAndForwardPointer(dict, &currentPos);

166

}

167

currentPos += attributeAddressSize(flags);

return currentPos;

}

static inline int childrenAddressSize(const uint8_t flags) {

172

static const int CHILDREN_ADDRESS_SHIFT = 6;

173

return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;

174

/* See the note in attributeAddressSize. The same applies here */

175

}

176

177

inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {

178

return pos + childrenAddressSize(flags);

179

}

180

181

inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {

182

return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;

183

}

184

185

inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,

186

const int pos) {

Jean Chalard

e0e3396

2011-12-26 20:23:32 +0900

[diff] [blame]

187

// This function skips all attributes: shortcuts and bigrams.

188

int newPos = pos;

189

if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {

190

newPos = skipAttributes(dict, newPos);

Jean Chalard

2011-06-28 20:45:05 +0900

[diff] [blame]

191

}

Jean Chalard

e0e3396

2011-12-26 20:23:32 +0900

[diff] [blame]

192

if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {

193

newPos = skipAttributes(dict, newPos);

194

}

195

return newPos;

Jean Chalard