blob: fc2ed98949dd1042647a0cf5cec2a4f07b95f038 [file] [log] [blame]
Bjorn Bringertfb903a42013-03-18 21:17:26 +00001#include "pseudolocalize.h"
2
3using namespace std;
4
Anton Krumina2ef5c02014-03-12 14:46:44 -07005// String basis to generate expansion
6static const String16 k_expansion_string = String16("one two three "
7 "four five six seven eight nine ten eleven twelve thirteen "
8 "fourteen fiveteen sixteen seventeen nineteen twenty");
9
10// Special unicode characters to override directionality of the words
11static const String16 k_rlm = String16("\xe2\x80\x8f");
12static const String16 k_rlo = String16("\xE2\x80\xae");
13static const String16 k_pdf = String16("\xE2\x80\xac");
14
15// Placeholder marks
16static const String16 k_placeholder_open = String16("\xc2\xbb");
17static const String16 k_placeholder_close = String16("\xc2\xab");
18
Igor Viarheichykcbb1e672015-05-14 18:47:00 -070019static const char16_t k_arg_start = '{';
20static const char16_t k_arg_end = '}';
21
22Pseudolocalizer::Pseudolocalizer(PseudolocalizationMethod m)
23 : mImpl(nullptr), mLastDepth(0) {
24 setMethod(m);
25}
26
27void Pseudolocalizer::setMethod(PseudolocalizationMethod m) {
28 if (mImpl) {
29 delete mImpl;
30 }
31 if (m == PSEUDO_ACCENTED) {
32 mImpl = new PseudoMethodAccent();
33 } else if (m == PSEUDO_BIDI) {
34 mImpl = new PseudoMethodBidi();
35 } else {
36 mImpl = new PseudoMethodNone();
37 }
38}
39
40String16 Pseudolocalizer::text(const String16& text) {
41 String16 out;
42 size_t depth = mLastDepth;
43 size_t lastpos, pos;
44 const size_t length= text.size();
Tomasz Wasilczykade06312023-08-10 23:54:44 +000045 const char16_t* str = text.c_str();
Igor Viarheichykcbb1e672015-05-14 18:47:00 -070046 bool escaped = false;
47 for (lastpos = pos = 0; pos < length; pos++) {
48 char16_t c = str[pos];
49 if (escaped) {
50 escaped = false;
51 continue;
52 }
53 if (c == '\'') {
54 escaped = true;
55 continue;
56 }
57
58 if (c == k_arg_start) {
59 depth++;
60 } else if (c == k_arg_end && depth) {
61 depth--;
62 }
63
64 if (mLastDepth != depth || pos == length - 1) {
65 bool pseudo = ((mLastDepth % 2) == 0);
66 size_t nextpos = pos;
67 if (!pseudo || depth == mLastDepth) {
68 nextpos++;
69 }
70 size_t size = nextpos - lastpos;
71 if (size) {
72 String16 chunk = String16(text, size, lastpos);
73 if (pseudo) {
74 chunk = mImpl->text(chunk);
75 } else if (str[lastpos] == k_arg_start &&
76 str[nextpos - 1] == k_arg_end) {
77 chunk = mImpl->placeholder(chunk);
78 }
79 out.append(chunk);
80 }
81 if (pseudo && depth < mLastDepth) { // End of message
82 out.append(mImpl->end());
83 } else if (!pseudo && depth > mLastDepth) { // Start of message
84 out.append(mImpl->start());
85 }
86 lastpos = nextpos;
87 mLastDepth = depth;
88 }
89 }
90 return out;
91}
92
/**
 * Returns the UTF-8 encoded look-alike used to pseudolocalize `c`, or a null
 * pointer when `c` has no replacement and must be copied unchanged.
 *
 * Replacements exist for the ASCII letters (except 'F') and for '!', '?'
 * and '$'.
 */
static const char*
pseudolocalize_char(const char16_t c)
{
    // Indexed by (c - 'a').
    static const char* const kLower[26] = {
        "\xc3\xa5",     // a
        "\xc9\x93",     // b
        "\xc3\xa7",     // c
        "\xc3\xb0",     // d
        "\xc3\xa9",     // e
        "\xc6\x92",     // f
        "\xc4\x9d",     // g
        "\xc4\xa5",     // h
        "\xc3\xae",     // i
        "\xc4\xb5",     // j
        "\xc4\xb7",     // k
        "\xc4\xbc",     // l
        "\xe1\xb8\xbf", // m
        "\xc3\xb1",     // n
        "\xc3\xb6",     // o
        "\xc3\xbe",     // p
        "\x51",         // q
        "\xc5\x95",     // r
        "\xc5\xa1",     // s
        "\xc5\xa3",     // t
        "\xc3\xbb",     // u
        "\x56",         // v
        "\xc5\xb5",     // w
        "\xd1\x85",     // x
        "\xc3\xbd",     // y
        "\xc5\xbe",     // z
    };
    // Indexed by (c - 'A'); 'F' has no replacement.
    static const char* const kUpper[26] = {
        "\xc3\x85",     // A
        "\xce\xb2",     // B
        "\xc3\x87",     // C
        "\xc3\x90",     // D
        "\xc3\x89",     // E
        nullptr,        // F
        "\xc4\x9c",     // G
        "\xc4\xa4",     // H
        "\xc3\x8e",     // I
        "\xc4\xb4",     // J
        "\xc4\xb6",     // K
        "\xc4\xbb",     // L
        "\xe1\xb8\xbe", // M
        "\xc3\x91",     // N
        "\xc3\x96",     // O
        "\xc3\x9e",     // P
        "\x71",         // Q
        "\xc5\x94",     // R
        "\xc5\xa0",     // S
        "\xc5\xa2",     // T
        "\xc3\x9b",     // U
        "\xce\xbd",     // V
        "\xc5\xb4",     // W
        "\xc3\x97",     // X
        "\xc3\x9d",     // Y
        "\xc5\xbd",     // Z
    };

    if (c >= 'a' && c <= 'z') {
        return kLower[c - 'a'];
    }
    if (c >= 'A' && c <= 'Z') {
        return kUpper[c - 'A'];
    }
    if (c == '!') {
        return "\xc2\xa1";
    }
    if (c == '?') {
        return "\xc2\xbf";
    }
    if (c == '$') {
        return "\xe2\x82\xac";
    }
    return nullptr;
}
154
/**
 * Tells whether `c` is one of the characters that ends a "%..." format
 * placeholder as scanned by PseudoMethodAccent::text().
 */
static bool is_possible_normal_placeholder_end(const char16_t c) {
    static const char16_t kTerminators[] = {
        's', 'S', 'c', 'C', 'd', 'o', 'x', 'X', 'f', 'e', 'E',
        'g', 'G', 'a', 'A', 'b', 'B', 'h', 'H', '%', 'n',
    };
    for (const char16_t candidate : kTerminators) {
        if (candidate == c) {
            return true;
        }
    }
    return false;
}
181
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700182static String16 pseudo_generate_expansion(const unsigned int length) {
Anton Krumina2ef5c02014-03-12 14:46:44 -0700183 String16 result = k_expansion_string;
Tomasz Wasilczykade06312023-08-10 23:54:44 +0000184 const char16_t* s = result.c_str();
Anton Krumina2ef5c02014-03-12 14:46:44 -0700185 if (result.size() < length) {
186 result += String16(" ");
187 result += pseudo_generate_expansion(length - result.size());
188 } else {
189 int ext = 0;
190 // Should contain only whole words, so looking for a space
191 for (unsigned int i = length + 1; i < result.size(); ++i) {
192 ++ext;
193 if (s[i] == ' ') {
194 break;
195 }
196 }
Elliott Hughes05233282021-05-11 16:24:35 -0700197 // Just keep the first length + ext characters
198 result = String16(result, length + ext);
Anton Krumina2ef5c02014-03-12 14:46:44 -0700199 }
200 return result;
201}
202
/** Tells whether `c` is a blank, a tab or a line feed. */
static bool is_space(const char16_t c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
            return true;
        default:
            return false;
    }
}
206
207String16 PseudoMethodAccent::start() {
208 String16 result;
209 if (mDepth == 0) {
210 result = String16(String8("["));
211 }
212 mWordCount = mLength = 0;
213 mDepth++;
214 return result;
215}
216
217String16 PseudoMethodAccent::end() {
218 String16 result;
219 if (mLength) {
220 result.append(String16(String8(" ")));
221 result.append(pseudo_generate_expansion(
222 mWordCount > 3 ? mLength : mLength / 2));
223 }
224 mWordCount = mLength = 0;
225 mDepth--;
226 if (mDepth == 0) {
227 result.append(String16(String8("]")));
228 }
229 return result;
230}
231
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000232/**
233 * Converts characters so they look like they've been localized.
234 *
235 * Note: This leaves escape sequences untouched so they can later be
236 * processed by ResTable::collectString in the normal way.
237 */
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700238String16 PseudoMethodAccent::text(const String16& source)
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000239{
Tomasz Wasilczykade06312023-08-10 23:54:44 +0000240 const char16_t* s = source.c_str();
Anton Krumina2ef5c02014-03-12 14:46:44 -0700241 String16 result;
242 const size_t I = source.size();
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700243 bool lastspace = true;
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000244 for (size_t i=0; i<I; i++) {
Anton Krumina2ef5c02014-03-12 14:46:44 -0700245 char16_t c = s[i];
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000246 if (c == '\\') {
Anton Krumina2ef5c02014-03-12 14:46:44 -0700247 // Escape syntax, no need to pseudolocalize
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000248 if (i<I-1) {
Anton Krumina2ef5c02014-03-12 14:46:44 -0700249 result += String16("\\");
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000250 i++;
251 c = s[i];
252 switch (c) {
253 case 'u':
254 // this one takes up 5 chars
Anton Krumina2ef5c02014-03-12 14:46:44 -0700255 result += String16(s+i, 5);
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000256 i += 4;
257 break;
258 case 't':
259 case 'n':
260 case '#':
261 case '@':
262 case '?':
263 case '"':
264 case '\'':
265 case '\\':
266 default:
Anton Krumina2ef5c02014-03-12 14:46:44 -0700267 result.append(&c, 1);
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000268 break;
269 }
270 } else {
Anton Krumina2ef5c02014-03-12 14:46:44 -0700271 result.append(&c, 1);
272 }
273 } else if (c == '%') {
274 // Placeholder syntax, no need to pseudolocalize
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700275 String16 chunk;
Anton Krumina2ef5c02014-03-12 14:46:44 -0700276 bool end = false;
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700277 chunk.append(&c, 1);
Anton Krumina2ef5c02014-03-12 14:46:44 -0700278 while (!end && i < I) {
279 ++i;
280 c = s[i];
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700281 chunk.append(&c, 1);
Anton Krumina2ef5c02014-03-12 14:46:44 -0700282 if (is_possible_normal_placeholder_end(c)) {
283 end = true;
284 } else if (c == 't') {
285 ++i;
286 c = s[i];
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700287 chunk.append(&c, 1);
Anton Krumina2ef5c02014-03-12 14:46:44 -0700288 end = true;
289 }
290 }
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700291 // Treat chunk as a placeholder unless it ends with %.
292 result += ((c == '%') ? chunk : placeholder(chunk));
Anton Krumina2ef5c02014-03-12 14:46:44 -0700293 } else if (c == '<' || c == '&') {
294 // html syntax, no need to pseudolocalize
295 bool tag_closed = false;
296 while (!tag_closed && i < I) {
297 if (c == '&') {
298 String16 escape_text;
299 escape_text.append(&c, 1);
300 bool end = false;
301 size_t htmlCodePos = i;
302 while (!end && htmlCodePos < I) {
303 ++htmlCodePos;
304 c = s[htmlCodePos];
305 escape_text.append(&c, 1);
306 // Valid html code
307 if (c == ';') {
308 end = true;
309 i = htmlCodePos;
310 }
311 // Wrong html code
312 else if (!((c == '#' ||
313 (c >= 'a' && c <= 'z') ||
314 (c >= 'A' && c <= 'Z') ||
315 (c >= '0' && c <= '9')))) {
316 end = true;
317 }
318 }
319 result += escape_text;
320 if (escape_text != String16("&lt;")) {
321 tag_closed = true;
322 }
323 continue;
324 }
325 if (c == '>') {
326 tag_closed = true;
327 result.append(&c, 1);
328 continue;
329 }
330 result.append(&c, 1);
331 i++;
332 c = s[i];
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000333 }
334 } else {
Anton Krumina2ef5c02014-03-12 14:46:44 -0700335 // This is a pure text that should be pseudolocalized
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000336 const char* p = pseudolocalize_char(c);
337 if (p != NULL) {
Anton Krumina2ef5c02014-03-12 14:46:44 -0700338 result += String16(p);
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000339 } else {
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700340 bool space = is_space(c);
341 if (lastspace && !space) {
342 mWordCount++;
343 }
344 lastspace = space;
Anton Krumina2ef5c02014-03-12 14:46:44 -0700345 result.append(&c, 1);
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000346 }
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700347 // Count only pseudolocalizable chars and delimiters
348 mLength++;
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000349 }
350 }
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000351 return result;
352}
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700353String16 PseudoMethodAccent::placeholder(const String16& source) {
354 // Surround a placeholder with brackets
355 return k_placeholder_open + source + k_placeholder_close;
356}
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000357
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700358String16 PseudoMethodBidi::text(const String16& source)
Anton Krumina2ef5c02014-03-12 14:46:44 -0700359{
Tomasz Wasilczykade06312023-08-10 23:54:44 +0000360 const char16_t* s = source.c_str();
Anton Krumina2ef5c02014-03-12 14:46:44 -0700361 String16 result;
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700362 bool lastspace = true;
363 bool space = true;
Igor Viarheichyk4fb65162017-07-06 15:23:51 -0700364 bool escape = false;
365 const char16_t ESCAPE_CHAR = '\\';
Anton Krumina2ef5c02014-03-12 14:46:44 -0700366 for (size_t i=0; i<source.size(); i++) {
367 char16_t c = s[i];
Igor Viarheichyk4fb65162017-07-06 15:23:51 -0700368 if (!escape && c == ESCAPE_CHAR) {
369 escape = true;
370 continue;
371 }
372 space = (!escape && is_space(c)) || (escape && (c == 'n' || c == 't'));
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700373 if (lastspace && !space) {
374 // Word start
375 result += k_rlm + k_rlo;
376 } else if (!lastspace && space) {
377 // Word end
378 result += k_pdf + k_rlm;
Anton Krumina2ef5c02014-03-12 14:46:44 -0700379 }
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700380 lastspace = space;
Igor Viarheichyk4fb65162017-07-06 15:23:51 -0700381 if (escape) {
382 result.append(&ESCAPE_CHAR, 1);
383 escape=false;
384 }
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700385 result.append(&c, 1);
Anton Krumina2ef5c02014-03-12 14:46:44 -0700386 }
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700387 if (!lastspace) {
388 // End of last word
389 result += k_pdf + k_rlm;
390 }
Anton Krumina2ef5c02014-03-12 14:46:44 -0700391 return result;
392}
Bjorn Bringertfb903a42013-03-18 21:17:26 +0000393
Igor Viarheichykcbb1e672015-05-14 18:47:00 -0700394String16 PseudoMethodBidi::placeholder(const String16& source) {
395 // Surround a placeholder with directionality change sequence
396 return k_rlm + k_rlo + source + k_pdf + k_rlm;
397}
398