blob: f64dab24999aa294f15ff2a76181c130f0524d80 [file] [log] [blame]
Bram Moolenaaredf3f972016-08-29 22:49:24 +02001/* vi:set ts=8 sts=4 sw=4 noet:
Bram Moolenaar071d4272004-06-13 20:20:40 +00002 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9
10/*
11 * arabic.c: functions for Arabic language
12 *
Bram Moolenaar071d4272004-06-13 20:20:40 +000013 * Author: Nadim Shaikli & Isam Bayazidi
Bram Moolenaardc4fa192019-03-22 16:33:15 +010014 * Farsi support and restructuring to make adding new letters easier by Ali
15 * Gholami Rudi. Further work by Ameretat Reith.
16 */
17
/*
 * Sorted list of Unicode Arabic characters.  Each entry holds the
 * presentation forms of a letter.
 *
 * Arabic characters are categorized into the following types:
 *
 * Isolated    - iso-8859-6 form
 * Initial     - unicode form-B start
 * Medial      - unicode form-B middle
 * Final       - unicode form-B final
 * Stand-Alone - unicode form-B isolated
 */
30
Bram Moolenaar75464dc2016-07-02 20:27:50 +020031#include "vim.h"
32
33#if defined(FEAT_ARABIC) || defined(PROTO)
34
// Unicode values for Arabic characters.
#define a_HAMZA 0x0621
#define a_ALEF_MADDA 0x0622
#define a_ALEF_HAMZA_ABOVE 0x0623
#define a_WAW_HAMZA 0x0624
#define a_ALEF_HAMZA_BELOW 0x0625
#define a_YEH_HAMZA 0x0626
#define a_ALEF 0x0627
#define a_BEH 0x0628
#define a_TEH_MARBUTA 0x0629
#define a_TEH 0x062a
#define a_THEH 0x062b
#define a_JEEM 0x062c
#define a_HAH 0x062d
#define a_KHAH 0x062e
#define a_DAL 0x062f
#define a_THAL 0x0630
#define a_REH 0x0631
#define a_ZAIN 0x0632
#define a_SEEN 0x0633
#define a_SHEEN 0x0634
#define a_SAD 0x0635
#define a_DAD 0x0636
#define a_TAH 0x0637
#define a_ZAH 0x0638
#define a_AIN 0x0639
#define a_GHAIN 0x063a
#define a_TATWEEL 0x0640
#define a_FEH 0x0641
#define a_QAF 0x0642
#define a_KAF 0x0643
#define a_LAM 0x0644
#define a_MEEM 0x0645
#define a_NOON 0x0646
#define a_HEH 0x0647
#define a_WAW 0x0648
#define a_ALEF_MAKSURA 0x0649
#define a_YEH 0x064a
#define a_FATHATAN 0x064b
#define a_DAMMATAN 0x064c
#define a_KASRATAN 0x064d
#define a_FATHA 0x064e
#define a_DAMMA 0x064f
#define a_KASRA 0x0650
#define a_SHADDA 0x0651
#define a_SUKUN 0x0652
#define a_MADDA_ABOVE 0x0653
#define a_HAMZA_ABOVE 0x0654
#define a_HAMZA_BELOW 0x0655

// Letters located after the basic range above.
#define a_PEH 0x067e
#define a_TCHEH 0x0686
#define a_JEH 0x0698
#define a_FKAF 0x06a9
#define a_GAF 0x06af
#define a_FYEH 0x06cc

// Lam-Alef ligatures: "a_s_" is the isolated form returned by
// chg_c_laa2i(), "a_f_" is the final form returned by chg_c_laa2f().
#define a_s_LAM_ALEF_MADDA_ABOVE 0xfef5
#define a_f_LAM_ALEF_MADDA_ABOVE 0xfef6
#define a_s_LAM_ALEF_HAMZA_ABOVE 0xfef7
#define a_f_LAM_ALEF_HAMZA_ABOVE 0xfef8
#define a_s_LAM_ALEF_HAMZA_BELOW 0xfef9
#define a_f_LAM_ALEF_HAMZA_BELOW 0xfefa
#define a_s_LAM_ALEF 0xfefb
#define a_f_LAM_ALEF 0xfefc

/*
 * Shaping table: one entry per character, holding the code point of each
 * presentation form.  A zero form means "no such form"; arabic_shape() then
 * keeps the original character.
 *
 * The entries MUST stay sorted by "c" in ascending order, find_achar() does a
 * binary search on this array.
 */
static struct achar {
    unsigned	c;		// character as stored in the text
    unsigned	isolated;	// form when not joined on either side
    unsigned	initial;	// form when joined to the next character only
    unsigned	medial;		// form when joined on both sides
    unsigned	final;		// form when joined to the previous character only
} achars[] = {
    {a_HAMZA,            0xfe80, 0,      0,      0},
    {a_ALEF_MADDA,       0xfe81, 0,      0,      0xfe82},
    {a_ALEF_HAMZA_ABOVE, 0xfe83, 0,      0,      0xfe84},
    {a_WAW_HAMZA,        0xfe85, 0,      0,      0xfe86},
    {a_ALEF_HAMZA_BELOW, 0xfe87, 0,      0,      0xfe88},
    {a_YEH_HAMZA,        0xfe89, 0xfe8b, 0xfe8c, 0xfe8a},
    {a_ALEF,             0xfe8d, 0,      0,      0xfe8e},
    {a_BEH,              0xfe8f, 0xfe91, 0xfe92, 0xfe90},
    {a_TEH_MARBUTA,      0xfe93, 0,      0,      0xfe94},
    {a_TEH,              0xfe95, 0xfe97, 0xfe98, 0xfe96},
    {a_THEH,             0xfe99, 0xfe9b, 0xfe9c, 0xfe9a},
    {a_JEEM,             0xfe9d, 0xfe9f, 0xfea0, 0xfe9e},
    {a_HAH,              0xfea1, 0xfea3, 0xfea4, 0xfea2},
    {a_KHAH,             0xfea5, 0xfea7, 0xfea8, 0xfea6},
    {a_DAL,              0xfea9, 0,      0,      0xfeaa},
    {a_THAL,             0xfeab, 0,      0,      0xfeac},
    {a_REH,              0xfead, 0,      0,      0xfeae},
    {a_ZAIN,             0xfeaf, 0,      0,      0xfeb0},
    {a_SEEN,             0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2},
    {a_SHEEN,            0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6},
    {a_SAD,              0xfeb9, 0xfebb, 0xfebc, 0xfeba},
    {a_DAD,              0xfebd, 0xfebf, 0xfec0, 0xfebe},
    {a_TAH,              0xfec1, 0xfec3, 0xfec4, 0xfec2},
    {a_ZAH,              0xfec5, 0xfec7, 0xfec8, 0xfec6},
    {a_AIN,              0xfec9, 0xfecb, 0xfecc, 0xfeca},
    {a_GHAIN,            0xfecd, 0xfecf, 0xfed0, 0xfece},
    // Tatweel has no presentation forms of its own: it stays U+0640 whenever
    // it joins, and has no separate isolated form.
    {a_TATWEEL,          0,      0x0640, 0x0640, 0x0640},
    {a_FEH,              0xfed1, 0xfed3, 0xfed4, 0xfed2},
    {a_QAF,              0xfed5, 0xfed7, 0xfed8, 0xfed6},
    {a_KAF,              0xfed9, 0xfedb, 0xfedc, 0xfeda},
    {a_LAM,              0xfedd, 0xfedf, 0xfee0, 0xfede},
    {a_MEEM,             0xfee1, 0xfee3, 0xfee4, 0xfee2},
    {a_NOON,             0xfee5, 0xfee7, 0xfee8, 0xfee6},
    {a_HEH,              0xfee9, 0xfeeb, 0xfeec, 0xfeea},
    {a_WAW,              0xfeed, 0,      0,      0xfeee},
    {a_ALEF_MAKSURA,     0xfeef, 0,      0,      0xfef0},
    {a_YEH,              0xfef1, 0xfef3, 0xfef4, 0xfef2},
    {a_FATHATAN,         0xfe70, 0,      0,      0},
    {a_DAMMATAN,         0xfe72, 0,      0,      0},
    {a_KASRATAN,         0xfe74, 0,      0,      0},
    {a_FATHA,            0xfe76, 0,      0xfe77, 0},
    {a_DAMMA,            0xfe78, 0,      0xfe79, 0},
    {a_KASRA,            0xfe7a, 0,      0xfe7b, 0},
    // U+FE7C is the isolated Shadda, U+FE7D the medial one; like the other
    // marks here, the medial form is the code point right after the isolated.
    {a_SHADDA,           0xfe7c, 0,      0xfe7d, 0},
    {a_SUKUN,            0xfe7e, 0,      0xfe7f, 0},
    {a_MADDA_ABOVE,      0,      0,      0,      0},
    {a_HAMZA_ABOVE,      0,      0,      0,      0},
    {a_HAMZA_BELOW,      0,      0,      0,      0},
    {a_PEH,              0xfb56, 0xfb58, 0xfb59, 0xfb57},
    {a_TCHEH,            0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b},
    {a_JEH,              0xfb8a, 0,      0,      0xfb8b},
    {a_FKAF,             0xfb8e, 0xfb90, 0xfb91, 0xfb8f},
    {a_GAF,              0xfb92, 0xfb94, 0xfb95, 0xfb93},
    {a_FYEH,             0xfbfc, 0xfbfe, 0xfbff, 0xfbfd},
};

// Not in the table above, but accepted by A_is_ok().
#define a_BYTE_ORDER_MARK 0xfeff
165
Bram Moolenaar071d4272004-06-13 20:20:40 +0000166/*
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100167 * Find the struct achar pointer to the given Arabic char.
168 * Returns NULL if not found.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000169 */
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100170 static struct achar *
171find_achar(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000172{
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100173 int h, m, l;
174
175 // using binary search to find c
K.Takataeeec2542021-06-02 13:28:16 +0200176 h = ARRAY_LENGTH(achars);
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100177 l = 0;
178 while (l < h)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000179 {
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100180 m = (h + l) / 2;
181 if (achars[m].c == (unsigned)c)
182 return &achars[m];
183 if ((unsigned)c < achars[m].c)
184 h = m;
185 else
186 l = m + 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000187 }
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100188 return NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000189}
190
Bram Moolenaar071d4272004-06-13 20:20:40 +0000191/*
192 * Change shape - from Combination (2 char) to an Isolated
193 */
194 static int
Bram Moolenaar7454a062016-01-30 15:14:10 +0100195chg_c_laa2i(int hid_c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000196{
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100197 int tempc;
198
Bram Moolenaar071d4272004-06-13 20:20:40 +0000199 switch (hid_c)
200 {
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100201 case a_ALEF_MADDA:
202 tempc = a_s_LAM_ALEF_MADDA_ABOVE;
203 break;
204 case a_ALEF_HAMZA_ABOVE:
205 tempc = a_s_LAM_ALEF_HAMZA_ABOVE;
206 break;
207 case a_ALEF_HAMZA_BELOW:
208 tempc = a_s_LAM_ALEF_HAMZA_BELOW;
209 break;
210 case a_ALEF:
211 tempc = a_s_LAM_ALEF;
212 break;
213 default:
214 tempc = 0;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000215 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000216
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100217 return tempc;
218}
Bram Moolenaar071d4272004-06-13 20:20:40 +0000219
220/*
221 * Change shape - from Combination-Isolated to Final
222 */
223 static int
Bram Moolenaar7454a062016-01-30 15:14:10 +0100224chg_c_laa2f(int hid_c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000225{
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100226 int tempc;
227
Bram Moolenaar071d4272004-06-13 20:20:40 +0000228 switch (hid_c)
229 {
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100230 case a_ALEF_MADDA:
231 tempc = a_f_LAM_ALEF_MADDA_ABOVE;
232 break;
233 case a_ALEF_HAMZA_ABOVE:
234 tempc = a_f_LAM_ALEF_HAMZA_ABOVE;
235 break;
236 case a_ALEF_HAMZA_BELOW:
237 tempc = a_f_LAM_ALEF_HAMZA_BELOW;
238 break;
239 case a_ALEF:
240 tempc = a_f_LAM_ALEF;
241 break;
242 default:
243 tempc = 0;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000244 }
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100245
246 return tempc;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000247}
248
249/*
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100250 * Returns whether it is possible to join the given letters
Bram Moolenaar071d4272004-06-13 20:20:40 +0000251 */
252 static int
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100253can_join(int c1, int c2)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000254{
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100255 struct achar *a1 = find_achar(c1);
256 struct achar *a2 = find_achar(c2);
257
258 return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial);
259}
260
261/*
262 * Check whether we are dealing with a character that could be regarded as an
263 * Arabic combining character, need to check the character before this.
264 */
265 int
266arabic_maycombine(int two)
267{
268 if (p_arshape && !p_tbidi)
269 return (two == a_ALEF_MADDA
270 || two == a_ALEF_HAMZA_ABOVE
271 || two == a_ALEF_HAMZA_BELOW
272 || two == a_ALEF);
273 return FALSE;
274}
275
276/*
277 * Check whether we are dealing with Arabic combining characters.
278 * Note: these are NOT really composing characters!
279 */
280 int
281arabic_combine(
282 int one, // first character
283 int two) // character just after "one"
284{
285 if (one == a_LAM)
286 return arabic_maycombine(two);
287 return FALSE;
288}
289
/*
 * A_is_iso returns true if 'c' has an entry in the "achars" shaping table.
 */
    static int
A_is_iso(int c)
{
    struct achar *entry = find_achar(c);

    return entry != NULL;
}
299
300/*
301 * A_is_ok returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
302 */
303 static int
304A_is_ok(int c)
305{
306 return (A_is_iso(c) || c == a_BYTE_ORDER_MARK);
307}
308
309/*
310 * A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
311 * with some exceptions/exclusions
312 */
313 static int
314A_is_valid(int c)
315{
316 return (A_is_ok(c) && c != a_HAMZA);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000317}
318
319/*
320 * Do Arabic shaping on character "c". Returns the shaped character.
321 * out: "ccp" points to the first byte of the character to be shaped.
322 * in/out: "c1p" points to the first composing char for "c".
323 * in: "prev_c" is the previous character (not shaped)
324 * in: "prev_c1" is the first composing char for the previous char
325 * (not shaped)
326 * in: "next_c" is the next character (not shaped).
327 */
328 int
Bram Moolenaar7454a062016-01-30 15:14:10 +0100329arabic_shape(
330 int c,
331 int *ccp,
332 int *c1p,
333 int prev_c,
334 int prev_c1,
335 int next_c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000336{
337 int curr_c;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000338 int curr_laa;
339 int prev_laa;
340
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100341 // Deal only with Arabic characters, pass back all others
Bram Moolenaar071d4272004-06-13 20:20:40 +0000342 if (!A_is_ok(c))
343 return c;
344
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100345 curr_laa = arabic_combine(c, *c1p);
346 prev_laa = arabic_combine(prev_c, prev_c1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000347
348 if (curr_laa)
349 {
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100350 if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa)
351 curr_c = chg_c_laa2f(*c1p);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000352 else
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100353 curr_c = chg_c_laa2i(*c1p);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000354
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100355 // Remove the composing character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000356 *c1p = 0;
357 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000358 else
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100359 {
360 struct achar *curr_a = find_achar(c);
361 int backward_combine = !prev_laa && can_join(prev_c, c);
362 int forward_combine = can_join(c, next_c);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000363
Bram Moolenaarbd9bf262019-03-30 18:25:39 +0100364 if (backward_combine)
365 {
366 if (forward_combine)
367 curr_c = curr_a->medial;
368 else
369 curr_c = curr_a->final;
370 }
371 else
372 {
373 if (forward_combine)
374 curr_c = curr_a->initial;
375 else
376 curr_c = curr_a->isolated;
377 }
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100378 }
379
Bram Moolenaarbd9bf262019-03-30 18:25:39 +0100380 // Character missing from the table means using original character.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000381 if (curr_c == NUL)
382 curr_c = c;
383
384 if (curr_c != c && ccp != NULL)
385 {
Bram Moolenaar9a920d82012-06-01 15:21:02 +0200386 char_u buf[MB_MAXBYTES + 1];
Bram Moolenaar071d4272004-06-13 20:20:40 +0000387
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100388 // Update the first byte of the character.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000389 (*mb_char2bytes)(curr_c, buf);
390 *ccp = buf[0];
391 }
392
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100393 // Return the shaped character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000394 return curr_c;
395}
Bram Moolenaardc4fa192019-03-22 16:33:15 +0100396#endif // FEAT_ARABIC