" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2020 Aug 24

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Every line of the current buffer is split on ';' (white space around the
" separator is dropped, empty fields are kept).  Each line must give exactly
" 15 fields; at the first line that does not, an error is reported and
" parsing stops, leaving s:dataprops with the lines parsed so far.
func! ParseDataToProps()
  let s:dataprops = []
  let lnum = 1
  while lnum <= line('$')
    let l = split(getline(lnum), '\s*;\s*', 1)
    if len(l) != 15
      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, l)
    let lnum += 1
  endwhile
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Lines of the current buffer that start with '#' or contain only white space
" are skipped.  Every other line is split on ';' and must give exactly 4
" fields; at the first line that does not, an error is reported and parsing
" stops.
func! ParseFoldProps()
  let s:foldprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 4
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
        return
      endif
      call add(s:foldprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Lines of the current buffer that start with '#' or contain only white space
" are skipped.  Every other line is split on ';' and must give exactly 2
" fields; at the first line that does not, an error is reported and parsing
" stops.
func! ParseWidthProps()
  let s:widthprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 2
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
        return
      endif
      call add(s:widthprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
" "name" is appended to "to" to form both the buffer name and the C array
" name.  "index" is the s:dataprops field that holds the mapped code point;
" entries where that field is empty are ignored.
" Successive entries that share the same offset and lie a constant distance
" apart are merged into a single line, see Range().
func! BuildCaseTable(name, index)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] != ''
      let n = str2nr(p[0], 16)
      let nl = str2nr(p[a:index], 16)
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, "static convertStruct to" . a:name . "[] =")
  call setline(2, "{")
  call append('$', ranges)
  " Without any range the last line is the "{", which must stay intact.
  if !empty(ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.
" Only entries whose second field is 'C' or 'S' are used; the first field is
" the code point and the third field the code point it maps to.
" Successive entries that share the same offset and lie a constant distance
" apart are merged into a single line, see Range().
func! BuildFoldTable()
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    " Match case explicitly, so that 'ignorecase' has no influence.
    if p[1] ==# 'C' || p[1] ==# 'S'
      let n = str2nr(p[0], 16)
      let nl = str2nr(p[2], 16)
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, "static convertStruct foldCase[] =")
  call setline(2, "{")
  call append('$', ranges)
  " Without any range the last line is the "{", which must stay intact.
  if !empty(ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

" Append one table line to the list "ranges".
" The line has the form "\t{0x<start>,0x<end>,<step>,<add>},", where a step
" of zero (a range that never got a second character) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.
" Collects every entry whose third field is 'Mn', 'Mc' or 'Me', merges
" directly adjacent code points into {first, last} ranges and writes them as
" a C array in a new buffer named "combining".
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    " Match case explicitly, so that 'ignorecase' has no influence.
    if p[2] ==# 'Mn' || p[2] ==# 'Mc' || p[2] ==# 'Me'
      let n = str2nr(p[0], 16)
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, " static struct interval combining[] =")
  call setline(2, " {")
  call append('$', ranges)
  " Without any range the last line is the "{", which must stay intact.
  if !empty(ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
" "pattern" is matched (case sensitive) against the first character of the
" width field; "tableName" is used for the buffer name and the C array name.
" Every range produced is also appended as [start, end] to s:ambitable when
" "pattern" is 'A' and to s:doubletable otherwise; the caller must have
" created that list.
" A single character is left out when the data table marks it as a composing
" character (Mn, Mc or Me); characters given as a range are always used.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  " NOTE(review): white space inside these literals may have been collapsed
  " when this file was extracted - confirm against mbyte.c.
  let is_ambi = a:pattern ==# 'A'
  if is_ambi
    let spc = ' '
    let pairs = s:ambitable
  else
    let spc = "\t"
    let pairs = s:doubletable
  endif
  for p in s:widthprops
    if p[1][0] !~# a:pattern
      continue
    endif
    if p[0] =~ '\.\.'
      " It is a range. we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Skip the entry, rng[1] may not even exist.
        continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif
    " Find this char in the data table.  Stop at the end of the table, the
    " width file may go beyond the last entry of the data file.
    let dn = -1
    while dataidx < datalen
      let dn = str2nr(s:dataprops[dataidx][0], 16)
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if (dataidx >= datalen || dn != n) && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif
    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    " When the end of the data table was reached the category is unknown and
    " the char is used.
    if n_last <= n && dataidx < datalen
      let gc = s:dataprops[dataidx][2]
      if gc ==# 'Mn' || gc ==# 'Mc' || gc ==# 'Me'
        continue
      endif
    endif
    if start < 0 || end + 1 != n
      if start >= 0
        " produce previous range
        call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
        call add(pairs, [start, end])
      endif
      let start = n
    endif
    " Otherwise: continue with same range.
    let end = n_last
  endfor
  if start >= 0
    call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
    call add(pairs, [start, end])
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  if is_ambi
    call setline(1, "static struct interval " . a:tableName . "[] =")
    call setline(2, "{")
  else
    call setline(1, " static struct interval " . a:tableName . "[] =")
    call setline(2, " {")
  endif
  call append('$', ranges)
  " Without any range the last line is the "{", which must stay intact.
  if !empty(ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  if is_ambi
    call setline(line('$') + 1, "};")
  else
    call setline(line('$') + 1, " };")
  endif
  wincmd p
endfunc


" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
" and put them in dictionary "chardict"
" The numbers are hexadecimal.  Every character of a "first..last" range gets
" its own entry; the key is the character number, the value is always 1.
func AddLinesToCharDict(lines, chardict)
  for line in a:lines
    let tokens = split(line, '\.\.')
    let first = str2nr(tokens[0], 16)
    if len(tokens) == 1
      let last = first
    else
      let last = str2nr(tokens[1], 16)
    endif
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc

" Check AddLinesToCharDict() with single characters and an overlapping range.
" Returns 1 and reports v:errors when a check failed, 0 otherwise.
func Test_AddLinesToCharDict()
  let dict = {}
  call AddLinesToCharDict([
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ], dict)
  call assert_equal({0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }, dict)
  if v:errors != []
    echoerr 'AddLinesToCharDict' v:errors
    return 1
  endif
  return 0
endfunc


" Turn the keys of dictionary "chardict" into a sorted list of [low, high]
" pairs, where each pair covers a run of consecutive character numbers.
" Returns an empty list when "chardict" is empty.
func CharDictToPairList(chardict)
  let result = []
  let keys = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  if empty(keys)
    " Nothing to pair up, keys[0] does not exist.
    return result
  endif
  let low = keys[0]
  let high = keys[0]
  for key in keys
    if key > high + 1
      " There is a gap: finish the current run and start a new one.
      call add(result, [low, high])
      let low = key
      let high = key
    else
      let high = key
    endif
  endfor
  call add(result, [low, high])
  return result
endfunc

" Check CharDictToPairList() with runs of one, two and three characters.
" Returns 1 and reports v:errors when a check failed, 0 otherwise.
func Test_CharDictToPairList()
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  call assert_equal([
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ], CharDictToPairList(dict))
  if v:errors != []
    echoerr 'CharDictToPairList' v:errors
    return 1
  endif
  return 0
endfunc


" Build the emoji width table in a new buffer.
" Reads the current buffer and creates two new buffers:
" - "emoji_all" with the ranges of all characters that have the Emoji
"   property;
" - "emoji_wide" with the ranges of characters that have the
"   Emoji_Presentation or Emoji_Modifier_Base property, minus the characters
"   already covered by s:ambitable and s:doubletable.
" Only lines that start with a digit 1-9 are used.
func BuildEmojiTable()
  " First make the table for all emojis.
  let pattern = '; Emoji\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  let pairlist = CharDictToPairList(chardict)
  let allranges = map(pairlist, 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_all'
  call setline(1, "static struct interval emoji_all[] =")
  call setline(2, "{")
  call append('$', allranges)
  " Without any range the last line is the "{", which must stay intact.
  if !empty(allranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, "};")
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for ambi in s:ambitable
    for nr in range(ambi[0], ambi[1])
      if has_key(chardict, nr)
	call remove(chardict, nr)
      endif
    endfor
  endfor

  for wide in s:doubletable
    for nr in range(wide[0], wide[1])
      if has_key(chardict, nr)
	call remove(chardict, nr)
      endif
    endfor
  endfor

  let pairlist = CharDictToPairList(chardict)
  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_wide'
  call setline(1, " static struct interval emoji_wide[] =")
  call setline(2, " {")
  call append('$', wide_ranges)
  " Without any range the last line is the "{", which must stay intact.
  if !empty(wide_ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" First test a few things
" Stop here when one of the self tests failed.
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif


" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
" BuildWidthTable() appends to the list, it must exist before the call.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
" BuildWidthTable() appends to the list, it must exist before the call.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit https://unicode.org/Public/emoji/12.1/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables
call BuildEmojiTable()