blob: 5859f34738c6ef81050e882323046607d74cd03c [file] [log] [blame]
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" The URL of emoji-data.txt, near the end of this script, might need to be
" updated.
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2020 Aug 24
Bram Moolenaar3e8cb582010-01-12 19:52:03 +010011
12" Parse lines of UnicodeData.txt. Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  " Works on the current buffer.  Every line is split on ';', dropping the
  " white space around it and keeping empty fields so that field positions
  " stay fixed.  A line that does not give exactly 15 fields stops the
  " parsing with an error; the lines before it remain in s:dataprops.
  let s:dataprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
26
27" Parse lines of CaseFolding.txt. Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  " Works on the current buffer.  Lines starting with '#' and lines holding
  " only white space are skipped.  Every other line is split on ';' and must
  " give exactly 4 fields; the first line that does not stops the parsing
  " with an error, leaving the earlier lines in s:foldprops.
  let s:foldprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
44
45" Parse lines of EastAsianWidth.txt. Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  " Works on the current buffer.  Lines starting with '#' and lines holding
  " only white space are skipped.  Every other line is split on ';' and must
  " give exactly 2 fields; the first line that does not stops the parsing
  " with an error, leaving the earlier lines in s:widthprops.
  let s:widthprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
62
63" Build the toLower or toUpper table in a new buffer.
64" Uses s:dataprops.
func! BuildCaseTable(name, index)
  " a:name   suffix of the table name: the C table and the buffer holding it
  "          are both called "to" . a:name
  " a:index  index of the s:dataprops field that holds the converted
  "          character as a hex number; entries where it is empty are skipped
  "
  " Consecutive entries are merged into one range as long as the distance to
  " the converted character stays the same and the characters are evenly
  " spaced.  Range() formats each range.
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] == ''
      continue
    endif
    let n = str2nr(p[0], 16)
    let diff = str2nr(p[a:index], 16) - n
    if start >= 0 && add == diff && (step == 0 || n - end == step)
      " continue with same range; the step is fixed by its second character
      let step = n - end
    else
      if start >= 0
	" produce previous range
	call Range(ranges, start, end, step, add)
      endif
      let start = n
      let step = 0
      let add = diff
    endif
    let end = n
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " Remove the comma after the last entry.  Only do this when there is an
  " entry, otherwise the "{" line would lose its only character.
  if !empty(ranges)
    let ranges[-1] = ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, ["static convertStruct to" . a:name . "[] =", "{"] + ranges + ["};"])
  wincmd p
endfunc
105
106" Build the foldCase table in a new buffer.
107" Uses s:foldprops.
func! BuildFoldTable()
  " Only entries of s:foldprops with status "C" or "S" are used.  Field 0 is
  " the character and field 2 the folded character, both as hex numbers.
  "
  " Consecutive entries are merged into one range as long as the distance to
  " the folded character stays the same and the characters are evenly spaced.
  " Range() formats each range.
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    if p[1] != 'C' && p[1] != 'S'
      continue
    endif
    let n = str2nr(p[0], 16)
    let diff = str2nr(p[2], 16) - n
    if start >= 0 && add == diff && (step == 0 || n - end == step)
      " continue with same range; the step is fixed by its second character
      let step = n - end
    else
      if start >= 0
	" produce previous range
	call Range(ranges, start, end, step, add)
      endif
      let start = n
      let step = 0
      let add = diff
    endif
    let end = n
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " Remove the comma after the last entry.  Only do this when there is an
  " entry, otherwise the "{" line would lose its only character.
  if !empty(ranges)
    let ranges[-1] = ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, ["static convertStruct foldCase[] =", "{"] + ranges + ["};"])
  wincmd p
endfunc
148
func! Range(ranges, start, end, step, add)
  " Append one "{start,end,step,add}," table line to the list a:ranges.
  " a:start and a:end are written as hex numbers.  A step of zero is written
  " as -1.
  let step = a:step == 0 ? -1 : a:step
  call add(a:ranges, printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, step, a:add))
endfunc
153
154" Build the combining table.
155" Uses s:dataprops.
func! BuildCombiningTable()
  " Collects the characters of s:dataprops whose category (field 2) is Mn, Mc
  " or Me, merges adjacent code points into {first, last} intervals and puts
  " the resulting C table in a new buffer called "combining".
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] != 'Mn' && p[2] != 'Mc' && p[2] != 'Me'
      continue
    endif
    let n = str2nr(p[0], 16)
    if start < 0 || end + 1 != n
      if start >= 0
	" produce previous range
	call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
      endif
      let start = n
    endif
    let end = n
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " Remove the comma after the last entry.  Only do this when there is an
  " entry, otherwise the "{" line would be cut short.
  if !empty(ranges)
    let ranges[-1] = ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, [" static struct interval combining[] =", " {"] + ranges + [" };"])
  wincmd p
endfunc
190
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100191" Build the double width or ambiguous width table in a new buffer.
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100192" Uses s:widthprops and s:dataprops.
func! BuildWidthTable(pattern, tableName)
  " a:pattern    regexp matched against the first character of the width
  "              property; this script passes '[WF]' and 'A'
  " a:tableName  name of the generated C table and of the buffer holding it
  "
  " Besides filling the new buffer, the [first, last] pairs are appended to
  " s:ambitable when a:pattern is 'A' and to s:doubletable otherwise.
  " A single character is left out when its entry in s:dataprops has category
  " Mn, Mc or Me; a range is always used as a whole.
  let start = -1
  let end = -1
  let pairs = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] !~# a:pattern
      continue
    endif

    if p[0] =~ '\.\.'
      " It is a range. we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
	echoerr "Cannot parse range: '" . p[0] . "' in width table"
	" Skip this entry, there is no usable rng[1].
	continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif

    " Find this char in the data table.  Stop at the end of the table, the
    " width file may mention characters above the last entry.
    while dataidx < datalen && str2nr(s:dataprops[dataidx][0], 16) < n
      let dataidx += 1
    endwhile
    let found = dataidx < datalen && str2nr(s:dataprops[dataidx][0], 16) == n
    if n_last == n && !found
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    " When the char was not found the entry that follows it decides, unless
    " the end of the data table was reached.
    if n_last <= n && dataidx < datalen
      let cat = s:dataprops[dataidx][2]
      if cat ==# 'Mn' || cat ==# 'Mc' || cat ==# 'Me'
	continue
      endif
    endif

    if start < 0 || end + 1 != n
      if start >= 0
	" produce previous range
	call add(pairs, [start, end])
      endif
      let start = n
    endif
    let end = n_last
  endfor
  if start >= 0
    call add(pairs, [start, end])
  endif

  " Remember the pairs for BuildEmojiTable().  The script-local table is only
  " accessed when there is something to add.
  if !empty(pairs)
    call extend(a:pattern ==# 'A' ? s:ambitable : s:doubletable, pairs)
  endif

  let ranges = map(copy(pairs), 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  " Remove the comma after the last entry.  Only do this when there is an
  " entry, otherwise the "{" line would be cut short.
  if !empty(ranges)
    let ranges[-1] = ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, [" static struct interval " . a:tableName . "[] =", " {"] + ranges + [" };"])
  wincmd p
endfunc
264
Bram Moolenaar207f0092020-08-30 17:20:20 +0200265
266" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
267" and put them in dictionary "chardict"
func AddLinesToCharDict(lines, chardict)
  " Each item of a:lines starts with one hex number or with two hex numbers
  " separated by "..".  Every code point from the first up to and including
  " the last number gets an entry with value 1 in a:chardict.  Text after
  " the numbers is ignored.
  for text in a:lines
    let parts = split(text, '\.\.')
    let first = str2nr(parts[0], 16)
    let last = len(parts) == 1 ? first : str2nr(parts[1], 16)
    for code in range(first, last)
      let a:chardict[code] = 1
    endfor
  endfor
endfunc
Bram Moolenaarb86f10e2016-03-21 22:09:44 +0100282
func Test_AddLinesToCharDict()
  " Self test for AddLinesToCharDict(): single characters, a range and a
  " character that is also part of that range.
  " Returns 1 and gives an error when v:errors is not empty, otherwise 0.
  let input = [
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ]
  let expected = {0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }
  let dict = {}
  call AddLinesToCharDict(input, dict)
  call assert_equal(expected, dict)
  if empty(v:errors)
    return 0
  endif
  echoerr 'AddLinesToCharDict' v:errors
  return 1
endfunc
300
301
func CharDictToPairList(chardict)
  " Turn the keys of a:chardict, which are code points, into a sorted list of
  " [low, high] pairs where each pair covers a run of adjacent code points.
  " Returns an empty list when a:chardict is empty.
  let result = []
  let codes = sort(map(keys(a:chardict), 'str2nr(v:val)'), 'N')
  if empty(codes)
    " Nothing to do; codes[0] below would fail with E684.
    return result
  endif
  let low = codes[0]
  let high = codes[0]
  for code in codes[1:]
    if code > high + 1
      " There is a gap: finish the current run and start a new one.
      call add(result, [low, high])
      let low = code
    endif
    let high = code
  endfor
  call add(result, [low, high])
  return result
endfunc
319
func Test_CharDictToPairList()
  " Self test for CharDictToPairList(): a run of three, two single
  " characters and a run of two.
  " Returns 1 and gives an error when v:errors is not empty, otherwise 0.
  let expected = [
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ]
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  call assert_equal(expected, CharDictToPairList(dict))
  if empty(v:errors)
    return 0
  endif
  echoerr 'CharDictToPairList' v:errors
  return 1
endfunc
337
338
" Build the emoji width table in a new buffer.
func BuildEmojiTable()
  " Works on the current buffer, which must hold emoji-data.txt.  Creates two
  " buffers with a C table each:
  "   emoji_all   all characters with the "Emoji" property
  "   emoji_wide  characters with the "Emoji_Presentation" or
  "               "Emoji_Modifier_Base" property that are not in s:ambitable
  "               or s:doubletable
  " Must be called after BuildWidthTable() has filled those two tables.
  "
  " Only lines that start with a digit 1-9 are used, thus lines starting with
  " "0" or with a hex letter are skipped.
  " NOTE(review): that leaves out e.g. "0023" - presumably intended, confirm.
  let wanted = 'v:val =~ "^[1-9]" && v:val =~ pattern'
  let firstword = 'matchstr(v:val,"^\\S\\+")'

  " First make the table for all emojis.
  let pattern = '; Emoji\s\+#\s'
  let lines = map(filter(getline(1, '$'), wanted), firstword)

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  let allranges = map(CharDictToPairList(chardict), 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')
  " Remove the comma after the last entry.  Only do this when there is an
  " entry, otherwise the "{" line would lose its only character.
  if !empty(allranges)
    let allranges[-1] = allranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  exe 'file emoji_all'
  call setline(1, ["static struct interval emoji_all[] =", "{"] + allranges + ["};"])
  " Go back to the window with emoji-data.txt, it is read once more below.
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = map(filter(getline(1, '$'), wanted), firstword)

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for [first, last] in s:ambitable + s:doubletable
    for nr in range(first, last)
      if has_key(chardict, nr)
	call remove(chardict, nr)
      endif
    endfor
  endfor

  let wide_ranges = map(CharDictToPairList(chardict), 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  if !empty(wide_ranges)
    let wide_ranges[-1] = wide_ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  exe 'file emoji_wide'
  call setline(1, [" static struct interval emoji_wide[] =", " {"] + wide_ranges + [" };"])
  wincmd p
endfunc
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100399
" Run the self tests first and stop when one of them fails.
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif

" Lists of [first, last] pairs, filled by BuildWidthTable() and used by
" BuildEmojiTable().
let s:doubletable = []
let s:ambitable = []

" Try to avoid hitting E36
let &equalalways = 1

" UnicodeData.txt: toLower, toUpper and composing chars.
" Editing a URL requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
call ParseDataToProps()
call BuildCaseTable('Lower', 13)
call BuildCaseTable('Upper', 12)
call BuildCombiningTable()

" CaseFolding.txt: the foldCase table.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
call ParseFoldProps()
call BuildFoldTable()

" EastAsianWidth.txt: the double width table, then the ambiguous width table.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
call ParseWidthProps()
call BuildWidthTable('[WF]', 'doublewidth')
call BuildWidthTable('A', 'ambiguous')

" emoji-data.txt: the emoji tables. Ver. 1.0 - 6.0
" Must come after the "ambiguous" and "doublewidth" tables
edit https://unicode.org/Public/emoji/12.1/emoji-data.txt
call BuildEmojiTable()