" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2020 Aug 24

" Split every line of the current buffer (UnicodeData.txt) into its
" semicolon-separated fields and collect the field lists in s:dataprops.
" Each line must have exactly 15 fields; parsing stops with an error at the
" first line that does not, leaving the entries collected so far.
func! ParseDataToProps()
  let s:dataprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    " Keep empty fields so that the field positions stay fixed.
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc

" Split the data lines of the current buffer (CaseFolding.txt) into fields
" and collect the field lists in s:foldprops.
" Lines starting with '#' and blank lines are skipped.  Every other line must
" have exactly 4 fields; parsing stops with an error at the first one that
" does not.
func! ParseFoldProps()
  let s:foldprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc

" Split the data lines of the current buffer (EastAsianWidth.txt) into fields
" and collect the field lists in s:widthprops.
" Lines starting with '#' and blank lines are skipped.  Every other line must
" have exactly 2 fields; parsing stops with an error at the first one that
" does not.
func! ParseWidthProps()
  let s:widthprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    let is_data = text !~ '^#' && text !~ '^\s*$'
    if !is_data
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc

" Build the toLower or toUpper table in a new buffer named "to{name}".
" "index" selects the field of each s:dataprops entry that holds the mapped
" code point; entries where that field is empty are ignored.
" Consecutive entries are merged into one range while the offset to the
" mapped code point and the distance between code points stay the same.
" Uses s:dataprops.
func! BuildCaseTable(name, index)
  let first = -1
  let last = -1
  let stride = 0
  let delta = -1
  let ranges = []
  for prop in s:dataprops
    if prop[a:index] == ''
      continue
    endif
    let code = str2nr(prop[0], 16)
    let mapped = str2nr(prop[a:index], 16)
    let extends = first >= 0 && delta == mapped - code
	  \ && (stride == 0 || code - last == stride)
    if extends
      " Same offset and same spacing: grow the current range.
      let stride = code - last
      let last = code
      continue
    endif
    if first >= 0
      " Flush the range collected so far.
      call Range(ranges, first, last, stride, delta)
    endif
    let [first, last, stride, delta] = [code, code, 0, mapped - code]
  endfor
  if first >= 0
    call Range(ranges, first, last, stride, delta)
  endif

  " Assemble the C table; the last character of the final line (the comma of
  " the last entry) is dropped.
  let lines = ["static convertStruct to" . a:name . "[] =", "{"] + ranges
  let lines[-1] = lines[-1][:-2]
  call add(lines, "};")

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, lines)
  wincmd p
endfunc

" Build the foldCase table in a new buffer named "foldCase".
" Only s:foldprops entries whose status field is 'C' or 'S' are used.
" Consecutive entries are merged into one range while the offset to the
" folded code point and the distance between code points stay the same.
" Uses s:foldprops.
func! BuildFoldTable()
  let range_start = -1
  let range_end = -1
  let range_step = 0
  let offset = -1
  let ranges = []
  for entry in s:foldprops
    if entry[1] != 'C' && entry[1] != 'S'
      continue
    endif
    let code = str2nr(entry[0], 16)
    let folded = str2nr(entry[2], 16)
    if range_start >= 0 && offset == folded - code
	  \ && (range_step == 0 || code - range_end == range_step)
      " continue with same range.
      let range_step = code - range_end
      let range_end = code
    else
      if range_start >= 0
	" produce previous range
	call Range(ranges, range_start, range_end, range_step, offset)
      endif
      let range_start = code
      let range_end = code
      let range_step = 0
      let offset = folded - code
    endif
  endfor
  if range_start >= 0
    call Range(ranges, range_start, range_end, range_step, offset)
  endif

  " Assemble the C table; the last character of the final line (the comma of
  " the last entry) is dropped.
  let output = ["static convertStruct foldCase[] =", "{"]
  call extend(output, ranges)
  let output[-1] = output[-1][:-2]
  call add(output, "};")

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, output)
  wincmd p
endfunc

" Append one table entry "{start,end,step,add}," to the list "ranges".
" "start" and "end" are written in hex, "step" and "add" in decimal.
" A "step" of zero is written as -1.
func! Range(ranges, start, end, step, add)
  if a:step == 0
    let stepval = -1
  else
    let stepval = a:step
  endif
  let entry = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, stepval, a:add)
  call add(a:ranges, entry)
endfunc

" Build the table of combining characters in a new buffer named "combining".
" An s:dataprops entry is included when its general category (third field)
" is 'Mn' or 'Me'; adjacent code points are merged into one interval.
" Uses s:dataprops.
func! BuildCombiningTable()
  let first = -1
  let last = -1
  let ranges = []
  for prop in s:dataprops
    " The 'Mc' property was removed, it does take up space.
    if prop[2] != 'Mn' && prop[2] != 'Me'
      continue
    endif
    let code = str2nr(prop[0], 16)
    if first >= 0 && last + 1 == code
      " Directly follows the current interval: extend it.
      let last = code
      continue
    endif
    if first >= 0
      " Flush the interval collected so far.
      call add(ranges, printf("\t{0x%04x, 0x%04x},", first, last))
    endif
    let first = code
    let last = code
  endfor
  if first >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", first, last))
  endif

  " Assemble the C table; the last character of the final line (the comma of
  " the last entry) is dropped.
  let lines = [" static struct interval combining[] =", " {"] + ranges
  let lines[-1] = lines[-1][:-2]
  call add(lines, " };")

  " New buffer to put the result in.
  new
  file combining
  call setline(1, lines)
  wincmd p
endfunc

" Build the double width or ambiguous width table in a new buffer named
" "tableName".
" "pattern" is matched against the first character of the width property of
" each s:widthprops entry.  Besides writing the buffer, every produced
" interval is also appended to s:ambitable (when "pattern" is 'A') or to
" s:doubletable (otherwise); the caller must have created that list.
" Uses s:widthprops and s:dataprops.
func! BuildWidthTable(pattern, tableName)
  let first = -1
  let last = -1
  let ranges = []
  let dataidx = 0
  let is_ambi = a:pattern == 'A'
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  let spc = is_ambi ? ' ' : "\t"

  for prop in s:widthprops
    if prop[1][0] !~ a:pattern
      continue
    endif

    if prop[0] =~ '\.\.'
      " It is a range. we don't check for composing char then.
      let bounds = split(prop[0], '\.\.')
      if len(bounds) != 2
	echoerr "Cannot parse range: '" . prop[0] . "' in width table"
      endif
      let lo = ('0x' . bounds[0]) + 0
      let hi = ('0x' . bounds[1]) + 0
    else
      let lo = ('0x' . prop[0]) + 0
      let hi = lo
    endif

    " Find this char in the data table.  "dataidx" only moves forward.
    " NOTE(review): the scan has no upper bound; it presumably relies on
    " s:dataprops containing a code point >= "lo" - confirm.
    let dn = ('0x' . s:dataprops[dataidx][0]) + 0
    while dn < lo
      let dataidx += 1
      let dn = ('0x' . s:dataprops[dataidx][0]) + 0
    endwhile
    if dn != lo && hi == lo
      echoerr "Cannot find character " . lo . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    let category = s:dataprops[dataidx][2]
    let composing = category == 'Mn' || category == 'Mc' || category == 'Me'
    if hi <= lo && composing
      continue
    endif

    if first < 0 || last + 1 != lo
      if first >= 0
	" produce previous range
	call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, first, last))
	call add(is_ambi ? s:ambitable : s:doubletable, [first, last])
      endif
      let first = lo
    endif
    let last = hi
  endfor
  if first >= 0
    call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, first, last))
    call add(is_ambi ? s:ambitable : s:doubletable, [first, last])
  endif

  " Assemble the C table; the ambiguous table is written without the leading
  " space.  The last character of the final line (the comma of the last
  " entry) is dropped.
  let lead = is_ambi ? "" : " "
  let lines = [lead . "static struct interval " . a:tableName . "[] =", lead . "{"]
  call extend(lines, ranges)
  let lines[-1] = lines[-1][:-2]
  call add(lines, lead . "};")

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, lines)
  wincmd p
endfunc


" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
" and put them in dictionary "chardict".
" Every code point of a line (a single hex number, or all numbers of an
" inclusive "first..last" range) becomes a key of "chardict" with value 1.
" Text following the hex number is ignored.  Empty lines are skipped.
" Defined with "!" like the other functions, so that the script can be
" sourced again without E122.
func! AddLinesToCharDict(lines, chardict)
  for line in a:lines
    let tokens = split(line, '\.\.')
    if empty(tokens)
      " split() returns an empty list for an empty line; nothing to add.
      continue
    endif
    let first = str2nr(tokens[0], 16)
    if len(tokens) == 1
      let last = first
    else
      let last = str2nr(tokens[1], 16)
    endif
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc

" Self-test for AddLinesToCharDict(): single code points, a range and an
" entry overlapping that range must end up as one key each.
" Returns 1 and reports v:errors when v:errors is not empty afterwards,
" 0 otherwise.  v:errors is not cleared here; the caller does that.
" Defined with "!" like the other functions, so that the script can be
" sourced again without E122.
func! Test_AddLinesToCharDict()
  let dict = {}
  call AddLinesToCharDict([
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ], dict)
  call assert_equal({0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }, dict)
  if v:errors != []
    echoerr 'AddLinesToCharDict' v:errors
    return 1
  endif
  return 0
endfunc


" Convert the keys of "chardict" (code points) into a sorted list of
" [low, high] pairs, where each pair covers a run of consecutive code points.
" Returns an empty list for an empty dictionary.
" Defined with "!" like the other functions, so that the script can be
" sourced again without E122.
func! CharDictToPairList(chardict)
  let result = []
  if empty(a:chardict)
    " Without this, keys[0] below fails with E684.
    return result
  endif
  " Dictionary keys are strings: convert to numbers and sort numerically.
  let keys = keys(a:chardict)->map('str2nr(v:val)')->sort('N')
  let low = keys[0]
  let high = keys[0]
  for key in keys
    if key > high + 1
      " Gap found: finish the current run and start a new one.
      call add(result, [low, high])
      let low = key
      let high = key
    else
      let high = key
    endif
  endfor
  call add(result, [low, high])
  return result
endfunc

" Self-test for CharDictToPairList(): runs of consecutive keys must be
" merged, isolated keys must become single-element pairs.
" Returns 1 and reports v:errors when v:errors is not empty afterwards,
" 0 otherwise.  v:errors is not cleared here; the caller does that.
" Defined with "!" like the other functions, so that the script can be
" sourced again without E122.
func! Test_CharDictToPairList()
  let dict = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  call assert_equal([
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ], CharDictToPairList(dict))
  if v:errors != []
    echoerr 'CharDictToPairList' v:errors
    return 1
  endif
  return 0
endfunc


" Build the emoji tables from the current buffer (emoji-data.txt), each in a
" new buffer:
"   emoji_all   all code points with the "Emoji" property
"   emoji_wide  code points with the "Emoji_Presentation" or
"               "Emoji_Modifier_Base" property that are not already in
"               s:ambitable or s:doubletable
" Only lines starting with a digit 1-9 are used, which skips comment lines
" and code points written with a leading zero.
" Must be called after the "ambiguous" and "doublewidth" tables were built,
" since it uses s:ambitable and s:doubletable.
" Defined with "!" like the other functions, so that the script can be
" sourced again without E122.
func! BuildEmojiTable()
  " First make the table for all emojis.
  let pattern = '; Emoji\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  " Nothing matched (e.g. wrong buffer): produce an empty table instead of
  " failing on the first key.
  let pairlist = empty(chardict) ? [] : CharDictToPairList(chardict)
  let allranges = map(pairlist, 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_all'
  call setline(1, "static struct interval emoji_all[] =")
  call setline(2, "{")
  call append('$', allranges)
  if !empty(allranges)
    call setline('$', getline('$')[:-2]) " remove last comma
  endif
  call setline(line('$') + 1, "};")
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~pattern'), 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for known in s:ambitable + s:doubletable
    for nr in range(known[0], known[1])
      if has_key(chardict, nr)
	call remove(chardict, nr)
      endif
    endfor
  endfor

  " Everything may have been excluded: produce an empty table then.
  let pairlist = empty(chardict) ? [] : CharDictToPairList(chardict)
  let wide_ranges = map(pairlist, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_wide'
  call setline(1, " static struct interval emoji_wide[] =")
  call setline(2, " {")
  call append('$', wide_ranges)
  if !empty(wide_ranges)
    call setline('$', getline('$')[:-2]) " remove last comma
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" First test a few things
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  finish
endif

" The data files are fetched by editing a URL, which is handled by netrw.
if !exists("g:loaded_netrw")
  echomsg "Netrw not available, cannot download"
  finish
endif

" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
" The files are fetched over https, like the emoji file below, so that the
" data does not depend on a cleartext transfer or on following a redirect.
edit https://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit https://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
" commented out, because it drops too many characters
"edit https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
"
"" Build the emoji table. Ver. 1.0 - 6.0
"" Must come after the "ambiguous" and "doublewidth" tables
"call BuildEmojiTable()