blob: b518d0541f459f7ee8c85947eb1cb0fb6b144c93 [file] [log] [blame]
Bram Moolenaar3e8cb582010-01-12 19:52:03 +01001" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
2" The format of the UnicodeData.txt file is explained here:
3" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
4" For the other files see the header.
5"
Bram Moolenaar383aa842017-06-22 15:27:37 +02006" Might need to update the URL to the emoji-data.txt
Bram Moolenaar3e8cb582010-01-12 19:52:03 +01007" Usage: Vim -S <this-file>
8"
9" Author: Bram Moolenaar
10" Last Update: 2010 Jan 12
11
" Split every line of the current buffer (UnicodeData.txt) on ';' and store
" the resulting lists in s:dataprops, one list per line.
" Each line must yield exactly 15 fields; on the first line that does not, an
" error is reported and parsing stops.
func! ParseDataToProps()
  let s:dataprops = []
  let idx = 0
  for text in getline(1, '$')
    let idx += 1
    " Keep empty fields, the field position is what matters.
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . idx . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
26
" Split the lines of the current buffer (CaseFolding.txt) on ';' and store the
" resulting lists in s:foldprops.
" Lines starting with '#' and blank lines are ignored.  Every other line must
" yield exactly 4 fields; on the first one that does not, an error is reported
" and parsing stops.
func! ParseFoldProps()
  let s:foldprops = []
  let idx = 0
  for text in getline(1, '$')
    let idx += 1
    if text =~ '^#' || text =~ '^\s*$'
      " comment or empty line
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . idx . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
44
" Split the lines of the current buffer (EastAsianWidth.txt) on ';' and store
" the resulting lists in s:widthprops.
" Lines starting with '#' and blank lines are ignored.  Every other line must
" yield exactly 2 fields; on the first one that does not, an error is reported
" and parsing stops.
func! ParseWidthProps()
  let s:widthprops = []
  let idx = 0
  for text in getline(1, '$')
    let idx += 1
    if text =~ '^#' || text =~ '^\s*$'
      " comment or empty line
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . idx . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
62
" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   a:name   suffix of the buffer name and of the C array name ("to" . name)
"   a:index  index of the field in each s:dataprops entry that holds the
"            mapped code point in hex; entries with an empty field are skipped
" Successive entries that have the same offset to their mapped code point and
" the same distance between them are merged into one range, see Range().
func! BuildCaseTable(name, index)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] != ''
      let n = str2nr(p[0], 16)
      let nl = str2nr(p[a:index], 16)
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, "static convertStruct to" . a:name . "[] =")
  call setline(2, "{")
  if !empty(ranges)
    " Remove the comma after the last entry.  This is done on the list and
    " not on the last buffer line, otherwise an empty table would lose its
    " "{" line.
    let ranges[-1] = ranges[-1][:-2]
    call append('$', ranges)
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
105
" Build the foldCase table in a new buffer.
" Uses s:foldprops.
" Only entries whose second field is 'C' or 'S' are used; the first field is
" the code point and the third field the folded code point, both in hex.
" Successive entries that have the same offset to their folded code point and
" the same distance between them are merged into one range, see Range().
func! BuildFoldTable()
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    " Use ==# so that the comparison does not depend on 'ignorecase'.
    if p[1] ==# 'C' || p[1] ==# 'S'
      let n = str2nr(p[0], 16)
      let nl = str2nr(p[2], 16)
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, "static convertStruct foldCase[] =")
  call setline(2, "{")
  if !empty(ranges)
    " Remove the comma after the last entry.  This is done on the list and
    " not on the last buffer line, otherwise an empty table would lose its
    " "{" line.
    let ranges[-1] = ranges[-1][:-2]
    call append('$', ranges)
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
148
" Add one initializer line for a convertStruct entry to the list a:ranges.
" The line looks like "\t{0xSTART,0xEND,STEP,ADD},".  A step of zero, which
" the callers use for a range with a single character, is written as -1.
func! Range(ranges, start, end, step, add)
  if a:step == 0
    let stepval = -1
  else
    let stepval = a:step
  endif
  call add(a:ranges, printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, stepval, a:add))
endfunc
153
" Build the combining table in a new buffer.
" Uses s:dataprops.
" An entry is used when its third field is 'Mn', 'Mc' or 'Me'.  Entries with
" consecutive code points are merged into one {first, last} interval.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    " Use ==# so that the comparison does not depend on 'ignorecase'.
    if p[2] ==# 'Mn' || p[2] ==# 'Mc' || p[2] ==# 'Me'
      let n = str2nr(p[0], 16)
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, " static struct interval combining[] =")
  call setline(2, " {")
  if !empty(ranges)
    " Remove the comma after the last entry.  This is done on the list and
    " not on the last buffer line, otherwise an empty table would lose its
    " " {" line.
    let ranges[-1] = ranges[-1][:-2]
    call append('$', ranges)
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc
190
" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   a:pattern    regexp matched against the first character of the second
"                field of each s:widthprops entry
"   a:tableName  name of the new buffer and of the generated C array
" Every [first, last] interval that is found is also added to s:ambitable
" when a:pattern is 'A' and to s:doubletable otherwise.  That list must
" already exist.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let pairs = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] !~ a:pattern
      continue
    endif

    if p[0] =~ '\.\.'
      " It is a range. we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Skip this entry, rng[1] may not even exist.
        continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif

    " Find this char in the data table.  The search continues where the
    " previous one stopped and never runs past the end of the table; "dp"
    " stays empty when there is no entry at or after "n".
    let dp = []
    while dataidx < datalen
      let dn = str2nr(s:dataprops[dataidx][0], 16)
      if dn >= n
        let dp = s:dataprops[dataidx]
        break
      endif
      let dataidx += 1
    endwhile
    if n_last == n && (empty(dp) || dn != n)
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    if n_last > n || empty(dp) || (dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me')
      if start < 0 || end + 1 != n
        " Not adjacent to the current interval: store that one and start a
        " new one.
        if start >= 0
          call add(pairs, [start, end])
        endif
        let start = n
      endif
      let end = n_last
    endif
  endfor
  if start >= 0
    call add(pairs, [start, end])
  endif

  " Remember the intervals for BuildEmojiTable().
  if a:pattern ==# 'A'
    call extend(s:ambitable, pairs)
  else
    call extend(s:doubletable, pairs)
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, " static struct interval " . a:tableName . "[] =")
  call setline(2, " {")
  if !empty(pairs)
    let ranges = map(copy(pairs), 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
    " Remove the comma after the last entry.  This is done on the list and
    " not on the last buffer line, otherwise an empty table would lose its
    " " {" line.
    let ranges[-1] = ranges[-1][:-2]
    call append('$', ranges)
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc
264
" Build the emoji tables from the current buffer, each in a new buffer.
" Uses s:ambitable and s:doubletable, which must already be filled.
"   a:pattern    regexp that selects the lines of the buffer to use
"   a:tableName  prefix of the buffer names and of the generated C arrays
" Only lines that start with a digit 1-9 are considered, thus lines starting
" with "0", a letter or "#" and empty lines are dropped.
" Two tables are produced:
"   {tableName}_all    all selected code points, adjacent items merged
"   {tableName}_width  the items starting at 0x1f000 or above, with their
"                      ends trimmed so that they do not start or end inside
"                      an interval of s:ambitable or s:doubletable
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~a:pattern'), 'matchstr(v:val,"^\\S\\+")')
  " Intervals to be excluded from the width table, "ambiguous" ones first.
  let excluded = s:ambitable + s:doubletable
  for line in lines
    " "line" is either "XXXX" or "XXXX..YYYY".
    let token = split(line, '\.\.')
    let first = str2nr(token[0], 16)
    let last = len(token) == 1 ? first : str2nr(token[1], 16)

    if len(alltokens) > 0 && first - 1 == alltokens[-1][1]
      let alltokens[-1][1] = last
    else
      call add(alltokens, [first, last])
    endif

    " Characters below 1F000 may be considered single width traditionally,
    " making them double width causes problems.
    if first < 0x1f000
      continue
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    for rng in excluded
      if first >= rng[0] && first <= rng[1]
        let first = rng[1] + 1
      endif
      if last >= rng[0] && last <= rng[1]
        let last = rng[0] - 1
      endif
    endfor

    if first <= last
      if len(widthtokens) > 0 && first - 1 == widthtokens[-1][1]
        let widthtokens[-1][1] = last
      else
        call add(widthtokens, [first, last])
      endif
    endif
  endfor

  " Put each table in a new buffer.
  for [suffix, tokens] in [['_all', alltokens], ['_width', widthtokens]]
    new
    exe "file " . a:tableName . suffix
    call setline(1, " static struct interval " . a:tableName . suffix . "[] =")
    call setline(2, " {")
    if !empty(tokens)
      let rows = map(tokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
      " Remove the comma after the last entry.  This is done on the list and
      " not on the last buffer line, otherwise an empty table would lose its
      " " {" line.
      let rows[-1] = rows[-1][:-2]
      call append('$', rows)
    endif
    call setline(line('$') + 1, " };")
    wincmd p
  endfor
endfunc
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100343
" Try to avoid hitting E36 (not enough room): every Build function below
" opens one or two new windows with ":new".
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists in s:dataprops.
call ParseDataToProps()

" Build the toLower table, from field 13 of each entry.
call BuildCaseTable("Lower", 13)

" Build the toUpper table, from field 12 of each entry.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists in s:foldprops.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists in s:widthprops.
call ParseWidthProps()

" Build the double width table.
" BuildWidthTable() also adds the intervals to s:doubletable, which must
" exist before the call.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
" BuildWidthTable() also adds the intervals to s:ambitable, which must exist
" before the call.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
" The version is part of the URL, it may need to be updated.
edit https://www.unicode.org/Public/emoji/11.0/emoji-data.txt
"edit http://www.unicode.org/Public/emoji/latest/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "doublewidth" and "ambiguous" tables: BuildEmojiTable()
" reads s:doubletable and s:ambitable.
call BuildEmojiTable('; Emoji\s\+#\s\+\d\+\.\d', 'emoji')