" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2010 Jan 12
10
" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
" Reads the current buffer.  Every line is split on ';' (surrounding white
" space dropped, empty fields kept) and must yield exactly 15 fields.
" On the first line that does not, an error is given and parsing stops;
" s:dataprops then holds only the lines before it.
func! ParseDataToProps()
  let s:dataprops = []
  let lnum = 1
  while lnum <= line('$')
    let l = split(getline(lnum), '\s*;\s*', 1)
    if len(l) != 15
      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, l)
    let lnum += 1
  endwhile
endfunc
25
" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
" Reads the current buffer.  Lines starting with '#' and blank lines are
" skipped.  Every other line is split on ';' (surrounding white space
" dropped, empty fields kept) and must yield exactly 4 fields; otherwise an
" error is given and parsing stops.
func! ParseFoldProps()
  let s:foldprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 4
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
        return
      endif
      call add(s:foldprops, l)
    endif
    let lnum += 1
  endwhile
endfunc
43
" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
" Reads the current buffer.  Lines starting with '#' and blank lines are
" skipped.  Every other line is split on ';' (surrounding white space
" dropped, empty fields kept) and must yield exactly 2 fields; otherwise an
" error is given and parsing stops.
func! ParseWidthProps()
  let s:widthprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 2
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
        return
      endif
      call add(s:widthprops, l)
    endif
    let lnum += 1
  endwhile
endfunc
61
" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   a:name   suffix of the table: the buffer and the array are "to{name}"
"   a:index  index of the s:dataprops field that holds the mapped character
" Consecutive entries with the same offset and a constant distance are
" merged into one range, emitted with Range().
func! BuildCaseTable(name, index)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] != ''
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[a:index]) + 0
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, "static convertStruct to" . a:name . "[] =")
  call setline(2, "{")
  call append('$', ranges)
  if !empty(ranges)
    " Remove last comma.  Skipped when there are no entries, otherwise the
    " "{" line would lose its only character.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
104
" Build the foldCase table in a new buffer.
" Uses s:foldprops.
" Only entries with status "C" or "S" are used.  Consecutive entries with
" the same offset and a constant distance are merged into one range,
" emitted with Range().
func! BuildFoldTable()
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    " Match case: the result must not depend on 'ignorecase'.
    if p[1] ==# 'C' || p[1] ==# 'S'
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[2]) + 0
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, "static convertStruct foldCase[] =")
  call setline(2, "{")
  call append('$', ranges)
  if !empty(ranges)
    " Remove last comma.  Skipped when there are no entries, otherwise the
    " "{" line would lose its only character.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
147
" Append one table entry "{start,end,step,add}," to the list a:ranges.
" a:start and a:end are written in hex, a:step and a:add in decimal.
" A step of zero (a range holding a single character) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc
152
" Build the combining table in a new buffer.
" Uses s:dataprops.
" Characters whose general category (field 2) is Mn, Mc or Me are collected;
" adjacent code points are merged into one interval.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    " Match case: the result must not depend on 'ignorecase'.
    if p[2] ==# 'Mn' || p[2] ==# 'Mc' || p[2] ==# 'Me'
      let n = ('0x' . p[0]) + 0
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, " static struct interval combining[] =")
  call setline(2, " {")
  call append('$', ranges)
  if !empty(ranges)
    " Remove last comma.  Skipped when there are no entries, otherwise the
    " " {" line would be truncated.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc
189
" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   a:pattern    pattern matched against the first character of the width
"                class of each s:widthprops entry
"   a:tableName  name of the buffer and of the generated array
" Every interval produced is also added to s:ambitable (when a:pattern is
" "A") or s:doubletable (otherwise); the list must exist before the call.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    " Match case: the result must not depend on 'ignorecase'.
    if p[1][0] =~# a:pattern
      if p[0] =~ '\.\.'
        " It is a range. we don't check for composing char then.
        let rng = split(p[0], '\.\.')
        if len(rng) != 2
          echoerr "Cannot parse range: '" . p[0] . "' in width table"
          " Skip the entry, rng[1] may not exist.
          continue
        endif
        let n = ('0x' . rng[0]) + 0
        let n_last = ('0x' . rng[1]) + 0
      else
        let n = ('0x' . p[0]) + 0
        let n_last = n
      endif
      " Find this char in the data table.  Stop at the end of the table:
      " the width file may list code points above the last data entry.
      let dn = -1
      while dataidx < datalen
        let dn = ('0x' . s:dataprops[dataidx][0]) + 0
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      let composing = 0
      if n_last <= n && dataidx < datalen
        let dp = s:dataprops[dataidx]
        let composing = dp[2] ==# 'Mn' || dp[2] ==# 'Mc' || dp[2] ==# 'Me'
      endif
      if !composing
        if start >= 0 && end + 1 == n
          " continue with same range.
        else
          if start >= 0
            " produce previous range
            call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
            if a:pattern ==# 'A'
              call add(s:ambitable, [start, end])
            else
              call add(s:doubletable, [start, end])
            endif
          endif
          let start = n
        endif
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    if a:pattern ==# 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, " static struct interval " . a:tableName . "[] =")
  call setline(2, " {")
  call append('$', ranges)
  if !empty(ranges)
    " Remove last comma.  Skipped when there are no entries, otherwise the
    " " {" line would be truncated.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc
263
" Build the emoji tables in two new buffers.
" Reads the current buffer and uses s:ambitable and s:doubletable.
"   a:pattern    only lines matching this pattern are used
"   a:tableName  prefix for the buffer and array names
" Lines that do not start with a digit 1-9 are ignored.
" "{tableName}_all" gets every selected range.  "{tableName}_width" gets the
" ranges starting at 0x1f000 or above, trimmed where an end point falls
" inside an interval of s:ambitable or s:doubletable.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~a:pattern'), 'matchstr(v:val,"^\\S\\+")')
  for line in lines
    " The first word is either "XXXX" or "XXXX..YYYY".
    let token = split(line, '\.\.')
    let first = ('0x' . token[0]) + 0
    if len(token) == 1
      let last = first
    else
      let last = ('0x' . token[1]) + 0
    endif

    " Merge with the previous range when directly adjacent.
    if !empty(alltokens) && first - 1 == alltokens[-1][1]
      let alltokens[-1][1] = last
    else
      call add(alltokens, [first, last])
    endif

    " Characters below 1F000 may be considered single width traditionally,
    " making them double width causes problems.
    if first < 0x1f000
      continue
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    for ambi in s:ambitable
      if first >= ambi[0] && first <= ambi[1]
        let first = ambi[1] + 1
      endif
      if last >= ambi[0] && last <= ambi[1]
        let last = ambi[0] - 1
      endif
    endfor
    for double in s:doubletable
      if first >= double[0] && first <= double[1]
        let first = double[1] + 1
      endif
      if last >= double[0] && last <= double[1]
        let last = double[0] - 1
      endif
    endfor

    " Nothing is left when the trimming made the range empty.
    if first <= last
      if !empty(widthtokens) && first - 1 == widthtokens[-1][1]
        let widthtokens[-1][1] = last
      else
        call add(widthtokens, [first, last])
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe "file " . a:tableName . '_all'
  call setline(1, " static struct interval " . a:tableName . "_all[] =")
  call setline(2, " {")
  call append('$', allranges)
  if !empty(allranges)
    " Remove last comma.  Skipped when there are no entries, otherwise the
    " " {" line would be truncated.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p

  " New buffer to put the result in.
  new
  exe "file " . a:tableName . '_width'
  call setline(1, " static struct interval " . a:tableName . "_width[] =")
  call setline(2, " {")
  call append('$', widthranges)
  if !empty(widthranges)
    " Remove last comma, see above.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, " };")
  wincmd p
endfunc
" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
" BuildWidthTable() also stores the intervals in s:doubletable.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
" BuildWidthTable() also stores the intervals in s:ambitable.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/emoji/3.0/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" table
call BuildEmojiTable('; Emoji\s\+# [1-6]\.[0-9]', 'emoji')