blob: e0627b644d1a81bf18ed90bde70d4fdc9fea784d [file] [log] [blame]
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header at the top of each file.
"
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2010 Jan 12

" Parse the current buffer as UnicodeData.txt.
" Every line is split at ';' (surrounding white space dropped, empty fields
" kept) and the resulting list of 15 fields is appended to s:dataprops.
" Gives an error and stops at the first line that does not have exactly 15
" fields; s:dataprops then only holds the lines before it.
func! ParseDataToProps()
  let s:dataprops = []
  for lnum in range(1, line('$'))
    let fields = split(getline(lnum), '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc

" Parse the current buffer as CaseFolding.txt.
" Lines starting with '#' and lines with only white space are skipped.  Every
" other line is split at ';' into 4 fields and appended to s:foldprops.
" Gives an error and stops at the first line with a different number of
" fields; s:foldprops then only holds the lines before it.
func! ParseFoldProps()
  let s:foldprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " comment or empty line
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc

" Parse the current buffer as EastAsianWidth.txt.
" Lines starting with '#' and lines with only white space are skipped.  Every
" other line is split at ';' into 2 fields and appended to s:widthprops.
" Gives an error and stops at the first line with a different number of
" fields; s:widthprops then only holds the lines before it.
func! ParseWidthProps()
  let s:widthprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " comment or empty line
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
"   a:name   suffix of the table name: the buffer and the C array are called
"            "to" . a:name
"   a:index  index of the field in each s:dataprops entry that holds the
"            converted character (hex); entries where it is empty are skipped
" Consecutive entries are merged into one range as long as the difference
" between the character and its converted value stays the same and the
" distance between the characters stays the same.  Each range is formatted
" by Range().
" When no entry has a conversion an empty table is written.
func! BuildCaseTable(name, index)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] != ''
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[a:index]) + 0
      " step is zero while the range has only one character, then any
      " distance is accepted and becomes the step of the range.
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " Remove the comma after the last entry.  Done on the list, not on the
  " last buffer line, so that an empty table keeps its opening brace.
  if !empty(ranges)
    let ranges[-1] = ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, "static convertStruct to" . a:name . "[] =")
  call setline(2, "{")
  call append('$', ranges)
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.
" Only entries with status 'C' or 'S' (second field) are used; the first
" field is the character and the third field the folded character (hex).
" Consecutive entries are merged into one range as long as the difference
" between the character and its folded value stays the same and the
" distance between the characters stays the same.  Each range is formatted
" by Range().
" When no entry qualifies an empty table is written.
func! BuildFoldTable()
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    if p[1] == 'C' || p[1] == 'S'
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[2]) + 0
      " step is zero while the range has only one character, then any
      " distance is accepted and becomes the step of the range.
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " Remove the comma after the last entry.  Done on the list, not on the
  " last buffer line, so that an empty table keeps its opening brace.
  if !empty(ranges)
    let ranges[-1] = ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, "static convertStruct foldCase[] =")
  call setline(2, "{")
  call append('$', ranges)
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

" Append one table entry to the list a:ranges, formatted as
"   <Tab>{0x<start>,0x<end>,<step>,<add>},
" with a:start and a:end in hex.  A zero a:step is written as -1.
func! Range(ranges, start, end, step, add)
  if a:step == 0
    let stepval = -1
  else
    let stepval = a:step
  endif
  let entry = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, stepval, a:add)
  call add(a:ranges, entry)
endfunc

" Build the combining table in a new buffer.
" Uses s:dataprops.
" Every character whose third field (index 2) is 'Mn', 'Mc' or 'Me' is
" included; characters with consecutive numbers are merged into one
" {first, last} interval.
" When there is no such character an empty table is written.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
      let n = ('0x' . p[0]) + 0
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " Remove the comma after the last entry.  Done on the list, not on the
  " last buffer line, so that an empty table keeps its opening brace.
  if !empty(ranges)
    let ranges[-1] = ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, " static struct interval combining[] =")
  call setline(2, " {")
  call append('$', ranges)
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
"   a:pattern    pattern matched against the first character of the width
"                property (second field of each s:widthprops entry)
"   a:tableName  name of the buffer and of the C array
" Adjoining characters and ranges are merged into one {first, last}
" interval.  Each interval is also added as [first, last] to s:ambitable
" when a:pattern is 'A' and to s:doubletable otherwise; that list must
" already exist.
" A single character is left out when the s:dataprops entry found for it is
" a composing character ('Mn', 'Mc' or 'Me'); ranges are always included.
" An entry whose range cannot be parsed is reported and skipped.  Entries
" beyond the end of s:dataprops are treated as not composing.
" When nothing matches an empty table is written.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] !~ a:pattern
      continue
    endif

    if p[0] =~ '\.\.'
      " It is a range. we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Skip the entry, rng[1] may not even exist.
        continue
      endif
      let n = ('0x' . rng[0]) + 0
      let n_last = ('0x' . rng[1]) + 0
    else
      let n = ('0x' . p[0]) + 0
      let n_last = n
    endif

    " Find this char in the data table.  Both tables are walked in
    " increasing order, thus the search continues where the previous one
    " stopped.  Stop at the end of the table instead of indexing past it.
    let dn = -1
    while dataidx < datalen
      let dn = ('0x' . s:dataprops[dataidx][0]) + 0
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    let category = dataidx < datalen ? s:dataprops[dataidx][2] : ''
    if n_last > n || (category != 'Mn' && category != 'Mc' && category != 'Me')
      if start < 0 || end + 1 != n
        " Not adjoining the current interval: start a new one.
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
          if a:pattern == 'A'
            call add(s:ambitable, [start, end])
          else
            call add(s:doubletable, [start, end])
          endif
        endif
        let start = n
      endif
      let end = n_last
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    if a:pattern == 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " Remove the comma after the last entry.  Done on the list, not on the
  " last buffer line, so that an empty table keeps its opening brace.
  if !empty(ranges)
    let ranges[-1] = ranges[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, " static struct interval " . a:tableName . "[] =")
  call setline(2, " {")
  call append('$', ranges)
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" Write the list of table entries a:ranges as a "static struct interval"
" array called a:name in a new buffer with that name, then go back to the
" previous window.  The comma after the last entry is removed; an empty
" list gives an empty table.
func! s:PutEmojiTable(name, ranges)
  let entries = copy(a:ranges)
  if !empty(entries)
    let entries[-1] = entries[-1][:-2]
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:name
  call setline(1, " static struct interval " . a:name . "[] =")
  call setline(2, " {")
  call append('$', entries)
  call setline(line('$') + 1, " };")
  wincmd p
endfunc

" Build the emoji width tables in two new buffers.
" Uses the lines of the current buffer that start with a digit 1-9 and match
" a:pattern; the first word of such a line is a character or a range
" "first..last" (hex).  Also uses s:ambitable and s:doubletable.
"   a:pattern    pattern that selects the lines to use
"   a:tableName  the tables are called a:tableName . "_all" and
"                a:tableName . "_width"
" The "_all" table contains all selected characters.  For the "_width" table
" an interval is shortened when its first or last character is inside an
" interval of s:ambitable or s:doubletable, and dropped when nothing is
" left.  In both tables adjoining intervals are merged.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~a:pattern'), 'matchstr(v:val,"^\\S\\+")')
  for line in lines
    let codes = split(line, '\.\.')
    let first = ('0x' . codes[0]) + 0
    if len(codes) == 1
      let last = first
    else
      let last = ('0x' . codes[1]) + 0
    endif

    if len(alltokens) > 0 && (first - 1 == alltokens[-1][1])
      let alltokens[-1][1] = last
    else
      call add(alltokens, [first, last])
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    for ambi in s:ambitable
      if first >= ambi[0] && first <= ambi[1]
        let first = ambi[1] + 1
      endif
      if last >= ambi[0] && last <= ambi[1]
        let last = ambi[0] - 1
      endif
    endfor
    for double in s:doubletable
      if first >= double[0] && first <= double[1]
        let first = double[1] + 1
      endif
      if last >= double[0] && last <= double[1]
        let last = double[0] - 1
      endif
    endfor

    if first <= last
      if len(widthtokens) > 0 && (first - 1 == widthtokens[-1][1])
        let widthtokens[-1][1] = last
      else
        call add(widthtokens, [first, last])
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  call s:PutEmojiTable(a:tableName . '_all', allranges)
  call s:PutEmojiTable(a:tableName . '_width', widthranges)
endfunc

" Try to avoid hitting E36
set equalalways

" Step 1: UnicodeData.txt.  Editing a URL requires the netrw plugin.
" The parsed lines end up in s:dataprops, from which the toLower table
" (field 13), the toUpper table (field 12) and the table with the ranges of
" composing chars are built.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
call ParseDataToProps()
call BuildCaseTable("Lower", 13)
call BuildCaseTable("Upper", 12)
call BuildCombiningTable()

" Step 2: CaseFolding.txt.  Requires the netrw plugin.
" The parsed lines end up in s:foldprops, used for the foldCase table.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
call ParseFoldProps()
call BuildFoldTable()

" Step 3: EastAsianWidth.txt.  Requires the netrw plugin.
" The parsed lines end up in s:widthprops.  Building the double width and
" the ambiguous width tables also fills s:doubletable and s:ambitable.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
call ParseWidthProps()
let s:doubletable = []
let s:ambitable = []
call BuildWidthTable('[WF]', 'doublewidth')
call BuildWidthTable('A', 'ambiguous')

" Step 4: emoji-data.txt.  Requires the netrw plugin.
" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" table
edit http://www.unicode.org/Public/emoji/3.0/emoji-data.txt
call BuildEmojiTable('; Emoji\s\+# [1-6]\.[0-9]', 'emoji')