blob: dfe9cef417e68ed5dc86742e0ea64229fd4a5d9d [file] [log] [blame]
Bram Moolenaar3e8cb582010-01-12 19:52:03 +01001" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
2" The format of the UnicodeData.txt file is explained here:
3" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
4" For the other files see the header.
5"
6" Usage: Vim -S <this-file>
7"
8" Author: Bram Moolenaar
9" Last Update: 2010 Jan 12
10
11" Parse lines of UnicodeData.txt. Creates a list of lists in s:dataprops.
12func! ParseDataToProps()
13 let s:dataprops = []
14 let lnum = 1
15 while lnum <= line('$')
16 let l = split(getline(lnum), '\s*;\s*', 1)
17 if len(l) != 15
18 echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
19 return
20 endif
21 call add(s:dataprops, l)
22 let lnum += 1
23 endwhile
24endfunc
25
26" Parse lines of CaseFolding.txt. Creates a list of lists in s:foldprops.
27func! ParseFoldProps()
28 let s:foldprops = []
29 let lnum = 1
30 while lnum <= line('$')
31 let line = getline(lnum)
32 if line !~ '^#' && line !~ '^\s*$'
33 let l = split(line, '\s*;\s*', 1)
34 if len(l) != 4
35 echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
36 return
37 endif
38 call add(s:foldprops, l)
39 endif
40 let lnum += 1
41 endwhile
42endfunc
43
44" Parse lines of EastAsianWidth.txt. Creates a list of lists in s:widthprops.
45func! ParseWidthProps()
46 let s:widthprops = []
47 let lnum = 1
48 while lnum <= line('$')
49 let line = getline(lnum)
50 if line !~ '^#' && line !~ '^\s*$'
51 let l = split(line, '\s*;\s*', 1)
52 if len(l) != 2
53 echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
54 return
55 endif
56 call add(s:widthprops, l)
57 endif
58 let lnum += 1
59 endwhile
60endfunc
61
62" Build the toLower or toUpper table in a new buffer.
63" Uses s:dataprops.
64func! BuildCaseTable(name, index)
65 let start = -1
66 let end = -1
67 let step = 0
68 let add = -1
69 let ranges = []
70 for p in s:dataprops
71 if p[a:index] != ''
72 let n = ('0x' . p[0]) + 0
73 let nl = ('0x' . p[a:index]) + 0
74 if start >= 0 && add == nl - n && (step == 0 || n - end == step)
75 " continue with same range.
76 let step = n - end
77 let end = n
78 else
79 if start >= 0
80 " produce previous range
81 call Range(ranges, start, end, step, add)
82 endif
83 let start = n
84 let end = n
85 let step = 0
86 let add = nl - n
87 endif
88 endif
89 endfor
90 if start >= 0
91 call Range(ranges, start, end, step, add)
92 endif
93
94 " New buffer to put the result in.
95 new
96 exe "file to" . a:name
97 call setline(1, "static convertStruct to" . a:name . "[] =")
98 call setline(2, "{")
99 call append('$', ranges)
100 call setline('$', getline('$')[:-2]) " remove last comma
101 call setline(line('$') + 1, "};")
102 wincmd p
103endfunc
104
105" Build the foldCase table in a new buffer.
106" Uses s:foldprops.
107func! BuildFoldTable()
108 let start = -1
109 let end = -1
110 let step = 0
111 let add = -1
112 let ranges = []
113 for p in s:foldprops
114 if p[1] == 'C' || p[1] == 'S'
115 let n = ('0x' . p[0]) + 0
116 let nl = ('0x' . p[2]) + 0
117 if start >= 0 && add == nl - n && (step == 0 || n - end == step)
118 " continue with same range.
119 let step = n - end
120 let end = n
121 else
122 if start >= 0
123 " produce previous range
124 call Range(ranges, start, end, step, add)
125 endif
126 let start = n
127 let end = n
128 let step = 0
129 let add = nl - n
130 endif
131 endif
132 endfor
133 if start >= 0
134 call Range(ranges, start, end, step, add)
135 endif
136
137 " New buffer to put the result in.
138 new
139 file foldCase
140 call setline(1, "static convertStruct foldCase[] =")
141 call setline(2, "{")
142 call append('$', ranges)
143 call setline('$', getline('$')[:-2]) " remove last comma
144 call setline(line('$') + 1, "};")
145 wincmd p
146endfunc
147
148func! Range(ranges, start, end, step, add)
149 let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
150 call add(a:ranges, s)
151endfunc
152
153" Build the combining table.
154" Uses s:dataprops.
155func! BuildCombiningTable()
156 let start = -1
157 let end = -1
158 let ranges = []
159 for p in s:dataprops
160 if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
161 let n = ('0x' . p[0]) + 0
162 if start >= 0 && end + 1 == n
163 " continue with same range.
164 let end = n
165 else
166 if start >= 0
167 " produce previous range
168 call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
169 endif
170 let start = n
171 let end = n
172 endif
173 endif
174 endfor
175 if start >= 0
176 call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
177 endif
178
179 " New buffer to put the result in.
180 new
181 file combining
182 call setline(1, " static struct interval combining[] =")
183 call setline(2, " {")
184 call append('$', ranges)
185 call setline('$', getline('$')[:-2]) " remove last comma
186 call setline(line('$') + 1, " };")
187 wincmd p
188endfunc
189
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100190" Build the double width or ambiguous width table in a new buffer.
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100191" Uses s:widthprops and s:dataprops.
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100192func! BuildWidthTable(pattern, tableName)
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100193 let start = -1
194 let end = -1
195 let ranges = []
196 let dataidx = 0
197 for p in s:widthprops
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100198 if p[1][0] =~ a:pattern
199 if p[0] =~ '\.\.'
200 " It is a range. we don't check for composing char then.
201 let rng = split(p[0], '\.\.')
202 if len(rng) != 2
203 echoerr "Cannot parse range: '" . p[0] . "' in width table"
204 endif
205 let n = ('0x' . rng[0]) + 0
206 let n_last = ('0x' . rng[1]) + 0
207 else
208 let n = ('0x' . p[0]) + 0
209 let n_last = n
210 endif
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100211 " Find this char in the data table.
212 while 1
213 let dn = ('0x' . s:dataprops[dataidx][0]) + 0
214 if dn >= n
215 break
216 endif
217 let dataidx += 1
218 endwhile
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100219 if dn != n && n_last == n
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100220 echoerr "Cannot find character " . n . " in data table"
221 endif
222 " Only use the char when it's not a composing char.
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100223 " But use all chars from a range.
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100224 let dp = s:dataprops[dataidx]
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100225 if n_last > n || (dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me')
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100226 if start >= 0 && end + 1 == n
227 " continue with same range.
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100228 else
229 if start >= 0
230 " produce previous range
231 call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
232 endif
233 let start = n
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100234 endif
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100235 let end = n_last
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100236 endif
237 endif
238 endfor
239 if start >= 0
240 call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
241 endif
242
243 " New buffer to put the result in.
244 new
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100245 exe "file " . a:tableName
246 call setline(1, " static struct interval " . a:tableName . "[] =")
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100247 call setline(2, " {")
248 call append('$', ranges)
249 call setline('$', getline('$')[:-2]) " remove last comma
250 call setline(line('$') + 1, " };")
251 wincmd p
252endfunc
253
Bram Moolenaar3848e002016-03-19 18:42:29 +0100254" Build the amoji width table in a new buffer.
255func! BuildEmojiTable(pattern, tableName)
256 let ranges = []
257 for line in map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~a:pattern'), 'matchstr(v:val,"^\\S\\+")')
258 let token = split(line, '\.\.')
259 if len(token) == 1
260 call add(token, token[0])
261 endif
262 call add(ranges, printf("\t{0x%04x, 0x%04x},", "0x".token[0], "0x".token[1]))
263 endfor
264
265 " New buffer to put the result in.
266 new
267 exe "file " . a:tableName
268 call setline(1, " static struct interval " . a:tableName . "[] =")
269 call setline(2, " {")
270 call append('$', ranges)
271 call setline('$', getline('$')[:-2]) " remove last comma
272 call setline(line('$') + 1, " };")
273 wincmd p
274endfunc
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100275
Bram Moolenaar66312ac2015-06-21 14:22:00 +0200276" Try to avoid hitting E36
277set equalalways
Bram Moolenaar3e8cb582010-01-12 19:52:03 +0100278
279" Edit the Unicode text file. Requires the netrw plugin.
280edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
281
282" Parse each line, create a list of lists.
283call ParseDataToProps()
284
285" Build the toLower table.
286call BuildCaseTable("Lower", 13)
287
288" Build the toUpper table.
289call BuildCaseTable("Upper", 12)
290
291" Build the ranges of composing chars.
292call BuildCombiningTable()
293
294" Edit the case folding text file. Requires the netrw plugin.
295edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
296
297" Parse each line, create a list of lists.
298call ParseFoldProps()
299
300" Build the foldCase table.
301call BuildFoldTable()
302
303" Edit the width text file. Requires the netrw plugin.
304edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
305
306" Parse each line, create a list of lists.
307call ParseWidthProps()
308
Bram Moolenaarda4d7a92010-01-27 18:29:26 +0100309" Build the double width table.
310call BuildWidthTable('[WF]', 'doublewidth')
311
312" Build the ambiguous width table.
313call BuildWidthTable('A', 'ambiguous')
Bram Moolenaar3848e002016-03-19 18:42:29 +0100314
315" Edit the emoji text file. Requires the netrw plugin.
316edit http://www.unicode.org/Public/emoji/3.0/emoji-data.txt
317
318" Build the emoji table. Ver. 1.0 - 6.0
319call BuildEmojiTable('; Emoji\s\+# [1-6]\.[0-9]', 'emoji')