blob: f33df4206d8e5a4e515bfce13d93d21058d12270 [file] [log] [blame]
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files, see the header comment at the top of each file.
"
" Usage: vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2010 Jan 12
10
11" Parse lines of UnicodeData.txt. Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  " Split every line of the current buffer on ';' (surrounding white space
  " is dropped, empty fields are kept) and collect the resulting lists in
  " s:dataprops.  Every line must yield exactly 15 fields; on the first line
  " that does not, an error is reported and parsing stops, leaving the
  " entries gathered so far in s:dataprops.
  let s:dataprops = []
  for nr in range(1, line('$'))
    let fields = split(getline(nr), '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . nr . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
25
26" Parse lines of CaseFolding.txt. Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  " Collect the ';'-separated fields of every data line of the current
  " buffer in s:foldprops.  Lines starting with '#' and lines holding only
  " white space are skipped.  A data line must yield exactly 4 fields; on
  " the first one that does not, an error is reported and parsing stops.
  let s:foldprops = []
  for nr in range(1, line('$'))
    let text = getline(nr)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or blank line: nothing to parse.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . nr . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
43
44" Parse lines of EastAsianWidth.txt. Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  " Collect the ';'-separated fields of every data line of the current
  " buffer in s:widthprops.  Lines starting with '#' and lines holding only
  " white space are skipped.  A data line must yield exactly 2 fields; on
  " the first one that does not, an error is reported and parsing stops.
  let s:widthprops = []
  for nr in range(1, line('$'))
    let text = getline(nr)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or blank line: nothing to parse.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . nr . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
61
62" Build the toLower or toUpper table in a new buffer.
63" Uses s:dataprops.
func! BuildCaseTable(name, index)
  " a:name   suffix for the buffer name and the C table name ("to" . a:name)
  " a:index  index of the field in each s:dataprops entry that holds the
  "          mapped code point in hex; entries where it is empty are skipped
  "
  " Consecutive entries are merged into one row while they share the same
  " offset (mapped - original) and are spaced by the same step.  Each row is
  " formatted by Range().  The rows are written to a new buffer, after which
  " the previous window is made current again.
  let first = -1
  let last = -1
  let stride = 0
  let delta = -1
  let rows = []
  for props in s:dataprops
    if props[a:index] == ''
      continue
    endif
    let code = str2nr(props[0], 16)
    let offset = str2nr(props[a:index], 16) - code
    let gap = code - last
    if first >= 0 && delta == offset && (stride == 0 || gap == stride)
      " Same offset and spacing: extend the open row.  A row that held only
      " one code point so far gets its step fixed here.
      let stride = gap
    else
      if first >= 0
        " Flush the row that just ended.
        call Range(rows, first, last, stride, delta)
      endif
      let first = code
      let stride = 0
      let delta = offset
    endif
    let last = code
  endfor
  if first >= 0
    " Flush the row that was still open when the data ran out.
    call Range(rows, first, last, stride, delta)
  endif

  " Put the result in a new buffer named after the table.
  new
  execute 'file' 'to' . a:name
  call setline(1, ['static convertStruct to' . a:name . '[] =', '{'])
  call append('$', rows)
  " Drop the trailing comma of the last row.
  let tail = getline('$')
  call setline('$', tail[:-2])
  call append('$', '};')
  wincmd p
endfunc
104
105" Build the foldCase table in a new buffer.
106" Uses s:foldprops.
func! BuildFoldTable()
  " Only s:foldprops entries whose second field is 'C' or 'S' are used.  The
  " first field is the code point and the third field its folded form, both
  " in hex.  Consecutive entries are merged into one row while they share
  " the same offset (folded - original) and are spaced by the same step.
  " Each row is formatted by Range().  The rows are written to a new buffer
  " named "foldCase", after which the previous window is made current again.
  let first = -1
  let last = -1
  let stride = 0
  let delta = -1
  let rows = []
  for props in s:foldprops
    if props[1] != 'C' && props[1] != 'S'
      continue
    endif
    let code = str2nr(props[0], 16)
    let offset = str2nr(props[2], 16) - code
    let gap = code - last
    if first >= 0 && delta == offset && (stride == 0 || gap == stride)
      " Same offset and spacing: extend the open row.  A row that held only
      " one code point so far gets its step fixed here.
      let stride = gap
    else
      if first >= 0
        " Flush the row that just ended.
        call Range(rows, first, last, stride, delta)
      endif
      let first = code
      let stride = 0
      let delta = offset
    endif
    let last = code
  endfor
  if first >= 0
    " Flush the row that was still open when the data ran out.
    call Range(rows, first, last, stride, delta)
  endif

  " Put the result in a new buffer named after the table.
  new
  file foldCase
  call setline(1, ['static convertStruct foldCase[] =', '{'])
  call append('$', rows)
  " Drop the trailing comma of the last row.
  let tail = getline('$')
  call setline('$', tail[:-2])
  call append('$', '};')
  wincmd p
endfunc
147
func! Range(ranges, start, end, step, add)
  " Append one formatted table row "{start,end,step,add}," to the list
  " a:ranges.  a:start and a:end are written in hex, a:step and a:add in
  " decimal.  A step of 0 is written as -1.
  let shown_step = a:step
  if shown_step == 0
    let shown_step = -1
  endif
  let row = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, shown_step, a:add)
  call add(a:ranges, row)
endfunc
152
153" Build the combining table.
154" Uses s:dataprops.
func! BuildCombiningTable()
  " Collect the code points from s:dataprops whose third field is 'Mn',
  " 'Mc' or 'Me' and merge runs of adjacent code points into {first, last}
  " intervals.  The intervals are written to a new buffer named "combining",
  " after which the previous window is made current again.
  let first = -1
  let last = -1
  let rows = []
  for props in s:dataprops
    let category = props[2]
    if category != 'Mn' && category != 'Mc' && category != 'Me'
      continue
    endif
    let code = str2nr(props[0], 16)
    if first < 0 || last + 1 != code
      " Not adjacent to the open interval: flush it and start a new one.
      if first >= 0
        call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
      endif
      let first = code
    endif
    let last = code
  endfor
  if first >= 0
    " Flush the interval that was still open when the data ran out.
    call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
  endif

  " Put the result in a new buffer named after the table.
  new
  file combining
  call setline(1, [' static struct interval combining[] =', ' {'])
  call append('$', rows)
  " Drop the trailing comma of the last interval.
  let tail = getline('$')
  call setline('$', tail[:-2])
  call append('$', ' };')
  wincmd p
endfunc
189
190" Build the ambiguous table in a new buffer.
191" Uses s:widthprops and s:dataprops.
func! BuildAmbiguousTable()
  " Collect the s:widthprops entries whose width value starts with 'A' and
  " merge adjacent ones into {first, last} intervals.  The first field of an
  " entry is either one code point ("00A1") or a range ("0391..03A1").
  " An entry is dropped when the s:dataprops entry found for its first code
  " point has 'Mn', 'Mc' or 'Me' as its third field (composing character).
  " Both lists are walked in step, so they are expected to be in ascending
  " code point order.
  " The intervals are written to a new buffer named "ambiguous", after which
  " the previous window is made current again.
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  for p in s:widthprops
    if p[1][0] == 'A'
      " First code point of the entry; for a range the conversion stops at
      " the "..".
      let n = str2nr(p[0], 16)
      " Last code point of the entry.  This must be known before deciding
      " whether the entry extends the open interval: otherwise a range that
      " directly follows the open interval would only contribute its first
      " code point.
      if p[0] =~ '\.\.'
        let n_last = str2nr(substitute(p[0], '.*\.\.', '', ''), 16)
      else
        let n_last = n
      endif
      " Find this char in the data table.
      while 1
        let dn = str2nr(s:dataprops[dataidx][0], 16)
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      if dn != n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      let dp = s:dataprops[dataidx]
      if dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me'
        if start < 0 || end + 1 != n
          if start >= 0
            " Not adjacent to the open interval: produce the previous one.
            call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
          endif
          let start = n
        endif
        " Whether extending or starting an interval, it now reaches up to
        " the last code point of this entry.
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file ambiguous
  call setline(1, " static struct interval ambiguous[] =")
  call setline(2, " {")
  call append('$', ranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call setline(line('$') + 1, " };")
  wincmd p
endfunc
246
247
248
249" Edit the Unicode text file. Requires the netrw plugin.
execute 'edit' 'http://unicode.org/Public/UNIDATA/UnicodeData.txt'

" Turn the buffer lines into s:dataprops.
call ParseDataToProps()

" Generate the toLower table (field 13 of each entry).
call BuildCaseTable('Lower', 13)

" Generate the toUpper table (field 12 of each entry).
call BuildCaseTable('Upper', 12)

" Generate the intervals of composing characters.
call BuildCombiningTable()

" Load the case folding text file.  Requires the netrw plugin.
execute 'edit' 'http://www.unicode.org/Public/UNIDATA/CaseFolding.txt'

" Turn the buffer lines into s:foldprops.
call ParseFoldProps()

" Generate the foldCase table.
call BuildFoldTable()

" Load the width text file.  Requires the netrw plugin.
execute 'edit' 'http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt'

" Turn the buffer lines into s:widthprops.
call ParseWidthProps()

" Generate the intervals of ambiguous-width characters.
call BuildAmbiguousTable()