blob: 2c79a49e499ea0caa13f78d3abe476add22c4221 [file] [log] [blame]
Bram Moolenaaredf3f972016-08-29 22:49:24 +02001/* vi:set ts=8 sts=4 sw=4 noet:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002 *
3 * NFA regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 */
7
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02008/*
9 * Logging of NFA engine.
10 *
11 * The NFA engine can write four log files:
12 * - Error log: Contains NFA engine's fatal errors.
13 * - Dump log: Contains compiled NFA state machine's information.
14 * - Run log: Contains information of matching procedure.
15 * - Debug log: Contains detailed information of matching procedure. Can be
16 * disabled by undefining NFA_REGEXP_DEBUG_LOG.
17 * The first one can also be used without debug mode.
18 * The last three are enabled when compiled as debug mode and individually
19 * disabled by commenting them out.
20 * The log files can get quite big!
Bram Moolenaar52797ba2021-12-16 14:45:13 +000021 * To disable all of this when compiling Vim for debugging, undefine DEBUG in
Bram Moolenaard6c11cb2013-05-25 12:18:39 +020022 * regexp.c
23 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020024#ifdef DEBUG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +020025# define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020026# define ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +020027# define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
28# define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
29# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020030#endif
31
// Added to one of the class codes NFA_ANY .. NFA_NUPPER_IC to get the variant
// of that class that also matches a NL.
Bram Moolenaar1cfad522013-08-14 12:06:49 +020033#define NFA_ADD_NL 31
34
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020035enum
36{
37 NFA_SPLIT = -1024,
38 NFA_MATCH,
Bram Moolenaar63d9e732019-12-05 21:10:38 +010039 NFA_EMPTY, // matches 0-length
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020040
Bram Moolenaar63d9e732019-12-05 21:10:38 +010041 NFA_START_COLL, // [abc] start
42 NFA_END_COLL, // [abc] end
43 NFA_START_NEG_COLL, // [^abc] start
44 NFA_END_NEG_COLL, // [^abc] end (postfix only)
45 NFA_RANGE, // range of the two previous items
46 // (postfix only)
47 NFA_RANGE_MIN, // low end of a range
48 NFA_RANGE_MAX, // high end of a range
Bram Moolenaar417bad22013-06-07 14:08:30 +020049
Bram Moolenaar63d9e732019-12-05 21:10:38 +010050 NFA_CONCAT, // concatenate two previous items (postfix
51 // only)
52 NFA_OR, // \| (postfix only)
53 NFA_STAR, // greedy * (postfix only)
54 NFA_STAR_NONGREEDY, // non-greedy * (postfix only)
55 NFA_QUEST, // greedy \? (postfix only)
56 NFA_QUEST_NONGREEDY, // non-greedy \? (postfix only)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020057
Bram Moolenaar63d9e732019-12-05 21:10:38 +010058 NFA_BOL, // ^ Begin line
59 NFA_EOL, // $ End line
60 NFA_BOW, // \< Begin word
61 NFA_EOW, // \> End word
62 NFA_BOF, // \%^ Begin file
63 NFA_EOF, // \%$ End file
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020064 NFA_NEWL,
Bram Moolenaar63d9e732019-12-05 21:10:38 +010065 NFA_ZSTART, // Used for \zs
66 NFA_ZEND, // Used for \ze
67 NFA_NOPEN, // Start of subexpression marked with \%(
68 NFA_NCLOSE, // End of subexpr. marked with \%( ... \)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020069 NFA_START_INVISIBLE,
Bram Moolenaara2947e22013-06-11 22:44:09 +020070 NFA_START_INVISIBLE_FIRST,
Bram Moolenaardecd9542013-06-07 16:31:50 +020071 NFA_START_INVISIBLE_NEG,
Bram Moolenaara2947e22013-06-11 22:44:09 +020072 NFA_START_INVISIBLE_NEG_FIRST,
Bram Moolenaar61602c52013-06-01 19:54:43 +020073 NFA_START_INVISIBLE_BEFORE,
Bram Moolenaara2947e22013-06-11 22:44:09 +020074 NFA_START_INVISIBLE_BEFORE_FIRST,
Bram Moolenaardecd9542013-06-07 16:31:50 +020075 NFA_START_INVISIBLE_BEFORE_NEG,
Bram Moolenaara2947e22013-06-11 22:44:09 +020076 NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
Bram Moolenaar87953742013-06-05 18:52:40 +020077 NFA_START_PATTERN,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020078 NFA_END_INVISIBLE,
Bram Moolenaardecd9542013-06-07 16:31:50 +020079 NFA_END_INVISIBLE_NEG,
Bram Moolenaar87953742013-06-05 18:52:40 +020080 NFA_END_PATTERN,
Bram Moolenaar63d9e732019-12-05 21:10:38 +010081 NFA_COMPOSING, // Next nodes in NFA are part of the
82 // composing multibyte char
83 NFA_END_COMPOSING, // End of a composing char in the NFA
84 NFA_ANY_COMPOSING, // \%C: Any composing characters.
85 NFA_OPT_CHARS, // \%[abc]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020086
Bram Moolenaar63d9e732019-12-05 21:10:38 +010087 // The following are used only in the postfix form, not in the NFA
88 NFA_PREV_ATOM_NO_WIDTH, // Used for \@=
89 NFA_PREV_ATOM_NO_WIDTH_NEG, // Used for \@!
90 NFA_PREV_ATOM_JUST_BEFORE, // Used for \@<=
91 NFA_PREV_ATOM_JUST_BEFORE_NEG, // Used for \@<!
92 NFA_PREV_ATOM_LIKE_PATTERN, // Used for \@>
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020093
Bram Moolenaar63d9e732019-12-05 21:10:38 +010094 NFA_BACKREF1, // \1
95 NFA_BACKREF2, // \2
96 NFA_BACKREF3, // \3
97 NFA_BACKREF4, // \4
98 NFA_BACKREF5, // \5
99 NFA_BACKREF6, // \6
100 NFA_BACKREF7, // \7
101 NFA_BACKREF8, // \8
102 NFA_BACKREF9, // \9
Bram Moolenaarefb23f22013-06-01 23:02:54 +0200103#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100104 NFA_ZREF1, // \z1
105 NFA_ZREF2, // \z2
106 NFA_ZREF3, // \z3
107 NFA_ZREF4, // \z4
108 NFA_ZREF5, // \z5
109 NFA_ZREF6, // \z6
110 NFA_ZREF7, // \z7
111 NFA_ZREF8, // \z8
112 NFA_ZREF9, // \z9
Bram Moolenaarefb23f22013-06-01 23:02:54 +0200113#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100114 NFA_SKIP, // Skip characters
Bram Moolenaar5714b802013-05-28 22:03:20 +0200115
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200116 NFA_MOPEN,
Bram Moolenaarefb23f22013-06-01 23:02:54 +0200117 NFA_MOPEN1,
118 NFA_MOPEN2,
119 NFA_MOPEN3,
120 NFA_MOPEN4,
121 NFA_MOPEN5,
122 NFA_MOPEN6,
123 NFA_MOPEN7,
124 NFA_MOPEN8,
125 NFA_MOPEN9,
126
127 NFA_MCLOSE,
128 NFA_MCLOSE1,
129 NFA_MCLOSE2,
130 NFA_MCLOSE3,
131 NFA_MCLOSE4,
132 NFA_MCLOSE5,
133 NFA_MCLOSE6,
134 NFA_MCLOSE7,
135 NFA_MCLOSE8,
136 NFA_MCLOSE9,
137
138#ifdef FEAT_SYN_HL
139 NFA_ZOPEN,
140 NFA_ZOPEN1,
141 NFA_ZOPEN2,
142 NFA_ZOPEN3,
143 NFA_ZOPEN4,
144 NFA_ZOPEN5,
145 NFA_ZOPEN6,
146 NFA_ZOPEN7,
147 NFA_ZOPEN8,
148 NFA_ZOPEN9,
149
150 NFA_ZCLOSE,
151 NFA_ZCLOSE1,
152 NFA_ZCLOSE2,
153 NFA_ZCLOSE3,
154 NFA_ZCLOSE4,
155 NFA_ZCLOSE5,
156 NFA_ZCLOSE6,
157 NFA_ZCLOSE7,
158 NFA_ZCLOSE8,
159 NFA_ZCLOSE9,
160#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200161
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100162 // NFA_FIRST_NL
163 NFA_ANY, // Match any one character.
164 NFA_IDENT, // Match identifier char
165 NFA_SIDENT, // Match identifier char but no digit
166 NFA_KWORD, // Match keyword char
167 NFA_SKWORD, // Match word char but no digit
168 NFA_FNAME, // Match file name char
169 NFA_SFNAME, // Match file name char but no digit
170 NFA_PRINT, // Match printable char
171 NFA_SPRINT, // Match printable char but no digit
172 NFA_WHITE, // Match whitespace char
173 NFA_NWHITE, // Match non-whitespace char
174 NFA_DIGIT, // Match digit char
175 NFA_NDIGIT, // Match non-digit char
176 NFA_HEX, // Match hex char
177 NFA_NHEX, // Match non-hex char
178 NFA_OCTAL, // Match octal char
179 NFA_NOCTAL, // Match non-octal char
180 NFA_WORD, // Match word char
181 NFA_NWORD, // Match non-word char
182 NFA_HEAD, // Match head char
183 NFA_NHEAD, // Match non-head char
184 NFA_ALPHA, // Match alpha char
185 NFA_NALPHA, // Match non-alpha char
186 NFA_LOWER, // Match lowercase char
187 NFA_NLOWER, // Match non-lowercase char
188 NFA_UPPER, // Match uppercase char
189 NFA_NUPPER, // Match non-uppercase char
190 NFA_LOWER_IC, // Match [a-z]
191 NFA_NLOWER_IC, // Match [^a-z]
192 NFA_UPPER_IC, // Match [A-Z]
193 NFA_NUPPER_IC, // Match [^A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200194
195 NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
196 NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,
Bram Moolenaar423532e2013-05-29 21:14:42 +0200197
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100198 NFA_CURSOR, // Match cursor pos
199 NFA_LNUM, // Match line number
200 NFA_LNUM_GT, // Match > line number
201 NFA_LNUM_LT, // Match < line number
202 NFA_COL, // Match cursor column
203 NFA_COL_GT, // Match > cursor column
204 NFA_COL_LT, // Match < cursor column
205 NFA_VCOL, // Match cursor virtual column
206 NFA_VCOL_GT, // Match > cursor virtual column
207 NFA_VCOL_LT, // Match < cursor virtual column
208 NFA_MARK, // Match mark
209 NFA_MARK_GT, // Match > mark
210 NFA_MARK_LT, // Match < mark
211 NFA_VISUAL, // Match Visual area
Bram Moolenaar423532e2013-05-29 21:14:42 +0200212
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100213 // Character classes [:alnum:] etc
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200214 NFA_CLASS_ALNUM,
215 NFA_CLASS_ALPHA,
216 NFA_CLASS_BLANK,
217 NFA_CLASS_CNTRL,
218 NFA_CLASS_DIGIT,
219 NFA_CLASS_GRAPH,
220 NFA_CLASS_LOWER,
221 NFA_CLASS_PRINT,
222 NFA_CLASS_PUNCT,
223 NFA_CLASS_SPACE,
224 NFA_CLASS_UPPER,
225 NFA_CLASS_XDIGIT,
226 NFA_CLASS_TAB,
227 NFA_CLASS_RETURN,
228 NFA_CLASS_BACKSPACE,
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100229 NFA_CLASS_ESCAPE,
230 NFA_CLASS_IDENT,
231 NFA_CLASS_KEYWORD,
232 NFA_CLASS_FNAME
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200233};
234
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100235// Keep in sync with classchars.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200236static int nfa_classcodes[] = {
237 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243 NFA_UPPER, NFA_NUPPER
244};
245
Bram Moolenaar0270f382018-07-17 05:43:58 +0200246// Variables only used in nfa_regcomp() and descendants.
247static int nfa_re_flags; // re_flags passed to nfa_regcomp()
248static int *post_start; // holds the postfix form of r.e.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200249static int *post_end;
250static int *post_ptr;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100251
252// Set when the pattern should use the NFA engine.
253// E.g. [[:upper:]] only allows 8bit characters for BT engine,
254// while NFA engine handles multibyte characters correctly.
255static int wants_nfa;
256
Bram Moolenaar0270f382018-07-17 05:43:58 +0200257static int nstate; // Number of states in the NFA.
258static int istate; // Index in the state vector, used in alloc_state()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200259
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100260// If not NULL match must end at this position
Bram Moolenaar307aa162013-06-02 16:34:21 +0200261static save_se_T *nfa_endp = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200262
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100263// 0 for first call to nfa_regmatch(), 1 for recursive call.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +0200264static int nfa_ll_index = 0;
265
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100266static int realloc_post_list(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100267static int nfa_reg(int paren);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200268#ifdef DEBUG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100269static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200270#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100271static int match_follows(nfa_state_T *startstate, int depth);
272static int failure_chance(nfa_state_T *state, int depth);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200273
// Helper used when doing re2post() ... regatom() parsing: append "c" to the
// postfix list, growing the list first when it is full.
// NOTE: this contains a "return FAIL" that is executed when growing the list
// fails, thus it can only be used in a function that returns OK or FAIL.
#define EMIT(c) \
    do \
    { \
        if (post_ptr >= post_end && realloc_post_list() == FAIL) \
            return FAIL; \
        *post_ptr++ = (c); \
    } while (0)
280
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200281/*
282 * Initialize internal variables before NFA compilation.
283 * Return OK on success, FAIL otherwise.
284 */
285 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100286nfa_regcomp_start(
287 char_u *expr,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100288 int re_flags) // see vim_regcomp()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200289{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200290 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200291 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200292
293 nstate = 0;
294 istate = 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295 // A reasonable estimation for maximum size
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200296 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200297
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100298 // Some items blow up in size, such as [A-z]. Add more space for that.
299 // When it is still not enough realloc_post_list() will be used.
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200300 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200301
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302 // Size for postfix representation of expr.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200303 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200304
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200305 post_start = alloc(postfix_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200306 if (post_start == NULL)
307 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200308 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200309 post_end = post_start + nstate_max;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100310 wants_nfa = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +0200311 rex.nfa_has_zend = FALSE;
312 rex.nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200313
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100314 // shared with BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200315 regcomp_start(expr, re_flags);
316
317 return OK;
318}
319
320/*
Bram Moolenaard89616e2013-06-06 18:46:06 +0200321 * Figure out if the NFA state list starts with an anchor, must match at start
322 * of the line.
323 */
324 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100325nfa_get_reganch(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200326{
327 nfa_state_T *p = start;
328
329 if (depth > 4)
330 return 0;
331
332 while (p != NULL)
333 {
334 switch (p->c)
335 {
336 case NFA_BOL:
337 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100338 return 1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200339
340 case NFA_ZSTART:
341 case NFA_ZEND:
342 case NFA_CURSOR:
343 case NFA_VISUAL:
344
345 case NFA_MOPEN:
346 case NFA_MOPEN1:
347 case NFA_MOPEN2:
348 case NFA_MOPEN3:
349 case NFA_MOPEN4:
350 case NFA_MOPEN5:
351 case NFA_MOPEN6:
352 case NFA_MOPEN7:
353 case NFA_MOPEN8:
354 case NFA_MOPEN9:
355 case NFA_NOPEN:
356#ifdef FEAT_SYN_HL
357 case NFA_ZOPEN:
358 case NFA_ZOPEN1:
359 case NFA_ZOPEN2:
360 case NFA_ZOPEN3:
361 case NFA_ZOPEN4:
362 case NFA_ZOPEN5:
363 case NFA_ZOPEN6:
364 case NFA_ZOPEN7:
365 case NFA_ZOPEN8:
366 case NFA_ZOPEN9:
367#endif
368 p = p->out;
369 break;
370
371 case NFA_SPLIT:
372 return nfa_get_reganch(p->out, depth + 1)
373 && nfa_get_reganch(p->out1, depth + 1);
374
375 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100376 return 0; // noooo
Bram Moolenaard89616e2013-06-06 18:46:06 +0200377 }
378 }
379 return 0;
380}
381
382/*
383 * Figure out if the NFA state list starts with a character which must match
384 * at start of the match.
385 */
386 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100387nfa_get_regstart(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200388{
389 nfa_state_T *p = start;
390
391 if (depth > 4)
392 return 0;
393
394 while (p != NULL)
395 {
396 switch (p->c)
397 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100398 // all kinds of zero-width matches
Bram Moolenaard89616e2013-06-06 18:46:06 +0200399 case NFA_BOL:
400 case NFA_BOF:
401 case NFA_BOW:
402 case NFA_EOW:
403 case NFA_ZSTART:
404 case NFA_ZEND:
405 case NFA_CURSOR:
406 case NFA_VISUAL:
407 case NFA_LNUM:
408 case NFA_LNUM_GT:
409 case NFA_LNUM_LT:
410 case NFA_COL:
411 case NFA_COL_GT:
412 case NFA_COL_LT:
413 case NFA_VCOL:
414 case NFA_VCOL_GT:
415 case NFA_VCOL_LT:
416 case NFA_MARK:
417 case NFA_MARK_GT:
418 case NFA_MARK_LT:
419
420 case NFA_MOPEN:
421 case NFA_MOPEN1:
422 case NFA_MOPEN2:
423 case NFA_MOPEN3:
424 case NFA_MOPEN4:
425 case NFA_MOPEN5:
426 case NFA_MOPEN6:
427 case NFA_MOPEN7:
428 case NFA_MOPEN8:
429 case NFA_MOPEN9:
430 case NFA_NOPEN:
431#ifdef FEAT_SYN_HL
432 case NFA_ZOPEN:
433 case NFA_ZOPEN1:
434 case NFA_ZOPEN2:
435 case NFA_ZOPEN3:
436 case NFA_ZOPEN4:
437 case NFA_ZOPEN5:
438 case NFA_ZOPEN6:
439 case NFA_ZOPEN7:
440 case NFA_ZOPEN8:
441 case NFA_ZOPEN9:
442#endif
443 p = p->out;
444 break;
445
446 case NFA_SPLIT:
447 {
448 int c1 = nfa_get_regstart(p->out, depth + 1);
449 int c2 = nfa_get_regstart(p->out1, depth + 1);
450
451 if (c1 == c2)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100452 return c1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200453 return 0;
454 }
455
456 default:
Bram Moolenaardecd9542013-06-07 16:31:50 +0200457 if (p->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100458 return p->c; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200459 return 0;
460 }
461 }
462 return 0;
463}
464
465/*
Bram Moolenaar473de612013-06-08 18:19:48 +0200466 * Figure out if the NFA state list contains just literal text and nothing
Bram Moolenaare7766ee2013-06-08 22:30:03 +0200467 * else. If so return a string in allocated memory with what must match after
468 * regstart. Otherwise return NULL.
Bram Moolenaar473de612013-06-08 18:19:48 +0200469 */
470 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100471nfa_get_match_text(nfa_state_T *start)
Bram Moolenaar473de612013-06-08 18:19:48 +0200472{
473 nfa_state_T *p = start;
474 int len = 0;
475 char_u *ret;
476 char_u *s;
477
478 if (p->c != NFA_MOPEN)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100479 return NULL; // just in case
Bram Moolenaar473de612013-06-08 18:19:48 +0200480 p = p->out;
481 while (p->c > 0)
482 {
483 len += MB_CHAR2LEN(p->c);
484 p = p->out;
485 }
486 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
487 return NULL;
488
489 ret = alloc(len);
490 if (ret != NULL)
491 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100492 p = start->out->out; // skip first char, it goes into regstart
Bram Moolenaar473de612013-06-08 18:19:48 +0200493 s = ret;
494 while (p->c > 0)
495 {
Bram Moolenaar473de612013-06-08 18:19:48 +0200496 if (has_mbyte)
497 s += (*mb_char2bytes)(p->c, s);
498 else
Bram Moolenaar473de612013-06-08 18:19:48 +0200499 *s++ = p->c;
500 p = p->out;
501 }
502 *s = NUL;
503 }
504 return ret;
505}
506
507/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200508 * Allocate more space for post_start. Called when
509 * running above the estimated number of states.
510 */
511 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100512realloc_post_list(void)
Bram Moolenaar16299b52013-05-30 18:45:23 +0200513{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200514 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100515 int new_max;
Bram Moolenaar16299b52013-05-30 18:45:23 +0200516 int *new_start;
517 int *old_start;
518
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100519 // For weird patterns the number of states can be very high. Increasing by
520 // 50% seems a reasonable compromise between memory use and speed.
521 new_max = nstate_max * 3 / 2;
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200522 new_start = ALLOC_MULT(int, new_max);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200523 if (new_start == NULL)
524 return FAIL;
525 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
Bram Moolenaar16299b52013-05-30 18:45:23 +0200526 old_start = post_start;
527 post_start = new_start;
528 post_ptr = new_start + (post_ptr - old_start);
529 post_end = post_start + new_max;
530 vim_free(old_start);
531 return OK;
532}
533
534/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200535 * Search between "start" and "end" and try to recognize a
536 * character class in expanded form. For example [0-9].
537 * On success, return the id the character class to be emitted.
538 * On failure, return 0 (=FAIL)
539 * Start points to the first char of the range, while end should point
540 * to the closing brace.
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200541 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
542 * need to be interpreted as [a-zA-Z].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200543 */
544 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100545nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200546{
Bram Moolenaarf8115092013-06-04 17:47:05 +0200547# define CLASS_not 0x80
548# define CLASS_af 0x40
549# define CLASS_AF 0x20
550# define CLASS_az 0x10
551# define CLASS_AZ 0x08
552# define CLASS_o7 0x04
553# define CLASS_o9 0x02
554# define CLASS_underscore 0x01
555
556 int newl = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200557 char_u *p;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200558 int config = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200559
560 if (extra_newl == TRUE)
561 newl = TRUE;
562
563 if (*end != ']')
564 return FAIL;
565 p = start;
566 if (*p == '^')
567 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200568 config |= CLASS_not;
Bram Moolenaar01d89dd2013-06-03 19:41:06 +0200569 p++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200570 }
571
572 while (p < end)
573 {
574 if (p + 2 < end && *(p + 1) == '-')
575 {
576 switch (*p)
577 {
578 case '0':
579 if (*(p + 2) == '9')
580 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200581 config |= CLASS_o9;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200582 break;
583 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200584 if (*(p + 2) == '7')
585 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200586 config |= CLASS_o7;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200587 break;
588 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200589 return FAIL;
590
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200591 case 'a':
592 if (*(p + 2) == 'z')
593 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200594 config |= CLASS_az;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200595 break;
596 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200597 if (*(p + 2) == 'f')
598 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200599 config |= CLASS_af;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200600 break;
601 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200602 return FAIL;
603
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200604 case 'A':
605 if (*(p + 2) == 'Z')
606 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200607 config |= CLASS_AZ;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200608 break;
609 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200610 if (*(p + 2) == 'F')
611 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200612 config |= CLASS_AF;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200613 break;
614 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200615 return FAIL;
616
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200617 default:
618 return FAIL;
619 }
620 p += 3;
621 }
622 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
623 {
624 newl = TRUE;
625 p += 2;
626 }
627 else if (*p == '_')
628 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200629 config |= CLASS_underscore;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200630 p ++;
631 }
632 else if (*p == '\n')
633 {
634 newl = TRUE;
635 p ++;
636 }
637 else
638 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100639 } // while (p < end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200640
641 if (p != end)
642 return FAIL;
643
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200644 if (newl == TRUE)
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200645 extra_newl = NFA_ADD_NL;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200646
647 switch (config)
648 {
649 case CLASS_o9:
650 return extra_newl + NFA_DIGIT;
651 case CLASS_not | CLASS_o9:
652 return extra_newl + NFA_NDIGIT;
653 case CLASS_af | CLASS_AF | CLASS_o9:
654 return extra_newl + NFA_HEX;
655 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
656 return extra_newl + NFA_NHEX;
657 case CLASS_o7:
658 return extra_newl + NFA_OCTAL;
659 case CLASS_not | CLASS_o7:
660 return extra_newl + NFA_NOCTAL;
661 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
662 return extra_newl + NFA_WORD;
663 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
664 return extra_newl + NFA_NWORD;
665 case CLASS_az | CLASS_AZ | CLASS_underscore:
666 return extra_newl + NFA_HEAD;
667 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
668 return extra_newl + NFA_NHEAD;
669 case CLASS_az | CLASS_AZ:
670 return extra_newl + NFA_ALPHA;
671 case CLASS_not | CLASS_az | CLASS_AZ:
672 return extra_newl + NFA_NALPHA;
673 case CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200674 return extra_newl + NFA_LOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200675 case CLASS_not | CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200676 return extra_newl + NFA_NLOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200677 case CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200678 return extra_newl + NFA_UPPER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200679 case CLASS_not | CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200680 return extra_newl + NFA_NUPPER_IC;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200681 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200682 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200683}
684
685/*
686 * Produce the bytes for equivalence class "c".
687 * Currently only handles latin1, latin9 and utf-8.
688 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
689 * equivalent to 'a OR b OR c'
690 *
691 * NOTE! When changing this function, also update reg_equi_class()
692 */
693 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100694nfa_emit_equi_class(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695{
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200696#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200697
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200698 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
699 || STRCMP(p_enc, "iso-8859-15") == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200700 {
Bram Moolenaar424bcae2022-01-31 14:59:41 +0000701#define A_grave 0xc0
702#define A_acute 0xc1
703#define A_circumflex 0xc2
704#define A_virguilla 0xc3
705#define A_diaeresis 0xc4
706#define A_ring 0xc5
707#define C_cedilla 0xc7
708#define E_grave 0xc8
709#define E_acute 0xc9
710#define E_circumflex 0xca
711#define E_diaeresis 0xcb
712#define I_grave 0xcc
713#define I_acute 0xcd
714#define I_circumflex 0xce
715#define I_diaeresis 0xcf
716#define N_virguilla 0xd1
717#define O_grave 0xd2
718#define O_acute 0xd3
719#define O_circumflex 0xd4
720#define O_virguilla 0xd5
721#define O_diaeresis 0xd6
722#define O_slash 0xd8
723#define U_grave 0xd9
724#define U_acute 0xda
725#define U_circumflex 0xdb
726#define U_diaeresis 0xdc
727#define Y_acute 0xdd
728#define a_grave 0xe0
729#define a_acute 0xe1
730#define a_circumflex 0xe2
731#define a_virguilla 0xe3
732#define a_diaeresis 0xe4
733#define a_ring 0xe5
734#define c_cedilla 0xe7
735#define e_grave 0xe8
736#define e_acute 0xe9
737#define e_circumflex 0xea
738#define e_diaeresis 0xeb
739#define i_grave 0xec
740#define i_acute 0xed
741#define i_circumflex 0xee
742#define i_diaeresis 0xef
743#define n_virguilla 0xf1
744#define o_grave 0xf2
745#define o_acute 0xf3
746#define o_circumflex 0xf4
747#define o_virguilla 0xf5
748#define o_diaeresis 0xf6
749#define o_slash 0xf8
750#define u_grave 0xf9
751#define u_acute 0xfa
752#define u_circumflex 0xfb
753#define u_diaeresis 0xfc
754#define y_acute 0xfd
755#define y_diaeresis 0xff
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200756 switch (c)
757 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200758 case 'A': case A_grave: case A_acute: case A_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200759 case A_virguilla: case A_diaeresis: case A_ring:
760 case 0x100: case 0x102: case 0x104: case 0x1cd:
761 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
762 case 0x202: case 0x226: case 0x23a: case 0x1e00:
763 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
764 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
765 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
766 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
767 EMIT2(A_circumflex) EMIT2(A_virguilla)
768 EMIT2(A_diaeresis) EMIT2(A_ring)
769 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
770 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
771 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
772 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
773 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
774 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
775 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
776 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200777 return OK;
778
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200779 case 'B': case 0x181: case 0x243: case 0x1e02:
780 case 0x1e04: case 0x1e06:
781 EMIT2('B')
782 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
783 EMIT2(0x1e04) EMIT2(0x1e06)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200784 return OK;
785
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200786 case 'C': case C_cedilla: case 0x106: case 0x108:
787 case 0x10a: case 0x10c: case 0x187: case 0x23b:
788 case 0x1e08: case 0xa792:
789 EMIT2('C') EMIT2(C_cedilla)
790 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
791 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
792 EMIT2(0x1e08) EMIT2(0xa792)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200793 return OK;
794
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200795 case 'D': case 0x10e: case 0x110: case 0x18a:
796 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
797 case 0x1e12:
798 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
799 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
800 EMIT2(0x1e10) EMIT2(0x1e12)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200801 return OK;
802
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200803 case 'E': case E_grave: case E_acute: case E_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200804 case E_diaeresis: case 0x112: case 0x114: case 0x116:
805 case 0x118: case 0x11a: case 0x204: case 0x206:
806 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
807 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
808 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
809 case 0x1ec2: case 0x1ec4: case 0x1ec6:
810 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
811 EMIT2(E_circumflex) EMIT2(E_diaeresis)
812 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
813 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
814 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
815 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
816 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
817 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
818 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
819 EMIT2(0x1ec6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200820 return OK;
821
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200822 case 'F': case 0x191: case 0x1e1e: case 0xa798:
823 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200824 return OK;
825
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200826 case 'G': case 0x11c: case 0x11e: case 0x120:
827 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
828 case 0x1f4: case 0x1e20: case 0xa7a0:
829 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
830 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
831 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
832 EMIT2(0xa7a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200833 return OK;
834
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200835 case 'H': case 0x124: case 0x126: case 0x21e:
836 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
837 case 0x1e2a: case 0x2c67:
838 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
839 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
840 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200841 return OK;
842
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200843 case 'I': case I_grave: case I_acute: case I_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200844 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
845 case 0x12e: case 0x130: case 0x197: case 0x1cf:
846 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
847 case 0x1ec8: case 0x1eca:
848 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
849 EMIT2(I_circumflex) EMIT2(I_diaeresis)
850 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
851 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
852 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
853 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
854 EMIT2(0x1eca)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200855 return OK;
856
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200857 case 'J': case 0x134: case 0x248:
858 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200859 return OK;
860
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200861 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
862 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
863 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
864 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
865 EMIT2(0x2c69) EMIT2(0xa740)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200866 return OK;
867
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200868 case 'L': case 0x139: case 0x13b: case 0x13d:
869 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
870 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
871 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
872 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
873 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
874 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200875 return OK;
876
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200877 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
878 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
879 EMIT2(0x1e42)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200880 return OK;
881
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200882 case 'N': case N_virguilla:
883 case 0x143: case 0x145: case 0x147: case 0x1f8:
884 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
885 case 0xa7a4:
886 EMIT2('N') EMIT2(N_virguilla)
887 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
888 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
889 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200890 return OK;
891
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200892 case 'O': case O_grave: case O_acute: case O_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200893 case O_virguilla: case O_diaeresis: case O_slash:
894 case 0x14c: case 0x14e: case 0x150: case 0x19f:
895 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
896 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
897 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
898 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
899 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
900 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
901 case 0x1ede: case 0x1ee0: case 0x1ee2:
902 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
903 EMIT2(O_circumflex) EMIT2(O_virguilla)
904 EMIT2(O_diaeresis) EMIT2(O_slash)
905 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
906 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
907 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
908 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
909 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
910 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
911 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
912 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
913 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
914 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
915 EMIT2(0x1ee2)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200916 return OK;
917
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200918 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
919 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
920 EMIT2(0x2c63)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200921 return OK;
922
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200923 case 'Q': case 0x24a:
924 EMIT2('Q') EMIT2(0x24a)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200925 return OK;
926
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200927 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
928 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
929 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
930 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
931 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
932 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
933 EMIT2(0xa7a6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200934 return OK;
935
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200936 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
937 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
938 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
939 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
940 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
941 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
942 EMIT2(0xa7a8)
943 return OK;
944
945 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
946 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
947 case 0x1e6e: case 0x1e70:
948 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
949 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
950 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200951 return OK;
952
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200953 case 'U': case U_grave: case U_acute: case U_diaeresis:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200954 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
955 case 0x16e: case 0x170: case 0x172: case 0x1af:
956 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
957 case 0x1db: case 0x214: case 0x216: case 0x244:
958 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
959 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
960 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
961 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
962 EMIT2(U_diaeresis) EMIT2(U_circumflex)
963 EMIT2(0x168) EMIT2(0x16a)
964 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
965 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
966 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
967 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
968 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
969 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
970 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
971 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
972 EMIT2(0x1ef0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200973 return OK;
974
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200975 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
976 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200977 return OK;
978
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200979 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
980 case 0x1e86: case 0x1e88:
981 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
982 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200983 return OK;
984
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 case 'X': case 0x1e8a: case 0x1e8c:
986 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200987 return OK;
988
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200989 case 'Y': case Y_acute: case 0x176: case 0x178:
990 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
991 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
992 EMIT2('Y') EMIT2(Y_acute)
993 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
994 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
995 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
996 EMIT2(0x1ef8)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200997 return OK;
998
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200999 case 'Z': case 0x179: case 0x17b: case 0x17d:
1000 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1001 case 0x2c6b:
1002 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1003 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1004 EMIT2(0x1e94) EMIT2(0x2c6b)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001005 return OK;
1006
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001007 case 'a': case a_grave: case a_acute: case a_circumflex:
1008 case a_virguilla: case a_diaeresis: case a_ring:
1009 case 0x101: case 0x103: case 0x105: case 0x1ce:
1010 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1011 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1012 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1013 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1014 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1015 case 0x1eb7: case 0x2c65:
1016 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1017 EMIT2(a_circumflex) EMIT2(a_virguilla)
1018 EMIT2(a_diaeresis) EMIT2(a_ring)
1019 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1020 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1021 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1022 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1023 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1024 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1025 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1026 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1027 EMIT2(0x1eb7) EMIT2(0x2c65)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001028 return OK;
1029
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001030 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1031 case 0x1e03: case 0x1e05: case 0x1e07:
1032 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1033 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001034 return OK;
1035
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001036 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1037 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1038 case 0xa794:
1039 EMIT2('c') EMIT2(c_cedilla)
1040 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1041 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1042 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001043 return OK;
1044
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001045 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1046 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1047 case 0x1e11: case 0x1e13:
1048 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1049 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1050 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1051 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001052 return OK;
1053
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001054 case 'e': case e_grave: case e_acute: case e_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001055 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1056 case 0x119: case 0x11b: case 0x205: case 0x207:
1057 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1058 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1059 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1060 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1061 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1062 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1063 EMIT2(0x113) EMIT2(0x115)
1064 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1065 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1066 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1067 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1068 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1069 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1070 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001071 return OK;
1072
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001073 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1074 case 0x1e1f: case 0xa799:
1075 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1076 EMIT2(0x1e1f) EMIT2(0xa799)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001077 return OK;
1078
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001079 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1080 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1081 case 0x1e21: case 0xa7a1:
1082 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1083 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1084 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1085 EMIT2(0x1e21) EMIT2(0xa7a1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001086 return OK;
1087
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001088 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1089 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1090 case 0x1e96: case 0x2c68: case 0xa795:
1091 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1092 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1093 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1094 EMIT2(0x2c68) EMIT2(0xa795)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001095 return OK;
1096
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001097 case 'i': case i_grave: case i_acute: case i_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001098 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1099 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1100 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1101 case 0x1ec9: case 0x1ecb:
1102 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1103 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1104 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1105 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1106 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1107 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1108 EMIT2(0x1ecb) EMIT2(0x1ecb)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001109 return OK;
1110
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001111 case 'j': case 0x135: case 0x1f0: case 0x249:
1112 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001113 return OK;
1114
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001115 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1116 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1117 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1118 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1119 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001120 return OK;
1121
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001122 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1123 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1124 case 0x1e3d: case 0x2c61:
1125 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1126 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1127 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1128 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001129 return OK;
1130
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001131 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1132 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1133 EMIT2(0x1e41) EMIT2(0x1e43)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001134 return OK;
1135
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001136 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1137 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1138 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1139 EMIT2('n') EMIT2(n_virguilla)
1140 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1141 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1142 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1143 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001144 return OK;
1145
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001146 case 'o': case o_grave: case o_acute: case o_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001147 case o_virguilla: case o_diaeresis: case o_slash:
1148 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1149 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1150 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1151 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1152 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1153 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1154 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1155 case 0x1edf: case 0x1ee1: case 0x1ee3:
1156 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1157 EMIT2(o_circumflex) EMIT2(o_virguilla)
1158 EMIT2(o_diaeresis) EMIT2(o_slash)
1159 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1160 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1161 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1162 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1163 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1164 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1165 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1166 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1167 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1168 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1169 EMIT2(0x1ee3)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001170 return OK;
1171
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001172 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1173 case 0x1e55: case 0x1e57:
1174 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1175 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001176 return OK;
1177
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001178 case 'q': case 0x24b: case 0x2a0:
1179 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001180 return OK;
1181
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001182 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1183 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1184 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1185 case 0xa7a7:
1186 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1187 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1188 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1189 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001190 return OK;
1191
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001192 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1193 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1194 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1195 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1196 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1197 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1198 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1199 return OK;
1200
1201 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1202 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1203 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1204 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1205 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1206 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1207 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001208 return OK;
1209
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001210 case 'u': case u_grave: case u_acute: case u_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001211 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1212 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1213 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1214 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1215 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1216 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1217 case 0x1eed: case 0x1eef: case 0x1ef1:
1218 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1219 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1220 EMIT2(0x169) EMIT2(0x16b)
1221 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1222 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1223 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1224 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1225 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1226 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1227 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1228 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1229 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001230 return OK;
1231
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001232 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1233 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1234 EMIT2(0x1e7f)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001235 return OK;
1236
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001237 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1238 case 0x1e87: case 0x1e89: case 0x1e98:
1239 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1240 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001241 return OK;
1242
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001243 case 'x': case 0x1e8b: case 0x1e8d:
1244 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001245 return OK;
1246
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001247 case 'y': case y_acute: case y_diaeresis: case 0x177:
1248 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1249 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1250 case 0x1ef9:
1251 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1252 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1253 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1254 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001255 return OK;
1256
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001257 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1258 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1259 case 0x1e95: case 0x2c6c:
1260 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1261 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1262 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001263 return OK;
1264
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001265 // default: character itself
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001266 }
1267 }
1268
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001269 EMIT2(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001270 return OK;
1271#undef EMIT2
1272}
1273
1274/*
1275 * Code to parse a regular expression.
1276 *
1277 * We try to reuse parsing functions in regexp.c to
1278 * minimize surprise and keep the syntax consistent.
1279 */
1280
1281/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001282 * Parse the lowest level.
1283 *
1284 * An atom can be one of a long list of items. Many atoms match one character
1285 * in the text. It is often an ordinary character or a character class.
1286 * Parentheses can be used to make a pattern into an atom. The "\z(\)"
1287 * construct is only for syntax highlighting.
1288 *
1289 * atom ::= ordinary-atom
1290 * or \( pattern \)
1291 * or \%( pattern \)
1292 * or \z( pattern \)
1293 */
1294 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001295nfa_regatom(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001296{
1297 int c;
1298 int charclass;
1299 int equiclass;
1300 int collclass;
1301 int got_coll_char;
1302 char_u *p;
1303 char_u *endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001304 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001305 int extra = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001306 int emit_range;
1307 int negated;
1308 int result;
1309 int startc = -1;
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001310 int save_prev_at_start = prev_at_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001311
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001312 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001313 switch (c)
1314 {
Bram Moolenaar47196582013-05-25 22:04:23 +02001315 case NUL:
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001316 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar47196582013-05-25 22:04:23 +02001317
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001318 case Magic('^'):
1319 EMIT(NFA_BOL);
1320 break;
1321
1322 case Magic('$'):
1323 EMIT(NFA_EOL);
1324#if defined(FEAT_SYN_HL) || defined(PROTO)
1325 had_eol = TRUE;
1326#endif
1327 break;
1328
1329 case Magic('<'):
1330 EMIT(NFA_BOW);
1331 break;
1332
1333 case Magic('>'):
1334 EMIT(NFA_EOW);
1335 break;
1336
1337 case Magic('_'):
1338 c = no_Magic(getchr());
Bram Moolenaar174a8482013-11-28 14:20:17 +01001339 if (c == NUL)
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001340 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar174a8482013-11-28 14:20:17 +01001341
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001342 if (c == '^') // "\_^" is start-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001343 {
1344 EMIT(NFA_BOL);
1345 break;
1346 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001347 if (c == '$') // "\_$" is end-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001348 {
1349 EMIT(NFA_EOL);
1350#if defined(FEAT_SYN_HL) || defined(PROTO)
1351 had_eol = TRUE;
1352#endif
1353 break;
1354 }
1355
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001356 extra = NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001357
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001358 // "\_[" is collection plus newline
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001359 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001360 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001361
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001362 // "\_x" is character class plus newline
1363 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001364
1365 /*
1366 * Character classes.
1367 */
1368 case Magic('.'):
1369 case Magic('i'):
1370 case Magic('I'):
1371 case Magic('k'):
1372 case Magic('K'):
1373 case Magic('f'):
1374 case Magic('F'):
1375 case Magic('p'):
1376 case Magic('P'):
1377 case Magic('s'):
1378 case Magic('S'):
1379 case Magic('d'):
1380 case Magic('D'):
1381 case Magic('x'):
1382 case Magic('X'):
1383 case Magic('o'):
1384 case Magic('O'):
1385 case Magic('w'):
1386 case Magic('W'):
1387 case Magic('h'):
1388 case Magic('H'):
1389 case Magic('a'):
1390 case Magic('A'):
1391 case Magic('l'):
1392 case Magic('L'):
1393 case Magic('u'):
1394 case Magic('U'):
1395 p = vim_strchr(classchars, no_Magic(c));
1396 if (p == NULL)
1397 {
Bram Moolenaar174a8482013-11-28 14:20:17 +01001398 if (extra == NFA_ADD_NL)
1399 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001400 semsg(_(e_nfa_regexp_invalid_character_class_nr), c);
Bram Moolenaar174a8482013-11-28 14:20:17 +01001401 rc_did_emsg = TRUE;
1402 return FAIL;
1403 }
Bram Moolenaarb5443cc2019-01-15 20:19:40 +01001404 siemsg("INTERNAL: Unknown character class char: %d", c);
Bram Moolenaar5714b802013-05-28 22:03:20 +02001405 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001406 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001407
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001408 // When '.' is followed by a composing char ignore the dot, so that
1409 // the composing char is matched here.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001410 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1411 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001412 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001413 c = getchr();
1414 goto nfa_do_multibyte;
1415 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001416 EMIT(nfa_classcodes[p - classchars]);
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001417 if (extra == NFA_ADD_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001418 {
1419 EMIT(NFA_NEWL);
1420 EMIT(NFA_OR);
1421 regflags |= RF_HASNL;
1422 }
1423 break;
1424
1425 case Magic('n'):
1426 if (reg_string)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001427 // In a string "\n" matches a newline character.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001428 EMIT(NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001429 else
1430 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001431 // In buffer text "\n" matches the end of a line.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001432 EMIT(NFA_NEWL);
1433 regflags |= RF_HASNL;
1434 }
1435 break;
1436
1437 case Magic('('):
1438 if (nfa_reg(REG_PAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001439 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001440 break;
1441
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001442 case Magic('|'):
1443 case Magic('&'):
1444 case Magic(')'):
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001445 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001446 return FAIL;
1447
1448 case Magic('='):
1449 case Magic('?'):
1450 case Magic('+'):
1451 case Magic('@'):
1452 case Magic('*'):
1453 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001454 // these should follow an atom, not form an atom
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001455 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001456 return FAIL;
1457
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001458 case Magic('~'):
1459 {
1460 char_u *lp;
1461
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001462 // Previous substitute pattern.
1463 // Generated as "\%(pattern\)".
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001464 if (reg_prev_sub == NULL)
1465 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001466 emsg(_(e_no_previous_substitute_regular_expression));
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001467 return FAIL;
1468 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001469 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001470 {
1471 EMIT(PTR2CHAR(lp));
1472 if (lp != reg_prev_sub)
1473 EMIT(NFA_CONCAT);
1474 }
1475 EMIT(NFA_NOPEN);
1476 break;
1477 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001478
Bram Moolenaar428e9872013-05-30 17:05:39 +02001479 case Magic('1'):
1480 case Magic('2'):
1481 case Magic('3'):
1482 case Magic('4'):
1483 case Magic('5'):
1484 case Magic('6'):
1485 case Magic('7'):
1486 case Magic('8'):
1487 case Magic('9'):
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001488 {
1489 int refnum = no_Magic(c) - '1';
1490
1491 if (!seen_endbrace(refnum + 1))
1492 return FAIL;
1493 EMIT(NFA_BACKREF1 + refnum);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001494 rex.nfa_has_backref = TRUE;
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001495 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02001496 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001497
1498 case Magic('z'):
1499 c = no_Magic(getchr());
1500 switch (c)
1501 {
1502 case 's':
1503 EMIT(NFA_ZSTART);
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001504 if (re_mult_next("\\zs") == FAIL)
1505 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001506 break;
1507 case 'e':
1508 EMIT(NFA_ZEND);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001509 rex.nfa_has_zend = TRUE;
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001510 if (re_mult_next("\\ze") == FAIL)
1511 return FAIL;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02001512 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001513#ifdef FEAT_SYN_HL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001514 case '1':
1515 case '2':
1516 case '3':
1517 case '4':
1518 case '5':
1519 case '6':
1520 case '7':
1521 case '8':
1522 case '9':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001523 // \z1...\z9
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001524 if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001525 EMSG_RET_FAIL(_(e_z1_z9_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001526 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001527 // No need to set rex.nfa_has_backref, the sub-matches don't
1528 // change when \z1 .. \z9 matches or not.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001529 re_has_z = REX_USE;
1530 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001531 case '(':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001532 // \z(
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001533 if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001534 EMSG_RET_FAIL(_(e_z_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001535 if (nfa_reg(REG_ZPAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001536 return FAIL; // cascaded error
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001537 re_has_z = REX_SET;
1538 break;
1539#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001540 default:
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001541 semsg(_(e_nfa_regexp_unknown_operator_z_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001542 return FAIL;
1543 }
1544 break;
1545
1546 case Magic('%'):
1547 c = no_Magic(getchr());
1548 switch (c)
1549 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001550 // () without a back reference
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001551 case '(':
1552 if (nfa_reg(REG_NPAREN) == FAIL)
1553 return FAIL;
1554 EMIT(NFA_NOPEN);
1555 break;
1556
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001557 case 'd': // %d123 decimal
1558 case 'o': // %o123 octal
1559 case 'x': // %xab hex 2
1560 case 'u': // %uabcd hex 4
1561 case 'U': // %U1234abcd hex 8
Bram Moolenaar47196582013-05-25 22:04:23 +02001562 {
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001563 long nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001564
Bram Moolenaar47196582013-05-25 22:04:23 +02001565 switch (c)
1566 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001567 case 'd': nr = getdecchrs(); break;
1568 case 'o': nr = getoctchrs(); break;
1569 case 'x': nr = gethexchrs(2); break;
1570 case 'u': nr = gethexchrs(4); break;
1571 case 'U': nr = gethexchrs(8); break;
1572 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +02001573 }
1574
Bram Moolenaar527a2d82019-02-21 22:28:51 +01001575 if (nr < 0 || nr > INT_MAX)
Bram Moolenaara6f79292022-01-04 21:30:47 +00001576 EMSG2_RET_FAIL(_(e_invalid_character_after_str_2),
1577 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001578 // A NUL is stored in the text as NL
1579 // TODO: what if a composing character follows?
Bram Moolenaar595cad22013-09-22 13:57:24 +02001580 EMIT(nr == 0 ? 0x0a : nr);
Bram Moolenaar47196582013-05-25 22:04:23 +02001581 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001582 break;
1583
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001584 // Catch \%^ and \%$ regardless of where they appear in the
1585 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001586 case '^':
1587 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001588 break;
1589
1590 case '$':
1591 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001592 break;
1593
1594 case '#':
Bram Moolenaar423532e2013-05-29 21:14:42 +02001595 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001596 break;
1597
1598 case 'V':
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001599 EMIT(NFA_VISUAL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001600 break;
1601
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02001602 case 'C':
1603 EMIT(NFA_ANY_COMPOSING);
1604 break;
1605
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001606 case '[':
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001607 {
1608 int n;
1609
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001610 // \%[abc]
Bram Moolenaard7986252013-06-17 21:33:41 +02001611 for (n = 0; (c = peekchr()) != ']'; ++n)
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001612 {
1613 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001614 EMSG2_RET_FAIL(_(e_missing_sb_after_str),
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001615 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001616 // recursive call!
Bram Moolenaard7986252013-06-17 21:33:41 +02001617 if (nfa_regatom() == FAIL)
1618 return FAIL;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001619 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001620 getchr(); // get the ]
Bram Moolenaar2976c022013-06-05 21:30:37 +02001621 if (n == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001622 EMSG2_RET_FAIL(_(e_empty_str_brackets),
Bram Moolenaar2976c022013-06-05 21:30:37 +02001623 reg_magic == MAGIC_ALL);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001624 EMIT(NFA_OPT_CHARS);
1625 EMIT(n);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001626
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001627 // Emit as "\%(\%[abc]\)" to be able to handle
1628 // "\%[abc]*" which would cause the empty string to be
1629 // matched an unlimited number of times. NFA_NOPEN is
1630 // added only once at a position, while NFA_SPLIT is
1631 // added multiple times. This is more efficient than
1632 // not allowing NFA_SPLIT multiple times, it is used
1633 // a lot.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001634 EMIT(NFA_NOPEN);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001635 break;
1636 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02001637
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001638 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +02001639 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001640 long_u n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001641 int cmp = c;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001642 int cur = FALSE;
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001643 int got_digit = FALSE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001644
1645 if (c == '<' || c == '>')
1646 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001647 if (no_Magic(c) == '.')
1648 {
1649 cur = TRUE;
1650 c = getchr();
1651 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001652 while (VIM_ISDIGIT(c))
1653 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001654 long_u tmp;
1655
1656 if (cur)
Bram Moolenaarb10ff5c2022-03-19 11:31:38 +00001657 {
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001658 semsg(_(e_regexp_number_after_dot_pos_search_chr),
Bram Moolenaarb10ff5c2022-03-19 11:31:38 +00001659 no_Magic(c));
1660 return FAIL;
1661 }
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001662 tmp = n * 10 + (c - '0');
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001663
1664 if (tmp < n)
1665 {
1666 // overflow.
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001667 emsg(_(e_percent_value_too_large));
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001668 return FAIL;
1669 }
1670 n = tmp;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001671 c = getchr();
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001672 got_digit = TRUE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001673 }
1674 if (c == 'l' || c == 'c' || c == 'v')
1675 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001676 long_u limit = INT_MAX;
Bram Moolenaar9403a212019-02-13 18:35:06 +01001677
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001678 if (!cur && !got_digit)
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001679 {
1680 semsg(_(e_nfa_regexp_missing_value_in_chr),
1681 no_Magic(c));
1682 return FAIL;
1683 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001684 if (c == 'l')
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001685 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001686 if (cur)
1687 n = curwin->w_cursor.lnum;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001688 // \%{n}l \%{n}<l \%{n}>l
Bram Moolenaar423532e2013-05-29 21:14:42 +02001689 EMIT(cmp == '<' ? NFA_LNUM_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001690 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001691 if (save_prev_at_start)
1692 at_start = TRUE;
1693 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001694 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001695 {
1696 if (cur)
1697 {
1698 n = curwin->w_cursor.col;
1699 n++;
1700 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001701 // \%{n}c \%{n}<c \%{n}>c
Bram Moolenaar423532e2013-05-29 21:14:42 +02001702 EMIT(cmp == '<' ? NFA_COL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001703 cmp == '>' ? NFA_COL_GT : NFA_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001704 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001705 else
Bram Moolenaar9403a212019-02-13 18:35:06 +01001706 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001707 if (cur)
1708 {
1709 colnr_T vcol = 0;
1710
1711 getvvcol(curwin, &curwin->w_cursor,
1712 NULL, NULL, &vcol);
1713 n = ++vcol;
1714 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001715 // \%{n}v \%{n}<v \%{n}>v
Bram Moolenaar423532e2013-05-29 21:14:42 +02001716 EMIT(cmp == '<' ? NFA_VCOL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001717 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
Bram Moolenaar9403a212019-02-13 18:35:06 +01001718 limit = INT_MAX / MB_MAXBYTES;
1719 }
1720 if (n >= limit)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001721 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001722 emsg(_(e_percent_value_too_large));
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001723 return FAIL;
1724 }
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001725 EMIT((int)n);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001726 break;
1727 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02001728 else if (c == '\'' && n == 0)
1729 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001730 // \%'m \%<'m \%>'m
Bram Moolenaar044aa292013-06-04 21:27:38 +02001731 EMIT(cmp == '<' ? NFA_MARK_LT :
1732 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001733 EMIT(getchr());
Bram Moolenaar044aa292013-06-04 21:27:38 +02001734 break;
1735 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001736 }
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001737 semsg(_(e_nfa_regexp_unknown_operator_percent_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001738 return FAIL;
1739 }
1740 break;
1741
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001742 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001743collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001744 /*
Bram Moolenaar417bad22013-06-07 14:08:30 +02001745 * [abc] uses NFA_START_COLL - NFA_END_COLL
1746 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1747 * Each character is produced as a regular state, using
1748 * NFA_CONCAT to bind them together.
1749 * Besides normal characters there can be:
1750 * - character classes NFA_CLASS_*
1751 * - ranges, two characters followed by NFA_RANGE.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001752 */
1753
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001754 p = regparse;
1755 endp = skip_anyof(p);
1756 if (*endp == ']')
1757 {
1758 /*
1759 * Try to reverse engineer character classes. For example,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001760 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001761 * and perform the necessary substitutions in the NFA.
1762 */
1763 result = nfa_recognize_char_class(regparse, endp,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001764 extra == NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001765 if (result != FAIL)
1766 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001767 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001768 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001769 EMIT(result - NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001770 EMIT(NFA_NEWL);
1771 EMIT(NFA_OR);
1772 }
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001773 else
1774 EMIT(result);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001775 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001776 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001777 return OK;
1778 }
1779 /*
1780 * Failed to recognize a character class. Use the simple
1781 * version that turns [abc] into 'a' OR 'b' OR 'c'
1782 */
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001783 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001784 negated = FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001785 if (*regparse == '^') // negated range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001786 {
1787 negated = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001788 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001789 EMIT(NFA_START_NEG_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001790 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001791 else
1792 EMIT(NFA_START_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001793 if (*regparse == '-')
1794 {
1795 startc = '-';
1796 EMIT(startc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001797 EMIT(NFA_CONCAT);
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001798 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001799 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001800 // Emit the OR branches for each character in the []
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001801 emit_range = FALSE;
1802 while (regparse < endp)
1803 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001804 int oldstartc = startc;
1805
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001806 startc = -1;
1807 got_coll_char = FALSE;
1808 if (*regparse == '[')
1809 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001810 // Check for [: :], [= =], [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001811 equiclass = collclass = 0;
1812 charclass = get_char_class(&regparse);
1813 if (charclass == CLASS_NONE)
1814 {
1815 equiclass = get_equi_class(&regparse);
1816 if (equiclass == 0)
1817 collclass = get_coll_element(&regparse);
1818 }
1819
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001820 // Character class like [:alpha:]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001821 if (charclass != CLASS_NONE)
1822 {
1823 switch (charclass)
1824 {
1825 case CLASS_ALNUM:
1826 EMIT(NFA_CLASS_ALNUM);
1827 break;
1828 case CLASS_ALPHA:
1829 EMIT(NFA_CLASS_ALPHA);
1830 break;
1831 case CLASS_BLANK:
1832 EMIT(NFA_CLASS_BLANK);
1833 break;
1834 case CLASS_CNTRL:
1835 EMIT(NFA_CLASS_CNTRL);
1836 break;
1837 case CLASS_DIGIT:
1838 EMIT(NFA_CLASS_DIGIT);
1839 break;
1840 case CLASS_GRAPH:
1841 EMIT(NFA_CLASS_GRAPH);
1842 break;
1843 case CLASS_LOWER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001844 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001845 EMIT(NFA_CLASS_LOWER);
1846 break;
1847 case CLASS_PRINT:
1848 EMIT(NFA_CLASS_PRINT);
1849 break;
1850 case CLASS_PUNCT:
1851 EMIT(NFA_CLASS_PUNCT);
1852 break;
1853 case CLASS_SPACE:
1854 EMIT(NFA_CLASS_SPACE);
1855 break;
1856 case CLASS_UPPER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001857 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001858 EMIT(NFA_CLASS_UPPER);
1859 break;
1860 case CLASS_XDIGIT:
1861 EMIT(NFA_CLASS_XDIGIT);
1862 break;
1863 case CLASS_TAB:
1864 EMIT(NFA_CLASS_TAB);
1865 break;
1866 case CLASS_RETURN:
1867 EMIT(NFA_CLASS_RETURN);
1868 break;
1869 case CLASS_BACKSPACE:
1870 EMIT(NFA_CLASS_BACKSPACE);
1871 break;
1872 case CLASS_ESCAPE:
1873 EMIT(NFA_CLASS_ESCAPE);
1874 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001875 case CLASS_IDENT:
1876 EMIT(NFA_CLASS_IDENT);
1877 break;
1878 case CLASS_KEYWORD:
1879 EMIT(NFA_CLASS_KEYWORD);
1880 break;
1881 case CLASS_FNAME:
1882 EMIT(NFA_CLASS_FNAME);
1883 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001884 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001885 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001886 continue;
1887 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001888 // Try equivalence class [=a=] and the like
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001889 if (equiclass != 0)
1890 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02001891 result = nfa_emit_equi_class(equiclass);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001892 if (result == FAIL)
1893 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001894 // should never happen
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001895 EMSG_RET_FAIL(_(e_error_building_nfa_with_equivalence_class));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001896 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001897 continue;
1898 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001899 // Try collating class like [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001900 if (collclass != 0)
1901 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001902 startc = collclass; // allow [.a.]-x as a range
1903 // Will emit the proper atom at the end of the
1904 // while loop.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001905 }
1906 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001907 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1908 // start character.
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001909 if (*regparse == '-' && oldstartc != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001910 {
1911 emit_range = TRUE;
1912 startc = oldstartc;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001913 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001914 continue; // reading the end of the range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001915 }
1916
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001917 // Now handle simple and escaped characters.
1918 // Only "\]", "\^", "\-" and "\\" are special in Vi. Vim
1919 // accepts "\t", "\e", etc., but only when the 'l' flag in
1920 // 'cpoptions' is not included.
1921 // Posix doesn't recognize backslash at all.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001922 if (*regparse == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001923 && !reg_cpo_bsl
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001924 && regparse + 1 <= endp
1925 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001926 || (!reg_cpo_lit
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001927 && vim_strchr(REGEXP_ABBR, regparse[1])
1928 != NULL)
1929 )
1930 )
1931 {
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001932 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001933
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001934 if (*regparse == 'n')
Bram Moolenaara5483442019-02-17 20:17:02 +01001935 startc = (reg_string || emit_range
1936 || regparse[1] == '-') ? NL : NFA_NEWL;
Bram Moolenaarabab0b02019-03-30 18:47:01 +01001937 else if (*regparse == 'd'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001938 || *regparse == 'o'
1939 || *regparse == 'x'
1940 || *regparse == 'u'
1941 || *regparse == 'U'
1942 )
1943 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001944 // TODO(RE) This needs more testing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001945 startc = coll_get_char();
1946 got_coll_char = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001947 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001948 }
1949 else
1950 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001951 // \r,\t,\e,\b
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001952 startc = backslash_trans(*regparse);
1953 }
1954 }
1955
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001956 // Normal printable char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001957 if (startc == -1)
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001958 startc = PTR2CHAR(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001959
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001960 // Previous char was '-', so this char is end of range.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001961 if (emit_range)
1962 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001963 int endc = startc;
1964
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001965 startc = oldstartc;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001966 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001967 EMSG_RET_FAIL(_(e_reverse_range_in_character_class));
Bram Moolenaar417bad22013-06-07 14:08:30 +02001968
1969 if (endc > startc + 2)
1970 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001971 // Emit a range instead of the sequence of
1972 // individual characters.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001973 if (startc == 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001974 // \x00 is translated to \x0a, start at \x01.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001975 EMIT(1);
1976 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001977 --post_ptr; // remove NFA_CONCAT
Bram Moolenaar417bad22013-06-07 14:08:30 +02001978 EMIT(endc);
1979 EMIT(NFA_RANGE);
1980 EMIT(NFA_CONCAT);
1981 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001982 else if (has_mbyte && ((*mb_char2len)(startc) > 1
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001983 || (*mb_char2len)(endc) > 1))
1984 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001985 // Emit the characters in the range.
1986 // "startc" was already emitted, so skip it.
1987 //
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001988 for (c = startc + 1; c <= endc; c++)
1989 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001990 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001991 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001992 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001993 }
1994 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001995 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001996 // Emit the range. "startc" was already emitted, so
1997 // skip it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001998 for (c = startc + 1; c <= endc; c++)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00001999 {
2000 EMIT(c);
2001 EMIT(NFA_CONCAT);
2002 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002003 }
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002004 emit_range = FALSE;
2005 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002006 }
2007 else
2008 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002009 // This char (startc) is not part of a range. Just
2010 // emit it.
2011 // Normally, simply emit startc. But if we get char
2012 // code=0 from a collating char, then replace it with
2013 // 0x0a.
2014 // This is needed to completely mimic the behaviour of
2015 // the backtracking engine.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002016 if (startc == NFA_NEWL)
2017 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002018 // Line break can't be matched as part of the
2019 // collection, add an OR below. But not for negated
2020 // range.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002021 if (!negated)
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002022 extra = NFA_ADD_NL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002023 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002024 else
Bram Moolenaar417bad22013-06-07 14:08:30 +02002025 {
2026 if (got_coll_char == TRUE && startc == 0)
2027 EMIT(0x0a);
2028 else
2029 EMIT(startc);
2030 EMIT(NFA_CONCAT);
2031 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002032 }
2033
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002034 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002035 } // while (regparse < endp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002036
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002037 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002038 if (*regparse == '-') // if last, '-' is just a char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002039 {
2040 EMIT('-');
Bram Moolenaar417bad22013-06-07 14:08:30 +02002041 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002042 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002043
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002044 // skip the trailing ]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002045 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002046 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002047
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002048 // Mark end of the collection.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002049 if (negated == TRUE)
Bram Moolenaar417bad22013-06-07 14:08:30 +02002050 EMIT(NFA_END_NEG_COLL);
2051 else
2052 EMIT(NFA_END_COLL);
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002053
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002054 // \_[] also matches \n but it's not negated
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002055 if (extra == NFA_ADD_NL)
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002056 {
2057 EMIT(reg_string ? NL : NFA_NEWL);
2058 EMIT(NFA_OR);
2059 }
2060
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002061 return OK;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002062 } // if exists closing ]
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002063
2064 if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002065 EMSG_RET_FAIL(_(e_missing_rsb_after_str_lsb));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002066 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002067
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002068 default:
2069 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002070 int plen;
2071
2072nfa_do_multibyte:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002073 // plen is length of current char with composing chars
Bram Moolenaar47196582013-05-25 22:04:23 +02002074 if (enc_utf8 && ((*mb_char2len)(c)
Bram Moolenaarace95982017-03-29 17:30:27 +02002075 != (plen = utfc_ptr2len(old_regparse))
Bram Moolenaar47196582013-05-25 22:04:23 +02002076 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002077 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02002078 int i = 0;
2079
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002080 // A base character plus composing characters, or just one
2081 // or more composing characters.
2082 // This requires creating a separate atom as if enclosing
2083 // the characters in (), where NFA_COMPOSING is the ( and
2084 // NFA_END_COMPOSING is the ). Note that right now we are
2085 // building the postfix form, not the NFA itself;
2086 // a composing char could be: a, b, c, NFA_COMPOSING
2087 // where 'b' and 'c' are chars with codes > 256.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002088 for (;;)
2089 {
2090 EMIT(c);
2091 if (i > 0)
2092 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002093 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002094 break;
2095 c = utf_ptr2char(old_regparse + i);
2096 }
2097 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002098 regparse = old_regparse + plen;
2099 }
2100 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002101 {
2102 c = no_Magic(c);
2103 EMIT(c);
2104 }
2105 return OK;
2106 }
2107 }
2108
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002109 return OK;
2110}
2111
2112/*
2113 * Parse something followed by possible [*+=].
2114 *
2115 * A piece is an atom, possibly followed by a multi, an indication of how many
2116 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2117 * characters: "", "a", "aa", etc.
2118 *
2119 * piece ::= atom
2120 * or atom multi
2121 */
2122 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002123nfa_regpiece(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002124{
2125 int i;
2126 int op;
2127 int ret;
2128 long minval, maxval;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002129 int greedy = TRUE; // Braces are prefixed with '-' ?
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002130 parse_state_T old_state;
2131 parse_state_T new_state;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01002132 long c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02002133 int old_post_pos;
2134 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002135 int quest;
2136
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002137 // Save the current parse state, so that we can use it if <atom>{m,n} is
2138 // next.
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002139 save_parse_state(&old_state);
2140
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002141 // store current pos in the postfix form, for \{m,n} involving 0s
Bram Moolenaar16299b52013-05-30 18:45:23 +02002142 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002143
2144 ret = nfa_regatom();
2145 if (ret == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002146 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002147
2148 op = peekchr();
2149 if (re_multi_type(op) == NOT_MULTI)
2150 return OK;
2151
2152 skipchr();
2153 switch (op)
2154 {
2155 case Magic('*'):
2156 EMIT(NFA_STAR);
2157 break;
2158
2159 case Magic('+'):
2160 /*
2161 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2162 * first and only submatch would be "aaa". But the backtracking
2163 * engine interprets the plus as "try matching one more time", and
2164 * a* matches a second time at the end of the input, the empty
2165 * string.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002166 * The submatch will be the empty string.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002167 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002168 * In order to be consistent with the old engine, we replace
2169 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002170 */
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002171 restore_parse_state(&old_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002172 curchr = -1;
2173 if (nfa_regatom() == FAIL)
2174 return FAIL;
2175 EMIT(NFA_STAR);
2176 EMIT(NFA_CONCAT);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002177 skipchr(); // skip the \+
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002178 break;
2179
2180 case Magic('@'):
Bram Moolenaar61602c52013-06-01 19:54:43 +02002181 c2 = getdecchrs();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002182 op = no_Magic(getchr());
Bram Moolenaar61602c52013-06-01 19:54:43 +02002183 i = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002184 switch(op)
2185 {
2186 case '=':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002187 // \@=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002188 i = NFA_PREV_ATOM_NO_WIDTH;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002189 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002190 case '!':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002191 // \@!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002192 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002193 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002194 case '<':
Bram Moolenaar61602c52013-06-01 19:54:43 +02002195 op = no_Magic(getchr());
2196 if (op == '=')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002197 // \@<=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002198 i = NFA_PREV_ATOM_JUST_BEFORE;
2199 else if (op == '!')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002200 // \@<!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002201 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2202 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002203 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002204 // \@>
Bram Moolenaar87953742013-06-05 18:52:40 +02002205 i = NFA_PREV_ATOM_LIKE_PATTERN;
2206 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002207 }
Bram Moolenaar61602c52013-06-01 19:54:43 +02002208 if (i == 0)
2209 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002210 semsg(_(e_nfa_regexp_unknown_operator_at_chr), op);
Bram Moolenaar61602c52013-06-01 19:54:43 +02002211 return FAIL;
2212 }
2213 EMIT(i);
2214 if (i == NFA_PREV_ATOM_JUST_BEFORE
2215 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2216 EMIT(c2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002217 break;
2218
2219 case Magic('?'):
2220 case Magic('='):
2221 EMIT(NFA_QUEST);
2222 break;
2223
2224 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002225 // a{2,5} will expand to 'aaa?a?a?'
2226 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2227 // version of '?'
2228 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2229 // parenthesis have the same id
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002230
2231 greedy = TRUE;
2232 c2 = peekchr();
2233 if (c2 == '-' || c2 == Magic('-'))
2234 {
2235 skipchr();
2236 greedy = FALSE;
2237 }
2238 if (!read_limits(&minval, &maxval))
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002239 EMSG_RET_FAIL(_(e_nfa_regexp_error_reading_repetition_limits));
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002240
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002241 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2242 // <atom>*
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002243 if (minval == 0 && maxval == MAX_LIMIT)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002244 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002245 if (greedy) // { { (match the braces)
2246 // \{}, \{0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002247 EMIT(NFA_STAR);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002248 else // { { (match the braces)
2249 // \{-}, \{-0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002250 EMIT(NFA_STAR_NONGREEDY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002251 break;
2252 }
2253
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002254 // Special case: x{0} or x{-0}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002255 if (maxval == 0)
2256 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002257 // Ignore result of previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002258 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002259 // NFA_EMPTY is 0-length and works everywhere
Bram Moolenaar699c1202013-09-25 16:41:54 +02002260 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002261 return OK;
2262 }
2263
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002264 // The engine is very inefficient (uses too many states) when the
2265 // maximum is much larger than the minimum and when the maximum is
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002266 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2267 // will emit NFA_STAR.
2268 // Bail out if we can use the other engine, but only, when the
2269 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00002270 // does not work with characters > 8 bit with the BT engine)
Bram Moolenaara1d2c582015-02-10 18:18:17 +01002271 if ((nfa_re_flags & RE_AUTO)
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002272 && (maxval > 500 || maxval > minval + 200)
2273 && (maxval != MAX_LIMIT && minval < 200)
2274 && !wants_nfa)
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002275 return FAIL;
2276
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002277 // Ignore previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002278 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002279 // Save parse state after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002280 save_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002281
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002282 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2283 for (i = 0; i < maxval; i++)
2284 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002285 // Goto beginning of the repeated atom
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002286 restore_parse_state(&old_state);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002287 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002288 if (nfa_regatom() == FAIL)
2289 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002290 // after "minval" times, atoms are optional
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002291 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002292 {
2293 if (maxval == MAX_LIMIT)
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002294 {
2295 if (greedy)
2296 EMIT(NFA_STAR);
2297 else
2298 EMIT(NFA_STAR_NONGREEDY);
2299 }
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002300 else
2301 EMIT(quest);
2302 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02002303 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002304 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002305 if (i + 1 > minval && maxval == MAX_LIMIT)
2306 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002307 }
2308
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002309 // Go to just after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002310 restore_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002311 curchr = -1;
2312
2313 break;
2314
2315
2316 default:
2317 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002318 } // end switch
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002319
2320 if (re_multi_type(peekchr()) != NOT_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002321 // Can't have a multi follow a multi.
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002322 EMSG_RET_FAIL(_(e_nfa_regexp_cant_have_multi_follow_multi));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002323
2324 return OK;
2325}
2326
2327/*
2328 * Parse one or more pieces, concatenated. It matches a match for the
2329 * first piece, followed by a match for the second piece, etc. Example:
2330 * "f[0-9]b", first matches "f", then a digit and then "b".
2331 *
2332 * concat ::= piece
2333 * or piece piece
2334 * or piece piece piece
2335 * etc.
2336 */
2337 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002338nfa_regconcat(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002339{
2340 int cont = TRUE;
2341 int first = TRUE;
2342
2343 while (cont)
2344 {
2345 switch (peekchr())
2346 {
2347 case NUL:
2348 case Magic('|'):
2349 case Magic('&'):
2350 case Magic(')'):
2351 cont = FALSE;
2352 break;
2353
2354 case Magic('Z'):
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002355 regflags |= RF_ICOMBINE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002356 skipchr_keepstart();
2357 break;
2358 case Magic('c'):
2359 regflags |= RF_ICASE;
2360 skipchr_keepstart();
2361 break;
2362 case Magic('C'):
2363 regflags |= RF_NOICASE;
2364 skipchr_keepstart();
2365 break;
2366 case Magic('v'):
2367 reg_magic = MAGIC_ALL;
2368 skipchr_keepstart();
2369 curchr = -1;
2370 break;
2371 case Magic('m'):
2372 reg_magic = MAGIC_ON;
2373 skipchr_keepstart();
2374 curchr = -1;
2375 break;
2376 case Magic('M'):
2377 reg_magic = MAGIC_OFF;
2378 skipchr_keepstart();
2379 curchr = -1;
2380 break;
2381 case Magic('V'):
2382 reg_magic = MAGIC_NONE;
2383 skipchr_keepstart();
2384 curchr = -1;
2385 break;
2386
2387 default:
2388 if (nfa_regpiece() == FAIL)
2389 return FAIL;
2390 if (first == FALSE)
2391 EMIT(NFA_CONCAT);
2392 else
2393 first = FALSE;
2394 break;
2395 }
2396 }
2397
2398 return OK;
2399}
2400
2401/*
2402 * Parse a branch, one or more concats, separated by "\&". It matches the
2403 * last concat, but only if all the preceding concats also match at the same
2404 * position. Examples:
2405 * "foobeep\&..." matches "foo" in "foobeep".
2406 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2407 *
2408 * branch ::= concat
2409 * or concat \& concat
2410 * or concat \& concat \& concat
2411 * etc.
2412 */
2413 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002414nfa_regbranch(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002415{
Bram Moolenaar16299b52013-05-30 18:45:23 +02002416 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002417
Bram Moolenaar16299b52013-05-30 18:45:23 +02002418 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002419
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002420 // First branch, possibly the only one
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002421 if (nfa_regconcat() == FAIL)
2422 return FAIL;
2423
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002424 // Try next concats
Bram Moolenaar890dd052017-12-16 19:59:37 +01002425 while (peekchr() == Magic('&'))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002426 {
2427 skipchr();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002428 // if concat is empty do emit a node
Bram Moolenaar890dd052017-12-16 19:59:37 +01002429 if (old_post_pos == (int)(post_ptr - post_start))
2430 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002431 EMIT(NFA_NOPEN);
2432 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002433 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002434 if (nfa_regconcat() == FAIL)
2435 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002436 // if concat is empty do emit a node
Bram Moolenaar16299b52013-05-30 18:45:23 +02002437 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002438 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002439 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002440 }
2441
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002442 // if a branch is empty, emit one node for it
Bram Moolenaar16299b52013-05-30 18:45:23 +02002443 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002444 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002445
2446 return OK;
2447}
2448
2449/*
2450 * Parse a pattern, one or more branches, separated by "\|". It matches
2451 * anything that matches one of the branches. Example: "foo\|beep" matches
2452 * "foo" and matches "beep". If more than one branch matches, the first one
2453 * is used.
2454 *
2455 * pattern ::= branch
2456 * or branch \| branch
2457 * or branch \| branch \| branch
2458 * etc.
2459 */
2460 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002461nfa_reg(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002462 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002463{
2464 int parno = 0;
2465
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002466 if (paren == REG_PAREN)
2467 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002468 if (regnpar >= NSUBEXP) // Too many `('
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002469 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_parens));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002470 parno = regnpar++;
2471 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002472#ifdef FEAT_SYN_HL
2473 else if (paren == REG_ZPAREN)
2474 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002475 // Make a ZOPEN node.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002476 if (regnzpar >= NSUBEXP)
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002477 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_z));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002478 parno = regnzpar++;
2479 }
2480#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002481
2482 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002483 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002484
2485 while (peekchr() == Magic('|'))
2486 {
2487 skipchr();
2488 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002489 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002490 EMIT(NFA_OR);
2491 }
2492
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002493 // Check for proper termination.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002494 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2495 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002496 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002497 EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2498 reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002499 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002500 EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002501 }
2502 else if (paren == REG_NOPAREN && peekchr() != NUL)
2503 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002504 if (peekchr() == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002505 EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002506 else
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002507 EMSG_RET_FAIL(_(e_nfa_regexp_proper_termination_error));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002508 }
2509 /*
2510 * Here we set the flag allowing back references to this set of
2511 * parentheses.
2512 */
2513 if (paren == REG_PAREN)
2514 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002515 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002516 EMIT(NFA_MOPEN + parno);
2517 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002518#ifdef FEAT_SYN_HL
2519 else if (paren == REG_ZPAREN)
2520 EMIT(NFA_ZOPEN + parno);
2521#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002522
2523 return OK;
2524}
2525
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002526#ifdef DEBUG
2527static char_u code[50];
2528
2529 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002530nfa_set_code(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002531{
2532 int addnl = FALSE;
2533
2534 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2535 {
2536 addnl = TRUE;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002537 c -= NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002538 }
2539
2540 STRCPY(code, "");
2541 switch (c)
2542 {
2543 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2544 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2545 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2546 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2547 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2548 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2549
Bram Moolenaar5714b802013-05-28 22:03:20 +02002550 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2551 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2552 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2553 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2554 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2555 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2556 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2557 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2558 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002559#ifdef FEAT_SYN_HL
2560 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2561 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2562 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2563 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2564 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2565 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2566 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2567 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2568 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2569#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02002570 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2571
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002572 case NFA_PREV_ATOM_NO_WIDTH:
2573 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02002574 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2575 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002576 case NFA_PREV_ATOM_JUST_BEFORE:
2577 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2578 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2579 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002580 case NFA_PREV_ATOM_LIKE_PATTERN:
2581 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2582
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002583 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2584 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002585 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002586 case NFA_START_INVISIBLE_FIRST:
2587 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002588 case NFA_START_INVISIBLE_NEG:
2589 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002590 case NFA_START_INVISIBLE_NEG_FIRST:
2591 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002592 case NFA_START_INVISIBLE_BEFORE:
2593 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002594 case NFA_START_INVISIBLE_BEFORE_FIRST:
2595 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002596 case NFA_START_INVISIBLE_BEFORE_NEG:
2597 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002598 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2599 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002600 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002601 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002602 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002603 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002604
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002605 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2606 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02002607 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002608
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002609 case NFA_MOPEN:
2610 case NFA_MOPEN1:
2611 case NFA_MOPEN2:
2612 case NFA_MOPEN3:
2613 case NFA_MOPEN4:
2614 case NFA_MOPEN5:
2615 case NFA_MOPEN6:
2616 case NFA_MOPEN7:
2617 case NFA_MOPEN8:
2618 case NFA_MOPEN9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002619 STRCPY(code, "NFA_MOPEN(x)");
2620 code[10] = c - NFA_MOPEN + '0';
2621 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002622 case NFA_MCLOSE:
2623 case NFA_MCLOSE1:
2624 case NFA_MCLOSE2:
2625 case NFA_MCLOSE3:
2626 case NFA_MCLOSE4:
2627 case NFA_MCLOSE5:
2628 case NFA_MCLOSE6:
2629 case NFA_MCLOSE7:
2630 case NFA_MCLOSE8:
2631 case NFA_MCLOSE9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002632 STRCPY(code, "NFA_MCLOSE(x)");
2633 code[11] = c - NFA_MCLOSE + '0';
2634 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002635#ifdef FEAT_SYN_HL
2636 case NFA_ZOPEN:
2637 case NFA_ZOPEN1:
2638 case NFA_ZOPEN2:
2639 case NFA_ZOPEN3:
2640 case NFA_ZOPEN4:
2641 case NFA_ZOPEN5:
2642 case NFA_ZOPEN6:
2643 case NFA_ZOPEN7:
2644 case NFA_ZOPEN8:
2645 case NFA_ZOPEN9:
2646 STRCPY(code, "NFA_ZOPEN(x)");
2647 code[10] = c - NFA_ZOPEN + '0';
2648 break;
2649 case NFA_ZCLOSE:
2650 case NFA_ZCLOSE1:
2651 case NFA_ZCLOSE2:
2652 case NFA_ZCLOSE3:
2653 case NFA_ZCLOSE4:
2654 case NFA_ZCLOSE5:
2655 case NFA_ZCLOSE6:
2656 case NFA_ZCLOSE7:
2657 case NFA_ZCLOSE8:
2658 case NFA_ZCLOSE9:
2659 STRCPY(code, "NFA_ZCLOSE(x)");
2660 code[11] = c - NFA_ZCLOSE + '0';
2661 break;
2662#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002663 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2664 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2665 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2666 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02002667 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2668 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002669 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2670 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2671 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2672 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2673 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2674 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2675 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2676 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2677 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2678 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2679 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2680 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2681 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2682 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02002683 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002684
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002685 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002686 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2687 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2688 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
Bram Moolenaar699c1202013-09-25 16:41:54 +02002689 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002690 case NFA_OR: STRCPY(code, "NFA_OR"); break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002691
2692 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2693 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2694 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2695 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2696 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2697 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2698 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2699
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002700 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2701 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2702 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2703 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2704 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2705 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2706 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2707 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2708 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2709 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2710 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2711 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2712 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2713 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2714 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2715 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01002716 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2717 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2718 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002719
2720 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2721 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2722 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2723 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2724 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2725 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2726 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2727 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2728 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2729 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2730 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2731 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2732 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2733 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2734 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2735 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2736 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2737 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2738 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2739 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2740 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2741 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2742 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2743 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2744 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2745 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2746 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002747 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2748 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2749 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2750 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002751
2752 default:
2753 STRCPY(code, "CHAR(x)");
2754 code[5] = c;
2755 }
2756
2757 if (addnl == TRUE)
2758 STRCAT(code, " + NEWLINE ");
2759
2760}
2761
2762#ifdef ENABLE_LOG
2763static FILE *log_fd;
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002764static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002765
2766/*
2767 * Print the postfix notation of the current regexp.
2768 */
2769 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002770nfa_postfix_dump(char_u *expr, int retval)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002771{
2772 int *p;
2773 FILE *f;
2774
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002775 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002776 if (f != NULL)
2777 {
2778 fprintf(f, "\n-------------------------\n");
2779 if (retval == FAIL)
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002780 fprintf(f, ">>> NFA engine failed... \n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002781 else if (retval == OK)
2782 fprintf(f, ">>> NFA engine succeeded !\n");
2783 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002784 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002785 {
2786 nfa_set_code(*p);
2787 fprintf(f, "%s, ", code);
2788 }
2789 fprintf(f, "\"\nPostfix notation (int): ");
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002790 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002791 fprintf(f, "%d ", *p);
2792 fprintf(f, "\n\n");
2793 fclose(f);
2794 }
2795}
2796
2797/*
2798 * Print the NFA starting with a root node "state".
2799 */
2800 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002801nfa_print_state(FILE *debugf, nfa_state_T *state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002802{
Bram Moolenaar152e7892013-05-25 12:28:11 +02002803 garray_T indent;
2804
2805 ga_init2(&indent, 1, 64);
2806 ga_append(&indent, '\0');
2807 nfa_print_state2(debugf, state, &indent);
2808 ga_clear(&indent);
2809}
2810
2811 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002812nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
Bram Moolenaar152e7892013-05-25 12:28:11 +02002813{
2814 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002815
2816 if (state == NULL)
2817 return;
2818
2819 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02002820
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002821 // Output indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002822 p = (char_u *)indent->ga_data;
2823 if (indent->ga_len >= 3)
2824 {
2825 int last = indent->ga_len - 3;
2826 char_u save[2];
2827
2828 STRNCPY(save, &p[last], 2);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00002829 memcpy(&p[last], "+-", 2);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002830 fprintf(debugf, " %s", p);
2831 STRNCPY(&p[last], save, 2);
2832 }
2833 else
2834 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002835
2836 nfa_set_code(state->c);
Bram Moolenaardecd9542013-06-07 16:31:50 +02002837 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
Bram Moolenaar417bad22013-06-07 14:08:30 +02002838 code,
2839 state->c,
2840 abs(state->id),
2841 state->val);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002842 if (state->id < 0)
2843 return;
2844
2845 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02002846
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002847 // grow indent for state->out
Bram Moolenaar152e7892013-05-25 12:28:11 +02002848 indent->ga_len -= 1;
2849 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002850 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002851 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002852 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002853 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002854
2855 nfa_print_state2(debugf, state->out, indent);
2856
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002857 // replace last part of indent for state->out1
Bram Moolenaar152e7892013-05-25 12:28:11 +02002858 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002859 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002860 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002861
2862 nfa_print_state2(debugf, state->out1, indent);
2863
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002864 // shrink indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002865 indent->ga_len -= 3;
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002866 ga_append(indent, NUL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002867}
2868
2869/*
2870 * Print the NFA state machine.
2871 */
2872 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002873nfa_dump(nfa_regprog_T *prog)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002874{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002875 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002876
2877 if (debugf != NULL)
2878 {
Bram Moolenaar152e7892013-05-25 12:28:11 +02002879 nfa_print_state(debugf, prog->start);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002880
Bram Moolenaar473de612013-06-08 18:19:48 +02002881 if (prog->reganch)
2882 fprintf(debugf, "reganch: %d\n", prog->reganch);
2883 if (prog->regstart != NUL)
2884 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2885 prog->regstart, prog->regstart);
2886 if (prog->match_text != NULL)
2887 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002888
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002889 fclose(debugf);
2890 }
2891}
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002892#endif // ENABLE_LOG
2893#endif // DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002894
2895/*
2896 * Parse r.e. @expr and convert it into postfix form.
2897 * Return the postfix string on success, NULL otherwise.
2898 */
2899 static int *
Bram Moolenaar05540972016-01-30 20:31:25 +01002900re2post(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002901{
2902 if (nfa_reg(REG_NOPAREN) == FAIL)
2903 return NULL;
2904 EMIT(NFA_MOPEN);
2905 return post_start;
2906}
2907
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002908// NB. Some of the code below is inspired by Russ's.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002909
2910/*
2911 * Represents an NFA state plus zero or one or two arrows exiting.
2912 * if c == MATCH, no arrows out; matching state.
2913 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2914 * If c < 256, labeled arrow with character c to out.
2915 */
2916
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002917static nfa_state_T *state_ptr; // points to nfa_prog->state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002918
2919/*
2920 * Allocate and initialize nfa_state_T.
2921 */
2922 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002923alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002924{
2925 nfa_state_T *s;
2926
2927 if (istate >= nstate)
2928 return NULL;
2929
2930 s = &state_ptr[istate++];
2931
2932 s->c = c;
2933 s->out = out;
2934 s->out1 = out1;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002935 s->val = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002936
2937 s->id = istate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02002938 s->lastlist[0] = 0;
2939 s->lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002940
2941 return s;
2942}
2943
2944/*
2945 * A partially built NFA without the matching state filled in.
2946 * Frag_T.start points at the start state.
2947 * Frag_T.out is a list of places that need to be set to the
2948 * next state for this fragment.
2949 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002950
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002951// Since the out pointers in the list are always
2952// uninitialized, we use the pointers themselves
2953// as storage for the Ptrlists.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002954typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002955union Ptrlist
2956{
2957 Ptrlist *next;
2958 nfa_state_T *s;
2959};
2960
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002961struct Frag
2962{
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002963 nfa_state_T *start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002964 Ptrlist *out;
2965};
2966typedef struct Frag Frag_T;
2967
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002968/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02002969 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002970 */
2971 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01002972frag(nfa_state_T *start, Ptrlist *out)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002973{
Bram Moolenaar053bb602013-05-20 13:55:21 +02002974 Frag_T n;
2975
2976 n.start = start;
2977 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002978 return n;
2979}
2980
2981/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002982 * Create singleton list containing just outp.
2983 */
2984 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01002985list1(
2986 nfa_state_T **outp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002987{
2988 Ptrlist *l;
2989
2990 l = (Ptrlist *)outp;
2991 l->next = NULL;
2992 return l;
2993}
2994
2995/*
2996 * Patch the list of states at out to point to start.
2997 */
2998 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002999patch(Ptrlist *l, nfa_state_T *s)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003000{
3001 Ptrlist *next;
3002
3003 for (; l; l = next)
3004 {
3005 next = l->next;
3006 l->s = s;
3007 }
3008}
3009
3010
3011/*
3012 * Join the two lists l1 and l2, returning the combination.
3013 */
3014 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003015append(Ptrlist *l1, Ptrlist *l2)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003016{
3017 Ptrlist *oldl1;
3018
3019 oldl1 = l1;
3020 while (l1->next)
3021 l1 = l1->next;
3022 l1->next = l2;
3023 return oldl1;
3024}
3025
3026/*
3027 * Stack used for transforming postfix form into NFA.
3028 */
3029static Frag_T empty;
3030
3031 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003032st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003033{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003034#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003035 FILE *df;
3036 int *p2;
3037
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003038 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003039 if (df)
3040 {
3041 fprintf(df, "Error popping the stack!\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003042# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003043 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003044# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003045 fprintf(df, "Postfix form is: ");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003046# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003047 for (p2 = postfix; p2 < end; p2++)
3048 {
3049 nfa_set_code(*p2);
3050 fprintf(df, "%s, ", code);
3051 }
3052 nfa_set_code(*p);
3053 fprintf(df, "\nCurrent position is: ");
3054 for (p2 = postfix; p2 <= p; p2 ++)
3055 {
3056 nfa_set_code(*p2);
3057 fprintf(df, "%s, ", code);
3058 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02003059# else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003060 for (p2 = postfix; p2 < end; p2++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003061 fprintf(df, "%d, ", *p2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003062 fprintf(df, "\nCurrent position is: ");
3063 for (p2 = postfix; p2 <= p; p2 ++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003064 fprintf(df, "%d, ", *p2);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003065# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003066 fprintf(df, "\n--------------------------\n");
3067 fclose(df);
3068 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003069#endif
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003070 emsg(_(e_nfa_regexp_could_not_pop_stack));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003071}
3072
3073/*
3074 * Push an item onto the stack.
3075 */
3076 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003077st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003078{
3079 Frag_T *stackp = *p;
3080
3081 if (stackp >= stack_end)
3082 return;
3083 *stackp = s;
3084 *p = *p + 1;
3085}
3086
3087/*
3088 * Pop an item from the stack.
3089 */
3090 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003091st_pop(Frag_T **p, Frag_T *stack)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003092{
3093 Frag_T *stackp;
3094
3095 *p = *p - 1;
3096 stackp = *p;
3097 if (stackp < stack)
3098 return empty;
3099 return **p;
3100}
3101
3102/*
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003103 * Estimate the maximum byte length of anything matching "state".
3104 * When unknown or unlimited return -1.
3105 */
3106 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01003107nfa_max_width(nfa_state_T *startstate, int depth)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003108{
3109 int l, r;
3110 nfa_state_T *state = startstate;
3111 int len = 0;
3112
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003113 // detect looping in a NFA_SPLIT
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003114 if (depth > 4)
3115 return -1;
3116
Bram Moolenaarfe70acb2013-06-21 18:31:23 +02003117 while (state != NULL)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003118 {
3119 switch (state->c)
3120 {
3121 case NFA_END_INVISIBLE:
3122 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003123 // the end, return what we have
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003124 return len;
3125
3126 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003127 // two alternatives, use the maximum
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003128 l = nfa_max_width(state->out, depth + 1);
3129 r = nfa_max_width(state->out1, depth + 1);
3130 if (l < 0 || r < 0)
3131 return -1;
3132 return len + (l > r ? l : r);
3133
3134 case NFA_ANY:
3135 case NFA_START_COLL:
3136 case NFA_START_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003137 // matches some character, including composing chars
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003138 if (enc_utf8)
3139 len += MB_MAXBYTES;
3140 else if (has_mbyte)
3141 len += 2;
3142 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003143 ++len;
3144 if (state->c != NFA_ANY)
3145 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003146 // skip over the characters
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003147 state = state->out1->out;
3148 continue;
3149 }
3150 break;
3151
3152 case NFA_DIGIT:
3153 case NFA_WHITE:
3154 case NFA_HEX:
3155 case NFA_OCTAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003156 // ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003157 ++len;
3158 break;
3159
3160 case NFA_IDENT:
3161 case NFA_SIDENT:
3162 case NFA_KWORD:
3163 case NFA_SKWORD:
3164 case NFA_FNAME:
3165 case NFA_SFNAME:
3166 case NFA_PRINT:
3167 case NFA_SPRINT:
3168 case NFA_NWHITE:
3169 case NFA_NDIGIT:
3170 case NFA_NHEX:
3171 case NFA_NOCTAL:
3172 case NFA_WORD:
3173 case NFA_NWORD:
3174 case NFA_HEAD:
3175 case NFA_NHEAD:
3176 case NFA_ALPHA:
3177 case NFA_NALPHA:
3178 case NFA_LOWER:
3179 case NFA_NLOWER:
3180 case NFA_UPPER:
3181 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02003182 case NFA_LOWER_IC:
3183 case NFA_NLOWER_IC:
3184 case NFA_UPPER_IC:
3185 case NFA_NUPPER_IC:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02003186 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003187 // possibly non-ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003188 if (has_mbyte)
3189 len += 3;
3190 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003191 ++len;
3192 break;
3193
3194 case NFA_START_INVISIBLE:
3195 case NFA_START_INVISIBLE_NEG:
3196 case NFA_START_INVISIBLE_BEFORE:
3197 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003198 // zero-width, out1 points to the END state
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003199 state = state->out1->out;
3200 continue;
3201
3202 case NFA_BACKREF1:
3203 case NFA_BACKREF2:
3204 case NFA_BACKREF3:
3205 case NFA_BACKREF4:
3206 case NFA_BACKREF5:
3207 case NFA_BACKREF6:
3208 case NFA_BACKREF7:
3209 case NFA_BACKREF8:
3210 case NFA_BACKREF9:
3211#ifdef FEAT_SYN_HL
3212 case NFA_ZREF1:
3213 case NFA_ZREF2:
3214 case NFA_ZREF3:
3215 case NFA_ZREF4:
3216 case NFA_ZREF5:
3217 case NFA_ZREF6:
3218 case NFA_ZREF7:
3219 case NFA_ZREF8:
3220 case NFA_ZREF9:
3221#endif
3222 case NFA_NEWL:
3223 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003224 // unknown width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003225 return -1;
3226
3227 case NFA_BOL:
3228 case NFA_EOL:
3229 case NFA_BOF:
3230 case NFA_EOF:
3231 case NFA_BOW:
3232 case NFA_EOW:
3233 case NFA_MOPEN:
3234 case NFA_MOPEN1:
3235 case NFA_MOPEN2:
3236 case NFA_MOPEN3:
3237 case NFA_MOPEN4:
3238 case NFA_MOPEN5:
3239 case NFA_MOPEN6:
3240 case NFA_MOPEN7:
3241 case NFA_MOPEN8:
3242 case NFA_MOPEN9:
3243#ifdef FEAT_SYN_HL
3244 case NFA_ZOPEN:
3245 case NFA_ZOPEN1:
3246 case NFA_ZOPEN2:
3247 case NFA_ZOPEN3:
3248 case NFA_ZOPEN4:
3249 case NFA_ZOPEN5:
3250 case NFA_ZOPEN6:
3251 case NFA_ZOPEN7:
3252 case NFA_ZOPEN8:
3253 case NFA_ZOPEN9:
3254 case NFA_ZCLOSE:
3255 case NFA_ZCLOSE1:
3256 case NFA_ZCLOSE2:
3257 case NFA_ZCLOSE3:
3258 case NFA_ZCLOSE4:
3259 case NFA_ZCLOSE5:
3260 case NFA_ZCLOSE6:
3261 case NFA_ZCLOSE7:
3262 case NFA_ZCLOSE8:
3263 case NFA_ZCLOSE9:
3264#endif
3265 case NFA_MCLOSE:
3266 case NFA_MCLOSE1:
3267 case NFA_MCLOSE2:
3268 case NFA_MCLOSE3:
3269 case NFA_MCLOSE4:
3270 case NFA_MCLOSE5:
3271 case NFA_MCLOSE6:
3272 case NFA_MCLOSE7:
3273 case NFA_MCLOSE8:
3274 case NFA_MCLOSE9:
3275 case NFA_NOPEN:
3276 case NFA_NCLOSE:
3277
3278 case NFA_LNUM_GT:
3279 case NFA_LNUM_LT:
3280 case NFA_COL_GT:
3281 case NFA_COL_LT:
3282 case NFA_VCOL_GT:
3283 case NFA_VCOL_LT:
3284 case NFA_MARK_GT:
3285 case NFA_MARK_LT:
3286 case NFA_VISUAL:
3287 case NFA_LNUM:
3288 case NFA_CURSOR:
3289 case NFA_COL:
3290 case NFA_VCOL:
3291 case NFA_MARK:
3292
3293 case NFA_ZSTART:
3294 case NFA_ZEND:
3295 case NFA_OPT_CHARS:
Bram Moolenaar699c1202013-09-25 16:41:54 +02003296 case NFA_EMPTY:
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003297 case NFA_START_PATTERN:
3298 case NFA_END_PATTERN:
3299 case NFA_COMPOSING:
3300 case NFA_END_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003301 // zero-width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003302 break;
3303
3304 default:
3305 if (state->c < 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003306 // don't know what this is
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003307 return -1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003308 // normal character
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003309 len += MB_CHAR2LEN(state->c);
3310 break;
3311 }
3312
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003313 // normal way to continue
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003314 state = state->out;
3315 }
3316
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003317 // unrecognized, "cannot happen"
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003318 return -1;
3319}
Bram Moolenaar1e02e662013-06-08 23:26:27 +02003320
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003321/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003322 * Convert a postfix form into its equivalent NFA.
3323 * Return the NFA start state on success, NULL otherwise.
3324 */
3325 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01003326post2nfa(int *postfix, int *end, int nfa_calc_size)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003327{
3328 int *p;
3329 int mopen;
3330 int mclose;
3331 Frag_T *stack = NULL;
3332 Frag_T *stackp = NULL;
3333 Frag_T *stack_end = NULL;
3334 Frag_T e1;
3335 Frag_T e2;
3336 Frag_T e;
3337 nfa_state_T *s;
3338 nfa_state_T *s1;
3339 nfa_state_T *matchstate;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003340 nfa_state_T *ret = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003341
3342 if (postfix == NULL)
3343 return NULL;
3344
Bram Moolenaar053bb602013-05-20 13:55:21 +02003345#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003346#define POP() st_pop(&stackp, stack); \
3347 if (stackp < stack) \
3348 { \
3349 st_error(postfix, end, p); \
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003350 vim_free(stack); \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003351 return NULL; \
3352 }
3353
3354 if (nfa_calc_size == FALSE)
3355 {
Bram Moolenaar32aa1022019-11-02 22:54:41 +01003356 // Allocate space for the stack. Max states on the stack: "nstate".
Bram Moolenaarc799fe22019-05-28 23:08:19 +02003357 stack = ALLOC_MULT(Frag_T, nstate + 1);
Bram Moolenaarc57463c2018-12-26 22:04:41 +01003358 if (stack == NULL)
3359 return NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003360 stackp = stack;
Bram Moolenaare3c7b862013-05-20 21:57:03 +02003361 stack_end = stack + (nstate + 1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003362 }
3363
3364 for (p = postfix; p < end; ++p)
3365 {
3366 switch (*p)
3367 {
3368 case NFA_CONCAT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003369 // Concatenation.
3370 // Pay attention: this operator does not exist in the r.e. itself
3371 // (it is implicit, really). It is added when r.e. is translated
3372 // to postfix form in re2post().
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003373 if (nfa_calc_size == TRUE)
3374 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003375 // nstate += 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003376 break;
3377 }
3378 e2 = POP();
3379 e1 = POP();
3380 patch(e1.out, e2.start);
3381 PUSH(frag(e1.start, e2.out));
3382 break;
3383
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003384 case NFA_OR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003385 // Alternation
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003386 if (nfa_calc_size == TRUE)
3387 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003388 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003389 break;
3390 }
3391 e2 = POP();
3392 e1 = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003393 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003394 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003395 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003396 PUSH(frag(s, append(e1.out, e2.out)));
3397 break;
3398
3399 case NFA_STAR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003400 // Zero or more, prefer more
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003401 if (nfa_calc_size == TRUE)
3402 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003403 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003404 break;
3405 }
3406 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003407 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003408 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003409 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003410 patch(e.out, s);
3411 PUSH(frag(s, list1(&s->out1)));
3412 break;
3413
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003414 case NFA_STAR_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003415 // Zero or more, prefer zero
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003416 if (nfa_calc_size == TRUE)
3417 {
3418 nstate++;
3419 break;
3420 }
3421 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003422 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003423 if (s == NULL)
3424 goto theend;
3425 patch(e.out, s);
3426 PUSH(frag(s, list1(&s->out)));
3427 break;
3428
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003429 case NFA_QUEST:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003430 // one or zero atoms=> greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003431 if (nfa_calc_size == TRUE)
3432 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003433 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003434 break;
3435 }
3436 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003437 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003438 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003439 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003440 PUSH(frag(s, append(e.out, list1(&s->out1))));
3441 break;
3442
3443 case NFA_QUEST_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003444 // zero or one atoms => non-greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003445 if (nfa_calc_size == TRUE)
3446 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003447 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003448 break;
3449 }
3450 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003451 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003452 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003453 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003454 PUSH(frag(s, append(e.out, list1(&s->out))));
3455 break;
3456
Bram Moolenaar417bad22013-06-07 14:08:30 +02003457 case NFA_END_COLL:
3458 case NFA_END_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003459 // On the stack is the sequence starting with NFA_START_COLL or
3460 // NFA_START_NEG_COLL and all possible characters. Patch it to
3461 // add the output to the start.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003462 if (nfa_calc_size == TRUE)
3463 {
3464 nstate++;
3465 break;
3466 }
3467 e = POP();
3468 s = alloc_state(NFA_END_COLL, NULL, NULL);
3469 if (s == NULL)
3470 goto theend;
3471 patch(e.out, s);
3472 e.start->out1 = s;
3473 PUSH(frag(e.start, list1(&s->out)));
3474 break;
3475
3476 case NFA_RANGE:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003477 // Before this are two characters, the low and high end of a
3478 // range. Turn them into two states with MIN and MAX.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003479 if (nfa_calc_size == TRUE)
3480 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003481 // nstate += 0;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003482 break;
3483 }
3484 e2 = POP();
3485 e1 = POP();
3486 e2.start->val = e2.start->c;
3487 e2.start->c = NFA_RANGE_MAX;
3488 e1.start->val = e1.start->c;
3489 e1.start->c = NFA_RANGE_MIN;
3490 patch(e1.out, e2.start);
3491 PUSH(frag(e1.start, e2.out));
3492 break;
3493
Bram Moolenaar699c1202013-09-25 16:41:54 +02003494 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003495 // 0-length, used in a repetition with max/min count of 0
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003496 if (nfa_calc_size == TRUE)
3497 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003498 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003499 break;
3500 }
Bram Moolenaar699c1202013-09-25 16:41:54 +02003501 s = alloc_state(NFA_EMPTY, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003502 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003503 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003504 PUSH(frag(s, list1(&s->out)));
3505 break;
3506
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003507 case NFA_OPT_CHARS:
3508 {
3509 int n;
3510
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003511 // \%[abc] implemented as:
3512 // NFA_SPLIT
3513 // +-CHAR(a)
3514 // | +-NFA_SPLIT
3515 // | +-CHAR(b)
3516 // | | +-NFA_SPLIT
3517 // | | +-CHAR(c)
3518 // | | | +-next
3519 // | | +- next
3520 // | +- next
3521 // +- next
3522 n = *++p; // get number of characters
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003523 if (nfa_calc_size == TRUE)
3524 {
3525 nstate += n;
3526 break;
3527 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003528 s = NULL; // avoid compiler warning
3529 e1.out = NULL; // stores list with out1's
3530 s1 = NULL; // previous NFA_SPLIT to connect to
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003531 while (n-- > 0)
3532 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003533 e = POP(); // get character
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003534 s = alloc_state(NFA_SPLIT, e.start, NULL);
3535 if (s == NULL)
3536 goto theend;
3537 if (e1.out == NULL)
3538 e1 = e;
3539 patch(e.out, s1);
3540 append(e1.out, list1(&s->out1));
3541 s1 = s;
3542 }
3543 PUSH(frag(s, e1.out));
3544 break;
3545 }
3546
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003547 case NFA_PREV_ATOM_NO_WIDTH:
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003548 case NFA_PREV_ATOM_NO_WIDTH_NEG:
Bram Moolenaar61602c52013-06-01 19:54:43 +02003549 case NFA_PREV_ATOM_JUST_BEFORE:
3550 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02003551 case NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003552 {
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003553 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3554 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
Bram Moolenaar87953742013-06-05 18:52:40 +02003555 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
Bram Moolenaardecd9542013-06-07 16:31:50 +02003556 int start_state;
3557 int end_state;
Bram Moolenaar87953742013-06-05 18:52:40 +02003558 int n = 0;
3559 nfa_state_T *zend;
3560 nfa_state_T *skip;
3561
Bram Moolenaardecd9542013-06-07 16:31:50 +02003562 switch (*p)
Bram Moolenaar87953742013-06-05 18:52:40 +02003563 {
Bram Moolenaardecd9542013-06-07 16:31:50 +02003564 case NFA_PREV_ATOM_NO_WIDTH:
3565 start_state = NFA_START_INVISIBLE;
3566 end_state = NFA_END_INVISIBLE;
3567 break;
3568 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3569 start_state = NFA_START_INVISIBLE_NEG;
3570 end_state = NFA_END_INVISIBLE_NEG;
3571 break;
3572 case NFA_PREV_ATOM_JUST_BEFORE:
3573 start_state = NFA_START_INVISIBLE_BEFORE;
3574 end_state = NFA_END_INVISIBLE;
3575 break;
3576 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3577 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3578 end_state = NFA_END_INVISIBLE_NEG;
3579 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003580 default: // NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaardecd9542013-06-07 16:31:50 +02003581 start_state = NFA_START_PATTERN;
3582 end_state = NFA_END_PATTERN;
3583 break;
Bram Moolenaar87953742013-06-05 18:52:40 +02003584 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003585
3586 if (before)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003587 n = *++p; // get the count
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003588
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003589 // The \@= operator: match the preceding atom with zero width.
3590 // The \@! operator: no match for the preceding atom.
3591 // The \@<= operator: match for the preceding atom.
3592 // The \@<! operator: no match for the preceding atom.
3593 // Surrounds the preceding atom with START_INVISIBLE and
3594 // END_INVISIBLE, similarly to MOPEN.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003595
3596 if (nfa_calc_size == TRUE)
3597 {
Bram Moolenaar87953742013-06-05 18:52:40 +02003598 nstate += pattern ? 4 : 2;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003599 break;
3600 }
3601 e = POP();
Bram Moolenaar87953742013-06-05 18:52:40 +02003602 s1 = alloc_state(end_state, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003603 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003604 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003605
Bram Moolenaar87953742013-06-05 18:52:40 +02003606 s = alloc_state(start_state, e.start, s1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003607 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003608 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003609 if (pattern)
3610 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003611 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02003612 skip = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003613 if (skip == NULL)
3614 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003615 zend = alloc_state(NFA_ZEND, s1, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003616 if (zend == NULL)
3617 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003618 s1->out= skip;
3619 patch(e.out, zend);
3620 PUSH(frag(s, list1(&skip->out)));
Bram Moolenaar61602c52013-06-01 19:54:43 +02003621 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003622 else
3623 {
3624 patch(e.out, s1);
3625 PUSH(frag(s, list1(&s1->out)));
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003626 if (before)
3627 {
3628 if (n <= 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003629 // See if we can guess the maximum width, it avoids a
3630 // lot of pointless tries.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003631 n = nfa_max_width(e.start, 0);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003632 s->val = n; // store the count
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003633 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003634 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003635 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003636 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003637
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003638 case NFA_COMPOSING: // char with composing char
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003639#if 0
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003640 // TODO
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003641 if (regflags & RF_ICOMBINE)
3642 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003643 // use the base character only
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003644 }
3645#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003646 // FALLTHROUGH
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003647
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003648 case NFA_MOPEN: // \( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003649 case NFA_MOPEN1:
3650 case NFA_MOPEN2:
3651 case NFA_MOPEN3:
3652 case NFA_MOPEN4:
3653 case NFA_MOPEN5:
3654 case NFA_MOPEN6:
3655 case NFA_MOPEN7:
3656 case NFA_MOPEN8:
3657 case NFA_MOPEN9:
3658#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003659 case NFA_ZOPEN: // \z( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003660 case NFA_ZOPEN1:
3661 case NFA_ZOPEN2:
3662 case NFA_ZOPEN3:
3663 case NFA_ZOPEN4:
3664 case NFA_ZOPEN5:
3665 case NFA_ZOPEN6:
3666 case NFA_ZOPEN7:
3667 case NFA_ZOPEN8:
3668 case NFA_ZOPEN9:
3669#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003670 case NFA_NOPEN: // \%( \) "Invisible Submatch"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003671 if (nfa_calc_size == TRUE)
3672 {
3673 nstate += 2;
3674 break;
3675 }
3676
3677 mopen = *p;
3678 switch (*p)
3679 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003680 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3681#ifdef FEAT_SYN_HL
3682 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3683 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3684 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3685 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3686 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3687 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3688 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3689 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3690 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3691 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3692#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003693 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003694 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003695 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003696 mclose = *p + NSUBEXP;
3697 break;
3698 }
3699
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003700 // Allow "NFA_MOPEN" as a valid postfix representation for
3701 // the empty regexp "". In this case, the NFA will be
3702 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3703 // empty groups of parenthesis, and empty mbyte chars
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003704 if (stackp == stack)
3705 {
Bram Moolenaar525666f2013-06-02 16:40:55 +02003706 s = alloc_state(mopen, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003707 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003708 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003709 s1 = alloc_state(mclose, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003710 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003711 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003712 patch(list1(&s->out), s1);
3713 PUSH(frag(s, list1(&s1->out)));
3714 break;
3715 }
3716
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003717 // At least one node was emitted before NFA_MOPEN, so
3718 // at least one node will be between NFA_MOPEN and NFA_MCLOSE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003719 e = POP();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003720 s = alloc_state(mopen, e.start, NULL); // `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003721 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003722 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003723
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003724 s1 = alloc_state(mclose, NULL, NULL); // `)'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003725 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003726 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003727 patch(e.out, s1);
3728
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003729 if (mopen == NFA_COMPOSING)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003730 // COMPOSING->out1 = END_COMPOSING
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003731 patch(list1(&s->out1), s1);
3732
3733 PUSH(frag(s, list1(&s1->out)));
3734 break;
3735
Bram Moolenaar5714b802013-05-28 22:03:20 +02003736 case NFA_BACKREF1:
3737 case NFA_BACKREF2:
3738 case NFA_BACKREF3:
3739 case NFA_BACKREF4:
3740 case NFA_BACKREF5:
3741 case NFA_BACKREF6:
3742 case NFA_BACKREF7:
3743 case NFA_BACKREF8:
3744 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003745#ifdef FEAT_SYN_HL
3746 case NFA_ZREF1:
3747 case NFA_ZREF2:
3748 case NFA_ZREF3:
3749 case NFA_ZREF4:
3750 case NFA_ZREF5:
3751 case NFA_ZREF6:
3752 case NFA_ZREF7:
3753 case NFA_ZREF8:
3754 case NFA_ZREF9:
3755#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02003756 if (nfa_calc_size == TRUE)
3757 {
3758 nstate += 2;
3759 break;
3760 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003761 s = alloc_state(*p, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003762 if (s == NULL)
3763 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003764 s1 = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003765 if (s1 == NULL)
3766 goto theend;
3767 patch(list1(&s->out), s1);
3768 PUSH(frag(s, list1(&s1->out)));
3769 break;
3770
Bram Moolenaar423532e2013-05-29 21:14:42 +02003771 case NFA_LNUM:
3772 case NFA_LNUM_GT:
3773 case NFA_LNUM_LT:
3774 case NFA_VCOL:
3775 case NFA_VCOL_GT:
3776 case NFA_VCOL_LT:
3777 case NFA_COL:
3778 case NFA_COL_GT:
3779 case NFA_COL_LT:
Bram Moolenaar044aa292013-06-04 21:27:38 +02003780 case NFA_MARK:
3781 case NFA_MARK_GT:
3782 case NFA_MARK_LT:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003783 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003784 int n = *++p; // lnum, col or mark name
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003785
Bram Moolenaar423532e2013-05-29 21:14:42 +02003786 if (nfa_calc_size == TRUE)
3787 {
3788 nstate += 1;
3789 break;
3790 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003791 s = alloc_state(p[-1], NULL, NULL);
Bram Moolenaar423532e2013-05-29 21:14:42 +02003792 if (s == NULL)
3793 goto theend;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003794 s->val = n;
Bram Moolenaar423532e2013-05-29 21:14:42 +02003795 PUSH(frag(s, list1(&s->out)));
3796 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003797 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02003798
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003799 case NFA_ZSTART:
3800 case NFA_ZEND:
3801 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003802 // Operands
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003803 if (nfa_calc_size == TRUE)
3804 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003805 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003806 break;
3807 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003808 s = alloc_state(*p, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003809 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003810 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003811 PUSH(frag(s, list1(&s->out)));
3812 break;
3813
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003814 } // switch(*p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003815
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003816 } // for(p = postfix; *p; ++p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003817
3818 if (nfa_calc_size == TRUE)
3819 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003820 nstate++;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003821 goto theend; // Return value when counting size is ignored anyway
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003822 }
3823
3824 e = POP();
3825 if (stackp != stack)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003826 {
3827 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003828 EMSG_RET_NULL(_(e_nfa_regexp_while_converting_from_postfix_to_nfa_too_many_stats_left_on_stack));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003829 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003830
3831 if (istate >= nstate)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003832 {
3833 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003834 EMSG_RET_NULL(_(e_nfa_regexp_not_enough_space_to_store_whole_nfa));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003835 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003836
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003837 matchstate = &state_ptr[istate++]; // the match state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003838 matchstate->c = NFA_MATCH;
3839 matchstate->out = matchstate->out1 = NULL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003840 matchstate->id = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003841
3842 patch(e.out, matchstate);
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003843 ret = e.start;
3844
3845theend:
3846 vim_free(stack);
3847 return ret;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003848
3849#undef POP1
3850#undef PUSH1
3851#undef POP2
3852#undef PUSH2
3853#undef POP
3854#undef PUSH
3855}
3856
Bram Moolenaara2947e22013-06-11 22:44:09 +02003857/*
3858 * After building the NFA program, inspect it to add optimization hints.
3859 */
3860 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003861nfa_postprocess(nfa_regprog_T *prog)
Bram Moolenaara2947e22013-06-11 22:44:09 +02003862{
3863 int i;
3864 int c;
3865
3866 for (i = 0; i < prog->nstate; ++i)
3867 {
3868 c = prog->state[i].c;
3869 if (c == NFA_START_INVISIBLE
3870 || c == NFA_START_INVISIBLE_NEG
3871 || c == NFA_START_INVISIBLE_BEFORE
3872 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3873 {
3874 int directly;
3875
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003876 // Do it directly when what follows is possibly the end of the
3877 // match.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003878 if (match_follows(prog->state[i].out1->out, 0))
3879 directly = TRUE;
3880 else
3881 {
3882 int ch_invisible = failure_chance(prog->state[i].out, 0);
3883 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3884
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003885 // Postpone when the invisible match is expensive or has a
3886 // lower chance of failing.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003887 if (c == NFA_START_INVISIBLE_BEFORE
3888 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3889 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003890 // "before" matches are very expensive when
3891 // unbounded, always prefer what follows then,
3892 // unless what follows will always match.
3893 // Otherwise strongly prefer what follows.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003894 if (prog->state[i].val <= 0 && ch_follows > 0)
3895 directly = FALSE;
3896 else
3897 directly = ch_follows * 10 < ch_invisible;
3898 }
3899 else
3900 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003901 // normal invisible, first do the one with the
3902 // highest failure chance
Bram Moolenaara2947e22013-06-11 22:44:09 +02003903 directly = ch_follows < ch_invisible;
3904 }
3905 }
3906 if (directly)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003907 // switch to the _FIRST state
Bram Moolenaara2947e22013-06-11 22:44:09 +02003908 ++prog->state[i].c;
3909 }
3910 }
3911}
3912
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003913/////////////////////////////////////////////////////////////////
3914// NFA execution code.
3915/////////////////////////////////////////////////////////////////
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003916
// regsub_T stores the start and end positions of up to NSUBEXP subexpressions.
// Only the first "in_use" entries carry information; clear_sub() resets the
// entries to "not set" (lnum -1 for "multi", NULL pointers for "line").
typedef struct
{
    int     in_use; // number of subexpr with useful info

    // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
    union
    {
        // position as line number and column, for multi-line matching
        struct multipos
        {
            linenr_T    start_lnum;
            linenr_T    end_lnum;
            colnr_T     start_col;
            colnr_T     end_col;
        } multi[NSUBEXP];
        // position as pointers into the text, for single-line matching
        struct linepos
        {
            char_u      *start;
            char_u      *end;
        } line[NSUBEXP];
    } list;
} regsub_T;
3938
// regsubs_T bundles the submatch positions that belong to one NFA thread:
// the normal groups and, when syntax highlighting is compiled in, the
// external "\z(" groups.
typedef struct
{
    regsub_T    norm; // \( .. \) matches
#ifdef FEAT_SYN_HL
    regsub_T    synt; // \z( .. \) matches
#endif
} regsubs_T;
3946
// nfa_pim_T stores a Postponed Invisible Match.
typedef struct nfa_pim_S nfa_pim_T;
struct nfa_pim_S
{
    int         result;         // NFA_PIM_*, see below
    nfa_state_T *state;         // the invisible match start state
    regsubs_T   subs;           // submatch info, only partly used
    // "pos" is used when REG_MULTI is TRUE, "ptr" otherwise.
    union
    {
        lpos_T  pos;
        char_u  *ptr;
    } end;                      // where the match must end
};
3960
// Values for "result" in nfa_pim_T.
#define NFA_PIM_UNUSED   0      // pim not used
#define NFA_PIM_TODO     1      // pim not done yet
#define NFA_PIM_MATCH    2      // pim executed, matches
#define NFA_PIM_NOMATCH  3      // pim executed, no match
Bram Moolenaara2d95102013-06-04 14:23:05 +02003966
3967
// nfa_thread_T contains execution information of a NFA state
typedef struct
{
    nfa_state_T *state;         // the NFA state this thread is at
    int         count;          // NOTE(review): use is not visible in this
                                // part of the file
    nfa_pim_T   pim;            // if pim.result != NFA_PIM_UNUSED: postponed
                                // invisible match
    regsubs_T   subs;           // submatch info, only partly used
} nfa_thread_T;
3977
// nfa_list_T contains the alternative NFA execution states.
// The "id" is compared with the "lastlist" entry of a state to find out
// whether that state was already added to this list.
typedef struct
{
    nfa_thread_T    *t;         // allocated array of states
    int             n;          // nr of states currently in "t"
    int             len;        // max nr of states in "t"
    int             id;         // ID of the list
    int             has_pim;    // TRUE when any state has a PIM
} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003987
Bram Moolenaar5714b802013-05-28 22:03:20 +02003988#ifdef ENABLE_LOG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01003989static void log_subexpr(regsub_T *sub);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003990
3991 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003992log_subsexpr(regsubs_T *subs)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003993{
3994 log_subexpr(&subs->norm);
3995# ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02003996 if (rex.nfa_has_zsubexpr)
Bram Moolenaar6d3a5d72013-06-06 18:04:51 +02003997 log_subexpr(&subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003998# endif
3999}
4000
Bram Moolenaar5714b802013-05-28 22:03:20 +02004001 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004002log_subexpr(regsub_T *sub)
Bram Moolenaar5714b802013-05-28 22:03:20 +02004003{
4004 int j;
4005
4006 for (j = 0; j < sub->in_use; j++)
4007 if (REG_MULTI)
Bram Moolenaar87953742013-06-05 18:52:40 +02004008 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004009 j,
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004010 sub->list.multi[j].start_col,
4011 (int)sub->list.multi[j].start_lnum,
4012 sub->list.multi[j].end_col,
4013 (int)sub->list.multi[j].end_lnum);
Bram Moolenaar5714b802013-05-28 22:03:20 +02004014 else
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004015 {
4016 char *s = (char *)sub->list.line[j].start;
4017 char *e = (char *)sub->list.line[j].end;
4018
Bram Moolenaar87953742013-06-05 18:52:40 +02004019 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004020 j,
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004021 s == NULL ? "NULL" : s,
4022 e == NULL ? "NULL" : e);
4023 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004024}
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004025
4026 static char *
Bram Moolenaar05540972016-01-30 20:31:25 +01004027pim_info(nfa_pim_T *pim)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004028{
4029 static char buf[30];
4030
4031 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4032 buf[0] = NUL;
4033 else
4034 {
4035 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
Bram Moolenaar0270f382018-07-17 05:43:58 +02004036 : (int)(pim->end.ptr - rex.input));
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004037 }
4038 return buf;
4039}
4040
Bram Moolenaar5714b802013-05-28 22:03:20 +02004041#endif
4042
// Used during execution: whether a match has been found.
static int          nfa_match;
#ifdef FEAT_RELTIME
// NOTE(review): presumably the timeout state for matching; where these are
// set and checked is not visible in this part of the file.
static proftime_T   *nfa_time_limit;
static int          *nfa_timed_out;
static int          nfa_time_count;
#endif

// Forward declarations, both functions are defined further down in this file.
static void copy_sub(regsub_T *to, regsub_T *from);
static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004053
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004054/*
4055 * Copy postponed invisible match info from "from" to "to".
4056 */
4057 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004058copy_pim(nfa_pim_T *to, nfa_pim_T *from)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004059{
4060 to->result = from->result;
4061 to->state = from->state;
4062 copy_sub(&to->subs.norm, &from->subs.norm);
4063#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004064 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004065 copy_sub(&to->subs.synt, &from->subs.synt);
4066#endif
4067 to->end = from->end;
4068}
4069
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004070 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004071clear_sub(regsub_T *sub)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004072{
4073 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004074 // Use 0xff to set lnum to -1
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004075 vim_memset(sub->list.multi, 0xff,
Bram Moolenaar0270f382018-07-17 05:43:58 +02004076 sizeof(struct multipos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004077 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004078 vim_memset(sub->list.line, 0,
4079 sizeof(struct linepos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004080 sub->in_use = 0;
4081}
4082
4083/*
4084 * Copy the submatches from "from" to "to".
4085 */
4086 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004087copy_sub(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004088{
4089 to->in_use = from->in_use;
4090 if (from->in_use > 0)
4091 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004092 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004093 if (REG_MULTI)
4094 mch_memmove(&to->list.multi[0],
4095 &from->list.multi[0],
4096 sizeof(struct multipos) * from->in_use);
4097 else
4098 mch_memmove(&to->list.line[0],
4099 &from->list.line[0],
4100 sizeof(struct linepos) * from->in_use);
4101 }
4102}
4103
4104/*
4105 * Like copy_sub() but exclude the main match.
4106 */
4107 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004108copy_sub_off(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004109{
4110 if (to->in_use < from->in_use)
4111 to->in_use = from->in_use;
4112 if (from->in_use > 1)
4113 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004114 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004115 if (REG_MULTI)
4116 mch_memmove(&to->list.multi[1],
4117 &from->list.multi[1],
4118 sizeof(struct multipos) * (from->in_use - 1));
4119 else
4120 mch_memmove(&to->list.line[1],
4121 &from->list.line[1],
4122 sizeof(struct linepos) * (from->in_use - 1));
4123 }
4124}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004125
Bram Moolenaar428e9872013-05-30 17:05:39 +02004126/*
Bram Moolenaarf2118842013-09-25 18:16:38 +02004127 * Like copy_sub() but only do the end of the main match if \ze is present.
4128 */
4129 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004130copy_ze_off(regsub_T *to, regsub_T *from)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004131{
Bram Moolenaar0270f382018-07-17 05:43:58 +02004132 if (rex.nfa_has_zend)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004133 {
4134 if (REG_MULTI)
4135 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004136 if (from->list.multi[0].end_lnum >= 0)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004137 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004138 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4139 to->list.multi[0].end_col = from->list.multi[0].end_col;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004140 }
Bram Moolenaarf2118842013-09-25 18:16:38 +02004141 }
4142 else
4143 {
4144 if (from->list.line[0].end != NULL)
4145 to->list.line[0].end = from->list.line[0].end;
4146 }
4147 }
4148}
4149
4150/*
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004151 * Return TRUE if "sub1" and "sub2" have the same start positions.
Bram Moolenaaree482532014-05-13 15:56:51 +02004152 * When using back-references also check the end position.
Bram Moolenaar428e9872013-05-30 17:05:39 +02004153 */
4154 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004155sub_equal(regsub_T *sub1, regsub_T *sub2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004156{
4157 int i;
4158 int todo;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004159 linenr_T s1;
4160 linenr_T s2;
4161 char_u *sp1;
4162 char_u *sp2;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004163
4164 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4165 if (REG_MULTI)
4166 {
4167 for (i = 0; i < todo; ++i)
4168 {
4169 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004170 s1 = sub1->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004171 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004172 s1 = -1;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004173 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004174 s2 = sub2->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004175 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004176 s2 = -1;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004177 if (s1 != s2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004178 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004179 if (s1 != -1 && sub1->list.multi[i].start_col
4180 != sub2->list.multi[i].start_col)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004181 return FALSE;
Bram Moolenaaree482532014-05-13 15:56:51 +02004182
Bram Moolenaar0270f382018-07-17 05:43:58 +02004183 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004184 {
4185 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004186 s1 = sub1->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004187 else
4188 s1 = -1;
4189 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004190 s2 = sub2->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004191 else
4192 s2 = -1;
4193 if (s1 != s2)
4194 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004195 if (s1 != -1 && sub1->list.multi[i].end_col
4196 != sub2->list.multi[i].end_col)
Bram Moolenaaree482532014-05-13 15:56:51 +02004197 return FALSE;
4198 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004199 }
4200 }
4201 else
4202 {
4203 for (i = 0; i < todo; ++i)
4204 {
4205 if (i < sub1->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004206 sp1 = sub1->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004207 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004208 sp1 = NULL;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004209 if (i < sub2->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004210 sp2 = sub2->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004211 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004212 sp2 = NULL;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004213 if (sp1 != sp2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004214 return FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004215 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004216 {
4217 if (i < sub1->in_use)
4218 sp1 = sub1->list.line[i].end;
4219 else
4220 sp1 = NULL;
4221 if (i < sub2->in_use)
4222 sp2 = sub2->list.line[i].end;
4223 else
4224 sp2 = NULL;
4225 if (sp1 != sp2)
4226 return FALSE;
4227 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004228 }
4229 }
4230
4231 return TRUE;
4232}
4233
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004234#ifdef ENABLE_LOG
4235 static void
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004236open_debug_log(int result)
4237{
4238 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
4239 if (log_fd == NULL)
4240 {
4241 emsg(_(e_log_open_failed));
4242 log_fd = stderr;
4243 }
4244
4245 fprintf(log_fd, "****************************\n");
4246 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
4247 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : result == MAYBE
4248 ? "MAYBE" : "FALSE");
4249 fprintf(log_fd, "****************************\n");
4250}
4251
4252 static void
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004253report_state(char *action,
4254 regsub_T *sub,
4255 nfa_state_T *state,
4256 int lid,
4257 nfa_pim_T *pim)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004258{
4259 int col;
4260
4261 if (sub->in_use <= 0)
4262 col = -1;
4263 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004264 col = sub->list.multi[0].start_col;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004265 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004266 col = (int)(sub->list.line[0].start - rex.line);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004267 nfa_set_code(state->c);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004268 if (log_fd == NULL)
4269 open_debug_log(MAYBE);
4270
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004271 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
4272 action, abs(state->id), lid, state->c, code, col,
4273 pim_info(pim));
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004274}
4275#endif
4276
Bram Moolenaar43e02982013-06-07 17:31:29 +02004277/*
4278 * Return TRUE if the same state is already in list "l" with the same
4279 * positions as "subs".
4280 */
4281 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004282has_state_with_pos(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004283 nfa_list_T *l, // runtime state list
4284 nfa_state_T *state, // state to update
4285 regsubs_T *subs, // pointers to subexpressions
4286 nfa_pim_T *pim) // postponed match or NULL
Bram Moolenaar43e02982013-06-07 17:31:29 +02004287{
4288 nfa_thread_T *thread;
4289 int i;
4290
4291 for (i = 0; i < l->n; ++i)
4292 {
4293 thread = &l->t[i];
4294 if (thread->state->id == state->id
4295 && sub_equal(&thread->subs.norm, &subs->norm)
4296#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004297 && (!rex.nfa_has_zsubexpr
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004298 || sub_equal(&thread->subs.synt, &subs->synt))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004299#endif
Bram Moolenaar69b52452013-07-17 21:10:51 +02004300 && pim_equal(&thread->pim, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004301 return TRUE;
4302 }
4303 return FALSE;
4304}
4305
4306/*
Bram Moolenaar69b52452013-07-17 21:10:51 +02004307 * Return TRUE if "one" and "two" are equal. That includes when both are not
4308 * set.
4309 */
4310 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004311pim_equal(nfa_pim_T *one, nfa_pim_T *two)
Bram Moolenaar69b52452013-07-17 21:10:51 +02004312{
4313 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4314 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4315
4316 if (one_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004317 // one is unused: equal when two is also unused
Bram Moolenaar69b52452013-07-17 21:10:51 +02004318 return two_unused;
4319 if (two_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004320 // one is used and two is not: not equal
Bram Moolenaar69b52452013-07-17 21:10:51 +02004321 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004322 // compare the state id
Bram Moolenaar3f0df062013-08-14 13:34:25 +02004323 if (one->state->id != two->state->id)
4324 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004325 // compare the position
Bram Moolenaar69b52452013-07-17 21:10:51 +02004326 if (REG_MULTI)
4327 return one->end.pos.lnum == two->end.pos.lnum
4328 && one->end.pos.col == two->end.pos.col;
4329 return one->end.ptr == two->end.ptr;
4330}
4331
4332/*
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004333 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4334 */
4335 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004336match_follows(nfa_state_T *startstate, int depth)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004337{
4338 nfa_state_T *state = startstate;
4339
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004340 // avoid too much recursion
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004341 if (depth > 10)
4342 return FALSE;
4343
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004344 while (state != NULL)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004345 {
4346 switch (state->c)
4347 {
4348 case NFA_MATCH:
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004349 case NFA_MCLOSE:
4350 case NFA_END_INVISIBLE:
4351 case NFA_END_INVISIBLE_NEG:
4352 case NFA_END_PATTERN:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004353 return TRUE;
4354
4355 case NFA_SPLIT:
4356 return match_follows(state->out, depth + 1)
4357 || match_follows(state->out1, depth + 1);
4358
4359 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004360 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004361 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004362 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004363 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004364 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004365 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004366 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004367 case NFA_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004368 // skip ahead to next state
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004369 state = state->out1->out;
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004370 continue;
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004371
4372 case NFA_ANY:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02004373 case NFA_ANY_COMPOSING:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004374 case NFA_IDENT:
4375 case NFA_SIDENT:
4376 case NFA_KWORD:
4377 case NFA_SKWORD:
4378 case NFA_FNAME:
4379 case NFA_SFNAME:
4380 case NFA_PRINT:
4381 case NFA_SPRINT:
4382 case NFA_WHITE:
4383 case NFA_NWHITE:
4384 case NFA_DIGIT:
4385 case NFA_NDIGIT:
4386 case NFA_HEX:
4387 case NFA_NHEX:
4388 case NFA_OCTAL:
4389 case NFA_NOCTAL:
4390 case NFA_WORD:
4391 case NFA_NWORD:
4392 case NFA_HEAD:
4393 case NFA_NHEAD:
4394 case NFA_ALPHA:
4395 case NFA_NALPHA:
4396 case NFA_LOWER:
4397 case NFA_NLOWER:
4398 case NFA_UPPER:
4399 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02004400 case NFA_LOWER_IC:
4401 case NFA_NLOWER_IC:
4402 case NFA_UPPER_IC:
4403 case NFA_NUPPER_IC:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004404 case NFA_START_COLL:
4405 case NFA_START_NEG_COLL:
4406 case NFA_NEWL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004407 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004408 return FALSE;
4409
4410 default:
4411 if (state->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004412 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004413 return FALSE;
4414
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004415 // Others: zero-width or possibly zero-width, might still find
4416 // a match at the same position, keep looking.
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004417 break;
4418 }
4419 state = state->out;
4420 }
4421 return FALSE;
4422}
4423
4424
4425/*
Bram Moolenaar43e02982013-06-07 17:31:29 +02004426 * Return TRUE if "state" is already in list "l".
4427 */
4428 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004429state_in_list(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004430 nfa_list_T *l, // runtime state list
4431 nfa_state_T *state, // state to update
4432 regsubs_T *subs) // pointers to subexpressions
Bram Moolenaar43e02982013-06-07 17:31:29 +02004433{
4434 if (state->lastlist[nfa_ll_index] == l->id)
4435 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004436 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004437 return TRUE;
4438 }
4439 return FALSE;
4440}
4441
// Offset used for "off" by addstate_here().
// addstate() treats an "off_arg" of -ADDSTATE_HERE_OFFSET or lower as a
// request to add at a list index: the index is
// -(off_arg + ADDSTATE_HERE_OFFSET) and the byte offset is then zero.
#define ADDSTATE_HERE_OFFSET 10
4444
Bram Moolenaard05bf562013-06-30 23:24:08 +02004445/*
 * Add "state" and possibly what follows to state list "l".
4447 * Returns "subs_arg", possibly copied into temp_subs.
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004448 * Returns NULL when recursiveness is too deep.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004449 */
Bram Moolenaard05bf562013-06-30 23:24:08 +02004450 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004451addstate(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004452 nfa_list_T *l, // runtime state list
4453 nfa_state_T *state, // state to update
4454 regsubs_T *subs_arg, // pointers to subexpressions
4455 nfa_pim_T *pim, // postponed look-behind match
4456 int off_arg) // byte offset, when -1 go to next line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004457{
Bram Moolenaar963fee22013-05-26 21:47:28 +02004458 int subidx;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004459 int off = off_arg;
4460 int add_here = FALSE;
4461 int listindex = 0;
4462 int k;
4463 int found = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004464 nfa_thread_T *thread;
Bram Moolenaard5638832016-09-09 17:59:50 +02004465 struct multipos save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004466 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004467 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004468 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004469 regsub_T *sub;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004470 regsubs_T *subs = subs_arg;
4471 static regsubs_T temp_subs;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004472#ifdef ENABLE_LOG
4473 int did_print = FALSE;
4474#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004475 static int depth = 0;
4476
4477 // This function is called recursively. When the depth is too much we run
4478 // out of stack and crash, limit recursiveness here.
Bram Moolenaar5382f122019-02-13 01:18:38 +01004479 if (++depth >= 5000 || subs == NULL)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004480 {
4481 --depth;
4482 return NULL;
4483 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004484
Bram Moolenaar16b35782016-09-09 20:29:50 +02004485 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4486 {
4487 add_here = TRUE;
4488 off = 0;
4489 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4490 }
4491
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004492 switch (state->c)
4493 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004494 case NFA_NCLOSE:
4495 case NFA_MCLOSE:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004496 case NFA_MCLOSE1:
4497 case NFA_MCLOSE2:
4498 case NFA_MCLOSE3:
4499 case NFA_MCLOSE4:
4500 case NFA_MCLOSE5:
4501 case NFA_MCLOSE6:
4502 case NFA_MCLOSE7:
4503 case NFA_MCLOSE8:
4504 case NFA_MCLOSE9:
4505#ifdef FEAT_SYN_HL
4506 case NFA_ZCLOSE:
4507 case NFA_ZCLOSE1:
4508 case NFA_ZCLOSE2:
4509 case NFA_ZCLOSE3:
4510 case NFA_ZCLOSE4:
4511 case NFA_ZCLOSE5:
4512 case NFA_ZCLOSE6:
4513 case NFA_ZCLOSE7:
4514 case NFA_ZCLOSE8:
4515 case NFA_ZCLOSE9:
4516#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004517 case NFA_MOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004518 case NFA_ZEND:
Bram Moolenaar927d4a12013-06-09 17:25:34 +02004519 case NFA_SPLIT:
Bram Moolenaar699c1202013-09-25 16:41:54 +02004520 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004521 // These nodes are not added themselves but their "out" and/or
4522 // "out1" may be added below.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004523 break;
4524
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004525 case NFA_BOL:
4526 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004527 // "^" won't match past end-of-line, don't bother trying.
4528 // Except when at the end of the line, or when we are going to the
4529 // next line for a look-behind match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004530 if (rex.input > rex.line
4531 && *rex.input != NUL
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004532 && (nfa_endp == NULL
4533 || !REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02004534 || rex.lnum == nfa_endp->se_u.pos.lnum))
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004535 goto skip_add;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004536 // FALLTHROUGH
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004537
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004538 case NFA_MOPEN1:
4539 case NFA_MOPEN2:
4540 case NFA_MOPEN3:
4541 case NFA_MOPEN4:
4542 case NFA_MOPEN5:
4543 case NFA_MOPEN6:
4544 case NFA_MOPEN7:
4545 case NFA_MOPEN8:
4546 case NFA_MOPEN9:
4547#ifdef FEAT_SYN_HL
4548 case NFA_ZOPEN:
4549 case NFA_ZOPEN1:
4550 case NFA_ZOPEN2:
4551 case NFA_ZOPEN3:
4552 case NFA_ZOPEN4:
4553 case NFA_ZOPEN5:
4554 case NFA_ZOPEN6:
4555 case NFA_ZOPEN7:
4556 case NFA_ZOPEN8:
4557 case NFA_ZOPEN9:
4558#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004559 case NFA_NOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004560 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004561 // These nodes need to be added so that we can bail out when it
4562 // was added to this list before at the same position to avoid an
4563 // endless loop for "\(\)*"
Bram Moolenaar307aa162013-06-02 16:34:21 +02004564
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004565 default:
Bram Moolenaar272fb582013-11-21 16:03:40 +01004566 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004567 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004568 // This state is already in the list, don't add it again,
4569 // unless it is an MOPEN that is used for a backreference or
4570 // when there is a PIM. For NFA_MATCH check the position,
4571 // lower position is preferred.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004572 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
Bram Moolenaar9c235062014-05-13 16:44:29 +02004573 && state->c != NFA_MATCH)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004574 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004575 // When called from addstate_here() do insert before
4576 // existing states.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004577 if (add_here)
4578 {
4579 for (k = 0; k < l->n && k < listindex; ++k)
4580 if (l->t[k].state->id == state->id)
4581 {
4582 found = TRUE;
4583 break;
4584 }
4585 }
4586 if (!add_here || found)
4587 {
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004588skip_add:
4589#ifdef ENABLE_LOG
Bram Moolenaar16b35782016-09-09 20:29:50 +02004590 nfa_set_code(state->c);
4591 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4592 abs(state->id), l->id, state->c, code,
4593 pim == NULL ? "NULL" : "yes", l->has_pim, found);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004594#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004595 --depth;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004596 return subs;
4597 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004598 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004599
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004600 // Do not add the state again when it exists with the same
4601 // positions.
Bram Moolenaar69b52452013-07-17 21:10:51 +02004602 if (has_state_with_pos(l, state, subs, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004603 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004604 }
4605
Bram Moolenaar688b3982019-02-13 21:47:36 +01004606 // When there are backreferences or PIMs the number of states may
4607 // be (a lot) bigger than anticipated.
Bram Moolenaara0169122013-06-26 18:16:58 +02004608 if (l->n == l->len)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004609 {
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004610 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004611 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004612 nfa_thread_T *newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004613
Bram Moolenaar688b3982019-02-13 21:47:36 +01004614 if ((long)(newsize >> 10) >= p_mmp)
4615 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004616 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004617 --depth;
4618 return NULL;
4619 }
Bram Moolenaard05bf562013-06-30 23:24:08 +02004620 if (subs != &temp_subs)
4621 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004622 // "subs" may point into the current array, need to make a
4623 // copy before it becomes invalid.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004624 copy_sub(&temp_subs.norm, &subs->norm);
4625#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004626 if (rex.nfa_has_zsubexpr)
Bram Moolenaard05bf562013-06-30 23:24:08 +02004627 copy_sub(&temp_subs.synt, &subs->synt);
4628#endif
4629 subs = &temp_subs;
4630 }
4631
Bram Moolenaar688b3982019-02-13 21:47:36 +01004632 newt = vim_realloc(l->t, newsize);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004633 if (newt == NULL)
4634 {
4635 // out of memory
4636 --depth;
4637 return NULL;
4638 }
4639 l->t = newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004640 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004641 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004642
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004643 // add the state to the list
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02004644 state->lastlist[nfa_ll_index] = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004645 thread = &l->t[l->n++];
4646 thread->state = state;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004647 if (pim == NULL)
4648 thread->pim.result = NFA_PIM_UNUSED;
4649 else
Bram Moolenaar196ed142013-07-21 18:59:24 +02004650 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004651 copy_pim(&thread->pim, pim);
Bram Moolenaar196ed142013-07-21 18:59:24 +02004652 l->has_pim = TRUE;
4653 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004654 copy_sub(&thread->subs.norm, &subs->norm);
4655#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004656 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004657 copy_sub(&thread->subs.synt, &subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004658#endif
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004659#ifdef ENABLE_LOG
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004660 report_state("Adding", &thread->subs.norm, state, l->id, pim);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004661 did_print = TRUE;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004662#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004663 }
4664
4665#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004666 if (!did_print)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004667 report_state("Processing", &subs->norm, state, l->id, pim);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004668#endif
4669 switch (state->c)
4670 {
4671 case NFA_MATCH:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004672 break;
4673
4674 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004675 // order matters here
Bram Moolenaar16b35782016-09-09 20:29:50 +02004676 subs = addstate(l, state->out, subs, pim, off_arg);
4677 subs = addstate(l, state->out1, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004678 break;
4679
Bram Moolenaar699c1202013-09-25 16:41:54 +02004680 case NFA_EMPTY:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004681 case NFA_NOPEN:
4682 case NFA_NCLOSE:
Bram Moolenaar16b35782016-09-09 20:29:50 +02004683 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004684 break;
4685
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004686 case NFA_MOPEN:
4687 case NFA_MOPEN1:
4688 case NFA_MOPEN2:
4689 case NFA_MOPEN3:
4690 case NFA_MOPEN4:
4691 case NFA_MOPEN5:
4692 case NFA_MOPEN6:
4693 case NFA_MOPEN7:
4694 case NFA_MOPEN8:
4695 case NFA_MOPEN9:
4696#ifdef FEAT_SYN_HL
4697 case NFA_ZOPEN:
4698 case NFA_ZOPEN1:
4699 case NFA_ZOPEN2:
4700 case NFA_ZOPEN3:
4701 case NFA_ZOPEN4:
4702 case NFA_ZOPEN5:
4703 case NFA_ZOPEN6:
4704 case NFA_ZOPEN7:
4705 case NFA_ZOPEN8:
4706 case NFA_ZOPEN9:
4707#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004708 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004709 if (state->c == NFA_ZSTART)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004710 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004711 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004712 sub = &subs->norm;
4713 }
4714#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004715 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004716 {
4717 subidx = state->c - NFA_ZOPEN;
4718 sub = &subs->synt;
4719 }
4720#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004721 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004722 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004723 subidx = state->c - NFA_MOPEN;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004724 sub = &subs->norm;
4725 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004726
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004727 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004728 save_ptr = NULL;
Bram Moolenaara80faa82020-04-12 19:37:17 +02004729 CLEAR_FIELD(save_multipos);
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004730
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004731 // Set the position (with "off" added) in the subexpression. Save
4732 // and restore it when it was in use. Otherwise fill any gap.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004733 if (REG_MULTI)
4734 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004735 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004736 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004737 save_multipos = sub->list.multi[subidx];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004738 save_in_use = -1;
4739 }
4740 else
4741 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004742 save_in_use = sub->in_use;
4743 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004744 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004745 sub->list.multi[i].start_lnum = -1;
4746 sub->list.multi[i].end_lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004747 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004748 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004749 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02004750 if (off == -1)
4751 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004752 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004753 sub->list.multi[subidx].start_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004754 }
4755 else
4756 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004757 sub->list.multi[subidx].start_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004758 sub->list.multi[subidx].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004759 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004760 }
Bram Moolenaarc2b717e2015-09-29 15:06:14 +02004761 sub->list.multi[subidx].end_lnum = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004762 }
4763 else
4764 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004765 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004766 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004767 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004768 save_in_use = -1;
4769 }
4770 else
4771 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004772 save_in_use = sub->in_use;
4773 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004774 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004775 sub->list.line[i].start = NULL;
4776 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004777 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004778 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004779 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02004780 sub->list.line[subidx].start = rex.input + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004781 }
4782
Bram Moolenaar16b35782016-09-09 20:29:50 +02004783 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004784 if (subs == NULL)
4785 break;
4786 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004787#ifdef FEAT_SYN_HL
4788 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4789 sub = &subs->synt;
4790 else
4791#endif
4792 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004793
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004794 if (save_in_use == -1)
4795 {
4796 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004797 sub->list.multi[subidx] = save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004798 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004799 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004800 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004801 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02004802 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004803 break;
4804
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004805 case NFA_MCLOSE:
Bram Moolenaar0270f382018-07-17 05:43:58 +02004806 if (rex.nfa_has_zend && (REG_MULTI
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004807 ? subs->norm.list.multi[0].end_lnum >= 0
Bram Moolenaar9be44812013-09-05 21:15:44 +02004808 : subs->norm.list.line[0].end != NULL))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004809 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004810 // Do not overwrite the position set by \ze.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004811 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004812 break;
4813 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004814 // FALLTHROUGH
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004815 case NFA_MCLOSE1:
4816 case NFA_MCLOSE2:
4817 case NFA_MCLOSE3:
4818 case NFA_MCLOSE4:
4819 case NFA_MCLOSE5:
4820 case NFA_MCLOSE6:
4821 case NFA_MCLOSE7:
4822 case NFA_MCLOSE8:
4823 case NFA_MCLOSE9:
4824#ifdef FEAT_SYN_HL
4825 case NFA_ZCLOSE:
4826 case NFA_ZCLOSE1:
4827 case NFA_ZCLOSE2:
4828 case NFA_ZCLOSE3:
4829 case NFA_ZCLOSE4:
4830 case NFA_ZCLOSE5:
4831 case NFA_ZCLOSE6:
4832 case NFA_ZCLOSE7:
4833 case NFA_ZCLOSE8:
4834 case NFA_ZCLOSE9:
4835#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004836 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004837 if (state->c == NFA_ZEND)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004838 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004839 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004840 sub = &subs->norm;
4841 }
4842#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004843 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004844 {
4845 subidx = state->c - NFA_ZCLOSE;
4846 sub = &subs->synt;
4847 }
4848#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004849 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004850 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004851 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004852 sub = &subs->norm;
4853 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004854
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004855 // We don't fill in gaps here, there must have been an MOPEN that
4856 // has done that.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004857 save_in_use = sub->in_use;
4858 if (sub->in_use <= subidx)
4859 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004860 if (REG_MULTI)
4861 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004862 save_multipos = sub->list.multi[subidx];
Bram Moolenaar35b23862013-05-22 23:00:40 +02004863 if (off == -1)
4864 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004865 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004866 sub->list.multi[subidx].end_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004867 }
4868 else
4869 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004870 sub->list.multi[subidx].end_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004871 sub->list.multi[subidx].end_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004872 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004873 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004874 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004875 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004876 }
4877 else
4878 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004879 save_ptr = sub->list.line[subidx].end;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004880 sub->list.line[subidx].end = rex.input + off;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004881 // avoid compiler warnings
Bram Moolenaara80faa82020-04-12 19:37:17 +02004882 CLEAR_FIELD(save_multipos);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004883 }
4884
Bram Moolenaar16b35782016-09-09 20:29:50 +02004885 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004886 if (subs == NULL)
4887 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004888 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004889#ifdef FEAT_SYN_HL
4890 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4891 sub = &subs->synt;
4892 else
4893#endif
4894 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004895
4896 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004897 sub->list.multi[subidx] = save_multipos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004898 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004899 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004900 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004901 break;
4902 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004903 --depth;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004904 return subs;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004905}
4906
4907/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02004908 * Like addstate(), but the new state(s) are put at position "*ip".
4909 * Used for zero-width matches, next state to use is the added one.
4910 * This makes sure the order of states to be tried does not change, which
4911 * matters for alternatives.
4912 */
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004913 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004914addstate_here(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004915 nfa_list_T *l, // runtime state list
4916 nfa_state_T *state, // state to update
4917 regsubs_T *subs, // pointers to subexpressions
4918 nfa_pim_T *pim, // postponed look-behind match
Bram Moolenaar05540972016-01-30 20:31:25 +01004919 int *ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004920{
4921 int tlen = l->n;
4922 int count;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004923 int listidx = *ip;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004924 regsubs_T *r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004925
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004926 // First add the state(s) at the end, so that we know how many there are.
4927 // Pass the listidx as offset (avoids adding another argument to
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00004928 // addstate()).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004929 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4930 if (r == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004931 return NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004932
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004933 // when "*ip" was at the end of the list, nothing to do
Bram Moolenaara2d95102013-06-04 14:23:05 +02004934 if (listidx + 1 == tlen)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004935 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004936
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004937 // re-order to put the new state at the current position
Bram Moolenaar4b417062013-05-25 20:19:50 +02004938 count = l->n - tlen;
Bram Moolenaara50d02d2013-06-16 15:43:50 +02004939 if (count == 0)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004940 return r; // no state got added
Bram Moolenaar428e9872013-05-30 17:05:39 +02004941 if (count == 1)
4942 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004943 // overwrite the current state
Bram Moolenaara2d95102013-06-04 14:23:05 +02004944 l->t[listidx] = l->t[l->n - 1];
Bram Moolenaar428e9872013-05-30 17:05:39 +02004945 }
4946 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004947 {
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004948 if (l->n + count - 1 >= l->len)
4949 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004950 // not enough space to move the new states, reallocate the list
4951 // and move the states to the right position
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004952 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004953 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004954 nfa_thread_T *newl;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004955
Bram Moolenaar688b3982019-02-13 21:47:36 +01004956 if ((long)(newsize >> 10) >= p_mmp)
4957 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004958 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004959 return NULL;
4960 }
Bram Moolenaarc799fe22019-05-28 23:08:19 +02004961 newl = alloc(newsize);
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004962 if (newl == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004963 return NULL;
4964 l->len = newlen;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004965 mch_memmove(&(newl[0]),
4966 &(l->t[0]),
4967 sizeof(nfa_thread_T) * listidx);
4968 mch_memmove(&(newl[listidx]),
4969 &(l->t[l->n - count]),
4970 sizeof(nfa_thread_T) * count);
4971 mch_memmove(&(newl[listidx + count]),
4972 &(l->t[listidx + 1]),
4973 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
4974 vim_free(l->t);
4975 l->t = newl;
4976 }
4977 else
4978 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004979 // make space for new states, then move them from the
4980 // end to the current position
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004981 mch_memmove(&(l->t[listidx + count]),
4982 &(l->t[listidx + 1]),
4983 sizeof(nfa_thread_T) * (l->n - listidx - 1));
4984 mch_memmove(&(l->t[listidx]),
4985 &(l->t[l->n - 1]),
4986 sizeof(nfa_thread_T) * count);
4987 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02004988 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02004989 --l->n;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004990 *ip = listidx - 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004991
4992 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004993}
4994
4995/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004996 * Check character class "class" against current character c.
4997 */
4998 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004999check_char_class(int class, int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005000{
5001 switch (class)
5002 {
5003 case NFA_CLASS_ALNUM:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005004 if (c >= 1 && c < 128 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005005 return OK;
5006 break;
5007 case NFA_CLASS_ALPHA:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005008 if (c >= 1 && c < 128 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005009 return OK;
5010 break;
5011 case NFA_CLASS_BLANK:
5012 if (c == ' ' || c == '\t')
5013 return OK;
5014 break;
5015 case NFA_CLASS_CNTRL:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005016 if (c >= 1 && c <= 127 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005017 return OK;
5018 break;
5019 case NFA_CLASS_DIGIT:
5020 if (VIM_ISDIGIT(c))
5021 return OK;
5022 break;
5023 case NFA_CLASS_GRAPH:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005024 if (c >= 1 && c <= 127 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005025 return OK;
5026 break;
5027 case NFA_CLASS_LOWER:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005028 if (MB_ISLOWER(c) && c != 170 && c != 186)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005029 return OK;
5030 break;
5031 case NFA_CLASS_PRINT:
5032 if (vim_isprintc(c))
5033 return OK;
5034 break;
5035 case NFA_CLASS_PUNCT:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005036 if (c >= 1 && c < 128 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005037 return OK;
5038 break;
5039 case NFA_CLASS_SPACE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005040 if ((c >= 9 && c <= 13) || (c == ' '))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005041 return OK;
5042 break;
5043 case NFA_CLASS_UPPER:
5044 if (MB_ISUPPER(c))
5045 return OK;
5046 break;
5047 case NFA_CLASS_XDIGIT:
5048 if (vim_isxdigit(c))
5049 return OK;
5050 break;
5051 case NFA_CLASS_TAB:
5052 if (c == '\t')
5053 return OK;
5054 break;
5055 case NFA_CLASS_RETURN:
5056 if (c == '\r')
5057 return OK;
5058 break;
5059 case NFA_CLASS_BACKSPACE:
5060 if (c == '\b')
5061 return OK;
5062 break;
5063 case NFA_CLASS_ESCAPE:
5064 if (c == '\033')
5065 return OK;
5066 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01005067 case NFA_CLASS_IDENT:
5068 if (vim_isIDc(c))
5069 return OK;
5070 break;
5071 case NFA_CLASS_KEYWORD:
5072 if (reg_iswordc(c))
5073 return OK;
5074 break;
5075 case NFA_CLASS_FNAME:
5076 if (vim_isfilec(c))
5077 return OK;
5078 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005079
5080 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005081 // should not be here :P
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00005082 siemsg(_(e_nfa_regexp_invalid_character_class_nr), class);
Bram Moolenaar417bad22013-06-07 14:08:30 +02005083 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005084 }
5085 return FAIL;
5086}
5087
Bram Moolenaar5714b802013-05-28 22:03:20 +02005088/*
5089 * Check for a match with subexpression "subidx".
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005090 * Return TRUE if it matches.
Bram Moolenaar5714b802013-05-28 22:03:20 +02005091 */
5092 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005093match_backref(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005094 regsub_T *sub, // pointers to subexpressions
Bram Moolenaar05540972016-01-30 20:31:25 +01005095 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005096 int *bytelen) // out: length of match in bytes
Bram Moolenaar5714b802013-05-28 22:03:20 +02005097{
5098 int len;
5099
5100 if (sub->in_use <= subidx)
5101 {
5102retempty:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005103 // backref was not set, match an empty string
Bram Moolenaar5714b802013-05-28 22:03:20 +02005104 *bytelen = 0;
5105 return TRUE;
5106 }
5107
5108 if (REG_MULTI)
5109 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005110 if (sub->list.multi[subidx].start_lnum < 0
5111 || sub->list.multi[subidx].end_lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005112 goto retempty;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005113 if (sub->list.multi[subidx].start_lnum == rex.lnum
5114 && sub->list.multi[subidx].end_lnum == rex.lnum)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005115 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005116 len = sub->list.multi[subidx].end_col
5117 - sub->list.multi[subidx].start_col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005118 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5119 rex.input, &len) == 0)
Bram Moolenaar580abea2013-06-14 20:31:28 +02005120 {
5121 *bytelen = len;
5122 return TRUE;
5123 }
5124 }
5125 else
5126 {
5127 if (match_with_backref(
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005128 sub->list.multi[subidx].start_lnum,
5129 sub->list.multi[subidx].start_col,
5130 sub->list.multi[subidx].end_lnum,
5131 sub->list.multi[subidx].end_col,
Bram Moolenaar580abea2013-06-14 20:31:28 +02005132 bytelen) == RA_MATCH)
5133 return TRUE;
Bram Moolenaar5714b802013-05-28 22:03:20 +02005134 }
5135 }
5136 else
5137 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005138 if (sub->list.line[subidx].start == NULL
5139 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005140 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005141 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005142 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005143 {
5144 *bytelen = len;
5145 return TRUE;
5146 }
5147 }
5148 return FALSE;
5149}
5150
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005151#ifdef FEAT_SYN_HL
5152
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005153/*
5154 * Check for a match with \z subexpression "subidx".
5155 * Return TRUE if it matches.
5156 */
5157 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005158match_zref(
5159 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005160 int *bytelen) // out: length of match in bytes
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005161{
5162 int len;
5163
5164 cleanup_zsubexpr();
5165 if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL)
5166 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005167 // backref was not set, match an empty string
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005168 *bytelen = 0;
5169 return TRUE;
5170 }
5171
5172 len = (int)STRLEN(re_extmatch_in->matches[subidx]);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005173 if (cstrncmp(re_extmatch_in->matches[subidx], rex.input, &len) == 0)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005174 {
5175 *bytelen = len;
5176 return TRUE;
5177 }
5178 return FALSE;
5179}
5180#endif
5181
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005182/*
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005183 * Save list IDs for all NFA states of "prog" into "list".
5184 * Also reset the IDs to zero.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005185 * Only used for the recursive value lastlist[1].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005186 */
5187 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005188nfa_save_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005189{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005190 int i;
5191 nfa_state_T *p;
5192
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005193 // Order in the list is reverse, it's a bit faster that way.
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005194 p = &prog->state[0];
5195 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005196 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005197 list[i] = p->lastlist[1];
5198 p->lastlist[1] = 0;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005199 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005200 }
5201}
5202
5203/*
5204 * Restore list IDs from "list" to all NFA states.
5205 */
5206 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005207nfa_restore_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005208{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005209 int i;
5210 nfa_state_T *p;
5211
5212 p = &prog->state[0];
5213 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005214 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005215 p->lastlist[1] = list[i];
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005216 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005217 }
5218}
5219
Bram Moolenaar423532e2013-05-29 21:14:42 +02005220 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005221nfa_re_num_cmp(long_u val, int op, long_u pos)
Bram Moolenaar423532e2013-05-29 21:14:42 +02005222{
5223 if (op == 1) return pos > val;
5224 if (op == 2) return pos < val;
5225 return val == pos;
5226}
5227
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01005228static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02005229
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005230/*
Bram Moolenaarf46da702013-06-02 22:37:42 +02005231 * Recursively call nfa_regmatch()
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005232 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5233 * position).
Bram Moolenaarf46da702013-06-02 22:37:42 +02005234 */
5235 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005236recursive_regmatch(
5237 nfa_state_T *state,
5238 nfa_pim_T *pim,
5239 nfa_regprog_T *prog,
5240 regsubs_T *submatch,
5241 regsubs_T *m,
Bram Moolenaar2338c322018-07-08 19:07:19 +02005242 int **listids,
5243 int *listids_len)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005244{
Bram Moolenaar0270f382018-07-17 05:43:58 +02005245 int save_reginput_col = (int)(rex.input - rex.line);
5246 int save_reglnum = rex.lnum;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005247 int save_nfa_match = nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005248 int save_nfa_listid = rex.nfa_listid;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005249 save_se_T *save_nfa_endp = nfa_endp;
5250 save_se_T endpos;
5251 save_se_T *endposp = NULL;
5252 int result;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005253 int need_restore = FALSE;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005254
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005255 if (pim != NULL)
5256 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005257 // start at the position where the postponed match was
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005258 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005259 rex.input = rex.line + pim->end.pos.col;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005260 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005261 rex.input = pim->end.ptr;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005262 }
5263
Bram Moolenaardecd9542013-06-07 16:31:50 +02005264 if (state->c == NFA_START_INVISIBLE_BEFORE
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01005265 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5266 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5267 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005268 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005269 // The recursive match must end at the current position. When "pim" is
5270 // not NULL it specifies the current position.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005271 endposp = &endpos;
5272 if (REG_MULTI)
5273 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005274 if (pim == NULL)
5275 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005276 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5277 endpos.se_u.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005278 }
5279 else
5280 endpos.se_u.pos = pim->end.pos;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005281 }
5282 else
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005283 {
5284 if (pim == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005285 endpos.se_u.ptr = rex.input;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005286 else
5287 endpos.se_u.ptr = pim->end.ptr;
5288 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005289
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005290 // Go back the specified number of bytes, or as far as the
5291 // start of the previous line, to try matching "\@<=" or
5292 // not matching "\@<!". This is very inefficient, limit the number of
5293 // bytes if possible.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005294 if (state->val <= 0)
5295 {
5296 if (REG_MULTI)
5297 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005298 rex.line = reg_getline(--rex.lnum);
5299 if (rex.line == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005300 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005301 rex.line = reg_getline(++rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005302 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005303 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005304 }
5305 else
5306 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005307 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005308 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005309 // Not enough bytes in this line, go to end of
5310 // previous line.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005311 rex.line = reg_getline(--rex.lnum);
5312 if (rex.line == NULL)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005313 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005314 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005315 rex.line = reg_getline(++rex.lnum);
5316 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005317 }
5318 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005319 rex.input = rex.line + STRLEN(rex.line);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005320 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005321 if ((int)(rex.input - rex.line) >= state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005322 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005323 rex.input -= state->val;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005324 if (has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005325 rex.input -= mb_head_off(rex.line, rex.input);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005326 }
5327 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005328 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005329 }
5330 }
5331
Bram Moolenaarf46da702013-06-02 22:37:42 +02005332#ifdef ENABLE_LOG
5333 if (log_fd != stderr)
5334 fclose(log_fd);
5335 log_fd = NULL;
5336#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005337 // Have to clear the lastlist field of the NFA nodes, so that
5338 // nfa_regmatch() and addstate() can run properly after recursion.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005339 if (nfa_ll_index == 1)
5340 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005341 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5342 // values and clear them.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005343 if (*listids == NULL || *listids_len < prog->nstate)
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005344 {
Bram Moolenaar2338c322018-07-08 19:07:19 +02005345 vim_free(*listids);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005346 *listids = ALLOC_MULT(int, prog->nstate);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005347 if (*listids == NULL)
5348 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00005349 emsg(_(e_nfa_regexp_could_not_allocate_memory_for_branch_traversal));
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005350 return 0;
5351 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005352 *listids_len = prog->nstate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005353 }
5354 nfa_save_listids(prog, *listids);
5355 need_restore = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005356 // any value of rex.nfa_listid will do
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005357 }
5358 else
5359 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005360 // First recursive nfa_regmatch() call, switch to the second lastlist
5361 // entry. Make sure rex.nfa_listid is different from a previous
5362 // recursive call, because some states may still have this ID.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005363 ++nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005364 if (rex.nfa_listid <= rex.nfa_alt_listid)
5365 rex.nfa_listid = rex.nfa_alt_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005366 }
5367
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005368 // Call nfa_regmatch() to check if the current concat matches at this
5369 // position. The concat ends with the node NFA_END_INVISIBLE
Bram Moolenaarf46da702013-06-02 22:37:42 +02005370 nfa_endp = endposp;
5371 result = nfa_regmatch(prog, state->out, submatch, m);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005372
5373 if (need_restore)
5374 nfa_restore_listids(prog, *listids);
5375 else
5376 {
5377 --nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005378 rex.nfa_alt_listid = rex.nfa_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005379 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005380
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005381 // restore position in input text
Bram Moolenaar0270f382018-07-17 05:43:58 +02005382 rex.lnum = save_reglnum;
Bram Moolenaar484d2412013-06-13 19:47:07 +02005383 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005384 rex.line = reg_getline(rex.lnum);
5385 rex.input = rex.line + save_reginput_col;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005386 if (result != NFA_TOO_EXPENSIVE)
5387 {
5388 nfa_match = save_nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005389 rex.nfa_listid = save_nfa_listid;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005390 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005391 nfa_endp = save_nfa_endp;
5392
5393#ifdef ENABLE_LOG
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005394 open_debug_log(result);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005395#endif
5396
5397 return result;
5398}
5399
Bram Moolenaara2d95102013-06-04 14:23:05 +02005400/*
5401 * Estimate the chance of a match with "state" failing.
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005402 * empty match: 0
Bram Moolenaara2d95102013-06-04 14:23:05 +02005403 * NFA_ANY: 1
5404 * specific character: 99
5405 */
5406 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005407failure_chance(nfa_state_T *state, int depth)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005408{
5409 int c = state->c;
5410 int l, r;
5411
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005412 // detect looping
Bram Moolenaara2d95102013-06-04 14:23:05 +02005413 if (depth > 4)
5414 return 1;
5415
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005416 switch (c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005417 {
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005418 case NFA_SPLIT:
5419 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005420 // avoid recursive stuff
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005421 return 1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005422 // two alternatives, use the lowest failure chance
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005423 l = failure_chance(state->out, depth + 1);
5424 r = failure_chance(state->out1, depth + 1);
5425 return l < r ? l : r;
5426
5427 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005428 // matches anything, unlikely to fail
Bram Moolenaara2d95102013-06-04 14:23:05 +02005429 return 1;
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005430
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005431 case NFA_MATCH:
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005432 case NFA_MCLOSE:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005433 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005434 // empty match works always
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005435 return 0;
5436
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005437 case NFA_START_INVISIBLE:
5438 case NFA_START_INVISIBLE_FIRST:
5439 case NFA_START_INVISIBLE_NEG:
5440 case NFA_START_INVISIBLE_NEG_FIRST:
5441 case NFA_START_INVISIBLE_BEFORE:
5442 case NFA_START_INVISIBLE_BEFORE_FIRST:
5443 case NFA_START_INVISIBLE_BEFORE_NEG:
5444 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5445 case NFA_START_PATTERN:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005446 // recursive regmatch is expensive, use low failure chance
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005447 return 5;
5448
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005449 case NFA_BOL:
5450 case NFA_EOL:
5451 case NFA_BOF:
5452 case NFA_EOF:
5453 case NFA_NEWL:
5454 return 99;
5455
5456 case NFA_BOW:
5457 case NFA_EOW:
5458 return 90;
5459
5460 case NFA_MOPEN:
5461 case NFA_MOPEN1:
5462 case NFA_MOPEN2:
5463 case NFA_MOPEN3:
5464 case NFA_MOPEN4:
5465 case NFA_MOPEN5:
5466 case NFA_MOPEN6:
5467 case NFA_MOPEN7:
5468 case NFA_MOPEN8:
5469 case NFA_MOPEN9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005470#ifdef FEAT_SYN_HL
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005471 case NFA_ZOPEN:
5472 case NFA_ZOPEN1:
5473 case NFA_ZOPEN2:
5474 case NFA_ZOPEN3:
5475 case NFA_ZOPEN4:
5476 case NFA_ZOPEN5:
5477 case NFA_ZOPEN6:
5478 case NFA_ZOPEN7:
5479 case NFA_ZOPEN8:
5480 case NFA_ZOPEN9:
5481 case NFA_ZCLOSE:
5482 case NFA_ZCLOSE1:
5483 case NFA_ZCLOSE2:
5484 case NFA_ZCLOSE3:
5485 case NFA_ZCLOSE4:
5486 case NFA_ZCLOSE5:
5487 case NFA_ZCLOSE6:
5488 case NFA_ZCLOSE7:
5489 case NFA_ZCLOSE8:
5490 case NFA_ZCLOSE9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005491#endif
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005492 case NFA_NOPEN:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005493 case NFA_MCLOSE1:
5494 case NFA_MCLOSE2:
5495 case NFA_MCLOSE3:
5496 case NFA_MCLOSE4:
5497 case NFA_MCLOSE5:
5498 case NFA_MCLOSE6:
5499 case NFA_MCLOSE7:
5500 case NFA_MCLOSE8:
5501 case NFA_MCLOSE9:
5502 case NFA_NCLOSE:
5503 return failure_chance(state->out, depth + 1);
5504
5505 case NFA_BACKREF1:
5506 case NFA_BACKREF2:
5507 case NFA_BACKREF3:
5508 case NFA_BACKREF4:
5509 case NFA_BACKREF5:
5510 case NFA_BACKREF6:
5511 case NFA_BACKREF7:
5512 case NFA_BACKREF8:
5513 case NFA_BACKREF9:
5514#ifdef FEAT_SYN_HL
5515 case NFA_ZREF1:
5516 case NFA_ZREF2:
5517 case NFA_ZREF3:
5518 case NFA_ZREF4:
5519 case NFA_ZREF5:
5520 case NFA_ZREF6:
5521 case NFA_ZREF7:
5522 case NFA_ZREF8:
5523 case NFA_ZREF9:
5524#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005525 // backreferences don't match in many places
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005526 return 94;
5527
5528 case NFA_LNUM_GT:
5529 case NFA_LNUM_LT:
5530 case NFA_COL_GT:
5531 case NFA_COL_LT:
5532 case NFA_VCOL_GT:
5533 case NFA_VCOL_LT:
5534 case NFA_MARK_GT:
5535 case NFA_MARK_LT:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005536 case NFA_VISUAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005537 // before/after positions don't match very often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005538 return 85;
5539
5540 case NFA_LNUM:
5541 return 90;
5542
5543 case NFA_CURSOR:
5544 case NFA_COL:
5545 case NFA_VCOL:
5546 case NFA_MARK:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005547 // specific positions rarely match
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005548 return 98;
5549
5550 case NFA_COMPOSING:
5551 return 95;
5552
5553 default:
5554 if (c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005555 // character match fails often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005556 return 95;
5557 }
5558
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005559 // something else, includes character classes
Bram Moolenaara2d95102013-06-04 14:23:05 +02005560 return 50;
5561}
5562
Bram Moolenaarf46da702013-06-02 22:37:42 +02005563/*
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005564 * Skip until the char "c" we know a match must start with.
5565 */
5566 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005567skip_to_start(int c, colnr_T *colp)
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005568{
5569 char_u *s;
5570
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005571 // Used often, do some work to avoid call overhead.
Bram Moolenaara12a1612019-01-24 16:39:02 +01005572 if (!rex.reg_ic && !has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005573 s = vim_strbyte(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005574 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005575 s = cstrchr(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005576 if (s == NULL)
5577 return FAIL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005578 *colp = (int)(s - rex.line);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005579 return OK;
5580}
5581
5582/*
Bram Moolenaar473de612013-06-08 18:19:48 +02005583 * Check for a match with match_text.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02005584 * Called after skip_to_start() has found regstart.
Bram Moolenaar473de612013-06-08 18:19:48 +02005585 * Returns zero for no match, 1 for a match.
5586 */
5587 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01005588find_match_text(colnr_T startcol, int regstart, char_u *match_text)
Bram Moolenaar473de612013-06-08 18:19:48 +02005589{
5590 colnr_T col = startcol;
5591 int c1, c2;
5592 int len1, len2;
5593 int match;
5594
5595 for (;;)
5596 {
5597 match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005598 len2 = MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005599 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5600 {
5601 c1 = PTR2CHAR(match_text + len1);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005602 c2 = PTR2CHAR(rex.line + col + len2);
Bram Moolenaar59de4172020-06-09 19:34:54 +02005603 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
Bram Moolenaar473de612013-06-08 18:19:48 +02005604 {
5605 match = FALSE;
5606 break;
5607 }
Bram Moolenaar65b60562021-09-07 19:26:53 +02005608 len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5609 : MB_CHAR2LEN(c2);
Bram Moolenaar473de612013-06-08 18:19:48 +02005610 }
5611 if (match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005612 // check that no composing char follows
Bram Moolenaar473de612013-06-08 18:19:48 +02005613 && !(enc_utf8
Bram Moolenaara12a1612019-01-24 16:39:02 +01005614 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
Bram Moolenaar473de612013-06-08 18:19:48 +02005615 {
5616 cleanup_subexpr();
5617 if (REG_MULTI)
5618 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005619 rex.reg_startpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005620 rex.reg_startpos[0].col = col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005621 rex.reg_endpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005622 rex.reg_endpos[0].col = col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005623 }
5624 else
5625 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005626 rex.reg_startp[0] = rex.line + col;
5627 rex.reg_endp[0] = rex.line + col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005628 }
5629 return 1L;
5630 }
5631
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005632 // Try finding regstart after the current match.
5633 col += MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005634 if (skip_to_start(regstart, &col) == FAIL)
5635 break;
5636 }
5637 return 0L;
5638}
5639
#ifdef FEAT_RELTIME
/*
 * Check whether the time limit "nfa_time_limit" has been passed.
 * When it has and "nfa_timed_out" is not NULL, the flag it points to is set
 * to TRUE.
 * Returns TRUE when the limit was passed, FALSE when there is no limit or it
 * has not been passed.
 */
    static int
nfa_did_time_out(void)
{
    if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit))
    {
	if (nfa_timed_out != NULL)
	    *nfa_timed_out = TRUE;
	return TRUE;
    }
    return FALSE;
}
#endif
5653
Bram Moolenaar473de612013-06-08 18:19:48 +02005654/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005655 * Main matching routine.
5656 *
Bram Moolenaar0270f382018-07-17 05:43:58 +02005657 * Run NFA to determine whether it matches rex.input.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005658 *
Bram Moolenaar307aa162013-06-02 16:34:21 +02005659 * When "nfa_endp" is not NULL it is a required end-of-match position.
Bram Moolenaar61602c52013-06-01 19:54:43 +02005660 *
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005661 * Return TRUE if there is a match, FALSE if there is no match,
5662 * NFA_TOO_EXPENSIVE if we end up with too many states.
Bram Moolenaarf2118842013-09-25 18:16:38 +02005663 * When there is a match "submatch" contains the positions.
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005664 *
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005665 * Note: Caller must ensure that: start != NULL.
5666 */
5667 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005668nfa_regmatch(
5669 nfa_regprog_T *prog,
5670 nfa_state_T *start,
5671 regsubs_T *submatch,
5672 regsubs_T *m)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005673{
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005674 int result = FALSE;
Bram Moolenaaraaf30472015-01-27 14:40:00 +01005675 size_t size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005676 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005677 int go_to_nextline = FALSE;
5678 nfa_thread_T *t;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005679 nfa_list_T list[2];
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005680 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005681 nfa_list_T *thislist;
5682 nfa_list_T *nextlist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005683 int *listids = NULL;
Bram Moolenaar2338c322018-07-08 19:07:19 +02005684 int listids_len = 0;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005685 nfa_state_T *add_state;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005686 int add_here;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005687 int add_count;
Bram Moolenaar4380d1e2013-06-09 20:51:00 +02005688 int add_off = 0;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005689 int toplevel = start->c == NFA_MOPEN;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005690 regsubs_T *r;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005691#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005692 FILE *debug;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005693#endif
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005694
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005695 // Some patterns may take a long time to match, especially when using
5696 // recursive_regmatch(). Allow interrupting them with CTRL-C.
Bram Moolenaar41f12052013-08-25 17:01:42 +02005697 fast_breakcheck();
5698 if (got_int)
5699 return FALSE;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005700#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005701 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005702 return FALSE;
5703#endif
Bram Moolenaar41f12052013-08-25 17:01:42 +02005704
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005705#ifdef NFA_REGEXP_DEBUG_LOG
5706 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5707 if (debug == NULL)
5708 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005709 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005710 return FALSE;
5711 }
5712#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02005713 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005714
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005715 // Allocate memory for the lists of nodes.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005716 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005717
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005718 list[0].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005719 list[0].len = prog->nstate + 1;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005720 list[1].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005721 list[1].len = prog->nstate + 1;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005722 if (list[0].t == NULL || list[1].t == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005723 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005724
5725#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02005726 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005727 if (log_fd == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005728 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005729 emsg(_(e_log_open_failed));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005730 log_fd = stderr;
5731 }
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005732 fprintf(log_fd, "**********************************\n");
5733 nfa_set_code(start->c);
5734 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5735 abs(start->id), code);
5736 fprintf(log_fd, "**********************************\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005737#endif
5738
5739 thislist = &list[0];
5740 thislist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005741 thislist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005742 nextlist = &list[1];
5743 nextlist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005744 nextlist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005745#ifdef ENABLE_LOG
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005746 fprintf(log_fd, "(---) STARTSTATE first\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005747#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02005748 thislist->id = rex.nfa_listid + 1;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005749
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005750 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5751 // it's the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005752 if (toplevel)
5753 {
5754 if (REG_MULTI)
5755 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005756 m->norm.list.multi[0].start_lnum = rex.lnum;
5757 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005758 }
5759 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005760 m->norm.list.line[0].start = rex.input;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005761 m->norm.in_use = 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005762 r = addstate(thislist, start->out, m, NULL, 0);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005763 }
5764 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005765 r = addstate(thislist, start, m, NULL, 0);
5766 if (r == NULL)
5767 {
5768 nfa_match = NFA_TOO_EXPENSIVE;
5769 goto theend;
5770 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005771
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005772#define ADD_STATE_IF_MATCH(state) \
5773 if (result) { \
Bram Moolenaara2d95102013-06-04 14:23:05 +02005774 add_state = state->out; \
5775 add_off = clen; \
5776 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005777
5778 /*
5779 * Run for each character.
5780 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02005781 for (;;)
5782 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005783 int curc;
5784 int clen;
5785
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005786 if (has_mbyte)
5787 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005788 curc = (*mb_ptr2char)(rex.input);
5789 clen = (*mb_ptr2len)(rex.input);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005790 }
5791 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005792 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005793 curc = *rex.input;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005794 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005795 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005796 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02005797 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005798 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005799 go_to_nextline = FALSE;
5800 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005801
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005802 // swap lists
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005803 thislist = &list[flag];
5804 nextlist = &list[flag ^= 1];
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005805 nextlist->n = 0; // clear nextlist
Bram Moolenaar196ed142013-07-21 18:59:24 +02005806 nextlist->has_pim = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005807 ++rex.nfa_listid;
Bram Moolenaarbcf94422018-06-23 14:21:42 +02005808 if (prog->re_engine == AUTOMATIC_ENGINE
Bram Moolenaar0270f382018-07-17 05:43:58 +02005809 && (rex.nfa_listid >= NFA_MAX_STATES
Bram Moolenaar5ec74142018-06-23 17:14:41 +02005810# ifdef FEAT_EVAL
5811 || nfa_fail_for_testing
5812# endif
5813 ))
Bram Moolenaarfda37292014-11-05 14:27:36 +01005814 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005815 // too many states, retry with old engine
Bram Moolenaarfda37292014-11-05 14:27:36 +01005816 nfa_match = NFA_TOO_EXPENSIVE;
5817 goto theend;
5818 }
5819
Bram Moolenaar0270f382018-07-17 05:43:58 +02005820 thislist->id = rex.nfa_listid;
5821 nextlist->id = rex.nfa_listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005822
5823#ifdef ENABLE_LOG
5824 fprintf(log_fd, "------------------------------------------\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02005825 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005826 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005827 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005828 {
5829 int i;
5830
5831 for (i = 0; i < thislist->n; i++)
5832 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5833 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005834 fprintf(log_fd, "\n");
5835#endif
5836
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005837#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005838 fprintf(debug, "\n-------------------\n");
5839#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005840 /*
5841 * If the state lists are empty we can stop.
5842 */
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005843 if (thislist->n == 0)
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005844 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005845
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005846 // compute nextlist
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005847 for (listidx = 0; listidx < thislist->n; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005848 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005849 // If the list gets very long there probably is something wrong.
5850 // At least allow interrupting with CTRL-C.
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005851 fast_breakcheck();
5852 if (got_int)
5853 break;
5854#ifdef FEAT_RELTIME
5855 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
5856 {
5857 nfa_time_count = 0;
5858 if (nfa_did_time_out())
5859 break;
5860 }
5861#endif
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005862 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005863
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005864#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005865 nfa_set_code(t->state->c);
5866 fprintf(debug, "%s, ", code);
5867#endif
5868#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005869 {
5870 int col;
5871
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02005872 if (t->subs.norm.in_use <= 0)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005873 col = -1;
5874 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005875 col = t->subs.norm.list.multi[0].start_col;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005876 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005877 col = (int)(t->subs.norm.list.line[0].start - rex.line);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005878 nfa_set_code(t->state->c);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005879 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005880 abs(t->state->id), (int)t->state->c, code, col,
5881 pim_info(&t->pim));
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005882 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005883#endif
5884
5885 /*
5886 * Handle the possible codes of the current state.
5887 * The most important is NFA_MATCH.
5888 */
Bram Moolenaara2d95102013-06-04 14:23:05 +02005889 add_state = NULL;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005890 add_here = FALSE;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005891 add_count = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005892 switch (t->state->c)
5893 {
5894 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005895 {
Bram Moolenaaref2dff52020-12-21 14:54:32 +01005896 // If the match is not at the start of the line, ends before a
5897 // composing characters and rex.reg_icombine is not set, that
5898 // is not really a match.
5899 if (enc_utf8 && !rex.reg_icombine
5900 && rex.input != rex.line && utf_iscomposing(curc))
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005901 break;
Bram Moolenaara12a1612019-01-24 16:39:02 +01005902
Bram Moolenaar963fee22013-05-26 21:47:28 +02005903 nfa_match = TRUE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005904 copy_sub(&submatch->norm, &t->subs.norm);
5905#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005906 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005907 copy_sub(&submatch->synt, &t->subs.synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005908#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005909#ifdef ENABLE_LOG
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005910 log_subsexpr(&t->subs);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005911#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005912 // Found the left-most longest match, do not look at any other
5913 // states at this position. When the list of states is going
5914 // to be empty quit without advancing, so that "rex.input" is
5915 // correct.
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005916 if (nextlist->n == 0)
Bram Moolenaar57a285b2013-05-26 16:57:28 +02005917 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005918 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005919 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005920
5921 case NFA_END_INVISIBLE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005922 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02005923 case NFA_END_PATTERN:
Bram Moolenaarf46da702013-06-02 22:37:42 +02005924 /*
5925 * This is only encountered after a NFA_START_INVISIBLE or
Bram Moolenaar61602c52013-06-01 19:54:43 +02005926 * NFA_START_INVISIBLE_BEFORE node.
5927 * They surround a zero-width group, used with "\@=", "\&",
5928 * "\@!", "\@<=" and "\@<!".
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005929 * If we got here, it means that the current "invisible" group
5930 * finished successfully, so return control to the parent
Bram Moolenaarf46da702013-06-02 22:37:42 +02005931 * nfa_regmatch(). For a look-behind match only when it ends
5932 * in the position in "nfa_endp".
5933 * Submatches are stored in *m, and used in the parent call.
5934 */
Bram Moolenaar61602c52013-06-01 19:54:43 +02005935#ifdef ENABLE_LOG
Bram Moolenaarf46da702013-06-02 22:37:42 +02005936 if (nfa_endp != NULL)
5937 {
5938 if (REG_MULTI)
5939 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005940 (int)rex.lnum,
Bram Moolenaarf46da702013-06-02 22:37:42 +02005941 (int)nfa_endp->se_u.pos.lnum,
Bram Moolenaar0270f382018-07-17 05:43:58 +02005942 (int)(rex.input - rex.line),
Bram Moolenaarf46da702013-06-02 22:37:42 +02005943 nfa_endp->se_u.pos.col);
5944 else
5945 fprintf(log_fd, "Current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005946 (int)(rex.input - rex.line),
5947 (int)(nfa_endp->se_u.ptr - rex.input));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005948 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005949#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005950 // If "nfa_endp" is set it's only a match if it ends at
5951 // "nfa_endp"
Bram Moolenaarf46da702013-06-02 22:37:42 +02005952 if (nfa_endp != NULL && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02005953 ? (rex.lnum != nfa_endp->se_u.pos.lnum
5954 || (int)(rex.input - rex.line)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005955 != nfa_endp->se_u.pos.col)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005956 : rex.input != nfa_endp->se_u.ptr))
Bram Moolenaarf46da702013-06-02 22:37:42 +02005957 break;
5958
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005959 // do not set submatches for \@!
Bram Moolenaardecd9542013-06-07 16:31:50 +02005960 if (t->state->c != NFA_END_INVISIBLE_NEG)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005961 {
5962 copy_sub(&m->norm, &t->subs.norm);
5963#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005964 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005965 copy_sub(&m->synt, &t->subs.synt);
5966#endif
5967 }
Bram Moolenaar87953742013-06-05 18:52:40 +02005968#ifdef ENABLE_LOG
5969 fprintf(log_fd, "Match found:\n");
5970 log_subsexpr(m);
5971#endif
Bram Moolenaarf46da702013-06-02 22:37:42 +02005972 nfa_match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005973 // See comment above at "goto nextchar".
Bram Moolenaar78c93e42013-09-05 16:05:36 +02005974 if (nextlist->n == 0)
5975 clen = 0;
5976 goto nextchar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005977
5978 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005979 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005980 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005981 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar61602c52013-06-01 19:54:43 +02005982 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005983 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005984 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005985 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005986 {
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005987#ifdef ENABLE_LOG
5988 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
5989 failure_chance(t->state->out, 0),
5990 failure_chance(t->state->out1->out, 0));
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005991#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005992 // Do it directly if there already is a PIM or when
5993 // nfa_postprocess() detected it will work better.
Bram Moolenaara2947e22013-06-11 22:44:09 +02005994 if (t->pim.result != NFA_PIM_UNUSED
5995 || t->state->c == NFA_START_INVISIBLE_FIRST
5996 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
5997 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5998 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005999 {
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006000 int in_use = m->norm.in_use;
6001
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006002 // Copy submatch info for the recursive call, opposite
6003 // of what happens on success below.
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006004 copy_sub_off(&m->norm, &t->subs.norm);
Bram Moolenaar699c1202013-09-25 16:41:54 +02006005#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006006 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006007 copy_sub_off(&m->synt, &t->subs.synt);
6008#endif
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006009
Bram Moolenaara2d95102013-06-04 14:23:05 +02006010 /*
6011 * First try matching the invisible match, then what
6012 * follows.
6013 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006014 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006015 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006016 if (result == NFA_TOO_EXPENSIVE)
6017 {
6018 nfa_match = result;
6019 goto theend;
6020 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006021
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006022 // for \@! and \@<! it is a match when the result is
6023 // FALSE
Bram Moolenaardecd9542013-06-07 16:31:50 +02006024 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006025 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6026 || t->state->c
6027 == NFA_START_INVISIBLE_BEFORE_NEG
6028 || t->state->c
6029 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006030 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006031 // Copy submatch info from the recursive call
Bram Moolenaara2d95102013-06-04 14:23:05 +02006032 copy_sub_off(&t->subs.norm, &m->norm);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006033#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006034 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006035 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006036#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006037 // If the pattern has \ze and it matched in the
6038 // sub pattern, use it.
Bram Moolenaarf2118842013-09-25 18:16:38 +02006039 copy_ze_off(&t->subs.norm, &m->norm);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02006040
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006041 // t->state->out1 is the corresponding
6042 // END_INVISIBLE node; Add its out to the current
6043 // list (zero-width match).
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006044 add_here = TRUE;
6045 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006046 }
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006047 m->norm.in_use = in_use;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006048 }
6049 else
6050 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006051 nfa_pim_T pim;
6052
Bram Moolenaara2d95102013-06-04 14:23:05 +02006053 /*
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006054 * First try matching what follows. Only if a match
6055 * is found verify the invisible match matches. Add a
6056 * nfa_pim_T to the following states, it contains info
6057 * about the invisible match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006058 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006059 pim.state = t->state;
6060 pim.result = NFA_PIM_TODO;
6061 pim.subs.norm.in_use = 0;
6062#ifdef FEAT_SYN_HL
6063 pim.subs.synt.in_use = 0;
6064#endif
6065 if (REG_MULTI)
6066 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006067 pim.end.pos.col = (int)(rex.input - rex.line);
6068 pim.end.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006069 }
6070 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006071 pim.end.ptr = rex.input;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006072
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006073 // t->state->out1 is the corresponding END_INVISIBLE
6074 // node; Add its out to the current list (zero-width
6075 // match).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006076 if (addstate_here(thislist, t->state->out1->out,
6077 &t->subs, &pim, &listidx) == NULL)
6078 {
6079 nfa_match = NFA_TOO_EXPENSIVE;
6080 goto theend;
6081 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006082 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006083 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006084 break;
6085
Bram Moolenaar87953742013-06-05 18:52:40 +02006086 case NFA_START_PATTERN:
Bram Moolenaar43e02982013-06-07 17:31:29 +02006087 {
6088 nfa_state_T *skip = NULL;
6089#ifdef ENABLE_LOG
6090 int skip_lid = 0;
6091#endif
6092
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006093 // There is no point in trying to match the pattern if the
6094 // output state is not going to be added to the list.
Bram Moolenaar43e02982013-06-07 17:31:29 +02006095 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6096 {
6097 skip = t->state->out1->out;
6098#ifdef ENABLE_LOG
6099 skip_lid = nextlist->id;
6100#endif
6101 }
6102 else if (state_in_list(nextlist,
6103 t->state->out1->out->out, &t->subs))
6104 {
6105 skip = t->state->out1->out->out;
6106#ifdef ENABLE_LOG
6107 skip_lid = nextlist->id;
6108#endif
6109 }
Bram Moolenaar44c71db2013-06-14 22:33:51 +02006110 else if (state_in_list(thislist,
Bram Moolenaar43e02982013-06-07 17:31:29 +02006111 t->state->out1->out->out, &t->subs))
6112 {
6113 skip = t->state->out1->out->out;
6114#ifdef ENABLE_LOG
6115 skip_lid = thislist->id;
6116#endif
6117 }
6118 if (skip != NULL)
6119 {
6120#ifdef ENABLE_LOG
6121 nfa_set_code(skip->c);
6122 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6123 abs(skip->id), skip_lid, skip->c, code);
6124#endif
6125 break;
6126 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006127 // Copy submatch info to the recursive call, opposite of what
6128 // happens afterwards.
Bram Moolenaar699c1202013-09-25 16:41:54 +02006129 copy_sub_off(&m->norm, &t->subs.norm);
6130#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006131 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006132 copy_sub_off(&m->synt, &t->subs.synt);
6133#endif
Bram Moolenaar43e02982013-06-07 17:31:29 +02006134
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006135 // First try matching the pattern.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006136 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006137 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006138 if (result == NFA_TOO_EXPENSIVE)
6139 {
6140 nfa_match = result;
6141 goto theend;
6142 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006143 if (result)
6144 {
6145 int bytelen;
6146
6147#ifdef ENABLE_LOG
6148 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6149 log_subsexpr(m);
6150#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006151 // Copy submatch info from the recursive call
Bram Moolenaar87953742013-06-05 18:52:40 +02006152 copy_sub_off(&t->subs.norm, &m->norm);
6153#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006154 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006155 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaar87953742013-06-05 18:52:40 +02006156#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006157 // Now we need to skip over the matched text and then
6158 // continue with what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02006159 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006160 // TODO: multi-line match
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01006161 bytelen = m->norm.list.multi[0].end_col
Bram Moolenaar0270f382018-07-17 05:43:58 +02006162 - (int)(rex.input - rex.line);
Bram Moolenaar87953742013-06-05 18:52:40 +02006163 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006164 bytelen = (int)(m->norm.list.line[0].end - rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02006165
6166#ifdef ENABLE_LOG
6167 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6168#endif
6169 if (bytelen == 0)
6170 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006171 // empty match, output of corresponding
6172 // NFA_END_PATTERN/NFA_SKIP to be used at current
6173 // position
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006174 add_here = TRUE;
6175 add_state = t->state->out1->out->out;
Bram Moolenaar87953742013-06-05 18:52:40 +02006176 }
6177 else if (bytelen <= clen)
6178 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006179 // match current character, output of corresponding
6180 // NFA_END_PATTERN to be used at next position.
Bram Moolenaar87953742013-06-05 18:52:40 +02006181 add_state = t->state->out1->out->out;
6182 add_off = clen;
6183 }
6184 else
6185 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006186 // skip over the matched characters, set character
6187 // count in NFA_SKIP
Bram Moolenaar87953742013-06-05 18:52:40 +02006188 add_state = t->state->out1->out;
6189 add_off = bytelen;
6190 add_count = bytelen - clen;
6191 }
6192 }
6193 break;
Bram Moolenaar43e02982013-06-07 17:31:29 +02006194 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006195
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006196 case NFA_BOL:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006197 if (rex.input == rex.line)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006198 {
6199 add_here = TRUE;
6200 add_state = t->state->out;
6201 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006202 break;
6203
6204 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006205 if (curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006206 {
6207 add_here = TRUE;
6208 add_state = t->state->out;
6209 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006210 break;
6211
6212 case NFA_BOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006213 result = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006214
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006215 if (curc == NUL)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006216 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006217 else if (has_mbyte)
6218 {
6219 int this_class;
6220
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006221 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006222 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006223 if (this_class <= 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006224 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006225 else if (reg_prev_class() == this_class)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006226 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006227 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006228 else if (!vim_iswordc_buf(curc, rex.reg_buf)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006229 || (rex.input > rex.line
6230 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006231 result = FALSE;
6232 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006233 {
6234 add_here = TRUE;
6235 add_state = t->state->out;
6236 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006237 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006238
6239 case NFA_EOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006240 result = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006241 if (rex.input == rex.line)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006242 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006243 else if (has_mbyte)
6244 {
6245 int this_class, prev_class;
6246
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006247 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006248 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006249 prev_class = reg_prev_class();
6250 if (this_class == prev_class
6251 || prev_class == 0 || prev_class == 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006252 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006253 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02006254 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6255 || (rex.input[0] != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006256 && vim_iswordc_buf(curc, rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006257 result = FALSE;
6258 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006259 {
6260 add_here = TRUE;
6261 add_state = t->state->out;
6262 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006263 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006264
Bram Moolenaar4b780632013-05-31 22:14:52 +02006265 case NFA_BOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006266 if (rex.lnum == 0 && rex.input == rex.line
Bram Moolenaar6100d022016-10-02 16:51:57 +02006267 && (!REG_MULTI || rex.reg_firstlnum == 1))
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006268 {
6269 add_here = TRUE;
6270 add_state = t->state->out;
6271 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006272 break;
6273
6274 case NFA_EOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006275 if (rex.lnum == rex.reg_maxline && curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006276 {
6277 add_here = TRUE;
6278 add_state = t->state->out;
6279 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006280 break;
6281
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006282 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006283 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006284 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02006285 int len = 0;
6286 nfa_state_T *end;
6287 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006288 int cchars[MAX_MCO];
6289 int ccount = 0;
6290 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006291
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006292 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006293 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006294 if (utf_iscomposing(sta->c))
6295 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006296 // Only match composing character(s), ignore base
6297 // character. Used for ".{composing}" and "{composing}"
6298 // (no preceding character).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006299 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006300 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006301 if (rex.reg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006302 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006303 // If \Z was present, then ignore composing characters.
6304 // When ignoring the base character this always matches.
Bram Moolenaardff72ba2018-02-08 22:45:17 +01006305 if (sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006306 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006307 else
6308 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006309 while (sta->c != NFA_END_COMPOSING)
6310 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006311 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006312
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006313 // Check base character matches first, unless ignored.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006314 else if (len > 0 || mc == sta->c)
6315 {
6316 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006317 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006318 len += mb_char2len(mc);
6319 sta = sta->out;
6320 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006321
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006322 // We don't care about the order of composing characters.
6323 // Get them into cchars[] first.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006324 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006325 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006326 mc = mb_ptr2char(rex.input + len);
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006327 cchars[ccount++] = mc;
6328 len += mb_char2len(mc);
6329 if (ccount == MAX_MCO)
6330 break;
6331 }
6332
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006333 // Check that each composing char in the pattern matches a
6334 // composing char in the text. We do not check if all
6335 // composing chars are matched.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006336 result = OK;
6337 while (sta->c != NFA_END_COMPOSING)
6338 {
6339 for (j = 0; j < ccount; ++j)
6340 if (cchars[j] == sta->c)
6341 break;
6342 if (j == ccount)
6343 {
6344 result = FAIL;
6345 break;
6346 }
6347 sta = sta->out;
6348 }
6349 }
6350 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02006351 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006352
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006353 end = t->state->out1; // NFA_END_COMPOSING
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006354 ADD_STATE_IF_MATCH(end);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006355 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006356 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006357
6358 case NFA_NEWL:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006359 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006360 && rex.lnum <= rex.reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006361 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02006362 go_to_nextline = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006363 // Pass -1 for the offset, which means taking the position
6364 // at the start of the next line.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006365 add_state = t->state->out;
6366 add_off = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006367 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006368 else if (curc == '\n' && rex.reg_line_lbr)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006369 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006370 // match \n as if it is an ordinary character
Bram Moolenaara2d95102013-06-04 14:23:05 +02006371 add_state = t->state->out;
6372 add_off = 1;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006373 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006374 break;
6375
Bram Moolenaar417bad22013-06-07 14:08:30 +02006376 case NFA_START_COLL:
6377 case NFA_START_NEG_COLL:
6378 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006379 // What follows is a list of characters, until NFA_END_COLL.
6380 // One of them must match or none of them must match.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006381 nfa_state_T *state;
6382 int result_if_matched;
6383 int c1, c2;
6384
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006385 // Never match EOL. If it's part of the collection it is added
6386 // as a separate state with an OR.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006387 if (curc == NUL)
6388 break;
6389
6390 state = t->state->out;
6391 result_if_matched = (t->state->c == NFA_START_COLL);
6392 for (;;)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006393 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02006394 if (state->c == NFA_END_COLL)
6395 {
6396 result = !result_if_matched;
6397 break;
6398 }
6399 if (state->c == NFA_RANGE_MIN)
6400 {
6401 c1 = state->val;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006402 state = state->out; // advance to NFA_RANGE_MAX
Bram Moolenaar417bad22013-06-07 14:08:30 +02006403 c2 = state->val;
6404#ifdef ENABLE_LOG
6405 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6406 curc, c1, c2);
6407#endif
6408 if (curc >= c1 && curc <= c2)
6409 {
6410 result = result_if_matched;
6411 break;
6412 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006413 if (rex.reg_ic)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006414 {
Bram Moolenaar59de4172020-06-09 19:34:54 +02006415 int curc_low = MB_CASEFOLD(curc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02006416 int done = FALSE;
6417
6418 for ( ; c1 <= c2; ++c1)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006419 if (MB_CASEFOLD(c1) == curc_low)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006420 {
6421 result = result_if_matched;
6422 done = TRUE;
6423 break;
6424 }
6425 if (done)
6426 break;
6427 }
6428 }
6429 else if (state->c < 0 ? check_char_class(state->c, curc)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01006430 : (curc == state->c
Bram Moolenaar59de4172020-06-09 19:34:54 +02006431 || (rex.reg_ic && MB_CASEFOLD(curc)
6432 == MB_CASEFOLD(state->c))))
Bram Moolenaar417bad22013-06-07 14:08:30 +02006433 {
6434 result = result_if_matched;
6435 break;
6436 }
6437 state = state->out;
6438 }
6439 if (result)
6440 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006441 // next state is in out of the NFA_END_COLL, out1 of
6442 // START points to the END state
Bram Moolenaar417bad22013-06-07 14:08:30 +02006443 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006444 add_off = clen;
6445 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006446 break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02006447 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006448
6449 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006450 // Any char matches, except '\0' (end of input), which does not match.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006451 if (curc > 0)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006452 {
Bram Moolenaara2d95102013-06-04 14:23:05 +02006453 add_state = t->state->out;
6454 add_off = clen;
6455 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006456 break;
6457
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006458 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006459 // On a composing character skip over it. Otherwise do
6460 // nothing. Always matches.
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006461 if (enc_utf8 && utf_iscomposing(curc))
6462 {
6463 add_off = clen;
6464 }
6465 else
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006466 {
6467 add_here = TRUE;
6468 add_off = 0;
6469 }
6470 add_state = t->state->out;
6471 break;
6472
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006473 /*
6474 * Character classes like \a for alpha, \d for digit etc.
6475 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006476 case NFA_IDENT: // \i
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006477 result = vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006478 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006479 break;
6480
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006481 case NFA_SIDENT: // \I
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006482 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006483 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006484 break;
6485
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006486 case NFA_KWORD: // \k
Bram Moolenaar0270f382018-07-17 05:43:58 +02006487 result = vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006488 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006489 break;
6490
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006491 case NFA_SKWORD: // \K
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006492 result = !VIM_ISDIGIT(curc)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006493 && vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006494 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006495 break;
6496
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006497 case NFA_FNAME: // \f
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006498 result = vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006499 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006500 break;
6501
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006502 case NFA_SFNAME: // \F
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006503 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006504 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006505 break;
6506
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006507 case NFA_PRINT: // \p
Bram Moolenaar0270f382018-07-17 05:43:58 +02006508 result = vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006509 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006510 break;
6511
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006512 case NFA_SPRINT: // \P
Bram Moolenaar0270f382018-07-17 05:43:58 +02006513 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006514 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006515 break;
6516
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006517 case NFA_WHITE: // \s
Bram Moolenaar1c465442017-03-12 20:10:05 +01006518 result = VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006519 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006520 break;
6521
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006522 case NFA_NWHITE: // \S
Bram Moolenaar1c465442017-03-12 20:10:05 +01006523 result = curc != NUL && !VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006524 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006525 break;
6526
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006527 case NFA_DIGIT: // \d
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006528 result = ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006529 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006530 break;
6531
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006532 case NFA_NDIGIT: // \D
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006533 result = curc != NUL && !ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006534 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006535 break;
6536
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006537 case NFA_HEX: // \x
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006538 result = ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006539 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006540 break;
6541
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006542 case NFA_NHEX: // \X
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006543 result = curc != NUL && !ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006544 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006545 break;
6546
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006547 case NFA_OCTAL: // \o
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006548 result = ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006549 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006550 break;
6551
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006552 case NFA_NOCTAL: // \O
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006553 result = curc != NUL && !ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006554 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006555 break;
6556
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006557 case NFA_WORD: // \w
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006558 result = ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006559 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006560 break;
6561
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006562 case NFA_NWORD: // \W
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006563 result = curc != NUL && !ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006564 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006565 break;
6566
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006567 case NFA_HEAD: // \h
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006568 result = ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006569 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006570 break;
6571
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006572 case NFA_NHEAD: // \H
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006573 result = curc != NUL && !ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006574 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006575 break;
6576
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006577 case NFA_ALPHA: // \a
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006578 result = ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006579 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006580 break;
6581
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006582 case NFA_NALPHA: // \A
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006583 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006584 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006585 break;
6586
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006587 case NFA_LOWER: // \l
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006588 result = ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006589 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006590 break;
6591
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006592 case NFA_NLOWER: // \L
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006593 result = curc != NUL && !ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006594 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006595 break;
6596
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006597 case NFA_UPPER: // \u
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006598 result = ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006599 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006600 break;
6601
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006602 case NFA_NUPPER: // \U
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006603 result = curc != NUL && !ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006604 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006605 break;
6606
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006607 case NFA_LOWER_IC: // [a-z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006608 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006609 ADD_STATE_IF_MATCH(t->state);
6610 break;
6611
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006612 case NFA_NLOWER_IC: // [^a-z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006613 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006614 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006615 ADD_STATE_IF_MATCH(t->state);
6616 break;
6617
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006618 case NFA_UPPER_IC: // [A-Z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006619 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006620 ADD_STATE_IF_MATCH(t->state);
6621 break;
6622
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006623 case NFA_NUPPER_IC: // ^[A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006624 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006625 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006626 ADD_STATE_IF_MATCH(t->state);
6627 break;
6628
Bram Moolenaar5714b802013-05-28 22:03:20 +02006629 case NFA_BACKREF1:
6630 case NFA_BACKREF2:
6631 case NFA_BACKREF3:
6632 case NFA_BACKREF4:
6633 case NFA_BACKREF5:
6634 case NFA_BACKREF6:
6635 case NFA_BACKREF7:
6636 case NFA_BACKREF8:
6637 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006638#ifdef FEAT_SYN_HL
6639 case NFA_ZREF1:
6640 case NFA_ZREF2:
6641 case NFA_ZREF3:
6642 case NFA_ZREF4:
6643 case NFA_ZREF5:
6644 case NFA_ZREF6:
6645 case NFA_ZREF7:
6646 case NFA_ZREF8:
6647 case NFA_ZREF9:
6648#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006649 // \1 .. \9 \z1 .. \z9
Bram Moolenaar5714b802013-05-28 22:03:20 +02006650 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006651 int subidx;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006652 int bytelen;
6653
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006654 if (t->state->c <= NFA_BACKREF9)
6655 {
6656 subidx = t->state->c - NFA_BACKREF1 + 1;
6657 result = match_backref(&t->subs.norm, subidx, &bytelen);
6658 }
6659#ifdef FEAT_SYN_HL
6660 else
6661 {
6662 subidx = t->state->c - NFA_ZREF1 + 1;
6663 result = match_zref(subidx, &bytelen);
6664 }
6665#endif
6666
Bram Moolenaar5714b802013-05-28 22:03:20 +02006667 if (result)
6668 {
6669 if (bytelen == 0)
6670 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006671 // empty match always works, output of NFA_SKIP to be
6672 // used next
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006673 add_here = TRUE;
6674 add_state = t->state->out->out;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006675 }
6676 else if (bytelen <= clen)
6677 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006678 // match current character, jump ahead to out of
6679 // NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006680 add_state = t->state->out->out;
6681 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006682 }
6683 else
6684 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006685 // skip over the matched characters, set character
6686 // count in NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006687 add_state = t->state->out;
6688 add_off = bytelen;
6689 add_count = bytelen - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006690 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02006691 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02006692 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006693 }
6694 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006695 // character of previous matching \1 .. \9 or \@>
Bram Moolenaar5714b802013-05-28 22:03:20 +02006696 if (t->count - clen <= 0)
6697 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006698 // end of match, go to what follows
Bram Moolenaara2d95102013-06-04 14:23:05 +02006699 add_state = t->state->out;
6700 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006701 }
6702 else
6703 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006704 // add state again with decremented count
Bram Moolenaara2d95102013-06-04 14:23:05 +02006705 add_state = t->state;
6706 add_off = 0;
6707 add_count = t->count - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006708 }
6709 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006710
Bram Moolenaar423532e2013-05-29 21:14:42 +02006711 case NFA_LNUM:
6712 case NFA_LNUM_GT:
6713 case NFA_LNUM_LT:
6714 result = (REG_MULTI &&
6715 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006716 (long_u)(rex.lnum + rex.reg_firstlnum)));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006717 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006718 {
6719 add_here = TRUE;
6720 add_state = t->state->out;
6721 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006722 break;
6723
6724 case NFA_COL:
6725 case NFA_COL_GT:
6726 case NFA_COL_LT:
6727 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006728 (long_u)(rex.input - rex.line) + 1);
Bram Moolenaar423532e2013-05-29 21:14:42 +02006729 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006730 {
6731 add_here = TRUE;
6732 add_state = t->state->out;
6733 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006734 break;
6735
6736 case NFA_VCOL:
6737 case NFA_VCOL_GT:
6738 case NFA_VCOL_LT:
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006739 {
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006740 int op = t->state->c - NFA_VCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006741 colnr_T col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar6100d022016-10-02 16:51:57 +02006742 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006743
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006744 // Bail out quickly when there can't be a match, avoid the
6745 // overhead of win_linetabsize() on long lines.
Bram Moolenaar4f36dc32015-03-05 17:16:06 +01006746 if (op != 1 && col > t->state->val
Bram Moolenaara12a1612019-01-24 16:39:02 +01006747 * (has_mbyte ? MB_MAXBYTES : 1))
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006748 break;
Bram Moolenaaref795d12015-01-18 16:46:32 +01006749 result = FALSE;
6750 if (op == 1 && col - 1 > t->state->val && col > 100)
6751 {
6752 int ts = wp->w_buffer->b_p_ts;
6753
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006754 // Guess that a character won't use more columns than
6755 // 'tabstop', with a minimum of 4.
Bram Moolenaaref795d12015-01-18 16:46:32 +01006756 if (ts < 4)
6757 ts = 4;
6758 result = col > t->state->val * ts;
6759 }
6760 if (!result)
6761 result = nfa_re_num_cmp(t->state->val, op,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006762 (long_u)win_linetabsize(wp, rex.line, col) + 1);
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006763 if (result)
6764 {
6765 add_here = TRUE;
6766 add_state = t->state->out;
6767 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006768 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006769 break;
6770
Bram Moolenaar044aa292013-06-04 21:27:38 +02006771 case NFA_MARK:
6772 case NFA_MARK_GT:
6773 case NFA_MARK_LT:
6774 {
Bram Moolenaarb4ad3b02022-03-30 10:57:45 +01006775 pos_T *pos;
6776 size_t col = REG_MULTI ? rex.input - rex.line : 0;
6777
6778 pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
Bram Moolenaar044aa292013-06-04 21:27:38 +02006779
Bram Moolenaar64066b92021-11-17 18:22:56 +00006780 // Line may have been freed, get it again.
6781 if (REG_MULTI)
6782 {
6783 rex.line = reg_getline(rex.lnum);
6784 rex.input = rex.line + col;
6785 }
6786
Bram Moolenaar872bee52021-05-24 22:56:15 +02006787 // Compare the mark position to the match position, if the mark
6788 // exists and mark is set in reg_buf.
6789 if (pos != NULL && pos->lnum > 0)
6790 {
6791 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6792 && pos->col == MAXCOL
6793 ? (colnr_T)STRLEN(reg_getline(
6794 pos->lnum - rex.reg_firstlnum))
6795 : pos->col;
6796
6797 result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6798 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006799 ? t->state->c == NFA_MARK
Bram Moolenaar872bee52021-05-24 22:56:15 +02006800 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006801 ? t->state->c == NFA_MARK_GT
6802 : t->state->c == NFA_MARK_LT))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006803 : (pos->lnum < rex.lnum + rex.reg_firstlnum
Bram Moolenaar044aa292013-06-04 21:27:38 +02006804 ? t->state->c == NFA_MARK_GT
Bram Moolenaar872bee52021-05-24 22:56:15 +02006805 : t->state->c == NFA_MARK_LT));
6806 if (result)
6807 {
6808 add_here = TRUE;
6809 add_state = t->state->out;
6810 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006811 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02006812 break;
6813 }
6814
Bram Moolenaar423532e2013-05-29 21:14:42 +02006815 case NFA_CURSOR:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006816 result = (rex.reg_win != NULL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006817 && (rex.lnum + rex.reg_firstlnum
Bram Moolenaar6100d022016-10-02 16:51:57 +02006818 == rex.reg_win->w_cursor.lnum)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006819 && ((colnr_T)(rex.input - rex.line)
Bram Moolenaar6100d022016-10-02 16:51:57 +02006820 == rex.reg_win->w_cursor.col));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006821 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006822 {
6823 add_here = TRUE;
6824 add_state = t->state->out;
6825 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006826 break;
6827
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006828 case NFA_VISUAL:
6829 result = reg_match_visual();
6830 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006831 {
6832 add_here = TRUE;
6833 add_state = t->state->out;
6834 }
Bram Moolenaar973fced2013-06-05 21:10:59 +02006835 break;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006836
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006837 case NFA_MOPEN1:
6838 case NFA_MOPEN2:
6839 case NFA_MOPEN3:
6840 case NFA_MOPEN4:
6841 case NFA_MOPEN5:
6842 case NFA_MOPEN6:
6843 case NFA_MOPEN7:
6844 case NFA_MOPEN8:
6845 case NFA_MOPEN9:
6846#ifdef FEAT_SYN_HL
6847 case NFA_ZOPEN:
6848 case NFA_ZOPEN1:
6849 case NFA_ZOPEN2:
6850 case NFA_ZOPEN3:
6851 case NFA_ZOPEN4:
6852 case NFA_ZOPEN5:
6853 case NFA_ZOPEN6:
6854 case NFA_ZOPEN7:
6855 case NFA_ZOPEN8:
6856 case NFA_ZOPEN9:
6857#endif
6858 case NFA_NOPEN:
6859 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006860 // These states are only added to be able to bail out when
6861 // they are added again, nothing is to be done.
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006862 break;
6863
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006864 default: // regular character
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006865 {
6866 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006867
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006868#ifdef DEBUG
Bram Moolenaardecd9542013-06-07 16:31:50 +02006869 if (c < 0)
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00006870 siemsg("INTERNAL: Negative state char: %ld", (long)c);
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006871#endif
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006872 result = (c == curc);
6873
Bram Moolenaar6100d022016-10-02 16:51:57 +02006874 if (!result && rex.reg_ic)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006875 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006876 // If rex.reg_icombine is not set only skip over the character
6877 // itself. When it is set skip over composing characters.
Bram Moolenaar6100d022016-10-02 16:51:57 +02006878 if (result && enc_utf8 && !rex.reg_icombine)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006879 clen = utf_ptr2len(rex.input);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006880 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006881 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006882 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006883
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006884 } // switch (t->state->c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006885
6886 if (add_state != NULL)
6887 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006888 nfa_pim_T *pim;
Bram Moolenaara951e352013-10-06 15:46:11 +02006889 nfa_pim_T pim_copy;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006890
6891 if (t->pim.result == NFA_PIM_UNUSED)
6892 pim = NULL;
6893 else
6894 pim = &t->pim;
6895
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006896 // Handle the postponed invisible match if the match might end
6897 // without advancing and before the end of the line.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006898 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006899 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006900 if (pim->result == NFA_PIM_TODO)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006901 {
6902#ifdef ENABLE_LOG
6903 fprintf(log_fd, "\n");
6904 fprintf(log_fd, "==================================\n");
6905 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6906 fprintf(log_fd, "\n");
6907#endif
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006908 result = recursive_regmatch(pim->state, pim,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006909 prog, submatch, m, &listids, &listids_len);
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006910 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006911 // for \@! and \@<! it is a match when the result is
6912 // FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006913 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006914 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6915 || pim->state->c
6916 == NFA_START_INVISIBLE_BEFORE_NEG
6917 || pim->state->c
6918 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006919 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006920 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006921 copy_sub_off(&pim->subs.norm, &m->norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006922#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006923 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006924 copy_sub_off(&pim->subs.synt, &m->synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006925#endif
6926 }
6927 }
6928 else
6929 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006930 result = (pim->result == NFA_PIM_MATCH);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006931#ifdef ENABLE_LOG
6932 fprintf(log_fd, "\n");
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006933 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006934 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6935 fprintf(log_fd, "\n");
6936#endif
6937 }
6938
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006939 // for \@! and \@<! it is a match when result is FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006940 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006941 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6942 || pim->state->c
6943 == NFA_START_INVISIBLE_BEFORE_NEG
6944 || pim->state->c
6945 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006946 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006947 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006948 copy_sub_off(&t->subs.norm, &pim->subs.norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006949#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006950 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006951 copy_sub_off(&t->subs.synt, &pim->subs.synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006952#endif
6953 }
6954 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006955 // look-behind match failed, don't add the state
Bram Moolenaara2d95102013-06-04 14:23:05 +02006956 continue;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006957
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006958 // Postponed invisible match was handled, don't add it to
6959 // following states.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006960 pim = NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006961 }
6962
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006963 // If "pim" points into l->t it will become invalid when
6964 // adding the state causes the list to be reallocated. Make a
6965 // local copy to avoid that.
Bram Moolenaara951e352013-10-06 15:46:11 +02006966 if (pim == &t->pim)
6967 {
6968 copy_pim(&pim_copy, pim);
6969 pim = &pim_copy;
6970 }
6971
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006972 if (add_here)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006973 r = addstate_here(thislist, add_state, &t->subs,
6974 pim, &listidx);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006975 else
6976 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006977 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006978 if (add_count > 0)
6979 nextlist->t[nextlist->n - 1].count = add_count;
6980 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006981 if (r == NULL)
6982 {
6983 nfa_match = NFA_TOO_EXPENSIVE;
6984 goto theend;
6985 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006986 }
6987
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006988 } // for (thislist = thislist; thislist->state; thislist++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006989
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006990 // Look for the start of a match in the current position by adding the
6991 // start state to the list of states.
6992 // The first found match is the leftmost one, thus the order of states
6993 // matters!
6994 // Do not add the start state in recursive calls of nfa_regmatch(),
6995 // because recursive calls should only start in the first position.
6996 // Unless "nfa_endp" is not NULL, then we match the end position.
6997 // Also don't start a match past the first line.
Bram Moolenaar61602c52013-06-01 19:54:43 +02006998 if (nfa_match == FALSE
Bram Moolenaarf96d1092013-06-07 22:39:40 +02006999 && ((toplevel
Bram Moolenaar0270f382018-07-17 05:43:58 +02007000 && rex.lnum == 0
Bram Moolenaar61602c52013-06-01 19:54:43 +02007001 && clen != 0
Bram Moolenaar6100d022016-10-02 16:51:57 +02007002 && (rex.reg_maxcol == 0
Bram Moolenaar0270f382018-07-17 05:43:58 +02007003 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
Bram Moolenaar307aa162013-06-02 16:34:21 +02007004 || (nfa_endp != NULL
Bram Moolenaar61602c52013-06-01 19:54:43 +02007005 && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007006 ? (rex.lnum < nfa_endp->se_u.pos.lnum
7007 || (rex.lnum == nfa_endp->se_u.pos.lnum
7008 && (int)(rex.input - rex.line)
Bram Moolenaar307aa162013-06-02 16:34:21 +02007009 < nfa_endp->se_u.pos.col))
Bram Moolenaar0270f382018-07-17 05:43:58 +02007010 : rex.input < nfa_endp->se_u.ptr))))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007011 {
7012#ifdef ENABLE_LOG
7013 fprintf(log_fd, "(---) STARTSTATE\n");
7014#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007015 // Inline optimized code for addstate() if we know the state is
7016 // the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007017 if (toplevel)
7018 {
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007019 int add = TRUE;
7020 int c;
7021
7022 if (prog->regstart != NUL && clen != 0)
7023 {
7024 if (nextlist->n == 0)
7025 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007026 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007027
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007028 // Nextlist is empty, we can skip ahead to the
7029 // character that must appear at the start.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007030 if (skip_to_start(prog->regstart, &col) == FAIL)
7031 break;
7032#ifdef ENABLE_LOG
7033 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02007034 col - ((colnr_T)(rex.input - rex.line) + clen));
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007035#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007036 rex.input = rex.line + col - clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007037 }
7038 else
7039 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007040 // Checking if the required start character matches is
7041 // cheaper than adding a state that won't match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02007042 c = PTR2CHAR(rex.input + clen);
Bram Moolenaar6100d022016-10-02 16:51:57 +02007043 if (c != prog->regstart && (!rex.reg_ic
Bram Moolenaar59de4172020-06-09 19:34:54 +02007044 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007045 {
7046#ifdef ENABLE_LOG
7047 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7048#endif
7049 add = FALSE;
7050 }
7051 }
7052 }
7053
7054 if (add)
7055 {
7056 if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007057 m->norm.list.multi[0].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02007058 (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007059 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007060 m->norm.list.line[0].start = rex.input + clen;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007061 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7062 {
7063 nfa_match = NFA_TOO_EXPENSIVE;
7064 goto theend;
7065 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007066 }
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007067 }
7068 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007069 {
7070 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7071 {
7072 nfa_match = NFA_TOO_EXPENSIVE;
7073 goto theend;
7074 }
7075 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007076 }
7077
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007078#ifdef ENABLE_LOG
7079 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007080 {
7081 int i;
7082
7083 for (i = 0; i < thislist->n; i++)
7084 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7085 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007086 fprintf(log_fd, "\n");
7087#endif
7088
7089nextchar:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007090 // Advance to the next character, or advance to the next line, or
7091 // finish.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007092 if (clen != 0)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007093 rex.input += clen;
Bram Moolenaar307aa162013-06-02 16:34:21 +02007094 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007095 && rex.lnum < nfa_endp->se_u.pos.lnum))
Bram Moolenaar35b23862013-05-22 23:00:40 +02007096 reg_nextline();
7097 else
7098 break;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007099
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007100 // Allow interrupting with CTRL-C.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007101 line_breakcheck();
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007102 if (got_int)
7103 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007104#ifdef FEAT_RELTIME
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007105 // Check for timeout once in a twenty times to avoid overhead.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007106 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
7107 {
7108 nfa_time_count = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007109 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007110 break;
7111 }
7112#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02007113 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007114
7115#ifdef ENABLE_LOG
7116 if (log_fd != stderr)
7117 fclose(log_fd);
7118 log_fd = NULL;
7119#endif
7120
7121theend:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007122 // Free memory
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007123 vim_free(list[0].t);
7124 vim_free(list[1].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02007125 vim_free(listids);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007126#undef ADD_STATE_IF_MATCH
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02007127#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007128 fclose(debug);
7129#endif
7130
Bram Moolenaar963fee22013-05-26 21:47:28 +02007131 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007132}
7133
7134/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02007135 * Try to match "prog" at rex.line["col"].
Bram Moolenaar8c731502014-11-23 15:57:49 +01007136 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007137 */
7138 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007139nfa_regtry(
7140 nfa_regprog_T *prog,
7141 colnr_T col,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007142 proftime_T *tm UNUSED, // timeout limit or NULL
7143 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007144{
7145 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007146 regsubs_T subs, m;
7147 nfa_state_T *start = prog->start;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007148 int result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007149#ifdef ENABLE_LOG
7150 FILE *f;
7151#endif
7152
Bram Moolenaar0270f382018-07-17 05:43:58 +02007153 rex.input = rex.line + col;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007154#ifdef FEAT_RELTIME
7155 nfa_time_limit = tm;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007156 nfa_timed_out = timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007157 nfa_time_count = 0;
7158#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007159
7160#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007161 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007162 if (f != NULL)
7163 {
Bram Moolenaar87953742013-06-05 18:52:40 +02007164 fprintf(f, "\n\n\t=======================================================\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007165#ifdef DEBUG
7166 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
7167#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007168 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02007169 fprintf(f, "\t=======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02007170 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007171 fprintf(f, "\n\n");
7172 fclose(f);
7173 }
7174 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01007175 emsg("Could not open temporary log file for writing");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007176#endif
7177
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007178 clear_sub(&subs.norm);
7179 clear_sub(&m.norm);
7180#ifdef FEAT_SYN_HL
7181 clear_sub(&subs.synt);
7182 clear_sub(&m.synt);
7183#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007184
Bram Moolenaarfda37292014-11-05 14:27:36 +01007185 result = nfa_regmatch(prog, start, &subs, &m);
7186 if (result == FALSE)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007187 return 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007188 else if (result == NFA_TOO_EXPENSIVE)
7189 return result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007190
7191 cleanup_subexpr();
7192 if (REG_MULTI)
7193 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007194 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007195 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007196 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7197 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007198
Bram Moolenaar6100d022016-10-02 16:51:57 +02007199 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7200 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007201 }
7202
Bram Moolenaar6100d022016-10-02 16:51:57 +02007203 if (rex.reg_startpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007204 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007205 rex.reg_startpos[0].lnum = 0;
7206 rex.reg_startpos[0].col = col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007207 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02007208 if (rex.reg_endpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007209 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007210 // pattern has a \ze but it didn't match, use current end
Bram Moolenaar0270f382018-07-17 05:43:58 +02007211 rex.reg_endpos[0].lnum = rex.lnum;
7212 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007213 }
7214 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007215 // Use line number of "\ze".
Bram Moolenaar0270f382018-07-17 05:43:58 +02007216 rex.lnum = rex.reg_endpos[0].lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007217 }
7218 else
7219 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007220 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007221 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007222 rex.reg_startp[i] = subs.norm.list.line[i].start;
7223 rex.reg_endp[i] = subs.norm.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007224 }
7225
Bram Moolenaar6100d022016-10-02 16:51:57 +02007226 if (rex.reg_startp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007227 rex.reg_startp[0] = rex.line + col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007228 if (rex.reg_endp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007229 rex.reg_endp[0] = rex.input;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007230 }
7231
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007232#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007233 // Package any found \z(...\) matches for export. Default is none.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007234 unref_extmatch(re_extmatch_out);
7235 re_extmatch_out = NULL;
7236
7237 if (prog->reghasz == REX_SET)
7238 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007239 cleanup_zsubexpr();
7240 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01007241 if (re_extmatch_out == NULL)
7242 return 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007243 // Loop over \z1, \z2, etc. There is no \z0.
Bram Moolenaar5ad075c2015-11-24 15:18:32 +01007244 for (i = 1; i < subs.synt.in_use; i++)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007245 {
7246 if (REG_MULTI)
7247 {
7248 struct multipos *mpos = &subs.synt.list.multi[i];
7249
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007250 // Only accept single line matches that are valid.
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007251 if (mpos->start_lnum >= 0
7252 && mpos->start_lnum == mpos->end_lnum
7253 && mpos->end_col >= mpos->start_col)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007254 re_extmatch_out->matches[i] =
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007255 vim_strnsave(reg_getline(mpos->start_lnum)
7256 + mpos->start_col,
7257 mpos->end_col - mpos->start_col);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007258 }
7259 else
7260 {
7261 struct linepos *lpos = &subs.synt.list.line[i];
7262
7263 if (lpos->start != NULL && lpos->end != NULL)
7264 re_extmatch_out->matches[i] =
Bram Moolenaar71ccd032020-06-12 22:59:11 +02007265 vim_strnsave(lpos->start, lpos->end - lpos->start);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007266 }
7267 }
7268 }
7269#endif
7270
Bram Moolenaar0270f382018-07-17 05:43:58 +02007271 return 1 + rex.lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007272}
7273
7274/*
7275 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02007276 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007277 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007278 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007279 */
7280 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007281nfa_regexec_both(
7282 char_u *line,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007283 colnr_T startcol, // column to start looking for match
7284 proftime_T *tm, // timeout limit or NULL
7285 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007286{
7287 nfa_regprog_T *prog;
7288 long retval = 0L;
7289 int i;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007290 colnr_T col = startcol;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007291
7292 if (REG_MULTI)
7293 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007294 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007295 line = reg_getline((linenr_T)0); // relative to the cursor
Bram Moolenaar6100d022016-10-02 16:51:57 +02007296 rex.reg_startpos = rex.reg_mmatch->startpos;
7297 rex.reg_endpos = rex.reg_mmatch->endpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007298 }
7299 else
7300 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007301 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7302 rex.reg_startp = rex.reg_match->startp;
7303 rex.reg_endp = rex.reg_match->endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007304 }
7305
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007306 // Be paranoid...
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007307 if (prog == NULL || line == NULL)
7308 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02007309 iemsg(_(e_null_argument));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007310 goto theend;
7311 }
7312
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007313 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007314 if (prog->regflags & RF_ICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007315 rex.reg_ic = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007316 else if (prog->regflags & RF_NOICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007317 rex.reg_ic = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007318
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007319 // If pattern contains "\Z" overrule value of rex.reg_icombine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007320 if (prog->regflags & RF_ICOMBINE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007321 rex.reg_icombine = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007322
Bram Moolenaar0270f382018-07-17 05:43:58 +02007323 rex.line = line;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007324 rex.lnum = 0; // relative to line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007325
Bram Moolenaar0270f382018-07-17 05:43:58 +02007326 rex.nfa_has_zend = prog->has_zend;
7327 rex.nfa_has_backref = prog->has_backref;
7328 rex.nfa_nsubexpr = prog->nsubexp;
7329 rex.nfa_listid = 1;
7330 rex.nfa_alt_listid = 2;
7331#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007332 nfa_regengine.expr = prog->pattern;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007333#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007334
Bram Moolenaard89616e2013-06-06 18:46:06 +02007335 if (prog->reganch && col > 0)
7336 return 0L;
7337
Bram Moolenaar0270f382018-07-17 05:43:58 +02007338 rex.need_clear_subexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007339#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007340 // Clear the external match subpointers if necessary.
Bram Moolenaar473de612013-06-08 18:19:48 +02007341 if (prog->reghasz == REX_SET)
7342 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007343 rex.nfa_has_zsubexpr = TRUE;
7344 rex.need_clear_zsubexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007345 }
7346 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007347 {
7348 rex.nfa_has_zsubexpr = FALSE;
7349 rex.need_clear_zsubexpr = FALSE;
7350 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007351#endif
7352
Bram Moolenaard89616e2013-06-06 18:46:06 +02007353 if (prog->regstart != NUL)
Bram Moolenaar473de612013-06-08 18:19:48 +02007354 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007355 // Skip ahead until a character we know the match must start with.
7356 // When there is none there is no match.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007357 if (skip_to_start(prog->regstart, &col) == FAIL)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007358 return 0L;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007359
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007360 // If match_text is set it contains the full text that must match.
7361 // Nothing else to try. Doesn't handle combining chars well.
Bram Moolenaara12a1612019-01-24 16:39:02 +01007362 if (prog->match_text != NULL && !rex.reg_icombine)
Bram Moolenaar473de612013-06-08 18:19:48 +02007363 return find_match_text(col, prog->regstart, prog->match_text);
7364 }
7365
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007366 // If the start column is past the maximum column: no need to try.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007367 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007368 goto theend;
7369
Bram Moolenaar0270f382018-07-17 05:43:58 +02007370 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7371 // it's accidentally used during execution.
7372 nstate = 0;
7373 for (i = 0; i < prog->nstate; ++i)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007374 {
7375 prog->state[i].id = i;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02007376 prog->state[i].lastlist[0] = 0;
7377 prog->state[i].lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007378 }
7379
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007380 retval = nfa_regtry(prog, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007381
Bram Moolenaar0270f382018-07-17 05:43:58 +02007382#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007383 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007384#endif
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007385
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007386theend:
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007387 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007388 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007389 // Make sure the end is never before the start. Can happen when \zs and
7390 // \ze are used.
7391 if (REG_MULTI)
7392 {
7393 lpos_T *start = &rex.reg_mmatch->startpos[0];
7394 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007395
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007396 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007397 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007398 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7399 }
7400 else
7401 {
7402 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7403 rex.reg_match->endp[0] = rex.reg_match->startp[0];
7404 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007405 }
7406
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007407 return retval;
7408}
7409
7410/*
7411 * Compile a regular expression into internal code for the NFA matcher.
7412 * Returns the program in allocated space. Returns NULL for an error.
7413 */
7414 static regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01007415nfa_regcomp(char_u *expr, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007416{
Bram Moolenaaraae48832013-05-25 21:18:34 +02007417 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02007418 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007419 int *postfix;
7420
7421 if (expr == NULL)
7422 return NULL;
7423
Bram Moolenaar0270f382018-07-17 05:43:58 +02007424#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007425 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007426#endif
Bram Moolenaare0ad3652015-01-27 12:59:55 +01007427 nfa_re_flags = re_flags;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007428
7429 init_class_tab();
7430
7431 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7432 return NULL;
7433
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007434 // Build postfix form of the regexp. Needed to build the NFA
7435 // (and count its size).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007436 postfix = re2post();
7437 if (postfix == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007438 goto fail; // Cascaded (syntax?) error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007439
7440 /*
7441 * In order to build the NFA, we parse the input regexp twice:
7442 * 1. first pass to count size (so we can allocate space)
7443 * 2. second to emit code
7444 */
7445#ifdef ENABLE_LOG
7446 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007447 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007448
7449 if (f != NULL)
7450 {
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02007451 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007452 fclose(f);
7453 }
7454 }
7455#endif
7456
7457 /*
7458 * PASS 1
7459 * Count number of NFA states in "nstate". Do not build the NFA.
7460 */
7461 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007462
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007463 // allocate the regprog with space for the compiled regexp
Bram Moolenaar16619a22013-06-11 18:42:36 +02007464 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02007465 prog = alloc(prog_size);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007466 if (prog == NULL)
7467 goto fail;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007468 state_ptr = prog->state;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007469 prog->re_in_use = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007470
7471 /*
7472 * PASS 2
7473 * Build the NFA
7474 */
7475 prog->start = post2nfa(postfix, post_ptr, FALSE);
7476 if (prog->start == NULL)
7477 goto fail;
7478
7479 prog->regflags = regflags;
7480 prog->engine = &nfa_regengine;
7481 prog->nstate = nstate;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007482 prog->has_zend = rex.nfa_has_zend;
7483 prog->has_backref = rex.nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02007484 prog->nsubexp = regnpar;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007485
Bram Moolenaara2947e22013-06-11 22:44:09 +02007486 nfa_postprocess(prog);
7487
Bram Moolenaard89616e2013-06-06 18:46:06 +02007488 prog->reganch = nfa_get_reganch(prog->start, 0);
7489 prog->regstart = nfa_get_regstart(prog->start, 0);
Bram Moolenaar473de612013-06-08 18:19:48 +02007490 prog->match_text = nfa_get_match_text(prog->start);
7491
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007492#ifdef ENABLE_LOG
7493 nfa_postfix_dump(expr, OK);
7494 nfa_dump(prog);
7495#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007496#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007497 // Remember whether this pattern has any \z specials in it.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007498 prog->reghasz = re_has_z;
7499#endif
Bram Moolenaar473de612013-06-08 18:19:48 +02007500 prog->pattern = vim_strsave(expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02007501#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007502 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007503#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007504
7505out:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007506 VIM_CLEAR(post_start);
7507 post_ptr = post_end = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007508 state_ptr = NULL;
7509 return (regprog_T *)prog;
7510
7511fail:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007512 VIM_CLEAR(prog);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007513#ifdef ENABLE_LOG
7514 nfa_postfix_dump(expr, FAIL);
7515#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007516#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007517 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007518#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007519 goto out;
7520}
7521
Bram Moolenaar473de612013-06-08 18:19:48 +02007522/*
7523 * Free a compiled regexp program, returned by nfa_regcomp().
7524 */
7525 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01007526nfa_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02007527{
7528 if (prog != NULL)
7529 {
7530 vim_free(((nfa_regprog_T *)prog)->match_text);
Bram Moolenaar473de612013-06-08 18:19:48 +02007531 vim_free(((nfa_regprog_T *)prog)->pattern);
Bram Moolenaar473de612013-06-08 18:19:48 +02007532 vim_free(prog);
7533 }
7534}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007535
7536/*
7537 * Match a regexp against a string.
7538 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7539 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaar2af78a12014-04-23 19:06:37 +02007540 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007541 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007542 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007543 */
7544 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01007545nfa_regexec_nl(
7546 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007547 char_u *line, // string to match against
7548 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01007549 int line_lbr)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007550{
Bram Moolenaar6100d022016-10-02 16:51:57 +02007551 rex.reg_match = rmp;
7552 rex.reg_mmatch = NULL;
7553 rex.reg_maxline = 0;
7554 rex.reg_line_lbr = line_lbr;
7555 rex.reg_buf = curbuf;
7556 rex.reg_win = NULL;
7557 rex.reg_ic = rmp->rm_ic;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007558 rex.reg_icombine = FALSE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007559 rex.reg_maxcol = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007560 return nfa_regexec_both(line, col, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007561}
7562
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007563
7564/*
7565 * Match a regexp against multiple lines.
7566 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7567 * Uses curbuf for line count and 'iskeyword'.
7568 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007569 * Return <= 0 if there is no match. Return number of lines contained in the
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007570 * match otherwise.
7571 *
7572 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7573 *
7574 * ! Also NOTE : match may actually be in another line. e.g.:
7575 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7576 *
7577 * +-------------------------+
7578 * |a |
7579 * |b |
7580 * |c |
7581 * | |
7582 * +-------------------------+
7583 *
7584 * then nfa_regexec_multi() returns 3. while the original
7585 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7586 *
7587 * FIXME if this behavior is not compatible.
7588 */
7589 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007590nfa_regexec_multi(
7591 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007592 win_T *win, // window in which to search or NULL
7593 buf_T *buf, // buffer in which to search
7594 linenr_T lnum, // nr of line to start looking for match
7595 colnr_T col, // column to start looking for match
7596 proftime_T *tm, // timeout limit or NULL
7597 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007598{
Bram Moolenaarf4140482020-02-15 23:06:45 +01007599 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007600 return nfa_regexec_both(NULL, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007601}
7602
7603#ifdef DEBUG
7604# undef ENABLE_LOG
7605#endif