/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * NFA regular expression implementation.
 *
 * This file is included in "regexp.c".
 */

/*
 * Logging of NFA engine.
 *
 * The NFA engine can write four log files:
 * - Error log: Contains NFA engine's fatal errors.
 * - Dump log: Contains compiled NFA state machine's information.
 * - Run log: Contains information of matching procedure.
 * - Debug log: Contains detailed information of matching procedure. Can be
 *   disabled by undefining NFA_REGEXP_DEBUG_LOG.
 * The first one can also be used without debug mode.
 * The last three are enabled when compiled as debug mode and individually
 * disabled by commenting them out.
 * The log files can get quite big!
 * To disable all of this when compiling Vim for debugging, undefine DEBUG in
 * regexp.c
 *
 * Note: as written below, all of these macros (including the error log name
 * and ENABLE_LOG) are only defined when DEBUG is defined.
 */
#ifdef DEBUG
# define NFA_REGEXP_ERROR_LOG	"nfa_regexp_error.log"
# define ENABLE_LOG
# define NFA_REGEXP_DUMP_LOG	"nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG	"nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG	"nfa_regexp_debug.log"
#endif

// Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
// There are 31 codes in that range, so the shifted range
// NFA_FIRST_NL .. NFA_LAST_NL starts right after NFA_NUPPER_IC and does not
// overlap the unshifted codes.
#define NFA_ADD_NL		31

/*
 * Item codes used in the postfix form of the pattern and in the NFA states.
 * The codes start at -1024 and count upwards; every code is negative, which
 * lets the code below treat a value above zero as a literal character.
 * Items marked "postfix only" do not appear in the NFA itself.
 */
enum
{
    NFA_SPLIT = -1024,
    NFA_MATCH,
    NFA_EMPTY,                      // matches 0-length

    NFA_START_COLL,                 // [abc] start
    NFA_END_COLL,                   // [abc] end
    NFA_START_NEG_COLL,             // [^abc] start
    NFA_END_NEG_COLL,               // [^abc] end (postfix only)
    NFA_RANGE,                      // range of the two previous items
                                    // (postfix only)
    NFA_RANGE_MIN,                  // low end of a range
    NFA_RANGE_MAX,                  // high end of a range

    NFA_CONCAT,                     // concatenate two previous items (postfix
                                    // only)
    NFA_OR,                         // \| (postfix only)
    NFA_STAR,                       // greedy * (postfix only)
    NFA_STAR_NONGREEDY,             // non-greedy * (postfix only)
    NFA_QUEST,                      // greedy \? (postfix only)
    NFA_QUEST_NONGREEDY,            // non-greedy \? (postfix only)

    NFA_BOL,                        // ^    Begin line
    NFA_EOL,                        // $    End line
    NFA_BOW,                        // \<   Begin word
    NFA_EOW,                        // \>   End word
    NFA_BOF,                        // \%^  Begin file
    NFA_EOF,                        // \%$  End file
    NFA_NEWL,
    NFA_ZSTART,                     // Used for \zs
    NFA_ZEND,                       // Used for \ze
    NFA_NOPEN,                      // Start of subexpression marked with \%(
    NFA_NCLOSE,                     // End of subexpr. marked with \%( ... \)
    NFA_START_INVISIBLE,
    NFA_START_INVISIBLE_FIRST,
    NFA_START_INVISIBLE_NEG,
    NFA_START_INVISIBLE_NEG_FIRST,
    NFA_START_INVISIBLE_BEFORE,
    NFA_START_INVISIBLE_BEFORE_FIRST,
    NFA_START_INVISIBLE_BEFORE_NEG,
    NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
    NFA_START_PATTERN,
    NFA_END_INVISIBLE,
    NFA_END_INVISIBLE_NEG,
    NFA_END_PATTERN,
    NFA_COMPOSING,                  // Next nodes in NFA are part of the
                                    // composing multibyte char
    NFA_END_COMPOSING,              // End of a composing char in the NFA
    NFA_ANY_COMPOSING,              // \%C: Any composing characters.
    NFA_OPT_CHARS,                  // \%[abc]

    // The following are used only in the postfix form, not in the NFA
    NFA_PREV_ATOM_NO_WIDTH,         // Used for \@=
    NFA_PREV_ATOM_NO_WIDTH_NEG,     // Used for \@!
    NFA_PREV_ATOM_JUST_BEFORE,      // Used for \@<=
    NFA_PREV_ATOM_JUST_BEFORE_NEG,  // Used for \@<!
    NFA_PREV_ATOM_LIKE_PATTERN,     // Used for \@>

    NFA_BACKREF1,                   // \1
    NFA_BACKREF2,                   // \2
    NFA_BACKREF3,                   // \3
    NFA_BACKREF4,                   // \4
    NFA_BACKREF5,                   // \5
    NFA_BACKREF6,                   // \6
    NFA_BACKREF7,                   // \7
    NFA_BACKREF8,                   // \8
    NFA_BACKREF9,                   // \9
#ifdef FEAT_SYN_HL
    NFA_ZREF1,                      // \z1
    NFA_ZREF2,                      // \z2
    NFA_ZREF3,                      // \z3
    NFA_ZREF4,                      // \z4
    NFA_ZREF5,                      // \z5
    NFA_ZREF6,                      // \z6
    NFA_ZREF7,                      // \z7
    NFA_ZREF8,                      // \z8
    NFA_ZREF9,                      // \z9
#endif
    NFA_SKIP,                       // Skip characters

    // NFA_MOPEN + N is NFA_MOPENN, likewise for NFA_MCLOSE.
    NFA_MOPEN,
    NFA_MOPEN1,
    NFA_MOPEN2,
    NFA_MOPEN3,
    NFA_MOPEN4,
    NFA_MOPEN5,
    NFA_MOPEN6,
    NFA_MOPEN7,
    NFA_MOPEN8,
    NFA_MOPEN9,

    NFA_MCLOSE,
    NFA_MCLOSE1,
    NFA_MCLOSE2,
    NFA_MCLOSE3,
    NFA_MCLOSE4,
    NFA_MCLOSE5,
    NFA_MCLOSE6,
    NFA_MCLOSE7,
    NFA_MCLOSE8,
    NFA_MCLOSE9,

#ifdef FEAT_SYN_HL
    NFA_ZOPEN,
    NFA_ZOPEN1,
    NFA_ZOPEN2,
    NFA_ZOPEN3,
    NFA_ZOPEN4,
    NFA_ZOPEN5,
    NFA_ZOPEN6,
    NFA_ZOPEN7,
    NFA_ZOPEN8,
    NFA_ZOPEN9,

    NFA_ZCLOSE,
    NFA_ZCLOSE1,
    NFA_ZCLOSE2,
    NFA_ZCLOSE3,
    NFA_ZCLOSE4,
    NFA_ZCLOSE5,
    NFA_ZCLOSE6,
    NFA_ZCLOSE7,
    NFA_ZCLOSE8,
    NFA_ZCLOSE9,
#endif

    // NFA_FIRST_NL
    // The codes NFA_ANY .. NFA_NUPPER_IC below are the ones NFA_ADD_NL may
    // be added to; keep their number equal to NFA_ADD_NL.
    NFA_ANY,                        // Match any one character.
    NFA_IDENT,                      // Match identifier char
    NFA_SIDENT,                     // Match identifier char but no digit
    NFA_KWORD,                      // Match keyword char
    NFA_SKWORD,                     // Match word char but no digit
    NFA_FNAME,                      // Match file name char
    NFA_SFNAME,                     // Match file name char but no digit
    NFA_PRINT,                      // Match printable char
    NFA_SPRINT,                     // Match printable char but no digit
    NFA_WHITE,                      // Match whitespace char
    NFA_NWHITE,                     // Match non-whitespace char
    NFA_DIGIT,                      // Match digit char
    NFA_NDIGIT,                     // Match non-digit char
    NFA_HEX,                        // Match hex char
    NFA_NHEX,                       // Match non-hex char
    NFA_OCTAL,                      // Match octal char
    NFA_NOCTAL,                     // Match non-octal char
    NFA_WORD,                       // Match word char
    NFA_NWORD,                      // Match non-word char
    NFA_HEAD,                       // Match head char
    NFA_NHEAD,                      // Match non-head char
    NFA_ALPHA,                      // Match alpha char
    NFA_NALPHA,                     // Match non-alpha char
    NFA_LOWER,                      // Match lowercase char
    NFA_NLOWER,                     // Match non-lowercase char
    NFA_UPPER,                      // Match uppercase char
    NFA_NUPPER,                     // Match non-uppercase char
    NFA_LOWER_IC,                   // Match [a-z]
    NFA_NLOWER_IC,                  // Match [^a-z]
    NFA_UPPER_IC,                   // Match [A-Z]
    NFA_NUPPER_IC,                  // Match [^A-Z]

    // First and last code of the "also matches NL" variants.  The items
    // that follow continue counting from NFA_LAST_NL + 1.
    NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
    NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

    NFA_CURSOR,                     // Match cursor pos
    NFA_LNUM,                       // Match line number
    NFA_LNUM_GT,                    // Match > line number
    NFA_LNUM_LT,                    // Match < line number
    NFA_COL,                        // Match cursor column
    NFA_COL_GT,                     // Match > cursor column
    NFA_COL_LT,                     // Match < cursor column
    NFA_VCOL,                       // Match cursor virtual column
    NFA_VCOL_GT,                    // Match > cursor virtual column
    NFA_VCOL_LT,                    // Match < cursor virtual column
    NFA_MARK,                       // Match mark
    NFA_MARK_GT,                    // Match > mark
    NFA_MARK_LT,                    // Match < mark
    NFA_VISUAL,                     // Match Visual area

    // Character classes [:alnum:] etc
    NFA_CLASS_ALNUM,
    NFA_CLASS_ALPHA,
    NFA_CLASS_BLANK,
    NFA_CLASS_CNTRL,
    NFA_CLASS_DIGIT,
    NFA_CLASS_GRAPH,
    NFA_CLASS_LOWER,
    NFA_CLASS_PRINT,
    NFA_CLASS_PUNCT,
    NFA_CLASS_SPACE,
    NFA_CLASS_UPPER,
    NFA_CLASS_XDIGIT,
    NFA_CLASS_TAB,
    NFA_CLASS_RETURN,
    NFA_CLASS_BACKSPACE,
    NFA_CLASS_ESCAPE,
    NFA_CLASS_IDENT,
    NFA_CLASS_KEYWORD,
    NFA_CLASS_FNAME
};

Bram Moolenaar63d9e732019-12-05 21:10:38 +0100235// Keep in sync with classchars.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200236static int nfa_classcodes[] = {
237 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243 NFA_UPPER, NFA_NUPPER
244};
245
Bram Moolenaar0270f382018-07-17 05:43:58 +0200246// Variables only used in nfa_regcomp() and descendants.
247static int nfa_re_flags; // re_flags passed to nfa_regcomp()
248static int *post_start; // holds the postfix form of r.e.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200249static int *post_end;
250static int *post_ptr;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100251
252// Set when the pattern should use the NFA engine.
253// E.g. [[:upper:]] only allows 8bit characters for BT engine,
254// while NFA engine handles multibyte characters correctly.
255static int wants_nfa;
256
Bram Moolenaar0270f382018-07-17 05:43:58 +0200257static int nstate; // Number of states in the NFA.
258static int istate; // Index in the state vector, used in alloc_state()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200259
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100260// If not NULL match must end at this position
Bram Moolenaar307aa162013-06-02 16:34:21 +0200261static save_se_T *nfa_endp = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200262
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100263// 0 for first call to nfa_regmatch(), 1 for recursive call.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +0200264static int nfa_ll_index = 0;
265
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100266static int realloc_post_list(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100267static int nfa_reg(int paren);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200268#ifdef DEBUG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100269static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200270#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100271static int match_follows(nfa_state_T *startstate, int depth);
272static int failure_chance(nfa_state_T *state, int depth);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200273
// helper functions used when doing re2post() ... regatom() parsing
//
// EMIT(c) appends item "c" to the postfix list.  When the list is full it is
// enlarged with realloc_post_list() first; if that fails the macro makes the
// calling function return FAIL, thus it can only be used in functions that
// return an int status.
#define EMIT(c)	do {						\
		    if (post_ptr >= post_end && realloc_post_list() == FAIL) \
			return FAIL;				\
		    *post_ptr++ = (c);				\
		} while (0)

Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200281/*
282 * Initialize internal variables before NFA compilation.
283 * Return OK on success, FAIL otherwise.
284 */
285 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100286nfa_regcomp_start(
287 char_u *expr,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100288 int re_flags) // see vim_regcomp()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200289{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200290 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200291 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200292
293 nstate = 0;
294 istate = 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295 // A reasonable estimation for maximum size
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200296 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200297
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100298 // Some items blow up in size, such as [A-z]. Add more space for that.
299 // When it is still not enough realloc_post_list() will be used.
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200300 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200301
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302 // Size for postfix representation of expr.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200303 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200304
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200305 post_start = alloc(postfix_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200306 if (post_start == NULL)
307 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200308 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200309 post_end = post_start + nstate_max;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100310 wants_nfa = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +0200311 rex.nfa_has_zend = FALSE;
312 rex.nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200313
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100314 // shared with BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200315 regcomp_start(expr, re_flags);
316
317 return OK;
318}
319
320/*
Bram Moolenaard89616e2013-06-06 18:46:06 +0200321 * Figure out if the NFA state list starts with an anchor, must match at start
322 * of the line.
323 */
324 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100325nfa_get_reganch(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200326{
327 nfa_state_T *p = start;
328
329 if (depth > 4)
330 return 0;
331
332 while (p != NULL)
333 {
334 switch (p->c)
335 {
336 case NFA_BOL:
337 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100338 return 1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200339
340 case NFA_ZSTART:
341 case NFA_ZEND:
342 case NFA_CURSOR:
343 case NFA_VISUAL:
344
345 case NFA_MOPEN:
346 case NFA_MOPEN1:
347 case NFA_MOPEN2:
348 case NFA_MOPEN3:
349 case NFA_MOPEN4:
350 case NFA_MOPEN5:
351 case NFA_MOPEN6:
352 case NFA_MOPEN7:
353 case NFA_MOPEN8:
354 case NFA_MOPEN9:
355 case NFA_NOPEN:
356#ifdef FEAT_SYN_HL
357 case NFA_ZOPEN:
358 case NFA_ZOPEN1:
359 case NFA_ZOPEN2:
360 case NFA_ZOPEN3:
361 case NFA_ZOPEN4:
362 case NFA_ZOPEN5:
363 case NFA_ZOPEN6:
364 case NFA_ZOPEN7:
365 case NFA_ZOPEN8:
366 case NFA_ZOPEN9:
367#endif
368 p = p->out;
369 break;
370
371 case NFA_SPLIT:
372 return nfa_get_reganch(p->out, depth + 1)
373 && nfa_get_reganch(p->out1, depth + 1);
374
375 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100376 return 0; // noooo
Bram Moolenaard89616e2013-06-06 18:46:06 +0200377 }
378 }
379 return 0;
380}
381
382/*
383 * Figure out if the NFA state list starts with a character which must match
384 * at start of the match.
385 */
386 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100387nfa_get_regstart(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200388{
389 nfa_state_T *p = start;
390
391 if (depth > 4)
392 return 0;
393
394 while (p != NULL)
395 {
396 switch (p->c)
397 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100398 // all kinds of zero-width matches
Bram Moolenaard89616e2013-06-06 18:46:06 +0200399 case NFA_BOL:
400 case NFA_BOF:
401 case NFA_BOW:
402 case NFA_EOW:
403 case NFA_ZSTART:
404 case NFA_ZEND:
405 case NFA_CURSOR:
406 case NFA_VISUAL:
407 case NFA_LNUM:
408 case NFA_LNUM_GT:
409 case NFA_LNUM_LT:
410 case NFA_COL:
411 case NFA_COL_GT:
412 case NFA_COL_LT:
413 case NFA_VCOL:
414 case NFA_VCOL_GT:
415 case NFA_VCOL_LT:
416 case NFA_MARK:
417 case NFA_MARK_GT:
418 case NFA_MARK_LT:
419
420 case NFA_MOPEN:
421 case NFA_MOPEN1:
422 case NFA_MOPEN2:
423 case NFA_MOPEN3:
424 case NFA_MOPEN4:
425 case NFA_MOPEN5:
426 case NFA_MOPEN6:
427 case NFA_MOPEN7:
428 case NFA_MOPEN8:
429 case NFA_MOPEN9:
430 case NFA_NOPEN:
431#ifdef FEAT_SYN_HL
432 case NFA_ZOPEN:
433 case NFA_ZOPEN1:
434 case NFA_ZOPEN2:
435 case NFA_ZOPEN3:
436 case NFA_ZOPEN4:
437 case NFA_ZOPEN5:
438 case NFA_ZOPEN6:
439 case NFA_ZOPEN7:
440 case NFA_ZOPEN8:
441 case NFA_ZOPEN9:
442#endif
443 p = p->out;
444 break;
445
446 case NFA_SPLIT:
447 {
448 int c1 = nfa_get_regstart(p->out, depth + 1);
449 int c2 = nfa_get_regstart(p->out1, depth + 1);
450
451 if (c1 == c2)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100452 return c1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200453 return 0;
454 }
455
456 default:
Bram Moolenaardecd9542013-06-07 16:31:50 +0200457 if (p->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100458 return p->c; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200459 return 0;
460 }
461 }
462 return 0;
463}
464
465/*
Bram Moolenaar473de612013-06-08 18:19:48 +0200466 * Figure out if the NFA state list contains just literal text and nothing
Bram Moolenaare7766ee2013-06-08 22:30:03 +0200467 * else. If so return a string in allocated memory with what must match after
468 * regstart. Otherwise return NULL.
Bram Moolenaar473de612013-06-08 18:19:48 +0200469 */
470 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100471nfa_get_match_text(nfa_state_T *start)
Bram Moolenaar473de612013-06-08 18:19:48 +0200472{
473 nfa_state_T *p = start;
474 int len = 0;
475 char_u *ret;
476 char_u *s;
477
478 if (p->c != NFA_MOPEN)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100479 return NULL; // just in case
Bram Moolenaar473de612013-06-08 18:19:48 +0200480 p = p->out;
481 while (p->c > 0)
482 {
483 len += MB_CHAR2LEN(p->c);
484 p = p->out;
485 }
486 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
487 return NULL;
488
489 ret = alloc(len);
490 if (ret != NULL)
491 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100492 p = start->out->out; // skip first char, it goes into regstart
Bram Moolenaar473de612013-06-08 18:19:48 +0200493 s = ret;
494 while (p->c > 0)
495 {
Bram Moolenaar473de612013-06-08 18:19:48 +0200496 if (has_mbyte)
497 s += (*mb_char2bytes)(p->c, s);
498 else
Bram Moolenaar473de612013-06-08 18:19:48 +0200499 *s++ = p->c;
500 p = p->out;
501 }
502 *s = NUL;
503 }
504 return ret;
505}
506
507/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200508 * Allocate more space for post_start. Called when
509 * running above the estimated number of states.
510 */
511 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100512realloc_post_list(void)
Bram Moolenaar16299b52013-05-30 18:45:23 +0200513{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200514 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100515 int new_max;
Bram Moolenaar16299b52013-05-30 18:45:23 +0200516 int *new_start;
517 int *old_start;
518
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100519 // For weird patterns the number of states can be very high. Increasing by
520 // 50% seems a reasonable compromise between memory use and speed.
521 new_max = nstate_max * 3 / 2;
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200522 new_start = ALLOC_MULT(int, new_max);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200523 if (new_start == NULL)
524 return FAIL;
525 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
Bram Moolenaar16299b52013-05-30 18:45:23 +0200526 old_start = post_start;
527 post_start = new_start;
528 post_ptr = new_start + (post_ptr - old_start);
529 post_end = post_start + new_max;
530 vim_free(old_start);
531 return OK;
532}
533
534/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200535 * Search between "start" and "end" and try to recognize a
536 * character class in expanded form. For example [0-9].
537 * On success, return the id the character class to be emitted.
538 * On failure, return 0 (=FAIL)
539 * Start points to the first char of the range, while end should point
540 * to the closing brace.
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200541 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
542 * need to be interpreted as [a-zA-Z].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200543 */
544 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100545nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200546{
Bram Moolenaarf8115092013-06-04 17:47:05 +0200547# define CLASS_not 0x80
548# define CLASS_af 0x40
549# define CLASS_AF 0x20
550# define CLASS_az 0x10
551# define CLASS_AZ 0x08
552# define CLASS_o7 0x04
553# define CLASS_o9 0x02
554# define CLASS_underscore 0x01
555
556 int newl = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200557 char_u *p;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200558 int config = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200559
560 if (extra_newl == TRUE)
561 newl = TRUE;
562
563 if (*end != ']')
564 return FAIL;
565 p = start;
566 if (*p == '^')
567 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200568 config |= CLASS_not;
Bram Moolenaar01d89dd2013-06-03 19:41:06 +0200569 p++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200570 }
571
572 while (p < end)
573 {
574 if (p + 2 < end && *(p + 1) == '-')
575 {
576 switch (*p)
577 {
578 case '0':
579 if (*(p + 2) == '9')
580 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200581 config |= CLASS_o9;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200582 break;
583 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200584 if (*(p + 2) == '7')
585 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200586 config |= CLASS_o7;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200587 break;
588 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200589 return FAIL;
590
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200591 case 'a':
592 if (*(p + 2) == 'z')
593 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200594 config |= CLASS_az;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200595 break;
596 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200597 if (*(p + 2) == 'f')
598 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200599 config |= CLASS_af;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200600 break;
601 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200602 return FAIL;
603
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200604 case 'A':
605 if (*(p + 2) == 'Z')
606 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200607 config |= CLASS_AZ;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200608 break;
609 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200610 if (*(p + 2) == 'F')
611 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200612 config |= CLASS_AF;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200613 break;
614 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200615 return FAIL;
616
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200617 default:
618 return FAIL;
619 }
620 p += 3;
621 }
622 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
623 {
624 newl = TRUE;
625 p += 2;
626 }
627 else if (*p == '_')
628 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200629 config |= CLASS_underscore;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200630 p ++;
631 }
632 else if (*p == '\n')
633 {
634 newl = TRUE;
635 p ++;
636 }
637 else
638 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100639 } // while (p < end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200640
641 if (p != end)
642 return FAIL;
643
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200644 if (newl == TRUE)
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200645 extra_newl = NFA_ADD_NL;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200646
647 switch (config)
648 {
649 case CLASS_o9:
650 return extra_newl + NFA_DIGIT;
651 case CLASS_not | CLASS_o9:
652 return extra_newl + NFA_NDIGIT;
653 case CLASS_af | CLASS_AF | CLASS_o9:
654 return extra_newl + NFA_HEX;
655 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
656 return extra_newl + NFA_NHEX;
657 case CLASS_o7:
658 return extra_newl + NFA_OCTAL;
659 case CLASS_not | CLASS_o7:
660 return extra_newl + NFA_NOCTAL;
661 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
662 return extra_newl + NFA_WORD;
663 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
664 return extra_newl + NFA_NWORD;
665 case CLASS_az | CLASS_AZ | CLASS_underscore:
666 return extra_newl + NFA_HEAD;
667 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
668 return extra_newl + NFA_NHEAD;
669 case CLASS_az | CLASS_AZ:
670 return extra_newl + NFA_ALPHA;
671 case CLASS_not | CLASS_az | CLASS_AZ:
672 return extra_newl + NFA_NALPHA;
673 case CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200674 return extra_newl + NFA_LOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200675 case CLASS_not | CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200676 return extra_newl + NFA_NLOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200677 case CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200678 return extra_newl + NFA_UPPER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200679 case CLASS_not | CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200680 return extra_newl + NFA_NUPPER_IC;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200681 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200682 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200683}
684
685/*
686 * Produce the bytes for equivalence class "c".
687 * Currently only handles latin1, latin9 and utf-8.
688 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
689 * equivalent to 'a OR b OR c'
690 *
691 * NOTE! When changing this function, also update reg_equi_class()
692 */
693 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100694nfa_emit_equi_class(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695{
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200696#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200697
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200698 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
699 || STRCMP(p_enc, "iso-8859-15") == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200700 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200701#ifdef EBCDIC
702# define A_circumflex 0x62
703# define A_diaeresis 0x63
704# define A_grave 0x64
705# define A_acute 0x65
706# define A_virguilla 0x66
707# define A_ring 0x67
708# define C_cedilla 0x68
709# define E_acute 0x71
710# define E_circumflex 0x72
711# define E_diaeresis 0x73
712# define E_grave 0x74
713# define I_acute 0x75
714# define I_circumflex 0x76
715# define I_diaeresis 0x77
716# define I_grave 0x78
717# define N_virguilla 0x69
718# define O_circumflex 0xeb
719# define O_diaeresis 0xec
720# define O_grave 0xed
721# define O_acute 0xee
722# define O_virguilla 0xef
723# define O_slash 0x80
724# define U_circumflex 0xfb
725# define U_diaeresis 0xfc
726# define U_grave 0xfd
727# define U_acute 0xfe
728# define Y_acute 0xba
729# define a_grave 0x42
730# define a_acute 0x43
731# define a_circumflex 0x44
732# define a_virguilla 0x45
733# define a_diaeresis 0x46
734# define a_ring 0x47
735# define c_cedilla 0x48
736# define e_grave 0x51
737# define e_acute 0x52
738# define e_circumflex 0x53
739# define e_diaeresis 0x54
740# define i_grave 0x55
741# define i_acute 0x56
742# define i_circumflex 0x57
743# define i_diaeresis 0x58
744# define n_virguilla 0x49
745# define o_grave 0xcb
746# define o_acute 0xcc
747# define o_circumflex 0xcd
748# define o_virguilla 0xce
749# define o_diaeresis 0xcf
750# define o_slash 0x70
751# define u_grave 0xdb
752# define u_acute 0xdc
753# define u_circumflex 0xdd
754# define u_diaeresis 0xde
755# define y_acute 0x8d
756# define y_diaeresis 0xdf
757#else
758# define A_grave 0xc0
759# define A_acute 0xc1
760# define A_circumflex 0xc2
761# define A_virguilla 0xc3
762# define A_diaeresis 0xc4
763# define A_ring 0xc5
764# define C_cedilla 0xc7
765# define E_grave 0xc8
766# define E_acute 0xc9
767# define E_circumflex 0xca
768# define E_diaeresis 0xcb
769# define I_grave 0xcc
770# define I_acute 0xcd
771# define I_circumflex 0xce
772# define I_diaeresis 0xcf
773# define N_virguilla 0xd1
774# define O_grave 0xd2
775# define O_acute 0xd3
776# define O_circumflex 0xd4
777# define O_virguilla 0xd5
778# define O_diaeresis 0xd6
779# define O_slash 0xd8
780# define U_grave 0xd9
781# define U_acute 0xda
782# define U_circumflex 0xdb
783# define U_diaeresis 0xdc
784# define Y_acute 0xdd
785# define a_grave 0xe0
786# define a_acute 0xe1
787# define a_circumflex 0xe2
788# define a_virguilla 0xe3
789# define a_diaeresis 0xe4
790# define a_ring 0xe5
791# define c_cedilla 0xe7
792# define e_grave 0xe8
793# define e_acute 0xe9
794# define e_circumflex 0xea
795# define e_diaeresis 0xeb
796# define i_grave 0xec
797# define i_acute 0xed
798# define i_circumflex 0xee
799# define i_diaeresis 0xef
800# define n_virguilla 0xf1
801# define o_grave 0xf2
802# define o_acute 0xf3
803# define o_circumflex 0xf4
804# define o_virguilla 0xf5
805# define o_diaeresis 0xf6
806# define o_slash 0xf8
807# define u_grave 0xf9
808# define u_acute 0xfa
809# define u_circumflex 0xfb
810# define u_diaeresis 0xfc
811# define y_acute 0xfd
812# define y_diaeresis 0xff
813#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200814 switch (c)
815 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200816 case 'A': case A_grave: case A_acute: case A_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200817 case A_virguilla: case A_diaeresis: case A_ring:
818 case 0x100: case 0x102: case 0x104: case 0x1cd:
819 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
820 case 0x202: case 0x226: case 0x23a: case 0x1e00:
821 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
822 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
823 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
824 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
825 EMIT2(A_circumflex) EMIT2(A_virguilla)
826 EMIT2(A_diaeresis) EMIT2(A_ring)
827 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
828 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
829 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
830 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
831 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
832 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
833 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
834 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200835 return OK;
836
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200837 case 'B': case 0x181: case 0x243: case 0x1e02:
838 case 0x1e04: case 0x1e06:
839 EMIT2('B')
840 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
841 EMIT2(0x1e04) EMIT2(0x1e06)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200842 return OK;
843
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200844 case 'C': case C_cedilla: case 0x106: case 0x108:
845 case 0x10a: case 0x10c: case 0x187: case 0x23b:
846 case 0x1e08: case 0xa792:
847 EMIT2('C') EMIT2(C_cedilla)
848 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
849 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
850 EMIT2(0x1e08) EMIT2(0xa792)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200851 return OK;
852
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200853 case 'D': case 0x10e: case 0x110: case 0x18a:
854 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
855 case 0x1e12:
856 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
857 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
858 EMIT2(0x1e10) EMIT2(0x1e12)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200859 return OK;
860
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200861 case 'E': case E_grave: case E_acute: case E_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200862 case E_diaeresis: case 0x112: case 0x114: case 0x116:
863 case 0x118: case 0x11a: case 0x204: case 0x206:
864 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
865 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
866 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
867 case 0x1ec2: case 0x1ec4: case 0x1ec6:
868 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
869 EMIT2(E_circumflex) EMIT2(E_diaeresis)
870 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
871 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
872 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
873 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
874 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
875 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
876 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
877 EMIT2(0x1ec6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200878 return OK;
879
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200880 case 'F': case 0x191: case 0x1e1e: case 0xa798:
881 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200882 return OK;
883
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200884 case 'G': case 0x11c: case 0x11e: case 0x120:
885 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
886 case 0x1f4: case 0x1e20: case 0xa7a0:
887 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
888 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
889 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
890 EMIT2(0xa7a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200891 return OK;
892
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200893 case 'H': case 0x124: case 0x126: case 0x21e:
894 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
895 case 0x1e2a: case 0x2c67:
896 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
897 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
898 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200899 return OK;
900
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200901 case 'I': case I_grave: case I_acute: case I_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200902 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
903 case 0x12e: case 0x130: case 0x197: case 0x1cf:
904 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
905 case 0x1ec8: case 0x1eca:
906 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
907 EMIT2(I_circumflex) EMIT2(I_diaeresis)
908 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
909 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
910 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
911 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
912 EMIT2(0x1eca)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200913 return OK;
914
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200915 case 'J': case 0x134: case 0x248:
916 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200917 return OK;
918
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200919 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
920 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
921 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
922 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
923 EMIT2(0x2c69) EMIT2(0xa740)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200924 return OK;
925
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200926 case 'L': case 0x139: case 0x13b: case 0x13d:
927 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
928 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
929 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
930 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
931 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
932 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200933 return OK;
934
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200935 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
936 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
937 EMIT2(0x1e42)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200938 return OK;
939
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200940 case 'N': case N_virguilla:
941 case 0x143: case 0x145: case 0x147: case 0x1f8:
942 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
943 case 0xa7a4:
944 EMIT2('N') EMIT2(N_virguilla)
945 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
946 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
947 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200948 return OK;
949
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200950 case 'O': case O_grave: case O_acute: case O_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200951 case O_virguilla: case O_diaeresis: case O_slash:
952 case 0x14c: case 0x14e: case 0x150: case 0x19f:
953 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
954 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
955 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
956 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
957 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
958 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
959 case 0x1ede: case 0x1ee0: case 0x1ee2:
960 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
961 EMIT2(O_circumflex) EMIT2(O_virguilla)
962 EMIT2(O_diaeresis) EMIT2(O_slash)
963 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
964 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
965 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
966 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
967 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
968 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
969 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
970 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
971 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
972 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
973 EMIT2(0x1ee2)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200974 return OK;
975
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200976 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
977 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
978 EMIT2(0x2c63)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200979 return OK;
980
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200981 case 'Q': case 0x24a:
982 EMIT2('Q') EMIT2(0x24a)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200983 return OK;
984
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
986 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
987 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
988 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
989 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
990 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
991 EMIT2(0xa7a6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200992 return OK;
993
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200994 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
995 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
996 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
997 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
998 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
999 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
1000 EMIT2(0xa7a8)
1001 return OK;
1002
1003 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
1004 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
1005 case 0x1e6e: case 0x1e70:
1006 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
1007 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
1008 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001009 return OK;
1010
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001011 case 'U': case U_grave: case U_acute: case U_diaeresis:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001012 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
1013 case 0x16e: case 0x170: case 0x172: case 0x1af:
1014 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
1015 case 0x1db: case 0x214: case 0x216: case 0x244:
1016 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
1017 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
1018 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
1019 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
1020 EMIT2(U_diaeresis) EMIT2(U_circumflex)
1021 EMIT2(0x168) EMIT2(0x16a)
1022 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
1023 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
1024 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
1025 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
1026 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
1027 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
1028 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
1029 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
1030 EMIT2(0x1ef0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001031 return OK;
1032
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001033 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
1034 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001035 return OK;
1036
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001037 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
1038 case 0x1e86: case 0x1e88:
1039 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
1040 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001041 return OK;
1042
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001043 case 'X': case 0x1e8a: case 0x1e8c:
1044 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001045 return OK;
1046
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001047 case 'Y': case Y_acute: case 0x176: case 0x178:
1048 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
1049 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
1050 EMIT2('Y') EMIT2(Y_acute)
1051 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
1052 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
1053 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
1054 EMIT2(0x1ef8)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001055 return OK;
1056
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001057 case 'Z': case 0x179: case 0x17b: case 0x17d:
1058 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1059 case 0x2c6b:
1060 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1061 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1062 EMIT2(0x1e94) EMIT2(0x2c6b)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001063 return OK;
1064
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001065 case 'a': case a_grave: case a_acute: case a_circumflex:
1066 case a_virguilla: case a_diaeresis: case a_ring:
1067 case 0x101: case 0x103: case 0x105: case 0x1ce:
1068 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1069 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1070 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1071 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1072 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1073 case 0x1eb7: case 0x2c65:
1074 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1075 EMIT2(a_circumflex) EMIT2(a_virguilla)
1076 EMIT2(a_diaeresis) EMIT2(a_ring)
1077 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1078 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1079 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1080 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1081 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1082 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1083 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1084 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1085 EMIT2(0x1eb7) EMIT2(0x2c65)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001086 return OK;
1087
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001088 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1089 case 0x1e03: case 0x1e05: case 0x1e07:
1090 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1091 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001092 return OK;
1093
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001094 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1095 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1096 case 0xa794:
1097 EMIT2('c') EMIT2(c_cedilla)
1098 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1099 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1100 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001101 return OK;
1102
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001103 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1104 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1105 case 0x1e11: case 0x1e13:
1106 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1107 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1108 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1109 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001110 return OK;
1111
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001112 case 'e': case e_grave: case e_acute: case e_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001113 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1114 case 0x119: case 0x11b: case 0x205: case 0x207:
1115 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1116 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1117 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1118 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1119 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1120 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1121 EMIT2(0x113) EMIT2(0x115)
1122 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1123 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1124 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1125 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1126 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1127 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1128 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001129 return OK;
1130
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001131 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1132 case 0x1e1f: case 0xa799:
1133 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1134 EMIT2(0x1e1f) EMIT2(0xa799)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001135 return OK;
1136
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001137 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1138 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1139 case 0x1e21: case 0xa7a1:
1140 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1141 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1142 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1143 EMIT2(0x1e21) EMIT2(0xa7a1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001144 return OK;
1145
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001146 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1147 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1148 case 0x1e96: case 0x2c68: case 0xa795:
1149 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1150 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1151 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1152 EMIT2(0x2c68) EMIT2(0xa795)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001153 return OK;
1154
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001155 case 'i': case i_grave: case i_acute: case i_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001156 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1157 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1158 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1159 case 0x1ec9: case 0x1ecb:
1160 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1161 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1162 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1163 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1164 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1165 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1166 EMIT2(0x1ecb) EMIT2(0x1ecb)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001167 return OK;
1168
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001169 case 'j': case 0x135: case 0x1f0: case 0x249:
1170 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001171 return OK;
1172
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001173 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1174 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1175 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1176 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1177 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001178 return OK;
1179
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001180 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1181 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1182 case 0x1e3d: case 0x2c61:
1183 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1184 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1185 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1186 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001187 return OK;
1188
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001189 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1190 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1191 EMIT2(0x1e41) EMIT2(0x1e43)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001192 return OK;
1193
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001194 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1195 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1196 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1197 EMIT2('n') EMIT2(n_virguilla)
1198 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1199 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1200 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1201 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001202 return OK;
1203
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001204 case 'o': case o_grave: case o_acute: case o_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001205 case o_virguilla: case o_diaeresis: case o_slash:
1206 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1207 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1208 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1209 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1210 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1211 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1212 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1213 case 0x1edf: case 0x1ee1: case 0x1ee3:
1214 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1215 EMIT2(o_circumflex) EMIT2(o_virguilla)
1216 EMIT2(o_diaeresis) EMIT2(o_slash)
1217 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1218 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1219 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1220 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1221 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1222 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1223 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1224 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1225 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1226 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1227 EMIT2(0x1ee3)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001228 return OK;
1229
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001230 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1231 case 0x1e55: case 0x1e57:
1232 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1233 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001234 return OK;
1235
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001236 case 'q': case 0x24b: case 0x2a0:
1237 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001238 return OK;
1239
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001240 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1241 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1242 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1243 case 0xa7a7:
1244 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1245 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1246 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1247 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001248 return OK;
1249
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001250 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1251 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1252 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1253 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1254 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1255 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1256 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1257 return OK;
1258
1259 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1260 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1261 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1262 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1263 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1264 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1265 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001266 return OK;
1267
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001268 case 'u': case u_grave: case u_acute: case u_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001269 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1270 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1271 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1272 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1273 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1274 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1275 case 0x1eed: case 0x1eef: case 0x1ef1:
1276 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1277 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1278 EMIT2(0x169) EMIT2(0x16b)
1279 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1280 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1281 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1282 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1283 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1284 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1285 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1286 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1287 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001288 return OK;
1289
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001290 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1291 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1292 EMIT2(0x1e7f)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001293 return OK;
1294
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001295 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1296 case 0x1e87: case 0x1e89: case 0x1e98:
1297 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1298 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001299 return OK;
1300
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001301 case 'x': case 0x1e8b: case 0x1e8d:
1302 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001303 return OK;
1304
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001305 case 'y': case y_acute: case y_diaeresis: case 0x177:
1306 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1307 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1308 case 0x1ef9:
1309 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1310 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1311 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1312 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001313 return OK;
1314
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001315 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1316 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1317 case 0x1e95: case 0x2c6c:
1318 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1319 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1320 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001321 return OK;
1322
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001323 // default: character itself
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001324 }
1325 }
1326
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001327 EMIT2(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001328 return OK;
1329#undef EMIT2
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001330#undef EMIT2
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001331}
1332
1333/*
1334 * Code to parse regular expression.
1335 *
1336 * We try to reuse parsing functions in regexp.c to
1337 * minimize surprise and keep the syntax consistent.
1338 */
1339
1340/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001341 * Parse the lowest level.
1342 *
1343 * An atom can be one of a long list of items. Many atoms match one character
1344 * in the text. It is often an ordinary character or a character class.
1345 * Parentheses can be used to make a pattern into an atom. The "\z(\)"
1346 * construct is only for syntax highlighting.
1347 *
1348 * atom ::= ordinary-atom
1349 * or \( pattern \)
1350 * or \%( pattern \)
1351 * or \z( pattern \)
1352 */
1353 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001354nfa_regatom(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001355{
1356 int c;
1357 int charclass;
1358 int equiclass;
1359 int collclass;
1360 int got_coll_char;
1361 char_u *p;
1362 char_u *endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001363 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001364 int extra = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001365 int emit_range;
1366 int negated;
1367 int result;
1368 int startc = -1;
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001369 int save_prev_at_start = prev_at_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001370
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001371 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001372 switch (c)
1373 {
Bram Moolenaar47196582013-05-25 22:04:23 +02001374 case NUL:
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001375 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar47196582013-05-25 22:04:23 +02001376
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001377 case Magic('^'):
1378 EMIT(NFA_BOL);
1379 break;
1380
1381 case Magic('$'):
1382 EMIT(NFA_EOL);
1383#if defined(FEAT_SYN_HL) || defined(PROTO)
1384 had_eol = TRUE;
1385#endif
1386 break;
1387
1388 case Magic('<'):
1389 EMIT(NFA_BOW);
1390 break;
1391
1392 case Magic('>'):
1393 EMIT(NFA_EOW);
1394 break;
1395
1396 case Magic('_'):
1397 c = no_Magic(getchr());
Bram Moolenaar174a8482013-11-28 14:20:17 +01001398 if (c == NUL)
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001399 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar174a8482013-11-28 14:20:17 +01001400
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001401 if (c == '^') // "\_^" is start-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001402 {
1403 EMIT(NFA_BOL);
1404 break;
1405 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001406 if (c == '$') // "\_$" is end-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001407 {
1408 EMIT(NFA_EOL);
1409#if defined(FEAT_SYN_HL) || defined(PROTO)
1410 had_eol = TRUE;
1411#endif
1412 break;
1413 }
1414
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001415 extra = NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001416
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001417 // "\_[" is collection plus newline
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001418 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001419 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001420
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001421 // "\_x" is character class plus newline
1422 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001423
1424 /*
1425 * Character classes.
1426 */
1427 case Magic('.'):
1428 case Magic('i'):
1429 case Magic('I'):
1430 case Magic('k'):
1431 case Magic('K'):
1432 case Magic('f'):
1433 case Magic('F'):
1434 case Magic('p'):
1435 case Magic('P'):
1436 case Magic('s'):
1437 case Magic('S'):
1438 case Magic('d'):
1439 case Magic('D'):
1440 case Magic('x'):
1441 case Magic('X'):
1442 case Magic('o'):
1443 case Magic('O'):
1444 case Magic('w'):
1445 case Magic('W'):
1446 case Magic('h'):
1447 case Magic('H'):
1448 case Magic('a'):
1449 case Magic('A'):
1450 case Magic('l'):
1451 case Magic('L'):
1452 case Magic('u'):
1453 case Magic('U'):
1454 p = vim_strchr(classchars, no_Magic(c));
1455 if (p == NULL)
1456 {
Bram Moolenaar174a8482013-11-28 14:20:17 +01001457 if (extra == NFA_ADD_NL)
1458 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001459 semsg(_(e_nfa_regexp_invalid_character_class_nr), c);
Bram Moolenaar174a8482013-11-28 14:20:17 +01001460 rc_did_emsg = TRUE;
1461 return FAIL;
1462 }
Bram Moolenaarb5443cc2019-01-15 20:19:40 +01001463 siemsg("INTERNAL: Unknown character class char: %d", c);
Bram Moolenaar5714b802013-05-28 22:03:20 +02001464 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001465 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001466
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001467 // When '.' is followed by a composing char ignore the dot, so that
1468 // the composing char is matched here.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001469 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1470 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001471 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001472 c = getchr();
1473 goto nfa_do_multibyte;
1474 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001475 EMIT(nfa_classcodes[p - classchars]);
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001476 if (extra == NFA_ADD_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001477 {
1478 EMIT(NFA_NEWL);
1479 EMIT(NFA_OR);
1480 regflags |= RF_HASNL;
1481 }
1482 break;
1483
1484 case Magic('n'):
1485 if (reg_string)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001486 // In a string "\n" matches a newline character.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001487 EMIT(NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001488 else
1489 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001490 // In buffer text "\n" matches the end of a line.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001491 EMIT(NFA_NEWL);
1492 regflags |= RF_HASNL;
1493 }
1494 break;
1495
1496 case Magic('('):
1497 if (nfa_reg(REG_PAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001498 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001499 break;
1500
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001501 case Magic('|'):
1502 case Magic('&'):
1503 case Magic(')'):
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001504 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001505 return FAIL;
1506
1507 case Magic('='):
1508 case Magic('?'):
1509 case Magic('+'):
1510 case Magic('@'):
1511 case Magic('*'):
1512 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001513 // these should follow an atom, not form an atom
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001514 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001515 return FAIL;
1516
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001517 case Magic('~'):
1518 {
1519 char_u *lp;
1520
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001521 // Previous substitute pattern.
1522 // Generated as "\%(pattern\)".
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001523 if (reg_prev_sub == NULL)
1524 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001525 emsg(_(e_no_previous_substitute_regular_expression));
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001526 return FAIL;
1527 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001528 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001529 {
1530 EMIT(PTR2CHAR(lp));
1531 if (lp != reg_prev_sub)
1532 EMIT(NFA_CONCAT);
1533 }
1534 EMIT(NFA_NOPEN);
1535 break;
1536 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001537
Bram Moolenaar428e9872013-05-30 17:05:39 +02001538 case Magic('1'):
1539 case Magic('2'):
1540 case Magic('3'):
1541 case Magic('4'):
1542 case Magic('5'):
1543 case Magic('6'):
1544 case Magic('7'):
1545 case Magic('8'):
1546 case Magic('9'):
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001547 {
1548 int refnum = no_Magic(c) - '1';
1549
1550 if (!seen_endbrace(refnum + 1))
1551 return FAIL;
1552 EMIT(NFA_BACKREF1 + refnum);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001553 rex.nfa_has_backref = TRUE;
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001554 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02001555 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001556
1557 case Magic('z'):
1558 c = no_Magic(getchr());
1559 switch (c)
1560 {
1561 case 's':
1562 EMIT(NFA_ZSTART);
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001563 if (re_mult_next("\\zs") == FAIL)
1564 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001565 break;
1566 case 'e':
1567 EMIT(NFA_ZEND);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001568 rex.nfa_has_zend = TRUE;
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001569 if (re_mult_next("\\ze") == FAIL)
1570 return FAIL;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02001571 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001572#ifdef FEAT_SYN_HL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001573 case '1':
1574 case '2':
1575 case '3':
1576 case '4':
1577 case '5':
1578 case '6':
1579 case '7':
1580 case '8':
1581 case '9':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001582 // \z1...\z9
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001583 if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001584 EMSG_RET_FAIL(_(e_z1_z9_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001585 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001586 // No need to set rex.nfa_has_backref, the sub-matches don't
1587 // change when \z1 .. \z9 matches or not.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001588 re_has_z = REX_USE;
1589 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001590 case '(':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001591 // \z(
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001592 if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001593 EMSG_RET_FAIL(_(e_z_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001594 if (nfa_reg(REG_ZPAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001595 return FAIL; // cascaded error
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001596 re_has_z = REX_SET;
1597 break;
1598#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001599 default:
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001600 semsg(_(e_nfa_regexp_unknown_operator_z_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001601 return FAIL;
1602 }
1603 break;
1604
1605 case Magic('%'):
1606 c = no_Magic(getchr());
1607 switch (c)
1608 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001609 // () without a back reference
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001610 case '(':
1611 if (nfa_reg(REG_NPAREN) == FAIL)
1612 return FAIL;
1613 EMIT(NFA_NOPEN);
1614 break;
1615
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001616 case 'd': // %d123 decimal
1617 case 'o': // %o123 octal
1618 case 'x': // %xab hex 2
1619 case 'u': // %uabcd hex 4
1620 case 'U': // %U1234abcd hex 8
Bram Moolenaar47196582013-05-25 22:04:23 +02001621 {
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001622 long nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001623
Bram Moolenaar47196582013-05-25 22:04:23 +02001624 switch (c)
1625 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001626 case 'd': nr = getdecchrs(); break;
1627 case 'o': nr = getoctchrs(); break;
1628 case 'x': nr = gethexchrs(2); break;
1629 case 'u': nr = gethexchrs(4); break;
1630 case 'U': nr = gethexchrs(8); break;
1631 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +02001632 }
1633
Bram Moolenaar527a2d82019-02-21 22:28:51 +01001634 if (nr < 0 || nr > INT_MAX)
Bram Moolenaara6f79292022-01-04 21:30:47 +00001635 EMSG2_RET_FAIL(_(e_invalid_character_after_str_2),
1636 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001637 // A NUL is stored in the text as NL
1638 // TODO: what if a composing character follows?
Bram Moolenaar595cad22013-09-22 13:57:24 +02001639 EMIT(nr == 0 ? 0x0a : nr);
Bram Moolenaar47196582013-05-25 22:04:23 +02001640 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001641 break;
1642
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001643 // Catch \%^ and \%$ regardless of where they appear in the
1644 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001645 case '^':
1646 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001647 break;
1648
1649 case '$':
1650 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001651 break;
1652
1653 case '#':
Bram Moolenaar423532e2013-05-29 21:14:42 +02001654 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001655 break;
1656
1657 case 'V':
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001658 EMIT(NFA_VISUAL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001659 break;
1660
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02001661 case 'C':
1662 EMIT(NFA_ANY_COMPOSING);
1663 break;
1664
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001665 case '[':
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001666 {
1667 int n;
1668
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001669 // \%[abc]
Bram Moolenaard7986252013-06-17 21:33:41 +02001670 for (n = 0; (c = peekchr()) != ']'; ++n)
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001671 {
1672 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001673 EMSG2_RET_FAIL(_(e_missing_sb_after_str),
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001674 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001675 // recursive call!
Bram Moolenaard7986252013-06-17 21:33:41 +02001676 if (nfa_regatom() == FAIL)
1677 return FAIL;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001678 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001679 getchr(); // get the ]
Bram Moolenaar2976c022013-06-05 21:30:37 +02001680 if (n == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001681 EMSG2_RET_FAIL(_(e_empty_str_brackets),
Bram Moolenaar2976c022013-06-05 21:30:37 +02001682 reg_magic == MAGIC_ALL);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001683 EMIT(NFA_OPT_CHARS);
1684 EMIT(n);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001685
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001686 // Emit as "\%(\%[abc]\)" to be able to handle
1687 // "\%[abc]*" which would cause the empty string to be
1688 // matched an unlimited number of times. NFA_NOPEN is
1689 // added only once at a position, while NFA_SPLIT is
1690 // added multiple times. This is more efficient than
1691 // not allowing NFA_SPLIT multiple times, it is used
1692 // a lot.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001693 EMIT(NFA_NOPEN);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001694 break;
1695 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02001696
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001697 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +02001698 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001699 long_u n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001700 int cmp = c;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001701 int cur = FALSE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001702
1703 if (c == '<' || c == '>')
1704 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001705 if (no_Magic(c) == '.')
1706 {
1707 cur = TRUE;
1708 c = getchr();
1709 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001710 while (VIM_ISDIGIT(c))
1711 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001712 long_u tmp;
1713
1714 if (cur)
1715 semsg(_(e_regexp_number_after_dot_pos_search),
1716 no_Magic(c));
1717 tmp = n * 10 + (c - '0');
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001718
1719 if (tmp < n)
1720 {
1721 // overflow.
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001722 emsg(_(e_percent_value_too_large));
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001723 return FAIL;
1724 }
1725 n = tmp;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001726 c = getchr();
1727 }
1728 if (c == 'l' || c == 'c' || c == 'v')
1729 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001730 long_u limit = INT_MAX;
Bram Moolenaar9403a212019-02-13 18:35:06 +01001731
Bram Moolenaar423532e2013-05-29 21:14:42 +02001732 if (c == 'l')
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001733 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001734 if (cur)
1735 n = curwin->w_cursor.lnum;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001736 // \%{n}l \%{n}<l \%{n}>l
Bram Moolenaar423532e2013-05-29 21:14:42 +02001737 EMIT(cmp == '<' ? NFA_LNUM_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001738 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001739 if (save_prev_at_start)
1740 at_start = TRUE;
1741 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001742 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001743 {
1744 if (cur)
1745 {
1746 n = curwin->w_cursor.col;
1747 n++;
1748 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001749 // \%{n}c \%{n}<c \%{n}>c
Bram Moolenaar423532e2013-05-29 21:14:42 +02001750 EMIT(cmp == '<' ? NFA_COL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001751 cmp == '>' ? NFA_COL_GT : NFA_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001752 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001753 else
Bram Moolenaar9403a212019-02-13 18:35:06 +01001754 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001755 if (cur)
1756 {
1757 colnr_T vcol = 0;
1758
1759 getvvcol(curwin, &curwin->w_cursor,
1760 NULL, NULL, &vcol);
1761 n = ++vcol;
1762 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001763 // \%{n}v \%{n}<v \%{n}>v
Bram Moolenaar423532e2013-05-29 21:14:42 +02001764 EMIT(cmp == '<' ? NFA_VCOL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001765 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
Bram Moolenaar9403a212019-02-13 18:35:06 +01001766 limit = INT_MAX / MB_MAXBYTES;
1767 }
1768 if (n >= limit)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001769 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001770 emsg(_(e_percent_value_too_large));
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001771 return FAIL;
1772 }
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001773 EMIT((int)n);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001774 break;
1775 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02001776 else if (c == '\'' && n == 0)
1777 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001778 // \%'m \%<'m \%>'m
Bram Moolenaar044aa292013-06-04 21:27:38 +02001779 EMIT(cmp == '<' ? NFA_MARK_LT :
1780 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001781 EMIT(getchr());
Bram Moolenaar044aa292013-06-04 21:27:38 +02001782 break;
1783 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001784 }
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001785 semsg(_(e_nfa_regexp_unknown_operator_percent_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001786 return FAIL;
1787 }
1788 break;
1789
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001790 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001791collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001792 /*
Bram Moolenaar417bad22013-06-07 14:08:30 +02001793 * [abc] uses NFA_START_COLL - NFA_END_COLL
1794 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1795 * Each character is produced as a regular state, using
1796 * NFA_CONCAT to bind them together.
1797 * Besides normal characters there can be:
1798 * - character classes NFA_CLASS_*
1799 * - ranges, two characters followed by NFA_RANGE.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001800 */
1801
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001802 p = regparse;
1803 endp = skip_anyof(p);
1804 if (*endp == ']')
1805 {
1806 /*
1807 * Try to reverse engineer character classes. For example,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001808 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001809 * and perform the necessary substitutions in the NFA.
1810 */
1811 result = nfa_recognize_char_class(regparse, endp,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001812 extra == NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001813 if (result != FAIL)
1814 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001815 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001816 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001817 EMIT(result - NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001818 EMIT(NFA_NEWL);
1819 EMIT(NFA_OR);
1820 }
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001821 else
1822 EMIT(result);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001823 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001824 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001825 return OK;
1826 }
1827 /*
1828 * Failed to recognize a character class. Use the simple
1829 * version that turns [abc] into 'a' OR 'b' OR 'c'
1830 */
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001831 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001832 negated = FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001833 if (*regparse == '^') // negated range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001834 {
1835 negated = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001836 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001837 EMIT(NFA_START_NEG_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001838 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001839 else
1840 EMIT(NFA_START_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001841 if (*regparse == '-')
1842 {
1843 startc = '-';
1844 EMIT(startc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001845 EMIT(NFA_CONCAT);
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001846 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001847 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001848 // Emit the OR branches for each character in the []
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001849 emit_range = FALSE;
1850 while (regparse < endp)
1851 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001852 int oldstartc = startc;
1853
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001854 startc = -1;
1855 got_coll_char = FALSE;
1856 if (*regparse == '[')
1857 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001858 // Check for [: :], [= =], [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001859 equiclass = collclass = 0;
1860 charclass = get_char_class(&regparse);
1861 if (charclass == CLASS_NONE)
1862 {
1863 equiclass = get_equi_class(&regparse);
1864 if (equiclass == 0)
1865 collclass = get_coll_element(&regparse);
1866 }
1867
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001868 // Character class like [:alpha:]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001869 if (charclass != CLASS_NONE)
1870 {
1871 switch (charclass)
1872 {
1873 case CLASS_ALNUM:
1874 EMIT(NFA_CLASS_ALNUM);
1875 break;
1876 case CLASS_ALPHA:
1877 EMIT(NFA_CLASS_ALPHA);
1878 break;
1879 case CLASS_BLANK:
1880 EMIT(NFA_CLASS_BLANK);
1881 break;
1882 case CLASS_CNTRL:
1883 EMIT(NFA_CLASS_CNTRL);
1884 break;
1885 case CLASS_DIGIT:
1886 EMIT(NFA_CLASS_DIGIT);
1887 break;
1888 case CLASS_GRAPH:
1889 EMIT(NFA_CLASS_GRAPH);
1890 break;
1891 case CLASS_LOWER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001892 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001893 EMIT(NFA_CLASS_LOWER);
1894 break;
1895 case CLASS_PRINT:
1896 EMIT(NFA_CLASS_PRINT);
1897 break;
1898 case CLASS_PUNCT:
1899 EMIT(NFA_CLASS_PUNCT);
1900 break;
1901 case CLASS_SPACE:
1902 EMIT(NFA_CLASS_SPACE);
1903 break;
1904 case CLASS_UPPER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001905 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001906 EMIT(NFA_CLASS_UPPER);
1907 break;
1908 case CLASS_XDIGIT:
1909 EMIT(NFA_CLASS_XDIGIT);
1910 break;
1911 case CLASS_TAB:
1912 EMIT(NFA_CLASS_TAB);
1913 break;
1914 case CLASS_RETURN:
1915 EMIT(NFA_CLASS_RETURN);
1916 break;
1917 case CLASS_BACKSPACE:
1918 EMIT(NFA_CLASS_BACKSPACE);
1919 break;
1920 case CLASS_ESCAPE:
1921 EMIT(NFA_CLASS_ESCAPE);
1922 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001923 case CLASS_IDENT:
1924 EMIT(NFA_CLASS_IDENT);
1925 break;
1926 case CLASS_KEYWORD:
1927 EMIT(NFA_CLASS_KEYWORD);
1928 break;
1929 case CLASS_FNAME:
1930 EMIT(NFA_CLASS_FNAME);
1931 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001932 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001933 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001934 continue;
1935 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001936 // Try equivalence class [=a=] and the like
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001937 if (equiclass != 0)
1938 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02001939 result = nfa_emit_equi_class(equiclass);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001940 if (result == FAIL)
1941 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001942 // should never happen
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001943 EMSG_RET_FAIL(_(e_error_building_nfa_with_equivalence_class));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001944 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001945 continue;
1946 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001947 // Try collating class like [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001948 if (collclass != 0)
1949 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001950 startc = collclass; // allow [.a.]-x as a range
1951 // Will emit the proper atom at the end of the
1952 // while loop.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001953 }
1954 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001955 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1956 // start character.
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001957 if (*regparse == '-' && oldstartc != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001958 {
1959 emit_range = TRUE;
1960 startc = oldstartc;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001961 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001962 continue; // reading the end of the range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001963 }
1964
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001965 // Now handle simple and escaped characters.
1966 // Only "\]", "\^", "\-" and "\\" are special in Vi. Vim
1967 // accepts "\t", "\e", etc., but only when the 'l' flag in
1968 // 'cpoptions' is not included.
1969 // Posix doesn't recognize backslash at all.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001970 if (*regparse == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001971 && !reg_cpo_bsl
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001972 && regparse + 1 <= endp
1973 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001974 || (!reg_cpo_lit
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001975 && vim_strchr(REGEXP_ABBR, regparse[1])
1976 != NULL)
1977 )
1978 )
1979 {
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001980 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001981
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001982 if (*regparse == 'n')
Bram Moolenaara5483442019-02-17 20:17:02 +01001983 startc = (reg_string || emit_range
1984 || regparse[1] == '-') ? NL : NFA_NEWL;
Bram Moolenaarabab0b02019-03-30 18:47:01 +01001985 else if (*regparse == 'd'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001986 || *regparse == 'o'
1987 || *regparse == 'x'
1988 || *regparse == 'u'
1989 || *regparse == 'U'
1990 )
1991 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001992 // TODO(RE) This needs more testing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001993 startc = coll_get_char();
1994 got_coll_char = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001995 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001996 }
1997 else
1998 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001999 // \r,\t,\e,\b
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002000 startc = backslash_trans(*regparse);
2001 }
2002 }
2003
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002004 // Normal printable char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002005 if (startc == -1)
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002006 startc = PTR2CHAR(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002007
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002008 // Previous char was '-', so this char is end of range.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002009 if (emit_range)
2010 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02002011 int endc = startc;
2012
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002013 startc = oldstartc;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002014 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002015 EMSG_RET_FAIL(_(e_reverse_range_in_character_class));
Bram Moolenaar417bad22013-06-07 14:08:30 +02002016
2017 if (endc > startc + 2)
2018 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002019 // Emit a range instead of the sequence of
2020 // individual characters.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002021 if (startc == 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002022 // \x00 is translated to \x0a, start at \x01.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002023 EMIT(1);
2024 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002025 --post_ptr; // remove NFA_CONCAT
Bram Moolenaar417bad22013-06-07 14:08:30 +02002026 EMIT(endc);
2027 EMIT(NFA_RANGE);
2028 EMIT(NFA_CONCAT);
2029 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002030 else if (has_mbyte && ((*mb_char2len)(startc) > 1
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002031 || (*mb_char2len)(endc) > 1))
2032 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002033 // Emit the characters in the range.
2034 // "startc" was already emitted, so skip it.
2035 //
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002036 for (c = startc + 1; c <= endc; c++)
2037 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002038 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002039 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002040 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002041 }
2042 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002043 {
2044#ifdef EBCDIC
2045 int alpha_only = FALSE;
2046
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002047 // for alphabetical range skip the gaps
2048 // 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002049 if (isalpha(startc) && isalpha(endc))
2050 alpha_only = TRUE;
2051#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002052 // Emit the range. "startc" was already emitted, so
2053 // skip it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002054 for (c = startc + 1; c <= endc; c++)
2055#ifdef EBCDIC
2056 if (!alpha_only || isalpha(startc))
2057#endif
2058 {
2059 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002060 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002061 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002062 }
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002063 emit_range = FALSE;
2064 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002065 }
2066 else
2067 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002068 // This char (startc) is not part of a range. Just
2069 // emit it.
2070 // Normally, simply emit startc. But if we get char
2071 // code=0 from a collating char, then replace it with
2072 // 0x0a.
2073 // This is needed to completely mimic the behaviour of
2074 // the backtracking engine.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002075 if (startc == NFA_NEWL)
2076 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002077 // Line break can't be matched as part of the
2078 // collection, add an OR below. But not for negated
2079 // range.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002080 if (!negated)
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002081 extra = NFA_ADD_NL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002082 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002083 else
Bram Moolenaar417bad22013-06-07 14:08:30 +02002084 {
2085 if (got_coll_char == TRUE && startc == 0)
2086 EMIT(0x0a);
2087 else
2088 EMIT(startc);
2089 EMIT(NFA_CONCAT);
2090 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002091 }
2092
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002093 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002094 } // while (regparse < endp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002095
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002096 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002097 if (*regparse == '-') // if last, '-' is just a char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002098 {
2099 EMIT('-');
Bram Moolenaar417bad22013-06-07 14:08:30 +02002100 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002101 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002102
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002103 // skip the trailing ]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002104 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002105 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002106
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002107 // Mark end of the collection.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002108 if (negated == TRUE)
Bram Moolenaar417bad22013-06-07 14:08:30 +02002109 EMIT(NFA_END_NEG_COLL);
2110 else
2111 EMIT(NFA_END_COLL);
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002112
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002113 // \_[] also matches \n but it's not negated
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002114 if (extra == NFA_ADD_NL)
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002115 {
2116 EMIT(reg_string ? NL : NFA_NEWL);
2117 EMIT(NFA_OR);
2118 }
2119
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002120 return OK;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002121 } // if exists closing ]
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002122
2123 if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002124 EMSG_RET_FAIL(_(e_missing_rsb_after_str_lsb));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002125 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002126
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002127 default:
2128 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002129 int plen;
2130
2131nfa_do_multibyte:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002132 // plen is length of current char with composing chars
Bram Moolenaar47196582013-05-25 22:04:23 +02002133 if (enc_utf8 && ((*mb_char2len)(c)
Bram Moolenaarace95982017-03-29 17:30:27 +02002134 != (plen = utfc_ptr2len(old_regparse))
Bram Moolenaar47196582013-05-25 22:04:23 +02002135 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002136 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02002137 int i = 0;
2138
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002139 // A base character plus composing characters, or just one
2140 // or more composing characters.
2141 // This requires creating a separate atom as if enclosing
2142 // the characters in (), where NFA_COMPOSING is the ( and
2143 // NFA_END_COMPOSING is the ). Note that right now we are
2144 // building the postfix form, not the NFA itself;
2145 // a composing char could be: a, b, c, NFA_COMPOSING
2146 // where 'b' and 'c' are chars with codes > 256.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002147 for (;;)
2148 {
2149 EMIT(c);
2150 if (i > 0)
2151 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002152 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002153 break;
2154 c = utf_ptr2char(old_regparse + i);
2155 }
2156 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002157 regparse = old_regparse + plen;
2158 }
2159 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002160 {
2161 c = no_Magic(c);
2162 EMIT(c);
2163 }
2164 return OK;
2165 }
2166 }
2167
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002168 return OK;
2169}
2170
2171/*
2172 * Parse a piece: an atom possibly followed by a multi (*, +, =, ?, @ or {).
2173 *
2174 * A piece is an atom, possibly followed by a multi, an indication of how many
2175 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2176 * characters: "", "a", "aa", etc.
2177 *
2178 * piece ::= atom
2179 * or atom multi
2180 */
2181 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002182nfa_regpiece(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002183{
2184 int i;
2185 int op;
2186 int ret;
2187 long minval, maxval;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002188 int greedy = TRUE; // Braces are prefixed with '-' ?
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002189 parse_state_T old_state;
2190 parse_state_T new_state;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01002191 long c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02002192 int old_post_pos;
2193 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002194 int quest;
2195
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002196 // Save the current parse state, so that we can use it if <atom>{m,n} is
2197 // next.
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002198 save_parse_state(&old_state);
2199
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002200 // store current pos in the postfix form, for \{m,n} involving 0s
Bram Moolenaar16299b52013-05-30 18:45:23 +02002201 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002202
2203 ret = nfa_regatom();
2204 if (ret == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002205 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002206
2207 op = peekchr();
2208 if (re_multi_type(op) == NOT_MULTI)
2209 return OK;
2210
2211 skipchr();
2212 switch (op)
2213 {
2214 case Magic('*'):
2215 EMIT(NFA_STAR);
2216 break;
2217
2218 case Magic('+'):
2219 /*
2220 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2221 * first and only submatch would be "aaa". But the backtracking
2222 * engine interprets the plus as "try matching one more time", and
2223 * a* matches a second time at the end of the input, the empty
2224 * string.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002225 * The submatch will be the empty string.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002226 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002227 * In order to be consistent with the old engine, we replace
2228 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002229 */
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002230 restore_parse_state(&old_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002231 curchr = -1;
2232 if (nfa_regatom() == FAIL)
2233 return FAIL;
2234 EMIT(NFA_STAR);
2235 EMIT(NFA_CONCAT);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002236 skipchr(); // skip the \+
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002237 break;
2238
2239 case Magic('@'):
Bram Moolenaar61602c52013-06-01 19:54:43 +02002240 c2 = getdecchrs();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002241 op = no_Magic(getchr());
Bram Moolenaar61602c52013-06-01 19:54:43 +02002242 i = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002243 switch(op)
2244 {
2245 case '=':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002246 // \@=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002247 i = NFA_PREV_ATOM_NO_WIDTH;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002248 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002249 case '!':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002250 // \@!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002251 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002252 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002253 case '<':
Bram Moolenaar61602c52013-06-01 19:54:43 +02002254 op = no_Magic(getchr());
2255 if (op == '=')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002256 // \@<=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002257 i = NFA_PREV_ATOM_JUST_BEFORE;
2258 else if (op == '!')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002259 // \@<!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002260 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2261 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002262 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002263 // \@>
Bram Moolenaar87953742013-06-05 18:52:40 +02002264 i = NFA_PREV_ATOM_LIKE_PATTERN;
2265 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002266 }
Bram Moolenaar61602c52013-06-01 19:54:43 +02002267 if (i == 0)
2268 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002269 semsg(_(e_nfa_regexp_unknown_operator_at_chr), op);
Bram Moolenaar61602c52013-06-01 19:54:43 +02002270 return FAIL;
2271 }
2272 EMIT(i);
2273 if (i == NFA_PREV_ATOM_JUST_BEFORE
2274 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2275 EMIT(c2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002276 break;
2277
2278 case Magic('?'):
2279 case Magic('='):
2280 EMIT(NFA_QUEST);
2281 break;
2282
2283 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002284 // a{2,5} will expand to 'aaa?a?a?'
2285 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2286 // version of '?'
2287 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2288 // parenthesis have the same id
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002289
2290 greedy = TRUE;
2291 c2 = peekchr();
2292 if (c2 == '-' || c2 == Magic('-'))
2293 {
2294 skipchr();
2295 greedy = FALSE;
2296 }
2297 if (!read_limits(&minval, &maxval))
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002298 EMSG_RET_FAIL(_(e_nfa_regexp_error_reading_repetition_limits));
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002299
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002300 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2301 // <atom>*
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002302 if (minval == 0 && maxval == MAX_LIMIT)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002303 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002304 if (greedy) // { { (match the braces)
2305 // \{}, \{0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002306 EMIT(NFA_STAR);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002307 else // { { (match the braces)
2308 // \{-}, \{-0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002309 EMIT(NFA_STAR_NONGREEDY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002310 break;
2311 }
2312
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002313 // Special case: x{0} or x{-0}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002314 if (maxval == 0)
2315 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002316 // Ignore result of previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002317 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002318 // NFA_EMPTY is 0-length and works everywhere
Bram Moolenaar699c1202013-09-25 16:41:54 +02002319 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002320 return OK;
2321 }
2322
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002323 // The engine is very inefficient (uses too many states) when the
2324 // maximum is much larger than the minimum and when the maximum is
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002325 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2326 // will emit NFA_STAR.
2327 // Bail out if we can use the other engine, but only, when the
2328 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00002329 // does not work with characters > 8 bit with the BT engine)
Bram Moolenaara1d2c582015-02-10 18:18:17 +01002330 if ((nfa_re_flags & RE_AUTO)
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002331 && (maxval > 500 || maxval > minval + 200)
2332 && (maxval != MAX_LIMIT && minval < 200)
2333 && !wants_nfa)
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002334 return FAIL;
2335
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002336 // Ignore previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002337 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002338 // Save parse state after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002339 save_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002340
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002341 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2342 for (i = 0; i < maxval; i++)
2343 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002344 // Goto beginning of the repeated atom
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002345 restore_parse_state(&old_state);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002346 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002347 if (nfa_regatom() == FAIL)
2348 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002349 // after "minval" times, atoms are optional
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002350 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002351 {
2352 if (maxval == MAX_LIMIT)
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002353 {
2354 if (greedy)
2355 EMIT(NFA_STAR);
2356 else
2357 EMIT(NFA_STAR_NONGREEDY);
2358 }
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002359 else
2360 EMIT(quest);
2361 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02002362 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002363 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002364 if (i + 1 > minval && maxval == MAX_LIMIT)
2365 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002366 }
2367
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002368 // Go to just after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002369 restore_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002370 curchr = -1;
2371
2372 break;
2373
2374
2375 default:
2376 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002377 } // end switch
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002378
2379 if (re_multi_type(peekchr()) != NOT_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002380 // Can't have a multi follow a multi.
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002381 EMSG_RET_FAIL(_(e_nfa_regexp_cant_have_multi_follow_multi));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002382
2383 return OK;
2384}
2385
2386/*
2387 * Parse one or more pieces, concatenated. It matches a match for the
2388 * first piece, followed by a match for the second piece, etc. Example:
2389 * "f[0-9]b", first matches "f", then a digit and then "b".
2390 *
2391 * concat ::= piece
2392 * or piece piece
2393 * or piece piece piece
2394 * etc.
2395 */
2396 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002397nfa_regconcat(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002398{
2399 int cont = TRUE;
2400 int first = TRUE;
2401
2402 while (cont)
2403 {
2404 switch (peekchr())
2405 {
2406 case NUL:
2407 case Magic('|'):
2408 case Magic('&'):
2409 case Magic(')'):
2410 cont = FALSE;
2411 break;
2412
2413 case Magic('Z'):
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002414 regflags |= RF_ICOMBINE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002415 skipchr_keepstart();
2416 break;
2417 case Magic('c'):
2418 regflags |= RF_ICASE;
2419 skipchr_keepstart();
2420 break;
2421 case Magic('C'):
2422 regflags |= RF_NOICASE;
2423 skipchr_keepstart();
2424 break;
2425 case Magic('v'):
2426 reg_magic = MAGIC_ALL;
2427 skipchr_keepstart();
2428 curchr = -1;
2429 break;
2430 case Magic('m'):
2431 reg_magic = MAGIC_ON;
2432 skipchr_keepstart();
2433 curchr = -1;
2434 break;
2435 case Magic('M'):
2436 reg_magic = MAGIC_OFF;
2437 skipchr_keepstart();
2438 curchr = -1;
2439 break;
2440 case Magic('V'):
2441 reg_magic = MAGIC_NONE;
2442 skipchr_keepstart();
2443 curchr = -1;
2444 break;
2445
2446 default:
2447 if (nfa_regpiece() == FAIL)
2448 return FAIL;
2449 if (first == FALSE)
2450 EMIT(NFA_CONCAT);
2451 else
2452 first = FALSE;
2453 break;
2454 }
2455 }
2456
2457 return OK;
2458}
2459
2460/*
2461 * Parse a branch, one or more concats, separated by "\&". It matches the
2462 * last concat, but only if all the preceding concats also match at the same
2463 * position. Examples:
2464 * "foobeep\&..." matches "foo" in "foobeep".
2465 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2466 *
2467 * branch ::= concat
2468 * or concat \& concat
2469 * or concat \& concat \& concat
2470 * etc.
2471 */
2472 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002473nfa_regbranch(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002474{
Bram Moolenaar16299b52013-05-30 18:45:23 +02002475 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002476
Bram Moolenaar16299b52013-05-30 18:45:23 +02002477 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002478
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002479 // First branch, possibly the only one
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002480 if (nfa_regconcat() == FAIL)
2481 return FAIL;
2482
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002483 // Try next concats
Bram Moolenaar890dd052017-12-16 19:59:37 +01002484 while (peekchr() == Magic('&'))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002485 {
2486 skipchr();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002487 // if concat is empty do emit a node
Bram Moolenaar890dd052017-12-16 19:59:37 +01002488 if (old_post_pos == (int)(post_ptr - post_start))
2489 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002490 EMIT(NFA_NOPEN);
2491 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002492 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002493 if (nfa_regconcat() == FAIL)
2494 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002495 // if concat is empty do emit a node
Bram Moolenaar16299b52013-05-30 18:45:23 +02002496 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002497 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002498 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002499 }
2500
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002501 // if a branch is empty, emit one node for it
Bram Moolenaar16299b52013-05-30 18:45:23 +02002502 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002503 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002504
2505 return OK;
2506}
2507
2508/*
2509 * Parse a pattern, one or more branches, separated by "\|". It matches
2510 * anything that matches one of the branches. Example: "foo\|beep" matches
2511 * "foo" and matches "beep". If more than one branch matches, the first one
2512 * is used.
2513 *
2514 * pattern ::= branch
2515 * or branch \| branch
2516 * or branch \| branch \| branch
2517 * etc.
2518 */
2519 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002520nfa_reg(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002521 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002522{
2523 int parno = 0;
2524
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002525 if (paren == REG_PAREN)
2526 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002527 if (regnpar >= NSUBEXP) // Too many `('
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002528 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_parens));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002529 parno = regnpar++;
2530 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002531#ifdef FEAT_SYN_HL
2532 else if (paren == REG_ZPAREN)
2533 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002534 // Make a ZOPEN node.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002535 if (regnzpar >= NSUBEXP)
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002536 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_z));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002537 parno = regnzpar++;
2538 }
2539#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002540
2541 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002542 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002543
2544 while (peekchr() == Magic('|'))
2545 {
2546 skipchr();
2547 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002548 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002549 EMIT(NFA_OR);
2550 }
2551
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002552 // Check for proper termination.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002553 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2554 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002555 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002556 EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2557 reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002558 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002559 EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002560 }
2561 else if (paren == REG_NOPAREN && peekchr() != NUL)
2562 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002563 if (peekchr() == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002564 EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002565 else
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002566 EMSG_RET_FAIL(_(e_nfa_regexp_proper_termination_error));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002567 }
2568 /*
2569 * Here we set the flag allowing back references to this set of
2570 * parentheses.
2571 */
2572 if (paren == REG_PAREN)
2573 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002574 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002575 EMIT(NFA_MOPEN + parno);
2576 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002577#ifdef FEAT_SYN_HL
2578 else if (paren == REG_ZPAREN)
2579 EMIT(NFA_ZOPEN + parno);
2580#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002581
2582 return OK;
2583}
2584
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002585#ifdef DEBUG
2586static char_u code[50];
2587
2588 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002589nfa_set_code(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002590{
2591 int addnl = FALSE;
2592
2593 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2594 {
2595 addnl = TRUE;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002596 c -= NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002597 }
2598
2599 STRCPY(code, "");
2600 switch (c)
2601 {
2602 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2603 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2604 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2605 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2606 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2607 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2608
Bram Moolenaar5714b802013-05-28 22:03:20 +02002609 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2610 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2611 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2612 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2613 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2614 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2615 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2616 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2617 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002618#ifdef FEAT_SYN_HL
2619 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2620 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2621 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2622 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2623 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2624 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2625 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2626 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2627 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2628#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02002629 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2630
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002631 case NFA_PREV_ATOM_NO_WIDTH:
2632 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02002633 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2634 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002635 case NFA_PREV_ATOM_JUST_BEFORE:
2636 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2637 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2638 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002639 case NFA_PREV_ATOM_LIKE_PATTERN:
2640 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2641
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002642 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2643 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002644 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002645 case NFA_START_INVISIBLE_FIRST:
2646 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002647 case NFA_START_INVISIBLE_NEG:
2648 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002649 case NFA_START_INVISIBLE_NEG_FIRST:
2650 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002651 case NFA_START_INVISIBLE_BEFORE:
2652 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002653 case NFA_START_INVISIBLE_BEFORE_FIRST:
2654 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002655 case NFA_START_INVISIBLE_BEFORE_NEG:
2656 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002657 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2658 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002659 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002660 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002661 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002662 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002663
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002664 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2665 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02002666 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002667
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002668 case NFA_MOPEN:
2669 case NFA_MOPEN1:
2670 case NFA_MOPEN2:
2671 case NFA_MOPEN3:
2672 case NFA_MOPEN4:
2673 case NFA_MOPEN5:
2674 case NFA_MOPEN6:
2675 case NFA_MOPEN7:
2676 case NFA_MOPEN8:
2677 case NFA_MOPEN9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002678 STRCPY(code, "NFA_MOPEN(x)");
2679 code[10] = c - NFA_MOPEN + '0';
2680 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002681 case NFA_MCLOSE:
2682 case NFA_MCLOSE1:
2683 case NFA_MCLOSE2:
2684 case NFA_MCLOSE3:
2685 case NFA_MCLOSE4:
2686 case NFA_MCLOSE5:
2687 case NFA_MCLOSE6:
2688 case NFA_MCLOSE7:
2689 case NFA_MCLOSE8:
2690 case NFA_MCLOSE9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002691 STRCPY(code, "NFA_MCLOSE(x)");
2692 code[11] = c - NFA_MCLOSE + '0';
2693 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002694#ifdef FEAT_SYN_HL
2695 case NFA_ZOPEN:
2696 case NFA_ZOPEN1:
2697 case NFA_ZOPEN2:
2698 case NFA_ZOPEN3:
2699 case NFA_ZOPEN4:
2700 case NFA_ZOPEN5:
2701 case NFA_ZOPEN6:
2702 case NFA_ZOPEN7:
2703 case NFA_ZOPEN8:
2704 case NFA_ZOPEN9:
2705 STRCPY(code, "NFA_ZOPEN(x)");
2706 code[10] = c - NFA_ZOPEN + '0';
2707 break;
2708 case NFA_ZCLOSE:
2709 case NFA_ZCLOSE1:
2710 case NFA_ZCLOSE2:
2711 case NFA_ZCLOSE3:
2712 case NFA_ZCLOSE4:
2713 case NFA_ZCLOSE5:
2714 case NFA_ZCLOSE6:
2715 case NFA_ZCLOSE7:
2716 case NFA_ZCLOSE8:
2717 case NFA_ZCLOSE9:
2718 STRCPY(code, "NFA_ZCLOSE(x)");
2719 code[11] = c - NFA_ZCLOSE + '0';
2720 break;
2721#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002722 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2723 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2724 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2725 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02002726 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2727 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002728 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2729 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2730 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2731 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2732 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2733 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2734 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2735 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2736 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2737 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2738 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2739 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2740 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2741 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02002742 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002743
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002744 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002745 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2746 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2747 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
Bram Moolenaar699c1202013-09-25 16:41:54 +02002748 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002749 case NFA_OR: STRCPY(code, "NFA_OR"); break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002750
2751 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2752 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2753 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2754 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2755 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2756 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2757 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2758
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002759 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2760 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2761 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2762 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2763 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2764 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2765 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2766 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2767 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2768 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2769 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2770 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2771 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2772 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2773 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2774 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01002775 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2776 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2777 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002778
2779 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2780 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2781 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2782 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2783 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2784 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2785 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2786 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2787 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2788 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2789 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2790 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2791 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2792 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2793 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2794 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2795 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2796 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2797 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2798 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2799 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2800 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2801 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2802 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2803 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2804 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2805 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002806 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2807 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2808 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2809 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002810
2811 default:
2812 STRCPY(code, "CHAR(x)");
2813 code[5] = c;
2814 }
2815
2816 if (addnl == TRUE)
2817 STRCAT(code, " + NEWLINE ");
2818
2819}
2820
2821#ifdef ENABLE_LOG
2822static FILE *log_fd;
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002823static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002824
2825/*
2826 * Print the postfix notation of the current regexp.
2827 */
2828 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002829nfa_postfix_dump(char_u *expr, int retval)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002830{
2831 int *p;
2832 FILE *f;
2833
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002834 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002835 if (f != NULL)
2836 {
2837 fprintf(f, "\n-------------------------\n");
2838 if (retval == FAIL)
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002839 fprintf(f, ">>> NFA engine failed... \n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002840 else if (retval == OK)
2841 fprintf(f, ">>> NFA engine succeeded !\n");
2842 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002843 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002844 {
2845 nfa_set_code(*p);
2846 fprintf(f, "%s, ", code);
2847 }
2848 fprintf(f, "\"\nPostfix notation (int): ");
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002849 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002850 fprintf(f, "%d ", *p);
2851 fprintf(f, "\n\n");
2852 fclose(f);
2853 }
2854}
2855
2856/*
2857 * Print the NFA starting with a root node "state".
2858 */
2859 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002860nfa_print_state(FILE *debugf, nfa_state_T *state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002861{
Bram Moolenaar152e7892013-05-25 12:28:11 +02002862 garray_T indent;
2863
2864 ga_init2(&indent, 1, 64);
2865 ga_append(&indent, '\0');
2866 nfa_print_state2(debugf, state, &indent);
2867 ga_clear(&indent);
2868}
2869
2870 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002871nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
Bram Moolenaar152e7892013-05-25 12:28:11 +02002872{
2873 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002874
2875 if (state == NULL)
2876 return;
2877
2878 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02002879
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002880 // Output indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002881 p = (char_u *)indent->ga_data;
2882 if (indent->ga_len >= 3)
2883 {
2884 int last = indent->ga_len - 3;
2885 char_u save[2];
2886
2887 STRNCPY(save, &p[last], 2);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00002888 memcpy(&p[last], "+-", 2);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002889 fprintf(debugf, " %s", p);
2890 STRNCPY(&p[last], save, 2);
2891 }
2892 else
2893 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002894
2895 nfa_set_code(state->c);
Bram Moolenaardecd9542013-06-07 16:31:50 +02002896 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
Bram Moolenaar417bad22013-06-07 14:08:30 +02002897 code,
2898 state->c,
2899 abs(state->id),
2900 state->val);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002901 if (state->id < 0)
2902 return;
2903
2904 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02002905
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002906 // grow indent for state->out
Bram Moolenaar152e7892013-05-25 12:28:11 +02002907 indent->ga_len -= 1;
2908 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002909 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002910 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002911 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002912 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002913
2914 nfa_print_state2(debugf, state->out, indent);
2915
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002916 // replace last part of indent for state->out1
Bram Moolenaar152e7892013-05-25 12:28:11 +02002917 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002918 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002919 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002920
2921 nfa_print_state2(debugf, state->out1, indent);
2922
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002923 // shrink indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002924 indent->ga_len -= 3;
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002925 ga_append(indent, NUL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002926}
2927
2928/*
2929 * Print the NFA state machine.
2930 */
2931 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002932nfa_dump(nfa_regprog_T *prog)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002933{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002934 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002935
2936 if (debugf != NULL)
2937 {
Bram Moolenaar152e7892013-05-25 12:28:11 +02002938 nfa_print_state(debugf, prog->start);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002939
Bram Moolenaar473de612013-06-08 18:19:48 +02002940 if (prog->reganch)
2941 fprintf(debugf, "reganch: %d\n", prog->reganch);
2942 if (prog->regstart != NUL)
2943 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2944 prog->regstart, prog->regstart);
2945 if (prog->match_text != NULL)
2946 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002947
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002948 fclose(debugf);
2949 }
2950}
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002951#endif // ENABLE_LOG
2952#endif // DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002953
2954/*
2955 * Parse r.e. @expr and convert it into postfix form.
2956 * Return the postfix string on success, NULL otherwise.
2957 */
2958 static int *
Bram Moolenaar05540972016-01-30 20:31:25 +01002959re2post(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002960{
2961 if (nfa_reg(REG_NOPAREN) == FAIL)
2962 return NULL;
2963 EMIT(NFA_MOPEN);
2964 return post_start;
2965}
2966
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002967// NB. Some of the code below is inspired by Russ's.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002968
2969/*
2970 * Represents an NFA state plus zero or one or two arrows exiting.
2971 * if c == MATCH, no arrows out; matching state.
2972 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2973 * If c < 256, labeled arrow with character c to out.
2974 */
2975
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002976static nfa_state_T *state_ptr; // points to nfa_prog->state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002977
2978/*
2979 * Allocate and initialize nfa_state_T.
2980 */
2981 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002982alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002983{
2984 nfa_state_T *s;
2985
2986 if (istate >= nstate)
2987 return NULL;
2988
2989 s = &state_ptr[istate++];
2990
2991 s->c = c;
2992 s->out = out;
2993 s->out1 = out1;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002994 s->val = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002995
2996 s->id = istate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02002997 s->lastlist[0] = 0;
2998 s->lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002999
3000 return s;
3001}
3002
3003/*
3004 * A partially built NFA without the matching state filled in.
3005 * Frag_T.start points at the start state.
3006 * Frag_T.out is a list of places that need to be set to the
3007 * next state for this fragment.
3008 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003009
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003010// Since the out pointers in the list are always
3011// uninitialized, we use the pointers themselves
3012// as storage for the Ptrlists.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003013typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003014union Ptrlist
3015{
3016 Ptrlist *next;
3017 nfa_state_T *s;
3018};
3019
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003020struct Frag
3021{
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003022 nfa_state_T *start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003023 Ptrlist *out;
3024};
3025typedef struct Frag Frag_T;
3026
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003027/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02003028 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003029 */
3030 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003031frag(nfa_state_T *start, Ptrlist *out)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003032{
Bram Moolenaar053bb602013-05-20 13:55:21 +02003033 Frag_T n;
3034
3035 n.start = start;
3036 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003037 return n;
3038}
3039
3040/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003041 * Create singleton list containing just outp.
3042 */
3043 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003044list1(
3045 nfa_state_T **outp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003046{
3047 Ptrlist *l;
3048
3049 l = (Ptrlist *)outp;
3050 l->next = NULL;
3051 return l;
3052}
3053
3054/*
3055 * Patch the list of states at out to point to start.
3056 */
3057 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003058patch(Ptrlist *l, nfa_state_T *s)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003059{
3060 Ptrlist *next;
3061
3062 for (; l; l = next)
3063 {
3064 next = l->next;
3065 l->s = s;
3066 }
3067}
3068
3069
3070/*
3071 * Join the two lists l1 and l2, returning the combination.
3072 */
3073 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003074append(Ptrlist *l1, Ptrlist *l2)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003075{
3076 Ptrlist *oldl1;
3077
3078 oldl1 = l1;
3079 while (l1->next)
3080 l1 = l1->next;
3081 l1->next = l2;
3082 return oldl1;
3083}
3084
3085/*
3086 * Stack used for transforming postfix form into NFA.
3087 */
3088static Frag_T empty;
3089
3090 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003091st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003092{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003093#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003094 FILE *df;
3095 int *p2;
3096
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003097 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003098 if (df)
3099 {
3100 fprintf(df, "Error popping the stack!\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003101# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003102 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003103# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003104 fprintf(df, "Postfix form is: ");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003105# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003106 for (p2 = postfix; p2 < end; p2++)
3107 {
3108 nfa_set_code(*p2);
3109 fprintf(df, "%s, ", code);
3110 }
3111 nfa_set_code(*p);
3112 fprintf(df, "\nCurrent position is: ");
3113 for (p2 = postfix; p2 <= p; p2 ++)
3114 {
3115 nfa_set_code(*p2);
3116 fprintf(df, "%s, ", code);
3117 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02003118# else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003119 for (p2 = postfix; p2 < end; p2++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003120 fprintf(df, "%d, ", *p2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003121 fprintf(df, "\nCurrent position is: ");
3122 for (p2 = postfix; p2 <= p; p2 ++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003123 fprintf(df, "%d, ", *p2);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003124# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003125 fprintf(df, "\n--------------------------\n");
3126 fclose(df);
3127 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003128#endif
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003129 emsg(_(e_nfa_regexp_could_not_pop_stack));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003130}
3131
3132/*
3133 * Push an item onto the stack.
3134 */
3135 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003136st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003137{
3138 Frag_T *stackp = *p;
3139
3140 if (stackp >= stack_end)
3141 return;
3142 *stackp = s;
3143 *p = *p + 1;
3144}
3145
3146/*
3147 * Pop an item from the stack.
3148 */
3149 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003150st_pop(Frag_T **p, Frag_T *stack)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003151{
3152 Frag_T *stackp;
3153
3154 *p = *p - 1;
3155 stackp = *p;
3156 if (stackp < stack)
3157 return empty;
3158 return **p;
3159}
3160
3161/*
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003162 * Estimate the maximum byte length of anything matching "state".
3163 * When unknown or unlimited return -1.
3164 */
3165 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01003166nfa_max_width(nfa_state_T *startstate, int depth)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003167{
3168 int l, r;
3169 nfa_state_T *state = startstate;
3170 int len = 0;
3171
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003172 // detect looping in a NFA_SPLIT
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003173 if (depth > 4)
3174 return -1;
3175
Bram Moolenaarfe70acb2013-06-21 18:31:23 +02003176 while (state != NULL)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003177 {
3178 switch (state->c)
3179 {
3180 case NFA_END_INVISIBLE:
3181 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003182 // the end, return what we have
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003183 return len;
3184
3185 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003186 // two alternatives, use the maximum
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003187 l = nfa_max_width(state->out, depth + 1);
3188 r = nfa_max_width(state->out1, depth + 1);
3189 if (l < 0 || r < 0)
3190 return -1;
3191 return len + (l > r ? l : r);
3192
3193 case NFA_ANY:
3194 case NFA_START_COLL:
3195 case NFA_START_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003196 // matches some character, including composing chars
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003197 if (enc_utf8)
3198 len += MB_MAXBYTES;
3199 else if (has_mbyte)
3200 len += 2;
3201 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003202 ++len;
3203 if (state->c != NFA_ANY)
3204 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003205 // skip over the characters
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003206 state = state->out1->out;
3207 continue;
3208 }
3209 break;
3210
3211 case NFA_DIGIT:
3212 case NFA_WHITE:
3213 case NFA_HEX:
3214 case NFA_OCTAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003215 // ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003216 ++len;
3217 break;
3218
3219 case NFA_IDENT:
3220 case NFA_SIDENT:
3221 case NFA_KWORD:
3222 case NFA_SKWORD:
3223 case NFA_FNAME:
3224 case NFA_SFNAME:
3225 case NFA_PRINT:
3226 case NFA_SPRINT:
3227 case NFA_NWHITE:
3228 case NFA_NDIGIT:
3229 case NFA_NHEX:
3230 case NFA_NOCTAL:
3231 case NFA_WORD:
3232 case NFA_NWORD:
3233 case NFA_HEAD:
3234 case NFA_NHEAD:
3235 case NFA_ALPHA:
3236 case NFA_NALPHA:
3237 case NFA_LOWER:
3238 case NFA_NLOWER:
3239 case NFA_UPPER:
3240 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02003241 case NFA_LOWER_IC:
3242 case NFA_NLOWER_IC:
3243 case NFA_UPPER_IC:
3244 case NFA_NUPPER_IC:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02003245 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003246 // possibly non-ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003247 if (has_mbyte)
3248 len += 3;
3249 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003250 ++len;
3251 break;
3252
3253 case NFA_START_INVISIBLE:
3254 case NFA_START_INVISIBLE_NEG:
3255 case NFA_START_INVISIBLE_BEFORE:
3256 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003257 // zero-width, out1 points to the END state
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003258 state = state->out1->out;
3259 continue;
3260
3261 case NFA_BACKREF1:
3262 case NFA_BACKREF2:
3263 case NFA_BACKREF3:
3264 case NFA_BACKREF4:
3265 case NFA_BACKREF5:
3266 case NFA_BACKREF6:
3267 case NFA_BACKREF7:
3268 case NFA_BACKREF8:
3269 case NFA_BACKREF9:
3270#ifdef FEAT_SYN_HL
3271 case NFA_ZREF1:
3272 case NFA_ZREF2:
3273 case NFA_ZREF3:
3274 case NFA_ZREF4:
3275 case NFA_ZREF5:
3276 case NFA_ZREF6:
3277 case NFA_ZREF7:
3278 case NFA_ZREF8:
3279 case NFA_ZREF9:
3280#endif
3281 case NFA_NEWL:
3282 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003283 // unknown width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003284 return -1;
3285
3286 case NFA_BOL:
3287 case NFA_EOL:
3288 case NFA_BOF:
3289 case NFA_EOF:
3290 case NFA_BOW:
3291 case NFA_EOW:
3292 case NFA_MOPEN:
3293 case NFA_MOPEN1:
3294 case NFA_MOPEN2:
3295 case NFA_MOPEN3:
3296 case NFA_MOPEN4:
3297 case NFA_MOPEN5:
3298 case NFA_MOPEN6:
3299 case NFA_MOPEN7:
3300 case NFA_MOPEN8:
3301 case NFA_MOPEN9:
3302#ifdef FEAT_SYN_HL
3303 case NFA_ZOPEN:
3304 case NFA_ZOPEN1:
3305 case NFA_ZOPEN2:
3306 case NFA_ZOPEN3:
3307 case NFA_ZOPEN4:
3308 case NFA_ZOPEN5:
3309 case NFA_ZOPEN6:
3310 case NFA_ZOPEN7:
3311 case NFA_ZOPEN8:
3312 case NFA_ZOPEN9:
3313 case NFA_ZCLOSE:
3314 case NFA_ZCLOSE1:
3315 case NFA_ZCLOSE2:
3316 case NFA_ZCLOSE3:
3317 case NFA_ZCLOSE4:
3318 case NFA_ZCLOSE5:
3319 case NFA_ZCLOSE6:
3320 case NFA_ZCLOSE7:
3321 case NFA_ZCLOSE8:
3322 case NFA_ZCLOSE9:
3323#endif
3324 case NFA_MCLOSE:
3325 case NFA_MCLOSE1:
3326 case NFA_MCLOSE2:
3327 case NFA_MCLOSE3:
3328 case NFA_MCLOSE4:
3329 case NFA_MCLOSE5:
3330 case NFA_MCLOSE6:
3331 case NFA_MCLOSE7:
3332 case NFA_MCLOSE8:
3333 case NFA_MCLOSE9:
3334 case NFA_NOPEN:
3335 case NFA_NCLOSE:
3336
3337 case NFA_LNUM_GT:
3338 case NFA_LNUM_LT:
3339 case NFA_COL_GT:
3340 case NFA_COL_LT:
3341 case NFA_VCOL_GT:
3342 case NFA_VCOL_LT:
3343 case NFA_MARK_GT:
3344 case NFA_MARK_LT:
3345 case NFA_VISUAL:
3346 case NFA_LNUM:
3347 case NFA_CURSOR:
3348 case NFA_COL:
3349 case NFA_VCOL:
3350 case NFA_MARK:
3351
3352 case NFA_ZSTART:
3353 case NFA_ZEND:
3354 case NFA_OPT_CHARS:
Bram Moolenaar699c1202013-09-25 16:41:54 +02003355 case NFA_EMPTY:
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003356 case NFA_START_PATTERN:
3357 case NFA_END_PATTERN:
3358 case NFA_COMPOSING:
3359 case NFA_END_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003360 // zero-width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003361 break;
3362
3363 default:
3364 if (state->c < 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003365 // don't know what this is
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003366 return -1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003367 // normal character
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003368 len += MB_CHAR2LEN(state->c);
3369 break;
3370 }
3371
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003372 // normal way to continue
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003373 state = state->out;
3374 }
3375
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003376 // unrecognized, "cannot happen"
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003377 return -1;
3378}
Bram Moolenaar1e02e662013-06-08 23:26:27 +02003379
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003380/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003381 * Convert a postfix form into its equivalent NFA.
3382 * Return the NFA start state on success, NULL otherwise.
3383 */
3384 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01003385post2nfa(int *postfix, int *end, int nfa_calc_size)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003386{
3387 int *p;
3388 int mopen;
3389 int mclose;
3390 Frag_T *stack = NULL;
3391 Frag_T *stackp = NULL;
3392 Frag_T *stack_end = NULL;
3393 Frag_T e1;
3394 Frag_T e2;
3395 Frag_T e;
3396 nfa_state_T *s;
3397 nfa_state_T *s1;
3398 nfa_state_T *matchstate;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003399 nfa_state_T *ret = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003400
3401 if (postfix == NULL)
3402 return NULL;
3403
Bram Moolenaar053bb602013-05-20 13:55:21 +02003404#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003405#define POP() st_pop(&stackp, stack); \
3406 if (stackp < stack) \
3407 { \
3408 st_error(postfix, end, p); \
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003409 vim_free(stack); \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003410 return NULL; \
3411 }
3412
3413 if (nfa_calc_size == FALSE)
3414 {
Bram Moolenaar32aa1022019-11-02 22:54:41 +01003415 // Allocate space for the stack. Max states on the stack: "nstate".
Bram Moolenaarc799fe22019-05-28 23:08:19 +02003416 stack = ALLOC_MULT(Frag_T, nstate + 1);
Bram Moolenaarc57463c2018-12-26 22:04:41 +01003417 if (stack == NULL)
3418 return NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003419 stackp = stack;
Bram Moolenaare3c7b862013-05-20 21:57:03 +02003420 stack_end = stack + (nstate + 1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003421 }
3422
3423 for (p = postfix; p < end; ++p)
3424 {
3425 switch (*p)
3426 {
3427 case NFA_CONCAT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003428 // Concatenation.
3429 // Pay attention: this operator does not exist in the r.e. itself
3430 // (it is implicit, really). It is added when r.e. is translated
3431 // to postfix form in re2post().
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003432 if (nfa_calc_size == TRUE)
3433 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003434 // nstate += 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003435 break;
3436 }
3437 e2 = POP();
3438 e1 = POP();
3439 patch(e1.out, e2.start);
3440 PUSH(frag(e1.start, e2.out));
3441 break;
3442
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003443 case NFA_OR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003444 // Alternation
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003445 if (nfa_calc_size == TRUE)
3446 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003447 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003448 break;
3449 }
3450 e2 = POP();
3451 e1 = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003452 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003453 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003454 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003455 PUSH(frag(s, append(e1.out, e2.out)));
3456 break;
3457
3458 case NFA_STAR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003459 // Zero or more, prefer more
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003460 if (nfa_calc_size == TRUE)
3461 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003462 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003463 break;
3464 }
3465 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003466 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003467 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003468 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003469 patch(e.out, s);
3470 PUSH(frag(s, list1(&s->out1)));
3471 break;
3472
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003473 case NFA_STAR_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003474 // Zero or more, prefer zero
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003475 if (nfa_calc_size == TRUE)
3476 {
3477 nstate++;
3478 break;
3479 }
3480 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003481 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003482 if (s == NULL)
3483 goto theend;
3484 patch(e.out, s);
3485 PUSH(frag(s, list1(&s->out)));
3486 break;
3487
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003488 case NFA_QUEST:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003489 // one or zero atoms=> greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003490 if (nfa_calc_size == TRUE)
3491 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003492 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003493 break;
3494 }
3495 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003496 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003497 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003498 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003499 PUSH(frag(s, append(e.out, list1(&s->out1))));
3500 break;
3501
3502 case NFA_QUEST_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003503 // zero or one atoms => non-greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003504 if (nfa_calc_size == TRUE)
3505 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003506 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003507 break;
3508 }
3509 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003510 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003511 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003512 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003513 PUSH(frag(s, append(e.out, list1(&s->out))));
3514 break;
3515
Bram Moolenaar417bad22013-06-07 14:08:30 +02003516 case NFA_END_COLL:
3517 case NFA_END_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003518 // On the stack is the sequence starting with NFA_START_COLL or
3519 // NFA_START_NEG_COLL and all possible characters. Patch it to
3520 // add the output to the start.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003521 if (nfa_calc_size == TRUE)
3522 {
3523 nstate++;
3524 break;
3525 }
3526 e = POP();
3527 s = alloc_state(NFA_END_COLL, NULL, NULL);
3528 if (s == NULL)
3529 goto theend;
3530 patch(e.out, s);
3531 e.start->out1 = s;
3532 PUSH(frag(e.start, list1(&s->out)));
3533 break;
3534
3535 case NFA_RANGE:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003536 // Before this are two characters, the low and high end of a
3537 // range. Turn them into two states with MIN and MAX.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003538 if (nfa_calc_size == TRUE)
3539 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003540 // nstate += 0;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003541 break;
3542 }
3543 e2 = POP();
3544 e1 = POP();
3545 e2.start->val = e2.start->c;
3546 e2.start->c = NFA_RANGE_MAX;
3547 e1.start->val = e1.start->c;
3548 e1.start->c = NFA_RANGE_MIN;
3549 patch(e1.out, e2.start);
3550 PUSH(frag(e1.start, e2.out));
3551 break;
3552
Bram Moolenaar699c1202013-09-25 16:41:54 +02003553 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003554 // 0-length, used in a repetition with max/min count of 0
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003555 if (nfa_calc_size == TRUE)
3556 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003557 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003558 break;
3559 }
Bram Moolenaar699c1202013-09-25 16:41:54 +02003560 s = alloc_state(NFA_EMPTY, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003561 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003562 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003563 PUSH(frag(s, list1(&s->out)));
3564 break;
3565
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003566 case NFA_OPT_CHARS:
3567 {
3568 int n;
3569
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003570 // \%[abc] implemented as:
3571 // NFA_SPLIT
3572 // +-CHAR(a)
3573 // | +-NFA_SPLIT
3574 // | +-CHAR(b)
3575 // | | +-NFA_SPLIT
3576 // | | +-CHAR(c)
3577 // | | | +-next
3578 // | | +- next
3579 // | +- next
3580 // +- next
3581 n = *++p; // get number of characters
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003582 if (nfa_calc_size == TRUE)
3583 {
3584 nstate += n;
3585 break;
3586 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003587 s = NULL; // avoid compiler warning
3588 e1.out = NULL; // stores list with out1's
3589 s1 = NULL; // previous NFA_SPLIT to connect to
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003590 while (n-- > 0)
3591 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003592 e = POP(); // get character
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003593 s = alloc_state(NFA_SPLIT, e.start, NULL);
3594 if (s == NULL)
3595 goto theend;
3596 if (e1.out == NULL)
3597 e1 = e;
3598 patch(e.out, s1);
3599 append(e1.out, list1(&s->out1));
3600 s1 = s;
3601 }
3602 PUSH(frag(s, e1.out));
3603 break;
3604 }
3605
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003606 case NFA_PREV_ATOM_NO_WIDTH:
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003607 case NFA_PREV_ATOM_NO_WIDTH_NEG:
Bram Moolenaar61602c52013-06-01 19:54:43 +02003608 case NFA_PREV_ATOM_JUST_BEFORE:
3609 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02003610 case NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003611 {
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003612 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3613 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
Bram Moolenaar87953742013-06-05 18:52:40 +02003614 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
Bram Moolenaardecd9542013-06-07 16:31:50 +02003615 int start_state;
3616 int end_state;
Bram Moolenaar87953742013-06-05 18:52:40 +02003617 int n = 0;
3618 nfa_state_T *zend;
3619 nfa_state_T *skip;
3620
Bram Moolenaardecd9542013-06-07 16:31:50 +02003621 switch (*p)
Bram Moolenaar87953742013-06-05 18:52:40 +02003622 {
Bram Moolenaardecd9542013-06-07 16:31:50 +02003623 case NFA_PREV_ATOM_NO_WIDTH:
3624 start_state = NFA_START_INVISIBLE;
3625 end_state = NFA_END_INVISIBLE;
3626 break;
3627 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3628 start_state = NFA_START_INVISIBLE_NEG;
3629 end_state = NFA_END_INVISIBLE_NEG;
3630 break;
3631 case NFA_PREV_ATOM_JUST_BEFORE:
3632 start_state = NFA_START_INVISIBLE_BEFORE;
3633 end_state = NFA_END_INVISIBLE;
3634 break;
3635 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3636 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3637 end_state = NFA_END_INVISIBLE_NEG;
3638 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003639 default: // NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaardecd9542013-06-07 16:31:50 +02003640 start_state = NFA_START_PATTERN;
3641 end_state = NFA_END_PATTERN;
3642 break;
Bram Moolenaar87953742013-06-05 18:52:40 +02003643 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003644
3645 if (before)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003646 n = *++p; // get the count
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003647
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003648 // The \@= operator: match the preceding atom with zero width.
3649 // The \@! operator: no match for the preceding atom.
3650 // The \@<= operator: match for the preceding atom.
3651 // The \@<! operator: no match for the preceding atom.
3652 // Surrounds the preceding atom with START_INVISIBLE and
3653 // END_INVISIBLE, similarly to MOPEN.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003654
3655 if (nfa_calc_size == TRUE)
3656 {
Bram Moolenaar87953742013-06-05 18:52:40 +02003657 nstate += pattern ? 4 : 2;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003658 break;
3659 }
3660 e = POP();
Bram Moolenaar87953742013-06-05 18:52:40 +02003661 s1 = alloc_state(end_state, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003662 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003663 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003664
Bram Moolenaar87953742013-06-05 18:52:40 +02003665 s = alloc_state(start_state, e.start, s1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003666 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003667 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003668 if (pattern)
3669 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003670 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02003671 skip = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003672 if (skip == NULL)
3673 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003674 zend = alloc_state(NFA_ZEND, s1, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003675 if (zend == NULL)
3676 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003677 s1->out= skip;
3678 patch(e.out, zend);
3679 PUSH(frag(s, list1(&skip->out)));
Bram Moolenaar61602c52013-06-01 19:54:43 +02003680 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003681 else
3682 {
3683 patch(e.out, s1);
3684 PUSH(frag(s, list1(&s1->out)));
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003685 if (before)
3686 {
3687 if (n <= 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003688 // See if we can guess the maximum width, it avoids a
3689 // lot of pointless tries.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003690 n = nfa_max_width(e.start, 0);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003691 s->val = n; // store the count
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003692 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003693 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003694 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003695 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003696
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003697 case NFA_COMPOSING: // char with composing char
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003698#if 0
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003699 // TODO
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003700 if (regflags & RF_ICOMBINE)
3701 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003702 // use the base character only
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003703 }
3704#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003705 // FALLTHROUGH
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003706
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003707 case NFA_MOPEN: // \( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003708 case NFA_MOPEN1:
3709 case NFA_MOPEN2:
3710 case NFA_MOPEN3:
3711 case NFA_MOPEN4:
3712 case NFA_MOPEN5:
3713 case NFA_MOPEN6:
3714 case NFA_MOPEN7:
3715 case NFA_MOPEN8:
3716 case NFA_MOPEN9:
3717#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003718 case NFA_ZOPEN: // \z( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003719 case NFA_ZOPEN1:
3720 case NFA_ZOPEN2:
3721 case NFA_ZOPEN3:
3722 case NFA_ZOPEN4:
3723 case NFA_ZOPEN5:
3724 case NFA_ZOPEN6:
3725 case NFA_ZOPEN7:
3726 case NFA_ZOPEN8:
3727 case NFA_ZOPEN9:
3728#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003729 case NFA_NOPEN: // \%( \) "Invisible Submatch"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003730 if (nfa_calc_size == TRUE)
3731 {
3732 nstate += 2;
3733 break;
3734 }
3735
3736 mopen = *p;
3737 switch (*p)
3738 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003739 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3740#ifdef FEAT_SYN_HL
3741 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3742 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3743 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3744 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3745 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3746 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3747 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3748 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3749 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3750 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3751#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003752 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003753 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003754 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003755 mclose = *p + NSUBEXP;
3756 break;
3757 }
3758
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003759 // Allow "NFA_MOPEN" as a valid postfix representation for
3760 // the empty regexp "". In this case, the NFA will be
3761 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3762 // empty groups of parenthesis, and empty mbyte chars
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003763 if (stackp == stack)
3764 {
Bram Moolenaar525666f2013-06-02 16:40:55 +02003765 s = alloc_state(mopen, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003766 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003767 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003768 s1 = alloc_state(mclose, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003769 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003770 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003771 patch(list1(&s->out), s1);
3772 PUSH(frag(s, list1(&s1->out)));
3773 break;
3774 }
3775
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003776 // At least one node was emitted before NFA_MOPEN, so
3777 // at least one node will be between NFA_MOPEN and NFA_MCLOSE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003778 e = POP();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003779 s = alloc_state(mopen, e.start, NULL); // `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003780 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003781 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003782
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003783 s1 = alloc_state(mclose, NULL, NULL); // `)'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003784 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003785 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003786 patch(e.out, s1);
3787
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003788 if (mopen == NFA_COMPOSING)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003789 // COMPOSING->out1 = END_COMPOSING
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003790 patch(list1(&s->out1), s1);
3791
3792 PUSH(frag(s, list1(&s1->out)));
3793 break;
3794
Bram Moolenaar5714b802013-05-28 22:03:20 +02003795 case NFA_BACKREF1:
3796 case NFA_BACKREF2:
3797 case NFA_BACKREF3:
3798 case NFA_BACKREF4:
3799 case NFA_BACKREF5:
3800 case NFA_BACKREF6:
3801 case NFA_BACKREF7:
3802 case NFA_BACKREF8:
3803 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003804#ifdef FEAT_SYN_HL
3805 case NFA_ZREF1:
3806 case NFA_ZREF2:
3807 case NFA_ZREF3:
3808 case NFA_ZREF4:
3809 case NFA_ZREF5:
3810 case NFA_ZREF6:
3811 case NFA_ZREF7:
3812 case NFA_ZREF8:
3813 case NFA_ZREF9:
3814#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02003815 if (nfa_calc_size == TRUE)
3816 {
3817 nstate += 2;
3818 break;
3819 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003820 s = alloc_state(*p, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003821 if (s == NULL)
3822 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003823 s1 = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003824 if (s1 == NULL)
3825 goto theend;
3826 patch(list1(&s->out), s1);
3827 PUSH(frag(s, list1(&s1->out)));
3828 break;
3829
Bram Moolenaar423532e2013-05-29 21:14:42 +02003830 case NFA_LNUM:
3831 case NFA_LNUM_GT:
3832 case NFA_LNUM_LT:
3833 case NFA_VCOL:
3834 case NFA_VCOL_GT:
3835 case NFA_VCOL_LT:
3836 case NFA_COL:
3837 case NFA_COL_GT:
3838 case NFA_COL_LT:
Bram Moolenaar044aa292013-06-04 21:27:38 +02003839 case NFA_MARK:
3840 case NFA_MARK_GT:
3841 case NFA_MARK_LT:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003842 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003843 int n = *++p; // lnum, col or mark name
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003844
Bram Moolenaar423532e2013-05-29 21:14:42 +02003845 if (nfa_calc_size == TRUE)
3846 {
3847 nstate += 1;
3848 break;
3849 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003850 s = alloc_state(p[-1], NULL, NULL);
Bram Moolenaar423532e2013-05-29 21:14:42 +02003851 if (s == NULL)
3852 goto theend;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003853 s->val = n;
Bram Moolenaar423532e2013-05-29 21:14:42 +02003854 PUSH(frag(s, list1(&s->out)));
3855 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003856 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02003857
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003858 case NFA_ZSTART:
3859 case NFA_ZEND:
3860 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003861 // Operands
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003862 if (nfa_calc_size == TRUE)
3863 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003864 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003865 break;
3866 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003867 s = alloc_state(*p, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003868 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003869 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003870 PUSH(frag(s, list1(&s->out)));
3871 break;
3872
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003873 } // switch(*p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003874
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003875 } // for(p = postfix; *p; ++p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003876
3877 if (nfa_calc_size == TRUE)
3878 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003879 nstate++;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003880 goto theend; // Return value when counting size is ignored anyway
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003881 }
3882
3883 e = POP();
3884 if (stackp != stack)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003885 {
3886 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003887 EMSG_RET_NULL(_(e_nfa_regexp_while_converting_from_postfix_to_nfa_too_many_stats_left_on_stack));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003888 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003889
3890 if (istate >= nstate)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003891 {
3892 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003893 EMSG_RET_NULL(_(e_nfa_regexp_not_enough_space_to_store_whole_nfa));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003894 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003895
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003896 matchstate = &state_ptr[istate++]; // the match state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003897 matchstate->c = NFA_MATCH;
3898 matchstate->out = matchstate->out1 = NULL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003899 matchstate->id = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003900
3901 patch(e.out, matchstate);
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003902 ret = e.start;
3903
3904theend:
3905 vim_free(stack);
3906 return ret;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003907
3908#undef POP1
3909#undef PUSH1
3910#undef POP2
3911#undef PUSH2
3912#undef POP
3913#undef PUSH
3914}
3915
Bram Moolenaara2947e22013-06-11 22:44:09 +02003916/*
3917 * After building the NFA program, inspect it to add optimization hints.
3918 */
3919 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003920nfa_postprocess(nfa_regprog_T *prog)
Bram Moolenaara2947e22013-06-11 22:44:09 +02003921{
3922 int i;
3923 int c;
3924
3925 for (i = 0; i < prog->nstate; ++i)
3926 {
3927 c = prog->state[i].c;
3928 if (c == NFA_START_INVISIBLE
3929 || c == NFA_START_INVISIBLE_NEG
3930 || c == NFA_START_INVISIBLE_BEFORE
3931 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3932 {
3933 int directly;
3934
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003935 // Do it directly when what follows is possibly the end of the
3936 // match.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003937 if (match_follows(prog->state[i].out1->out, 0))
3938 directly = TRUE;
3939 else
3940 {
3941 int ch_invisible = failure_chance(prog->state[i].out, 0);
3942 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3943
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003944 // Postpone when the invisible match is expensive or has a
3945 // lower chance of failing.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003946 if (c == NFA_START_INVISIBLE_BEFORE
3947 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3948 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003949 // "before" matches are very expensive when
3950 // unbounded, always prefer what follows then,
3951 // unless what follows will always match.
3952 // Otherwise strongly prefer what follows.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003953 if (prog->state[i].val <= 0 && ch_follows > 0)
3954 directly = FALSE;
3955 else
3956 directly = ch_follows * 10 < ch_invisible;
3957 }
3958 else
3959 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003960 // normal invisible, first do the one with the
3961 // highest failure chance
Bram Moolenaara2947e22013-06-11 22:44:09 +02003962 directly = ch_follows < ch_invisible;
3963 }
3964 }
3965 if (directly)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003966 // switch to the _FIRST state
Bram Moolenaara2947e22013-06-11 22:44:09 +02003967 ++prog->state[i].c;
3968 }
3969 }
3970}
3971
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003972/////////////////////////////////////////////////////////////////
3973// NFA execution code.
3974/////////////////////////////////////////////////////////////////
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003975
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003976typedef struct
3977{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003978 int in_use; // number of subexpr with useful info
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003979
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003980 // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003981 union
3982 {
3983 struct multipos
3984 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01003985 linenr_T start_lnum;
3986 linenr_T end_lnum;
3987 colnr_T start_col;
3988 colnr_T end_col;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003989 } multi[NSUBEXP];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003990 struct linepos
3991 {
3992 char_u *start;
3993 char_u *end;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003994 } line[NSUBEXP];
3995 } list;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003996} regsub_T;
3997
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003998typedef struct
3999{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004000 regsub_T norm; // \( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004001#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004002 regsub_T synt; // \z( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004003#endif
4004} regsubs_T;
4005
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004006// nfa_pim_T stores a Postponed Invisible Match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02004007typedef struct nfa_pim_S nfa_pim_T;
4008struct nfa_pim_S
4009{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004010 int result; // NFA_PIM_*, see below
4011 nfa_state_T *state; // the invisible match start state
4012 regsubs_T subs; // submatch info, only party used
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004013 union
4014 {
4015 lpos_T pos;
4016 char_u *ptr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004017 } end; // where the match must end
Bram Moolenaara2d95102013-06-04 14:23:05 +02004018};
4019
// Values for "result" in nfa_pim_T.
#define NFA_PIM_UNUSED   0	// pim not used
#define NFA_PIM_TODO     1	// pim not done yet
#define NFA_PIM_MATCH    2	// pim executed, matches
#define NFA_PIM_NOMATCH  3	// pim executed, no match
Bram Moolenaara2d95102013-06-04 14:23:05 +02004025
4026
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004027// nfa_thread_T contains execution information of a NFA state
Bram Moolenaar4b417062013-05-25 20:19:50 +02004028typedef struct
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004029{
4030 nfa_state_T *state;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004031 int count;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004032 nfa_pim_T pim; // if pim.result != NFA_PIM_UNUSED: postponed
4033 // invisible match
4034 regsubs_T subs; // submatch info, only party used
Bram Moolenaar4b417062013-05-25 20:19:50 +02004035} nfa_thread_T;
4036
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004037// nfa_list_T contains the alternative NFA execution states.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004038typedef struct
4039{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004040 nfa_thread_T *t; // allocated array of states
4041 int n; // nr of states currently in "t"
4042 int len; // max nr of states in "t"
4043 int id; // ID of the list
4044 int has_pim; // TRUE when any state has a PIM
Bram Moolenaar4b417062013-05-25 20:19:50 +02004045} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004046
Bram Moolenaar5714b802013-05-28 22:03:20 +02004047#ifdef ENABLE_LOG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004048static void log_subexpr(regsub_T *sub);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004049
4050 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004051log_subsexpr(regsubs_T *subs)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004052{
4053 log_subexpr(&subs->norm);
4054# ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004055 if (rex.nfa_has_zsubexpr)
Bram Moolenaar6d3a5d72013-06-06 18:04:51 +02004056 log_subexpr(&subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004057# endif
4058}
4059
Bram Moolenaar5714b802013-05-28 22:03:20 +02004060 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004061log_subexpr(regsub_T *sub)
Bram Moolenaar5714b802013-05-28 22:03:20 +02004062{
4063 int j;
4064
4065 for (j = 0; j < sub->in_use; j++)
4066 if (REG_MULTI)
Bram Moolenaar87953742013-06-05 18:52:40 +02004067 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004068 j,
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004069 sub->list.multi[j].start_col,
4070 (int)sub->list.multi[j].start_lnum,
4071 sub->list.multi[j].end_col,
4072 (int)sub->list.multi[j].end_lnum);
Bram Moolenaar5714b802013-05-28 22:03:20 +02004073 else
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004074 {
4075 char *s = (char *)sub->list.line[j].start;
4076 char *e = (char *)sub->list.line[j].end;
4077
Bram Moolenaar87953742013-06-05 18:52:40 +02004078 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004079 j,
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004080 s == NULL ? "NULL" : s,
4081 e == NULL ? "NULL" : e);
4082 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004083}
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004084
4085 static char *
Bram Moolenaar05540972016-01-30 20:31:25 +01004086pim_info(nfa_pim_T *pim)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004087{
4088 static char buf[30];
4089
4090 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4091 buf[0] = NUL;
4092 else
4093 {
4094 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
Bram Moolenaar0270f382018-07-17 05:43:58 +02004095 : (int)(pim->end.ptr - rex.input));
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004096 }
4097 return buf;
4098}
4099
Bram Moolenaar5714b802013-05-28 22:03:20 +02004100#endif
4101
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004102// Used during execution: whether a match has been found.
Bram Moolenaar2338c322018-07-08 19:07:19 +02004103static int nfa_match;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004104#ifdef FEAT_RELTIME
4105static proftime_T *nfa_time_limit;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02004106static int *nfa_timed_out;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004107static int nfa_time_count;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004108#endif
Bram Moolenaar4b417062013-05-25 20:19:50 +02004109
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004110static void copy_sub(regsub_T *to, regsub_T *from);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004111static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004112
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004113/*
4114 * Copy postponed invisible match info from "from" to "to".
4115 */
4116 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004117copy_pim(nfa_pim_T *to, nfa_pim_T *from)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004118{
4119 to->result = from->result;
4120 to->state = from->state;
4121 copy_sub(&to->subs.norm, &from->subs.norm);
4122#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004123 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004124 copy_sub(&to->subs.synt, &from->subs.synt);
4125#endif
4126 to->end = from->end;
4127}
4128
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004129 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004130clear_sub(regsub_T *sub)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004131{
4132 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004133 // Use 0xff to set lnum to -1
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004134 vim_memset(sub->list.multi, 0xff,
Bram Moolenaar0270f382018-07-17 05:43:58 +02004135 sizeof(struct multipos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004136 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004137 vim_memset(sub->list.line, 0,
4138 sizeof(struct linepos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004139 sub->in_use = 0;
4140}
4141
4142/*
4143 * Copy the submatches from "from" to "to".
4144 */
4145 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004146copy_sub(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004147{
4148 to->in_use = from->in_use;
4149 if (from->in_use > 0)
4150 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004151 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004152 if (REG_MULTI)
4153 mch_memmove(&to->list.multi[0],
4154 &from->list.multi[0],
4155 sizeof(struct multipos) * from->in_use);
4156 else
4157 mch_memmove(&to->list.line[0],
4158 &from->list.line[0],
4159 sizeof(struct linepos) * from->in_use);
4160 }
4161}
4162
4163/*
4164 * Like copy_sub() but exclude the main match.
4165 */
4166 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004167copy_sub_off(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004168{
4169 if (to->in_use < from->in_use)
4170 to->in_use = from->in_use;
4171 if (from->in_use > 1)
4172 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004173 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004174 if (REG_MULTI)
4175 mch_memmove(&to->list.multi[1],
4176 &from->list.multi[1],
4177 sizeof(struct multipos) * (from->in_use - 1));
4178 else
4179 mch_memmove(&to->list.line[1],
4180 &from->list.line[1],
4181 sizeof(struct linepos) * (from->in_use - 1));
4182 }
4183}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004184
Bram Moolenaar428e9872013-05-30 17:05:39 +02004185/*
Bram Moolenaarf2118842013-09-25 18:16:38 +02004186 * Like copy_sub() but only do the end of the main match if \ze is present.
4187 */
4188 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004189copy_ze_off(regsub_T *to, regsub_T *from)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004190{
Bram Moolenaar0270f382018-07-17 05:43:58 +02004191 if (rex.nfa_has_zend)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004192 {
4193 if (REG_MULTI)
4194 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004195 if (from->list.multi[0].end_lnum >= 0)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004196 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004197 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4198 to->list.multi[0].end_col = from->list.multi[0].end_col;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004199 }
Bram Moolenaarf2118842013-09-25 18:16:38 +02004200 }
4201 else
4202 {
4203 if (from->list.line[0].end != NULL)
4204 to->list.line[0].end = from->list.line[0].end;
4205 }
4206 }
4207}
4208
4209/*
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004210 * Return TRUE if "sub1" and "sub2" have the same start positions.
Bram Moolenaaree482532014-05-13 15:56:51 +02004211 * When using back-references also check the end position.
Bram Moolenaar428e9872013-05-30 17:05:39 +02004212 */
4213 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004214sub_equal(regsub_T *sub1, regsub_T *sub2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004215{
4216 int i;
4217 int todo;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004218 linenr_T s1;
4219 linenr_T s2;
4220 char_u *sp1;
4221 char_u *sp2;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004222
4223 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4224 if (REG_MULTI)
4225 {
4226 for (i = 0; i < todo; ++i)
4227 {
4228 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004229 s1 = sub1->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004230 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004231 s1 = -1;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004232 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004233 s2 = sub2->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004234 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004235 s2 = -1;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004236 if (s1 != s2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004237 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004238 if (s1 != -1 && sub1->list.multi[i].start_col
4239 != sub2->list.multi[i].start_col)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004240 return FALSE;
Bram Moolenaaree482532014-05-13 15:56:51 +02004241
Bram Moolenaar0270f382018-07-17 05:43:58 +02004242 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004243 {
4244 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004245 s1 = sub1->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004246 else
4247 s1 = -1;
4248 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004249 s2 = sub2->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004250 else
4251 s2 = -1;
4252 if (s1 != s2)
4253 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004254 if (s1 != -1 && sub1->list.multi[i].end_col
4255 != sub2->list.multi[i].end_col)
Bram Moolenaaree482532014-05-13 15:56:51 +02004256 return FALSE;
4257 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004258 }
4259 }
4260 else
4261 {
4262 for (i = 0; i < todo; ++i)
4263 {
4264 if (i < sub1->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004265 sp1 = sub1->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004266 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004267 sp1 = NULL;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004268 if (i < sub2->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004269 sp2 = sub2->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004270 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004271 sp2 = NULL;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004272 if (sp1 != sp2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004273 return FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004274 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004275 {
4276 if (i < sub1->in_use)
4277 sp1 = sub1->list.line[i].end;
4278 else
4279 sp1 = NULL;
4280 if (i < sub2->in_use)
4281 sp2 = sub2->list.line[i].end;
4282 else
4283 sp2 = NULL;
4284 if (sp1 != sp2)
4285 return FALSE;
4286 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004287 }
4288 }
4289
4290 return TRUE;
4291}
4292
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004293#ifdef ENABLE_LOG
4294 static void
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004295open_debug_log(int result)
4296{
4297 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
4298 if (log_fd == NULL)
4299 {
4300 emsg(_(e_log_open_failed));
4301 log_fd = stderr;
4302 }
4303
4304 fprintf(log_fd, "****************************\n");
4305 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
4306 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : result == MAYBE
4307 ? "MAYBE" : "FALSE");
4308 fprintf(log_fd, "****************************\n");
4309}
4310
4311 static void
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004312report_state(char *action,
4313 regsub_T *sub,
4314 nfa_state_T *state,
4315 int lid,
4316 nfa_pim_T *pim)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004317{
4318 int col;
4319
4320 if (sub->in_use <= 0)
4321 col = -1;
4322 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004323 col = sub->list.multi[0].start_col;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004324 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004325 col = (int)(sub->list.line[0].start - rex.line);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004326 nfa_set_code(state->c);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004327 if (log_fd == NULL)
4328 open_debug_log(MAYBE);
4329
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004330 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
4331 action, abs(state->id), lid, state->c, code, col,
4332 pim_info(pim));
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004333}
4334#endif
4335
Bram Moolenaar43e02982013-06-07 17:31:29 +02004336/*
4337 * Return TRUE if the same state is already in list "l" with the same
4338 * positions as "subs".
4339 */
4340 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004341has_state_with_pos(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004342 nfa_list_T *l, // runtime state list
4343 nfa_state_T *state, // state to update
4344 regsubs_T *subs, // pointers to subexpressions
4345 nfa_pim_T *pim) // postponed match or NULL
Bram Moolenaar43e02982013-06-07 17:31:29 +02004346{
4347 nfa_thread_T *thread;
4348 int i;
4349
4350 for (i = 0; i < l->n; ++i)
4351 {
4352 thread = &l->t[i];
4353 if (thread->state->id == state->id
4354 && sub_equal(&thread->subs.norm, &subs->norm)
4355#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004356 && (!rex.nfa_has_zsubexpr
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004357 || sub_equal(&thread->subs.synt, &subs->synt))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004358#endif
Bram Moolenaar69b52452013-07-17 21:10:51 +02004359 && pim_equal(&thread->pim, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004360 return TRUE;
4361 }
4362 return FALSE;
4363}
4364
4365/*
Bram Moolenaar69b52452013-07-17 21:10:51 +02004366 * Return TRUE if "one" and "two" are equal. That includes when both are not
4367 * set.
4368 */
4369 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004370pim_equal(nfa_pim_T *one, nfa_pim_T *two)
Bram Moolenaar69b52452013-07-17 21:10:51 +02004371{
4372 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4373 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4374
4375 if (one_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004376 // one is unused: equal when two is also unused
Bram Moolenaar69b52452013-07-17 21:10:51 +02004377 return two_unused;
4378 if (two_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004379 // one is used and two is not: not equal
Bram Moolenaar69b52452013-07-17 21:10:51 +02004380 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004381 // compare the state id
Bram Moolenaar3f0df062013-08-14 13:34:25 +02004382 if (one->state->id != two->state->id)
4383 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004384 // compare the position
Bram Moolenaar69b52452013-07-17 21:10:51 +02004385 if (REG_MULTI)
4386 return one->end.pos.lnum == two->end.pos.lnum
4387 && one->end.pos.col == two->end.pos.col;
4388 return one->end.ptr == two->end.ptr;
4389}
4390
4391/*
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004392 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4393 */
4394 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004395match_follows(nfa_state_T *startstate, int depth)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004396{
4397 nfa_state_T *state = startstate;
4398
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004399 // avoid too much recursion
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004400 if (depth > 10)
4401 return FALSE;
4402
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004403 while (state != NULL)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004404 {
4405 switch (state->c)
4406 {
4407 case NFA_MATCH:
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004408 case NFA_MCLOSE:
4409 case NFA_END_INVISIBLE:
4410 case NFA_END_INVISIBLE_NEG:
4411 case NFA_END_PATTERN:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004412 return TRUE;
4413
4414 case NFA_SPLIT:
4415 return match_follows(state->out, depth + 1)
4416 || match_follows(state->out1, depth + 1);
4417
4418 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004419 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004420 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004421 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004422 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004423 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004424 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004425 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004426 case NFA_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004427 // skip ahead to next state
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004428 state = state->out1->out;
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004429 continue;
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004430
4431 case NFA_ANY:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02004432 case NFA_ANY_COMPOSING:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004433 case NFA_IDENT:
4434 case NFA_SIDENT:
4435 case NFA_KWORD:
4436 case NFA_SKWORD:
4437 case NFA_FNAME:
4438 case NFA_SFNAME:
4439 case NFA_PRINT:
4440 case NFA_SPRINT:
4441 case NFA_WHITE:
4442 case NFA_NWHITE:
4443 case NFA_DIGIT:
4444 case NFA_NDIGIT:
4445 case NFA_HEX:
4446 case NFA_NHEX:
4447 case NFA_OCTAL:
4448 case NFA_NOCTAL:
4449 case NFA_WORD:
4450 case NFA_NWORD:
4451 case NFA_HEAD:
4452 case NFA_NHEAD:
4453 case NFA_ALPHA:
4454 case NFA_NALPHA:
4455 case NFA_LOWER:
4456 case NFA_NLOWER:
4457 case NFA_UPPER:
4458 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02004459 case NFA_LOWER_IC:
4460 case NFA_NLOWER_IC:
4461 case NFA_UPPER_IC:
4462 case NFA_NUPPER_IC:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004463 case NFA_START_COLL:
4464 case NFA_START_NEG_COLL:
4465 case NFA_NEWL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004466 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004467 return FALSE;
4468
4469 default:
4470 if (state->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004471 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004472 return FALSE;
4473
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004474 // Others: zero-width or possibly zero-width, might still find
4475 // a match at the same position, keep looking.
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004476 break;
4477 }
4478 state = state->out;
4479 }
4480 return FALSE;
4481}
4482
4483
4484/*
Bram Moolenaar43e02982013-06-07 17:31:29 +02004485 * Return TRUE if "state" is already in list "l".
4486 */
4487 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004488state_in_list(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004489 nfa_list_T *l, // runtime state list
4490 nfa_state_T *state, // state to update
4491 regsubs_T *subs) // pointers to subexpressions
Bram Moolenaar43e02982013-06-07 17:31:29 +02004492{
4493 if (state->lastlist[nfa_ll_index] == l->id)
4494 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004495 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004496 return TRUE;
4497 }
4498 return FALSE;
4499}
4500
// Offset used for "off" by addstate_here().  addstate() takes an "off_arg" of
// -ADDSTATE_HERE_OFFSET or lower as a request to add at a list index, which
// is then -(off_arg + ADDSTATE_HERE_OFFSET).
#define ADDSTATE_HERE_OFFSET 10
4503
Bram Moolenaard05bf562013-06-30 23:24:08 +02004504/*
4505 * Add "state" and possibly what follows to state list ".".
4506 * Returns "subs_arg", possibly copied into temp_subs.
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004507 * Returns NULL when recursiveness is too deep.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004508 */
Bram Moolenaard05bf562013-06-30 23:24:08 +02004509 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004510addstate(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004511 nfa_list_T *l, // runtime state list
4512 nfa_state_T *state, // state to update
4513 regsubs_T *subs_arg, // pointers to subexpressions
4514 nfa_pim_T *pim, // postponed look-behind match
4515 int off_arg) // byte offset, when -1 go to next line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004516{
Bram Moolenaar963fee22013-05-26 21:47:28 +02004517 int subidx;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004518 int off = off_arg;
4519 int add_here = FALSE;
4520 int listindex = 0;
4521 int k;
4522 int found = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004523 nfa_thread_T *thread;
Bram Moolenaard5638832016-09-09 17:59:50 +02004524 struct multipos save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004525 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004526 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004527 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004528 regsub_T *sub;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004529 regsubs_T *subs = subs_arg;
4530 static regsubs_T temp_subs;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004531#ifdef ENABLE_LOG
4532 int did_print = FALSE;
4533#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004534 static int depth = 0;
4535
4536 // This function is called recursively. When the depth is too much we run
4537 // out of stack and crash, limit recursiveness here.
Bram Moolenaar5382f122019-02-13 01:18:38 +01004538 if (++depth >= 5000 || subs == NULL)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004539 {
4540 --depth;
4541 return NULL;
4542 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004543
Bram Moolenaar16b35782016-09-09 20:29:50 +02004544 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4545 {
4546 add_here = TRUE;
4547 off = 0;
4548 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4549 }
4550
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004551 switch (state->c)
4552 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004553 case NFA_NCLOSE:
4554 case NFA_MCLOSE:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004555 case NFA_MCLOSE1:
4556 case NFA_MCLOSE2:
4557 case NFA_MCLOSE3:
4558 case NFA_MCLOSE4:
4559 case NFA_MCLOSE5:
4560 case NFA_MCLOSE6:
4561 case NFA_MCLOSE7:
4562 case NFA_MCLOSE8:
4563 case NFA_MCLOSE9:
4564#ifdef FEAT_SYN_HL
4565 case NFA_ZCLOSE:
4566 case NFA_ZCLOSE1:
4567 case NFA_ZCLOSE2:
4568 case NFA_ZCLOSE3:
4569 case NFA_ZCLOSE4:
4570 case NFA_ZCLOSE5:
4571 case NFA_ZCLOSE6:
4572 case NFA_ZCLOSE7:
4573 case NFA_ZCLOSE8:
4574 case NFA_ZCLOSE9:
4575#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004576 case NFA_MOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004577 case NFA_ZEND:
Bram Moolenaar927d4a12013-06-09 17:25:34 +02004578 case NFA_SPLIT:
Bram Moolenaar699c1202013-09-25 16:41:54 +02004579 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004580 // These nodes are not added themselves but their "out" and/or
4581 // "out1" may be added below.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004582 break;
4583
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004584 case NFA_BOL:
4585 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004586 // "^" won't match past end-of-line, don't bother trying.
4587 // Except when at the end of the line, or when we are going to the
4588 // next line for a look-behind match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004589 if (rex.input > rex.line
4590 && *rex.input != NUL
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004591 && (nfa_endp == NULL
4592 || !REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02004593 || rex.lnum == nfa_endp->se_u.pos.lnum))
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004594 goto skip_add;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004595 // FALLTHROUGH
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004596
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004597 case NFA_MOPEN1:
4598 case NFA_MOPEN2:
4599 case NFA_MOPEN3:
4600 case NFA_MOPEN4:
4601 case NFA_MOPEN5:
4602 case NFA_MOPEN6:
4603 case NFA_MOPEN7:
4604 case NFA_MOPEN8:
4605 case NFA_MOPEN9:
4606#ifdef FEAT_SYN_HL
4607 case NFA_ZOPEN:
4608 case NFA_ZOPEN1:
4609 case NFA_ZOPEN2:
4610 case NFA_ZOPEN3:
4611 case NFA_ZOPEN4:
4612 case NFA_ZOPEN5:
4613 case NFA_ZOPEN6:
4614 case NFA_ZOPEN7:
4615 case NFA_ZOPEN8:
4616 case NFA_ZOPEN9:
4617#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004618 case NFA_NOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004619 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004620 // These nodes need to be added so that we can bail out when it
4621 // was added to this list before at the same position to avoid an
4622 // endless loop for "\(\)*"
Bram Moolenaar307aa162013-06-02 16:34:21 +02004623
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004624 default:
Bram Moolenaar272fb582013-11-21 16:03:40 +01004625 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004626 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004627 // This state is already in the list, don't add it again,
4628 // unless it is an MOPEN that is used for a backreference or
4629 // when there is a PIM. For NFA_MATCH check the position,
4630 // lower position is preferred.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004631 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
Bram Moolenaar9c235062014-05-13 16:44:29 +02004632 && state->c != NFA_MATCH)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004633 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004634 // When called from addstate_here() do insert before
4635 // existing states.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004636 if (add_here)
4637 {
4638 for (k = 0; k < l->n && k < listindex; ++k)
4639 if (l->t[k].state->id == state->id)
4640 {
4641 found = TRUE;
4642 break;
4643 }
4644 }
4645 if (!add_here || found)
4646 {
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004647skip_add:
4648#ifdef ENABLE_LOG
Bram Moolenaar16b35782016-09-09 20:29:50 +02004649 nfa_set_code(state->c);
4650 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4651 abs(state->id), l->id, state->c, code,
4652 pim == NULL ? "NULL" : "yes", l->has_pim, found);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004653#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004654 --depth;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004655 return subs;
4656 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004657 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004658
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004659 // Do not add the state again when it exists with the same
4660 // positions.
Bram Moolenaar69b52452013-07-17 21:10:51 +02004661 if (has_state_with_pos(l, state, subs, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004662 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004663 }
4664
Bram Moolenaar688b3982019-02-13 21:47:36 +01004665 // When there are backreferences or PIMs the number of states may
4666 // be (a lot) bigger than anticipated.
Bram Moolenaara0169122013-06-26 18:16:58 +02004667 if (l->n == l->len)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004668 {
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004669 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004670 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004671 nfa_thread_T *newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004672
Bram Moolenaar688b3982019-02-13 21:47:36 +01004673 if ((long)(newsize >> 10) >= p_mmp)
4674 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004675 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004676 --depth;
4677 return NULL;
4678 }
Bram Moolenaard05bf562013-06-30 23:24:08 +02004679 if (subs != &temp_subs)
4680 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004681 // "subs" may point into the current array, need to make a
4682 // copy before it becomes invalid.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004683 copy_sub(&temp_subs.norm, &subs->norm);
4684#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004685 if (rex.nfa_has_zsubexpr)
Bram Moolenaard05bf562013-06-30 23:24:08 +02004686 copy_sub(&temp_subs.synt, &subs->synt);
4687#endif
4688 subs = &temp_subs;
4689 }
4690
Bram Moolenaar688b3982019-02-13 21:47:36 +01004691 newt = vim_realloc(l->t, newsize);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004692 if (newt == NULL)
4693 {
4694 // out of memory
4695 --depth;
4696 return NULL;
4697 }
4698 l->t = newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004699 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004700 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004701
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004702 // add the state to the list
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02004703 state->lastlist[nfa_ll_index] = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004704 thread = &l->t[l->n++];
4705 thread->state = state;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004706 if (pim == NULL)
4707 thread->pim.result = NFA_PIM_UNUSED;
4708 else
Bram Moolenaar196ed142013-07-21 18:59:24 +02004709 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004710 copy_pim(&thread->pim, pim);
Bram Moolenaar196ed142013-07-21 18:59:24 +02004711 l->has_pim = TRUE;
4712 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004713 copy_sub(&thread->subs.norm, &subs->norm);
4714#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004715 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004716 copy_sub(&thread->subs.synt, &subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004717#endif
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004718#ifdef ENABLE_LOG
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004719 report_state("Adding", &thread->subs.norm, state, l->id, pim);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004720 did_print = TRUE;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004721#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004722 }
4723
4724#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004725 if (!did_print)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004726 report_state("Processing", &subs->norm, state, l->id, pim);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004727#endif
4728 switch (state->c)
4729 {
4730 case NFA_MATCH:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004731 break;
4732
4733 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004734 // order matters here
Bram Moolenaar16b35782016-09-09 20:29:50 +02004735 subs = addstate(l, state->out, subs, pim, off_arg);
4736 subs = addstate(l, state->out1, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004737 break;
4738
Bram Moolenaar699c1202013-09-25 16:41:54 +02004739 case NFA_EMPTY:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004740 case NFA_NOPEN:
4741 case NFA_NCLOSE:
Bram Moolenaar16b35782016-09-09 20:29:50 +02004742 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004743 break;
4744
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004745 case NFA_MOPEN:
4746 case NFA_MOPEN1:
4747 case NFA_MOPEN2:
4748 case NFA_MOPEN3:
4749 case NFA_MOPEN4:
4750 case NFA_MOPEN5:
4751 case NFA_MOPEN6:
4752 case NFA_MOPEN7:
4753 case NFA_MOPEN8:
4754 case NFA_MOPEN9:
4755#ifdef FEAT_SYN_HL
4756 case NFA_ZOPEN:
4757 case NFA_ZOPEN1:
4758 case NFA_ZOPEN2:
4759 case NFA_ZOPEN3:
4760 case NFA_ZOPEN4:
4761 case NFA_ZOPEN5:
4762 case NFA_ZOPEN6:
4763 case NFA_ZOPEN7:
4764 case NFA_ZOPEN8:
4765 case NFA_ZOPEN9:
4766#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004767 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004768 if (state->c == NFA_ZSTART)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004769 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004770 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004771 sub = &subs->norm;
4772 }
4773#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004774 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004775 {
4776 subidx = state->c - NFA_ZOPEN;
4777 sub = &subs->synt;
4778 }
4779#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004780 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004781 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004782 subidx = state->c - NFA_MOPEN;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004783 sub = &subs->norm;
4784 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004785
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004786 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004787 save_ptr = NULL;
Bram Moolenaara80faa82020-04-12 19:37:17 +02004788 CLEAR_FIELD(save_multipos);
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004789
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004790 // Set the position (with "off" added) in the subexpression. Save
4791 // and restore it when it was in use. Otherwise fill any gap.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004792 if (REG_MULTI)
4793 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004794 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004795 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004796 save_multipos = sub->list.multi[subidx];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004797 save_in_use = -1;
4798 }
4799 else
4800 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004801 save_in_use = sub->in_use;
4802 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004803 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004804 sub->list.multi[i].start_lnum = -1;
4805 sub->list.multi[i].end_lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004806 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004807 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004808 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02004809 if (off == -1)
4810 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004811 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004812 sub->list.multi[subidx].start_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004813 }
4814 else
4815 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004816 sub->list.multi[subidx].start_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004817 sub->list.multi[subidx].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004818 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004819 }
Bram Moolenaarc2b717e2015-09-29 15:06:14 +02004820 sub->list.multi[subidx].end_lnum = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004821 }
4822 else
4823 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004824 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004825 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004826 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004827 save_in_use = -1;
4828 }
4829 else
4830 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004831 save_in_use = sub->in_use;
4832 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004833 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004834 sub->list.line[i].start = NULL;
4835 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004836 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004837 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004838 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02004839 sub->list.line[subidx].start = rex.input + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004840 }
4841
Bram Moolenaar16b35782016-09-09 20:29:50 +02004842 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004843 if (subs == NULL)
4844 break;
4845 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004846#ifdef FEAT_SYN_HL
4847 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4848 sub = &subs->synt;
4849 else
4850#endif
4851 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004852
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004853 if (save_in_use == -1)
4854 {
4855 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004856 sub->list.multi[subidx] = save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004857 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004858 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004859 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004860 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02004861 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004862 break;
4863
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004864 case NFA_MCLOSE:
Bram Moolenaar0270f382018-07-17 05:43:58 +02004865 if (rex.nfa_has_zend && (REG_MULTI
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004866 ? subs->norm.list.multi[0].end_lnum >= 0
Bram Moolenaar9be44812013-09-05 21:15:44 +02004867 : subs->norm.list.line[0].end != NULL))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004868 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004869 // Do not overwrite the position set by \ze.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004870 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004871 break;
4872 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004873 // FALLTHROUGH
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004874 case NFA_MCLOSE1:
4875 case NFA_MCLOSE2:
4876 case NFA_MCLOSE3:
4877 case NFA_MCLOSE4:
4878 case NFA_MCLOSE5:
4879 case NFA_MCLOSE6:
4880 case NFA_MCLOSE7:
4881 case NFA_MCLOSE8:
4882 case NFA_MCLOSE9:
4883#ifdef FEAT_SYN_HL
4884 case NFA_ZCLOSE:
4885 case NFA_ZCLOSE1:
4886 case NFA_ZCLOSE2:
4887 case NFA_ZCLOSE3:
4888 case NFA_ZCLOSE4:
4889 case NFA_ZCLOSE5:
4890 case NFA_ZCLOSE6:
4891 case NFA_ZCLOSE7:
4892 case NFA_ZCLOSE8:
4893 case NFA_ZCLOSE9:
4894#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004895 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004896 if (state->c == NFA_ZEND)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004897 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004898 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004899 sub = &subs->norm;
4900 }
4901#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004902 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004903 {
4904 subidx = state->c - NFA_ZCLOSE;
4905 sub = &subs->synt;
4906 }
4907#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004908 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004909 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004910 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004911 sub = &subs->norm;
4912 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004913
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004914 // We don't fill in gaps here, there must have been an MOPEN that
4915 // has done that.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004916 save_in_use = sub->in_use;
4917 if (sub->in_use <= subidx)
4918 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004919 if (REG_MULTI)
4920 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004921 save_multipos = sub->list.multi[subidx];
Bram Moolenaar35b23862013-05-22 23:00:40 +02004922 if (off == -1)
4923 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004924 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004925 sub->list.multi[subidx].end_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004926 }
4927 else
4928 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004929 sub->list.multi[subidx].end_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004930 sub->list.multi[subidx].end_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004931 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004932 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004933 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004934 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004935 }
4936 else
4937 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004938 save_ptr = sub->list.line[subidx].end;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004939 sub->list.line[subidx].end = rex.input + off;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004940 // avoid compiler warnings
Bram Moolenaara80faa82020-04-12 19:37:17 +02004941 CLEAR_FIELD(save_multipos);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004942 }
4943
Bram Moolenaar16b35782016-09-09 20:29:50 +02004944 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004945 if (subs == NULL)
4946 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004947 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004948#ifdef FEAT_SYN_HL
4949 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4950 sub = &subs->synt;
4951 else
4952#endif
4953 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004954
4955 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004956 sub->list.multi[subidx] = save_multipos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004957 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004958 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004959 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004960 break;
4961 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004962 --depth;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004963 return subs;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004964}
4965
4966/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02004967 * Like addstate(), but the new state(s) are put at position "*ip".
4968 * Used for zero-width matches, next state to use is the added one.
4969 * This makes sure the order of states to be tried does not change, which
4970 * matters for alternatives.
4971 */
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004972 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004973addstate_here(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004974 nfa_list_T *l, // runtime state list
4975 nfa_state_T *state, // state to update
4976 regsubs_T *subs, // pointers to subexpressions
4977 nfa_pim_T *pim, // postponed look-behind match
Bram Moolenaar05540972016-01-30 20:31:25 +01004978 int *ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004979{
4980 int tlen = l->n;
4981 int count;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004982 int listidx = *ip;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004983 regsubs_T *r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004984
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004985 // First add the state(s) at the end, so that we know how many there are.
4986 // Pass the listidx as offset (avoids adding another argument to
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00004987 // addstate()).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004988 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4989 if (r == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004990 return NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004991
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004992 // when "*ip" was at the end of the list, nothing to do
Bram Moolenaara2d95102013-06-04 14:23:05 +02004993 if (listidx + 1 == tlen)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004994 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004995
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004996 // re-order to put the new state at the current position
Bram Moolenaar4b417062013-05-25 20:19:50 +02004997 count = l->n - tlen;
Bram Moolenaara50d02d2013-06-16 15:43:50 +02004998 if (count == 0)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004999 return r; // no state got added
Bram Moolenaar428e9872013-05-30 17:05:39 +02005000 if (count == 1)
5001 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005002 // overwrite the current state
Bram Moolenaara2d95102013-06-04 14:23:05 +02005003 l->t[listidx] = l->t[l->n - 1];
Bram Moolenaar428e9872013-05-30 17:05:39 +02005004 }
5005 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02005006 {
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005007 if (l->n + count - 1 >= l->len)
5008 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005009 // not enough space to move the new states, reallocate the list
5010 // and move the states to the right position
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005011 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01005012 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005013 nfa_thread_T *newl;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005014
Bram Moolenaar688b3982019-02-13 21:47:36 +01005015 if ((long)(newsize >> 10) >= p_mmp)
5016 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00005017 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01005018 return NULL;
5019 }
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005020 newl = alloc(newsize);
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005021 if (newl == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005022 return NULL;
5023 l->len = newlen;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005024 mch_memmove(&(newl[0]),
5025 &(l->t[0]),
5026 sizeof(nfa_thread_T) * listidx);
5027 mch_memmove(&(newl[listidx]),
5028 &(l->t[l->n - count]),
5029 sizeof(nfa_thread_T) * count);
5030 mch_memmove(&(newl[listidx + count]),
5031 &(l->t[listidx + 1]),
5032 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
5033 vim_free(l->t);
5034 l->t = newl;
5035 }
5036 else
5037 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005038 // make space for new states, then move them from the
5039 // end to the current position
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005040 mch_memmove(&(l->t[listidx + count]),
5041 &(l->t[listidx + 1]),
5042 sizeof(nfa_thread_T) * (l->n - listidx - 1));
5043 mch_memmove(&(l->t[listidx]),
5044 &(l->t[l->n - 1]),
5045 sizeof(nfa_thread_T) * count);
5046 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005047 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005048 --l->n;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005049 *ip = listidx - 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005050
5051 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005052}
5053
5054/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005055 * Check character class "class" against current character c.
5056 */
5057 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005058check_char_class(int class, int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005059{
5060 switch (class)
5061 {
5062 case NFA_CLASS_ALNUM:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005063 if (c >= 1 && c < 128 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005064 return OK;
5065 break;
5066 case NFA_CLASS_ALPHA:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005067 if (c >= 1 && c < 128 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005068 return OK;
5069 break;
5070 case NFA_CLASS_BLANK:
5071 if (c == ' ' || c == '\t')
5072 return OK;
5073 break;
5074 case NFA_CLASS_CNTRL:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005075 if (c >= 1 && c <= 127 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005076 return OK;
5077 break;
5078 case NFA_CLASS_DIGIT:
5079 if (VIM_ISDIGIT(c))
5080 return OK;
5081 break;
5082 case NFA_CLASS_GRAPH:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005083 if (c >= 1 && c <= 127 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005084 return OK;
5085 break;
5086 case NFA_CLASS_LOWER:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005087 if (MB_ISLOWER(c) && c != 170 && c != 186)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005088 return OK;
5089 break;
5090 case NFA_CLASS_PRINT:
5091 if (vim_isprintc(c))
5092 return OK;
5093 break;
5094 case NFA_CLASS_PUNCT:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005095 if (c >= 1 && c < 128 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005096 return OK;
5097 break;
5098 case NFA_CLASS_SPACE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005099 if ((c >= 9 && c <= 13) || (c == ' '))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005100 return OK;
5101 break;
5102 case NFA_CLASS_UPPER:
5103 if (MB_ISUPPER(c))
5104 return OK;
5105 break;
5106 case NFA_CLASS_XDIGIT:
5107 if (vim_isxdigit(c))
5108 return OK;
5109 break;
5110 case NFA_CLASS_TAB:
5111 if (c == '\t')
5112 return OK;
5113 break;
5114 case NFA_CLASS_RETURN:
5115 if (c == '\r')
5116 return OK;
5117 break;
5118 case NFA_CLASS_BACKSPACE:
5119 if (c == '\b')
5120 return OK;
5121 break;
5122 case NFA_CLASS_ESCAPE:
5123 if (c == '\033')
5124 return OK;
5125 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01005126 case NFA_CLASS_IDENT:
5127 if (vim_isIDc(c))
5128 return OK;
5129 break;
5130 case NFA_CLASS_KEYWORD:
5131 if (reg_iswordc(c))
5132 return OK;
5133 break;
5134 case NFA_CLASS_FNAME:
5135 if (vim_isfilec(c))
5136 return OK;
5137 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005138
5139 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005140 // should not be here :P
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00005141 siemsg(_(e_nfa_regexp_invalid_character_class_nr), class);
Bram Moolenaar417bad22013-06-07 14:08:30 +02005142 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005143 }
5144 return FAIL;
5145}
5146
Bram Moolenaar5714b802013-05-28 22:03:20 +02005147/*
5148 * Check for a match with subexpression "subidx".
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005149 * Return TRUE if it matches.
Bram Moolenaar5714b802013-05-28 22:03:20 +02005150 */
5151 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005152match_backref(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005153 regsub_T *sub, // pointers to subexpressions
Bram Moolenaar05540972016-01-30 20:31:25 +01005154 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005155 int *bytelen) // out: length of match in bytes
Bram Moolenaar5714b802013-05-28 22:03:20 +02005156{
5157 int len;
5158
5159 if (sub->in_use <= subidx)
5160 {
5161retempty:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005162 // backref was not set, match an empty string
Bram Moolenaar5714b802013-05-28 22:03:20 +02005163 *bytelen = 0;
5164 return TRUE;
5165 }
5166
5167 if (REG_MULTI)
5168 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005169 if (sub->list.multi[subidx].start_lnum < 0
5170 || sub->list.multi[subidx].end_lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005171 goto retempty;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005172 if (sub->list.multi[subidx].start_lnum == rex.lnum
5173 && sub->list.multi[subidx].end_lnum == rex.lnum)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005174 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005175 len = sub->list.multi[subidx].end_col
5176 - sub->list.multi[subidx].start_col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005177 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5178 rex.input, &len) == 0)
Bram Moolenaar580abea2013-06-14 20:31:28 +02005179 {
5180 *bytelen = len;
5181 return TRUE;
5182 }
5183 }
5184 else
5185 {
5186 if (match_with_backref(
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005187 sub->list.multi[subidx].start_lnum,
5188 sub->list.multi[subidx].start_col,
5189 sub->list.multi[subidx].end_lnum,
5190 sub->list.multi[subidx].end_col,
Bram Moolenaar580abea2013-06-14 20:31:28 +02005191 bytelen) == RA_MATCH)
5192 return TRUE;
Bram Moolenaar5714b802013-05-28 22:03:20 +02005193 }
5194 }
5195 else
5196 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005197 if (sub->list.line[subidx].start == NULL
5198 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005199 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005200 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005201 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005202 {
5203 *bytelen = len;
5204 return TRUE;
5205 }
5206 }
5207 return FALSE;
5208}
5209
#ifdef FEAT_SYN_HL

/*
 * Check for a match with \z subexpression "subidx".
 * Return TRUE if it matches.
 *
 * The text to compare with is taken from "re_extmatch_in".  When that is
 * NULL or has no entry for "subidx" an empty string is matched: "*bytelen"
 * is set to zero and TRUE is returned.
 * Return FALSE when the text at "rex.input" does not match.
 */
    static int
match_zref(
    int         subidx,
    int         *bytelen)   // out: length of match in bytes
{
    int len;

    cleanup_zsubexpr();
    if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL)
    {
        // backref was not set, match an empty string
        *bytelen = 0;
        return TRUE;
    }

    // "len" is passed by reference, cstrncmp() may adjust it
    len = (int)STRLEN(re_extmatch_in->matches[subidx]);
    if (cstrncmp(re_extmatch_in->matches[subidx], rex.input, &len) == 0)
    {
        *bytelen = len;
        return TRUE;
    }
    return FALSE;
}
#endif
5240
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005241/*
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005242 * Save list IDs for all NFA states of "prog" into "list".
5243 * Also reset the IDs to zero.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005244 * Only used for the recursive value lastlist[1].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005245 */
5246 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005247nfa_save_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005248{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005249 int i;
5250 nfa_state_T *p;
5251
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005252 // Order in the list is reverse, it's a bit faster that way.
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005253 p = &prog->state[0];
5254 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005255 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005256 list[i] = p->lastlist[1];
5257 p->lastlist[1] = 0;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005258 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005259 }
5260}
5261
5262/*
5263 * Restore list IDs from "list" to all NFA states.
5264 */
5265 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005266nfa_restore_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005267{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005268 int i;
5269 nfa_state_T *p;
5270
5271 p = &prog->state[0];
5272 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005273 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005274 p->lastlist[1] = list[i];
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005275 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005276 }
5277}
5278
Bram Moolenaar423532e2013-05-29 21:14:42 +02005279 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005280nfa_re_num_cmp(long_u val, int op, long_u pos)
Bram Moolenaar423532e2013-05-29 21:14:42 +02005281{
5282 if (op == 1) return pos > val;
5283 if (op == 2) return pos < val;
5284 return val == pos;
5285}
5286
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01005287static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02005288
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005289/*
Bram Moolenaarf46da702013-06-02 22:37:42 +02005290 * Recursively call nfa_regmatch()
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005291 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5292 * position).
Bram Moolenaarf46da702013-06-02 22:37:42 +02005293 */
5294 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005295recursive_regmatch(
5296 nfa_state_T *state,
5297 nfa_pim_T *pim,
5298 nfa_regprog_T *prog,
5299 regsubs_T *submatch,
5300 regsubs_T *m,
Bram Moolenaar2338c322018-07-08 19:07:19 +02005301 int **listids,
5302 int *listids_len)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005303{
Bram Moolenaar0270f382018-07-17 05:43:58 +02005304 int save_reginput_col = (int)(rex.input - rex.line);
5305 int save_reglnum = rex.lnum;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005306 int save_nfa_match = nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005307 int save_nfa_listid = rex.nfa_listid;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005308 save_se_T *save_nfa_endp = nfa_endp;
5309 save_se_T endpos;
5310 save_se_T *endposp = NULL;
5311 int result;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005312 int need_restore = FALSE;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005313
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005314 if (pim != NULL)
5315 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005316 // start at the position where the postponed match was
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005317 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005318 rex.input = rex.line + pim->end.pos.col;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005319 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005320 rex.input = pim->end.ptr;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005321 }
5322
Bram Moolenaardecd9542013-06-07 16:31:50 +02005323 if (state->c == NFA_START_INVISIBLE_BEFORE
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01005324 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5325 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5326 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005327 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005328 // The recursive match must end at the current position. When "pim" is
5329 // not NULL it specifies the current position.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005330 endposp = &endpos;
5331 if (REG_MULTI)
5332 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005333 if (pim == NULL)
5334 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005335 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5336 endpos.se_u.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005337 }
5338 else
5339 endpos.se_u.pos = pim->end.pos;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005340 }
5341 else
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005342 {
5343 if (pim == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005344 endpos.se_u.ptr = rex.input;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005345 else
5346 endpos.se_u.ptr = pim->end.ptr;
5347 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005348
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005349 // Go back the specified number of bytes, or as far as the
5350 // start of the previous line, to try matching "\@<=" or
5351 // not matching "\@<!". This is very inefficient, limit the number of
5352 // bytes if possible.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005353 if (state->val <= 0)
5354 {
5355 if (REG_MULTI)
5356 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005357 rex.line = reg_getline(--rex.lnum);
5358 if (rex.line == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005359 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005360 rex.line = reg_getline(++rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005361 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005362 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005363 }
5364 else
5365 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005366 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005367 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005368 // Not enough bytes in this line, go to end of
5369 // previous line.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005370 rex.line = reg_getline(--rex.lnum);
5371 if (rex.line == NULL)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005372 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005373 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005374 rex.line = reg_getline(++rex.lnum);
5375 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005376 }
5377 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005378 rex.input = rex.line + STRLEN(rex.line);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005379 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005380 if ((int)(rex.input - rex.line) >= state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005381 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005382 rex.input -= state->val;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005383 if (has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005384 rex.input -= mb_head_off(rex.line, rex.input);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005385 }
5386 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005387 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005388 }
5389 }
5390
Bram Moolenaarf46da702013-06-02 22:37:42 +02005391#ifdef ENABLE_LOG
5392 if (log_fd != stderr)
5393 fclose(log_fd);
5394 log_fd = NULL;
5395#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005396 // Have to clear the lastlist field of the NFA nodes, so that
5397 // nfa_regmatch() and addstate() can run properly after recursion.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005398 if (nfa_ll_index == 1)
5399 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005400 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5401 // values and clear them.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005402 if (*listids == NULL || *listids_len < prog->nstate)
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005403 {
Bram Moolenaar2338c322018-07-08 19:07:19 +02005404 vim_free(*listids);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005405 *listids = ALLOC_MULT(int, prog->nstate);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005406 if (*listids == NULL)
5407 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00005408 emsg(_(e_nfa_regexp_could_not_allocate_memory_for_branch_traversal));
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005409 return 0;
5410 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005411 *listids_len = prog->nstate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005412 }
5413 nfa_save_listids(prog, *listids);
5414 need_restore = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005415 // any value of rex.nfa_listid will do
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005416 }
5417 else
5418 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005419 // First recursive nfa_regmatch() call, switch to the second lastlist
5420 // entry. Make sure rex.nfa_listid is different from a previous
5421 // recursive call, because some states may still have this ID.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005422 ++nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005423 if (rex.nfa_listid <= rex.nfa_alt_listid)
5424 rex.nfa_listid = rex.nfa_alt_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005425 }
5426
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005427 // Call nfa_regmatch() to check if the current concat matches at this
5428 // position. The concat ends with the node NFA_END_INVISIBLE
Bram Moolenaarf46da702013-06-02 22:37:42 +02005429 nfa_endp = endposp;
5430 result = nfa_regmatch(prog, state->out, submatch, m);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005431
5432 if (need_restore)
5433 nfa_restore_listids(prog, *listids);
5434 else
5435 {
5436 --nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005437 rex.nfa_alt_listid = rex.nfa_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005438 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005439
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005440 // restore position in input text
Bram Moolenaar0270f382018-07-17 05:43:58 +02005441 rex.lnum = save_reglnum;
Bram Moolenaar484d2412013-06-13 19:47:07 +02005442 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005443 rex.line = reg_getline(rex.lnum);
5444 rex.input = rex.line + save_reginput_col;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005445 if (result != NFA_TOO_EXPENSIVE)
5446 {
5447 nfa_match = save_nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005448 rex.nfa_listid = save_nfa_listid;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005449 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005450 nfa_endp = save_nfa_endp;
5451
5452#ifdef ENABLE_LOG
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005453 open_debug_log(result);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005454#endif
5455
5456 return result;
5457}
5458
Bram Moolenaara2d95102013-06-04 14:23:05 +02005459/*
5460 * Estimate the chance of a match with "state" failing.
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005461 * empty match: 0
Bram Moolenaara2d95102013-06-04 14:23:05 +02005462 * NFA_ANY: 1
5463 * specific character: 99
5464 */
5465 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005466failure_chance(nfa_state_T *state, int depth)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005467{
5468 int c = state->c;
5469 int l, r;
5470
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005471 // detect looping
Bram Moolenaara2d95102013-06-04 14:23:05 +02005472 if (depth > 4)
5473 return 1;
5474
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005475 switch (c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005476 {
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005477 case NFA_SPLIT:
5478 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005479 // avoid recursive stuff
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005480 return 1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005481 // two alternatives, use the lowest failure chance
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005482 l = failure_chance(state->out, depth + 1);
5483 r = failure_chance(state->out1, depth + 1);
5484 return l < r ? l : r;
5485
5486 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005487 // matches anything, unlikely to fail
Bram Moolenaara2d95102013-06-04 14:23:05 +02005488 return 1;
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005489
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005490 case NFA_MATCH:
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005491 case NFA_MCLOSE:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005492 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005493 // empty match works always
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005494 return 0;
5495
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005496 case NFA_START_INVISIBLE:
5497 case NFA_START_INVISIBLE_FIRST:
5498 case NFA_START_INVISIBLE_NEG:
5499 case NFA_START_INVISIBLE_NEG_FIRST:
5500 case NFA_START_INVISIBLE_BEFORE:
5501 case NFA_START_INVISIBLE_BEFORE_FIRST:
5502 case NFA_START_INVISIBLE_BEFORE_NEG:
5503 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5504 case NFA_START_PATTERN:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005505 // recursive regmatch is expensive, use low failure chance
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005506 return 5;
5507
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005508 case NFA_BOL:
5509 case NFA_EOL:
5510 case NFA_BOF:
5511 case NFA_EOF:
5512 case NFA_NEWL:
5513 return 99;
5514
5515 case NFA_BOW:
5516 case NFA_EOW:
5517 return 90;
5518
5519 case NFA_MOPEN:
5520 case NFA_MOPEN1:
5521 case NFA_MOPEN2:
5522 case NFA_MOPEN3:
5523 case NFA_MOPEN4:
5524 case NFA_MOPEN5:
5525 case NFA_MOPEN6:
5526 case NFA_MOPEN7:
5527 case NFA_MOPEN8:
5528 case NFA_MOPEN9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005529#ifdef FEAT_SYN_HL
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005530 case NFA_ZOPEN:
5531 case NFA_ZOPEN1:
5532 case NFA_ZOPEN2:
5533 case NFA_ZOPEN3:
5534 case NFA_ZOPEN4:
5535 case NFA_ZOPEN5:
5536 case NFA_ZOPEN6:
5537 case NFA_ZOPEN7:
5538 case NFA_ZOPEN8:
5539 case NFA_ZOPEN9:
5540 case NFA_ZCLOSE:
5541 case NFA_ZCLOSE1:
5542 case NFA_ZCLOSE2:
5543 case NFA_ZCLOSE3:
5544 case NFA_ZCLOSE4:
5545 case NFA_ZCLOSE5:
5546 case NFA_ZCLOSE6:
5547 case NFA_ZCLOSE7:
5548 case NFA_ZCLOSE8:
5549 case NFA_ZCLOSE9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005550#endif
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005551 case NFA_NOPEN:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005552 case NFA_MCLOSE1:
5553 case NFA_MCLOSE2:
5554 case NFA_MCLOSE3:
5555 case NFA_MCLOSE4:
5556 case NFA_MCLOSE5:
5557 case NFA_MCLOSE6:
5558 case NFA_MCLOSE7:
5559 case NFA_MCLOSE8:
5560 case NFA_MCLOSE9:
5561 case NFA_NCLOSE:
5562 return failure_chance(state->out, depth + 1);
5563
5564 case NFA_BACKREF1:
5565 case NFA_BACKREF2:
5566 case NFA_BACKREF3:
5567 case NFA_BACKREF4:
5568 case NFA_BACKREF5:
5569 case NFA_BACKREF6:
5570 case NFA_BACKREF7:
5571 case NFA_BACKREF8:
5572 case NFA_BACKREF9:
5573#ifdef FEAT_SYN_HL
5574 case NFA_ZREF1:
5575 case NFA_ZREF2:
5576 case NFA_ZREF3:
5577 case NFA_ZREF4:
5578 case NFA_ZREF5:
5579 case NFA_ZREF6:
5580 case NFA_ZREF7:
5581 case NFA_ZREF8:
5582 case NFA_ZREF9:
5583#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005584 // backreferences don't match in many places
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005585 return 94;
5586
5587 case NFA_LNUM_GT:
5588 case NFA_LNUM_LT:
5589 case NFA_COL_GT:
5590 case NFA_COL_LT:
5591 case NFA_VCOL_GT:
5592 case NFA_VCOL_LT:
5593 case NFA_MARK_GT:
5594 case NFA_MARK_LT:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005595 case NFA_VISUAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005596 // before/after positions don't match very often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005597 return 85;
5598
5599 case NFA_LNUM:
5600 return 90;
5601
5602 case NFA_CURSOR:
5603 case NFA_COL:
5604 case NFA_VCOL:
5605 case NFA_MARK:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005606 // specific positions rarely match
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005607 return 98;
5608
5609 case NFA_COMPOSING:
5610 return 95;
5611
5612 default:
5613 if (c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005614 // character match fails often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005615 return 95;
5616 }
5617
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005618 // something else, includes character classes
Bram Moolenaara2d95102013-06-04 14:23:05 +02005619 return 50;
5620}
5621
Bram Moolenaarf46da702013-06-02 22:37:42 +02005622/*
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005623 * Skip until the char "c" we know a match must start with.
5624 */
5625 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005626skip_to_start(int c, colnr_T *colp)
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005627{
5628 char_u *s;
5629
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005630 // Used often, do some work to avoid call overhead.
Bram Moolenaara12a1612019-01-24 16:39:02 +01005631 if (!rex.reg_ic && !has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005632 s = vim_strbyte(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005633 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005634 s = cstrchr(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005635 if (s == NULL)
5636 return FAIL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005637 *colp = (int)(s - rex.line);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005638 return OK;
5639}
5640
5641/*
Bram Moolenaar473de612013-06-08 18:19:48 +02005642 * Check for a match with match_text.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02005643 * Called after skip_to_start() has found regstart.
Bram Moolenaar473de612013-06-08 18:19:48 +02005644 * Returns zero for no match, 1 for a match.
5645 */
5646 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01005647find_match_text(colnr_T startcol, int regstart, char_u *match_text)
Bram Moolenaar473de612013-06-08 18:19:48 +02005648{
5649 colnr_T col = startcol;
5650 int c1, c2;
5651 int len1, len2;
5652 int match;
5653
5654 for (;;)
5655 {
5656 match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005657 len2 = MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005658 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5659 {
5660 c1 = PTR2CHAR(match_text + len1);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005661 c2 = PTR2CHAR(rex.line + col + len2);
Bram Moolenaar59de4172020-06-09 19:34:54 +02005662 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
Bram Moolenaar473de612013-06-08 18:19:48 +02005663 {
5664 match = FALSE;
5665 break;
5666 }
Bram Moolenaar65b60562021-09-07 19:26:53 +02005667 len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5668 : MB_CHAR2LEN(c2);
Bram Moolenaar473de612013-06-08 18:19:48 +02005669 }
5670 if (match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005671 // check that no composing char follows
Bram Moolenaar473de612013-06-08 18:19:48 +02005672 && !(enc_utf8
Bram Moolenaara12a1612019-01-24 16:39:02 +01005673 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
Bram Moolenaar473de612013-06-08 18:19:48 +02005674 {
5675 cleanup_subexpr();
5676 if (REG_MULTI)
5677 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005678 rex.reg_startpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005679 rex.reg_startpos[0].col = col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005680 rex.reg_endpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005681 rex.reg_endpos[0].col = col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005682 }
5683 else
5684 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005685 rex.reg_startp[0] = rex.line + col;
5686 rex.reg_endp[0] = rex.line + col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005687 }
5688 return 1L;
5689 }
5690
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005691 // Try finding regstart after the current match.
5692 col += MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005693 if (skip_to_start(regstart, &col) == FAIL)
5694 break;
5695 }
5696 return 0L;
5697}
5698
#ifdef FEAT_RELTIME
/*
 * Check whether the time limit for matching has passed.
 * Returns TRUE when "nfa_time_limit" is set and profile_passed_limit()
 * reports it has passed; "*nfa_timed_out" is then set to TRUE, unless
 * "nfa_timed_out" is NULL.
 * Returns FALSE otherwise.
 */
    static int
nfa_did_time_out(void)
{
    if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit))
    {
        if (nfa_timed_out != NULL)
            *nfa_timed_out = TRUE;
        return TRUE;
    }
    return FALSE;
}
#endif
5712
Bram Moolenaar473de612013-06-08 18:19:48 +02005713/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005714 * Main matching routine.
5715 *
Bram Moolenaar0270f382018-07-17 05:43:58 +02005716 * Run NFA to determine whether it matches rex.input.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005717 *
Bram Moolenaar307aa162013-06-02 16:34:21 +02005718 * When "nfa_endp" is not NULL it is a required end-of-match position.
Bram Moolenaar61602c52013-06-01 19:54:43 +02005719 *
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005720 * Return TRUE if there is a match, FALSE if there is no match,
5721 * NFA_TOO_EXPENSIVE if we end up with too many states.
Bram Moolenaarf2118842013-09-25 18:16:38 +02005722 * When there is a match "submatch" contains the positions.
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005723 *
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005724 * Note: Caller must ensure that: start != NULL.
5725 */
5726 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005727nfa_regmatch(
5728 nfa_regprog_T *prog,
5729 nfa_state_T *start,
5730 regsubs_T *submatch,
5731 regsubs_T *m)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005732{
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005733 int result = FALSE;
Bram Moolenaaraaf30472015-01-27 14:40:00 +01005734 size_t size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005735 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005736 int go_to_nextline = FALSE;
5737 nfa_thread_T *t;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005738 nfa_list_T list[2];
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005739 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005740 nfa_list_T *thislist;
5741 nfa_list_T *nextlist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005742 int *listids = NULL;
Bram Moolenaar2338c322018-07-08 19:07:19 +02005743 int listids_len = 0;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005744 nfa_state_T *add_state;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005745 int add_here;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005746 int add_count;
Bram Moolenaar4380d1e2013-06-09 20:51:00 +02005747 int add_off = 0;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005748 int toplevel = start->c == NFA_MOPEN;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005749 regsubs_T *r;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005750#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005751 FILE *debug;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005752#endif
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005753
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005754 // Some patterns may take a long time to match, especially when using
5755 // recursive_regmatch(). Allow interrupting them with CTRL-C.
Bram Moolenaar41f12052013-08-25 17:01:42 +02005756 fast_breakcheck();
5757 if (got_int)
5758 return FALSE;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005759#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005760 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005761 return FALSE;
5762#endif
Bram Moolenaar41f12052013-08-25 17:01:42 +02005763
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005764#ifdef NFA_REGEXP_DEBUG_LOG
5765 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5766 if (debug == NULL)
5767 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005768 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005769 return FALSE;
5770 }
5771#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02005772 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005773
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005774 // Allocate memory for the lists of nodes.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005775 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005776
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005777 list[0].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005778 list[0].len = prog->nstate + 1;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005779 list[1].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005780 list[1].len = prog->nstate + 1;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005781 if (list[0].t == NULL || list[1].t == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005782 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005783
5784#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02005785 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005786 if (log_fd == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005787 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005788 emsg(_(e_log_open_failed));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005789 log_fd = stderr;
5790 }
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005791 fprintf(log_fd, "**********************************\n");
5792 nfa_set_code(start->c);
5793 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5794 abs(start->id), code);
5795 fprintf(log_fd, "**********************************\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005796#endif
5797
5798 thislist = &list[0];
5799 thislist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005800 thislist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005801 nextlist = &list[1];
5802 nextlist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005803 nextlist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005804#ifdef ENABLE_LOG
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005805 fprintf(log_fd, "(---) STARTSTATE first\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005806#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02005807 thislist->id = rex.nfa_listid + 1;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005808
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005809 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5810 // it's the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005811 if (toplevel)
5812 {
5813 if (REG_MULTI)
5814 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005815 m->norm.list.multi[0].start_lnum = rex.lnum;
5816 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005817 }
5818 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005819 m->norm.list.line[0].start = rex.input;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005820 m->norm.in_use = 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005821 r = addstate(thislist, start->out, m, NULL, 0);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005822 }
5823 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005824 r = addstate(thislist, start, m, NULL, 0);
5825 if (r == NULL)
5826 {
5827 nfa_match = NFA_TOO_EXPENSIVE;
5828 goto theend;
5829 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005830
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005831#define ADD_STATE_IF_MATCH(state) \
5832 if (result) { \
Bram Moolenaara2d95102013-06-04 14:23:05 +02005833 add_state = state->out; \
5834 add_off = clen; \
5835 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005836
5837 /*
5838 * Run for each character.
5839 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02005840 for (;;)
5841 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005842 int curc;
5843 int clen;
5844
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005845 if (has_mbyte)
5846 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005847 curc = (*mb_ptr2char)(rex.input);
5848 clen = (*mb_ptr2len)(rex.input);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005849 }
5850 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005851 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005852 curc = *rex.input;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005853 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005854 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005855 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02005856 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005857 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005858 go_to_nextline = FALSE;
5859 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005860
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005861 // swap lists
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005862 thislist = &list[flag];
5863 nextlist = &list[flag ^= 1];
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005864 nextlist->n = 0; // clear nextlist
Bram Moolenaar196ed142013-07-21 18:59:24 +02005865 nextlist->has_pim = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005866 ++rex.nfa_listid;
Bram Moolenaarbcf94422018-06-23 14:21:42 +02005867 if (prog->re_engine == AUTOMATIC_ENGINE
Bram Moolenaar0270f382018-07-17 05:43:58 +02005868 && (rex.nfa_listid >= NFA_MAX_STATES
Bram Moolenaar5ec74142018-06-23 17:14:41 +02005869# ifdef FEAT_EVAL
5870 || nfa_fail_for_testing
5871# endif
5872 ))
Bram Moolenaarfda37292014-11-05 14:27:36 +01005873 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005874 // too many states, retry with old engine
Bram Moolenaarfda37292014-11-05 14:27:36 +01005875 nfa_match = NFA_TOO_EXPENSIVE;
5876 goto theend;
5877 }
5878
Bram Moolenaar0270f382018-07-17 05:43:58 +02005879 thislist->id = rex.nfa_listid;
5880 nextlist->id = rex.nfa_listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005881
5882#ifdef ENABLE_LOG
5883 fprintf(log_fd, "------------------------------------------\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02005884 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005885 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005886 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005887 {
5888 int i;
5889
5890 for (i = 0; i < thislist->n; i++)
5891 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5892 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005893 fprintf(log_fd, "\n");
5894#endif
5895
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005896#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005897 fprintf(debug, "\n-------------------\n");
5898#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005899 /*
5900 * If the state lists are empty we can stop.
5901 */
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005902 if (thislist->n == 0)
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005903 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005904
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005905 // compute nextlist
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005906 for (listidx = 0; listidx < thislist->n; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005907 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005908 // If the list gets very long there probably is something wrong.
5909 // At least allow interrupting with CTRL-C.
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005910 fast_breakcheck();
5911 if (got_int)
5912 break;
5913#ifdef FEAT_RELTIME
5914 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
5915 {
5916 nfa_time_count = 0;
5917 if (nfa_did_time_out())
5918 break;
5919 }
5920#endif
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005921 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005922
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005923#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005924 nfa_set_code(t->state->c);
5925 fprintf(debug, "%s, ", code);
5926#endif
5927#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005928 {
5929 int col;
5930
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02005931 if (t->subs.norm.in_use <= 0)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005932 col = -1;
5933 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005934 col = t->subs.norm.list.multi[0].start_col;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005935 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005936 col = (int)(t->subs.norm.list.line[0].start - rex.line);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005937 nfa_set_code(t->state->c);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005938 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005939 abs(t->state->id), (int)t->state->c, code, col,
5940 pim_info(&t->pim));
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005941 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005942#endif
5943
5944 /*
5945 * Handle the possible codes of the current state.
5946 * The most important is NFA_MATCH.
5947 */
Bram Moolenaara2d95102013-06-04 14:23:05 +02005948 add_state = NULL;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005949 add_here = FALSE;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005950 add_count = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005951 switch (t->state->c)
5952 {
5953 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005954 {
Bram Moolenaaref2dff52020-12-21 14:54:32 +01005955 // If the match is not at the start of the line, ends before a
5956 // composing characters and rex.reg_icombine is not set, that
5957 // is not really a match.
5958 if (enc_utf8 && !rex.reg_icombine
5959 && rex.input != rex.line && utf_iscomposing(curc))
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005960 break;
Bram Moolenaara12a1612019-01-24 16:39:02 +01005961
Bram Moolenaar963fee22013-05-26 21:47:28 +02005962 nfa_match = TRUE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005963 copy_sub(&submatch->norm, &t->subs.norm);
5964#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005965 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005966 copy_sub(&submatch->synt, &t->subs.synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005967#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005968#ifdef ENABLE_LOG
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005969 log_subsexpr(&t->subs);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005970#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005971 // Found the left-most longest match, do not look at any other
5972 // states at this position. When the list of states is going
5973 // to be empty quit without advancing, so that "rex.input" is
5974 // correct.
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005975 if (nextlist->n == 0)
Bram Moolenaar57a285b2013-05-26 16:57:28 +02005976 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005977 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005978 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005979
5980 case NFA_END_INVISIBLE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005981 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02005982 case NFA_END_PATTERN:
Bram Moolenaarf46da702013-06-02 22:37:42 +02005983 /*
5984 * This is only encountered after a NFA_START_INVISIBLE or
Bram Moolenaar61602c52013-06-01 19:54:43 +02005985 * NFA_START_INVISIBLE_BEFORE node.
5986 * They surround a zero-width group, used with "\@=", "\&",
5987 * "\@!", "\@<=" and "\@<!".
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005988 * If we got here, it means that the current "invisible" group
5989 * finished successfully, so return control to the parent
Bram Moolenaarf46da702013-06-02 22:37:42 +02005990 * nfa_regmatch(). For a look-behind match only when it ends
5991 * in the position in "nfa_endp".
5992 * Submatches are stored in *m, and used in the parent call.
5993 */
Bram Moolenaar61602c52013-06-01 19:54:43 +02005994#ifdef ENABLE_LOG
Bram Moolenaarf46da702013-06-02 22:37:42 +02005995 if (nfa_endp != NULL)
5996 {
5997 if (REG_MULTI)
5998 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005999 (int)rex.lnum,
Bram Moolenaarf46da702013-06-02 22:37:42 +02006000 (int)nfa_endp->se_u.pos.lnum,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006001 (int)(rex.input - rex.line),
Bram Moolenaarf46da702013-06-02 22:37:42 +02006002 nfa_endp->se_u.pos.col);
6003 else
6004 fprintf(log_fd, "Current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02006005 (int)(rex.input - rex.line),
6006 (int)(nfa_endp->se_u.ptr - rex.input));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006007 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02006008#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006009 // If "nfa_endp" is set it's only a match if it ends at
6010 // "nfa_endp"
Bram Moolenaarf46da702013-06-02 22:37:42 +02006011 if (nfa_endp != NULL && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006012 ? (rex.lnum != nfa_endp->se_u.pos.lnum
6013 || (int)(rex.input - rex.line)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006014 != nfa_endp->se_u.pos.col)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006015 : rex.input != nfa_endp->se_u.ptr))
Bram Moolenaarf46da702013-06-02 22:37:42 +02006016 break;
6017
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006018 // do not set submatches for \@!
Bram Moolenaardecd9542013-06-07 16:31:50 +02006019 if (t->state->c != NFA_END_INVISIBLE_NEG)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006020 {
6021 copy_sub(&m->norm, &t->subs.norm);
6022#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006023 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006024 copy_sub(&m->synt, &t->subs.synt);
6025#endif
6026 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006027#ifdef ENABLE_LOG
6028 fprintf(log_fd, "Match found:\n");
6029 log_subsexpr(m);
6030#endif
Bram Moolenaarf46da702013-06-02 22:37:42 +02006031 nfa_match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006032 // See comment above at "goto nextchar".
Bram Moolenaar78c93e42013-09-05 16:05:36 +02006033 if (nextlist->n == 0)
6034 clen = 0;
6035 goto nextchar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006036
6037 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006038 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006039 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006040 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar61602c52013-06-01 19:54:43 +02006041 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006042 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006043 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006044 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006045 {
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02006046#ifdef ENABLE_LOG
6047 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
6048 failure_chance(t->state->out, 0),
6049 failure_chance(t->state->out1->out, 0));
Bram Moolenaarb76591e2013-06-04 21:42:22 +02006050#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006051 // Do it directly if there already is a PIM or when
6052 // nfa_postprocess() detected it will work better.
Bram Moolenaara2947e22013-06-11 22:44:09 +02006053 if (t->pim.result != NFA_PIM_UNUSED
6054 || t->state->c == NFA_START_INVISIBLE_FIRST
6055 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6056 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
6057 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006058 {
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006059 int in_use = m->norm.in_use;
6060
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006061 // Copy submatch info for the recursive call, opposite
6062 // of what happens on success below.
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006063 copy_sub_off(&m->norm, &t->subs.norm);
Bram Moolenaar699c1202013-09-25 16:41:54 +02006064#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006065 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006066 copy_sub_off(&m->synt, &t->subs.synt);
6067#endif
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006068
Bram Moolenaara2d95102013-06-04 14:23:05 +02006069 /*
6070 * First try matching the invisible match, then what
6071 * follows.
6072 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006073 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006074 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006075 if (result == NFA_TOO_EXPENSIVE)
6076 {
6077 nfa_match = result;
6078 goto theend;
6079 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006080
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006081 // for \@! and \@<! it is a match when the result is
6082 // FALSE
Bram Moolenaardecd9542013-06-07 16:31:50 +02006083 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006084 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6085 || t->state->c
6086 == NFA_START_INVISIBLE_BEFORE_NEG
6087 || t->state->c
6088 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006089 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006090 // Copy submatch info from the recursive call
Bram Moolenaara2d95102013-06-04 14:23:05 +02006091 copy_sub_off(&t->subs.norm, &m->norm);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006092#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006093 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006094 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006095#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006096 // If the pattern has \ze and it matched in the
6097 // sub pattern, use it.
Bram Moolenaarf2118842013-09-25 18:16:38 +02006098 copy_ze_off(&t->subs.norm, &m->norm);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02006099
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006100 // t->state->out1 is the corresponding
6101 // END_INVISIBLE node; Add its out to the current
6102 // list (zero-width match).
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006103 add_here = TRUE;
6104 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006105 }
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006106 m->norm.in_use = in_use;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006107 }
6108 else
6109 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006110 nfa_pim_T pim;
6111
Bram Moolenaara2d95102013-06-04 14:23:05 +02006112 /*
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006113 * First try matching what follows. Only if a match
6114 * is found verify the invisible match matches. Add a
6115 * nfa_pim_T to the following states, it contains info
6116 * about the invisible match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006117 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006118 pim.state = t->state;
6119 pim.result = NFA_PIM_TODO;
6120 pim.subs.norm.in_use = 0;
6121#ifdef FEAT_SYN_HL
6122 pim.subs.synt.in_use = 0;
6123#endif
6124 if (REG_MULTI)
6125 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006126 pim.end.pos.col = (int)(rex.input - rex.line);
6127 pim.end.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006128 }
6129 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006130 pim.end.ptr = rex.input;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006131
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006132 // t->state->out1 is the corresponding END_INVISIBLE
6133 // node; Add its out to the current list (zero-width
6134 // match).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006135 if (addstate_here(thislist, t->state->out1->out,
6136 &t->subs, &pim, &listidx) == NULL)
6137 {
6138 nfa_match = NFA_TOO_EXPENSIVE;
6139 goto theend;
6140 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006141 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006142 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006143 break;
6144
Bram Moolenaar87953742013-06-05 18:52:40 +02006145 case NFA_START_PATTERN:
Bram Moolenaar43e02982013-06-07 17:31:29 +02006146 {
6147 nfa_state_T *skip = NULL;
6148#ifdef ENABLE_LOG
6149 int skip_lid = 0;
6150#endif
6151
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006152 // There is no point in trying to match the pattern if the
6153 // output state is not going to be added to the list.
Bram Moolenaar43e02982013-06-07 17:31:29 +02006154 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6155 {
6156 skip = t->state->out1->out;
6157#ifdef ENABLE_LOG
6158 skip_lid = nextlist->id;
6159#endif
6160 }
6161 else if (state_in_list(nextlist,
6162 t->state->out1->out->out, &t->subs))
6163 {
6164 skip = t->state->out1->out->out;
6165#ifdef ENABLE_LOG
6166 skip_lid = nextlist->id;
6167#endif
6168 }
Bram Moolenaar44c71db2013-06-14 22:33:51 +02006169 else if (state_in_list(thislist,
Bram Moolenaar43e02982013-06-07 17:31:29 +02006170 t->state->out1->out->out, &t->subs))
6171 {
6172 skip = t->state->out1->out->out;
6173#ifdef ENABLE_LOG
6174 skip_lid = thislist->id;
6175#endif
6176 }
6177 if (skip != NULL)
6178 {
6179#ifdef ENABLE_LOG
6180 nfa_set_code(skip->c);
6181 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6182 abs(skip->id), skip_lid, skip->c, code);
6183#endif
6184 break;
6185 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006186 // Copy submatch info to the recursive call, opposite of what
6187 // happens afterwards.
Bram Moolenaar699c1202013-09-25 16:41:54 +02006188 copy_sub_off(&m->norm, &t->subs.norm);
6189#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006190 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006191 copy_sub_off(&m->synt, &t->subs.synt);
6192#endif
Bram Moolenaar43e02982013-06-07 17:31:29 +02006193
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006194 // First try matching the pattern.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006195 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006196 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006197 if (result == NFA_TOO_EXPENSIVE)
6198 {
6199 nfa_match = result;
6200 goto theend;
6201 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006202 if (result)
6203 {
6204 int bytelen;
6205
6206#ifdef ENABLE_LOG
6207 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6208 log_subsexpr(m);
6209#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006210 // Copy submatch info from the recursive call
Bram Moolenaar87953742013-06-05 18:52:40 +02006211 copy_sub_off(&t->subs.norm, &m->norm);
6212#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006213 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006214 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaar87953742013-06-05 18:52:40 +02006215#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006216 // Now we need to skip over the matched text and then
6217 // continue with what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02006218 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006219 // TODO: multi-line match
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01006220 bytelen = m->norm.list.multi[0].end_col
Bram Moolenaar0270f382018-07-17 05:43:58 +02006221 - (int)(rex.input - rex.line);
Bram Moolenaar87953742013-06-05 18:52:40 +02006222 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006223 bytelen = (int)(m->norm.list.line[0].end - rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02006224
6225#ifdef ENABLE_LOG
6226 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6227#endif
6228 if (bytelen == 0)
6229 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006230 // empty match, output of corresponding
6231 // NFA_END_PATTERN/NFA_SKIP to be used at current
6232 // position
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006233 add_here = TRUE;
6234 add_state = t->state->out1->out->out;
Bram Moolenaar87953742013-06-05 18:52:40 +02006235 }
6236 else if (bytelen <= clen)
6237 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006238 // match current character, output of corresponding
6239 // NFA_END_PATTERN to be used at next position.
Bram Moolenaar87953742013-06-05 18:52:40 +02006240 add_state = t->state->out1->out->out;
6241 add_off = clen;
6242 }
6243 else
6244 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006245 // skip over the matched characters, set character
6246 // count in NFA_SKIP
Bram Moolenaar87953742013-06-05 18:52:40 +02006247 add_state = t->state->out1->out;
6248 add_off = bytelen;
6249 add_count = bytelen - clen;
6250 }
6251 }
6252 break;
Bram Moolenaar43e02982013-06-07 17:31:29 +02006253 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006254
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006255 case NFA_BOL:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006256 if (rex.input == rex.line)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006257 {
6258 add_here = TRUE;
6259 add_state = t->state->out;
6260 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006261 break;
6262
6263 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006264 if (curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006265 {
6266 add_here = TRUE;
6267 add_state = t->state->out;
6268 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006269 break;
6270
6271 case NFA_BOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006272 result = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006273
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006274 if (curc == NUL)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006275 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006276 else if (has_mbyte)
6277 {
6278 int this_class;
6279
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006280 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006281 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006282 if (this_class <= 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006283 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006284 else if (reg_prev_class() == this_class)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006285 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006286 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006287 else if (!vim_iswordc_buf(curc, rex.reg_buf)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006288 || (rex.input > rex.line
6289 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006290 result = FALSE;
6291 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006292 {
6293 add_here = TRUE;
6294 add_state = t->state->out;
6295 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006296 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006297
6298 case NFA_EOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006299 result = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006300 if (rex.input == rex.line)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006301 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006302 else if (has_mbyte)
6303 {
6304 int this_class, prev_class;
6305
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006306 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006307 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006308 prev_class = reg_prev_class();
6309 if (this_class == prev_class
6310 || prev_class == 0 || prev_class == 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006311 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006312 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02006313 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6314 || (rex.input[0] != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006315 && vim_iswordc_buf(curc, rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006316 result = FALSE;
6317 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006318 {
6319 add_here = TRUE;
6320 add_state = t->state->out;
6321 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006322 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006323
Bram Moolenaar4b780632013-05-31 22:14:52 +02006324 case NFA_BOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006325 if (rex.lnum == 0 && rex.input == rex.line
Bram Moolenaar6100d022016-10-02 16:51:57 +02006326 && (!REG_MULTI || rex.reg_firstlnum == 1))
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006327 {
6328 add_here = TRUE;
6329 add_state = t->state->out;
6330 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006331 break;
6332
6333 case NFA_EOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006334 if (rex.lnum == rex.reg_maxline && curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006335 {
6336 add_here = TRUE;
6337 add_state = t->state->out;
6338 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006339 break;
6340
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006341 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006342 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006343 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02006344 int len = 0;
6345 nfa_state_T *end;
6346 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006347 int cchars[MAX_MCO];
6348 int ccount = 0;
6349 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006350
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006351 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006352 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006353 if (utf_iscomposing(sta->c))
6354 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006355 // Only match composing character(s), ignore base
6356 // character. Used for ".{composing}" and "{composing}"
6357 // (no preceding character).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006358 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006359 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006360 if (rex.reg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006361 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006362 // If \Z was present, then ignore composing characters.
6363 // When ignoring the base character this always matches.
Bram Moolenaardff72ba2018-02-08 22:45:17 +01006364 if (sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006365 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006366 else
6367 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006368 while (sta->c != NFA_END_COMPOSING)
6369 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006370 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006371
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006372 // Check base character matches first, unless ignored.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006373 else if (len > 0 || mc == sta->c)
6374 {
6375 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006376 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006377 len += mb_char2len(mc);
6378 sta = sta->out;
6379 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006380
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006381 // We don't care about the order of composing characters.
6382 // Get them into cchars[] first.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006383 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006384 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006385 mc = mb_ptr2char(rex.input + len);
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006386 cchars[ccount++] = mc;
6387 len += mb_char2len(mc);
6388 if (ccount == MAX_MCO)
6389 break;
6390 }
6391
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006392 // Check that each composing char in the pattern matches a
6393 // composing char in the text. We do not check if all
6394 // composing chars are matched.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006395 result = OK;
6396 while (sta->c != NFA_END_COMPOSING)
6397 {
6398 for (j = 0; j < ccount; ++j)
6399 if (cchars[j] == sta->c)
6400 break;
6401 if (j == ccount)
6402 {
6403 result = FAIL;
6404 break;
6405 }
6406 sta = sta->out;
6407 }
6408 }
6409 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02006410 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006411
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006412 end = t->state->out1; // NFA_END_COMPOSING
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006413 ADD_STATE_IF_MATCH(end);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006414 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006415 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006416
6417 case NFA_NEWL:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006418 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006419 && rex.lnum <= rex.reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006420 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02006421 go_to_nextline = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006422 // Pass -1 for the offset, which means taking the position
6423 // at the start of the next line.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006424 add_state = t->state->out;
6425 add_off = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006426 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006427 else if (curc == '\n' && rex.reg_line_lbr)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006428 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006429 // match \n as if it is an ordinary character
Bram Moolenaara2d95102013-06-04 14:23:05 +02006430 add_state = t->state->out;
6431 add_off = 1;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006432 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006433 break;
6434
Bram Moolenaar417bad22013-06-07 14:08:30 +02006435 case NFA_START_COLL:
6436 case NFA_START_NEG_COLL:
6437 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006438 // What follows is a list of characters, until NFA_END_COLL.
6439 // One of them must match or none of them must match.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006440 nfa_state_T *state;
6441 int result_if_matched;
6442 int c1, c2;
6443
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006444 // Never match EOL. If it's part of the collection it is added
6445 // as a separate state with an OR.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006446 if (curc == NUL)
6447 break;
6448
6449 state = t->state->out;
6450 result_if_matched = (t->state->c == NFA_START_COLL);
6451 for (;;)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006452 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02006453 if (state->c == NFA_END_COLL)
6454 {
6455 result = !result_if_matched;
6456 break;
6457 }
6458 if (state->c == NFA_RANGE_MIN)
6459 {
6460 c1 = state->val;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006461 state = state->out; // advance to NFA_RANGE_MAX
Bram Moolenaar417bad22013-06-07 14:08:30 +02006462 c2 = state->val;
6463#ifdef ENABLE_LOG
6464 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6465 curc, c1, c2);
6466#endif
6467 if (curc >= c1 && curc <= c2)
6468 {
6469 result = result_if_matched;
6470 break;
6471 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006472 if (rex.reg_ic)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006473 {
Bram Moolenaar59de4172020-06-09 19:34:54 +02006474 int curc_low = MB_CASEFOLD(curc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02006475 int done = FALSE;
6476
6477 for ( ; c1 <= c2; ++c1)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006478 if (MB_CASEFOLD(c1) == curc_low)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006479 {
6480 result = result_if_matched;
6481 done = TRUE;
6482 break;
6483 }
6484 if (done)
6485 break;
6486 }
6487 }
6488 else if (state->c < 0 ? check_char_class(state->c, curc)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01006489 : (curc == state->c
Bram Moolenaar59de4172020-06-09 19:34:54 +02006490 || (rex.reg_ic && MB_CASEFOLD(curc)
6491 == MB_CASEFOLD(state->c))))
Bram Moolenaar417bad22013-06-07 14:08:30 +02006492 {
6493 result = result_if_matched;
6494 break;
6495 }
6496 state = state->out;
6497 }
6498 if (result)
6499 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006500 // next state is in out of the NFA_END_COLL, out1 of
6501 // START points to the END state
Bram Moolenaar417bad22013-06-07 14:08:30 +02006502 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006503 add_off = clen;
6504 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006505 break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02006506 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006507
6508 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006509 // Any char except '\0', (end of input) does not match.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006510 if (curc > 0)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006511 {
Bram Moolenaara2d95102013-06-04 14:23:05 +02006512 add_state = t->state->out;
6513 add_off = clen;
6514 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006515 break;
6516
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006517 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006518 // On a composing character skip over it. Otherwise do
6519 // nothing. Always matches.
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006520 if (enc_utf8 && utf_iscomposing(curc))
6521 {
6522 add_off = clen;
6523 }
6524 else
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006525 {
6526 add_here = TRUE;
6527 add_off = 0;
6528 }
6529 add_state = t->state->out;
6530 break;
6531
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006532 /*
6533 * Character classes like \a for alpha, \d for digit etc.
6534 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006535 case NFA_IDENT: // \i
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006536 result = vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006537 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006538 break;
6539
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006540 case NFA_SIDENT: // \I
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006541 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006542 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006543 break;
6544
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006545 case NFA_KWORD: // \k
Bram Moolenaar0270f382018-07-17 05:43:58 +02006546 result = vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006547 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006548 break;
6549
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006550 case NFA_SKWORD: // \K
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006551 result = !VIM_ISDIGIT(curc)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006552 && vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006553 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006554 break;
6555
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006556 case NFA_FNAME: // \f
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006557 result = vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006558 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006559 break;
6560
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006561 case NFA_SFNAME: // \F
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006562 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006563 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006564 break;
6565
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006566 case NFA_PRINT: // \p
Bram Moolenaar0270f382018-07-17 05:43:58 +02006567 result = vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006568 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006569 break;
6570
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006571 case NFA_SPRINT: // \P
Bram Moolenaar0270f382018-07-17 05:43:58 +02006572 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006573 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006574 break;
6575
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006576 case NFA_WHITE: // \s
Bram Moolenaar1c465442017-03-12 20:10:05 +01006577 result = VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006578 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006579 break;
6580
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006581 case NFA_NWHITE: // \S
Bram Moolenaar1c465442017-03-12 20:10:05 +01006582 result = curc != NUL && !VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006583 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006584 break;
6585
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006586 case NFA_DIGIT: // \d
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006587 result = ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006588 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006589 break;
6590
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006591 case NFA_NDIGIT: // \D
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006592 result = curc != NUL && !ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006593 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006594 break;
6595
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006596 case NFA_HEX: // \x
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006597 result = ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006598 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006599 break;
6600
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006601 case NFA_NHEX: // \X
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006602 result = curc != NUL && !ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006603 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006604 break;
6605
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006606 case NFA_OCTAL: // \o
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006607 result = ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006608 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006609 break;
6610
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006611 case NFA_NOCTAL: // \O
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006612 result = curc != NUL && !ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006613 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006614 break;
6615
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006616 case NFA_WORD: // \w
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006617 result = ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006618 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006619 break;
6620
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006621 case NFA_NWORD: // \W
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006622 result = curc != NUL && !ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006623 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006624 break;
6625
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006626 case NFA_HEAD: // \h
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006627 result = ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006628 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006629 break;
6630
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006631 case NFA_NHEAD: // \H
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006632 result = curc != NUL && !ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006633 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006634 break;
6635
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006636 case NFA_ALPHA: // \a
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006637 result = ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006638 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006639 break;
6640
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006641 case NFA_NALPHA: // \A
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006642 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006643 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006644 break;
6645
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006646 case NFA_LOWER: // \l
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006647 result = ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006648 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006649 break;
6650
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006651 case NFA_NLOWER: // \L
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006652 result = curc != NUL && !ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006653 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006654 break;
6655
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006656 case NFA_UPPER: // \u
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006657 result = ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006658 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006659 break;
6660
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006661 case NFA_NUPPER: // \U
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006662 result = curc != NUL && !ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006663 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006664 break;
6665
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006666 case NFA_LOWER_IC: // [a-z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006667 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006668 ADD_STATE_IF_MATCH(t->state);
6669 break;
6670
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006671 case NFA_NLOWER_IC: // [^a-z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006672 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006673 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006674 ADD_STATE_IF_MATCH(t->state);
6675 break;
6676
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006677 case NFA_UPPER_IC: // [A-Z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006678 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006679 ADD_STATE_IF_MATCH(t->state);
6680 break;
6681
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006682 case NFA_NUPPER_IC: // ^[A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006683 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006684 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006685 ADD_STATE_IF_MATCH(t->state);
6686 break;
6687
Bram Moolenaar5714b802013-05-28 22:03:20 +02006688 case NFA_BACKREF1:
6689 case NFA_BACKREF2:
6690 case NFA_BACKREF3:
6691 case NFA_BACKREF4:
6692 case NFA_BACKREF5:
6693 case NFA_BACKREF6:
6694 case NFA_BACKREF7:
6695 case NFA_BACKREF8:
6696 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006697#ifdef FEAT_SYN_HL
6698 case NFA_ZREF1:
6699 case NFA_ZREF2:
6700 case NFA_ZREF3:
6701 case NFA_ZREF4:
6702 case NFA_ZREF5:
6703 case NFA_ZREF6:
6704 case NFA_ZREF7:
6705 case NFA_ZREF8:
6706 case NFA_ZREF9:
6707#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006708 // \1 .. \9 \z1 .. \z9
Bram Moolenaar5714b802013-05-28 22:03:20 +02006709 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006710 int subidx;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006711 int bytelen;
6712
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006713 if (t->state->c <= NFA_BACKREF9)
6714 {
6715 subidx = t->state->c - NFA_BACKREF1 + 1;
6716 result = match_backref(&t->subs.norm, subidx, &bytelen);
6717 }
6718#ifdef FEAT_SYN_HL
6719 else
6720 {
6721 subidx = t->state->c - NFA_ZREF1 + 1;
6722 result = match_zref(subidx, &bytelen);
6723 }
6724#endif
6725
Bram Moolenaar5714b802013-05-28 22:03:20 +02006726 if (result)
6727 {
6728 if (bytelen == 0)
6729 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006730 // empty match always works, output of NFA_SKIP to be
6731 // used next
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006732 add_here = TRUE;
6733 add_state = t->state->out->out;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006734 }
6735 else if (bytelen <= clen)
6736 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006737 // match current character, jump ahead to out of
6738 // NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006739 add_state = t->state->out->out;
6740 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006741 }
6742 else
6743 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006744 // skip over the matched characters, set character
6745 // count in NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006746 add_state = t->state->out;
6747 add_off = bytelen;
6748 add_count = bytelen - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006749 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02006750 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02006751 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006752 }
6753 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006754 // character of previous matching \1 .. \9 or \@>
Bram Moolenaar5714b802013-05-28 22:03:20 +02006755 if (t->count - clen <= 0)
6756 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006757 // end of match, go to what follows
Bram Moolenaara2d95102013-06-04 14:23:05 +02006758 add_state = t->state->out;
6759 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006760 }
6761 else
6762 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006763 // add state again with decremented count
Bram Moolenaara2d95102013-06-04 14:23:05 +02006764 add_state = t->state;
6765 add_off = 0;
6766 add_count = t->count - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006767 }
6768 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006769
Bram Moolenaar423532e2013-05-29 21:14:42 +02006770 case NFA_LNUM:
6771 case NFA_LNUM_GT:
6772 case NFA_LNUM_LT:
6773 result = (REG_MULTI &&
6774 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006775 (long_u)(rex.lnum + rex.reg_firstlnum)));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006776 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006777 {
6778 add_here = TRUE;
6779 add_state = t->state->out;
6780 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006781 break;
6782
6783 case NFA_COL:
6784 case NFA_COL_GT:
6785 case NFA_COL_LT:
6786 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006787 (long_u)(rex.input - rex.line) + 1);
Bram Moolenaar423532e2013-05-29 21:14:42 +02006788 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006789 {
6790 add_here = TRUE;
6791 add_state = t->state->out;
6792 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006793 break;
6794
6795 case NFA_VCOL:
6796 case NFA_VCOL_GT:
6797 case NFA_VCOL_LT:
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006798 {
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006799 int op = t->state->c - NFA_VCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006800 colnr_T col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar6100d022016-10-02 16:51:57 +02006801 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006802
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006803 // Bail out quickly when there can't be a match, avoid the
6804 // overhead of win_linetabsize() on long lines.
Bram Moolenaar4f36dc32015-03-05 17:16:06 +01006805 if (op != 1 && col > t->state->val
Bram Moolenaara12a1612019-01-24 16:39:02 +01006806 * (has_mbyte ? MB_MAXBYTES : 1))
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006807 break;
Bram Moolenaaref795d12015-01-18 16:46:32 +01006808 result = FALSE;
6809 if (op == 1 && col - 1 > t->state->val && col > 100)
6810 {
6811 int ts = wp->w_buffer->b_p_ts;
6812
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006813 // Guess that a character won't use more columns than
6814 // 'tabstop', with a minimum of 4.
Bram Moolenaaref795d12015-01-18 16:46:32 +01006815 if (ts < 4)
6816 ts = 4;
6817 result = col > t->state->val * ts;
6818 }
6819 if (!result)
6820 result = nfa_re_num_cmp(t->state->val, op,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006821 (long_u)win_linetabsize(wp, rex.line, col) + 1);
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006822 if (result)
6823 {
6824 add_here = TRUE;
6825 add_state = t->state->out;
6826 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006827 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006828 break;
6829
Bram Moolenaar044aa292013-06-04 21:27:38 +02006830 case NFA_MARK:
6831 case NFA_MARK_GT:
6832 case NFA_MARK_LT:
6833 {
Bram Moolenaar64066b92021-11-17 18:22:56 +00006834 size_t col = rex.input - rex.line;
Bram Moolenaar6100d022016-10-02 16:51:57 +02006835 pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
Bram Moolenaar044aa292013-06-04 21:27:38 +02006836
Bram Moolenaar64066b92021-11-17 18:22:56 +00006837 // Line may have been freed, get it again.
6838 if (REG_MULTI)
6839 {
6840 rex.line = reg_getline(rex.lnum);
6841 rex.input = rex.line + col;
6842 }
6843
Bram Moolenaar872bee52021-05-24 22:56:15 +02006844 // Compare the mark position to the match position, if the mark
6845 // exists and mark is set in reg_buf.
6846 if (pos != NULL && pos->lnum > 0)
6847 {
6848 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6849 && pos->col == MAXCOL
6850 ? (colnr_T)STRLEN(reg_getline(
6851 pos->lnum - rex.reg_firstlnum))
6852 : pos->col;
6853
6854 result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6855 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006856 ? t->state->c == NFA_MARK
Bram Moolenaar872bee52021-05-24 22:56:15 +02006857 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006858 ? t->state->c == NFA_MARK_GT
6859 : t->state->c == NFA_MARK_LT))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006860 : (pos->lnum < rex.lnum + rex.reg_firstlnum
Bram Moolenaar044aa292013-06-04 21:27:38 +02006861 ? t->state->c == NFA_MARK_GT
Bram Moolenaar872bee52021-05-24 22:56:15 +02006862 : t->state->c == NFA_MARK_LT));
6863 if (result)
6864 {
6865 add_here = TRUE;
6866 add_state = t->state->out;
6867 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006868 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02006869 break;
6870 }
6871
Bram Moolenaar423532e2013-05-29 21:14:42 +02006872 case NFA_CURSOR:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006873 result = (rex.reg_win != NULL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006874 && (rex.lnum + rex.reg_firstlnum
Bram Moolenaar6100d022016-10-02 16:51:57 +02006875 == rex.reg_win->w_cursor.lnum)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006876 && ((colnr_T)(rex.input - rex.line)
Bram Moolenaar6100d022016-10-02 16:51:57 +02006877 == rex.reg_win->w_cursor.col));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006878 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006879 {
6880 add_here = TRUE;
6881 add_state = t->state->out;
6882 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006883 break;
6884
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006885 case NFA_VISUAL:
6886 result = reg_match_visual();
6887 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006888 {
6889 add_here = TRUE;
6890 add_state = t->state->out;
6891 }
Bram Moolenaar973fced2013-06-05 21:10:59 +02006892 break;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006893
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006894 case NFA_MOPEN1:
6895 case NFA_MOPEN2:
6896 case NFA_MOPEN3:
6897 case NFA_MOPEN4:
6898 case NFA_MOPEN5:
6899 case NFA_MOPEN6:
6900 case NFA_MOPEN7:
6901 case NFA_MOPEN8:
6902 case NFA_MOPEN9:
6903#ifdef FEAT_SYN_HL
6904 case NFA_ZOPEN:
6905 case NFA_ZOPEN1:
6906 case NFA_ZOPEN2:
6907 case NFA_ZOPEN3:
6908 case NFA_ZOPEN4:
6909 case NFA_ZOPEN5:
6910 case NFA_ZOPEN6:
6911 case NFA_ZOPEN7:
6912 case NFA_ZOPEN8:
6913 case NFA_ZOPEN9:
6914#endif
6915 case NFA_NOPEN:
6916 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006917 // These states are only added to be able to bail out when
6918 // they are added again, nothing is to be done.
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006919 break;
6920
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006921 default: // regular character
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006922 {
6923 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006924
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006925#ifdef DEBUG
Bram Moolenaardecd9542013-06-07 16:31:50 +02006926 if (c < 0)
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00006927 siemsg("INTERNAL: Negative state char: %ld", (long)c);
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006928#endif
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006929 result = (c == curc);
6930
Bram Moolenaar6100d022016-10-02 16:51:57 +02006931 if (!result && rex.reg_ic)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006932 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006933 // If rex.reg_icombine is not set only skip over the character
6934 // itself. When it is set skip over composing characters.
Bram Moolenaar6100d022016-10-02 16:51:57 +02006935 if (result && enc_utf8 && !rex.reg_icombine)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006936 clen = utf_ptr2len(rex.input);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006937 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006938 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006939 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006940
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006941 } // switch (t->state->c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006942
6943 if (add_state != NULL)
6944 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006945 nfa_pim_T *pim;
Bram Moolenaara951e352013-10-06 15:46:11 +02006946 nfa_pim_T pim_copy;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006947
6948 if (t->pim.result == NFA_PIM_UNUSED)
6949 pim = NULL;
6950 else
6951 pim = &t->pim;
6952
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006953 // Handle the postponed invisible match if the match might end
6954 // without advancing and before the end of the line.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006955 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006956 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006957 if (pim->result == NFA_PIM_TODO)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006958 {
6959#ifdef ENABLE_LOG
6960 fprintf(log_fd, "\n");
6961 fprintf(log_fd, "==================================\n");
6962 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6963 fprintf(log_fd, "\n");
6964#endif
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006965 result = recursive_regmatch(pim->state, pim,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006966 prog, submatch, m, &listids, &listids_len);
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006967 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006968 // for \@! and \@<! it is a match when the result is
6969 // FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006970 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006971 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6972 || pim->state->c
6973 == NFA_START_INVISIBLE_BEFORE_NEG
6974 || pim->state->c
6975 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006976 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006977 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006978 copy_sub_off(&pim->subs.norm, &m->norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006979#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006980 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006981 copy_sub_off(&pim->subs.synt, &m->synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006982#endif
6983 }
6984 }
6985 else
6986 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006987 result = (pim->result == NFA_PIM_MATCH);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006988#ifdef ENABLE_LOG
6989 fprintf(log_fd, "\n");
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006990 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006991 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6992 fprintf(log_fd, "\n");
6993#endif
6994 }
6995
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006996 // for \@! and \@<! it is a match when result is FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006997 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006998 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6999 || pim->state->c
7000 == NFA_START_INVISIBLE_BEFORE_NEG
7001 || pim->state->c
7002 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02007003 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007004 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007005 copy_sub_off(&t->subs.norm, &pim->subs.norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007006#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02007007 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007008 copy_sub_off(&t->subs.synt, &pim->subs.synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007009#endif
7010 }
7011 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007012 // look-behind match failed, don't add the state
Bram Moolenaara2d95102013-06-04 14:23:05 +02007013 continue;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007014
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007015 // Postponed invisible match was handled, don't add it to
7016 // following states.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007017 pim = NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02007018 }
7019
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007020 // If "pim" points into l->t it will become invalid when
7021 // adding the state causes the list to be reallocated. Make a
7022 // local copy to avoid that.
Bram Moolenaara951e352013-10-06 15:46:11 +02007023 if (pim == &t->pim)
7024 {
7025 copy_pim(&pim_copy, pim);
7026 pim = &pim_copy;
7027 }
7028
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007029 if (add_here)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007030 r = addstate_here(thislist, add_state, &t->subs,
7031 pim, &listidx);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007032 else
7033 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007034 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007035 if (add_count > 0)
7036 nextlist->t[nextlist->n - 1].count = add_count;
7037 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007038 if (r == NULL)
7039 {
7040 nfa_match = NFA_TOO_EXPENSIVE;
7041 goto theend;
7042 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007043 }
7044
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007045 } // for (thislist = thislist; thislist->state; thislist++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007046
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007047 // Look for the start of a match in the current position by adding the
7048 // start state to the list of states.
7049 // The first found match is the leftmost one, thus the order of states
7050 // matters!
7051 // Do not add the start state in recursive calls of nfa_regmatch(),
7052 // because recursive calls should only start in the first position.
7053 // Unless "nfa_endp" is not NULL, then we match the end position.
7054 // Also don't start a match past the first line.
Bram Moolenaar61602c52013-06-01 19:54:43 +02007055 if (nfa_match == FALSE
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007056 && ((toplevel
Bram Moolenaar0270f382018-07-17 05:43:58 +02007057 && rex.lnum == 0
Bram Moolenaar61602c52013-06-01 19:54:43 +02007058 && clen != 0
Bram Moolenaar6100d022016-10-02 16:51:57 +02007059 && (rex.reg_maxcol == 0
Bram Moolenaar0270f382018-07-17 05:43:58 +02007060 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
Bram Moolenaar307aa162013-06-02 16:34:21 +02007061 || (nfa_endp != NULL
Bram Moolenaar61602c52013-06-01 19:54:43 +02007062 && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007063 ? (rex.lnum < nfa_endp->se_u.pos.lnum
7064 || (rex.lnum == nfa_endp->se_u.pos.lnum
7065 && (int)(rex.input - rex.line)
Bram Moolenaar307aa162013-06-02 16:34:21 +02007066 < nfa_endp->se_u.pos.col))
Bram Moolenaar0270f382018-07-17 05:43:58 +02007067 : rex.input < nfa_endp->se_u.ptr))))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007068 {
7069#ifdef ENABLE_LOG
7070 fprintf(log_fd, "(---) STARTSTATE\n");
7071#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007072 // Inline optimized code for addstate() if we know the state is
7073 // the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007074 if (toplevel)
7075 {
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007076 int add = TRUE;
7077 int c;
7078
7079 if (prog->regstart != NUL && clen != 0)
7080 {
7081 if (nextlist->n == 0)
7082 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007083 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007084
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007085 // Nextlist is empty, we can skip ahead to the
7086 // character that must appear at the start.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007087 if (skip_to_start(prog->regstart, &col) == FAIL)
7088 break;
7089#ifdef ENABLE_LOG
7090 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02007091 col - ((colnr_T)(rex.input - rex.line) + clen));
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007092#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007093 rex.input = rex.line + col - clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007094 }
7095 else
7096 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007097 // Checking if the required start character matches is
7098 // cheaper than adding a state that won't match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02007099 c = PTR2CHAR(rex.input + clen);
Bram Moolenaar6100d022016-10-02 16:51:57 +02007100 if (c != prog->regstart && (!rex.reg_ic
Bram Moolenaar59de4172020-06-09 19:34:54 +02007101 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007102 {
7103#ifdef ENABLE_LOG
7104 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7105#endif
7106 add = FALSE;
7107 }
7108 }
7109 }
7110
7111 if (add)
7112 {
7113 if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007114 m->norm.list.multi[0].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02007115 (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007116 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007117 m->norm.list.line[0].start = rex.input + clen;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007118 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7119 {
7120 nfa_match = NFA_TOO_EXPENSIVE;
7121 goto theend;
7122 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007123 }
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007124 }
7125 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007126 {
7127 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7128 {
7129 nfa_match = NFA_TOO_EXPENSIVE;
7130 goto theend;
7131 }
7132 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007133 }
7134
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007135#ifdef ENABLE_LOG
7136 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007137 {
7138 int i;
7139
7140 for (i = 0; i < thislist->n; i++)
7141 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7142 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007143 fprintf(log_fd, "\n");
7144#endif
7145
7146nextchar:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007147 // Advance to the next character, or advance to the next line, or
7148 // finish.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007149 if (clen != 0)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007150 rex.input += clen;
Bram Moolenaar307aa162013-06-02 16:34:21 +02007151 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007152 && rex.lnum < nfa_endp->se_u.pos.lnum))
Bram Moolenaar35b23862013-05-22 23:00:40 +02007153 reg_nextline();
7154 else
7155 break;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007156
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007157 // Allow interrupting with CTRL-C.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007158 line_breakcheck();
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007159 if (got_int)
7160 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007161#ifdef FEAT_RELTIME
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007162 // Check for timeout only once every twenty iterations to avoid overhead.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007163 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
7164 {
7165 nfa_time_count = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007166 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007167 break;
7168 }
7169#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02007170 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007171
7172#ifdef ENABLE_LOG
7173 if (log_fd != stderr)
7174 fclose(log_fd);
7175 log_fd = NULL;
7176#endif
7177
7178theend:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007179 // Free memory
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007180 vim_free(list[0].t);
7181 vim_free(list[1].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02007182 vim_free(listids);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007183#undef ADD_STATE_IF_MATCH
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02007184#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007185 fclose(debug);
7186#endif
7187
Bram Moolenaar963fee22013-05-26 21:47:28 +02007188 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007189}
7190
7191/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02007192 * Try to match "prog" starting at rex.line["col"].
Bram Moolenaar8c731502014-11-23 15:57:49 +01007193 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007194 */
7195 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007196nfa_regtry(
7197 nfa_regprog_T *prog,
7198 colnr_T col,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007199 proftime_T *tm UNUSED, // timeout limit or NULL
7200 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007201{
7202 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007203 regsubs_T subs, m;
7204 nfa_state_T *start = prog->start;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007205 int result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007206#ifdef ENABLE_LOG
7207 FILE *f;
7208#endif
7209
Bram Moolenaar0270f382018-07-17 05:43:58 +02007210 rex.input = rex.line + col;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007211#ifdef FEAT_RELTIME
7212 nfa_time_limit = tm;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007213 nfa_timed_out = timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007214 nfa_time_count = 0;
7215#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007216
7217#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007218 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007219 if (f != NULL)
7220 {
Bram Moolenaar87953742013-06-05 18:52:40 +02007221 fprintf(f, "\n\n\t=======================================================\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007222#ifdef DEBUG
7223 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
7224#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007225 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02007226 fprintf(f, "\t=======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02007227 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007228 fprintf(f, "\n\n");
7229 fclose(f);
7230 }
7231 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01007232 emsg("Could not open temporary log file for writing");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007233#endif
7234
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007235 clear_sub(&subs.norm);
7236 clear_sub(&m.norm);
7237#ifdef FEAT_SYN_HL
7238 clear_sub(&subs.synt);
7239 clear_sub(&m.synt);
7240#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007241
Bram Moolenaarfda37292014-11-05 14:27:36 +01007242 result = nfa_regmatch(prog, start, &subs, &m);
7243 if (result == FALSE)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007244 return 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007245 else if (result == NFA_TOO_EXPENSIVE)
7246 return result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007247
7248 cleanup_subexpr();
7249 if (REG_MULTI)
7250 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007251 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007252 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007253 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7254 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007255
Bram Moolenaar6100d022016-10-02 16:51:57 +02007256 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7257 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007258 }
7259
Bram Moolenaar6100d022016-10-02 16:51:57 +02007260 if (rex.reg_startpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007261 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007262 rex.reg_startpos[0].lnum = 0;
7263 rex.reg_startpos[0].col = col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007264 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02007265 if (rex.reg_endpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007266 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007267 // pattern has a \ze but it didn't match, use current end
Bram Moolenaar0270f382018-07-17 05:43:58 +02007268 rex.reg_endpos[0].lnum = rex.lnum;
7269 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007270 }
7271 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007272 // Use line number of "\ze".
Bram Moolenaar0270f382018-07-17 05:43:58 +02007273 rex.lnum = rex.reg_endpos[0].lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007274 }
7275 else
7276 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007277 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007278 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007279 rex.reg_startp[i] = subs.norm.list.line[i].start;
7280 rex.reg_endp[i] = subs.norm.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007281 }
7282
Bram Moolenaar6100d022016-10-02 16:51:57 +02007283 if (rex.reg_startp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007284 rex.reg_startp[0] = rex.line + col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007285 if (rex.reg_endp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007286 rex.reg_endp[0] = rex.input;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007287 }
7288
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007289#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007290 // Package any found \z(...\) matches for export. Default is none.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007291 unref_extmatch(re_extmatch_out);
7292 re_extmatch_out = NULL;
7293
7294 if (prog->reghasz == REX_SET)
7295 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007296 cleanup_zsubexpr();
7297 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01007298 if (re_extmatch_out == NULL)
7299 return 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007300 // Loop over \z1, \z2, etc. There is no \z0.
Bram Moolenaar5ad075c2015-11-24 15:18:32 +01007301 for (i = 1; i < subs.synt.in_use; i++)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007302 {
7303 if (REG_MULTI)
7304 {
7305 struct multipos *mpos = &subs.synt.list.multi[i];
7306
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007307 // Only accept single line matches that are valid.
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007308 if (mpos->start_lnum >= 0
7309 && mpos->start_lnum == mpos->end_lnum
7310 && mpos->end_col >= mpos->start_col)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007311 re_extmatch_out->matches[i] =
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007312 vim_strnsave(reg_getline(mpos->start_lnum)
7313 + mpos->start_col,
7314 mpos->end_col - mpos->start_col);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007315 }
7316 else
7317 {
7318 struct linepos *lpos = &subs.synt.list.line[i];
7319
7320 if (lpos->start != NULL && lpos->end != NULL)
7321 re_extmatch_out->matches[i] =
Bram Moolenaar71ccd032020-06-12 22:59:11 +02007322 vim_strnsave(lpos->start, lpos->end - lpos->start);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007323 }
7324 }
7325 }
7326#endif
7327
Bram Moolenaar0270f382018-07-17 05:43:58 +02007328 return 1 + rex.lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007329}
7330
7331/*
7332 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02007333 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007334 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007335 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007336 */
7337 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007338nfa_regexec_both(
7339 char_u *line,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007340 colnr_T startcol, // column to start looking for match
7341 proftime_T *tm, // timeout limit or NULL
7342 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007343{
7344 nfa_regprog_T *prog;
7345 long retval = 0L;
7346 int i;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007347 colnr_T col = startcol;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007348
7349 if (REG_MULTI)
7350 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007351 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007352 line = reg_getline((linenr_T)0); // relative to the cursor
Bram Moolenaar6100d022016-10-02 16:51:57 +02007353 rex.reg_startpos = rex.reg_mmatch->startpos;
7354 rex.reg_endpos = rex.reg_mmatch->endpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007355 }
7356 else
7357 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007358 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7359 rex.reg_startp = rex.reg_match->startp;
7360 rex.reg_endp = rex.reg_match->endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007361 }
7362
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007363 // Be paranoid...
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007364 if (prog == NULL || line == NULL)
7365 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02007366 iemsg(_(e_null_argument));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007367 goto theend;
7368 }
7369
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007370 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007371 if (prog->regflags & RF_ICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007372 rex.reg_ic = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007373 else if (prog->regflags & RF_NOICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007374 rex.reg_ic = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007375
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007376 // If pattern contains "\Z" overrule value of rex.reg_icombine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007377 if (prog->regflags & RF_ICOMBINE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007378 rex.reg_icombine = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007379
Bram Moolenaar0270f382018-07-17 05:43:58 +02007380 rex.line = line;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007381 rex.lnum = 0; // relative to line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007382
Bram Moolenaar0270f382018-07-17 05:43:58 +02007383 rex.nfa_has_zend = prog->has_zend;
7384 rex.nfa_has_backref = prog->has_backref;
7385 rex.nfa_nsubexpr = prog->nsubexp;
7386 rex.nfa_listid = 1;
7387 rex.nfa_alt_listid = 2;
7388#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007389 nfa_regengine.expr = prog->pattern;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007390#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007391
Bram Moolenaard89616e2013-06-06 18:46:06 +02007392 if (prog->reganch && col > 0)
7393 return 0L;
7394
Bram Moolenaar0270f382018-07-17 05:43:58 +02007395 rex.need_clear_subexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007396#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007397 // Clear the external match subpointers if necessary.
Bram Moolenaar473de612013-06-08 18:19:48 +02007398 if (prog->reghasz == REX_SET)
7399 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007400 rex.nfa_has_zsubexpr = TRUE;
7401 rex.need_clear_zsubexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007402 }
7403 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007404 {
7405 rex.nfa_has_zsubexpr = FALSE;
7406 rex.need_clear_zsubexpr = FALSE;
7407 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007408#endif
7409
Bram Moolenaard89616e2013-06-06 18:46:06 +02007410 if (prog->regstart != NUL)
Bram Moolenaar473de612013-06-08 18:19:48 +02007411 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007412 // Skip ahead until a character we know the match must start with.
7413 // When there is none there is no match.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007414 if (skip_to_start(prog->regstart, &col) == FAIL)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007415 return 0L;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007416
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007417 // If match_text is set it contains the full text that must match.
7418 // Nothing else to try. Doesn't handle combining chars well.
Bram Moolenaara12a1612019-01-24 16:39:02 +01007419 if (prog->match_text != NULL && !rex.reg_icombine)
Bram Moolenaar473de612013-06-08 18:19:48 +02007420 return find_match_text(col, prog->regstart, prog->match_text);
7421 }
7422
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007423 // If the start column is past the maximum column: no need to try.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007424 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007425 goto theend;
7426
Bram Moolenaar0270f382018-07-17 05:43:58 +02007427 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7428 // it's accidentally used during execution.
7429 nstate = 0;
7430 for (i = 0; i < prog->nstate; ++i)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007431 {
7432 prog->state[i].id = i;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02007433 prog->state[i].lastlist[0] = 0;
7434 prog->state[i].lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007435 }
7436
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007437 retval = nfa_regtry(prog, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007438
Bram Moolenaar0270f382018-07-17 05:43:58 +02007439#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007440 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007441#endif
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007442
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007443theend:
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007444 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007445 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007446 // Make sure the end is never before the start. Can happen when \zs and
7447 // \ze are used.
7448 if (REG_MULTI)
7449 {
7450 lpos_T *start = &rex.reg_mmatch->startpos[0];
7451 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007452
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007453 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007454 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007455 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7456 }
7457 else
7458 {
7459 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7460 rex.reg_match->endp[0] = rex.reg_match->startp[0];
7461 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007462 }
7463
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007464 return retval;
7465}
7466
7467/*
7468 * Compile a regular expression into internal code for the NFA matcher.
7469 * Returns the program in allocated space. Returns NULL for an error.
7470 */
7471 static regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01007472nfa_regcomp(char_u *expr, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007473{
Bram Moolenaaraae48832013-05-25 21:18:34 +02007474 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02007475 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007476 int *postfix;
7477
7478 if (expr == NULL)
7479 return NULL;
7480
Bram Moolenaar0270f382018-07-17 05:43:58 +02007481#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007482 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007483#endif
Bram Moolenaare0ad3652015-01-27 12:59:55 +01007484 nfa_re_flags = re_flags;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007485
7486 init_class_tab();
7487
7488 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7489 return NULL;
7490
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007491 // Build postfix form of the regexp. Needed to build the NFA
7492 // (and count its size).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007493 postfix = re2post();
7494 if (postfix == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007495 goto fail; // Cascaded (syntax?) error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007496
7497 /*
7498 * In order to build the NFA, we parse the input regexp twice:
7499 * 1. first pass to count size (so we can allocate space)
7500 * 2. second to emit code
7501 */
7502#ifdef ENABLE_LOG
7503 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007504 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007505
7506 if (f != NULL)
7507 {
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02007508 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007509 fclose(f);
7510 }
7511 }
7512#endif
7513
7514 /*
7515 * PASS 1
7516 * Count number of NFA states in "nstate". Do not build the NFA.
7517 */
7518 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007519
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007520 // allocate the regprog with space for the compiled regexp
Bram Moolenaar16619a22013-06-11 18:42:36 +02007521 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02007522 prog = alloc(prog_size);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007523 if (prog == NULL)
7524 goto fail;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007525 state_ptr = prog->state;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007526 prog->re_in_use = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007527
7528 /*
7529 * PASS 2
7530 * Build the NFA
7531 */
7532 prog->start = post2nfa(postfix, post_ptr, FALSE);
7533 if (prog->start == NULL)
7534 goto fail;
7535
7536 prog->regflags = regflags;
7537 prog->engine = &nfa_regengine;
7538 prog->nstate = nstate;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007539 prog->has_zend = rex.nfa_has_zend;
7540 prog->has_backref = rex.nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02007541 prog->nsubexp = regnpar;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007542
Bram Moolenaara2947e22013-06-11 22:44:09 +02007543 nfa_postprocess(prog);
7544
Bram Moolenaard89616e2013-06-06 18:46:06 +02007545 prog->reganch = nfa_get_reganch(prog->start, 0);
7546 prog->regstart = nfa_get_regstart(prog->start, 0);
Bram Moolenaar473de612013-06-08 18:19:48 +02007547 prog->match_text = nfa_get_match_text(prog->start);
7548
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007549#ifdef ENABLE_LOG
7550 nfa_postfix_dump(expr, OK);
7551 nfa_dump(prog);
7552#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007553#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007554 // Remember whether this pattern has any \z specials in it.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007555 prog->reghasz = re_has_z;
7556#endif
Bram Moolenaar473de612013-06-08 18:19:48 +02007557 prog->pattern = vim_strsave(expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02007558#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007559 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007560#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007561
7562out:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007563 VIM_CLEAR(post_start);
7564 post_ptr = post_end = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007565 state_ptr = NULL;
7566 return (regprog_T *)prog;
7567
7568fail:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007569 VIM_CLEAR(prog);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007570#ifdef ENABLE_LOG
7571 nfa_postfix_dump(expr, FAIL);
7572#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007573#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007574 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007575#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007576 goto out;
7577}
7578
Bram Moolenaar473de612013-06-08 18:19:48 +02007579/*
7580 * Free a compiled regexp program, returned by nfa_regcomp().
7581 */
7582 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01007583nfa_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02007584{
7585 if (prog != NULL)
7586 {
7587 vim_free(((nfa_regprog_T *)prog)->match_text);
Bram Moolenaar473de612013-06-08 18:19:48 +02007588 vim_free(((nfa_regprog_T *)prog)->pattern);
Bram Moolenaar473de612013-06-08 18:19:48 +02007589 vim_free(prog);
7590 }
7591}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007592
7593/*
7594 * Match a regexp against a string.
7595 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7596 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaar2af78a12014-04-23 19:06:37 +02007597 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007598 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007599 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007600 */
7601 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01007602nfa_regexec_nl(
7603 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007604 char_u *line, // string to match against
7605 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01007606 int line_lbr)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007607{
Bram Moolenaar6100d022016-10-02 16:51:57 +02007608 rex.reg_match = rmp;
7609 rex.reg_mmatch = NULL;
7610 rex.reg_maxline = 0;
7611 rex.reg_line_lbr = line_lbr;
7612 rex.reg_buf = curbuf;
7613 rex.reg_win = NULL;
7614 rex.reg_ic = rmp->rm_ic;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007615 rex.reg_icombine = FALSE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007616 rex.reg_maxcol = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007617 return nfa_regexec_both(line, col, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007618}
7619
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007620
7621/*
7622 * Match a regexp against multiple lines.
7623 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7624 * Uses curbuf for line count and 'iskeyword'.
7625 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007626 * Return <= 0 if there is no match. Return number of lines contained in the
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007627 * match otherwise.
7628 *
7629 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7630 *
7631 * ! Also NOTE : match may actually be in another line. e.g.:
7632 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7633 *
7634 * +-------------------------+
7635 * |a |
7636 * |b |
7637 * |c |
7638 * | |
7639 * +-------------------------+
7640 *
7641 * then nfa_regexec_multi() returns 3. while the original
7642 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7643 *
7644 * FIXME if this behavior is not compatible.
7645 */
7646 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007647nfa_regexec_multi(
7648 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007649 win_T *win, // window in which to search or NULL
7650 buf_T *buf, // buffer in which to search
7651 linenr_T lnum, // nr of line to start looking for match
7652 colnr_T col, // column to start looking for match
7653 proftime_T *tm, // timeout limit or NULL
7654 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007655{
Bram Moolenaarf4140482020-02-15 23:06:45 +01007656 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007657 return nfa_regexec_both(NULL, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007658}
7659
// ENABLE_LOG is only defined for DEBUG builds at the top of this file.  Since
// this file is included in "regexp.c", undefine it again here so that it does
// not stay defined for the code that follows the include.
#if defined(DEBUG)
# undef ENABLE_LOG
#endif