/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * NFA regular expression implementation.
 *
 * This file is included in "regexp.c".
 */

/*
 * Logging of the NFA engine.
 *
 * The NFA engine can write four log files:
 * - Error log: Contains the NFA engine's fatal errors.
 * - Dump log:  Contains information about the compiled NFA state machine.
 * - Run log:   Contains information about the matching procedure.
 * - Debug log: Contains detailed information about the matching procedure.
 *              Can be disabled by undefining NFA_REGEXP_DEBUG_LOG.
 * The first one can also be used without debug mode.
 * The last three are enabled when compiled in debug mode and can be disabled
 * individually by commenting them out.
 * The log files can get quite big!
 * To disable all of this when compiling Vim for debugging, undefine DEBUG in
 * regexp.c
 */
#ifdef DEBUG
# define NFA_REGEXP_ERROR_LOG   "nfa_regexp_error.log"
# define ENABLE_LOG
# define NFA_REGEXP_DUMP_LOG    "nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG     "nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG   "nfa_regexp_debug.log"
#endif

// Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
// This must equal the number of items from NFA_ANY up to and including
// NFA_NUPPER_IC, so that the "with NL" range starts right after it.
#define NFA_ADD_NL      31

/*
 * Codes used for the items of the postfix form and for the states of the NFA.
 * Numbering starts at -1024, thus every code is negative.  The order of the
 * items matters: the numbered groups (NFA_MOPEN .. NFA_MOPEN9 and friends)
 * and the character classes NFA_ANY .. NFA_NUPPER_IC must stay contiguous.
 */
enum
{
    NFA_SPLIT = -1024,
    NFA_MATCH,
    NFA_EMPTY,                      // matches 0-length

    NFA_START_COLL,                 // [abc] start
    NFA_END_COLL,                   // [abc] end
    NFA_START_NEG_COLL,             // [^abc] start
    NFA_END_NEG_COLL,               // [^abc] end (postfix only)
    NFA_RANGE,                      // range of the two previous items
                                    // (postfix only)
    NFA_RANGE_MIN,                  // low end of a range
    NFA_RANGE_MAX,                  // high end of a range

    NFA_CONCAT,                     // concatenate two previous items (postfix
                                    // only)
    NFA_OR,                         // \| (postfix only)
    NFA_STAR,                       // greedy * (postfix only)
    NFA_STAR_NONGREEDY,             // non-greedy * (postfix only)
    NFA_QUEST,                      // greedy \? (postfix only)
    NFA_QUEST_NONGREEDY,            // non-greedy \? (postfix only)

    NFA_BOL,                        // ^    Begin line
    NFA_EOL,                        // $    End line
    NFA_BOW,                        // \<   Begin word
    NFA_EOW,                        // \>   End word
    NFA_BOF,                        // \%^  Begin file
    NFA_EOF,                        // \%$  End file
    NFA_NEWL,
    NFA_ZSTART,                     // Used for \zs
    NFA_ZEND,                       // Used for \ze
    NFA_NOPEN,                      // Start of subexpression marked with \%(
    NFA_NCLOSE,                     // End of subexpr. marked with \%( ... \)
    NFA_START_INVISIBLE,
    NFA_START_INVISIBLE_FIRST,
    NFA_START_INVISIBLE_NEG,
    NFA_START_INVISIBLE_NEG_FIRST,
    NFA_START_INVISIBLE_BEFORE,
    NFA_START_INVISIBLE_BEFORE_FIRST,
    NFA_START_INVISIBLE_BEFORE_NEG,
    NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
    NFA_START_PATTERN,
    NFA_END_INVISIBLE,
    NFA_END_INVISIBLE_NEG,
    NFA_END_PATTERN,
    NFA_COMPOSING,                  // Next nodes in NFA are part of the
                                    // composing multibyte char
    NFA_END_COMPOSING,              // End of a composing char in the NFA
    NFA_ANY_COMPOSING,              // \%C: Any composing characters.
    NFA_OPT_CHARS,                  // \%[abc]

    // The following are used only in the postfix form, not in the NFA
    NFA_PREV_ATOM_NO_WIDTH,         // Used for \@=
    NFA_PREV_ATOM_NO_WIDTH_NEG,     // Used for \@!
    NFA_PREV_ATOM_JUST_BEFORE,      // Used for \@<=
    NFA_PREV_ATOM_JUST_BEFORE_NEG,  // Used for \@<!
    NFA_PREV_ATOM_LIKE_PATTERN,     // Used for \@>

    NFA_BACKREF1,                   // \1
    NFA_BACKREF2,                   // \2
    NFA_BACKREF3,                   // \3
    NFA_BACKREF4,                   // \4
    NFA_BACKREF5,                   // \5
    NFA_BACKREF6,                   // \6
    NFA_BACKREF7,                   // \7
    NFA_BACKREF8,                   // \8
    NFA_BACKREF9,                   // \9
#ifdef FEAT_SYN_HL
    NFA_ZREF1,                      // \z1
    NFA_ZREF2,                      // \z2
    NFA_ZREF3,                      // \z3
    NFA_ZREF4,                      // \z4
    NFA_ZREF5,                      // \z5
    NFA_ZREF6,                      // \z6
    NFA_ZREF7,                      // \z7
    NFA_ZREF8,                      // \z8
    NFA_ZREF9,                      // \z9
#endif
    NFA_SKIP,                       // Skip characters

    NFA_MOPEN,
    NFA_MOPEN1,
    NFA_MOPEN2,
    NFA_MOPEN3,
    NFA_MOPEN4,
    NFA_MOPEN5,
    NFA_MOPEN6,
    NFA_MOPEN7,
    NFA_MOPEN8,
    NFA_MOPEN9,

    NFA_MCLOSE,
    NFA_MCLOSE1,
    NFA_MCLOSE2,
    NFA_MCLOSE3,
    NFA_MCLOSE4,
    NFA_MCLOSE5,
    NFA_MCLOSE6,
    NFA_MCLOSE7,
    NFA_MCLOSE8,
    NFA_MCLOSE9,

#ifdef FEAT_SYN_HL
    NFA_ZOPEN,
    NFA_ZOPEN1,
    NFA_ZOPEN2,
    NFA_ZOPEN3,
    NFA_ZOPEN4,
    NFA_ZOPEN5,
    NFA_ZOPEN6,
    NFA_ZOPEN7,
    NFA_ZOPEN8,
    NFA_ZOPEN9,

    NFA_ZCLOSE,
    NFA_ZCLOSE1,
    NFA_ZCLOSE2,
    NFA_ZCLOSE3,
    NFA_ZCLOSE4,
    NFA_ZCLOSE5,
    NFA_ZCLOSE6,
    NFA_ZCLOSE7,
    NFA_ZCLOSE8,
    NFA_ZCLOSE9,
#endif

    // NFA_FIRST_NL
    NFA_ANY,                        // Match any one character.
    NFA_IDENT,                      // Match identifier char
    NFA_SIDENT,                     // Match identifier char but no digit
    NFA_KWORD,                      // Match keyword char
    NFA_SKWORD,                     // Match word char but no digit
    NFA_FNAME,                      // Match file name char
    NFA_SFNAME,                     // Match file name char but no digit
    NFA_PRINT,                      // Match printable char
    NFA_SPRINT,                     // Match printable char but no digit
    NFA_WHITE,                      // Match whitespace char
    NFA_NWHITE,                     // Match non-whitespace char
    NFA_DIGIT,                      // Match digit char
    NFA_NDIGIT,                     // Match non-digit char
    NFA_HEX,                        // Match hex char
    NFA_NHEX,                       // Match non-hex char
    NFA_OCTAL,                      // Match octal char
    NFA_NOCTAL,                     // Match non-octal char
    NFA_WORD,                       // Match word char
    NFA_NWORD,                      // Match non-word char
    NFA_HEAD,                       // Match head char
    NFA_NHEAD,                      // Match non-head char
    NFA_ALPHA,                      // Match alpha char
    NFA_NALPHA,                     // Match non-alpha char
    NFA_LOWER,                      // Match lowercase char
    NFA_NLOWER,                     // Match non-lowercase char
    NFA_UPPER,                      // Match uppercase char
    NFA_NUPPER,                     // Match non-uppercase char
    NFA_LOWER_IC,                   // Match [a-z]
    NFA_NLOWER_IC,                  // Match [^a-z]
    NFA_UPPER_IC,                   // Match [A-Z]
    NFA_NUPPER_IC,                  // Match [^A-Z]

    // The classes above with NFA_ADD_NL added: the same class that also
    // includes a NL.  The codes that follow continue after NFA_LAST_NL.
    NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
    NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

    NFA_CURSOR,                     // Match cursor pos
    NFA_LNUM,                       // Match line number
    NFA_LNUM_GT,                    // Match > line number
    NFA_LNUM_LT,                    // Match < line number
    NFA_COL,                        // Match cursor column
    NFA_COL_GT,                     // Match > cursor column
    NFA_COL_LT,                     // Match < cursor column
    NFA_VCOL,                       // Match cursor virtual column
    NFA_VCOL_GT,                    // Match > cursor virtual column
    NFA_VCOL_LT,                    // Match < cursor virtual column
    NFA_MARK,                       // Match mark
    NFA_MARK_GT,                    // Match > mark
    NFA_MARK_LT,                    // Match < mark
    NFA_VISUAL,                     // Match Visual area

    // Character classes [:alnum:] etc
    NFA_CLASS_ALNUM,
    NFA_CLASS_ALPHA,
    NFA_CLASS_BLANK,
    NFA_CLASS_CNTRL,
    NFA_CLASS_DIGIT,
    NFA_CLASS_GRAPH,
    NFA_CLASS_LOWER,
    NFA_CLASS_PRINT,
    NFA_CLASS_PUNCT,
    NFA_CLASS_SPACE,
    NFA_CLASS_UPPER,
    NFA_CLASS_XDIGIT,
    NFA_CLASS_TAB,
    NFA_CLASS_RETURN,
    NFA_CLASS_BACKSPACE,
    NFA_CLASS_ESCAPE,
    NFA_CLASS_IDENT,
    NFA_CLASS_KEYWORD,
    NFA_CLASS_FNAME
};

Bram Moolenaar63d9e732019-12-05 21:10:38 +0100235// Keep in sync with classchars.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200236static int nfa_classcodes[] = {
237 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243 NFA_UPPER, NFA_NUPPER
244};
245
Bram Moolenaar0270f382018-07-17 05:43:58 +0200246// Variables only used in nfa_regcomp() and descendants.
247static int nfa_re_flags; // re_flags passed to nfa_regcomp()
248static int *post_start; // holds the postfix form of r.e.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200249static int *post_end;
250static int *post_ptr;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100251
252// Set when the pattern should use the NFA engine.
253// E.g. [[:upper:]] only allows 8bit characters for BT engine,
254// while NFA engine handles multibyte characters correctly.
255static int wants_nfa;
256
Bram Moolenaar0270f382018-07-17 05:43:58 +0200257static int nstate; // Number of states in the NFA.
258static int istate; // Index in the state vector, used in alloc_state()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200259
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100260// If not NULL match must end at this position
Bram Moolenaar307aa162013-06-02 16:34:21 +0200261static save_se_T *nfa_endp = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200262
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100263// 0 for first call to nfa_regmatch(), 1 for recursive call.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +0200264static int nfa_ll_index = 0;
265
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100266static int realloc_post_list(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100267static int nfa_reg(int paren);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200268#ifdef DEBUG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100269static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200270#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100271static int match_follows(nfa_state_T *startstate, int depth);
272static int failure_chance(nfa_state_T *state, int depth);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200273
// helper functions used when doing re2post() ... regatom() parsing

// Append item "c" to the postfix list.  When the list is full it is first
// enlarged with realloc_post_list().
// NOTE: when enlarging fails this executes "return FAIL" in the calling
// function, thus it can only be used in functions returning OK/FAIL.
#define EMIT(c) do { \
        if (post_ptr >= post_end && realloc_post_list() == FAIL) \
            return FAIL; \
        *post_ptr++ = (c); \
    } while (0)

Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200281/*
282 * Initialize internal variables before NFA compilation.
283 * Return OK on success, FAIL otherwise.
284 */
285 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100286nfa_regcomp_start(
287 char_u *expr,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100288 int re_flags) // see vim_regcomp()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200289{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200290 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200291 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200292
293 nstate = 0;
294 istate = 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295 // A reasonable estimation for maximum size
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200296 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200297
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100298 // Some items blow up in size, such as [A-z]. Add more space for that.
299 // When it is still not enough realloc_post_list() will be used.
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200300 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200301
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302 // Size for postfix representation of expr.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200303 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200304
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200305 post_start = alloc(postfix_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200306 if (post_start == NULL)
307 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200308 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200309 post_end = post_start + nstate_max;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100310 wants_nfa = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +0200311 rex.nfa_has_zend = FALSE;
312 rex.nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200313
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100314 // shared with BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200315 regcomp_start(expr, re_flags);
316
317 return OK;
318}
319
320/*
Bram Moolenaard89616e2013-06-06 18:46:06 +0200321 * Figure out if the NFA state list starts with an anchor, must match at start
322 * of the line.
323 */
324 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100325nfa_get_reganch(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200326{
327 nfa_state_T *p = start;
328
329 if (depth > 4)
330 return 0;
331
332 while (p != NULL)
333 {
334 switch (p->c)
335 {
336 case NFA_BOL:
337 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100338 return 1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200339
340 case NFA_ZSTART:
341 case NFA_ZEND:
342 case NFA_CURSOR:
343 case NFA_VISUAL:
344
345 case NFA_MOPEN:
346 case NFA_MOPEN1:
347 case NFA_MOPEN2:
348 case NFA_MOPEN3:
349 case NFA_MOPEN4:
350 case NFA_MOPEN5:
351 case NFA_MOPEN6:
352 case NFA_MOPEN7:
353 case NFA_MOPEN8:
354 case NFA_MOPEN9:
355 case NFA_NOPEN:
356#ifdef FEAT_SYN_HL
357 case NFA_ZOPEN:
358 case NFA_ZOPEN1:
359 case NFA_ZOPEN2:
360 case NFA_ZOPEN3:
361 case NFA_ZOPEN4:
362 case NFA_ZOPEN5:
363 case NFA_ZOPEN6:
364 case NFA_ZOPEN7:
365 case NFA_ZOPEN8:
366 case NFA_ZOPEN9:
367#endif
368 p = p->out;
369 break;
370
371 case NFA_SPLIT:
372 return nfa_get_reganch(p->out, depth + 1)
373 && nfa_get_reganch(p->out1, depth + 1);
374
375 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100376 return 0; // noooo
Bram Moolenaard89616e2013-06-06 18:46:06 +0200377 }
378 }
379 return 0;
380}
381
382/*
383 * Figure out if the NFA state list starts with a character which must match
384 * at start of the match.
385 */
386 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100387nfa_get_regstart(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200388{
389 nfa_state_T *p = start;
390
391 if (depth > 4)
392 return 0;
393
394 while (p != NULL)
395 {
396 switch (p->c)
397 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100398 // all kinds of zero-width matches
Bram Moolenaard89616e2013-06-06 18:46:06 +0200399 case NFA_BOL:
400 case NFA_BOF:
401 case NFA_BOW:
402 case NFA_EOW:
403 case NFA_ZSTART:
404 case NFA_ZEND:
405 case NFA_CURSOR:
406 case NFA_VISUAL:
407 case NFA_LNUM:
408 case NFA_LNUM_GT:
409 case NFA_LNUM_LT:
410 case NFA_COL:
411 case NFA_COL_GT:
412 case NFA_COL_LT:
413 case NFA_VCOL:
414 case NFA_VCOL_GT:
415 case NFA_VCOL_LT:
416 case NFA_MARK:
417 case NFA_MARK_GT:
418 case NFA_MARK_LT:
419
420 case NFA_MOPEN:
421 case NFA_MOPEN1:
422 case NFA_MOPEN2:
423 case NFA_MOPEN3:
424 case NFA_MOPEN4:
425 case NFA_MOPEN5:
426 case NFA_MOPEN6:
427 case NFA_MOPEN7:
428 case NFA_MOPEN8:
429 case NFA_MOPEN9:
430 case NFA_NOPEN:
431#ifdef FEAT_SYN_HL
432 case NFA_ZOPEN:
433 case NFA_ZOPEN1:
434 case NFA_ZOPEN2:
435 case NFA_ZOPEN3:
436 case NFA_ZOPEN4:
437 case NFA_ZOPEN5:
438 case NFA_ZOPEN6:
439 case NFA_ZOPEN7:
440 case NFA_ZOPEN8:
441 case NFA_ZOPEN9:
442#endif
443 p = p->out;
444 break;
445
446 case NFA_SPLIT:
447 {
448 int c1 = nfa_get_regstart(p->out, depth + 1);
449 int c2 = nfa_get_regstart(p->out1, depth + 1);
450
451 if (c1 == c2)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100452 return c1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200453 return 0;
454 }
455
456 default:
Bram Moolenaardecd9542013-06-07 16:31:50 +0200457 if (p->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100458 return p->c; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200459 return 0;
460 }
461 }
462 return 0;
463}
464
465/*
Bram Moolenaar473de612013-06-08 18:19:48 +0200466 * Figure out if the NFA state list contains just literal text and nothing
Bram Moolenaare7766ee2013-06-08 22:30:03 +0200467 * else. If so return a string in allocated memory with what must match after
468 * regstart. Otherwise return NULL.
Bram Moolenaar473de612013-06-08 18:19:48 +0200469 */
470 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100471nfa_get_match_text(nfa_state_T *start)
Bram Moolenaar473de612013-06-08 18:19:48 +0200472{
473 nfa_state_T *p = start;
474 int len = 0;
475 char_u *ret;
476 char_u *s;
477
478 if (p->c != NFA_MOPEN)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100479 return NULL; // just in case
Bram Moolenaar473de612013-06-08 18:19:48 +0200480 p = p->out;
481 while (p->c > 0)
482 {
483 len += MB_CHAR2LEN(p->c);
484 p = p->out;
485 }
486 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
487 return NULL;
488
489 ret = alloc(len);
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000490 if (ret == NULL)
491 return NULL;
492
493 p = start->out->out; // skip first char, it goes into regstart
494 s = ret;
495 while (p->c > 0)
Bram Moolenaar473de612013-06-08 18:19:48 +0200496 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000497 if (has_mbyte)
498 s += (*mb_char2bytes)(p->c, s);
499 else
500 *s++ = p->c;
501 p = p->out;
Bram Moolenaar473de612013-06-08 18:19:48 +0200502 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000503 *s = NUL;
Bram Moolenaar473de612013-06-08 18:19:48 +0200504 return ret;
505}
506
507/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200508 * Allocate more space for post_start. Called when
509 * running above the estimated number of states.
510 */
511 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100512realloc_post_list(void)
Bram Moolenaar16299b52013-05-30 18:45:23 +0200513{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200514 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100515 int new_max;
Bram Moolenaar16299b52013-05-30 18:45:23 +0200516 int *new_start;
517 int *old_start;
518
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100519 // For weird patterns the number of states can be very high. Increasing by
520 // 50% seems a reasonable compromise between memory use and speed.
521 new_max = nstate_max * 3 / 2;
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200522 new_start = ALLOC_MULT(int, new_max);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200523 if (new_start == NULL)
524 return FAIL;
525 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
Bram Moolenaar16299b52013-05-30 18:45:23 +0200526 old_start = post_start;
527 post_start = new_start;
528 post_ptr = new_start + (post_ptr - old_start);
529 post_end = post_start + new_max;
530 vim_free(old_start);
531 return OK;
532}
533
534/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200535 * Search between "start" and "end" and try to recognize a
536 * character class in expanded form. For example [0-9].
537 * On success, return the id the character class to be emitted.
538 * On failure, return 0 (=FAIL)
539 * Start points to the first char of the range, while end should point
540 * to the closing brace.
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200541 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
542 * need to be interpreted as [a-zA-Z].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200543 */
544 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100545nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200546{
Bram Moolenaarf8115092013-06-04 17:47:05 +0200547# define CLASS_not 0x80
548# define CLASS_af 0x40
549# define CLASS_AF 0x20
550# define CLASS_az 0x10
551# define CLASS_AZ 0x08
552# define CLASS_o7 0x04
553# define CLASS_o9 0x02
554# define CLASS_underscore 0x01
555
556 int newl = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200557 char_u *p;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200558 int config = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200559
560 if (extra_newl == TRUE)
561 newl = TRUE;
562
563 if (*end != ']')
564 return FAIL;
565 p = start;
566 if (*p == '^')
567 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200568 config |= CLASS_not;
Bram Moolenaar01d89dd2013-06-03 19:41:06 +0200569 p++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200570 }
571
572 while (p < end)
573 {
574 if (p + 2 < end && *(p + 1) == '-')
575 {
576 switch (*p)
577 {
578 case '0':
579 if (*(p + 2) == '9')
580 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200581 config |= CLASS_o9;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200582 break;
583 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200584 if (*(p + 2) == '7')
585 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200586 config |= CLASS_o7;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200587 break;
588 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200589 return FAIL;
590
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200591 case 'a':
592 if (*(p + 2) == 'z')
593 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200594 config |= CLASS_az;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200595 break;
596 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200597 if (*(p + 2) == 'f')
598 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200599 config |= CLASS_af;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200600 break;
601 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200602 return FAIL;
603
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200604 case 'A':
605 if (*(p + 2) == 'Z')
606 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200607 config |= CLASS_AZ;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200608 break;
609 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200610 if (*(p + 2) == 'F')
611 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200612 config |= CLASS_AF;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200613 break;
614 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200615 return FAIL;
616
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200617 default:
618 return FAIL;
619 }
620 p += 3;
621 }
622 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
623 {
624 newl = TRUE;
625 p += 2;
626 }
627 else if (*p == '_')
628 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200629 config |= CLASS_underscore;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200630 p ++;
631 }
632 else if (*p == '\n')
633 {
634 newl = TRUE;
635 p ++;
636 }
637 else
638 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100639 } // while (p < end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200640
641 if (p != end)
642 return FAIL;
643
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200644 if (newl == TRUE)
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200645 extra_newl = NFA_ADD_NL;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200646
647 switch (config)
648 {
649 case CLASS_o9:
650 return extra_newl + NFA_DIGIT;
651 case CLASS_not | CLASS_o9:
652 return extra_newl + NFA_NDIGIT;
653 case CLASS_af | CLASS_AF | CLASS_o9:
654 return extra_newl + NFA_HEX;
655 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
656 return extra_newl + NFA_NHEX;
657 case CLASS_o7:
658 return extra_newl + NFA_OCTAL;
659 case CLASS_not | CLASS_o7:
660 return extra_newl + NFA_NOCTAL;
661 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
662 return extra_newl + NFA_WORD;
663 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
664 return extra_newl + NFA_NWORD;
665 case CLASS_az | CLASS_AZ | CLASS_underscore:
666 return extra_newl + NFA_HEAD;
667 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
668 return extra_newl + NFA_NHEAD;
669 case CLASS_az | CLASS_AZ:
670 return extra_newl + NFA_ALPHA;
671 case CLASS_not | CLASS_az | CLASS_AZ:
672 return extra_newl + NFA_NALPHA;
673 case CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200674 return extra_newl + NFA_LOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200675 case CLASS_not | CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200676 return extra_newl + NFA_NLOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200677 case CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200678 return extra_newl + NFA_UPPER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200679 case CLASS_not | CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200680 return extra_newl + NFA_NUPPER_IC;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200681 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200682 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200683}
684
685/*
686 * Produce the bytes for equivalence class "c".
687 * Currently only handles latin1, latin9 and utf-8.
688 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
689 * equivalent to 'a OR b OR c'
690 *
691 * NOTE! When changing this function, also update reg_equi_class()
692 */
693 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100694nfa_emit_equi_class(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695{
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200696#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200697
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200698 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
699 || STRCMP(p_enc, "iso-8859-15") == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200700 {
Bram Moolenaar424bcae2022-01-31 14:59:41 +0000701#define A_grave 0xc0
702#define A_acute 0xc1
703#define A_circumflex 0xc2
704#define A_virguilla 0xc3
705#define A_diaeresis 0xc4
706#define A_ring 0xc5
707#define C_cedilla 0xc7
708#define E_grave 0xc8
709#define E_acute 0xc9
710#define E_circumflex 0xca
711#define E_diaeresis 0xcb
712#define I_grave 0xcc
713#define I_acute 0xcd
714#define I_circumflex 0xce
715#define I_diaeresis 0xcf
716#define N_virguilla 0xd1
717#define O_grave 0xd2
718#define O_acute 0xd3
719#define O_circumflex 0xd4
720#define O_virguilla 0xd5
721#define O_diaeresis 0xd6
722#define O_slash 0xd8
723#define U_grave 0xd9
724#define U_acute 0xda
725#define U_circumflex 0xdb
726#define U_diaeresis 0xdc
727#define Y_acute 0xdd
728#define a_grave 0xe0
729#define a_acute 0xe1
730#define a_circumflex 0xe2
731#define a_virguilla 0xe3
732#define a_diaeresis 0xe4
733#define a_ring 0xe5
734#define c_cedilla 0xe7
735#define e_grave 0xe8
736#define e_acute 0xe9
737#define e_circumflex 0xea
738#define e_diaeresis 0xeb
739#define i_grave 0xec
740#define i_acute 0xed
741#define i_circumflex 0xee
742#define i_diaeresis 0xef
743#define n_virguilla 0xf1
744#define o_grave 0xf2
745#define o_acute 0xf3
746#define o_circumflex 0xf4
747#define o_virguilla 0xf5
748#define o_diaeresis 0xf6
749#define o_slash 0xf8
750#define u_grave 0xf9
751#define u_acute 0xfa
752#define u_circumflex 0xfb
753#define u_diaeresis 0xfc
754#define y_acute 0xfd
755#define y_diaeresis 0xff
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200756 switch (c)
757 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200758 case 'A': case A_grave: case A_acute: case A_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200759 case A_virguilla: case A_diaeresis: case A_ring:
760 case 0x100: case 0x102: case 0x104: case 0x1cd:
761 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
762 case 0x202: case 0x226: case 0x23a: case 0x1e00:
763 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
764 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
765 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
766 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
767 EMIT2(A_circumflex) EMIT2(A_virguilla)
768 EMIT2(A_diaeresis) EMIT2(A_ring)
769 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
770 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
771 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
772 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
773 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
774 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
775 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
776 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200777 return OK;
778
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200779 case 'B': case 0x181: case 0x243: case 0x1e02:
780 case 0x1e04: case 0x1e06:
781 EMIT2('B')
782 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
783 EMIT2(0x1e04) EMIT2(0x1e06)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200784 return OK;
785
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200786 case 'C': case C_cedilla: case 0x106: case 0x108:
787 case 0x10a: case 0x10c: case 0x187: case 0x23b:
788 case 0x1e08: case 0xa792:
789 EMIT2('C') EMIT2(C_cedilla)
790 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
791 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
792 EMIT2(0x1e08) EMIT2(0xa792)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200793 return OK;
794
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200795 case 'D': case 0x10e: case 0x110: case 0x18a:
796 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
797 case 0x1e12:
798 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
799 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
800 EMIT2(0x1e10) EMIT2(0x1e12)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200801 return OK;
802
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200803 case 'E': case E_grave: case E_acute: case E_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200804 case E_diaeresis: case 0x112: case 0x114: case 0x116:
805 case 0x118: case 0x11a: case 0x204: case 0x206:
806 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
807 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
808 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
809 case 0x1ec2: case 0x1ec4: case 0x1ec6:
810 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
811 EMIT2(E_circumflex) EMIT2(E_diaeresis)
812 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
813 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
814 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
815 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
816 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
817 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
818 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
819 EMIT2(0x1ec6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200820 return OK;
821
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200822 case 'F': case 0x191: case 0x1e1e: case 0xa798:
823 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200824 return OK;
825
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200826 case 'G': case 0x11c: case 0x11e: case 0x120:
827 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
828 case 0x1f4: case 0x1e20: case 0xa7a0:
829 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
830 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
831 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
832 EMIT2(0xa7a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200833 return OK;
834
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200835 case 'H': case 0x124: case 0x126: case 0x21e:
836 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
837 case 0x1e2a: case 0x2c67:
838 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
839 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
840 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200841 return OK;
842
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200843 case 'I': case I_grave: case I_acute: case I_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200844 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
845 case 0x12e: case 0x130: case 0x197: case 0x1cf:
846 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
847 case 0x1ec8: case 0x1eca:
848 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
849 EMIT2(I_circumflex) EMIT2(I_diaeresis)
850 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
851 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
852 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
853 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
854 EMIT2(0x1eca)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200855 return OK;
856
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200857 case 'J': case 0x134: case 0x248:
858 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200859 return OK;
860
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200861 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
862 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
863 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
864 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
865 EMIT2(0x2c69) EMIT2(0xa740)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200866 return OK;
867
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200868 case 'L': case 0x139: case 0x13b: case 0x13d:
869 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
870 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
871 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
872 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
873 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
874 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200875 return OK;
876
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200877 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
878 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
879 EMIT2(0x1e42)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200880 return OK;
881
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200882 case 'N': case N_virguilla:
883 case 0x143: case 0x145: case 0x147: case 0x1f8:
884 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
885 case 0xa7a4:
886 EMIT2('N') EMIT2(N_virguilla)
887 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
888 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
889 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200890 return OK;
891
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200892 case 'O': case O_grave: case O_acute: case O_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200893 case O_virguilla: case O_diaeresis: case O_slash:
894 case 0x14c: case 0x14e: case 0x150: case 0x19f:
895 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
896 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
897 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
898 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
899 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
900 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
901 case 0x1ede: case 0x1ee0: case 0x1ee2:
902 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
903 EMIT2(O_circumflex) EMIT2(O_virguilla)
904 EMIT2(O_diaeresis) EMIT2(O_slash)
905 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
906 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
907 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
908 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
909 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
910 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
911 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
912 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
913 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
914 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
915 EMIT2(0x1ee2)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200916 return OK;
917
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200918 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
919 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
920 EMIT2(0x2c63)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200921 return OK;
922
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200923 case 'Q': case 0x24a:
924 EMIT2('Q') EMIT2(0x24a)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200925 return OK;
926
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200927 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
928 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
929 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
930 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
931 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
932 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
933 EMIT2(0xa7a6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200934 return OK;
935
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200936 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
937 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
938 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
939 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
940 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
941 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
942 EMIT2(0xa7a8)
943 return OK;
944
945 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
946 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
947 case 0x1e6e: case 0x1e70:
948 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
949 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
950 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200951 return OK;
952
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200953 case 'U': case U_grave: case U_acute: case U_diaeresis:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200954 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
955 case 0x16e: case 0x170: case 0x172: case 0x1af:
956 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
957 case 0x1db: case 0x214: case 0x216: case 0x244:
958 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
959 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
960 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
961 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
962 EMIT2(U_diaeresis) EMIT2(U_circumflex)
963 EMIT2(0x168) EMIT2(0x16a)
964 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
965 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
966 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
967 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
968 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
969 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
970 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
971 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
972 EMIT2(0x1ef0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200973 return OK;
974
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200975 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
976 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200977 return OK;
978
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200979 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
980 case 0x1e86: case 0x1e88:
981 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
982 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200983 return OK;
984
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 case 'X': case 0x1e8a: case 0x1e8c:
986 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200987 return OK;
988
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200989 case 'Y': case Y_acute: case 0x176: case 0x178:
990 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
991 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
992 EMIT2('Y') EMIT2(Y_acute)
993 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
994 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
995 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
996 EMIT2(0x1ef8)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200997 return OK;
998
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200999 case 'Z': case 0x179: case 0x17b: case 0x17d:
1000 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1001 case 0x2c6b:
1002 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1003 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1004 EMIT2(0x1e94) EMIT2(0x2c6b)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001005 return OK;
1006
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001007 case 'a': case a_grave: case a_acute: case a_circumflex:
1008 case a_virguilla: case a_diaeresis: case a_ring:
1009 case 0x101: case 0x103: case 0x105: case 0x1ce:
1010 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1011 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1012 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1013 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1014 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1015 case 0x1eb7: case 0x2c65:
1016 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1017 EMIT2(a_circumflex) EMIT2(a_virguilla)
1018 EMIT2(a_diaeresis) EMIT2(a_ring)
1019 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1020 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1021 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1022 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1023 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1024 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1025 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1026 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1027 EMIT2(0x1eb7) EMIT2(0x2c65)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001028 return OK;
1029
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001030 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1031 case 0x1e03: case 0x1e05: case 0x1e07:
1032 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1033 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001034 return OK;
1035
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001036 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1037 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1038 case 0xa794:
1039 EMIT2('c') EMIT2(c_cedilla)
1040 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1041 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1042 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001043 return OK;
1044
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001045 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1046 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1047 case 0x1e11: case 0x1e13:
1048 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1049 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1050 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1051 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001052 return OK;
1053
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001054 case 'e': case e_grave: case e_acute: case e_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001055 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1056 case 0x119: case 0x11b: case 0x205: case 0x207:
1057 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1058 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1059 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1060 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1061 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1062 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1063 EMIT2(0x113) EMIT2(0x115)
1064 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1065 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1066 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1067 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1068 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1069 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1070 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001071 return OK;
1072
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001073 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1074 case 0x1e1f: case 0xa799:
1075 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1076 EMIT2(0x1e1f) EMIT2(0xa799)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001077 return OK;
1078
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001079 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1080 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1081 case 0x1e21: case 0xa7a1:
1082 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1083 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1084 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1085 EMIT2(0x1e21) EMIT2(0xa7a1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001086 return OK;
1087
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001088 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1089 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1090 case 0x1e96: case 0x2c68: case 0xa795:
1091 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1092 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1093 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1094 EMIT2(0x2c68) EMIT2(0xa795)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001095 return OK;
1096
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001097 case 'i': case i_grave: case i_acute: case i_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001098 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1099 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1100 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1101 case 0x1ec9: case 0x1ecb:
1102 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1103 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1104 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1105 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1106 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1107 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1108 EMIT2(0x1ecb) EMIT2(0x1ecb)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001109 return OK;
1110
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001111 case 'j': case 0x135: case 0x1f0: case 0x249:
1112 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001113 return OK;
1114
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001115 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1116 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1117 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1118 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1119 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001120 return OK;
1121
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001122 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1123 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1124 case 0x1e3d: case 0x2c61:
1125 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1126 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1127 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1128 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001129 return OK;
1130
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001131 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1132 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1133 EMIT2(0x1e41) EMIT2(0x1e43)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001134 return OK;
1135
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001136 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1137 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1138 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1139 EMIT2('n') EMIT2(n_virguilla)
1140 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1141 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1142 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1143 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001144 return OK;
1145
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001146 case 'o': case o_grave: case o_acute: case o_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001147 case o_virguilla: case o_diaeresis: case o_slash:
1148 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1149 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1150 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1151 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1152 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1153 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1154 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1155 case 0x1edf: case 0x1ee1: case 0x1ee3:
1156 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1157 EMIT2(o_circumflex) EMIT2(o_virguilla)
1158 EMIT2(o_diaeresis) EMIT2(o_slash)
1159 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1160 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1161 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1162 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1163 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1164 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1165 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1166 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1167 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1168 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1169 EMIT2(0x1ee3)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001170 return OK;
1171
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001172 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1173 case 0x1e55: case 0x1e57:
1174 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1175 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001176 return OK;
1177
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001178 case 'q': case 0x24b: case 0x2a0:
1179 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001180 return OK;
1181
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001182 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1183 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1184 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1185 case 0xa7a7:
1186 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1187 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1188 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1189 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001190 return OK;
1191
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001192 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1193 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1194 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1195 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1196 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1197 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1198 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1199 return OK;
1200
1201 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1202 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1203 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1204 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1205 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1206 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1207 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001208 return OK;
1209
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001210 case 'u': case u_grave: case u_acute: case u_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001211 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1212 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1213 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1214 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1215 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1216 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1217 case 0x1eed: case 0x1eef: case 0x1ef1:
1218 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1219 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1220 EMIT2(0x169) EMIT2(0x16b)
1221 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1222 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1223 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1224 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1225 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1226 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1227 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1228 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1229 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001230 return OK;
1231
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001232 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1233 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1234 EMIT2(0x1e7f)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001235 return OK;
1236
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001237 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1238 case 0x1e87: case 0x1e89: case 0x1e98:
1239 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1240 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001241 return OK;
1242
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001243 case 'x': case 0x1e8b: case 0x1e8d:
1244 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001245 return OK;
1246
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001247 case 'y': case y_acute: case y_diaeresis: case 0x177:
1248 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1249 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1250 case 0x1ef9:
1251 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1252 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1253 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1254 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001255 return OK;
1256
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001257 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1258 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1259 case 0x1e95: case 0x2c6c:
1260 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1261 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1262 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001263 return OK;
1264
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001265 // default: character itself
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001266 }
1267 }
1268
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001269 EMIT2(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001270 return OK;
1271#undef EMIT2
1272}
1273
/*
 * Code to parse a regular expression.
 *
 * We try to reuse the parsing functions in regexp.c to
 * minimize surprises and keep the syntax consistent.
 */
1280
/*
 * Parse the lowest level.
 *
 * An atom can be one of a long list of items.  Many atoms match one character
 * in the text.  It is often an ordinary character or a character class.
 * Parentheses can be used to make a pattern into an atom.  The "\z(\)"
 * construct is only for syntax highlighting.
 *
 * atom ::= ordinary-atom
 *   or  \( pattern \)
 *   or  \%( pattern \)
 *   or  \z( pattern \)
 */
1294 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001295nfa_regatom(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001296{
1297 int c;
1298 int charclass;
1299 int equiclass;
1300 int collclass;
1301 int got_coll_char;
1302 char_u *p;
1303 char_u *endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001304 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001305 int extra = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001306 int emit_range;
1307 int negated;
1308 int result;
1309 int startc = -1;
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001310 int save_prev_at_start = prev_at_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001311
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001312 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001313 switch (c)
1314 {
Bram Moolenaar47196582013-05-25 22:04:23 +02001315 case NUL:
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001316 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar47196582013-05-25 22:04:23 +02001317
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001318 case Magic('^'):
1319 EMIT(NFA_BOL);
1320 break;
1321
1322 case Magic('$'):
1323 EMIT(NFA_EOL);
1324#if defined(FEAT_SYN_HL) || defined(PROTO)
1325 had_eol = TRUE;
1326#endif
1327 break;
1328
1329 case Magic('<'):
1330 EMIT(NFA_BOW);
1331 break;
1332
1333 case Magic('>'):
1334 EMIT(NFA_EOW);
1335 break;
1336
1337 case Magic('_'):
1338 c = no_Magic(getchr());
Bram Moolenaar174a8482013-11-28 14:20:17 +01001339 if (c == NUL)
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001340 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar174a8482013-11-28 14:20:17 +01001341
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001342 if (c == '^') // "\_^" is start-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001343 {
1344 EMIT(NFA_BOL);
1345 break;
1346 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001347 if (c == '$') // "\_$" is end-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001348 {
1349 EMIT(NFA_EOL);
1350#if defined(FEAT_SYN_HL) || defined(PROTO)
1351 had_eol = TRUE;
1352#endif
1353 break;
1354 }
1355
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001356 extra = NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001357
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001358 // "\_[" is collection plus newline
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001359 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001360 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001361
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001362 // "\_x" is character class plus newline
1363 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001364
1365 /*
1366 * Character classes.
1367 */
1368 case Magic('.'):
1369 case Magic('i'):
1370 case Magic('I'):
1371 case Magic('k'):
1372 case Magic('K'):
1373 case Magic('f'):
1374 case Magic('F'):
1375 case Magic('p'):
1376 case Magic('P'):
1377 case Magic('s'):
1378 case Magic('S'):
1379 case Magic('d'):
1380 case Magic('D'):
1381 case Magic('x'):
1382 case Magic('X'):
1383 case Magic('o'):
1384 case Magic('O'):
1385 case Magic('w'):
1386 case Magic('W'):
1387 case Magic('h'):
1388 case Magic('H'):
1389 case Magic('a'):
1390 case Magic('A'):
1391 case Magic('l'):
1392 case Magic('L'):
1393 case Magic('u'):
1394 case Magic('U'):
1395 p = vim_strchr(classchars, no_Magic(c));
1396 if (p == NULL)
1397 {
Bram Moolenaar174a8482013-11-28 14:20:17 +01001398 if (extra == NFA_ADD_NL)
1399 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001400 semsg(_(e_nfa_regexp_invalid_character_class_nr), c);
Bram Moolenaar174a8482013-11-28 14:20:17 +01001401 rc_did_emsg = TRUE;
1402 return FAIL;
1403 }
Bram Moolenaar097c5372023-05-24 21:02:24 +01001404 siemsg("Unknown character class char: %d", c);
Bram Moolenaar5714b802013-05-28 22:03:20 +02001405 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001406 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001407
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001408 // When '.' is followed by a composing char ignore the dot, so that
1409 // the composing char is matched here.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001410 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1411 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001412 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001413 c = getchr();
1414 goto nfa_do_multibyte;
1415 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001416 EMIT(nfa_classcodes[p - classchars]);
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001417 if (extra == NFA_ADD_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001418 {
1419 EMIT(NFA_NEWL);
1420 EMIT(NFA_OR);
1421 regflags |= RF_HASNL;
1422 }
1423 break;
1424
1425 case Magic('n'):
1426 if (reg_string)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001427 // In a string "\n" matches a newline character.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001428 EMIT(NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001429 else
1430 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001431 // In buffer text "\n" matches the end of a line.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001432 EMIT(NFA_NEWL);
1433 regflags |= RF_HASNL;
1434 }
1435 break;
1436
1437 case Magic('('):
1438 if (nfa_reg(REG_PAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001439 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001440 break;
1441
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001442 case Magic('|'):
1443 case Magic('&'):
1444 case Magic(')'):
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001445 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001446 return FAIL;
1447
1448 case Magic('='):
1449 case Magic('?'):
1450 case Magic('+'):
1451 case Magic('@'):
1452 case Magic('*'):
1453 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001454 // these should follow an atom, not form an atom
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001455 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001456 return FAIL;
1457
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001458 case Magic('~'):
1459 {
1460 char_u *lp;
1461
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001462 // Previous substitute pattern.
1463 // Generated as "\%(pattern\)".
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001464 if (reg_prev_sub == NULL)
1465 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001466 emsg(_(e_no_previous_substitute_regular_expression));
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001467 return FAIL;
1468 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001469 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001470 {
1471 EMIT(PTR2CHAR(lp));
1472 if (lp != reg_prev_sub)
1473 EMIT(NFA_CONCAT);
1474 }
1475 EMIT(NFA_NOPEN);
1476 break;
1477 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001478
Bram Moolenaar428e9872013-05-30 17:05:39 +02001479 case Magic('1'):
1480 case Magic('2'):
1481 case Magic('3'):
1482 case Magic('4'):
1483 case Magic('5'):
1484 case Magic('6'):
1485 case Magic('7'):
1486 case Magic('8'):
1487 case Magic('9'):
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001488 {
1489 int refnum = no_Magic(c) - '1';
1490
1491 if (!seen_endbrace(refnum + 1))
1492 return FAIL;
1493 EMIT(NFA_BACKREF1 + refnum);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001494 rex.nfa_has_backref = TRUE;
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001495 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02001496 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001497
1498 case Magic('z'):
1499 c = no_Magic(getchr());
1500 switch (c)
1501 {
1502 case 's':
1503 EMIT(NFA_ZSTART);
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001504 if (re_mult_next("\\zs") == FAIL)
1505 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001506 break;
1507 case 'e':
1508 EMIT(NFA_ZEND);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001509 rex.nfa_has_zend = TRUE;
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001510 if (re_mult_next("\\ze") == FAIL)
1511 return FAIL;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02001512 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001513#ifdef FEAT_SYN_HL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001514 case '1':
1515 case '2':
1516 case '3':
1517 case '4':
1518 case '5':
1519 case '6':
1520 case '7':
1521 case '8':
1522 case '9':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001523 // \z1...\z9
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001524 if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001525 EMSG_RET_FAIL(_(e_z1_z9_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001526 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001527 // No need to set rex.nfa_has_backref, the sub-matches don't
1528 // change when \z1 .. \z9 matches or not.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001529 re_has_z = REX_USE;
1530 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001531 case '(':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001532 // \z(
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001533 if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001534 EMSG_RET_FAIL(_(e_z_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001535 if (nfa_reg(REG_ZPAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001536 return FAIL; // cascaded error
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001537 re_has_z = REX_SET;
1538 break;
1539#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001540 default:
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001541 semsg(_(e_nfa_regexp_unknown_operator_z_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001542 return FAIL;
1543 }
1544 break;
1545
1546 case Magic('%'):
1547 c = no_Magic(getchr());
1548 switch (c)
1549 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001550 // () without a back reference
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001551 case '(':
1552 if (nfa_reg(REG_NPAREN) == FAIL)
1553 return FAIL;
1554 EMIT(NFA_NOPEN);
1555 break;
1556
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001557 case 'd': // %d123 decimal
1558 case 'o': // %o123 octal
1559 case 'x': // %xab hex 2
1560 case 'u': // %uabcd hex 4
1561 case 'U': // %U1234abcd hex 8
Bram Moolenaar47196582013-05-25 22:04:23 +02001562 {
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001563 long nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001564
Bram Moolenaar47196582013-05-25 22:04:23 +02001565 switch (c)
1566 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001567 case 'd': nr = getdecchrs(); break;
1568 case 'o': nr = getoctchrs(); break;
1569 case 'x': nr = gethexchrs(2); break;
1570 case 'u': nr = gethexchrs(4); break;
1571 case 'U': nr = gethexchrs(8); break;
1572 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +02001573 }
1574
Bram Moolenaar527a2d82019-02-21 22:28:51 +01001575 if (nr < 0 || nr > INT_MAX)
Bram Moolenaara6f79292022-01-04 21:30:47 +00001576 EMSG2_RET_FAIL(_(e_invalid_character_after_str_2),
1577 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001578 // A NUL is stored in the text as NL
1579 // TODO: what if a composing character follows?
Bram Moolenaar595cad22013-09-22 13:57:24 +02001580 EMIT(nr == 0 ? 0x0a : nr);
Bram Moolenaar47196582013-05-25 22:04:23 +02001581 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001582 break;
1583
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001584 // Catch \%^ and \%$ regardless of where they appear in the
1585 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001586 case '^':
1587 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001588 break;
1589
1590 case '$':
1591 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001592 break;
1593
1594 case '#':
Christian Brabandt360da402022-05-18 15:04:02 +01001595 if (regparse[0] == '=' && regparse[1] >= 48
1596 && regparse[1] <= 50)
1597 {
1598 // misplaced \%#=1
1599 semsg(_(e_atom_engine_must_be_at_start_of_pattern),
1600 regparse[1]);
1601 return FAIL;
1602 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001603 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001604 break;
1605
1606 case 'V':
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001607 EMIT(NFA_VISUAL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001608 break;
1609
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02001610 case 'C':
1611 EMIT(NFA_ANY_COMPOSING);
1612 break;
1613
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001614 case '[':
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001615 {
1616 int n;
1617
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001618 // \%[abc]
Bram Moolenaard7986252013-06-17 21:33:41 +02001619 for (n = 0; (c = peekchr()) != ']'; ++n)
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001620 {
1621 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001622 EMSG2_RET_FAIL(_(e_missing_sb_after_str),
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001623 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001624 // recursive call!
Bram Moolenaard7986252013-06-17 21:33:41 +02001625 if (nfa_regatom() == FAIL)
1626 return FAIL;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001627 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001628 getchr(); // get the ]
Bram Moolenaar2976c022013-06-05 21:30:37 +02001629 if (n == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001630 EMSG2_RET_FAIL(_(e_empty_str_brackets),
Bram Moolenaar2976c022013-06-05 21:30:37 +02001631 reg_magic == MAGIC_ALL);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001632 EMIT(NFA_OPT_CHARS);
1633 EMIT(n);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001634
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001635 // Emit as "\%(\%[abc]\)" to be able to handle
1636 // "\%[abc]*" which would cause the empty string to be
1637 // matched an unlimited number of times. NFA_NOPEN is
1638 // added only once at a position, while NFA_SPLIT is
1639 // added multiple times. This is more efficient than
1640 // not allowing NFA_SPLIT multiple times, it is used
1641 // a lot.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001642 EMIT(NFA_NOPEN);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001643 break;
1644 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02001645
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001646 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +02001647 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001648 long_u n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001649 int cmp = c;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001650 int cur = FALSE;
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001651 int got_digit = FALSE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001652
1653 if (c == '<' || c == '>')
1654 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001655 if (no_Magic(c) == '.')
1656 {
1657 cur = TRUE;
1658 c = getchr();
1659 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001660 while (VIM_ISDIGIT(c))
1661 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001662 long_u tmp;
1663
1664 if (cur)
Bram Moolenaarb10ff5c2022-03-19 11:31:38 +00001665 {
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001666 semsg(_(e_regexp_number_after_dot_pos_search_chr),
Bram Moolenaarb10ff5c2022-03-19 11:31:38 +00001667 no_Magic(c));
1668 return FAIL;
1669 }
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001670 tmp = n * 10 + (c - '0');
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001671
1672 if (tmp < n)
1673 {
1674 // overflow.
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001675 emsg(_(e_percent_value_too_large));
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001676 return FAIL;
1677 }
1678 n = tmp;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001679 c = getchr();
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001680 got_digit = TRUE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001681 }
1682 if (c == 'l' || c == 'c' || c == 'v')
1683 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001684 long_u limit = INT_MAX;
Bram Moolenaar9403a212019-02-13 18:35:06 +01001685
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001686 if (!cur && !got_digit)
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001687 {
1688 semsg(_(e_nfa_regexp_missing_value_in_chr),
1689 no_Magic(c));
1690 return FAIL;
1691 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001692 if (c == 'l')
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001693 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001694 if (cur)
1695 n = curwin->w_cursor.lnum;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001696 // \%{n}l \%{n}<l \%{n}>l
Bram Moolenaar423532e2013-05-29 21:14:42 +02001697 EMIT(cmp == '<' ? NFA_LNUM_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001698 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001699 if (save_prev_at_start)
1700 at_start = TRUE;
1701 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001702 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001703 {
1704 if (cur)
1705 {
1706 n = curwin->w_cursor.col;
1707 n++;
1708 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001709 // \%{n}c \%{n}<c \%{n}>c
Bram Moolenaar423532e2013-05-29 21:14:42 +02001710 EMIT(cmp == '<' ? NFA_COL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001711 cmp == '>' ? NFA_COL_GT : NFA_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001712 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001713 else
Bram Moolenaar9403a212019-02-13 18:35:06 +01001714 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001715 if (cur)
1716 {
1717 colnr_T vcol = 0;
1718
1719 getvvcol(curwin, &curwin->w_cursor,
1720 NULL, NULL, &vcol);
1721 n = ++vcol;
1722 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001723 // \%{n}v \%{n}<v \%{n}>v
Bram Moolenaar423532e2013-05-29 21:14:42 +02001724 EMIT(cmp == '<' ? NFA_VCOL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001725 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
Bram Moolenaar9403a212019-02-13 18:35:06 +01001726 limit = INT_MAX / MB_MAXBYTES;
1727 }
1728 if (n >= limit)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001729 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001730 emsg(_(e_percent_value_too_large));
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001731 return FAIL;
1732 }
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001733 EMIT((int)n);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001734 break;
1735 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02001736 else if (c == '\'' && n == 0)
1737 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001738 // \%'m \%<'m \%>'m
Bram Moolenaar044aa292013-06-04 21:27:38 +02001739 EMIT(cmp == '<' ? NFA_MARK_LT :
1740 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001741 EMIT(getchr());
Bram Moolenaar044aa292013-06-04 21:27:38 +02001742 break;
1743 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001744 }
Bram Moolenaarc96311b2022-11-25 21:13:47 +00001745 semsg(_(e_nfa_regexp_unknown_operator_percent_chr),
1746 no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001747 return FAIL;
1748 }
1749 break;
1750
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001751 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001752collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001753 /*
Bram Moolenaar417bad22013-06-07 14:08:30 +02001754 * [abc] uses NFA_START_COLL - NFA_END_COLL
1755 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1756 * Each character is produced as a regular state, using
1757 * NFA_CONCAT to bind them together.
1758 * Besides normal characters there can be:
1759 * - character classes NFA_CLASS_*
1760 * - ranges, two characters followed by NFA_RANGE.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001761 */
1762
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001763 p = regparse;
1764 endp = skip_anyof(p);
1765 if (*endp == ']')
1766 {
1767 /*
1768 * Try to reverse engineer character classes. For example,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001769 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001770 * and perform the necessary substitutions in the NFA.
1771 */
1772 result = nfa_recognize_char_class(regparse, endp,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001773 extra == NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001774 if (result != FAIL)
1775 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001776 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001777 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001778 EMIT(result - NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001779 EMIT(NFA_NEWL);
1780 EMIT(NFA_OR);
1781 }
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001782 else
1783 EMIT(result);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001784 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001785 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001786 return OK;
1787 }
1788 /*
1789 * Failed to recognize a character class. Use the simple
1790 * version that turns [abc] into 'a' OR 'b' OR 'c'
1791 */
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001792 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001793 negated = FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001794 if (*regparse == '^') // negated range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001795 {
1796 negated = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001797 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001798 EMIT(NFA_START_NEG_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001799 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001800 else
1801 EMIT(NFA_START_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001802 if (*regparse == '-')
1803 {
1804 startc = '-';
1805 EMIT(startc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001806 EMIT(NFA_CONCAT);
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001807 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001808 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001809 // Emit the OR branches for each character in the []
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001810 emit_range = FALSE;
1811 while (regparse < endp)
1812 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001813 int oldstartc = startc;
1814
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001815 startc = -1;
1816 got_coll_char = FALSE;
1817 if (*regparse == '[')
1818 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001819 // Check for [: :], [= =], [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001820 equiclass = collclass = 0;
1821 charclass = get_char_class(&regparse);
1822 if (charclass == CLASS_NONE)
1823 {
1824 equiclass = get_equi_class(&regparse);
1825 if (equiclass == 0)
1826 collclass = get_coll_element(&regparse);
1827 }
1828
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001829 // Character class like [:alpha:]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001830 if (charclass != CLASS_NONE)
1831 {
1832 switch (charclass)
1833 {
1834 case CLASS_ALNUM:
1835 EMIT(NFA_CLASS_ALNUM);
1836 break;
1837 case CLASS_ALPHA:
1838 EMIT(NFA_CLASS_ALPHA);
1839 break;
1840 case CLASS_BLANK:
1841 EMIT(NFA_CLASS_BLANK);
1842 break;
1843 case CLASS_CNTRL:
1844 EMIT(NFA_CLASS_CNTRL);
1845 break;
1846 case CLASS_DIGIT:
1847 EMIT(NFA_CLASS_DIGIT);
1848 break;
1849 case CLASS_GRAPH:
1850 EMIT(NFA_CLASS_GRAPH);
1851 break;
1852 case CLASS_LOWER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001853 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001854 EMIT(NFA_CLASS_LOWER);
1855 break;
1856 case CLASS_PRINT:
1857 EMIT(NFA_CLASS_PRINT);
1858 break;
1859 case CLASS_PUNCT:
1860 EMIT(NFA_CLASS_PUNCT);
1861 break;
1862 case CLASS_SPACE:
1863 EMIT(NFA_CLASS_SPACE);
1864 break;
1865 case CLASS_UPPER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001866 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001867 EMIT(NFA_CLASS_UPPER);
1868 break;
1869 case CLASS_XDIGIT:
1870 EMIT(NFA_CLASS_XDIGIT);
1871 break;
1872 case CLASS_TAB:
1873 EMIT(NFA_CLASS_TAB);
1874 break;
1875 case CLASS_RETURN:
1876 EMIT(NFA_CLASS_RETURN);
1877 break;
1878 case CLASS_BACKSPACE:
1879 EMIT(NFA_CLASS_BACKSPACE);
1880 break;
1881 case CLASS_ESCAPE:
1882 EMIT(NFA_CLASS_ESCAPE);
1883 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001884 case CLASS_IDENT:
1885 EMIT(NFA_CLASS_IDENT);
1886 break;
1887 case CLASS_KEYWORD:
1888 EMIT(NFA_CLASS_KEYWORD);
1889 break;
1890 case CLASS_FNAME:
1891 EMIT(NFA_CLASS_FNAME);
1892 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001893 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001894 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001895 continue;
1896 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001897 // Try equivalence class [=a=] and the like
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001898 if (equiclass != 0)
1899 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02001900 result = nfa_emit_equi_class(equiclass);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001901 if (result == FAIL)
1902 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001903 // should never happen
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001904 EMSG_RET_FAIL(_(e_error_building_nfa_with_equivalence_class));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001905 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001906 continue;
1907 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001908 // Try collating class like [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001909 if (collclass != 0)
1910 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001911 startc = collclass; // allow [.a.]-x as a range
1912 // Will emit the proper atom at the end of the
1913 // while loop.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001914 }
1915 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001916 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1917 // start character.
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001918 if (*regparse == '-' && oldstartc != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001919 {
1920 emit_range = TRUE;
1921 startc = oldstartc;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001922 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001923 continue; // reading the end of the range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001924 }
1925
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001926 // Now handle simple and escaped characters.
1927 // Only "\]", "\^", "\-" and "\\" are special in Vi. Vim
1928 // accepts "\t", "\e", etc., but only when the 'l' flag in
1929 // 'cpoptions' is not included.
1930 // Posix doesn't recognize backslash at all.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001931 if (*regparse == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001932 && !reg_cpo_bsl
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001933 && regparse + 1 <= endp
1934 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001935 || (!reg_cpo_lit
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001936 && vim_strchr(REGEXP_ABBR, regparse[1])
1937 != NULL)
1938 )
1939 )
1940 {
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001941 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001942
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001943 if (*regparse == 'n')
Bram Moolenaara5483442019-02-17 20:17:02 +01001944 startc = (reg_string || emit_range
1945 || regparse[1] == '-') ? NL : NFA_NEWL;
Bram Moolenaarabab0b02019-03-30 18:47:01 +01001946 else if (*regparse == 'd'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001947 || *regparse == 'o'
1948 || *regparse == 'x'
1949 || *regparse == 'u'
1950 || *regparse == 'U'
1951 )
1952 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001953 // TODO(RE) This needs more testing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001954 startc = coll_get_char();
1955 got_coll_char = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001956 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001957 }
1958 else
1959 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001960 // \r,\t,\e,\b
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001961 startc = backslash_trans(*regparse);
1962 }
1963 }
1964
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001965 // Normal printable char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001966 if (startc == -1)
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001967 startc = PTR2CHAR(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001968
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001969 // Previous char was '-', so this char is end of range.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001970 if (emit_range)
1971 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001972 int endc = startc;
1973
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001974 startc = oldstartc;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001975 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001976 EMSG_RET_FAIL(_(e_reverse_range_in_character_class));
Bram Moolenaar417bad22013-06-07 14:08:30 +02001977
1978 if (endc > startc + 2)
1979 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001980 // Emit a range instead of the sequence of
1981 // individual characters.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001982 if (startc == 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001983 // \x00 is translated to \x0a, start at \x01.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001984 EMIT(1);
1985 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001986 --post_ptr; // remove NFA_CONCAT
Bram Moolenaar417bad22013-06-07 14:08:30 +02001987 EMIT(endc);
1988 EMIT(NFA_RANGE);
1989 EMIT(NFA_CONCAT);
1990 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001991 else if (has_mbyte && ((*mb_char2len)(startc) > 1
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001992 || (*mb_char2len)(endc) > 1))
1993 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001994 // Emit the characters in the range.
1995 // "startc" was already emitted, so skip it.
1996 //
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001997 for (c = startc + 1; c <= endc; c++)
1998 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001999 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002000 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002001 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002002 }
2003 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002004 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002005 // Emit the range. "startc" was already emitted, so
2006 // skip it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002007 for (c = startc + 1; c <= endc; c++)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00002008 {
2009 EMIT(c);
2010 EMIT(NFA_CONCAT);
2011 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002012 }
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002013 emit_range = FALSE;
2014 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002015 }
2016 else
2017 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002018 // This char (startc) is not part of a range. Just
2019 // emit it.
2020 // Normally, simply emit startc. But if we get char
2021 // code=0 from a collating char, then replace it with
2022 // 0x0a.
2023 // This is needed to completely mimic the behaviour of
2024 // the backtracking engine.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002025 if (startc == NFA_NEWL)
2026 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002027 // Line break can't be matched as part of the
2028 // collection, add an OR below. But not for negated
2029 // range.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002030 if (!negated)
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002031 extra = NFA_ADD_NL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002032 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002033 else
Bram Moolenaar417bad22013-06-07 14:08:30 +02002034 {
2035 if (got_coll_char == TRUE && startc == 0)
2036 EMIT(0x0a);
2037 else
2038 EMIT(startc);
Christian Brabandtca22fc32023-08-20 20:34:22 +02002039 EMIT(NFA_CONCAT);
Christian Brabandtca22fc32023-08-20 20:34:22 +02002040 }
Christian Brabandtca22fc32023-08-20 20:34:22 +02002041 }
Christian Brabandtbe07caa2023-08-20 22:26:15 +02002042
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002043 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002044 } // while (p < endp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002045
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002046 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002047 if (*regparse == '-') // if last, '-' is just a char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002048 {
2049 EMIT('-');
Bram Moolenaar417bad22013-06-07 14:08:30 +02002050 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002051 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002052
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002053 // skip the trailing ]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002054 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002055 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002056
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002057 // Mark end of the collection.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002058 if (negated == TRUE)
Bram Moolenaar417bad22013-06-07 14:08:30 +02002059 EMIT(NFA_END_NEG_COLL);
2060 else
2061 EMIT(NFA_END_COLL);
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002062
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002063 // \_[] also matches \n but it's not negated
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002064 if (extra == NFA_ADD_NL)
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002065 {
2066 EMIT(reg_string ? NL : NFA_NEWL);
2067 EMIT(NFA_OR);
2068 }
2069
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002070 return OK;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002071 } // if exists closing ]
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002072
2073 if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002074 EMSG_RET_FAIL(_(e_missing_rsb_after_str_lsb));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002075 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002076
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002077 default:
2078 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002079 int plen;
2080
2081nfa_do_multibyte:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002082 // plen is length of current char with composing chars
Bram Moolenaar47196582013-05-25 22:04:23 +02002083 if (enc_utf8 && ((*mb_char2len)(c)
Bram Moolenaarace95982017-03-29 17:30:27 +02002084 != (plen = utfc_ptr2len(old_regparse))
Bram Moolenaar47196582013-05-25 22:04:23 +02002085 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002086 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02002087 int i = 0;
2088
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002089 // A base character plus composing characters, or just one
2090 // or more composing characters.
2091 // This requires creating a separate atom as if enclosing
2092 // the characters in (), where NFA_COMPOSING is the ( and
2093 // NFA_END_COMPOSING is the ). Note that right now we are
2094 // building the postfix form, not the NFA itself;
2095 // a composing char could be: a, b, c, NFA_COMPOSING
2096 // where 'b' and 'c' are chars with codes > 256.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002097 for (;;)
2098 {
2099 EMIT(c);
2100 if (i > 0)
2101 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002102 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002103 break;
2104 c = utf_ptr2char(old_regparse + i);
2105 }
2106 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002107 regparse = old_regparse + plen;
2108 }
2109 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002110 {
2111 c = no_Magic(c);
2112 EMIT(c);
2113 }
2114 return OK;
2115 }
2116 }
2117
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002118 return OK;
2119}
2120
2121/*
2122 * Parse something followed by possible [*+=].
2123 *
2124 * A piece is an atom, possibly followed by a multi, an indication of how many
2125 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2126 * characters: "", "a", "aa", etc.
2127 *
2128 * piece ::= atom
2129 * or atom multi
2130 */
2131 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002132nfa_regpiece(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002133{
2134 int i;
2135 int op;
2136 int ret;
2137 long minval, maxval;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002138 int greedy = TRUE; // Braces are prefixed with '-' ?
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002139 parse_state_T old_state;
2140 parse_state_T new_state;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01002141 long c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02002142 int old_post_pos;
2143 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002144 int quest;
2145
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002146 // Save the current parse state, so that we can use it if <atom>{m,n} is
2147 // next.
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002148 save_parse_state(&old_state);
2149
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002150 // store current pos in the postfix form, for \{m,n} involving 0s
Bram Moolenaar16299b52013-05-30 18:45:23 +02002151 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002152
2153 ret = nfa_regatom();
2154 if (ret == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002155 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002156
2157 op = peekchr();
2158 if (re_multi_type(op) == NOT_MULTI)
2159 return OK;
2160
2161 skipchr();
2162 switch (op)
2163 {
2164 case Magic('*'):
2165 EMIT(NFA_STAR);
2166 break;
2167
2168 case Magic('+'):
2169 /*
2170 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2171 * first and only submatch would be "aaa". But the backtracking
2172 * engine interprets the plus as "try matching one more time", and
2173 * a* matches a second time at the end of the input, the empty
2174 * string.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002175 * The submatch will be the empty string.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002176 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002177 * In order to be consistent with the old engine, we replace
2178 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002179 */
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002180 restore_parse_state(&old_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002181 curchr = -1;
2182 if (nfa_regatom() == FAIL)
2183 return FAIL;
2184 EMIT(NFA_STAR);
2185 EMIT(NFA_CONCAT);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002186 skipchr(); // skip the \+
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002187 break;
2188
2189 case Magic('@'):
Bram Moolenaar61602c52013-06-01 19:54:43 +02002190 c2 = getdecchrs();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002191 op = no_Magic(getchr());
Bram Moolenaar61602c52013-06-01 19:54:43 +02002192 i = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002193 switch(op)
2194 {
2195 case '=':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002196 // \@=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002197 i = NFA_PREV_ATOM_NO_WIDTH;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002198 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002199 case '!':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002200 // \@!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002201 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002202 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002203 case '<':
Bram Moolenaar61602c52013-06-01 19:54:43 +02002204 op = no_Magic(getchr());
2205 if (op == '=')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002206 // \@<=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002207 i = NFA_PREV_ATOM_JUST_BEFORE;
2208 else if (op == '!')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002209 // \@<!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002210 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2211 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002212 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002213 // \@>
Bram Moolenaar87953742013-06-05 18:52:40 +02002214 i = NFA_PREV_ATOM_LIKE_PATTERN;
2215 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002216 }
Bram Moolenaar61602c52013-06-01 19:54:43 +02002217 if (i == 0)
2218 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002219 semsg(_(e_nfa_regexp_unknown_operator_at_chr), op);
Bram Moolenaar61602c52013-06-01 19:54:43 +02002220 return FAIL;
2221 }
2222 EMIT(i);
2223 if (i == NFA_PREV_ATOM_JUST_BEFORE
2224 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2225 EMIT(c2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002226 break;
2227
2228 case Magic('?'):
2229 case Magic('='):
2230 EMIT(NFA_QUEST);
2231 break;
2232
2233 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002234 // a{2,5} will expand to 'aaa?a?a?'
2235 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2236 // version of '?'
2237 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2238 // parenthesis have the same id
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002239
2240 greedy = TRUE;
2241 c2 = peekchr();
2242 if (c2 == '-' || c2 == Magic('-'))
2243 {
2244 skipchr();
2245 greedy = FALSE;
2246 }
2247 if (!read_limits(&minval, &maxval))
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002248 EMSG_RET_FAIL(_(e_nfa_regexp_error_reading_repetition_limits));
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002249
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002250 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2251 // <atom>*
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002252 if (minval == 0 && maxval == MAX_LIMIT)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002253 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002254 if (greedy) // { { (match the braces)
2255 // \{}, \{0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002256 EMIT(NFA_STAR);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002257 else // { { (match the braces)
2258 // \{-}, \{-0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002259 EMIT(NFA_STAR_NONGREEDY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002260 break;
2261 }
2262
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002263 // Special case: x{0} or x{-0}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002264 if (maxval == 0)
2265 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002266 // Ignore result of previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002267 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002268 // NFA_EMPTY is 0-length and works everywhere
Bram Moolenaar699c1202013-09-25 16:41:54 +02002269 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002270 return OK;
2271 }
2272
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002273 // The engine is very inefficient (uses too many states) when the
2274 // maximum is much larger than the minimum and when the maximum is
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002275 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2276 // will emit NFA_STAR.
2277 // Bail out if we can use the other engine, but only, when the
2278 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00002279 // does not work with characters > 8 bit with the BT engine)
Bram Moolenaara1d2c582015-02-10 18:18:17 +01002280 if ((nfa_re_flags & RE_AUTO)
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002281 && (maxval > 500 || maxval > minval + 200)
2282 && (maxval != MAX_LIMIT && minval < 200)
2283 && !wants_nfa)
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002284 return FAIL;
2285
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002286 // Ignore previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002287 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002288 // Save parse state after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002289 save_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002290
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002291 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2292 for (i = 0; i < maxval; i++)
2293 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002294 // Goto beginning of the repeated atom
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002295 restore_parse_state(&old_state);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002296 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002297 if (nfa_regatom() == FAIL)
2298 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002299 // after "minval" times, atoms are optional
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002300 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002301 {
2302 if (maxval == MAX_LIMIT)
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002303 {
2304 if (greedy)
2305 EMIT(NFA_STAR);
2306 else
2307 EMIT(NFA_STAR_NONGREEDY);
2308 }
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002309 else
2310 EMIT(quest);
2311 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02002312 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002313 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002314 if (i + 1 > minval && maxval == MAX_LIMIT)
2315 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002316 }
2317
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002318 // Go to just after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002319 restore_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002320 curchr = -1;
2321
2322 break;
2323
2324
2325 default:
2326 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002327 } // end switch
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002328
2329 if (re_multi_type(peekchr()) != NOT_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002330 // Can't have a multi follow a multi.
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002331 EMSG_RET_FAIL(_(e_nfa_regexp_cant_have_multi_follow_multi));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002332
2333 return OK;
2334}
2335
2336/*
2337 * Parse one or more pieces, concatenated. It matches a match for the
2338 * first piece, followed by a match for the second piece, etc. Example:
2339 * "f[0-9]b", first matches "f", then a digit and then "b".
2340 *
2341 * concat ::= piece
2342 * or piece piece
2343 * or piece piece piece
2344 * etc.
2345 */
2346 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002347nfa_regconcat(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002348{
2349 int cont = TRUE;
2350 int first = TRUE;
2351
2352 while (cont)
2353 {
2354 switch (peekchr())
2355 {
2356 case NUL:
2357 case Magic('|'):
2358 case Magic('&'):
2359 case Magic(')'):
2360 cont = FALSE;
2361 break;
2362
2363 case Magic('Z'):
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002364 regflags |= RF_ICOMBINE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002365 skipchr_keepstart();
2366 break;
2367 case Magic('c'):
2368 regflags |= RF_ICASE;
2369 skipchr_keepstart();
2370 break;
2371 case Magic('C'):
2372 regflags |= RF_NOICASE;
2373 skipchr_keepstart();
2374 break;
2375 case Magic('v'):
2376 reg_magic = MAGIC_ALL;
2377 skipchr_keepstart();
2378 curchr = -1;
2379 break;
2380 case Magic('m'):
2381 reg_magic = MAGIC_ON;
2382 skipchr_keepstart();
2383 curchr = -1;
2384 break;
2385 case Magic('M'):
2386 reg_magic = MAGIC_OFF;
2387 skipchr_keepstart();
2388 curchr = -1;
2389 break;
2390 case Magic('V'):
2391 reg_magic = MAGIC_NONE;
2392 skipchr_keepstart();
2393 curchr = -1;
2394 break;
2395
2396 default:
2397 if (nfa_regpiece() == FAIL)
2398 return FAIL;
2399 if (first == FALSE)
2400 EMIT(NFA_CONCAT);
2401 else
2402 first = FALSE;
2403 break;
2404 }
2405 }
2406
2407 return OK;
2408}
2409
2410/*
2411 * Parse a branch, one or more concats, separated by "\&". It matches the
2412 * last concat, but only if all the preceding concats also match at the same
2413 * position. Examples:
2414 * "foobeep\&..." matches "foo" in "foobeep".
2415 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2416 *
2417 * branch ::= concat
2418 * or concat \& concat
2419 * or concat \& concat \& concat
2420 * etc.
2421 */
2422 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002423nfa_regbranch(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002424{
Bram Moolenaar16299b52013-05-30 18:45:23 +02002425 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002426
Bram Moolenaar16299b52013-05-30 18:45:23 +02002427 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002428
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002429 // First branch, possibly the only one
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002430 if (nfa_regconcat() == FAIL)
2431 return FAIL;
2432
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002433 // Try next concats
Bram Moolenaar890dd052017-12-16 19:59:37 +01002434 while (peekchr() == Magic('&'))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002435 {
2436 skipchr();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002437 // if concat is empty do emit a node
Bram Moolenaar890dd052017-12-16 19:59:37 +01002438 if (old_post_pos == (int)(post_ptr - post_start))
2439 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002440 EMIT(NFA_NOPEN);
2441 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002442 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002443 if (nfa_regconcat() == FAIL)
2444 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002445 // if concat is empty do emit a node
Bram Moolenaar16299b52013-05-30 18:45:23 +02002446 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002447 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002448 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002449 }
2450
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002451 // if a branch is empty, emit one node for it
Bram Moolenaar16299b52013-05-30 18:45:23 +02002452 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002453 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002454
2455 return OK;
2456}
2457
2458/*
2459 * Parse a pattern, one or more branches, separated by "\|". It matches
2460 * anything that matches one of the branches. Example: "foo\|beep" matches
2461 * "foo" and matches "beep". If more than one branch matches, the first one
2462 * is used.
2463 *
2464 * pattern ::= branch
2465 * or branch \| branch
2466 * or branch \| branch \| branch
2467 * etc.
2468 */
2469 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002470nfa_reg(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002471 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002472{
2473 int parno = 0;
2474
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002475 if (paren == REG_PAREN)
2476 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002477 if (regnpar >= NSUBEXP) // Too many `('
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002478 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_parens));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002479 parno = regnpar++;
2480 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002481#ifdef FEAT_SYN_HL
2482 else if (paren == REG_ZPAREN)
2483 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002484 // Make a ZOPEN node.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002485 if (regnzpar >= NSUBEXP)
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002486 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_z));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002487 parno = regnzpar++;
2488 }
2489#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002490
2491 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002492 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002493
2494 while (peekchr() == Magic('|'))
2495 {
2496 skipchr();
2497 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002498 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002499 EMIT(NFA_OR);
2500 }
2501
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002502 // Check for proper termination.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002503 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2504 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002505 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002506 EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2507 reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002508 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002509 EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002510 }
2511 else if (paren == REG_NOPAREN && peekchr() != NUL)
2512 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002513 if (peekchr() == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002514 EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002515 else
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002516 EMSG_RET_FAIL(_(e_nfa_regexp_proper_termination_error));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002517 }
2518 /*
2519 * Here we set the flag allowing back references to this set of
2520 * parentheses.
2521 */
2522 if (paren == REG_PAREN)
2523 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002524 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002525 EMIT(NFA_MOPEN + parno);
2526 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002527#ifdef FEAT_SYN_HL
2528 else if (paren == REG_ZPAREN)
2529 EMIT(NFA_ZOPEN + parno);
2530#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002531
2532 return OK;
2533}
2534
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002535#ifdef DEBUG
2536static char_u code[50];
2537
2538 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002539nfa_set_code(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002540{
2541 int addnl = FALSE;
2542
2543 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2544 {
2545 addnl = TRUE;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002546 c -= NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002547 }
2548
2549 STRCPY(code, "");
2550 switch (c)
2551 {
2552 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2553 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2554 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2555 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2556 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2557 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2558
Bram Moolenaar5714b802013-05-28 22:03:20 +02002559 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2560 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2561 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2562 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2563 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2564 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2565 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2566 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2567 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002568#ifdef FEAT_SYN_HL
2569 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2570 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2571 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2572 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2573 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2574 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2575 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2576 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2577 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2578#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02002579 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2580
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002581 case NFA_PREV_ATOM_NO_WIDTH:
2582 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02002583 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2584 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002585 case NFA_PREV_ATOM_JUST_BEFORE:
2586 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2587 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2588 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002589 case NFA_PREV_ATOM_LIKE_PATTERN:
2590 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2591
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002592 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2593 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002594 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002595 case NFA_START_INVISIBLE_FIRST:
2596 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002597 case NFA_START_INVISIBLE_NEG:
2598 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002599 case NFA_START_INVISIBLE_NEG_FIRST:
2600 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002601 case NFA_START_INVISIBLE_BEFORE:
2602 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002603 case NFA_START_INVISIBLE_BEFORE_FIRST:
2604 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002605 case NFA_START_INVISIBLE_BEFORE_NEG:
2606 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002607 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2608 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002609 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002610 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002611 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002612 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002613
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002614 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2615 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02002616 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002617
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002618 case NFA_MOPEN:
2619 case NFA_MOPEN1:
2620 case NFA_MOPEN2:
2621 case NFA_MOPEN3:
2622 case NFA_MOPEN4:
2623 case NFA_MOPEN5:
2624 case NFA_MOPEN6:
2625 case NFA_MOPEN7:
2626 case NFA_MOPEN8:
2627 case NFA_MOPEN9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002628 STRCPY(code, "NFA_MOPEN(x)");
2629 code[10] = c - NFA_MOPEN + '0';
2630 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002631 case NFA_MCLOSE:
2632 case NFA_MCLOSE1:
2633 case NFA_MCLOSE2:
2634 case NFA_MCLOSE3:
2635 case NFA_MCLOSE4:
2636 case NFA_MCLOSE5:
2637 case NFA_MCLOSE6:
2638 case NFA_MCLOSE7:
2639 case NFA_MCLOSE8:
2640 case NFA_MCLOSE9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002641 STRCPY(code, "NFA_MCLOSE(x)");
2642 code[11] = c - NFA_MCLOSE + '0';
2643 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002644#ifdef FEAT_SYN_HL
2645 case NFA_ZOPEN:
2646 case NFA_ZOPEN1:
2647 case NFA_ZOPEN2:
2648 case NFA_ZOPEN3:
2649 case NFA_ZOPEN4:
2650 case NFA_ZOPEN5:
2651 case NFA_ZOPEN6:
2652 case NFA_ZOPEN7:
2653 case NFA_ZOPEN8:
2654 case NFA_ZOPEN9:
2655 STRCPY(code, "NFA_ZOPEN(x)");
2656 code[10] = c - NFA_ZOPEN + '0';
2657 break;
2658 case NFA_ZCLOSE:
2659 case NFA_ZCLOSE1:
2660 case NFA_ZCLOSE2:
2661 case NFA_ZCLOSE3:
2662 case NFA_ZCLOSE4:
2663 case NFA_ZCLOSE5:
2664 case NFA_ZCLOSE6:
2665 case NFA_ZCLOSE7:
2666 case NFA_ZCLOSE8:
2667 case NFA_ZCLOSE9:
2668 STRCPY(code, "NFA_ZCLOSE(x)");
2669 code[11] = c - NFA_ZCLOSE + '0';
2670 break;
2671#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002672 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2673 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2674 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2675 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02002676 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2677 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002678 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2679 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2680 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2681 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2682 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2683 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2684 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2685 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2686 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2687 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2688 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2689 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2690 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2691 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02002692 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002693
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002694 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002695 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2696 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2697 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
Bram Moolenaar699c1202013-09-25 16:41:54 +02002698 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002699 case NFA_OR: STRCPY(code, "NFA_OR"); break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002700
2701 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2702 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2703 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2704 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2705 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2706 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2707 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2708
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002709 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2710 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2711 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2712 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2713 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2714 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2715 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2716 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2717 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2718 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2719 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2720 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2721 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2722 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2723 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2724 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01002725 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2726 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2727 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002728
2729 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2730 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2731 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2732 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2733 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2734 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2735 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2736 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2737 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2738 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2739 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2740 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2741 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2742 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2743 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2744 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2745 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2746 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2747 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2748 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2749 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2750 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2751 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2752 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2753 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2754 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2755 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002756 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2757 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2758 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2759 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002760
2761 default:
2762 STRCPY(code, "CHAR(x)");
2763 code[5] = c;
2764 }
2765
2766 if (addnl == TRUE)
2767 STRCAT(code, " + NEWLINE ");
2768
2769}
2770
2771#ifdef ENABLE_LOG
2772static FILE *log_fd;
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002773static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002774
2775/*
2776 * Print the postfix notation of the current regexp.
2777 */
2778 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002779nfa_postfix_dump(char_u *expr, int retval)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002780{
2781 int *p;
2782 FILE *f;
2783
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002784 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002785 if (f == NULL)
2786 return;
2787
2788 fprintf(f, "\n-------------------------\n");
2789 if (retval == FAIL)
2790 fprintf(f, ">>> NFA engine failed... \n");
2791 else if (retval == OK)
2792 fprintf(f, ">>> NFA engine succeeded !\n");
2793 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
2794 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002795 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002796 nfa_set_code(*p);
2797 fprintf(f, "%s, ", code);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002798 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002799 fprintf(f, "\"\nPostfix notation (int): ");
2800 for (p = post_start; *p && p < post_ptr; p++)
2801 fprintf(f, "%d ", *p);
2802 fprintf(f, "\n\n");
2803 fclose(f);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002804}
2805
2806/*
2807 * Print the NFA starting with a root node "state".
2808 */
2809 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002810nfa_print_state(FILE *debugf, nfa_state_T *state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002811{
Bram Moolenaar152e7892013-05-25 12:28:11 +02002812 garray_T indent;
2813
2814 ga_init2(&indent, 1, 64);
2815 ga_append(&indent, '\0');
2816 nfa_print_state2(debugf, state, &indent);
2817 ga_clear(&indent);
2818}
2819
2820 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002821nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
Bram Moolenaar152e7892013-05-25 12:28:11 +02002822{
2823 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002824
2825 if (state == NULL)
2826 return;
2827
2828 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02002829
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002830 // Output indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002831 p = (char_u *)indent->ga_data;
2832 if (indent->ga_len >= 3)
2833 {
2834 int last = indent->ga_len - 3;
2835 char_u save[2];
2836
2837 STRNCPY(save, &p[last], 2);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00002838 memcpy(&p[last], "+-", 2);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002839 fprintf(debugf, " %s", p);
2840 STRNCPY(&p[last], save, 2);
2841 }
2842 else
2843 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002844
2845 nfa_set_code(state->c);
Bram Moolenaardecd9542013-06-07 16:31:50 +02002846 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
Bram Moolenaar417bad22013-06-07 14:08:30 +02002847 code,
2848 state->c,
2849 abs(state->id),
2850 state->val);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002851 if (state->id < 0)
2852 return;
2853
2854 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02002855
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002856 // grow indent for state->out
Bram Moolenaar152e7892013-05-25 12:28:11 +02002857 indent->ga_len -= 1;
2858 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002859 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002860 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002861 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002862 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002863
2864 nfa_print_state2(debugf, state->out, indent);
2865
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002866 // replace last part of indent for state->out1
Bram Moolenaar152e7892013-05-25 12:28:11 +02002867 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002868 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002869 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002870
2871 nfa_print_state2(debugf, state->out1, indent);
2872
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002873 // shrink indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002874 indent->ga_len -= 3;
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002875 ga_append(indent, NUL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002876}
2877
2878/*
2879 * Print the NFA state machine.
2880 */
2881 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002882nfa_dump(nfa_regprog_T *prog)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002883{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002884 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002885
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002886 if (debugf == NULL)
2887 return;
Bram Moolenaard89616e2013-06-06 18:46:06 +02002888
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002889 nfa_print_state(debugf, prog->start);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002890
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002891 if (prog->reganch)
2892 fprintf(debugf, "reganch: %d\n", prog->reganch);
2893 if (prog->regstart != NUL)
2894 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2895 prog->regstart, prog->regstart);
2896 if (prog->match_text != NULL)
2897 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
2898
2899 fclose(debugf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002900}
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002901#endif // ENABLE_LOG
2902#endif // DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002903
2904/*
2905 * Parse r.e. @expr and convert it into postfix form.
2906 * Return the postfix string on success, NULL otherwise.
2907 */
2908 static int *
Bram Moolenaar05540972016-01-30 20:31:25 +01002909re2post(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002910{
2911 if (nfa_reg(REG_NOPAREN) == FAIL)
2912 return NULL;
2913 EMIT(NFA_MOPEN);
2914 return post_start;
2915}
2916
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002917// NB. Some of the code below is inspired by Russ's.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002918
/*
 * Represents an NFA state plus zero, one or two arrows exiting.
 * If c == MATCH, no arrows out; matching state.
 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
 * If c < 256, labeled arrow with character c to out.
 */
2925
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002926static nfa_state_T *state_ptr; // points to nfa_prog->state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002927
2928/*
2929 * Allocate and initialize nfa_state_T.
2930 */
2931 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002932alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002933{
2934 nfa_state_T *s;
2935
2936 if (istate >= nstate)
2937 return NULL;
2938
2939 s = &state_ptr[istate++];
2940
2941 s->c = c;
2942 s->out = out;
2943 s->out1 = out1;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002944 s->val = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002945
2946 s->id = istate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02002947 s->lastlist[0] = 0;
2948 s->lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002949
2950 return s;
2951}
2952
2953/*
2954 * A partially built NFA without the matching state filled in.
2955 * Frag_T.start points at the start state.
2956 * Frag_T.out is a list of places that need to be set to the
2957 * next state for this fragment.
2958 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002959
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002960// Since the out pointers in the list are always
2961// uninitialized, we use the pointers themselves
2962// as storage for the Ptrlists.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002963typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002964union Ptrlist
2965{
2966 Ptrlist *next;
2967 nfa_state_T *s;
2968};
2969
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002970struct Frag
2971{
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002972 nfa_state_T *start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002973 Ptrlist *out;
2974};
2975typedef struct Frag Frag_T;
2976
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002977/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02002978 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002979 */
2980 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01002981frag(nfa_state_T *start, Ptrlist *out)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002982{
Bram Moolenaar053bb602013-05-20 13:55:21 +02002983 Frag_T n;
2984
2985 n.start = start;
2986 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002987 return n;
2988}
2989
2990/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002991 * Create singleton list containing just outp.
2992 */
2993 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01002994list1(
2995 nfa_state_T **outp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002996{
2997 Ptrlist *l;
2998
2999 l = (Ptrlist *)outp;
3000 l->next = NULL;
3001 return l;
3002}
3003
3004/*
3005 * Patch the list of states at out to point to start.
3006 */
3007 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003008patch(Ptrlist *l, nfa_state_T *s)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003009{
3010 Ptrlist *next;
3011
3012 for (; l; l = next)
3013 {
3014 next = l->next;
3015 l->s = s;
3016 }
3017}
3018
3019
3020/*
3021 * Join the two lists l1 and l2, returning the combination.
3022 */
3023 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003024append(Ptrlist *l1, Ptrlist *l2)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003025{
3026 Ptrlist *oldl1;
3027
3028 oldl1 = l1;
3029 while (l1->next)
3030 l1 = l1->next;
3031 l1->next = l2;
3032 return oldl1;
3033}
3034
3035/*
3036 * Stack used for transforming postfix form into NFA.
3037 */
3038static Frag_T empty;
3039
3040 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003041st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003042{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003043#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003044 FILE *df;
3045 int *p2;
3046
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003047 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003048 if (df)
3049 {
3050 fprintf(df, "Error popping the stack!\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003051# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003052 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003053# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003054 fprintf(df, "Postfix form is: ");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003055# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003056 for (p2 = postfix; p2 < end; p2++)
3057 {
3058 nfa_set_code(*p2);
3059 fprintf(df, "%s, ", code);
3060 }
3061 nfa_set_code(*p);
3062 fprintf(df, "\nCurrent position is: ");
3063 for (p2 = postfix; p2 <= p; p2 ++)
3064 {
3065 nfa_set_code(*p2);
3066 fprintf(df, "%s, ", code);
3067 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02003068# else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003069 for (p2 = postfix; p2 < end; p2++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003070 fprintf(df, "%d, ", *p2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003071 fprintf(df, "\nCurrent position is: ");
3072 for (p2 = postfix; p2 <= p; p2 ++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003073 fprintf(df, "%d, ", *p2);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003074# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003075 fprintf(df, "\n--------------------------\n");
3076 fclose(df);
3077 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003078#endif
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003079 emsg(_(e_nfa_regexp_could_not_pop_stack));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003080}
3081
3082/*
3083 * Push an item onto the stack.
3084 */
3085 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003086st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003087{
3088 Frag_T *stackp = *p;
3089
3090 if (stackp >= stack_end)
3091 return;
3092 *stackp = s;
3093 *p = *p + 1;
3094}
3095
3096/*
3097 * Pop an item from the stack.
3098 */
3099 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003100st_pop(Frag_T **p, Frag_T *stack)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003101{
3102 Frag_T *stackp;
3103
3104 *p = *p - 1;
3105 stackp = *p;
3106 if (stackp < stack)
3107 return empty;
3108 return **p;
3109}
3110
3111/*
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003112 * Estimate the maximum byte length of anything matching "state".
3113 * When unknown or unlimited return -1.
3114 */
3115 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01003116nfa_max_width(nfa_state_T *startstate, int depth)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003117{
3118 int l, r;
3119 nfa_state_T *state = startstate;
3120 int len = 0;
3121
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003122 // detect looping in a NFA_SPLIT
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003123 if (depth > 4)
3124 return -1;
3125
Bram Moolenaarfe70acb2013-06-21 18:31:23 +02003126 while (state != NULL)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003127 {
3128 switch (state->c)
3129 {
3130 case NFA_END_INVISIBLE:
3131 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003132 // the end, return what we have
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003133 return len;
3134
3135 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003136 // two alternatives, use the maximum
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003137 l = nfa_max_width(state->out, depth + 1);
3138 r = nfa_max_width(state->out1, depth + 1);
3139 if (l < 0 || r < 0)
3140 return -1;
3141 return len + (l > r ? l : r);
3142
3143 case NFA_ANY:
3144 case NFA_START_COLL:
3145 case NFA_START_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003146 // matches some character, including composing chars
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003147 if (enc_utf8)
3148 len += MB_MAXBYTES;
3149 else if (has_mbyte)
3150 len += 2;
3151 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003152 ++len;
3153 if (state->c != NFA_ANY)
3154 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003155 // skip over the characters
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003156 state = state->out1->out;
3157 continue;
3158 }
3159 break;
3160
3161 case NFA_DIGIT:
3162 case NFA_WHITE:
3163 case NFA_HEX:
3164 case NFA_OCTAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003165 // ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003166 ++len;
3167 break;
3168
3169 case NFA_IDENT:
3170 case NFA_SIDENT:
3171 case NFA_KWORD:
3172 case NFA_SKWORD:
3173 case NFA_FNAME:
3174 case NFA_SFNAME:
3175 case NFA_PRINT:
3176 case NFA_SPRINT:
3177 case NFA_NWHITE:
3178 case NFA_NDIGIT:
3179 case NFA_NHEX:
3180 case NFA_NOCTAL:
3181 case NFA_WORD:
3182 case NFA_NWORD:
3183 case NFA_HEAD:
3184 case NFA_NHEAD:
3185 case NFA_ALPHA:
3186 case NFA_NALPHA:
3187 case NFA_LOWER:
3188 case NFA_NLOWER:
3189 case NFA_UPPER:
3190 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02003191 case NFA_LOWER_IC:
3192 case NFA_NLOWER_IC:
3193 case NFA_UPPER_IC:
3194 case NFA_NUPPER_IC:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02003195 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003196 // possibly non-ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003197 if (has_mbyte)
3198 len += 3;
3199 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003200 ++len;
3201 break;
3202
3203 case NFA_START_INVISIBLE:
3204 case NFA_START_INVISIBLE_NEG:
3205 case NFA_START_INVISIBLE_BEFORE:
3206 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003207 // zero-width, out1 points to the END state
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003208 state = state->out1->out;
3209 continue;
3210
3211 case NFA_BACKREF1:
3212 case NFA_BACKREF2:
3213 case NFA_BACKREF3:
3214 case NFA_BACKREF4:
3215 case NFA_BACKREF5:
3216 case NFA_BACKREF6:
3217 case NFA_BACKREF7:
3218 case NFA_BACKREF8:
3219 case NFA_BACKREF9:
3220#ifdef FEAT_SYN_HL
3221 case NFA_ZREF1:
3222 case NFA_ZREF2:
3223 case NFA_ZREF3:
3224 case NFA_ZREF4:
3225 case NFA_ZREF5:
3226 case NFA_ZREF6:
3227 case NFA_ZREF7:
3228 case NFA_ZREF8:
3229 case NFA_ZREF9:
3230#endif
3231 case NFA_NEWL:
3232 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003233 // unknown width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003234 return -1;
3235
3236 case NFA_BOL:
3237 case NFA_EOL:
3238 case NFA_BOF:
3239 case NFA_EOF:
3240 case NFA_BOW:
3241 case NFA_EOW:
3242 case NFA_MOPEN:
3243 case NFA_MOPEN1:
3244 case NFA_MOPEN2:
3245 case NFA_MOPEN3:
3246 case NFA_MOPEN4:
3247 case NFA_MOPEN5:
3248 case NFA_MOPEN6:
3249 case NFA_MOPEN7:
3250 case NFA_MOPEN8:
3251 case NFA_MOPEN9:
3252#ifdef FEAT_SYN_HL
3253 case NFA_ZOPEN:
3254 case NFA_ZOPEN1:
3255 case NFA_ZOPEN2:
3256 case NFA_ZOPEN3:
3257 case NFA_ZOPEN4:
3258 case NFA_ZOPEN5:
3259 case NFA_ZOPEN6:
3260 case NFA_ZOPEN7:
3261 case NFA_ZOPEN8:
3262 case NFA_ZOPEN9:
3263 case NFA_ZCLOSE:
3264 case NFA_ZCLOSE1:
3265 case NFA_ZCLOSE2:
3266 case NFA_ZCLOSE3:
3267 case NFA_ZCLOSE4:
3268 case NFA_ZCLOSE5:
3269 case NFA_ZCLOSE6:
3270 case NFA_ZCLOSE7:
3271 case NFA_ZCLOSE8:
3272 case NFA_ZCLOSE9:
3273#endif
3274 case NFA_MCLOSE:
3275 case NFA_MCLOSE1:
3276 case NFA_MCLOSE2:
3277 case NFA_MCLOSE3:
3278 case NFA_MCLOSE4:
3279 case NFA_MCLOSE5:
3280 case NFA_MCLOSE6:
3281 case NFA_MCLOSE7:
3282 case NFA_MCLOSE8:
3283 case NFA_MCLOSE9:
3284 case NFA_NOPEN:
3285 case NFA_NCLOSE:
3286
3287 case NFA_LNUM_GT:
3288 case NFA_LNUM_LT:
3289 case NFA_COL_GT:
3290 case NFA_COL_LT:
3291 case NFA_VCOL_GT:
3292 case NFA_VCOL_LT:
3293 case NFA_MARK_GT:
3294 case NFA_MARK_LT:
3295 case NFA_VISUAL:
3296 case NFA_LNUM:
3297 case NFA_CURSOR:
3298 case NFA_COL:
3299 case NFA_VCOL:
3300 case NFA_MARK:
3301
3302 case NFA_ZSTART:
3303 case NFA_ZEND:
3304 case NFA_OPT_CHARS:
Bram Moolenaar699c1202013-09-25 16:41:54 +02003305 case NFA_EMPTY:
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003306 case NFA_START_PATTERN:
3307 case NFA_END_PATTERN:
3308 case NFA_COMPOSING:
3309 case NFA_END_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003310 // zero-width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003311 break;
3312
3313 default:
3314 if (state->c < 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003315 // don't know what this is
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003316 return -1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003317 // normal character
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003318 len += MB_CHAR2LEN(state->c);
3319 break;
3320 }
3321
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003322 // normal way to continue
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003323 state = state->out;
3324 }
3325
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003326 // unrecognized, "cannot happen"
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003327 return -1;
3328}
Bram Moolenaar1e02e662013-06-08 23:26:27 +02003329
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003330/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003331 * Convert a postfix form into its equivalent NFA.
3332 * Return the NFA start state on success, NULL otherwise.
3333 */
3334 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01003335post2nfa(int *postfix, int *end, int nfa_calc_size)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003336{
3337 int *p;
3338 int mopen;
3339 int mclose;
3340 Frag_T *stack = NULL;
3341 Frag_T *stackp = NULL;
3342 Frag_T *stack_end = NULL;
3343 Frag_T e1;
3344 Frag_T e2;
3345 Frag_T e;
3346 nfa_state_T *s;
3347 nfa_state_T *s1;
3348 nfa_state_T *matchstate;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003349 nfa_state_T *ret = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003350
3351 if (postfix == NULL)
3352 return NULL;
3353
Bram Moolenaar053bb602013-05-20 13:55:21 +02003354#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003355#define POP() st_pop(&stackp, stack); \
3356 if (stackp < stack) \
3357 { \
3358 st_error(postfix, end, p); \
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003359 vim_free(stack); \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003360 return NULL; \
3361 }
3362
3363 if (nfa_calc_size == FALSE)
3364 {
Bram Moolenaar32aa1022019-11-02 22:54:41 +01003365 // Allocate space for the stack. Max states on the stack: "nstate".
Bram Moolenaarc799fe22019-05-28 23:08:19 +02003366 stack = ALLOC_MULT(Frag_T, nstate + 1);
Bram Moolenaarc57463c2018-12-26 22:04:41 +01003367 if (stack == NULL)
3368 return NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003369 stackp = stack;
Bram Moolenaare3c7b862013-05-20 21:57:03 +02003370 stack_end = stack + (nstate + 1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003371 }
3372
3373 for (p = postfix; p < end; ++p)
3374 {
3375 switch (*p)
3376 {
3377 case NFA_CONCAT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003378 // Concatenation.
3379 // Pay attention: this operator does not exist in the r.e. itself
3380 // (it is implicit, really). It is added when r.e. is translated
3381 // to postfix form in re2post().
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003382 if (nfa_calc_size == TRUE)
3383 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003384 // nstate += 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003385 break;
3386 }
3387 e2 = POP();
3388 e1 = POP();
3389 patch(e1.out, e2.start);
3390 PUSH(frag(e1.start, e2.out));
3391 break;
3392
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003393 case NFA_OR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003394 // Alternation
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003395 if (nfa_calc_size == TRUE)
3396 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003397 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003398 break;
3399 }
3400 e2 = POP();
3401 e1 = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003402 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003403 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003404 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003405 PUSH(frag(s, append(e1.out, e2.out)));
3406 break;
3407
3408 case NFA_STAR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003409 // Zero or more, prefer more
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003410 if (nfa_calc_size == TRUE)
3411 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003412 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003413 break;
3414 }
3415 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003416 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003417 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003418 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003419 patch(e.out, s);
3420 PUSH(frag(s, list1(&s->out1)));
3421 break;
3422
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003423 case NFA_STAR_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003424 // Zero or more, prefer zero
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003425 if (nfa_calc_size == TRUE)
3426 {
3427 nstate++;
3428 break;
3429 }
3430 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003431 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003432 if (s == NULL)
3433 goto theend;
3434 patch(e.out, s);
3435 PUSH(frag(s, list1(&s->out)));
3436 break;
3437
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003438 case NFA_QUEST:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003439 // one or zero atoms=> greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003440 if (nfa_calc_size == TRUE)
3441 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003442 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003443 break;
3444 }
3445 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003446 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003447 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003448 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003449 PUSH(frag(s, append(e.out, list1(&s->out1))));
3450 break;
3451
3452 case NFA_QUEST_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003453 // zero or one atoms => non-greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003454 if (nfa_calc_size == TRUE)
3455 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003456 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003457 break;
3458 }
3459 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003460 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003461 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003462 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003463 PUSH(frag(s, append(e.out, list1(&s->out))));
3464 break;
3465
Bram Moolenaar417bad22013-06-07 14:08:30 +02003466 case NFA_END_COLL:
3467 case NFA_END_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003468 // On the stack is the sequence starting with NFA_START_COLL or
3469 // NFA_START_NEG_COLL and all possible characters. Patch it to
3470 // add the output to the start.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003471 if (nfa_calc_size == TRUE)
3472 {
3473 nstate++;
3474 break;
3475 }
3476 e = POP();
3477 s = alloc_state(NFA_END_COLL, NULL, NULL);
3478 if (s == NULL)
3479 goto theend;
3480 patch(e.out, s);
3481 e.start->out1 = s;
3482 PUSH(frag(e.start, list1(&s->out)));
3483 break;
3484
3485 case NFA_RANGE:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003486 // Before this are two characters, the low and high end of a
3487 // range. Turn them into two states with MIN and MAX.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003488 if (nfa_calc_size == TRUE)
3489 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003490 // nstate += 0;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003491 break;
3492 }
3493 e2 = POP();
3494 e1 = POP();
3495 e2.start->val = e2.start->c;
3496 e2.start->c = NFA_RANGE_MAX;
3497 e1.start->val = e1.start->c;
3498 e1.start->c = NFA_RANGE_MIN;
3499 patch(e1.out, e2.start);
3500 PUSH(frag(e1.start, e2.out));
3501 break;
3502
Bram Moolenaar699c1202013-09-25 16:41:54 +02003503 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003504 // 0-length, used in a repetition with max/min count of 0
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003505 if (nfa_calc_size == TRUE)
3506 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003507 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003508 break;
3509 }
Bram Moolenaar699c1202013-09-25 16:41:54 +02003510 s = alloc_state(NFA_EMPTY, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003511 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003512 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003513 PUSH(frag(s, list1(&s->out)));
3514 break;
3515
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003516 case NFA_OPT_CHARS:
3517 {
3518 int n;
3519
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003520 // \%[abc] implemented as:
3521 // NFA_SPLIT
3522 // +-CHAR(a)
3523 // | +-NFA_SPLIT
3524 // | +-CHAR(b)
3525 // | | +-NFA_SPLIT
3526 // | | +-CHAR(c)
3527 // | | | +-next
3528 // | | +- next
3529 // | +- next
3530 // +- next
3531 n = *++p; // get number of characters
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003532 if (nfa_calc_size == TRUE)
3533 {
3534 nstate += n;
3535 break;
3536 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003537 s = NULL; // avoid compiler warning
3538 e1.out = NULL; // stores list with out1's
3539 s1 = NULL; // previous NFA_SPLIT to connect to
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003540 while (n-- > 0)
3541 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003542 e = POP(); // get character
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003543 s = alloc_state(NFA_SPLIT, e.start, NULL);
3544 if (s == NULL)
3545 goto theend;
3546 if (e1.out == NULL)
3547 e1 = e;
3548 patch(e.out, s1);
3549 append(e1.out, list1(&s->out1));
3550 s1 = s;
3551 }
3552 PUSH(frag(s, e1.out));
3553 break;
3554 }
3555
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003556 case NFA_PREV_ATOM_NO_WIDTH:
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003557 case NFA_PREV_ATOM_NO_WIDTH_NEG:
Bram Moolenaar61602c52013-06-01 19:54:43 +02003558 case NFA_PREV_ATOM_JUST_BEFORE:
3559 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02003560 case NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003561 {
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003562 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3563 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
Bram Moolenaar87953742013-06-05 18:52:40 +02003564 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
Bram Moolenaardecd9542013-06-07 16:31:50 +02003565 int start_state;
3566 int end_state;
Bram Moolenaar87953742013-06-05 18:52:40 +02003567 int n = 0;
3568 nfa_state_T *zend;
3569 nfa_state_T *skip;
3570
Bram Moolenaardecd9542013-06-07 16:31:50 +02003571 switch (*p)
Bram Moolenaar87953742013-06-05 18:52:40 +02003572 {
Bram Moolenaardecd9542013-06-07 16:31:50 +02003573 case NFA_PREV_ATOM_NO_WIDTH:
3574 start_state = NFA_START_INVISIBLE;
3575 end_state = NFA_END_INVISIBLE;
3576 break;
3577 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3578 start_state = NFA_START_INVISIBLE_NEG;
3579 end_state = NFA_END_INVISIBLE_NEG;
3580 break;
3581 case NFA_PREV_ATOM_JUST_BEFORE:
3582 start_state = NFA_START_INVISIBLE_BEFORE;
3583 end_state = NFA_END_INVISIBLE;
3584 break;
3585 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3586 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3587 end_state = NFA_END_INVISIBLE_NEG;
3588 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003589 default: // NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaardecd9542013-06-07 16:31:50 +02003590 start_state = NFA_START_PATTERN;
3591 end_state = NFA_END_PATTERN;
3592 break;
Bram Moolenaar87953742013-06-05 18:52:40 +02003593 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003594
3595 if (before)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003596 n = *++p; // get the count
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003597
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003598 // The \@= operator: match the preceding atom with zero width.
3599 // The \@! operator: no match for the preceding atom.
3600 // The \@<= operator: match for the preceding atom.
3601 // The \@<! operator: no match for the preceding atom.
3602 // Surrounds the preceding atom with START_INVISIBLE and
3603 // END_INVISIBLE, similarly to MOPEN.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003604
3605 if (nfa_calc_size == TRUE)
3606 {
Bram Moolenaar87953742013-06-05 18:52:40 +02003607 nstate += pattern ? 4 : 2;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003608 break;
3609 }
3610 e = POP();
Bram Moolenaar87953742013-06-05 18:52:40 +02003611 s1 = alloc_state(end_state, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003612 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003613 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003614
Bram Moolenaar87953742013-06-05 18:52:40 +02003615 s = alloc_state(start_state, e.start, s1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003616 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003617 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003618 if (pattern)
3619 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003620 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02003621 skip = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003622 if (skip == NULL)
3623 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003624 zend = alloc_state(NFA_ZEND, s1, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003625 if (zend == NULL)
3626 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003627 s1->out= skip;
3628 patch(e.out, zend);
3629 PUSH(frag(s, list1(&skip->out)));
Bram Moolenaar61602c52013-06-01 19:54:43 +02003630 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003631 else
3632 {
3633 patch(e.out, s1);
3634 PUSH(frag(s, list1(&s1->out)));
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003635 if (before)
3636 {
3637 if (n <= 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003638 // See if we can guess the maximum width, it avoids a
3639 // lot of pointless tries.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003640 n = nfa_max_width(e.start, 0);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003641 s->val = n; // store the count
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003642 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003643 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003644 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003645 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003646
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003647 case NFA_COMPOSING: // char with composing char
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003648#if 0
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003649 // TODO
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003650 if (regflags & RF_ICOMBINE)
3651 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003652 // use the base character only
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003653 }
3654#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003655 // FALLTHROUGH
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003656
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003657 case NFA_MOPEN: // \( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003658 case NFA_MOPEN1:
3659 case NFA_MOPEN2:
3660 case NFA_MOPEN3:
3661 case NFA_MOPEN4:
3662 case NFA_MOPEN5:
3663 case NFA_MOPEN6:
3664 case NFA_MOPEN7:
3665 case NFA_MOPEN8:
3666 case NFA_MOPEN9:
3667#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003668 case NFA_ZOPEN: // \z( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003669 case NFA_ZOPEN1:
3670 case NFA_ZOPEN2:
3671 case NFA_ZOPEN3:
3672 case NFA_ZOPEN4:
3673 case NFA_ZOPEN5:
3674 case NFA_ZOPEN6:
3675 case NFA_ZOPEN7:
3676 case NFA_ZOPEN8:
3677 case NFA_ZOPEN9:
3678#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003679 case NFA_NOPEN: // \%( \) "Invisible Submatch"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003680 if (nfa_calc_size == TRUE)
3681 {
3682 nstate += 2;
3683 break;
3684 }
3685
3686 mopen = *p;
3687 switch (*p)
3688 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003689 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3690#ifdef FEAT_SYN_HL
3691 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3692 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3693 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3694 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3695 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3696 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3697 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3698 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3699 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3700 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3701#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003702 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003703 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003704 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003705 mclose = *p + NSUBEXP;
3706 break;
3707 }
3708
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003709 // Allow "NFA_MOPEN" as a valid postfix representation for
3710 // the empty regexp "". In this case, the NFA will be
3711 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3712 // empty groups of parenthesis, and empty mbyte chars
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003713 if (stackp == stack)
3714 {
Bram Moolenaar525666f2013-06-02 16:40:55 +02003715 s = alloc_state(mopen, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003716 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003717 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003718 s1 = alloc_state(mclose, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003719 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003720 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003721 patch(list1(&s->out), s1);
3722 PUSH(frag(s, list1(&s1->out)));
3723 break;
3724 }
3725
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003726 // At least one node was emitted before NFA_MOPEN, so
3727 // at least one node will be between NFA_MOPEN and NFA_MCLOSE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003728 e = POP();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003729 s = alloc_state(mopen, e.start, NULL); // `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003730 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003731 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003732
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003733 s1 = alloc_state(mclose, NULL, NULL); // `)'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003734 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003735 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003736 patch(e.out, s1);
3737
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003738 if (mopen == NFA_COMPOSING)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003739 // COMPOSING->out1 = END_COMPOSING
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003740 patch(list1(&s->out1), s1);
3741
3742 PUSH(frag(s, list1(&s1->out)));
3743 break;
3744
Bram Moolenaar5714b802013-05-28 22:03:20 +02003745 case NFA_BACKREF1:
3746 case NFA_BACKREF2:
3747 case NFA_BACKREF3:
3748 case NFA_BACKREF4:
3749 case NFA_BACKREF5:
3750 case NFA_BACKREF6:
3751 case NFA_BACKREF7:
3752 case NFA_BACKREF8:
3753 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003754#ifdef FEAT_SYN_HL
3755 case NFA_ZREF1:
3756 case NFA_ZREF2:
3757 case NFA_ZREF3:
3758 case NFA_ZREF4:
3759 case NFA_ZREF5:
3760 case NFA_ZREF6:
3761 case NFA_ZREF7:
3762 case NFA_ZREF8:
3763 case NFA_ZREF9:
3764#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02003765 if (nfa_calc_size == TRUE)
3766 {
3767 nstate += 2;
3768 break;
3769 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003770 s = alloc_state(*p, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003771 if (s == NULL)
3772 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003773 s1 = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003774 if (s1 == NULL)
3775 goto theend;
3776 patch(list1(&s->out), s1);
3777 PUSH(frag(s, list1(&s1->out)));
3778 break;
3779
Bram Moolenaar423532e2013-05-29 21:14:42 +02003780 case NFA_LNUM:
3781 case NFA_LNUM_GT:
3782 case NFA_LNUM_LT:
3783 case NFA_VCOL:
3784 case NFA_VCOL_GT:
3785 case NFA_VCOL_LT:
3786 case NFA_COL:
3787 case NFA_COL_GT:
3788 case NFA_COL_LT:
Bram Moolenaar044aa292013-06-04 21:27:38 +02003789 case NFA_MARK:
3790 case NFA_MARK_GT:
3791 case NFA_MARK_LT:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003792 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003793 int n = *++p; // lnum, col or mark name
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003794
Bram Moolenaar423532e2013-05-29 21:14:42 +02003795 if (nfa_calc_size == TRUE)
3796 {
3797 nstate += 1;
3798 break;
3799 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003800 s = alloc_state(p[-1], NULL, NULL);
Bram Moolenaar423532e2013-05-29 21:14:42 +02003801 if (s == NULL)
3802 goto theend;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003803 s->val = n;
Bram Moolenaar423532e2013-05-29 21:14:42 +02003804 PUSH(frag(s, list1(&s->out)));
3805 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003806 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02003807
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003808 case NFA_ZSTART:
3809 case NFA_ZEND:
3810 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003811 // Operands
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003812 if (nfa_calc_size == TRUE)
3813 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003814 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003815 break;
3816 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003817 s = alloc_state(*p, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003818 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003819 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003820 PUSH(frag(s, list1(&s->out)));
3821 break;
3822
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003823 } // switch(*p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003824
Bram Moolenaarc9471b12023-05-09 15:00:00 +01003825 } // for (p = postfix; *p; ++p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003826
3827 if (nfa_calc_size == TRUE)
3828 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003829 nstate++;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003830 goto theend; // Return value when counting size is ignored anyway
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003831 }
3832
3833 e = POP();
3834 if (stackp != stack)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003835 {
3836 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003837 EMSG_RET_NULL(_(e_nfa_regexp_while_converting_from_postfix_to_nfa_too_many_stats_left_on_stack));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003838 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003839
3840 if (istate >= nstate)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003841 {
3842 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003843 EMSG_RET_NULL(_(e_nfa_regexp_not_enough_space_to_store_whole_nfa));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003844 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003845
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003846 matchstate = &state_ptr[istate++]; // the match state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003847 matchstate->c = NFA_MATCH;
3848 matchstate->out = matchstate->out1 = NULL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003849 matchstate->id = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003850
3851 patch(e.out, matchstate);
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003852 ret = e.start;
3853
3854theend:
3855 vim_free(stack);
3856 return ret;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003857
3858#undef POP1
3859#undef PUSH1
3860#undef POP2
3861#undef PUSH2
3862#undef POP
3863#undef PUSH
3864}
3865
Bram Moolenaara2947e22013-06-11 22:44:09 +02003866/*
3867 * After building the NFA program, inspect it to add optimization hints.
3868 */
3869 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003870nfa_postprocess(nfa_regprog_T *prog)
Bram Moolenaara2947e22013-06-11 22:44:09 +02003871{
3872 int i;
3873 int c;
3874
3875 for (i = 0; i < prog->nstate; ++i)
3876 {
3877 c = prog->state[i].c;
3878 if (c == NFA_START_INVISIBLE
3879 || c == NFA_START_INVISIBLE_NEG
3880 || c == NFA_START_INVISIBLE_BEFORE
3881 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3882 {
3883 int directly;
3884
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003885 // Do it directly when what follows is possibly the end of the
3886 // match.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003887 if (match_follows(prog->state[i].out1->out, 0))
3888 directly = TRUE;
3889 else
3890 {
3891 int ch_invisible = failure_chance(prog->state[i].out, 0);
3892 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3893
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003894 // Postpone when the invisible match is expensive or has a
3895 // lower chance of failing.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003896 if (c == NFA_START_INVISIBLE_BEFORE
3897 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3898 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003899 // "before" matches are very expensive when
3900 // unbounded, always prefer what follows then,
3901 // unless what follows will always match.
3902 // Otherwise strongly prefer what follows.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003903 if (prog->state[i].val <= 0 && ch_follows > 0)
3904 directly = FALSE;
3905 else
3906 directly = ch_follows * 10 < ch_invisible;
3907 }
3908 else
3909 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003910 // normal invisible, first do the one with the
3911 // highest failure chance
Bram Moolenaara2947e22013-06-11 22:44:09 +02003912 directly = ch_follows < ch_invisible;
3913 }
3914 }
3915 if (directly)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003916 // switch to the _FIRST state
Bram Moolenaara2947e22013-06-11 22:44:09 +02003917 ++prog->state[i].c;
3918 }
3919 }
3920}
3921
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003922/////////////////////////////////////////////////////////////////
3923// NFA execution code.
3924/////////////////////////////////////////////////////////////////
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003925
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003926typedef struct
3927{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003928 int in_use; // number of subexpr with useful info
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003929
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003930 // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003931 union
3932 {
3933 struct multipos
3934 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01003935 linenr_T start_lnum;
3936 linenr_T end_lnum;
3937 colnr_T start_col;
3938 colnr_T end_col;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003939 } multi[NSUBEXP];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003940 struct linepos
3941 {
3942 char_u *start;
3943 char_u *end;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003944 } line[NSUBEXP];
3945 } list;
Bram Moolenaar79336e12022-12-11 14:18:31 +00003946 colnr_T orig_start_col; // list.multi[0].start_col without \zs
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003947} regsub_T;
3948
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003949typedef struct
3950{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003951 regsub_T norm; // \( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003952#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003953 regsub_T synt; // \z( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003954#endif
3955} regsubs_T;
3956
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003957// nfa_pim_T stores a Postponed Invisible Match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02003958typedef struct nfa_pim_S nfa_pim_T;
3959struct nfa_pim_S
3960{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003961 int result; // NFA_PIM_*, see below
3962 nfa_state_T *state; // the invisible match start state
3963 regsubs_T subs; // submatch info, only party used
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02003964 union
3965 {
3966 lpos_T pos;
3967 char_u *ptr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003968 } end; // where the match must end
Bram Moolenaara2d95102013-06-04 14:23:05 +02003969};
3970
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003971// Values for done in nfa_pim_T.
3972#define NFA_PIM_UNUSED 0 // pim not used
3973#define NFA_PIM_TODO 1 // pim not done yet
3974#define NFA_PIM_MATCH 2 // pim executed, matches
3975#define NFA_PIM_NOMATCH 3 // pim executed, no match
Bram Moolenaara2d95102013-06-04 14:23:05 +02003976
3977
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003978// nfa_thread_T contains execution information of a NFA state
Bram Moolenaar4b417062013-05-25 20:19:50 +02003979typedef struct
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003980{
3981 nfa_state_T *state;
Bram Moolenaar5714b802013-05-28 22:03:20 +02003982 int count;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003983 nfa_pim_T pim; // if pim.result != NFA_PIM_UNUSED: postponed
3984 // invisible match
3985 regsubs_T subs; // submatch info, only party used
Bram Moolenaar4b417062013-05-25 20:19:50 +02003986} nfa_thread_T;
3987
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003988// nfa_list_T contains the alternative NFA execution states.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003989typedef struct
3990{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003991 nfa_thread_T *t; // allocated array of states
3992 int n; // nr of states currently in "t"
3993 int len; // max nr of states in "t"
3994 int id; // ID of the list
3995 int has_pim; // TRUE when any state has a PIM
Bram Moolenaar4b417062013-05-25 20:19:50 +02003996} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003997
Bram Moolenaar5714b802013-05-28 22:03:20 +02003998#ifdef ENABLE_LOG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01003999static void log_subexpr(regsub_T *sub);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004000
4001 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004002log_subsexpr(regsubs_T *subs)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004003{
4004 log_subexpr(&subs->norm);
4005# ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004006 if (rex.nfa_has_zsubexpr)
Bram Moolenaar6d3a5d72013-06-06 18:04:51 +02004007 log_subexpr(&subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004008# endif
4009}
4010
Bram Moolenaar5714b802013-05-28 22:03:20 +02004011 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004012log_subexpr(regsub_T *sub)
Bram Moolenaar5714b802013-05-28 22:03:20 +02004013{
4014 int j;
4015
4016 for (j = 0; j < sub->in_use; j++)
4017 if (REG_MULTI)
Bram Moolenaarc96311b2022-11-25 21:13:47 +00004018 fprintf(log_fd,
4019 "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004020 j,
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004021 sub->list.multi[j].start_col,
4022 (int)sub->list.multi[j].start_lnum,
4023 sub->list.multi[j].end_col,
4024 (int)sub->list.multi[j].end_lnum);
Bram Moolenaar5714b802013-05-28 22:03:20 +02004025 else
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004026 {
4027 char *s = (char *)sub->list.line[j].start;
4028 char *e = (char *)sub->list.line[j].end;
4029
Bram Moolenaar87953742013-06-05 18:52:40 +02004030 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004031 j,
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004032 s == NULL ? "NULL" : s,
4033 e == NULL ? "NULL" : e);
4034 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004035}
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004036
4037 static char *
Bram Moolenaar05540972016-01-30 20:31:25 +01004038pim_info(nfa_pim_T *pim)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004039{
4040 static char buf[30];
4041
4042 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4043 buf[0] = NUL;
4044 else
4045 {
4046 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
Bram Moolenaar0270f382018-07-17 05:43:58 +02004047 : (int)(pim->end.ptr - rex.input));
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004048 }
4049 return buf;
4050}
4051
Bram Moolenaar5714b802013-05-28 22:03:20 +02004052#endif
4053
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004054// Used during execution: whether a match has been found.
Bram Moolenaar2338c322018-07-08 19:07:19 +02004055static int nfa_match;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004056#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02004057static int *nfa_timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004058#endif
Bram Moolenaar4b417062013-05-25 20:19:50 +02004059
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004060static void copy_sub(regsub_T *to, regsub_T *from);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004061static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004062
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004063/*
4064 * Copy postponed invisible match info from "from" to "to".
4065 */
4066 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004067copy_pim(nfa_pim_T *to, nfa_pim_T *from)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004068{
4069 to->result = from->result;
4070 to->state = from->state;
4071 copy_sub(&to->subs.norm, &from->subs.norm);
4072#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004073 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004074 copy_sub(&to->subs.synt, &from->subs.synt);
4075#endif
4076 to->end = from->end;
4077}
4078
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004079 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004080clear_sub(regsub_T *sub)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004081{
4082 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004083 // Use 0xff to set lnum to -1
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004084 vim_memset(sub->list.multi, 0xff,
Bram Moolenaar0270f382018-07-17 05:43:58 +02004085 sizeof(struct multipos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004086 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004087 vim_memset(sub->list.line, 0,
4088 sizeof(struct linepos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004089 sub->in_use = 0;
4090}
4091
4092/*
4093 * Copy the submatches from "from" to "to".
4094 */
4095 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004096copy_sub(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004097{
4098 to->in_use = from->in_use;
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004099 if (from->in_use <= 0)
4100 return;
4101
4102 // Copy the match start and end positions.
4103 if (REG_MULTI)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004104 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004105 mch_memmove(&to->list.multi[0],
4106 &from->list.multi[0],
4107 sizeof(struct multipos) * from->in_use);
4108 to->orig_start_col = from->orig_start_col;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004109 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004110 else
4111 mch_memmove(&to->list.line[0],
4112 &from->list.line[0],
4113 sizeof(struct linepos) * from->in_use);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004114}
4115
4116/*
4117 * Like copy_sub() but exclude the main match.
4118 */
4119 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004120copy_sub_off(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004121{
4122 if (to->in_use < from->in_use)
4123 to->in_use = from->in_use;
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004124 if (from->in_use <= 1)
4125 return;
4126
4127 // Copy the match start and end positions.
4128 if (REG_MULTI)
4129 mch_memmove(&to->list.multi[1],
4130 &from->list.multi[1],
4131 sizeof(struct multipos) * (from->in_use - 1));
4132 else
4133 mch_memmove(&to->list.line[1],
4134 &from->list.line[1],
4135 sizeof(struct linepos) * (from->in_use - 1));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004136}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004137
Bram Moolenaar428e9872013-05-30 17:05:39 +02004138/*
Bram Moolenaarf2118842013-09-25 18:16:38 +02004139 * Like copy_sub() but only do the end of the main match if \ze is present.
4140 */
4141 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004142copy_ze_off(regsub_T *to, regsub_T *from)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004143{
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004144 if (!rex.nfa_has_zend)
4145 return;
4146
4147 if (REG_MULTI)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004148 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004149 if (from->list.multi[0].end_lnum >= 0)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004150 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004151 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4152 to->list.multi[0].end_col = from->list.multi[0].end_col;
Bram Moolenaarf2118842013-09-25 18:16:38 +02004153 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004154 }
4155 else
4156 {
4157 if (from->list.line[0].end != NULL)
4158 to->list.line[0].end = from->list.line[0].end;
Bram Moolenaarf2118842013-09-25 18:16:38 +02004159 }
4160}
4161
4162/*
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004163 * Return TRUE if "sub1" and "sub2" have the same start positions.
Bram Moolenaaree482532014-05-13 15:56:51 +02004164 * When using back-references also check the end position.
Bram Moolenaar428e9872013-05-30 17:05:39 +02004165 */
4166 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004167sub_equal(regsub_T *sub1, regsub_T *sub2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004168{
4169 int i;
4170 int todo;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004171 linenr_T s1;
4172 linenr_T s2;
4173 char_u *sp1;
4174 char_u *sp2;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004175
4176 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4177 if (REG_MULTI)
4178 {
4179 for (i = 0; i < todo; ++i)
4180 {
4181 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004182 s1 = sub1->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004183 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004184 s1 = -1;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004185 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004186 s2 = sub2->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004187 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004188 s2 = -1;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004189 if (s1 != s2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004190 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004191 if (s1 != -1 && sub1->list.multi[i].start_col
4192 != sub2->list.multi[i].start_col)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004193 return FALSE;
Bram Moolenaaree482532014-05-13 15:56:51 +02004194
Bram Moolenaar0270f382018-07-17 05:43:58 +02004195 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004196 {
4197 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004198 s1 = sub1->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004199 else
4200 s1 = -1;
4201 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004202 s2 = sub2->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004203 else
4204 s2 = -1;
4205 if (s1 != s2)
4206 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004207 if (s1 != -1 && sub1->list.multi[i].end_col
4208 != sub2->list.multi[i].end_col)
Bram Moolenaaree482532014-05-13 15:56:51 +02004209 return FALSE;
4210 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004211 }
4212 }
4213 else
4214 {
4215 for (i = 0; i < todo; ++i)
4216 {
4217 if (i < sub1->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004218 sp1 = sub1->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004219 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004220 sp1 = NULL;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004221 if (i < sub2->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004222 sp2 = sub2->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004223 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004224 sp2 = NULL;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004225 if (sp1 != sp2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004226 return FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004227 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004228 {
4229 if (i < sub1->in_use)
4230 sp1 = sub1->list.line[i].end;
4231 else
4232 sp1 = NULL;
4233 if (i < sub2->in_use)
4234 sp2 = sub2->list.line[i].end;
4235 else
4236 sp2 = NULL;
4237 if (sp1 != sp2)
4238 return FALSE;
4239 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004240 }
4241 }
4242
4243 return TRUE;
4244}
4245
#ifdef FEAT_RELTIME
/*
 * Return TRUE when the timeout flag is set, FALSE otherwise.
 * On a timeout, and when "nfa_timed_out" is not NULL, the flag it points to
 * is set to TRUE.  With FEAT_EVAL a message is logged the first time.
 */
    static int
nfa_did_time_out(void)
{
    if (!*timeout_flag)
        return FALSE;

    if (nfa_timed_out == NULL)
        return TRUE;

# ifdef FEAT_EVAL
    // Only log when the flag was not set yet.
    if (!*nfa_timed_out)
        ch_log(NULL, "NFA regexp timed out");
# endif
    *nfa_timed_out = TRUE;
    return TRUE;
}
#endif
4268
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004269#ifdef ENABLE_LOG
4270 static void
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004271open_debug_log(int result)
4272{
4273 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
4274 if (log_fd == NULL)
4275 {
4276 emsg(_(e_log_open_failed));
4277 log_fd = stderr;
4278 }
4279
4280 fprintf(log_fd, "****************************\n");
4281 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
4282 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : result == MAYBE
4283 ? "MAYBE" : "FALSE");
4284 fprintf(log_fd, "****************************\n");
4285}
4286
4287 static void
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004288report_state(char *action,
4289 regsub_T *sub,
4290 nfa_state_T *state,
4291 int lid,
4292 nfa_pim_T *pim)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004293{
4294 int col;
4295
4296 if (sub->in_use <= 0)
4297 col = -1;
4298 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004299 col = sub->list.multi[0].start_col;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004300 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004301 col = (int)(sub->list.line[0].start - rex.line);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004302 nfa_set_code(state->c);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004303 if (log_fd == NULL)
4304 open_debug_log(MAYBE);
4305
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004306 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
4307 action, abs(state->id), lid, state->c, code, col,
4308 pim_info(pim));
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004309}
4310#endif
4311
Bram Moolenaar43e02982013-06-07 17:31:29 +02004312/*
4313 * Return TRUE if the same state is already in list "l" with the same
4314 * positions as "subs".
4315 */
4316 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004317has_state_with_pos(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004318 nfa_list_T *l, // runtime state list
4319 nfa_state_T *state, // state to update
4320 regsubs_T *subs, // pointers to subexpressions
4321 nfa_pim_T *pim) // postponed match or NULL
Bram Moolenaar43e02982013-06-07 17:31:29 +02004322{
4323 nfa_thread_T *thread;
4324 int i;
4325
4326 for (i = 0; i < l->n; ++i)
4327 {
4328 thread = &l->t[i];
4329 if (thread->state->id == state->id
4330 && sub_equal(&thread->subs.norm, &subs->norm)
4331#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004332 && (!rex.nfa_has_zsubexpr
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004333 || sub_equal(&thread->subs.synt, &subs->synt))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004334#endif
Bram Moolenaar69b52452013-07-17 21:10:51 +02004335 && pim_equal(&thread->pim, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004336 return TRUE;
4337 }
4338 return FALSE;
4339}
4340
4341/*
Bram Moolenaar69b52452013-07-17 21:10:51 +02004342 * Return TRUE if "one" and "two" are equal. That includes when both are not
4343 * set.
4344 */
4345 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004346pim_equal(nfa_pim_T *one, nfa_pim_T *two)
Bram Moolenaar69b52452013-07-17 21:10:51 +02004347{
4348 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4349 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4350
4351 if (one_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004352 // one is unused: equal when two is also unused
Bram Moolenaar69b52452013-07-17 21:10:51 +02004353 return two_unused;
4354 if (two_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004355 // one is used and two is not: not equal
Bram Moolenaar69b52452013-07-17 21:10:51 +02004356 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004357 // compare the state id
Bram Moolenaar3f0df062013-08-14 13:34:25 +02004358 if (one->state->id != two->state->id)
4359 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004360 // compare the position
Bram Moolenaar69b52452013-07-17 21:10:51 +02004361 if (REG_MULTI)
4362 return one->end.pos.lnum == two->end.pos.lnum
4363 && one->end.pos.col == two->end.pos.col;
4364 return one->end.ptr == two->end.ptr;
4365}
4366
4367/*
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004368 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4369 */
4370 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004371match_follows(nfa_state_T *startstate, int depth)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004372{
4373 nfa_state_T *state = startstate;
4374
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004375 // avoid too much recursion
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004376 if (depth > 10)
4377 return FALSE;
4378
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004379 while (state != NULL)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004380 {
4381 switch (state->c)
4382 {
4383 case NFA_MATCH:
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004384 case NFA_MCLOSE:
4385 case NFA_END_INVISIBLE:
4386 case NFA_END_INVISIBLE_NEG:
4387 case NFA_END_PATTERN:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004388 return TRUE;
4389
4390 case NFA_SPLIT:
4391 return match_follows(state->out, depth + 1)
4392 || match_follows(state->out1, depth + 1);
4393
4394 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004395 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004396 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004397 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004398 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004399 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004400 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004401 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004402 case NFA_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004403 // skip ahead to next state
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004404 state = state->out1->out;
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004405 continue;
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004406
4407 case NFA_ANY:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02004408 case NFA_ANY_COMPOSING:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004409 case NFA_IDENT:
4410 case NFA_SIDENT:
4411 case NFA_KWORD:
4412 case NFA_SKWORD:
4413 case NFA_FNAME:
4414 case NFA_SFNAME:
4415 case NFA_PRINT:
4416 case NFA_SPRINT:
4417 case NFA_WHITE:
4418 case NFA_NWHITE:
4419 case NFA_DIGIT:
4420 case NFA_NDIGIT:
4421 case NFA_HEX:
4422 case NFA_NHEX:
4423 case NFA_OCTAL:
4424 case NFA_NOCTAL:
4425 case NFA_WORD:
4426 case NFA_NWORD:
4427 case NFA_HEAD:
4428 case NFA_NHEAD:
4429 case NFA_ALPHA:
4430 case NFA_NALPHA:
4431 case NFA_LOWER:
4432 case NFA_NLOWER:
4433 case NFA_UPPER:
4434 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02004435 case NFA_LOWER_IC:
4436 case NFA_NLOWER_IC:
4437 case NFA_UPPER_IC:
4438 case NFA_NUPPER_IC:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004439 case NFA_START_COLL:
4440 case NFA_START_NEG_COLL:
4441 case NFA_NEWL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004442 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004443 return FALSE;
4444
4445 default:
4446 if (state->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004447 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004448 return FALSE;
4449
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004450 // Others: zero-width or possibly zero-width, might still find
4451 // a match at the same position, keep looking.
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004452 break;
4453 }
4454 state = state->out;
4455 }
4456 return FALSE;
4457}
4458
4459
4460/*
Bram Moolenaar43e02982013-06-07 17:31:29 +02004461 * Return TRUE if "state" is already in list "l".
4462 */
4463 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004464state_in_list(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004465 nfa_list_T *l, // runtime state list
4466 nfa_state_T *state, // state to update
4467 regsubs_T *subs) // pointers to subexpressions
Bram Moolenaar43e02982013-06-07 17:31:29 +02004468{
4469 if (state->lastlist[nfa_ll_index] == l->id)
4470 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004471 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004472 return TRUE;
4473 }
4474 return FALSE;
4475}
4476
// Offset used for "off" by addstate_here().
// addstate() takes an "off" argument at or below -ADDSTATE_HERE_OFFSET to
// mean "add here": the list index is then -(off + ADDSTATE_HERE_OFFSET).
#define ADDSTATE_HERE_OFFSET 10
4479
Bram Moolenaard05bf562013-06-30 23:24:08 +02004480/*
4481 * Add "state" and possibly what follows to state list "l".
4482 * Returns "subs_arg", possibly copied into temp_subs.
Bram Moolenaar616592e2022-06-17 15:17:10 +01004483 * Returns NULL when recursiveness is too deep or timed out.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004484 */
Bram Moolenaard05bf562013-06-30 23:24:08 +02004485 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004486addstate(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004487 nfa_list_T *l, // runtime state list
4488 nfa_state_T *state, // state to update
4489 regsubs_T *subs_arg, // pointers to subexpressions
4490 nfa_pim_T *pim, // postponed look-behind match
4491 int off_arg) // byte offset, when -1 go to next line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004492{
Bram Moolenaar963fee22013-05-26 21:47:28 +02004493 int subidx;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004494 int off = off_arg;
4495 int add_here = FALSE;
4496 int listindex = 0;
4497 int k;
4498 int found = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004499 nfa_thread_T *thread;
Bram Moolenaard5638832016-09-09 17:59:50 +02004500 struct multipos save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004501 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004502 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004503 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004504 regsub_T *sub;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004505 regsubs_T *subs = subs_arg;
4506 static regsubs_T temp_subs;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004507#ifdef ENABLE_LOG
4508 int did_print = FALSE;
4509#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004510 static int depth = 0;
4511
Bram Moolenaar616592e2022-06-17 15:17:10 +01004512#ifdef FEAT_RELTIME
4513 if (nfa_did_time_out())
4514 return NULL;
4515#endif
4516
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004517 // This function is called recursively. When the depth is too much we run
4518 // out of stack and crash, limit recursiveness here.
Bram Moolenaar5382f122019-02-13 01:18:38 +01004519 if (++depth >= 5000 || subs == NULL)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004520 {
4521 --depth;
4522 return NULL;
4523 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004524
Bram Moolenaar16b35782016-09-09 20:29:50 +02004525 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4526 {
4527 add_here = TRUE;
4528 off = 0;
4529 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4530 }
4531
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004532 switch (state->c)
4533 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004534 case NFA_NCLOSE:
4535 case NFA_MCLOSE:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004536 case NFA_MCLOSE1:
4537 case NFA_MCLOSE2:
4538 case NFA_MCLOSE3:
4539 case NFA_MCLOSE4:
4540 case NFA_MCLOSE5:
4541 case NFA_MCLOSE6:
4542 case NFA_MCLOSE7:
4543 case NFA_MCLOSE8:
4544 case NFA_MCLOSE9:
4545#ifdef FEAT_SYN_HL
4546 case NFA_ZCLOSE:
4547 case NFA_ZCLOSE1:
4548 case NFA_ZCLOSE2:
4549 case NFA_ZCLOSE3:
4550 case NFA_ZCLOSE4:
4551 case NFA_ZCLOSE5:
4552 case NFA_ZCLOSE6:
4553 case NFA_ZCLOSE7:
4554 case NFA_ZCLOSE8:
4555 case NFA_ZCLOSE9:
4556#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004557 case NFA_MOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004558 case NFA_ZEND:
Bram Moolenaar927d4a12013-06-09 17:25:34 +02004559 case NFA_SPLIT:
Bram Moolenaar699c1202013-09-25 16:41:54 +02004560 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004561 // These nodes are not added themselves but their "out" and/or
4562 // "out1" may be added below.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004563 break;
4564
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004565 case NFA_BOL:
4566 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004567 // "^" won't match past end-of-line, don't bother trying.
4568 // Except when at the end of the line, or when we are going to the
4569 // next line for a look-behind match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004570 if (rex.input > rex.line
4571 && *rex.input != NUL
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004572 && (nfa_endp == NULL
4573 || !REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02004574 || rex.lnum == nfa_endp->se_u.pos.lnum))
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004575 goto skip_add;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004576 // FALLTHROUGH
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004577
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004578 case NFA_MOPEN1:
4579 case NFA_MOPEN2:
4580 case NFA_MOPEN3:
4581 case NFA_MOPEN4:
4582 case NFA_MOPEN5:
4583 case NFA_MOPEN6:
4584 case NFA_MOPEN7:
4585 case NFA_MOPEN8:
4586 case NFA_MOPEN9:
4587#ifdef FEAT_SYN_HL
4588 case NFA_ZOPEN:
4589 case NFA_ZOPEN1:
4590 case NFA_ZOPEN2:
4591 case NFA_ZOPEN3:
4592 case NFA_ZOPEN4:
4593 case NFA_ZOPEN5:
4594 case NFA_ZOPEN6:
4595 case NFA_ZOPEN7:
4596 case NFA_ZOPEN8:
4597 case NFA_ZOPEN9:
4598#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004599 case NFA_NOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004600 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004601 // These nodes need to be added so that we can bail out when it
4602 // was added to this list before at the same position to avoid an
4603 // endless loop for "\(\)*"
Bram Moolenaar307aa162013-06-02 16:34:21 +02004604
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004605 default:
Bram Moolenaar272fb582013-11-21 16:03:40 +01004606 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004607 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004608 // This state is already in the list, don't add it again,
4609 // unless it is an MOPEN that is used for a backreference or
4610 // when there is a PIM. For NFA_MATCH check the position,
4611 // lower position is preferred.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004612 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
Bram Moolenaar9c235062014-05-13 16:44:29 +02004613 && state->c != NFA_MATCH)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004614 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004615 // When called from addstate_here() do insert before
4616 // existing states.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004617 if (add_here)
4618 {
4619 for (k = 0; k < l->n && k < listindex; ++k)
4620 if (l->t[k].state->id == state->id)
4621 {
4622 found = TRUE;
4623 break;
4624 }
4625 }
4626 if (!add_here || found)
4627 {
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004628skip_add:
4629#ifdef ENABLE_LOG
Bram Moolenaar16b35782016-09-09 20:29:50 +02004630 nfa_set_code(state->c);
4631 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4632 abs(state->id), l->id, state->c, code,
4633 pim == NULL ? "NULL" : "yes", l->has_pim, found);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004634#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004635 --depth;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004636 return subs;
4637 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004638 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004639
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004640 // Do not add the state again when it exists with the same
4641 // positions.
Bram Moolenaar69b52452013-07-17 21:10:51 +02004642 if (has_state_with_pos(l, state, subs, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004643 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004644 }
4645
Bram Moolenaar688b3982019-02-13 21:47:36 +01004646 // When there are backreferences or PIMs the number of states may
4647 // be (a lot) bigger than anticipated.
Bram Moolenaara0169122013-06-26 18:16:58 +02004648 if (l->n == l->len)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004649 {
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004650 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004651 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004652 nfa_thread_T *newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004653
Bram Moolenaar688b3982019-02-13 21:47:36 +01004654 if ((long)(newsize >> 10) >= p_mmp)
4655 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004656 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004657 --depth;
4658 return NULL;
4659 }
Bram Moolenaard05bf562013-06-30 23:24:08 +02004660 if (subs != &temp_subs)
4661 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004662 // "subs" may point into the current array, need to make a
4663 // copy before it becomes invalid.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004664 copy_sub(&temp_subs.norm, &subs->norm);
4665#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004666 if (rex.nfa_has_zsubexpr)
Bram Moolenaard05bf562013-06-30 23:24:08 +02004667 copy_sub(&temp_subs.synt, &subs->synt);
4668#endif
4669 subs = &temp_subs;
4670 }
4671
Bram Moolenaar688b3982019-02-13 21:47:36 +01004672 newt = vim_realloc(l->t, newsize);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004673 if (newt == NULL)
4674 {
4675 // out of memory
4676 --depth;
4677 return NULL;
4678 }
4679 l->t = newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004680 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004681 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004682
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004683 // add the state to the list
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02004684 state->lastlist[nfa_ll_index] = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004685 thread = &l->t[l->n++];
4686 thread->state = state;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004687 if (pim == NULL)
4688 thread->pim.result = NFA_PIM_UNUSED;
4689 else
Bram Moolenaar196ed142013-07-21 18:59:24 +02004690 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004691 copy_pim(&thread->pim, pim);
Bram Moolenaar196ed142013-07-21 18:59:24 +02004692 l->has_pim = TRUE;
4693 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004694 copy_sub(&thread->subs.norm, &subs->norm);
4695#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004696 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004697 copy_sub(&thread->subs.synt, &subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004698#endif
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004699#ifdef ENABLE_LOG
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004700 report_state("Adding", &thread->subs.norm, state, l->id, pim);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004701 did_print = TRUE;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004702#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004703 }
4704
4705#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004706 if (!did_print)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004707 report_state("Processing", &subs->norm, state, l->id, pim);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004708#endif
4709 switch (state->c)
4710 {
4711 case NFA_MATCH:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004712 break;
4713
4714 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004715 // order matters here
Bram Moolenaar16b35782016-09-09 20:29:50 +02004716 subs = addstate(l, state->out, subs, pim, off_arg);
4717 subs = addstate(l, state->out1, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004718 break;
4719
Bram Moolenaar699c1202013-09-25 16:41:54 +02004720 case NFA_EMPTY:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004721 case NFA_NOPEN:
4722 case NFA_NCLOSE:
Bram Moolenaar16b35782016-09-09 20:29:50 +02004723 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004724 break;
4725
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004726 case NFA_MOPEN:
4727 case NFA_MOPEN1:
4728 case NFA_MOPEN2:
4729 case NFA_MOPEN3:
4730 case NFA_MOPEN4:
4731 case NFA_MOPEN5:
4732 case NFA_MOPEN6:
4733 case NFA_MOPEN7:
4734 case NFA_MOPEN8:
4735 case NFA_MOPEN9:
4736#ifdef FEAT_SYN_HL
4737 case NFA_ZOPEN:
4738 case NFA_ZOPEN1:
4739 case NFA_ZOPEN2:
4740 case NFA_ZOPEN3:
4741 case NFA_ZOPEN4:
4742 case NFA_ZOPEN5:
4743 case NFA_ZOPEN6:
4744 case NFA_ZOPEN7:
4745 case NFA_ZOPEN8:
4746 case NFA_ZOPEN9:
4747#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004748 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004749 if (state->c == NFA_ZSTART)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004750 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004751 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004752 sub = &subs->norm;
4753 }
4754#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004755 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004756 {
4757 subidx = state->c - NFA_ZOPEN;
4758 sub = &subs->synt;
4759 }
4760#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004761 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004762 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004763 subidx = state->c - NFA_MOPEN;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004764 sub = &subs->norm;
4765 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004766
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004767 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004768 save_ptr = NULL;
Bram Moolenaara80faa82020-04-12 19:37:17 +02004769 CLEAR_FIELD(save_multipos);
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004770
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004771 // Set the position (with "off" added) in the subexpression. Save
4772 // and restore it when it was in use. Otherwise fill any gap.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004773 if (REG_MULTI)
4774 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004775 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004776 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004777 save_multipos = sub->list.multi[subidx];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004778 save_in_use = -1;
4779 }
4780 else
4781 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004782 save_in_use = sub->in_use;
4783 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004784 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004785 sub->list.multi[i].start_lnum = -1;
4786 sub->list.multi[i].end_lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004787 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004788 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004789 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02004790 if (off == -1)
4791 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004792 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004793 sub->list.multi[subidx].start_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004794 }
4795 else
4796 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004797 sub->list.multi[subidx].start_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004798 sub->list.multi[subidx].start_col =
Bram Moolenaarc96311b2022-11-25 21:13:47 +00004799 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004800 }
Bram Moolenaarc2b717e2015-09-29 15:06:14 +02004801 sub->list.multi[subidx].end_lnum = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004802 }
4803 else
4804 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004805 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004806 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004807 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004808 save_in_use = -1;
4809 }
4810 else
4811 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004812 save_in_use = sub->in_use;
4813 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004814 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004815 sub->list.line[i].start = NULL;
4816 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004817 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004818 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004819 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02004820 sub->list.line[subidx].start = rex.input + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004821 }
4822
Bram Moolenaar16b35782016-09-09 20:29:50 +02004823 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004824 if (subs == NULL)
4825 break;
4826 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004827#ifdef FEAT_SYN_HL
4828 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4829 sub = &subs->synt;
4830 else
4831#endif
4832 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004833
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004834 if (save_in_use == -1)
4835 {
4836 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004837 sub->list.multi[subidx] = save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004838 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004839 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004840 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004841 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02004842 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004843 break;
4844
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004845 case NFA_MCLOSE:
Bram Moolenaar0270f382018-07-17 05:43:58 +02004846 if (rex.nfa_has_zend && (REG_MULTI
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004847 ? subs->norm.list.multi[0].end_lnum >= 0
Bram Moolenaar9be44812013-09-05 21:15:44 +02004848 : subs->norm.list.line[0].end != NULL))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004849 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004850 // Do not overwrite the position set by \ze.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004851 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004852 break;
4853 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004854 // FALLTHROUGH
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004855 case NFA_MCLOSE1:
4856 case NFA_MCLOSE2:
4857 case NFA_MCLOSE3:
4858 case NFA_MCLOSE4:
4859 case NFA_MCLOSE5:
4860 case NFA_MCLOSE6:
4861 case NFA_MCLOSE7:
4862 case NFA_MCLOSE8:
4863 case NFA_MCLOSE9:
4864#ifdef FEAT_SYN_HL
4865 case NFA_ZCLOSE:
4866 case NFA_ZCLOSE1:
4867 case NFA_ZCLOSE2:
4868 case NFA_ZCLOSE3:
4869 case NFA_ZCLOSE4:
4870 case NFA_ZCLOSE5:
4871 case NFA_ZCLOSE6:
4872 case NFA_ZCLOSE7:
4873 case NFA_ZCLOSE8:
4874 case NFA_ZCLOSE9:
4875#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004876 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004877 if (state->c == NFA_ZEND)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004878 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004879 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004880 sub = &subs->norm;
4881 }
4882#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004883 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004884 {
4885 subidx = state->c - NFA_ZCLOSE;
4886 sub = &subs->synt;
4887 }
4888#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004889 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004890 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004891 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004892 sub = &subs->norm;
4893 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004894
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004895 // We don't fill in gaps here, there must have been an MOPEN that
4896 // has done that.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004897 save_in_use = sub->in_use;
4898 if (sub->in_use <= subidx)
4899 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004900 if (REG_MULTI)
4901 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004902 save_multipos = sub->list.multi[subidx];
Bram Moolenaar35b23862013-05-22 23:00:40 +02004903 if (off == -1)
4904 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004905 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004906 sub->list.multi[subidx].end_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004907 }
4908 else
4909 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004910 sub->list.multi[subidx].end_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004911 sub->list.multi[subidx].end_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004912 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004913 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004914 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004915 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004916 }
4917 else
4918 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004919 save_ptr = sub->list.line[subidx].end;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004920 sub->list.line[subidx].end = rex.input + off;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004921 // avoid compiler warnings
Bram Moolenaara80faa82020-04-12 19:37:17 +02004922 CLEAR_FIELD(save_multipos);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004923 }
4924
Bram Moolenaar16b35782016-09-09 20:29:50 +02004925 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004926 if (subs == NULL)
4927 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004928 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004929#ifdef FEAT_SYN_HL
4930 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4931 sub = &subs->synt;
4932 else
4933#endif
4934 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004935
4936 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004937 sub->list.multi[subidx] = save_multipos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004938 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004939 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004940 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004941 break;
4942 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004943 --depth;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004944 return subs;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004945}
4946
4947/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02004948 * Like addstate(), but the new state(s) are put at position "*ip".
4949 * Used for zero-width matches, next state to use is the added one.
4950 * This makes sure the order of states to be tried does not change, which
4951 * matters for alternatives.
4952 */
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004953 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004954addstate_here(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004955 nfa_list_T *l, // runtime state list
4956 nfa_state_T *state, // state to update
4957 regsubs_T *subs, // pointers to subexpressions
4958 nfa_pim_T *pim, // postponed look-behind match
Bram Moolenaar05540972016-01-30 20:31:25 +01004959 int *ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004960{
4961 int tlen = l->n;
4962 int count;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004963 int listidx = *ip;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004964 regsubs_T *r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004965
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004966 // First add the state(s) at the end, so that we know how many there are.
4967 // Pass the listidx as offset (avoids adding another argument to
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00004968 // addstate()).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004969 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4970 if (r == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004971 return NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004972
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004973 // when "*ip" was at the end of the list, nothing to do
Bram Moolenaara2d95102013-06-04 14:23:05 +02004974 if (listidx + 1 == tlen)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004975 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004976
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004977 // re-order to put the new state at the current position
Bram Moolenaar4b417062013-05-25 20:19:50 +02004978 count = l->n - tlen;
Bram Moolenaara50d02d2013-06-16 15:43:50 +02004979 if (count == 0)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004980 return r; // no state got added
Bram Moolenaar428e9872013-05-30 17:05:39 +02004981 if (count == 1)
4982 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004983 // overwrite the current state
Bram Moolenaara2d95102013-06-04 14:23:05 +02004984 l->t[listidx] = l->t[l->n - 1];
Bram Moolenaar428e9872013-05-30 17:05:39 +02004985 }
4986 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004987 {
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004988 if (l->n + count - 1 >= l->len)
4989 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004990 // not enough space to move the new states, reallocate the list
4991 // and move the states to the right position
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004992 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004993 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004994 nfa_thread_T *newl;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004995
Bram Moolenaar688b3982019-02-13 21:47:36 +01004996 if ((long)(newsize >> 10) >= p_mmp)
4997 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004998 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004999 return NULL;
5000 }
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005001 newl = alloc(newsize);
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005002 if (newl == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005003 return NULL;
5004 l->len = newlen;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005005 mch_memmove(&(newl[0]),
5006 &(l->t[0]),
5007 sizeof(nfa_thread_T) * listidx);
5008 mch_memmove(&(newl[listidx]),
5009 &(l->t[l->n - count]),
5010 sizeof(nfa_thread_T) * count);
5011 mch_memmove(&(newl[listidx + count]),
5012 &(l->t[listidx + 1]),
5013 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
5014 vim_free(l->t);
5015 l->t = newl;
5016 }
5017 else
5018 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005019 // make space for new states, then move them from the
5020 // end to the current position
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005021 mch_memmove(&(l->t[listidx + count]),
5022 &(l->t[listidx + 1]),
5023 sizeof(nfa_thread_T) * (l->n - listidx - 1));
5024 mch_memmove(&(l->t[listidx]),
5025 &(l->t[l->n - 1]),
5026 sizeof(nfa_thread_T) * count);
5027 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005028 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005029 --l->n;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005030 *ip = listidx - 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005031
5032 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005033}
5034
5035/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005036 * Check character class "class" against current character c.
5037 */
5038 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005039check_char_class(int class, int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005040{
5041 switch (class)
5042 {
5043 case NFA_CLASS_ALNUM:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005044 if (c >= 1 && c < 128 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005045 return OK;
5046 break;
5047 case NFA_CLASS_ALPHA:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005048 if (c >= 1 && c < 128 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005049 return OK;
5050 break;
5051 case NFA_CLASS_BLANK:
5052 if (c == ' ' || c == '\t')
5053 return OK;
5054 break;
5055 case NFA_CLASS_CNTRL:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005056 if (c >= 1 && c <= 127 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005057 return OK;
5058 break;
5059 case NFA_CLASS_DIGIT:
5060 if (VIM_ISDIGIT(c))
5061 return OK;
5062 break;
5063 case NFA_CLASS_GRAPH:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005064 if (c >= 1 && c <= 127 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005065 return OK;
5066 break;
5067 case NFA_CLASS_LOWER:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005068 if (MB_ISLOWER(c) && c != 170 && c != 186)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005069 return OK;
5070 break;
5071 case NFA_CLASS_PRINT:
5072 if (vim_isprintc(c))
5073 return OK;
5074 break;
5075 case NFA_CLASS_PUNCT:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005076 if (c >= 1 && c < 128 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005077 return OK;
5078 break;
5079 case NFA_CLASS_SPACE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005080 if ((c >= 9 && c <= 13) || (c == ' '))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005081 return OK;
5082 break;
5083 case NFA_CLASS_UPPER:
5084 if (MB_ISUPPER(c))
5085 return OK;
5086 break;
5087 case NFA_CLASS_XDIGIT:
5088 if (vim_isxdigit(c))
5089 return OK;
5090 break;
5091 case NFA_CLASS_TAB:
5092 if (c == '\t')
5093 return OK;
5094 break;
5095 case NFA_CLASS_RETURN:
5096 if (c == '\r')
5097 return OK;
5098 break;
5099 case NFA_CLASS_BACKSPACE:
5100 if (c == '\b')
5101 return OK;
5102 break;
5103 case NFA_CLASS_ESCAPE:
5104 if (c == '\033')
5105 return OK;
5106 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01005107 case NFA_CLASS_IDENT:
5108 if (vim_isIDc(c))
5109 return OK;
5110 break;
5111 case NFA_CLASS_KEYWORD:
5112 if (reg_iswordc(c))
5113 return OK;
5114 break;
5115 case NFA_CLASS_FNAME:
5116 if (vim_isfilec(c))
5117 return OK;
5118 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005119
5120 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005121 // should not be here :P
RestorerZ68ebcee2023-05-31 17:12:14 +01005122 siemsg(e_nfa_regexp_invalid_character_class_nr, class);
Bram Moolenaar417bad22013-06-07 14:08:30 +02005123 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005124 }
5125 return FAIL;
5126}
5127
Bram Moolenaar5714b802013-05-28 22:03:20 +02005128/*
5129 * Check for a match with subexpression "subidx".
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005130 * Return TRUE if it matches.
Bram Moolenaar5714b802013-05-28 22:03:20 +02005131 */
5132 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005133match_backref(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005134 regsub_T *sub, // pointers to subexpressions
Bram Moolenaar05540972016-01-30 20:31:25 +01005135 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005136 int *bytelen) // out: length of match in bytes
Bram Moolenaar5714b802013-05-28 22:03:20 +02005137{
5138 int len;
5139
5140 if (sub->in_use <= subidx)
5141 {
5142retempty:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005143 // backref was not set, match an empty string
Bram Moolenaar5714b802013-05-28 22:03:20 +02005144 *bytelen = 0;
5145 return TRUE;
5146 }
5147
5148 if (REG_MULTI)
5149 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005150 if (sub->list.multi[subidx].start_lnum < 0
5151 || sub->list.multi[subidx].end_lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005152 goto retempty;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005153 if (sub->list.multi[subidx].start_lnum == rex.lnum
5154 && sub->list.multi[subidx].end_lnum == rex.lnum)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005155 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005156 len = sub->list.multi[subidx].end_col
5157 - sub->list.multi[subidx].start_col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005158 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5159 rex.input, &len) == 0)
Bram Moolenaar580abea2013-06-14 20:31:28 +02005160 {
5161 *bytelen = len;
5162 return TRUE;
5163 }
5164 }
5165 else
5166 {
5167 if (match_with_backref(
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005168 sub->list.multi[subidx].start_lnum,
5169 sub->list.multi[subidx].start_col,
5170 sub->list.multi[subidx].end_lnum,
5171 sub->list.multi[subidx].end_col,
Bram Moolenaar580abea2013-06-14 20:31:28 +02005172 bytelen) == RA_MATCH)
5173 return TRUE;
Bram Moolenaar5714b802013-05-28 22:03:20 +02005174 }
5175 }
5176 else
5177 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005178 if (sub->list.line[subidx].start == NULL
5179 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005180 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005181 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005182 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005183 {
5184 *bytelen = len;
5185 return TRUE;
5186 }
5187 }
5188 return FALSE;
5189}
5190
#ifdef FEAT_SYN_HL

/*
 * Check whether the text at the current input position matches the external
 * \z subexpression "subidx" stored in "re_extmatch_in".
 * When there is no such subexpression it matches the empty string.
 * Returns TRUE on a match, with the matched length stored in "*bytelen";
 * returns FALSE otherwise and leaves "*bytelen" alone.
 */
    static int
match_zref(
    int	subidx,
    int	*bytelen)   // out: length of match in bytes
{
    int	    len = 0;
    int	    matched = TRUE;	// an unset backref matches an empty string

    cleanup_zsubexpr();
    if (re_extmatch_in != NULL && re_extmatch_in->matches[subidx] != NULL)
    {
	len = (int)STRLEN(re_extmatch_in->matches[subidx]);
	matched = (cstrncmp(re_extmatch_in->matches[subidx],
						    rex.input, &len) == 0);
    }

    if (matched)
	*bytelen = len;
    return matched;
}
#endif
5221
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005222/*
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005223 * Save list IDs for all NFA states of "prog" into "list".
5224 * Also reset the IDs to zero.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005225 * Only used for the recursive value lastlist[1].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005226 */
5227 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005228nfa_save_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005229{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005230 int i;
5231 nfa_state_T *p;
5232
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005233 // Order in the list is reverse, it's a bit faster that way.
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005234 p = &prog->state[0];
5235 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005236 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005237 list[i] = p->lastlist[1];
5238 p->lastlist[1] = 0;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005239 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005240 }
5241}
5242
5243/*
5244 * Restore list IDs from "list" to all NFA states.
5245 */
5246 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005247nfa_restore_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005248{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005249 int i;
5250 nfa_state_T *p;
5251
5252 p = &prog->state[0];
5253 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005254 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005255 p->lastlist[1] = list[i];
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005256 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005257 }
5258}
5259
Bram Moolenaar423532e2013-05-29 21:14:42 +02005260 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005261nfa_re_num_cmp(long_u val, int op, long_u pos)
Bram Moolenaar423532e2013-05-29 21:14:42 +02005262{
5263 if (op == 1) return pos > val;
5264 if (op == 2) return pos < val;
5265 return val == pos;
5266}
5267
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01005268static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02005269
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005270/*
Bram Moolenaarf46da702013-06-02 22:37:42 +02005271 * Recursively call nfa_regmatch()
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005272 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5273 * position).
Bram Moolenaarf46da702013-06-02 22:37:42 +02005274 */
5275 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005276recursive_regmatch(
5277 nfa_state_T *state,
5278 nfa_pim_T *pim,
5279 nfa_regprog_T *prog,
5280 regsubs_T *submatch,
5281 regsubs_T *m,
Bram Moolenaar2338c322018-07-08 19:07:19 +02005282 int **listids,
5283 int *listids_len)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005284{
Bram Moolenaar0270f382018-07-17 05:43:58 +02005285 int save_reginput_col = (int)(rex.input - rex.line);
5286 int save_reglnum = rex.lnum;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005287 int save_nfa_match = nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005288 int save_nfa_listid = rex.nfa_listid;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005289 save_se_T *save_nfa_endp = nfa_endp;
5290 save_se_T endpos;
5291 save_se_T *endposp = NULL;
5292 int result;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005293 int need_restore = FALSE;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005294
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005295 if (pim != NULL)
5296 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005297 // start at the position where the postponed match was
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005298 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005299 rex.input = rex.line + pim->end.pos.col;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005300 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005301 rex.input = pim->end.ptr;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005302 }
5303
Bram Moolenaardecd9542013-06-07 16:31:50 +02005304 if (state->c == NFA_START_INVISIBLE_BEFORE
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01005305 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5306 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5307 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005308 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005309 // The recursive match must end at the current position. When "pim" is
5310 // not NULL it specifies the current position.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005311 endposp = &endpos;
5312 if (REG_MULTI)
5313 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005314 if (pim == NULL)
5315 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005316 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5317 endpos.se_u.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005318 }
5319 else
5320 endpos.se_u.pos = pim->end.pos;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005321 }
5322 else
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005323 {
5324 if (pim == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005325 endpos.se_u.ptr = rex.input;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005326 else
5327 endpos.se_u.ptr = pim->end.ptr;
5328 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005329
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005330 // Go back the specified number of bytes, or as far as the
5331 // start of the previous line, to try matching "\@<=" or
5332 // not matching "\@<!". This is very inefficient, limit the number of
5333 // bytes if possible.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005334 if (state->val <= 0)
5335 {
5336 if (REG_MULTI)
5337 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005338 rex.line = reg_getline(--rex.lnum);
5339 if (rex.line == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005340 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005341 rex.line = reg_getline(++rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005342 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005343 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005344 }
5345 else
5346 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005347 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005348 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005349 // Not enough bytes in this line, go to end of
5350 // previous line.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005351 rex.line = reg_getline(--rex.lnum);
5352 if (rex.line == NULL)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005353 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005354 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005355 rex.line = reg_getline(++rex.lnum);
5356 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005357 }
5358 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005359 rex.input = rex.line + STRLEN(rex.line);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005360 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005361 if ((int)(rex.input - rex.line) >= state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005362 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005363 rex.input -= state->val;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005364 if (has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005365 rex.input -= mb_head_off(rex.line, rex.input);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005366 }
5367 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005368 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005369 }
5370 }
5371
Bram Moolenaarf46da702013-06-02 22:37:42 +02005372#ifdef ENABLE_LOG
5373 if (log_fd != stderr)
5374 fclose(log_fd);
5375 log_fd = NULL;
5376#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005377 // Have to clear the lastlist field of the NFA nodes, so that
5378 // nfa_regmatch() and addstate() can run properly after recursion.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005379 if (nfa_ll_index == 1)
5380 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005381 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5382 // values and clear them.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005383 if (*listids == NULL || *listids_len < prog->nstate)
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005384 {
Bram Moolenaar2338c322018-07-08 19:07:19 +02005385 vim_free(*listids);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005386 *listids = ALLOC_MULT(int, prog->nstate);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005387 if (*listids == NULL)
5388 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00005389 emsg(_(e_nfa_regexp_could_not_allocate_memory_for_branch_traversal));
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005390 return 0;
5391 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005392 *listids_len = prog->nstate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005393 }
5394 nfa_save_listids(prog, *listids);
5395 need_restore = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005396 // any value of rex.nfa_listid will do
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005397 }
5398 else
5399 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005400 // First recursive nfa_regmatch() call, switch to the second lastlist
5401 // entry. Make sure rex.nfa_listid is different from a previous
5402 // recursive call, because some states may still have this ID.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005403 ++nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005404 if (rex.nfa_listid <= rex.nfa_alt_listid)
5405 rex.nfa_listid = rex.nfa_alt_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005406 }
5407
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005408 // Call nfa_regmatch() to check if the current concat matches at this
5409 // position. The concat ends with the node NFA_END_INVISIBLE
Bram Moolenaarf46da702013-06-02 22:37:42 +02005410 nfa_endp = endposp;
5411 result = nfa_regmatch(prog, state->out, submatch, m);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005412
5413 if (need_restore)
5414 nfa_restore_listids(prog, *listids);
5415 else
5416 {
5417 --nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005418 rex.nfa_alt_listid = rex.nfa_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005419 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005420
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005421 // restore position in input text
Bram Moolenaar0270f382018-07-17 05:43:58 +02005422 rex.lnum = save_reglnum;
Bram Moolenaar484d2412013-06-13 19:47:07 +02005423 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005424 rex.line = reg_getline(rex.lnum);
5425 rex.input = rex.line + save_reginput_col;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005426 if (result != NFA_TOO_EXPENSIVE)
5427 {
5428 nfa_match = save_nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005429 rex.nfa_listid = save_nfa_listid;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005430 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005431 nfa_endp = save_nfa_endp;
5432
5433#ifdef ENABLE_LOG
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005434 open_debug_log(result);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005435#endif
5436
5437 return result;
5438}
5439
Bram Moolenaara2d95102013-06-04 14:23:05 +02005440/*
5441 * Estimate the chance of a match with "state" failing.
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005442 * empty match: 0
Bram Moolenaara2d95102013-06-04 14:23:05 +02005443 * NFA_ANY: 1
5444 * specific character: 99
5445 */
5446 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005447failure_chance(nfa_state_T *state, int depth)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005448{
5449 int c = state->c;
5450 int l, r;
5451
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005452 // detect looping
Bram Moolenaara2d95102013-06-04 14:23:05 +02005453 if (depth > 4)
5454 return 1;
5455
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005456 switch (c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005457 {
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005458 case NFA_SPLIT:
5459 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005460 // avoid recursive stuff
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005461 return 1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005462 // two alternatives, use the lowest failure chance
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005463 l = failure_chance(state->out, depth + 1);
5464 r = failure_chance(state->out1, depth + 1);
5465 return l < r ? l : r;
5466
5467 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005468 // matches anything, unlikely to fail
Bram Moolenaara2d95102013-06-04 14:23:05 +02005469 return 1;
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005470
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005471 case NFA_MATCH:
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005472 case NFA_MCLOSE:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005473 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005474 // empty match works always
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005475 return 0;
5476
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005477 case NFA_START_INVISIBLE:
5478 case NFA_START_INVISIBLE_FIRST:
5479 case NFA_START_INVISIBLE_NEG:
5480 case NFA_START_INVISIBLE_NEG_FIRST:
5481 case NFA_START_INVISIBLE_BEFORE:
5482 case NFA_START_INVISIBLE_BEFORE_FIRST:
5483 case NFA_START_INVISIBLE_BEFORE_NEG:
5484 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5485 case NFA_START_PATTERN:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005486 // recursive regmatch is expensive, use low failure chance
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005487 return 5;
5488
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005489 case NFA_BOL:
5490 case NFA_EOL:
5491 case NFA_BOF:
5492 case NFA_EOF:
5493 case NFA_NEWL:
5494 return 99;
5495
5496 case NFA_BOW:
5497 case NFA_EOW:
5498 return 90;
5499
5500 case NFA_MOPEN:
5501 case NFA_MOPEN1:
5502 case NFA_MOPEN2:
5503 case NFA_MOPEN3:
5504 case NFA_MOPEN4:
5505 case NFA_MOPEN5:
5506 case NFA_MOPEN6:
5507 case NFA_MOPEN7:
5508 case NFA_MOPEN8:
5509 case NFA_MOPEN9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005510#ifdef FEAT_SYN_HL
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005511 case NFA_ZOPEN:
5512 case NFA_ZOPEN1:
5513 case NFA_ZOPEN2:
5514 case NFA_ZOPEN3:
5515 case NFA_ZOPEN4:
5516 case NFA_ZOPEN5:
5517 case NFA_ZOPEN6:
5518 case NFA_ZOPEN7:
5519 case NFA_ZOPEN8:
5520 case NFA_ZOPEN9:
5521 case NFA_ZCLOSE:
5522 case NFA_ZCLOSE1:
5523 case NFA_ZCLOSE2:
5524 case NFA_ZCLOSE3:
5525 case NFA_ZCLOSE4:
5526 case NFA_ZCLOSE5:
5527 case NFA_ZCLOSE6:
5528 case NFA_ZCLOSE7:
5529 case NFA_ZCLOSE8:
5530 case NFA_ZCLOSE9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005531#endif
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005532 case NFA_NOPEN:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005533 case NFA_MCLOSE1:
5534 case NFA_MCLOSE2:
5535 case NFA_MCLOSE3:
5536 case NFA_MCLOSE4:
5537 case NFA_MCLOSE5:
5538 case NFA_MCLOSE6:
5539 case NFA_MCLOSE7:
5540 case NFA_MCLOSE8:
5541 case NFA_MCLOSE9:
5542 case NFA_NCLOSE:
5543 return failure_chance(state->out, depth + 1);
5544
5545 case NFA_BACKREF1:
5546 case NFA_BACKREF2:
5547 case NFA_BACKREF3:
5548 case NFA_BACKREF4:
5549 case NFA_BACKREF5:
5550 case NFA_BACKREF6:
5551 case NFA_BACKREF7:
5552 case NFA_BACKREF8:
5553 case NFA_BACKREF9:
5554#ifdef FEAT_SYN_HL
5555 case NFA_ZREF1:
5556 case NFA_ZREF2:
5557 case NFA_ZREF3:
5558 case NFA_ZREF4:
5559 case NFA_ZREF5:
5560 case NFA_ZREF6:
5561 case NFA_ZREF7:
5562 case NFA_ZREF8:
5563 case NFA_ZREF9:
5564#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005565 // backreferences don't match in many places
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005566 return 94;
5567
5568 case NFA_LNUM_GT:
5569 case NFA_LNUM_LT:
5570 case NFA_COL_GT:
5571 case NFA_COL_LT:
5572 case NFA_VCOL_GT:
5573 case NFA_VCOL_LT:
5574 case NFA_MARK_GT:
5575 case NFA_MARK_LT:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005576 case NFA_VISUAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005577 // before/after positions don't match very often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005578 return 85;
5579
5580 case NFA_LNUM:
5581 return 90;
5582
5583 case NFA_CURSOR:
5584 case NFA_COL:
5585 case NFA_VCOL:
5586 case NFA_MARK:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005587 // specific positions rarely match
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005588 return 98;
5589
5590 case NFA_COMPOSING:
5591 return 95;
5592
5593 default:
5594 if (c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005595 // character match fails often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005596 return 95;
5597 }
5598
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005599 // something else, includes character classes
Bram Moolenaara2d95102013-06-04 14:23:05 +02005600 return 50;
5601}
5602
Bram Moolenaarf46da702013-06-02 22:37:42 +02005603/*
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005604 * Skip until the char "c" we know a match must start with.
5605 */
5606 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005607skip_to_start(int c, colnr_T *colp)
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005608{
5609 char_u *s;
5610
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005611 // Used often, do some work to avoid call overhead.
Bram Moolenaara12a1612019-01-24 16:39:02 +01005612 if (!rex.reg_ic && !has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005613 s = vim_strbyte(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005614 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005615 s = cstrchr(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005616 if (s == NULL)
5617 return FAIL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005618 *colp = (int)(s - rex.line);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005619 return OK;
5620}
5621
5622/*
Bram Moolenaar473de612013-06-08 18:19:48 +02005623 * Check for a match with match_text.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02005624 * Called after skip_to_start() has found regstart.
Bram Moolenaar473de612013-06-08 18:19:48 +02005625 * Returns zero for no match, 1 for a match.
5626 */
5627 static long
Bram Moolenaar79336e12022-12-11 14:18:31 +00005628find_match_text(colnr_T *startcol, int regstart, char_u *match_text)
Bram Moolenaar473de612013-06-08 18:19:48 +02005629{
Bram Moolenaar79336e12022-12-11 14:18:31 +00005630 colnr_T col = *startcol;
Bram Moolenaar473de612013-06-08 18:19:48 +02005631 int c1, c2;
5632 int len1, len2;
5633 int match;
5634
5635 for (;;)
5636 {
5637 match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005638 len2 = MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005639 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5640 {
5641 c1 = PTR2CHAR(match_text + len1);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005642 c2 = PTR2CHAR(rex.line + col + len2);
Bram Moolenaar59de4172020-06-09 19:34:54 +02005643 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
Bram Moolenaar473de612013-06-08 18:19:48 +02005644 {
5645 match = FALSE;
5646 break;
5647 }
Bram Moolenaar65b60562021-09-07 19:26:53 +02005648 len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5649 : MB_CHAR2LEN(c2);
Bram Moolenaar473de612013-06-08 18:19:48 +02005650 }
5651 if (match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005652 // check that no composing char follows
Bram Moolenaar473de612013-06-08 18:19:48 +02005653 && !(enc_utf8
Bram Moolenaara12a1612019-01-24 16:39:02 +01005654 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
Bram Moolenaar473de612013-06-08 18:19:48 +02005655 {
5656 cleanup_subexpr();
5657 if (REG_MULTI)
5658 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005659 rex.reg_startpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005660 rex.reg_startpos[0].col = col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005661 rex.reg_endpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005662 rex.reg_endpos[0].col = col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005663 }
5664 else
5665 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005666 rex.reg_startp[0] = rex.line + col;
5667 rex.reg_endp[0] = rex.line + col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005668 }
Bram Moolenaar79336e12022-12-11 14:18:31 +00005669 *startcol = col;
Bram Moolenaar473de612013-06-08 18:19:48 +02005670 return 1L;
5671 }
5672
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005673 // Try finding regstart after the current match.
5674 col += MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005675 if (skip_to_start(regstart, &col) == FAIL)
5676 break;
5677 }
Bram Moolenaar79336e12022-12-11 14:18:31 +00005678
5679 *startcol = col;
Bram Moolenaar473de612013-06-08 18:19:48 +02005680 return 0L;
5681}
5682
5683/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005684 * Main matching routine.
5685 *
Bram Moolenaar0270f382018-07-17 05:43:58 +02005686 * Run NFA to determine whether it matches rex.input.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005687 *
Bram Moolenaar307aa162013-06-02 16:34:21 +02005688 * When "nfa_endp" is not NULL it is a required end-of-match position.
Bram Moolenaar61602c52013-06-01 19:54:43 +02005689 *
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005690 * Return TRUE if there is a match, FALSE if there is no match,
5691 * NFA_TOO_EXPENSIVE if we end up with too many states.
Bram Moolenaarf2118842013-09-25 18:16:38 +02005692 * When there is a match "submatch" contains the positions.
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005693 *
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005694 * Note: Caller must ensure that: start != NULL.
5695 */
5696 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005697nfa_regmatch(
5698 nfa_regprog_T *prog,
5699 nfa_state_T *start,
5700 regsubs_T *submatch,
5701 regsubs_T *m)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005702{
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005703 int result = FALSE;
Bram Moolenaaraaf30472015-01-27 14:40:00 +01005704 size_t size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005705 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005706 int go_to_nextline = FALSE;
5707 nfa_thread_T *t;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005708 nfa_list_T list[2];
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005709 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005710 nfa_list_T *thislist;
5711 nfa_list_T *nextlist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005712 int *listids = NULL;
Bram Moolenaar2338c322018-07-08 19:07:19 +02005713 int listids_len = 0;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005714 nfa_state_T *add_state;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005715 int add_here;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005716 int add_count;
Bram Moolenaar4380d1e2013-06-09 20:51:00 +02005717 int add_off = 0;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005718 int toplevel = start->c == NFA_MOPEN;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005719 regsubs_T *r;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005720#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005721 FILE *debug;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005722#endif
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005723
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005724 // Some patterns may take a long time to match, especially when using
5725 // recursive_regmatch(). Allow interrupting them with CTRL-C.
Bram Moolenaar41f12052013-08-25 17:01:42 +02005726 fast_breakcheck();
5727 if (got_int)
5728 return FALSE;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005729#ifdef FEAT_RELTIME
Paul Ollis65745772022-06-05 16:55:54 +01005730 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005731 return FALSE;
5732#endif
Bram Moolenaar41f12052013-08-25 17:01:42 +02005733
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005734#ifdef NFA_REGEXP_DEBUG_LOG
5735 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5736 if (debug == NULL)
5737 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005738 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005739 return FALSE;
5740 }
5741#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02005742 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005743
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005744 // Allocate memory for the lists of nodes.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005745 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005746
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005747 list[0].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005748 list[0].len = prog->nstate + 1;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005749 list[1].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005750 list[1].len = prog->nstate + 1;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005751 if (list[0].t == NULL || list[1].t == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005752 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005753
5754#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02005755 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005756 if (log_fd == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005757 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005758 emsg(_(e_log_open_failed));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005759 log_fd = stderr;
5760 }
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005761 fprintf(log_fd, "**********************************\n");
5762 nfa_set_code(start->c);
5763 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5764 abs(start->id), code);
5765 fprintf(log_fd, "**********************************\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005766#endif
5767
5768 thislist = &list[0];
5769 thislist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005770 thislist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005771 nextlist = &list[1];
5772 nextlist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005773 nextlist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005774#ifdef ENABLE_LOG
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005775 fprintf(log_fd, "(---) STARTSTATE first\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005776#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02005777 thislist->id = rex.nfa_listid + 1;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005778
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005779 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5780 // it's the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005781 if (toplevel)
5782 {
5783 if (REG_MULTI)
5784 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005785 m->norm.list.multi[0].start_lnum = rex.lnum;
5786 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar79336e12022-12-11 14:18:31 +00005787 m->norm.orig_start_col = m->norm.list.multi[0].start_col;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005788 }
5789 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005790 m->norm.list.line[0].start = rex.input;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005791 m->norm.in_use = 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005792 r = addstate(thislist, start->out, m, NULL, 0);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005793 }
5794 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005795 r = addstate(thislist, start, m, NULL, 0);
5796 if (r == NULL)
5797 {
5798 nfa_match = NFA_TOO_EXPENSIVE;
5799 goto theend;
5800 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005801
Bram Moolenaarebfec1c2023-01-22 21:14:53 +00005802#define ADD_STATE_IF_MATCH(state) \
5803 if (result) \
5804 { \
5805 add_state = state->out; \
5806 add_off = clen; \
Bram Moolenaara2d95102013-06-04 14:23:05 +02005807 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005808
5809 /*
5810 * Run for each character.
5811 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02005812 for (;;)
5813 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005814 int curc;
5815 int clen;
5816
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005817 if (has_mbyte)
5818 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005819 curc = (*mb_ptr2char)(rex.input);
5820 clen = (*mb_ptr2len)(rex.input);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005821 }
5822 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005823 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005824 curc = *rex.input;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005825 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005826 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005827 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02005828 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005829 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005830 go_to_nextline = FALSE;
5831 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005832
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005833 // swap lists
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005834 thislist = &list[flag];
5835 nextlist = &list[flag ^= 1];
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005836 nextlist->n = 0; // clear nextlist
Bram Moolenaar196ed142013-07-21 18:59:24 +02005837 nextlist->has_pim = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005838 ++rex.nfa_listid;
Bram Moolenaarbcf94422018-06-23 14:21:42 +02005839 if (prog->re_engine == AUTOMATIC_ENGINE
Bram Moolenaar0270f382018-07-17 05:43:58 +02005840 && (rex.nfa_listid >= NFA_MAX_STATES
Bram Moolenaar5ec74142018-06-23 17:14:41 +02005841# ifdef FEAT_EVAL
5842 || nfa_fail_for_testing
5843# endif
5844 ))
Bram Moolenaarfda37292014-11-05 14:27:36 +01005845 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005846 // too many states, retry with old engine
Bram Moolenaarfda37292014-11-05 14:27:36 +01005847 nfa_match = NFA_TOO_EXPENSIVE;
5848 goto theend;
5849 }
5850
Bram Moolenaar0270f382018-07-17 05:43:58 +02005851 thislist->id = rex.nfa_listid;
5852 nextlist->id = rex.nfa_listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005853
5854#ifdef ENABLE_LOG
5855 fprintf(log_fd, "------------------------------------------\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02005856 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005857 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005858 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005859 {
5860 int i;
5861
5862 for (i = 0; i < thislist->n; i++)
5863 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5864 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005865 fprintf(log_fd, "\n");
5866#endif
5867
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005868#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005869 fprintf(debug, "\n-------------------\n");
5870#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005871 /*
5872 * If the state lists are empty we can stop.
5873 */
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005874 if (thislist->n == 0)
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005875 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005876
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005877 // compute nextlist
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005878 for (listidx = 0; listidx < thislist->n; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005879 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005880 // If the list gets very long there probably is something wrong.
5881 // At least allow interrupting with CTRL-C.
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005882 fast_breakcheck();
5883 if (got_int)
5884 break;
5885#ifdef FEAT_RELTIME
Paul Ollis65745772022-06-05 16:55:54 +01005886 if (nfa_did_time_out())
Bram Moolenaar305abc62022-05-28 11:08:40 +01005887 break;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005888#endif
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005889 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005890
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005891#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005892 nfa_set_code(t->state->c);
5893 fprintf(debug, "%s, ", code);
5894#endif
5895#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005896 {
5897 int col;
5898
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02005899 if (t->subs.norm.in_use <= 0)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005900 col = -1;
5901 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005902 col = t->subs.norm.list.multi[0].start_col;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005903 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005904 col = (int)(t->subs.norm.list.line[0].start - rex.line);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005905 nfa_set_code(t->state->c);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005906 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005907 abs(t->state->id), (int)t->state->c, code, col,
5908 pim_info(&t->pim));
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005909 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005910#endif
5911
5912 /*
5913 * Handle the possible codes of the current state.
5914 * The most important is NFA_MATCH.
5915 */
Bram Moolenaara2d95102013-06-04 14:23:05 +02005916 add_state = NULL;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005917 add_here = FALSE;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005918 add_count = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005919 switch (t->state->c)
5920 {
5921 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005922 {
Bram Moolenaaref2dff52020-12-21 14:54:32 +01005923 // If the match is not at the start of the line, ends before a
5924 // composing character and rex.reg_icombine is not set, that
5925 // is not really a match.
5926 if (enc_utf8 && !rex.reg_icombine
5927 && rex.input != rex.line && utf_iscomposing(curc))
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005928 break;
Bram Moolenaara12a1612019-01-24 16:39:02 +01005929
Bram Moolenaar963fee22013-05-26 21:47:28 +02005930 nfa_match = TRUE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005931 copy_sub(&submatch->norm, &t->subs.norm);
5932#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005933 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005934 copy_sub(&submatch->synt, &t->subs.synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005935#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005936#ifdef ENABLE_LOG
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005937 log_subsexpr(&t->subs);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005938#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005939 // Found the left-most longest match, do not look at any other
5940 // states at this position. When the list of states is going
5941 // to be empty quit without advancing, so that "rex.input" is
5942 // correct.
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005943 if (nextlist->n == 0)
Bram Moolenaar57a285b2013-05-26 16:57:28 +02005944 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005945 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005946 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005947
5948 case NFA_END_INVISIBLE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005949 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02005950 case NFA_END_PATTERN:
Bram Moolenaarf46da702013-06-02 22:37:42 +02005951 /*
5952 * This is only encountered after a NFA_START_INVISIBLE or
Bram Moolenaar61602c52013-06-01 19:54:43 +02005953 * NFA_START_INVISIBLE_BEFORE node.
5954 * They surround a zero-width group, used with "\@=", "\&",
5955 * "\@!", "\@<=" and "\@<!".
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005956 * If we got here, it means that the current "invisible" group
5957 * finished successfully, so return control to the parent
Bram Moolenaarf46da702013-06-02 22:37:42 +02005958 * nfa_regmatch(). For a look-behind match only when it ends
5959 * in the position in "nfa_endp".
5960 * Submatches are stored in *m, and used in the parent call.
5961 */
Bram Moolenaar61602c52013-06-01 19:54:43 +02005962#ifdef ENABLE_LOG
Bram Moolenaarf46da702013-06-02 22:37:42 +02005963 if (nfa_endp != NULL)
5964 {
5965 if (REG_MULTI)
5966 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005967 (int)rex.lnum,
Bram Moolenaarf46da702013-06-02 22:37:42 +02005968 (int)nfa_endp->se_u.pos.lnum,
Bram Moolenaar0270f382018-07-17 05:43:58 +02005969 (int)(rex.input - rex.line),
Bram Moolenaarf46da702013-06-02 22:37:42 +02005970 nfa_endp->se_u.pos.col);
5971 else
5972 fprintf(log_fd, "Current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005973 (int)(rex.input - rex.line),
5974 (int)(nfa_endp->se_u.ptr - rex.input));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005975 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005976#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005977 // If "nfa_endp" is set it's only a match if it ends at
5978 // "nfa_endp"
Bram Moolenaarf46da702013-06-02 22:37:42 +02005979 if (nfa_endp != NULL && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02005980 ? (rex.lnum != nfa_endp->se_u.pos.lnum
5981 || (int)(rex.input - rex.line)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005982 != nfa_endp->se_u.pos.col)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005983 : rex.input != nfa_endp->se_u.ptr))
Bram Moolenaarf46da702013-06-02 22:37:42 +02005984 break;
5985
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005986 // do not set submatches for \@!
Bram Moolenaardecd9542013-06-07 16:31:50 +02005987 if (t->state->c != NFA_END_INVISIBLE_NEG)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005988 {
5989 copy_sub(&m->norm, &t->subs.norm);
5990#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005991 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005992 copy_sub(&m->synt, &t->subs.synt);
5993#endif
5994 }
Bram Moolenaar87953742013-06-05 18:52:40 +02005995#ifdef ENABLE_LOG
5996 fprintf(log_fd, "Match found:\n");
5997 log_subsexpr(m);
5998#endif
Bram Moolenaarf46da702013-06-02 22:37:42 +02005999 nfa_match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006000 // See comment above at "goto nextchar".
Bram Moolenaar78c93e42013-09-05 16:05:36 +02006001 if (nextlist->n == 0)
6002 clen = 0;
6003 goto nextchar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006004
6005 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006006 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006007 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006008 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar61602c52013-06-01 19:54:43 +02006009 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006010 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006011 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006012 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006013 {
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02006014#ifdef ENABLE_LOG
6015 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
6016 failure_chance(t->state->out, 0),
6017 failure_chance(t->state->out1->out, 0));
Bram Moolenaarb76591e2013-06-04 21:42:22 +02006018#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006019 // Do it directly if there already is a PIM or when
6020 // nfa_postprocess() detected it will work better.
Bram Moolenaara2947e22013-06-11 22:44:09 +02006021 if (t->pim.result != NFA_PIM_UNUSED
6022 || t->state->c == NFA_START_INVISIBLE_FIRST
6023 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6024 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
6025 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006026 {
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006027 int in_use = m->norm.in_use;
6028
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006029 // Copy submatch info for the recursive call, opposite
6030 // of what happens on success below.
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006031 copy_sub_off(&m->norm, &t->subs.norm);
Bram Moolenaar699c1202013-09-25 16:41:54 +02006032#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006033 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006034 copy_sub_off(&m->synt, &t->subs.synt);
6035#endif
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006036
Bram Moolenaara2d95102013-06-04 14:23:05 +02006037 /*
6038 * First try matching the invisible match, then what
6039 * follows.
6040 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006041 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006042 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006043 if (result == NFA_TOO_EXPENSIVE)
6044 {
6045 nfa_match = result;
6046 goto theend;
6047 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006048
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006049 // for \@! and \@<! it is a match when the result is
6050 // FALSE
Bram Moolenaardecd9542013-06-07 16:31:50 +02006051 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006052 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6053 || t->state->c
6054 == NFA_START_INVISIBLE_BEFORE_NEG
6055 || t->state->c
6056 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006057 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006058 // Copy submatch info from the recursive call
Bram Moolenaara2d95102013-06-04 14:23:05 +02006059 copy_sub_off(&t->subs.norm, &m->norm);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006060#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006061 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006062 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006063#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006064 // If the pattern has \ze and it matched in the
6065 // sub pattern, use it.
Bram Moolenaarf2118842013-09-25 18:16:38 +02006066 copy_ze_off(&t->subs.norm, &m->norm);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02006067
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006068 // t->state->out1 is the corresponding
6069 // END_INVISIBLE node; Add its out to the current
6070 // list (zero-width match).
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006071 add_here = TRUE;
6072 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006073 }
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006074 m->norm.in_use = in_use;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006075 }
6076 else
6077 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006078 nfa_pim_T pim;
6079
Bram Moolenaara2d95102013-06-04 14:23:05 +02006080 /*
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006081 * First try matching what follows. Only if a match
6082 * is found verify the invisible match matches. Add a
6083 * nfa_pim_T to the following states, it contains info
6084 * about the invisible match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006085 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006086 pim.state = t->state;
6087 pim.result = NFA_PIM_TODO;
6088 pim.subs.norm.in_use = 0;
6089#ifdef FEAT_SYN_HL
6090 pim.subs.synt.in_use = 0;
6091#endif
6092 if (REG_MULTI)
6093 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006094 pim.end.pos.col = (int)(rex.input - rex.line);
6095 pim.end.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006096 }
6097 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006098 pim.end.ptr = rex.input;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006099
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006100 // t->state->out1 is the corresponding END_INVISIBLE
6101 // node; Add its out to the current list (zero-width
6102 // match).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006103 if (addstate_here(thislist, t->state->out1->out,
6104 &t->subs, &pim, &listidx) == NULL)
6105 {
6106 nfa_match = NFA_TOO_EXPENSIVE;
6107 goto theend;
6108 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006109 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006110 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006111 break;
6112
Bram Moolenaar87953742013-06-05 18:52:40 +02006113 case NFA_START_PATTERN:
Bram Moolenaar43e02982013-06-07 17:31:29 +02006114 {
6115 nfa_state_T *skip = NULL;
6116#ifdef ENABLE_LOG
6117 int skip_lid = 0;
6118#endif
6119
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006120 // There is no point in trying to match the pattern if the
6121 // output state is not going to be added to the list.
Bram Moolenaar43e02982013-06-07 17:31:29 +02006122 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6123 {
6124 skip = t->state->out1->out;
6125#ifdef ENABLE_LOG
6126 skip_lid = nextlist->id;
6127#endif
6128 }
6129 else if (state_in_list(nextlist,
6130 t->state->out1->out->out, &t->subs))
6131 {
6132 skip = t->state->out1->out->out;
6133#ifdef ENABLE_LOG
6134 skip_lid = nextlist->id;
6135#endif
6136 }
Bram Moolenaar44c71db2013-06-14 22:33:51 +02006137 else if (state_in_list(thislist,
Bram Moolenaar43e02982013-06-07 17:31:29 +02006138 t->state->out1->out->out, &t->subs))
6139 {
6140 skip = t->state->out1->out->out;
6141#ifdef ENABLE_LOG
6142 skip_lid = thislist->id;
6143#endif
6144 }
6145 if (skip != NULL)
6146 {
6147#ifdef ENABLE_LOG
6148 nfa_set_code(skip->c);
6149 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6150 abs(skip->id), skip_lid, skip->c, code);
6151#endif
6152 break;
6153 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006154 // Copy submatch info to the recursive call, opposite of what
6155 // happens afterwards.
Bram Moolenaar699c1202013-09-25 16:41:54 +02006156 copy_sub_off(&m->norm, &t->subs.norm);
6157#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006158 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006159 copy_sub_off(&m->synt, &t->subs.synt);
6160#endif
Bram Moolenaar43e02982013-06-07 17:31:29 +02006161
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006162 // First try matching the pattern.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006163 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006164 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006165 if (result == NFA_TOO_EXPENSIVE)
6166 {
6167 nfa_match = result;
6168 goto theend;
6169 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006170 if (result)
6171 {
6172 int bytelen;
6173
6174#ifdef ENABLE_LOG
6175 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6176 log_subsexpr(m);
6177#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006178 // Copy submatch info from the recursive call
Bram Moolenaar87953742013-06-05 18:52:40 +02006179 copy_sub_off(&t->subs.norm, &m->norm);
6180#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006181 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006182 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaar87953742013-06-05 18:52:40 +02006183#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006184 // Now we need to skip over the matched text and then
6185 // continue with what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02006186 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006187 // TODO: multi-line match
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01006188 bytelen = m->norm.list.multi[0].end_col
Bram Moolenaar0270f382018-07-17 05:43:58 +02006189 - (int)(rex.input - rex.line);
Bram Moolenaar87953742013-06-05 18:52:40 +02006190 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006191 bytelen = (int)(m->norm.list.line[0].end - rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02006192
6193#ifdef ENABLE_LOG
6194 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6195#endif
6196 if (bytelen == 0)
6197 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006198 // empty match, output of corresponding
6199 // NFA_END_PATTERN/NFA_SKIP to be used at current
6200 // position
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006201 add_here = TRUE;
6202 add_state = t->state->out1->out->out;
Bram Moolenaar87953742013-06-05 18:52:40 +02006203 }
6204 else if (bytelen <= clen)
6205 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006206 // match current character, output of corresponding
6207 // NFA_END_PATTERN to be used at next position.
Bram Moolenaar87953742013-06-05 18:52:40 +02006208 add_state = t->state->out1->out->out;
6209 add_off = clen;
6210 }
6211 else
6212 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006213 // skip over the matched characters, set character
6214 // count in NFA_SKIP
Bram Moolenaar87953742013-06-05 18:52:40 +02006215 add_state = t->state->out1->out;
6216 add_off = bytelen;
6217 add_count = bytelen - clen;
6218 }
6219 }
6220 break;
Bram Moolenaar43e02982013-06-07 17:31:29 +02006221 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006222
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006223 case NFA_BOL:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006224 if (rex.input == rex.line)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006225 {
6226 add_here = TRUE;
6227 add_state = t->state->out;
6228 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006229 break;
6230
6231 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006232 if (curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006233 {
6234 add_here = TRUE;
6235 add_state = t->state->out;
6236 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006237 break;
6238
6239 case NFA_BOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006240 result = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006241
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006242 if (curc == NUL)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006243 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006244 else if (has_mbyte)
6245 {
6246 int this_class;
6247
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006248 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006249 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006250 if (this_class <= 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006251 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006252 else if (reg_prev_class() == this_class)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006253 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006254 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006255 else if (!vim_iswordc_buf(curc, rex.reg_buf)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006256 || (rex.input > rex.line
Bram Moolenaarc96311b2022-11-25 21:13:47 +00006257 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006258 result = FALSE;
6259 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006260 {
6261 add_here = TRUE;
6262 add_state = t->state->out;
6263 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006264 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006265
6266 case NFA_EOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006267 result = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006268 if (rex.input == rex.line)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006269 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006270 else if (has_mbyte)
6271 {
6272 int this_class, prev_class;
6273
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006274 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006275 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006276 prev_class = reg_prev_class();
6277 if (this_class == prev_class
6278 || prev_class == 0 || prev_class == 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006279 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006280 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02006281 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6282 || (rex.input[0] != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006283 && vim_iswordc_buf(curc, rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006284 result = FALSE;
6285 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006286 {
6287 add_here = TRUE;
6288 add_state = t->state->out;
6289 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006290 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006291
Bram Moolenaar4b780632013-05-31 22:14:52 +02006292 case NFA_BOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006293 if (rex.lnum == 0 && rex.input == rex.line
Bram Moolenaar6100d022016-10-02 16:51:57 +02006294 && (!REG_MULTI || rex.reg_firstlnum == 1))
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006295 {
6296 add_here = TRUE;
6297 add_state = t->state->out;
6298 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006299 break;
6300
6301 case NFA_EOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006302 if (rex.lnum == rex.reg_maxline && curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006303 {
6304 add_here = TRUE;
6305 add_state = t->state->out;
6306 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006307 break;
6308
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006309 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006310 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006311 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02006312 int len = 0;
6313 nfa_state_T *end;
6314 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006315 int cchars[MAX_MCO];
6316 int ccount = 0;
6317 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006318
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006319 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006320 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006321 if (utf_iscomposing(sta->c))
6322 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006323 // Only match composing character(s), ignore base
6324 // character. Used for ".{composing}" and "{composing}"
6325 // (no preceding character).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006326 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006327 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006328 if (rex.reg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006329 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006330 // If \Z was present, then ignore composing characters.
6331 // When ignoring the base character this always matches.
Bram Moolenaardff72ba2018-02-08 22:45:17 +01006332 if (sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006333 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006334 else
6335 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006336 while (sta->c != NFA_END_COMPOSING)
6337 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006338 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006339
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006340 // Check base character matches first, unless ignored.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006341 else if (len > 0 || mc == sta->c)
6342 {
6343 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006344 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006345 len += mb_char2len(mc);
6346 sta = sta->out;
6347 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006348
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006349 // We don't care about the order of composing characters.
6350 // Get them into cchars[] first.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006351 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006352 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006353 mc = mb_ptr2char(rex.input + len);
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006354 cchars[ccount++] = mc;
6355 len += mb_char2len(mc);
6356 if (ccount == MAX_MCO)
6357 break;
6358 }
6359
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006360 // Check that each composing char in the pattern matches a
6361 // composing char in the text. We do not check if all
6362 // composing chars are matched.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006363 result = OK;
6364 while (sta->c != NFA_END_COMPOSING)
6365 {
6366 for (j = 0; j < ccount; ++j)
6367 if (cchars[j] == sta->c)
6368 break;
6369 if (j == ccount)
6370 {
6371 result = FAIL;
6372 break;
6373 }
6374 sta = sta->out;
6375 }
6376 }
6377 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02006378 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006379
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006380 end = t->state->out1; // NFA_END_COMPOSING
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006381 ADD_STATE_IF_MATCH(end);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006382 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006383 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006384
6385 case NFA_NEWL:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006386 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
Bram Moolenaarc96311b2022-11-25 21:13:47 +00006387 && rex.lnum <= rex.reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006388 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02006389 go_to_nextline = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006390 // Pass -1 for the offset, which means taking the position
6391 // at the start of the next line.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006392 add_state = t->state->out;
6393 add_off = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006394 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006395 else if (curc == '\n' && rex.reg_line_lbr)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006396 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006397 // match \n as if it is an ordinary character
Bram Moolenaara2d95102013-06-04 14:23:05 +02006398 add_state = t->state->out;
6399 add_off = 1;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006400 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006401 break;
6402
Bram Moolenaar417bad22013-06-07 14:08:30 +02006403 case NFA_START_COLL:
6404 case NFA_START_NEG_COLL:
6405 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006406 // What follows is a list of characters, until NFA_END_COLL.
6407 // One of them must match or none of them must match.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006408 nfa_state_T *state;
6409 int result_if_matched;
6410 int c1, c2;
6411
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006412 // Never match EOL. If it's part of the collection it is added
6413 // as a separate state with an OR.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006414 if (curc == NUL)
6415 break;
6416
6417 state = t->state->out;
6418 result_if_matched = (t->state->c == NFA_START_COLL);
6419 for (;;)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006420 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02006421 if (state->c == NFA_END_COLL)
6422 {
6423 result = !result_if_matched;
6424 break;
6425 }
6426 if (state->c == NFA_RANGE_MIN)
6427 {
6428 c1 = state->val;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006429 state = state->out; // advance to NFA_RANGE_MAX
Bram Moolenaar417bad22013-06-07 14:08:30 +02006430 c2 = state->val;
6431#ifdef ENABLE_LOG
6432 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6433 curc, c1, c2);
6434#endif
6435 if (curc >= c1 && curc <= c2)
6436 {
6437 result = result_if_matched;
6438 break;
6439 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006440 if (rex.reg_ic)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006441 {
Bram Moolenaar59de4172020-06-09 19:34:54 +02006442 int curc_low = MB_CASEFOLD(curc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02006443 int done = FALSE;
6444
6445 for ( ; c1 <= c2; ++c1)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006446 if (MB_CASEFOLD(c1) == curc_low)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006447 {
6448 result = result_if_matched;
6449 done = TRUE;
6450 break;
6451 }
6452 if (done)
6453 break;
6454 }
6455 }
6456 else if (state->c < 0 ? check_char_class(state->c, curc)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01006457 : (curc == state->c
Bram Moolenaar59de4172020-06-09 19:34:54 +02006458 || (rex.reg_ic && MB_CASEFOLD(curc)
6459 == MB_CASEFOLD(state->c))))
Bram Moolenaar417bad22013-06-07 14:08:30 +02006460 {
6461 result = result_if_matched;
6462 break;
6463 }
6464 state = state->out;
6465 }
6466 if (result)
6467 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006468 // next state is in out of the NFA_END_COLL, out1 of
6469 // START points to the END state
Bram Moolenaar417bad22013-06-07 14:08:30 +02006470 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006471 add_off = clen;
6472 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006473 break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02006474 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006475
6476 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006477 // Matches any char; only '\0' (end of input) does not match.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006478 if (curc > 0)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006479 {
Bram Moolenaara2d95102013-06-04 14:23:05 +02006480 add_state = t->state->out;
6481 add_off = clen;
6482 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006483 break;
6484
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006485 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006486 // On a composing character skip over it. Otherwise do
6487 // nothing. Always matches.
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006488 if (enc_utf8 && utf_iscomposing(curc))
6489 {
6490 add_off = clen;
6491 }
6492 else
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006493 {
6494 add_here = TRUE;
6495 add_off = 0;
6496 }
6497 add_state = t->state->out;
6498 break;
6499
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006500 /*
6501 * Character classes like \a for alpha, \d for digit etc.
6502 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006503 case NFA_IDENT: // \i
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006504 result = vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006505 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006506 break;
6507
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006508 case NFA_SIDENT: // \I
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006509 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006510 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006511 break;
6512
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006513 case NFA_KWORD: // \k
Bram Moolenaar0270f382018-07-17 05:43:58 +02006514 result = vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006515 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006516 break;
6517
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006518 case NFA_SKWORD: // \K
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006519 result = !VIM_ISDIGIT(curc)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006520 && vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006521 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006522 break;
6523
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006524 case NFA_FNAME: // \f
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006525 result = vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006526 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006527 break;
6528
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006529 case NFA_SFNAME: // \F
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006530 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006531 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006532 break;
6533
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006534 case NFA_PRINT: // \p
Bram Moolenaar0270f382018-07-17 05:43:58 +02006535 result = vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006536 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006537 break;
6538
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006539 case NFA_SPRINT: // \P
Bram Moolenaar0270f382018-07-17 05:43:58 +02006540 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006541 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006542 break;
6543
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006544 case NFA_WHITE: // \s
Bram Moolenaar1c465442017-03-12 20:10:05 +01006545 result = VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006546 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006547 break;
6548
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006549 case NFA_NWHITE: // \S
Bram Moolenaar1c465442017-03-12 20:10:05 +01006550 result = curc != NUL && !VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006551 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006552 break;
6553
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006554 case NFA_DIGIT: // \d
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006555 result = ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006556 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006557 break;
6558
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006559 case NFA_NDIGIT: // \D
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006560 result = curc != NUL && !ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006561 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006562 break;
6563
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006564 case NFA_HEX: // \x
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006565 result = ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006566 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006567 break;
6568
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006569 case NFA_NHEX: // \X
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006570 result = curc != NUL && !ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006571 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006572 break;
6573
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006574 case NFA_OCTAL: // \o
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006575 result = ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006576 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006577 break;
6578
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006579 case NFA_NOCTAL: // \O
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006580 result = curc != NUL && !ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006581 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006582 break;
6583
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006584 case NFA_WORD: // \w
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006585 result = ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006586 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006587 break;
6588
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006589 case NFA_NWORD: // \W
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006590 result = curc != NUL && !ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006591 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006592 break;
6593
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006594 case NFA_HEAD: // \h
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006595 result = ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006596 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006597 break;
6598
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006599 case NFA_NHEAD: // \H
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006600 result = curc != NUL && !ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006601 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006602 break;
6603
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006604 case NFA_ALPHA: // \a
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006605 result = ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006606 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006607 break;
6608
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006609 case NFA_NALPHA: // \A
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006610 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006611 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006612 break;
6613
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006614 case NFA_LOWER: // \l
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006615 result = ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006616 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006617 break;
6618
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006619 case NFA_NLOWER: // \L
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006620 result = curc != NUL && !ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006621 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006622 break;
6623
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006624 case NFA_UPPER: // \u
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006625 result = ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006626 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006627 break;
6628
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006629 case NFA_NUPPER: // \U
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006630 result = curc != NUL && !ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006631 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006632 break;
6633
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006634 case NFA_LOWER_IC: // [a-z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006635 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006636 ADD_STATE_IF_MATCH(t->state);
6637 break;
6638
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006639 case NFA_NLOWER_IC: // [^a-z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006640 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006641 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006642 ADD_STATE_IF_MATCH(t->state);
6643 break;
6644
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006645 case NFA_UPPER_IC: // [A-Z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006646 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006647 ADD_STATE_IF_MATCH(t->state);
6648 break;
6649
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006650 case NFA_NUPPER_IC: // ^[A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006651 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006652 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006653 ADD_STATE_IF_MATCH(t->state);
6654 break;
6655
Bram Moolenaar5714b802013-05-28 22:03:20 +02006656 case NFA_BACKREF1:
6657 case NFA_BACKREF2:
6658 case NFA_BACKREF3:
6659 case NFA_BACKREF4:
6660 case NFA_BACKREF5:
6661 case NFA_BACKREF6:
6662 case NFA_BACKREF7:
6663 case NFA_BACKREF8:
6664 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006665#ifdef FEAT_SYN_HL
6666 case NFA_ZREF1:
6667 case NFA_ZREF2:
6668 case NFA_ZREF3:
6669 case NFA_ZREF4:
6670 case NFA_ZREF5:
6671 case NFA_ZREF6:
6672 case NFA_ZREF7:
6673 case NFA_ZREF8:
6674 case NFA_ZREF9:
6675#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006676 // \1 .. \9 \z1 .. \z9
Bram Moolenaar5714b802013-05-28 22:03:20 +02006677 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006678 int subidx;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006679 int bytelen;
6680
Bram Moolenaar1f761382023-03-25 11:31:32 +00006681#ifdef FEAT_SYN_HL
6682 if (t->state->c >= NFA_BACKREF1 && t->state->c <= NFA_BACKREF9)
6683#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006684 {
6685 subidx = t->state->c - NFA_BACKREF1 + 1;
6686 result = match_backref(&t->subs.norm, subidx, &bytelen);
6687 }
6688#ifdef FEAT_SYN_HL
6689 else
6690 {
6691 subidx = t->state->c - NFA_ZREF1 + 1;
6692 result = match_zref(subidx, &bytelen);
6693 }
6694#endif
6695
Bram Moolenaar5714b802013-05-28 22:03:20 +02006696 if (result)
6697 {
6698 if (bytelen == 0)
6699 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006700 // empty match always works, output of NFA_SKIP to be
6701 // used next
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006702 add_here = TRUE;
6703 add_state = t->state->out->out;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006704 }
6705 else if (bytelen <= clen)
6706 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006707 // match current character, jump ahead to out of
6708 // NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006709 add_state = t->state->out->out;
6710 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006711 }
6712 else
6713 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006714 // skip over the matched characters, set character
6715 // count in NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006716 add_state = t->state->out;
6717 add_off = bytelen;
6718 add_count = bytelen - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006719 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02006720 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02006721 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006722 }
6723 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006724 // character of previous matching \1 .. \9 or \@>
Bram Moolenaar5714b802013-05-28 22:03:20 +02006725 if (t->count - clen <= 0)
6726 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006727 // end of match, go to what follows
Bram Moolenaara2d95102013-06-04 14:23:05 +02006728 add_state = t->state->out;
6729 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006730 }
6731 else
6732 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006733 // add state again with decremented count
Bram Moolenaara2d95102013-06-04 14:23:05 +02006734 add_state = t->state;
6735 add_off = 0;
6736 add_count = t->count - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006737 }
6738 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006739
Bram Moolenaar423532e2013-05-29 21:14:42 +02006740 case NFA_LNUM:
6741 case NFA_LNUM_GT:
6742 case NFA_LNUM_LT:
6743 result = (REG_MULTI &&
6744 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006745 (long_u)(rex.lnum + rex.reg_firstlnum)));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006746 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006747 {
6748 add_here = TRUE;
6749 add_state = t->state->out;
6750 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006751 break;
6752
6753 case NFA_COL:
6754 case NFA_COL_GT:
6755 case NFA_COL_LT:
6756 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006757 (long_u)(rex.input - rex.line) + 1);
Bram Moolenaar423532e2013-05-29 21:14:42 +02006758 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006759 {
6760 add_here = TRUE;
6761 add_state = t->state->out;
6762 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006763 break;
6764
6765 case NFA_VCOL:
6766 case NFA_VCOL_GT:
6767 case NFA_VCOL_LT:
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006768 {
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006769 int op = t->state->c - NFA_VCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006770 colnr_T col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar6100d022016-10-02 16:51:57 +02006771 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006772
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006773 // Bail out quickly when there can't be a match, avoid the
6774 // overhead of win_linetabsize() on long lines.
Bram Moolenaar4f36dc32015-03-05 17:16:06 +01006775 if (op != 1 && col > t->state->val
Bram Moolenaara12a1612019-01-24 16:39:02 +01006776 * (has_mbyte ? MB_MAXBYTES : 1))
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006777 break;
Bram Moolenaaref795d12015-01-18 16:46:32 +01006778 result = FALSE;
6779 if (op == 1 && col - 1 > t->state->val && col > 100)
6780 {
6781 int ts = wp->w_buffer->b_p_ts;
6782
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006783 // Guess that a character won't use more columns than
6784 // 'tabstop', with a minimum of 4.
Bram Moolenaaref795d12015-01-18 16:46:32 +01006785 if (ts < 4)
6786 ts = 4;
6787 result = col > t->state->val * ts;
6788 }
6789 if (!result)
Bram Moolenaar13ed4942022-08-19 13:59:25 +01006790 {
Bram Moolenaar753aead2022-09-08 12:17:06 +01006791 linenr_T lnum = REG_MULTI
6792 ? rex.reg_firstlnum + rex.lnum : 1;
6793 long_u vcol;
Bram Moolenaar13ed4942022-08-19 13:59:25 +01006794
Bram Moolenaar753aead2022-09-08 12:17:06 +01006795 if (REG_MULTI && (lnum <= 0
6796 || lnum > wp->w_buffer->b_ml.ml_line_count))
6797 lnum = 1;
Bram Moolenaar88456cd2022-11-18 22:14:09 +00006798 vcol = (long_u)win_linetabsize(wp, lnum, rex.line, col);
Bram Moolenaar13ed4942022-08-19 13:59:25 +01006799 result = nfa_re_num_cmp(t->state->val, op, vcol + 1);
6800 }
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006801 if (result)
6802 {
6803 add_here = TRUE;
6804 add_state = t->state->out;
6805 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006806 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006807 break;
6808
Bram Moolenaar044aa292013-06-04 21:27:38 +02006809 case NFA_MARK:
6810 case NFA_MARK_GT:
6811 case NFA_MARK_LT:
6812 {
Bram Moolenaarb4ad3b02022-03-30 10:57:45 +01006813 pos_T *pos;
6814 size_t col = REG_MULTI ? rex.input - rex.line : 0;
6815
6816 pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
Bram Moolenaar044aa292013-06-04 21:27:38 +02006817
Bram Moolenaar64066b92021-11-17 18:22:56 +00006818 // Line may have been freed, get it again.
6819 if (REG_MULTI)
6820 {
6821 rex.line = reg_getline(rex.lnum);
6822 rex.input = rex.line + col;
6823 }
6824
Bram Moolenaar872bee52021-05-24 22:56:15 +02006825 // Compare the mark position to the match position, if the mark
6826 // exists and mark is set in reg_buf.
6827 if (pos != NULL && pos->lnum > 0)
6828 {
6829 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6830 && pos->col == MAXCOL
6831 ? (colnr_T)STRLEN(reg_getline(
6832 pos->lnum - rex.reg_firstlnum))
6833 : pos->col;
6834
6835 result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6836 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006837 ? t->state->c == NFA_MARK
Bram Moolenaar872bee52021-05-24 22:56:15 +02006838 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006839 ? t->state->c == NFA_MARK_GT
6840 : t->state->c == NFA_MARK_LT))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006841 : (pos->lnum < rex.lnum + rex.reg_firstlnum
Bram Moolenaar044aa292013-06-04 21:27:38 +02006842 ? t->state->c == NFA_MARK_GT
Bram Moolenaar872bee52021-05-24 22:56:15 +02006843 : t->state->c == NFA_MARK_LT));
6844 if (result)
6845 {
6846 add_here = TRUE;
6847 add_state = t->state->out;
6848 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006849 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02006850 break;
6851 }
6852
Bram Moolenaar423532e2013-05-29 21:14:42 +02006853 case NFA_CURSOR:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006854 result = (rex.reg_win != NULL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006855 && (rex.lnum + rex.reg_firstlnum
Bram Moolenaar6100d022016-10-02 16:51:57 +02006856 == rex.reg_win->w_cursor.lnum)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006857 && ((colnr_T)(rex.input - rex.line)
Bram Moolenaar6100d022016-10-02 16:51:57 +02006858 == rex.reg_win->w_cursor.col));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006859 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006860 {
6861 add_here = TRUE;
6862 add_state = t->state->out;
6863 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006864 break;
6865
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006866 case NFA_VISUAL:
6867 result = reg_match_visual();
6868 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006869 {
6870 add_here = TRUE;
6871 add_state = t->state->out;
6872 }
Bram Moolenaar973fced2013-06-05 21:10:59 +02006873 break;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006874
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006875 case NFA_MOPEN1:
6876 case NFA_MOPEN2:
6877 case NFA_MOPEN3:
6878 case NFA_MOPEN4:
6879 case NFA_MOPEN5:
6880 case NFA_MOPEN6:
6881 case NFA_MOPEN7:
6882 case NFA_MOPEN8:
6883 case NFA_MOPEN9:
6884#ifdef FEAT_SYN_HL
6885 case NFA_ZOPEN:
6886 case NFA_ZOPEN1:
6887 case NFA_ZOPEN2:
6888 case NFA_ZOPEN3:
6889 case NFA_ZOPEN4:
6890 case NFA_ZOPEN5:
6891 case NFA_ZOPEN6:
6892 case NFA_ZOPEN7:
6893 case NFA_ZOPEN8:
6894 case NFA_ZOPEN9:
6895#endif
6896 case NFA_NOPEN:
6897 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006898 // These states are only added to be able to bail out when
6899 // they are added again, nothing is to be done.
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006900 break;
6901
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006902 default: // regular character
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006903 {
6904 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006905
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006906#ifdef DEBUG
Bram Moolenaardecd9542013-06-07 16:31:50 +02006907 if (c < 0)
Bram Moolenaar097c5372023-05-24 21:02:24 +01006908 siemsg("Negative state char: %ld", (long)c);
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006909#endif
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006910 result = (c == curc);
6911
Bram Moolenaar6100d022016-10-02 16:51:57 +02006912 if (!result && rex.reg_ic)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006913 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006914 // If rex.reg_icombine is not set only skip over the character
6915 // itself. When it is set skip over composing characters.
Bram Moolenaar6100d022016-10-02 16:51:57 +02006916 if (result && enc_utf8 && !rex.reg_icombine)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006917 clen = utf_ptr2len(rex.input);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006918 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006919 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006920 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006921
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006922 } // switch (t->state->c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006923
6924 if (add_state != NULL)
6925 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006926 nfa_pim_T *pim;
Bram Moolenaara951e352013-10-06 15:46:11 +02006927 nfa_pim_T pim_copy;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006928
6929 if (t->pim.result == NFA_PIM_UNUSED)
6930 pim = NULL;
6931 else
6932 pim = &t->pim;
6933
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006934 // Handle the postponed invisible match if the match might end
6935 // without advancing and before the end of the line.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006936 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006937 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006938 if (pim->result == NFA_PIM_TODO)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006939 {
6940#ifdef ENABLE_LOG
6941 fprintf(log_fd, "\n");
6942 fprintf(log_fd, "==================================\n");
6943 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6944 fprintf(log_fd, "\n");
6945#endif
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006946 result = recursive_regmatch(pim->state, pim,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006947 prog, submatch, m, &listids, &listids_len);
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006948 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006949 // for \@! and \@<! it is a match when the result is
6950 // FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006951 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006952 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6953 || pim->state->c
6954 == NFA_START_INVISIBLE_BEFORE_NEG
6955 || pim->state->c
6956 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006957 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006958 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006959 copy_sub_off(&pim->subs.norm, &m->norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006960#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006961 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006962 copy_sub_off(&pim->subs.synt, &m->synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006963#endif
6964 }
6965 }
6966 else
6967 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006968 result = (pim->result == NFA_PIM_MATCH);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006969#ifdef ENABLE_LOG
6970 fprintf(log_fd, "\n");
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006971 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006972 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6973 fprintf(log_fd, "\n");
6974#endif
6975 }
6976
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006977 // for \@! and \@<! it is a match when result is FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006978 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006979 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6980 || pim->state->c
6981 == NFA_START_INVISIBLE_BEFORE_NEG
6982 || pim->state->c
6983 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006984 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006985 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006986 copy_sub_off(&t->subs.norm, &pim->subs.norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006987#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006988 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006989 copy_sub_off(&t->subs.synt, &pim->subs.synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006990#endif
6991 }
6992 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006993 // look-behind match failed, don't add the state
Bram Moolenaara2d95102013-06-04 14:23:05 +02006994 continue;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006995
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006996 // Postponed invisible match was handled, don't add it to
6997 // following states.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006998 pim = NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006999 }
7000
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007001 // If "pim" points into l->t it will become invalid when
7002 // adding the state causes the list to be reallocated. Make a
7003 // local copy to avoid that.
Bram Moolenaara951e352013-10-06 15:46:11 +02007004 if (pim == &t->pim)
7005 {
7006 copy_pim(&pim_copy, pim);
7007 pim = &pim_copy;
7008 }
7009
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007010 if (add_here)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007011 r = addstate_here(thislist, add_state, &t->subs,
7012 pim, &listidx);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007013 else
7014 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007015 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007016 if (add_count > 0)
7017 nextlist->t[nextlist->n - 1].count = add_count;
7018 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007019 if (r == NULL)
7020 {
7021 nfa_match = NFA_TOO_EXPENSIVE;
7022 goto theend;
7023 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007024 }
7025
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007026 } // for (thislist = thislist; thislist->state; thislist++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007027
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007028 // Look for the start of a match in the current position by adding the
7029 // start state to the list of states.
7030 // The first found match is the leftmost one, thus the order of states
7031 // matters!
7032 // Do not add the start state in recursive calls of nfa_regmatch(),
7033 // because recursive calls should only start in the first position.
7034 // Unless "nfa_endp" is not NULL, then we match the end position.
7035 // Also don't start a match past the first line.
Bram Moolenaar61602c52013-06-01 19:54:43 +02007036 if (nfa_match == FALSE
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007037 && ((toplevel
Bram Moolenaar0270f382018-07-17 05:43:58 +02007038 && rex.lnum == 0
Bram Moolenaar61602c52013-06-01 19:54:43 +02007039 && clen != 0
Bram Moolenaar6100d022016-10-02 16:51:57 +02007040 && (rex.reg_maxcol == 0
Bram Moolenaarc96311b2022-11-25 21:13:47 +00007041 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
Bram Moolenaar307aa162013-06-02 16:34:21 +02007042 || (nfa_endp != NULL
Bram Moolenaar61602c52013-06-01 19:54:43 +02007043 && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007044 ? (rex.lnum < nfa_endp->se_u.pos.lnum
7045 || (rex.lnum == nfa_endp->se_u.pos.lnum
7046 && (int)(rex.input - rex.line)
Bram Moolenaar307aa162013-06-02 16:34:21 +02007047 < nfa_endp->se_u.pos.col))
Bram Moolenaar0270f382018-07-17 05:43:58 +02007048 : rex.input < nfa_endp->se_u.ptr))))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007049 {
7050#ifdef ENABLE_LOG
7051 fprintf(log_fd, "(---) STARTSTATE\n");
7052#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007053 // Inline optimized code for addstate() if we know the state is
7054 // the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007055 if (toplevel)
7056 {
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007057 int add = TRUE;
7058 int c;
7059
7060 if (prog->regstart != NUL && clen != 0)
7061 {
7062 if (nextlist->n == 0)
7063 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007064 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007065
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007066 // Nextlist is empty, we can skip ahead to the
7067 // character that must appear at the start.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007068 if (skip_to_start(prog->regstart, &col) == FAIL)
7069 break;
7070#ifdef ENABLE_LOG
7071 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02007072 col - ((colnr_T)(rex.input - rex.line) + clen));
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007073#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007074 rex.input = rex.line + col - clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007075 }
7076 else
7077 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007078 // Checking if the required start character matches is
7079 // cheaper than adding a state that won't match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02007080 c = PTR2CHAR(rex.input + clen);
Bram Moolenaar6100d022016-10-02 16:51:57 +02007081 if (c != prog->regstart && (!rex.reg_ic
Bram Moolenaar59de4172020-06-09 19:34:54 +02007082 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007083 {
7084#ifdef ENABLE_LOG
7085 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7086#endif
7087 add = FALSE;
7088 }
7089 }
7090 }
7091
7092 if (add)
7093 {
7094 if (REG_MULTI)
Bram Moolenaar79336e12022-12-11 14:18:31 +00007095 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007096 m->norm.list.multi[0].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02007097 (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar79336e12022-12-11 14:18:31 +00007098 m->norm.orig_start_col =
7099 m->norm.list.multi[0].start_col;
7100 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007101 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007102 m->norm.list.line[0].start = rex.input + clen;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007103 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7104 {
7105 nfa_match = NFA_TOO_EXPENSIVE;
7106 goto theend;
7107 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007108 }
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007109 }
7110 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007111 {
7112 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7113 {
7114 nfa_match = NFA_TOO_EXPENSIVE;
7115 goto theend;
7116 }
7117 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007118 }
7119
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007120#ifdef ENABLE_LOG
7121 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007122 {
7123 int i;
7124
7125 for (i = 0; i < thislist->n; i++)
7126 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7127 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007128 fprintf(log_fd, "\n");
7129#endif
7130
7131nextchar:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007132 // Advance to the next character, or advance to the next line, or
7133 // finish.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007134 if (clen != 0)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007135 rex.input += clen;
Bram Moolenaar307aa162013-06-02 16:34:21 +02007136 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007137 && rex.lnum < nfa_endp->se_u.pos.lnum))
Bram Moolenaar35b23862013-05-22 23:00:40 +02007138 reg_nextline();
7139 else
7140 break;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007141
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007142 // Allow interrupting with CTRL-C.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007143 line_breakcheck();
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007144 if (got_int)
7145 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007146#ifdef FEAT_RELTIME
Paul Ollis65745772022-06-05 16:55:54 +01007147 if (nfa_did_time_out())
Bram Moolenaar305abc62022-05-28 11:08:40 +01007148 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007149#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02007150 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007151
7152#ifdef ENABLE_LOG
7153 if (log_fd != stderr)
7154 fclose(log_fd);
7155 log_fd = NULL;
7156#endif
7157
7158theend:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007159 // Free memory
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007160 vim_free(list[0].t);
7161 vim_free(list[1].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02007162 vim_free(listids);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007163#undef ADD_STATE_IF_MATCH
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02007164#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007165 fclose(debug);
7166#endif
7167
Bram Moolenaar963fee22013-05-26 21:47:28 +02007168 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007169}
7170
7171/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02007172 * Try to match "prog" at rex.line["col"].
Bram Moolenaar8c731502014-11-23 15:57:49 +01007173 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007174 */
7175 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007176nfa_regtry(
7177 nfa_regprog_T *prog,
7178 colnr_T col,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007179 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007180{
7181 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007182 regsubs_T subs, m;
7183 nfa_state_T *start = prog->start;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007184 int result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007185#ifdef ENABLE_LOG
7186 FILE *f;
7187#endif
7188
Bram Moolenaar0270f382018-07-17 05:43:58 +02007189 rex.input = rex.line + col;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007190#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007191 nfa_timed_out = timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007192#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007193
7194#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007195 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007196 if (f != NULL)
7197 {
Bram Moolenaar87953742013-06-05 18:52:40 +02007198 fprintf(f, "\n\n\t=======================================================\n");
Bram Moolenaar097c5372023-05-24 21:02:24 +01007199# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007200 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar097c5372023-05-24 21:02:24 +01007201# endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007202 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02007203 fprintf(f, "\t=======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02007204 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007205 fprintf(f, "\n\n");
7206 fclose(f);
7207 }
7208 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01007209 emsg("Could not open temporary log file for writing");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007210#endif
7211
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007212 clear_sub(&subs.norm);
7213 clear_sub(&m.norm);
7214#ifdef FEAT_SYN_HL
7215 clear_sub(&subs.synt);
7216 clear_sub(&m.synt);
7217#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007218
Bram Moolenaarfda37292014-11-05 14:27:36 +01007219 result = nfa_regmatch(prog, start, &subs, &m);
7220 if (result == FALSE)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007221 return 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007222 else if (result == NFA_TOO_EXPENSIVE)
7223 return result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007224
7225 cleanup_subexpr();
7226 if (REG_MULTI)
7227 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007228 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007229 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007230 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7231 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007232
Bram Moolenaar6100d022016-10-02 16:51:57 +02007233 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7234 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007235 }
Bram Moolenaar79336e12022-12-11 14:18:31 +00007236 if (rex.reg_mmatch != NULL)
7237 rex.reg_mmatch->rmm_matchcol = subs.norm.orig_start_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007238
Bram Moolenaar6100d022016-10-02 16:51:57 +02007239 if (rex.reg_startpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007240 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007241 rex.reg_startpos[0].lnum = 0;
7242 rex.reg_startpos[0].col = col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007243 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02007244 if (rex.reg_endpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007245 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007246 // pattern has a \ze but it didn't match, use current end
Bram Moolenaar0270f382018-07-17 05:43:58 +02007247 rex.reg_endpos[0].lnum = rex.lnum;
7248 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007249 }
7250 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007251 // Use line number of "\ze".
Bram Moolenaar0270f382018-07-17 05:43:58 +02007252 rex.lnum = rex.reg_endpos[0].lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007253 }
7254 else
7255 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007256 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007257 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007258 rex.reg_startp[i] = subs.norm.list.line[i].start;
7259 rex.reg_endp[i] = subs.norm.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007260 }
7261
Bram Moolenaar6100d022016-10-02 16:51:57 +02007262 if (rex.reg_startp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007263 rex.reg_startp[0] = rex.line + col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007264 if (rex.reg_endp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007265 rex.reg_endp[0] = rex.input;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007266 }
7267
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007268#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007269 // Package any found \z(...\) matches for export. Default is none.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007270 unref_extmatch(re_extmatch_out);
7271 re_extmatch_out = NULL;
7272
7273 if (prog->reghasz == REX_SET)
7274 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007275 cleanup_zsubexpr();
7276 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01007277 if (re_extmatch_out == NULL)
7278 return 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007279 // Loop over \z1, \z2, etc. There is no \z0.
Bram Moolenaar5ad075c2015-11-24 15:18:32 +01007280 for (i = 1; i < subs.synt.in_use; i++)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007281 {
7282 if (REG_MULTI)
7283 {
7284 struct multipos *mpos = &subs.synt.list.multi[i];
7285
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007286 // Only accept single line matches that are valid.
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007287 if (mpos->start_lnum >= 0
7288 && mpos->start_lnum == mpos->end_lnum
7289 && mpos->end_col >= mpos->start_col)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007290 re_extmatch_out->matches[i] =
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007291 vim_strnsave(reg_getline(mpos->start_lnum)
7292 + mpos->start_col,
7293 mpos->end_col - mpos->start_col);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007294 }
7295 else
7296 {
7297 struct linepos *lpos = &subs.synt.list.line[i];
7298
7299 if (lpos->start != NULL && lpos->end != NULL)
7300 re_extmatch_out->matches[i] =
Bram Moolenaar71ccd032020-06-12 22:59:11 +02007301 vim_strnsave(lpos->start, lpos->end - lpos->start);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007302 }
7303 }
7304 }
7305#endif
7306
Bram Moolenaar0270f382018-07-17 05:43:58 +02007307 return 1 + rex.lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007308}
7309
7310/*
7311 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02007312 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007313 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007314 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007315 */
7316 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007317nfa_regexec_both(
7318 char_u *line,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007319 colnr_T startcol, // column to start looking for match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007320 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007321{
7322 nfa_regprog_T *prog;
7323 long retval = 0L;
7324 int i;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007325 colnr_T col = startcol;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007326
7327 if (REG_MULTI)
7328 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007329 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007330 line = reg_getline((linenr_T)0); // relative to the cursor
Bram Moolenaar6100d022016-10-02 16:51:57 +02007331 rex.reg_startpos = rex.reg_mmatch->startpos;
7332 rex.reg_endpos = rex.reg_mmatch->endpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007333 }
7334 else
7335 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007336 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7337 rex.reg_startp = rex.reg_match->startp;
7338 rex.reg_endp = rex.reg_match->endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007339 }
7340
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007341 // Be paranoid...
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007342 if (prog == NULL || line == NULL)
7343 {
RestorerZ68ebcee2023-05-31 17:12:14 +01007344 iemsg(e_null_argument);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007345 goto theend;
7346 }
7347
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007348 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007349 if (prog->regflags & RF_ICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007350 rex.reg_ic = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007351 else if (prog->regflags & RF_NOICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007352 rex.reg_ic = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007353
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007354 // If pattern contains "\Z" overrule value of rex.reg_icombine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007355 if (prog->regflags & RF_ICOMBINE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007356 rex.reg_icombine = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007357
Bram Moolenaar0270f382018-07-17 05:43:58 +02007358 rex.line = line;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007359 rex.lnum = 0; // relative to line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007360
Bram Moolenaar0270f382018-07-17 05:43:58 +02007361 rex.nfa_has_zend = prog->has_zend;
7362 rex.nfa_has_backref = prog->has_backref;
7363 rex.nfa_nsubexpr = prog->nsubexp;
7364 rex.nfa_listid = 1;
7365 rex.nfa_alt_listid = 2;
7366#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007367 nfa_regengine.expr = prog->pattern;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007368#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007369
Bram Moolenaard89616e2013-06-06 18:46:06 +02007370 if (prog->reganch && col > 0)
7371 return 0L;
7372
Bram Moolenaar0270f382018-07-17 05:43:58 +02007373 rex.need_clear_subexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007374#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007375 // Clear the external match subpointers if necessary.
Bram Moolenaar473de612013-06-08 18:19:48 +02007376 if (prog->reghasz == REX_SET)
7377 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007378 rex.nfa_has_zsubexpr = TRUE;
7379 rex.need_clear_zsubexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007380 }
7381 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007382 {
7383 rex.nfa_has_zsubexpr = FALSE;
7384 rex.need_clear_zsubexpr = FALSE;
7385 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007386#endif
7387
Bram Moolenaard89616e2013-06-06 18:46:06 +02007388 if (prog->regstart != NUL)
Bram Moolenaar473de612013-06-08 18:19:48 +02007389 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007390 // Skip ahead until a character we know the match must start with.
7391 // When there is none there is no match.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007392 if (skip_to_start(prog->regstart, &col) == FAIL)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007393 return 0L;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007394
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007395 // If match_text is set it contains the full text that must match.
7396 // Nothing else to try. Doesn't handle combining chars well.
Bram Moolenaara12a1612019-01-24 16:39:02 +01007397 if (prog->match_text != NULL && !rex.reg_icombine)
Bram Moolenaar01105b32022-11-26 11:47:10 +00007398 {
Bram Moolenaar79336e12022-12-11 14:18:31 +00007399 retval = find_match_text(&col, prog->regstart, prog->match_text);
Bram Moolenaar01105b32022-11-26 11:47:10 +00007400 if (REG_MULTI)
7401 rex.reg_mmatch->rmm_matchcol = col;
7402 else
7403 rex.reg_match->rm_matchcol = col;
7404 return retval;
7405 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007406 }
7407
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007408 // If the start column is past the maximum column: no need to try.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007409 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007410 goto theend;
7411
Bram Moolenaar0270f382018-07-17 05:43:58 +02007412 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7413 // it's accidentally used during execution.
7414 nstate = 0;
7415 for (i = 0; i < prog->nstate; ++i)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007416 {
7417 prog->state[i].id = i;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02007418 prog->state[i].lastlist[0] = 0;
7419 prog->state[i].lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007420 }
7421
Paul Ollis65745772022-06-05 16:55:54 +01007422 retval = nfa_regtry(prog, col, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007423
Bram Moolenaar0270f382018-07-17 05:43:58 +02007424#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007425 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007426#endif
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007427
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007428theend:
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007429 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007430 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007431 // Make sure the end is never before the start. Can happen when \zs and
7432 // \ze are used.
7433 if (REG_MULTI)
7434 {
7435 lpos_T *start = &rex.reg_mmatch->startpos[0];
7436 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007437
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007438 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007439 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007440 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7441 }
7442 else
7443 {
7444 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7445 rex.reg_match->endp[0] = rex.reg_match->startp[0];
Bram Moolenaar01105b32022-11-26 11:47:10 +00007446
7447 // startpos[0] may be set by "\zs", also return the column where
7448 // the whole pattern matched.
7449 rex.reg_match->rm_matchcol = col;
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007450 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007451 }
7452
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007453 return retval;
7454}
7455
7456/*
7457 * Compile a regular expression into internal code for the NFA matcher.
7458 * Returns the program in allocated space. Returns NULL for an error.
7459 */
7460 static regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01007461nfa_regcomp(char_u *expr, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007462{
Bram Moolenaaraae48832013-05-25 21:18:34 +02007463 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02007464 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007465 int *postfix;
7466
7467 if (expr == NULL)
7468 return NULL;
7469
Bram Moolenaar0270f382018-07-17 05:43:58 +02007470#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007471 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007472#endif
Bram Moolenaare0ad3652015-01-27 12:59:55 +01007473 nfa_re_flags = re_flags;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007474
7475 init_class_tab();
7476
7477 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7478 return NULL;
7479
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007480 // Build postfix form of the regexp. Needed to build the NFA
7481 // (and count its size).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007482 postfix = re2post();
7483 if (postfix == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007484 goto fail; // Cascaded (syntax?) error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007485
7486 /*
7487 * In order to build the NFA, we parse the input regexp twice:
7488 * 1. first pass to count size (so we can allocate space)
7489 * 2. second to emit code
7490 */
7491#ifdef ENABLE_LOG
7492 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007493 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007494
7495 if (f != NULL)
7496 {
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02007497 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007498 fclose(f);
7499 }
7500 }
7501#endif
7502
7503 /*
7504 * PASS 1
7505 * Count number of NFA states in "nstate". Do not build the NFA.
7506 */
7507 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007508
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007509 // allocate the regprog with space for the compiled regexp
zeertzjq1b438a82023-02-01 13:11:15 +00007510 prog_size = offsetof(nfa_regprog_T, state) + sizeof(nfa_state_T) * nstate;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02007511 prog = alloc(prog_size);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007512 if (prog == NULL)
7513 goto fail;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007514 state_ptr = prog->state;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007515 prog->re_in_use = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007516
7517 /*
7518 * PASS 2
7519 * Build the NFA
7520 */
7521 prog->start = post2nfa(postfix, post_ptr, FALSE);
7522 if (prog->start == NULL)
7523 goto fail;
7524
7525 prog->regflags = regflags;
7526 prog->engine = &nfa_regengine;
7527 prog->nstate = nstate;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007528 prog->has_zend = rex.nfa_has_zend;
7529 prog->has_backref = rex.nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02007530 prog->nsubexp = regnpar;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007531
Bram Moolenaara2947e22013-06-11 22:44:09 +02007532 nfa_postprocess(prog);
7533
Bram Moolenaard89616e2013-06-06 18:46:06 +02007534 prog->reganch = nfa_get_reganch(prog->start, 0);
7535 prog->regstart = nfa_get_regstart(prog->start, 0);
Bram Moolenaar473de612013-06-08 18:19:48 +02007536 prog->match_text = nfa_get_match_text(prog->start);
7537
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007538#ifdef ENABLE_LOG
7539 nfa_postfix_dump(expr, OK);
7540 nfa_dump(prog);
7541#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007542#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007543 // Remember whether this pattern has any \z specials in it.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007544 prog->reghasz = re_has_z;
7545#endif
Bram Moolenaar473de612013-06-08 18:19:48 +02007546 prog->pattern = vim_strsave(expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02007547#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007548 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007549#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007550
7551out:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007552 VIM_CLEAR(post_start);
7553 post_ptr = post_end = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007554 state_ptr = NULL;
7555 return (regprog_T *)prog;
7556
7557fail:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007558 VIM_CLEAR(prog);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007559#ifdef ENABLE_LOG
7560 nfa_postfix_dump(expr, FAIL);
7561#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007562#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007563 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007564#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007565 goto out;
7566}
7567
Bram Moolenaar473de612013-06-08 18:19:48 +02007568/*
7569 * Free a compiled regexp program, returned by nfa_regcomp().
7570 */
7571 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01007572nfa_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02007573{
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00007574 if (prog == NULL)
7575 return;
7576
7577 vim_free(((nfa_regprog_T *)prog)->match_text);
7578 vim_free(((nfa_regprog_T *)prog)->pattern);
7579 vim_free(prog);
Bram Moolenaar473de612013-06-08 18:19:48 +02007580}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007581
7582/*
7583 * Match a regexp against a string.
7584 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7585 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaar2af78a12014-04-23 19:06:37 +02007586 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007587 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007588 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007589 */
7590 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01007591nfa_regexec_nl(
7592 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007593 char_u *line, // string to match against
7594 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01007595 int line_lbr)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007596{
Bram Moolenaar6100d022016-10-02 16:51:57 +02007597 rex.reg_match = rmp;
7598 rex.reg_mmatch = NULL;
7599 rex.reg_maxline = 0;
7600 rex.reg_line_lbr = line_lbr;
7601 rex.reg_buf = curbuf;
7602 rex.reg_win = NULL;
7603 rex.reg_ic = rmp->rm_ic;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007604 rex.reg_icombine = FALSE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007605 rex.reg_maxcol = 0;
Paul Ollis65745772022-06-05 16:55:54 +01007606 return nfa_regexec_both(line, col, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007607}
7608
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007609
7610/*
7611 * Match a regexp against multiple lines.
7612 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7613 * Uses curbuf for line count and 'iskeyword'.
7614 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007615 * Return <= 0 if there is no match. Return number of lines contained in the
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007616 * match otherwise.
7617 *
7618 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7619 *
7620 * ! Also NOTE : match may actually be in another line. e.g.:
7621 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7622 *
7623 * +-------------------------+
7624 * |a |
7625 * |b |
7626 * |c |
7627 * | |
7628 * +-------------------------+
7629 *
7630 * then nfa_regexec_multi() returns 3. while the original
7631 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7632 *
7633 * FIXME if this behavior is not compatible.
7634 */
7635 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007636nfa_regexec_multi(
7637 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007638 win_T *win, // window in which to search or NULL
7639 buf_T *buf, // buffer in which to search
7640 linenr_T lnum, // nr of line to start looking for match
7641 colnr_T col, // column to start looking for match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007642 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007643{
Bram Moolenaarf4140482020-02-15 23:06:45 +01007644 init_regexec_multi(rmp, win, buf, lnum);
Paul Ollis65745772022-06-05 16:55:54 +01007645 return nfa_regexec_both(NULL, col, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007646}
7647
7648#ifdef DEBUG
7649# undef ENABLE_LOG
7650#endif