/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * NFA regular expression implementation.
 *
 * This file is included in "regexp.c".
 */

/*
 * Logging of NFA engine.
 *
 * The NFA engine can write four log files:
 * - Error log: Contains NFA engine's fatal errors.
 * - Dump log:  Contains compiled NFA state machine's information.
 * - Run log:   Contains information of matching procedure.
 * - Debug log: Contains detailed information of matching procedure.  Can be
 *		disabled by undefining NFA_REGEXP_DEBUG_LOG.
 * The first one can also be used without debug mode.
 * The last three are enabled when compiled as debug mode and individually
 * disabled by commenting them out.
 * The log files can get quite big!
 * To disable all of this when compiling Vim for debugging, undefine DEBUG in
 * regexp.c
 */
#ifdef DEBUG
# define NFA_REGEXP_ERROR_LOG	"nfa_regexp_error.log"
# define ENABLE_LOG
# define NFA_REGEXP_DUMP_LOG	"nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG	"nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG	"nfa_regexp_debug.log"
#endif

// Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
// Must be larger than the distance between NFA_ANY and NFA_NUPPER_IC, so
// that the "with NL" range NFA_FIRST_NL - NFA_LAST_NL does not overlap the
// plain character class items.
#define NFA_ADD_NL		31

/*
 * Codes used in the postfix form of the pattern and in the NFA states.
 * They start at -1024 and count upwards.
 * The order of the items matters: several groups are addressed by offset
 * (e.g. NFA_MOPEN + N, NFA_ANY + NFA_ADD_NL), do not reorder them.
 */
enum
{
    NFA_SPLIT = -1024,
    NFA_MATCH,
    NFA_EMPTY,			    // matches 0-length

    NFA_START_COLL,		    // [abc] start
    NFA_END_COLL,		    // [abc] end
    NFA_START_NEG_COLL,		    // [^abc] start
    NFA_END_NEG_COLL,		    // [^abc] end (postfix only)
    NFA_RANGE,			    // range of the two previous items
				    // (postfix only)
    NFA_RANGE_MIN,		    // low end of a range
    NFA_RANGE_MAX,		    // high end of a range

    NFA_CONCAT,			    // concatenate two previous items (postfix
				    // only)
    NFA_OR,			    // \| (postfix only)
    NFA_STAR,			    // greedy * (postfix only)
    NFA_STAR_NONGREEDY,		    // non-greedy * (postfix only)
    NFA_QUEST,			    // greedy \? (postfix only)
    NFA_QUEST_NONGREEDY,	    // non-greedy \? (postfix only)

    NFA_BOL,			    // ^    Begin line
    NFA_EOL,			    // $    End line
    NFA_BOW,			    // \<   Begin word
    NFA_EOW,			    // \>   End word
    NFA_BOF,			    // \%^  Begin file
    NFA_EOF,			    // \%$  End file
    NFA_NEWL,
    NFA_ZSTART,			    // Used for \zs
    NFA_ZEND,			    // Used for \ze
    NFA_NOPEN,			    // Start of subexpression marked with \%(
    NFA_NCLOSE,			    // End of subexpr. marked with \%( ... \)
    NFA_START_INVISIBLE,
    NFA_START_INVISIBLE_FIRST,
    NFA_START_INVISIBLE_NEG,
    NFA_START_INVISIBLE_NEG_FIRST,
    NFA_START_INVISIBLE_BEFORE,
    NFA_START_INVISIBLE_BEFORE_FIRST,
    NFA_START_INVISIBLE_BEFORE_NEG,
    NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
    NFA_START_PATTERN,
    NFA_END_INVISIBLE,
    NFA_END_INVISIBLE_NEG,
    NFA_END_PATTERN,
    NFA_COMPOSING,		    // Next nodes in NFA are part of the
				    // composing multibyte char
    NFA_END_COMPOSING,		    // End of a composing char in the NFA
    NFA_ANY_COMPOSING,		    // \%C: Any composing characters.
    NFA_OPT_CHARS,		    // \%[abc]

    // The following are used only in the postfix form, not in the NFA
    NFA_PREV_ATOM_NO_WIDTH,	    // Used for \@=
    NFA_PREV_ATOM_NO_WIDTH_NEG,	    // Used for \@!
    NFA_PREV_ATOM_JUST_BEFORE,	    // Used for \@<=
    NFA_PREV_ATOM_JUST_BEFORE_NEG,  // Used for \@<!
    NFA_PREV_ATOM_LIKE_PATTERN,	    // Used for \@>

    NFA_BACKREF1,		    // \1
    NFA_BACKREF2,		    // \2
    NFA_BACKREF3,		    // \3
    NFA_BACKREF4,		    // \4
    NFA_BACKREF5,		    // \5
    NFA_BACKREF6,		    // \6
    NFA_BACKREF7,		    // \7
    NFA_BACKREF8,		    // \8
    NFA_BACKREF9,		    // \9
#ifdef FEAT_SYN_HL
    NFA_ZREF1,			    // \z1
    NFA_ZREF2,			    // \z2
    NFA_ZREF3,			    // \z3
    NFA_ZREF4,			    // \z4
    NFA_ZREF5,			    // \z5
    NFA_ZREF6,			    // \z6
    NFA_ZREF7,			    // \z7
    NFA_ZREF8,			    // \z8
    NFA_ZREF9,			    // \z9
#endif
    NFA_SKIP,			    // Skip characters

    NFA_MOPEN,
    NFA_MOPEN1,
    NFA_MOPEN2,
    NFA_MOPEN3,
    NFA_MOPEN4,
    NFA_MOPEN5,
    NFA_MOPEN6,
    NFA_MOPEN7,
    NFA_MOPEN8,
    NFA_MOPEN9,

    NFA_MCLOSE,
    NFA_MCLOSE1,
    NFA_MCLOSE2,
    NFA_MCLOSE3,
    NFA_MCLOSE4,
    NFA_MCLOSE5,
    NFA_MCLOSE6,
    NFA_MCLOSE7,
    NFA_MCLOSE8,
    NFA_MCLOSE9,

#ifdef FEAT_SYN_HL
    NFA_ZOPEN,
    NFA_ZOPEN1,
    NFA_ZOPEN2,
    NFA_ZOPEN3,
    NFA_ZOPEN4,
    NFA_ZOPEN5,
    NFA_ZOPEN6,
    NFA_ZOPEN7,
    NFA_ZOPEN8,
    NFA_ZOPEN9,

    NFA_ZCLOSE,
    NFA_ZCLOSE1,
    NFA_ZCLOSE2,
    NFA_ZCLOSE3,
    NFA_ZCLOSE4,
    NFA_ZCLOSE5,
    NFA_ZCLOSE6,
    NFA_ZCLOSE7,
    NFA_ZCLOSE8,
    NFA_ZCLOSE9,
#endif

    // NFA_FIRST_NL
    NFA_ANY,			    // Match any one character.
    NFA_IDENT,			    // Match identifier char
    NFA_SIDENT,			    // Match identifier char but no digit
    NFA_KWORD,			    // Match keyword char
    NFA_SKWORD,			    // Match word char but no digit
    NFA_FNAME,			    // Match file name char
    NFA_SFNAME,			    // Match file name char but no digit
    NFA_PRINT,			    // Match printable char
    NFA_SPRINT,			    // Match printable char but no digit
    NFA_WHITE,			    // Match whitespace char
    NFA_NWHITE,			    // Match non-whitespace char
    NFA_DIGIT,			    // Match digit char
    NFA_NDIGIT,			    // Match non-digit char
    NFA_HEX,			    // Match hex char
    NFA_NHEX,			    // Match non-hex char
    NFA_OCTAL,			    // Match octal char
    NFA_NOCTAL,			    // Match non-octal char
    NFA_WORD,			    // Match word char
    NFA_NWORD,			    // Match non-word char
    NFA_HEAD,			    // Match head char
    NFA_NHEAD,			    // Match non-head char
    NFA_ALPHA,			    // Match alpha char
    NFA_NALPHA,			    // Match non-alpha char
    NFA_LOWER,			    // Match lowercase char
    NFA_NLOWER,			    // Match non-lowercase char
    NFA_UPPER,			    // Match uppercase char
    NFA_NUPPER,			    // Match non-uppercase char
    NFA_LOWER_IC,		    // Match [a-z]
    NFA_NLOWER_IC,		    // Match [^a-z]
    NFA_UPPER_IC,		    // Match [A-Z]
    NFA_NUPPER_IC,		    // Match [^A-Z]

    // Range of the character class items above with NFA_ADD_NL added.
    NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
    NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

    NFA_CURSOR,			    // Match cursor pos
    NFA_LNUM,			    // Match line number
    NFA_LNUM_GT,		    // Match > line number
    NFA_LNUM_LT,		    // Match < line number
    NFA_COL,			    // Match cursor column
    NFA_COL_GT,			    // Match > cursor column
    NFA_COL_LT,			    // Match < cursor column
    NFA_VCOL,			    // Match cursor virtual column
    NFA_VCOL_GT,		    // Match > cursor virtual column
    NFA_VCOL_LT,		    // Match < cursor virtual column
    NFA_MARK,			    // Match mark
    NFA_MARK_GT,		    // Match > mark
    NFA_MARK_LT,		    // Match < mark
    NFA_VISUAL,			    // Match Visual area

    // Character classes [:alnum:] etc
    NFA_CLASS_ALNUM,
    NFA_CLASS_ALPHA,
    NFA_CLASS_BLANK,
    NFA_CLASS_CNTRL,
    NFA_CLASS_DIGIT,
    NFA_CLASS_GRAPH,
    NFA_CLASS_LOWER,
    NFA_CLASS_PRINT,
    NFA_CLASS_PUNCT,
    NFA_CLASS_SPACE,
    NFA_CLASS_UPPER,
    NFA_CLASS_XDIGIT,
    NFA_CLASS_TAB,
    NFA_CLASS_RETURN,
    NFA_CLASS_BACKSPACE,
    NFA_CLASS_ESCAPE,
    NFA_CLASS_IDENT,
    NFA_CLASS_KEYWORD,
    NFA_CLASS_FNAME
};

Bram Moolenaar63d9e732019-12-05 21:10:38 +0100235// Keep in sync with classchars.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200236static int nfa_classcodes[] = {
237 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243 NFA_UPPER, NFA_NUPPER
244};
245
Bram Moolenaar0270f382018-07-17 05:43:58 +0200246// Variables only used in nfa_regcomp() and descendants.
247static int nfa_re_flags; // re_flags passed to nfa_regcomp()
248static int *post_start; // holds the postfix form of r.e.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200249static int *post_end;
250static int *post_ptr;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100251
252// Set when the pattern should use the NFA engine.
253// E.g. [[:upper:]] only allows 8bit characters for BT engine,
254// while NFA engine handles multibyte characters correctly.
255static int wants_nfa;
256
Bram Moolenaar0270f382018-07-17 05:43:58 +0200257static int nstate; // Number of states in the NFA.
258static int istate; // Index in the state vector, used in alloc_state()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200259
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100260// If not NULL match must end at this position
Bram Moolenaar307aa162013-06-02 16:34:21 +0200261static save_se_T *nfa_endp = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200262
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100263// 0 for first call to nfa_regmatch(), 1 for recursive call.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +0200264static int nfa_ll_index = 0;
265
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100266static int realloc_post_list(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100267static int nfa_reg(int paren);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200268#ifdef DEBUG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100269static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200270#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100271static int match_follows(nfa_state_T *startstate, int depth);
272static int failure_chance(nfa_state_T *state, int depth);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200273
// Helper used when doing re2post() ... regatom() parsing.
// EMIT(c) appends "c" to the postfix list.  When the list is full it is
// first enlarged with realloc_post_list().
// Note: the macro contains "return FAIL", it makes the calling function
// return when enlarging the list fails.
#define EMIT(c)	do { \
		    if (post_ptr >= post_end && realloc_post_list() == FAIL) \
			return FAIL; \
		    *post_ptr++ = (c); \
		} while (0)

Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200281/*
282 * Initialize internal variables before NFA compilation.
283 * Return OK on success, FAIL otherwise.
284 */
285 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100286nfa_regcomp_start(
287 char_u *expr,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100288 int re_flags) // see vim_regcomp()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200289{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200290 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200291 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200292
293 nstate = 0;
294 istate = 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295 // A reasonable estimation for maximum size
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200296 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200297
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100298 // Some items blow up in size, such as [A-z]. Add more space for that.
299 // When it is still not enough realloc_post_list() will be used.
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200300 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200301
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302 // Size for postfix representation of expr.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200303 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200304
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200305 post_start = alloc(postfix_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200306 if (post_start == NULL)
307 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200308 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200309 post_end = post_start + nstate_max;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100310 wants_nfa = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +0200311 rex.nfa_has_zend = FALSE;
312 rex.nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200313
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100314 // shared with BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200315 regcomp_start(expr, re_flags);
316
317 return OK;
318}
319
320/*
Bram Moolenaard89616e2013-06-06 18:46:06 +0200321 * Figure out if the NFA state list starts with an anchor, must match at start
322 * of the line.
323 */
324 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100325nfa_get_reganch(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200326{
327 nfa_state_T *p = start;
328
329 if (depth > 4)
330 return 0;
331
332 while (p != NULL)
333 {
334 switch (p->c)
335 {
336 case NFA_BOL:
337 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100338 return 1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200339
340 case NFA_ZSTART:
341 case NFA_ZEND:
342 case NFA_CURSOR:
343 case NFA_VISUAL:
344
345 case NFA_MOPEN:
346 case NFA_MOPEN1:
347 case NFA_MOPEN2:
348 case NFA_MOPEN3:
349 case NFA_MOPEN4:
350 case NFA_MOPEN5:
351 case NFA_MOPEN6:
352 case NFA_MOPEN7:
353 case NFA_MOPEN8:
354 case NFA_MOPEN9:
355 case NFA_NOPEN:
356#ifdef FEAT_SYN_HL
357 case NFA_ZOPEN:
358 case NFA_ZOPEN1:
359 case NFA_ZOPEN2:
360 case NFA_ZOPEN3:
361 case NFA_ZOPEN4:
362 case NFA_ZOPEN5:
363 case NFA_ZOPEN6:
364 case NFA_ZOPEN7:
365 case NFA_ZOPEN8:
366 case NFA_ZOPEN9:
367#endif
368 p = p->out;
369 break;
370
371 case NFA_SPLIT:
372 return nfa_get_reganch(p->out, depth + 1)
373 && nfa_get_reganch(p->out1, depth + 1);
374
375 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100376 return 0; // noooo
Bram Moolenaard89616e2013-06-06 18:46:06 +0200377 }
378 }
379 return 0;
380}
381
382/*
383 * Figure out if the NFA state list starts with a character which must match
384 * at start of the match.
385 */
386 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100387nfa_get_regstart(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200388{
389 nfa_state_T *p = start;
390
391 if (depth > 4)
392 return 0;
393
394 while (p != NULL)
395 {
396 switch (p->c)
397 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100398 // all kinds of zero-width matches
Bram Moolenaard89616e2013-06-06 18:46:06 +0200399 case NFA_BOL:
400 case NFA_BOF:
401 case NFA_BOW:
402 case NFA_EOW:
403 case NFA_ZSTART:
404 case NFA_ZEND:
405 case NFA_CURSOR:
406 case NFA_VISUAL:
407 case NFA_LNUM:
408 case NFA_LNUM_GT:
409 case NFA_LNUM_LT:
410 case NFA_COL:
411 case NFA_COL_GT:
412 case NFA_COL_LT:
413 case NFA_VCOL:
414 case NFA_VCOL_GT:
415 case NFA_VCOL_LT:
416 case NFA_MARK:
417 case NFA_MARK_GT:
418 case NFA_MARK_LT:
419
420 case NFA_MOPEN:
421 case NFA_MOPEN1:
422 case NFA_MOPEN2:
423 case NFA_MOPEN3:
424 case NFA_MOPEN4:
425 case NFA_MOPEN5:
426 case NFA_MOPEN6:
427 case NFA_MOPEN7:
428 case NFA_MOPEN8:
429 case NFA_MOPEN9:
430 case NFA_NOPEN:
431#ifdef FEAT_SYN_HL
432 case NFA_ZOPEN:
433 case NFA_ZOPEN1:
434 case NFA_ZOPEN2:
435 case NFA_ZOPEN3:
436 case NFA_ZOPEN4:
437 case NFA_ZOPEN5:
438 case NFA_ZOPEN6:
439 case NFA_ZOPEN7:
440 case NFA_ZOPEN8:
441 case NFA_ZOPEN9:
442#endif
443 p = p->out;
444 break;
445
446 case NFA_SPLIT:
447 {
448 int c1 = nfa_get_regstart(p->out, depth + 1);
449 int c2 = nfa_get_regstart(p->out1, depth + 1);
450
451 if (c1 == c2)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100452 return c1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200453 return 0;
454 }
455
456 default:
Bram Moolenaardecd9542013-06-07 16:31:50 +0200457 if (p->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100458 return p->c; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200459 return 0;
460 }
461 }
462 return 0;
463}
464
465/*
Bram Moolenaar473de612013-06-08 18:19:48 +0200466 * Figure out if the NFA state list contains just literal text and nothing
Bram Moolenaare7766ee2013-06-08 22:30:03 +0200467 * else. If so return a string in allocated memory with what must match after
468 * regstart. Otherwise return NULL.
Bram Moolenaar473de612013-06-08 18:19:48 +0200469 */
470 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100471nfa_get_match_text(nfa_state_T *start)
Bram Moolenaar473de612013-06-08 18:19:48 +0200472{
473 nfa_state_T *p = start;
474 int len = 0;
475 char_u *ret;
476 char_u *s;
477
478 if (p->c != NFA_MOPEN)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100479 return NULL; // just in case
Bram Moolenaar473de612013-06-08 18:19:48 +0200480 p = p->out;
481 while (p->c > 0)
482 {
483 len += MB_CHAR2LEN(p->c);
484 p = p->out;
485 }
486 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
487 return NULL;
488
489 ret = alloc(len);
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000490 if (ret == NULL)
491 return NULL;
492
493 p = start->out->out; // skip first char, it goes into regstart
494 s = ret;
495 while (p->c > 0)
Bram Moolenaar473de612013-06-08 18:19:48 +0200496 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000497 if (has_mbyte)
498 s += (*mb_char2bytes)(p->c, s);
499 else
500 *s++ = p->c;
501 p = p->out;
Bram Moolenaar473de612013-06-08 18:19:48 +0200502 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000503 *s = NUL;
Bram Moolenaar473de612013-06-08 18:19:48 +0200504 return ret;
505}
506
507/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200508 * Allocate more space for post_start. Called when
509 * running above the estimated number of states.
510 */
511 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100512realloc_post_list(void)
Bram Moolenaar16299b52013-05-30 18:45:23 +0200513{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200514 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100515 int new_max;
Bram Moolenaar16299b52013-05-30 18:45:23 +0200516 int *new_start;
517 int *old_start;
518
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100519 // For weird patterns the number of states can be very high. Increasing by
520 // 50% seems a reasonable compromise between memory use and speed.
521 new_max = nstate_max * 3 / 2;
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200522 new_start = ALLOC_MULT(int, new_max);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200523 if (new_start == NULL)
524 return FAIL;
525 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
Bram Moolenaar16299b52013-05-30 18:45:23 +0200526 old_start = post_start;
527 post_start = new_start;
528 post_ptr = new_start + (post_ptr - old_start);
529 post_end = post_start + new_max;
530 vim_free(old_start);
531 return OK;
532}
533
534/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200535 * Search between "start" and "end" and try to recognize a
536 * character class in expanded form. For example [0-9].
537 * On success, return the id the character class to be emitted.
538 * On failure, return 0 (=FAIL)
539 * Start points to the first char of the range, while end should point
540 * to the closing brace.
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200541 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
542 * need to be interpreted as [a-zA-Z].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200543 */
544 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100545nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200546{
Bram Moolenaarf8115092013-06-04 17:47:05 +0200547# define CLASS_not 0x80
548# define CLASS_af 0x40
549# define CLASS_AF 0x20
550# define CLASS_az 0x10
551# define CLASS_AZ 0x08
552# define CLASS_o7 0x04
553# define CLASS_o9 0x02
554# define CLASS_underscore 0x01
555
556 int newl = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200557 char_u *p;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200558 int config = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200559
560 if (extra_newl == TRUE)
561 newl = TRUE;
562
563 if (*end != ']')
564 return FAIL;
565 p = start;
566 if (*p == '^')
567 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200568 config |= CLASS_not;
Bram Moolenaar01d89dd2013-06-03 19:41:06 +0200569 p++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200570 }
571
572 while (p < end)
573 {
574 if (p + 2 < end && *(p + 1) == '-')
575 {
576 switch (*p)
577 {
578 case '0':
579 if (*(p + 2) == '9')
580 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200581 config |= CLASS_o9;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200582 break;
583 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200584 if (*(p + 2) == '7')
585 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200586 config |= CLASS_o7;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200587 break;
588 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200589 return FAIL;
590
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200591 case 'a':
592 if (*(p + 2) == 'z')
593 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200594 config |= CLASS_az;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200595 break;
596 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200597 if (*(p + 2) == 'f')
598 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200599 config |= CLASS_af;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200600 break;
601 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200602 return FAIL;
603
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200604 case 'A':
605 if (*(p + 2) == 'Z')
606 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200607 config |= CLASS_AZ;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200608 break;
609 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200610 if (*(p + 2) == 'F')
611 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200612 config |= CLASS_AF;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200613 break;
614 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200615 return FAIL;
616
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200617 default:
618 return FAIL;
619 }
620 p += 3;
621 }
622 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
623 {
624 newl = TRUE;
625 p += 2;
626 }
627 else if (*p == '_')
628 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200629 config |= CLASS_underscore;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200630 p ++;
631 }
632 else if (*p == '\n')
633 {
634 newl = TRUE;
635 p ++;
636 }
637 else
638 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100639 } // while (p < end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200640
641 if (p != end)
642 return FAIL;
643
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200644 if (newl == TRUE)
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200645 extra_newl = NFA_ADD_NL;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200646
647 switch (config)
648 {
649 case CLASS_o9:
650 return extra_newl + NFA_DIGIT;
651 case CLASS_not | CLASS_o9:
652 return extra_newl + NFA_NDIGIT;
653 case CLASS_af | CLASS_AF | CLASS_o9:
654 return extra_newl + NFA_HEX;
655 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
656 return extra_newl + NFA_NHEX;
657 case CLASS_o7:
658 return extra_newl + NFA_OCTAL;
659 case CLASS_not | CLASS_o7:
660 return extra_newl + NFA_NOCTAL;
661 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
662 return extra_newl + NFA_WORD;
663 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
664 return extra_newl + NFA_NWORD;
665 case CLASS_az | CLASS_AZ | CLASS_underscore:
666 return extra_newl + NFA_HEAD;
667 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
668 return extra_newl + NFA_NHEAD;
669 case CLASS_az | CLASS_AZ:
670 return extra_newl + NFA_ALPHA;
671 case CLASS_not | CLASS_az | CLASS_AZ:
672 return extra_newl + NFA_NALPHA;
673 case CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200674 return extra_newl + NFA_LOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200675 case CLASS_not | CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200676 return extra_newl + NFA_NLOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200677 case CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200678 return extra_newl + NFA_UPPER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200679 case CLASS_not | CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200680 return extra_newl + NFA_NUPPER_IC;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200681 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200682 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200683}
684
685/*
686 * Produce the bytes for equivalence class "c".
687 * Currently only handles latin1, latin9 and utf-8.
688 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
689 * equivalent to 'a OR b OR c'
690 *
691 * NOTE! When changing this function, also update reg_equi_class()
692 */
693 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100694nfa_emit_equi_class(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695{
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200696#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200697
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200698 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
699 || STRCMP(p_enc, "iso-8859-15") == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200700 {
Bram Moolenaar424bcae2022-01-31 14:59:41 +0000701#define A_grave 0xc0
702#define A_acute 0xc1
703#define A_circumflex 0xc2
704#define A_virguilla 0xc3
705#define A_diaeresis 0xc4
706#define A_ring 0xc5
707#define C_cedilla 0xc7
708#define E_grave 0xc8
709#define E_acute 0xc9
710#define E_circumflex 0xca
711#define E_diaeresis 0xcb
712#define I_grave 0xcc
713#define I_acute 0xcd
714#define I_circumflex 0xce
715#define I_diaeresis 0xcf
716#define N_virguilla 0xd1
717#define O_grave 0xd2
718#define O_acute 0xd3
719#define O_circumflex 0xd4
720#define O_virguilla 0xd5
721#define O_diaeresis 0xd6
722#define O_slash 0xd8
723#define U_grave 0xd9
724#define U_acute 0xda
725#define U_circumflex 0xdb
726#define U_diaeresis 0xdc
727#define Y_acute 0xdd
728#define a_grave 0xe0
729#define a_acute 0xe1
730#define a_circumflex 0xe2
731#define a_virguilla 0xe3
732#define a_diaeresis 0xe4
733#define a_ring 0xe5
734#define c_cedilla 0xe7
735#define e_grave 0xe8
736#define e_acute 0xe9
737#define e_circumflex 0xea
738#define e_diaeresis 0xeb
739#define i_grave 0xec
740#define i_acute 0xed
741#define i_circumflex 0xee
742#define i_diaeresis 0xef
743#define n_virguilla 0xf1
744#define o_grave 0xf2
745#define o_acute 0xf3
746#define o_circumflex 0xf4
747#define o_virguilla 0xf5
748#define o_diaeresis 0xf6
749#define o_slash 0xf8
750#define u_grave 0xf9
751#define u_acute 0xfa
752#define u_circumflex 0xfb
753#define u_diaeresis 0xfc
754#define y_acute 0xfd
755#define y_diaeresis 0xff
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200756 switch (c)
757 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200758 case 'A': case A_grave: case A_acute: case A_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200759 case A_virguilla: case A_diaeresis: case A_ring:
760 case 0x100: case 0x102: case 0x104: case 0x1cd:
761 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
762 case 0x202: case 0x226: case 0x23a: case 0x1e00:
763 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
764 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
765 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
766 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
767 EMIT2(A_circumflex) EMIT2(A_virguilla)
768 EMIT2(A_diaeresis) EMIT2(A_ring)
769 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
770 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
771 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
772 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
773 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
774 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
775 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
776 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200777 return OK;
778
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200779 case 'B': case 0x181: case 0x243: case 0x1e02:
780 case 0x1e04: case 0x1e06:
781 EMIT2('B')
782 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
783 EMIT2(0x1e04) EMIT2(0x1e06)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200784 return OK;
785
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200786 case 'C': case C_cedilla: case 0x106: case 0x108:
787 case 0x10a: case 0x10c: case 0x187: case 0x23b:
788 case 0x1e08: case 0xa792:
789 EMIT2('C') EMIT2(C_cedilla)
790 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
791 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
792 EMIT2(0x1e08) EMIT2(0xa792)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200793 return OK;
794
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200795 case 'D': case 0x10e: case 0x110: case 0x18a:
796 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
797 case 0x1e12:
798 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
799 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
800 EMIT2(0x1e10) EMIT2(0x1e12)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200801 return OK;
802
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200803 case 'E': case E_grave: case E_acute: case E_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200804 case E_diaeresis: case 0x112: case 0x114: case 0x116:
805 case 0x118: case 0x11a: case 0x204: case 0x206:
806 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
807 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
808 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
809 case 0x1ec2: case 0x1ec4: case 0x1ec6:
810 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
811 EMIT2(E_circumflex) EMIT2(E_diaeresis)
812 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
813 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
814 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
815 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
816 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
817 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
818 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
819 EMIT2(0x1ec6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200820 return OK;
821
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200822 case 'F': case 0x191: case 0x1e1e: case 0xa798:
823 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200824 return OK;
825
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200826 case 'G': case 0x11c: case 0x11e: case 0x120:
827 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
828 case 0x1f4: case 0x1e20: case 0xa7a0:
829 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
830 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
831 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
832 EMIT2(0xa7a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200833 return OK;
834
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200835 case 'H': case 0x124: case 0x126: case 0x21e:
836 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
837 case 0x1e2a: case 0x2c67:
838 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
839 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
840 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200841 return OK;
842
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200843 case 'I': case I_grave: case I_acute: case I_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200844 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
845 case 0x12e: case 0x130: case 0x197: case 0x1cf:
846 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
847 case 0x1ec8: case 0x1eca:
848 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
849 EMIT2(I_circumflex) EMIT2(I_diaeresis)
850 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
851 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
852 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
853 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
854 EMIT2(0x1eca)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200855 return OK;
856
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200857 case 'J': case 0x134: case 0x248:
858 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200859 return OK;
860
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200861 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
862 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
863 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
864 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
865 EMIT2(0x2c69) EMIT2(0xa740)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200866 return OK;
867
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200868 case 'L': case 0x139: case 0x13b: case 0x13d:
869 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
870 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
871 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
872 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
873 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
874 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200875 return OK;
876
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200877 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
878 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
879 EMIT2(0x1e42)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200880 return OK;
881
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200882 case 'N': case N_virguilla:
883 case 0x143: case 0x145: case 0x147: case 0x1f8:
884 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
885 case 0xa7a4:
886 EMIT2('N') EMIT2(N_virguilla)
887 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
888 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
889 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200890 return OK;
891
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200892 case 'O': case O_grave: case O_acute: case O_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200893 case O_virguilla: case O_diaeresis: case O_slash:
894 case 0x14c: case 0x14e: case 0x150: case 0x19f:
895 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
896 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
897 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
898 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
899 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
900 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
901 case 0x1ede: case 0x1ee0: case 0x1ee2:
902 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
903 EMIT2(O_circumflex) EMIT2(O_virguilla)
904 EMIT2(O_diaeresis) EMIT2(O_slash)
905 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
906 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
907 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
908 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
909 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
910 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
911 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
912 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
913 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
914 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
915 EMIT2(0x1ee2)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200916 return OK;
917
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200918 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
919 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
920 EMIT2(0x2c63)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200921 return OK;
922
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200923 case 'Q': case 0x24a:
924 EMIT2('Q') EMIT2(0x24a)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200925 return OK;
926
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200927 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
928 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
929 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
930 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
931 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
932 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
933 EMIT2(0xa7a6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200934 return OK;
935
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200936 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
937 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
938 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
939 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
940 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
941 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
942 EMIT2(0xa7a8)
943 return OK;
944
945 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
946 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
947 case 0x1e6e: case 0x1e70:
948 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
949 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
950 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200951 return OK;
952
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200953 case 'U': case U_grave: case U_acute: case U_diaeresis:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200954 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
955 case 0x16e: case 0x170: case 0x172: case 0x1af:
956 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
957 case 0x1db: case 0x214: case 0x216: case 0x244:
958 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
959 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
960 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
961 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
962 EMIT2(U_diaeresis) EMIT2(U_circumflex)
963 EMIT2(0x168) EMIT2(0x16a)
964 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
965 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
966 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
967 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
968 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
969 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
970 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
971 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
972 EMIT2(0x1ef0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200973 return OK;
974
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200975 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
976 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200977 return OK;
978
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200979 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
980 case 0x1e86: case 0x1e88:
981 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
982 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200983 return OK;
984
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 case 'X': case 0x1e8a: case 0x1e8c:
986 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200987 return OK;
988
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200989 case 'Y': case Y_acute: case 0x176: case 0x178:
990 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
991 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
992 EMIT2('Y') EMIT2(Y_acute)
993 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
994 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
995 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
996 EMIT2(0x1ef8)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200997 return OK;
998
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200999 case 'Z': case 0x179: case 0x17b: case 0x17d:
1000 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1001 case 0x2c6b:
1002 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1003 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1004 EMIT2(0x1e94) EMIT2(0x2c6b)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001005 return OK;
1006
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001007 case 'a': case a_grave: case a_acute: case a_circumflex:
1008 case a_virguilla: case a_diaeresis: case a_ring:
1009 case 0x101: case 0x103: case 0x105: case 0x1ce:
1010 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1011 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1012 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1013 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1014 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1015 case 0x1eb7: case 0x2c65:
1016 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1017 EMIT2(a_circumflex) EMIT2(a_virguilla)
1018 EMIT2(a_diaeresis) EMIT2(a_ring)
1019 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1020 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1021 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1022 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1023 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1024 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1025 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1026 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1027 EMIT2(0x1eb7) EMIT2(0x2c65)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001028 return OK;
1029
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001030 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1031 case 0x1e03: case 0x1e05: case 0x1e07:
1032 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1033 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001034 return OK;
1035
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001036 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1037 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1038 case 0xa794:
1039 EMIT2('c') EMIT2(c_cedilla)
1040 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1041 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1042 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001043 return OK;
1044
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001045 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1046 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1047 case 0x1e11: case 0x1e13:
1048 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1049 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1050 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1051 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001052 return OK;
1053
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001054 case 'e': case e_grave: case e_acute: case e_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001055 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1056 case 0x119: case 0x11b: case 0x205: case 0x207:
1057 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1058 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1059 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1060 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1061 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1062 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1063 EMIT2(0x113) EMIT2(0x115)
1064 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1065 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1066 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1067 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1068 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1069 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1070 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001071 return OK;
1072
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001073 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1074 case 0x1e1f: case 0xa799:
1075 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1076 EMIT2(0x1e1f) EMIT2(0xa799)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001077 return OK;
1078
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001079 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1080 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1081 case 0x1e21: case 0xa7a1:
1082 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1083 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1084 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1085 EMIT2(0x1e21) EMIT2(0xa7a1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001086 return OK;
1087
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001088 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1089 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1090 case 0x1e96: case 0x2c68: case 0xa795:
1091 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1092 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1093 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1094 EMIT2(0x2c68) EMIT2(0xa795)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001095 return OK;
1096
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001097 case 'i': case i_grave: case i_acute: case i_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001098 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1099 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1100 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1101 case 0x1ec9: case 0x1ecb:
1102 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1103 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1104 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1105 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1106 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1107 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1108 EMIT2(0x1ecb) EMIT2(0x1ecb)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001109 return OK;
1110
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001111 case 'j': case 0x135: case 0x1f0: case 0x249:
1112 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001113 return OK;
1114
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001115 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1116 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1117 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1118 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1119 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001120 return OK;
1121
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001122 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1123 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1124 case 0x1e3d: case 0x2c61:
1125 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1126 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1127 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1128 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001129 return OK;
1130
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001131 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1132 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1133 EMIT2(0x1e41) EMIT2(0x1e43)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001134 return OK;
1135
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001136 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1137 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1138 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1139 EMIT2('n') EMIT2(n_virguilla)
1140 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1141 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1142 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1143 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001144 return OK;
1145
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001146 case 'o': case o_grave: case o_acute: case o_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001147 case o_virguilla: case o_diaeresis: case o_slash:
1148 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1149 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1150 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1151 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1152 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1153 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1154 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1155 case 0x1edf: case 0x1ee1: case 0x1ee3:
1156 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1157 EMIT2(o_circumflex) EMIT2(o_virguilla)
1158 EMIT2(o_diaeresis) EMIT2(o_slash)
1159 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1160 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1161 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1162 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1163 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1164 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1165 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1166 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1167 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1168 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1169 EMIT2(0x1ee3)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001170 return OK;
1171
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001172 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1173 case 0x1e55: case 0x1e57:
1174 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1175 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001176 return OK;
1177
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001178 case 'q': case 0x24b: case 0x2a0:
1179 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001180 return OK;
1181
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001182 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1183 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1184 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1185 case 0xa7a7:
1186 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1187 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1188 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1189 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001190 return OK;
1191
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001192 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1193 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1194 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1195 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1196 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1197 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1198 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1199 return OK;
1200
1201 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1202 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1203 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1204 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1205 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1206 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1207 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001208 return OK;
1209
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001210 case 'u': case u_grave: case u_acute: case u_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001211 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1212 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1213 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1214 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1215 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1216 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1217 case 0x1eed: case 0x1eef: case 0x1ef1:
1218 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1219 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1220 EMIT2(0x169) EMIT2(0x16b)
1221 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1222 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1223 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1224 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1225 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1226 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1227 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1228 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1229 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001230 return OK;
1231
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001232 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1233 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1234 EMIT2(0x1e7f)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001235 return OK;
1236
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001237 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1238 case 0x1e87: case 0x1e89: case 0x1e98:
1239 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1240 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001241 return OK;
1242
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001243 case 'x': case 0x1e8b: case 0x1e8d:
1244 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001245 return OK;
1246
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001247 case 'y': case y_acute: case y_diaeresis: case 0x177:
1248 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1249 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1250 case 0x1ef9:
1251 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1252 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1253 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1254 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001255 return OK;
1256
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001257 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1258 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1259 case 0x1e95: case 0x2c6c:
1260 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1261 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1262 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001263 return OK;
1264
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001265 // default: character itself
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001266 }
1267 }
1268
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001269 EMIT2(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001270 return OK;
1271#undef EMIT2
1272}
1273
1274/*
1275 * Code to parse regular expression.
1276 *
1277 * We try to reuse parsing functions in regexp.c to
1278 * minimize surprise and keep the syntax consistent.
1279 */
1280
1281/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001282 * Parse the lowest level.
1283 *
1284 * An atom can be one of a long list of items. Many atoms match one character
1285 * in the text. It is often an ordinary character or a character class.
1286 * Parentheses can be used to make a pattern into an atom. The "\z(\)"
1287 * construct is only for syntax highlighting.
1288 *
1289 * atom ::= ordinary-atom
1290 * or \( pattern \)
1291 * or \%( pattern \)
1292 * or \z( pattern \)
1293 */
1294 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001295nfa_regatom(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001296{
1297 int c;
1298 int charclass;
1299 int equiclass;
1300 int collclass;
1301 int got_coll_char;
1302 char_u *p;
1303 char_u *endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001304 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001305 int extra = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001306 int emit_range;
1307 int negated;
1308 int result;
1309 int startc = -1;
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001310 int save_prev_at_start = prev_at_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001311
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001312 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001313 switch (c)
1314 {
Bram Moolenaar47196582013-05-25 22:04:23 +02001315 case NUL:
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001316 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar47196582013-05-25 22:04:23 +02001317
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001318 case Magic('^'):
1319 EMIT(NFA_BOL);
1320 break;
1321
1322 case Magic('$'):
1323 EMIT(NFA_EOL);
1324#if defined(FEAT_SYN_HL) || defined(PROTO)
1325 had_eol = TRUE;
1326#endif
1327 break;
1328
1329 case Magic('<'):
1330 EMIT(NFA_BOW);
1331 break;
1332
1333 case Magic('>'):
1334 EMIT(NFA_EOW);
1335 break;
1336
1337 case Magic('_'):
1338 c = no_Magic(getchr());
Bram Moolenaar174a8482013-11-28 14:20:17 +01001339 if (c == NUL)
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001340 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar174a8482013-11-28 14:20:17 +01001341
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001342 if (c == '^') // "\_^" is start-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001343 {
1344 EMIT(NFA_BOL);
1345 break;
1346 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001347 if (c == '$') // "\_$" is end-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001348 {
1349 EMIT(NFA_EOL);
1350#if defined(FEAT_SYN_HL) || defined(PROTO)
1351 had_eol = TRUE;
1352#endif
1353 break;
1354 }
1355
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001356 extra = NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001357
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001358 // "\_[" is collection plus newline
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001359 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001360 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001361
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001362 // "\_x" is character class plus newline
1363 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001364
1365 /*
1366 * Character classes.
1367 */
1368 case Magic('.'):
1369 case Magic('i'):
1370 case Magic('I'):
1371 case Magic('k'):
1372 case Magic('K'):
1373 case Magic('f'):
1374 case Magic('F'):
1375 case Magic('p'):
1376 case Magic('P'):
1377 case Magic('s'):
1378 case Magic('S'):
1379 case Magic('d'):
1380 case Magic('D'):
1381 case Magic('x'):
1382 case Magic('X'):
1383 case Magic('o'):
1384 case Magic('O'):
1385 case Magic('w'):
1386 case Magic('W'):
1387 case Magic('h'):
1388 case Magic('H'):
1389 case Magic('a'):
1390 case Magic('A'):
1391 case Magic('l'):
1392 case Magic('L'):
1393 case Magic('u'):
1394 case Magic('U'):
1395 p = vim_strchr(classchars, no_Magic(c));
1396 if (p == NULL)
1397 {
Bram Moolenaar174a8482013-11-28 14:20:17 +01001398 if (extra == NFA_ADD_NL)
1399 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001400 semsg(_(e_nfa_regexp_invalid_character_class_nr), c);
Bram Moolenaar174a8482013-11-28 14:20:17 +01001401 rc_did_emsg = TRUE;
1402 return FAIL;
1403 }
Bram Moolenaar097c5372023-05-24 21:02:24 +01001404 siemsg("Unknown character class char: %d", c);
Bram Moolenaar5714b802013-05-28 22:03:20 +02001405 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001406 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001407
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001408 // When '.' is followed by a composing char ignore the dot, so that
1409 // the composing char is matched here.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001410 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1411 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001412 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001413 c = getchr();
1414 goto nfa_do_multibyte;
1415 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001416 EMIT(nfa_classcodes[p - classchars]);
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001417 if (extra == NFA_ADD_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001418 {
1419 EMIT(NFA_NEWL);
1420 EMIT(NFA_OR);
1421 regflags |= RF_HASNL;
1422 }
1423 break;
1424
1425 case Magic('n'):
1426 if (reg_string)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001427 // In a string "\n" matches a newline character.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001428 EMIT(NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001429 else
1430 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001431 // In buffer text "\n" matches the end of a line.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001432 EMIT(NFA_NEWL);
1433 regflags |= RF_HASNL;
1434 }
1435 break;
1436
1437 case Magic('('):
1438 if (nfa_reg(REG_PAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001439 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001440 break;
1441
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001442 case Magic('|'):
1443 case Magic('&'):
1444 case Magic(')'):
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001445 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001446 return FAIL;
1447
1448 case Magic('='):
1449 case Magic('?'):
1450 case Magic('+'):
1451 case Magic('@'):
1452 case Magic('*'):
1453 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001454 // these should follow an atom, not form an atom
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001455 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001456 return FAIL;
1457
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001458 case Magic('~'):
1459 {
1460 char_u *lp;
1461
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001462 // Previous substitute pattern.
1463 // Generated as "\%(pattern\)".
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001464 if (reg_prev_sub == NULL)
1465 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001466 emsg(_(e_no_previous_substitute_regular_expression));
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001467 return FAIL;
1468 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001469 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001470 {
1471 EMIT(PTR2CHAR(lp));
1472 if (lp != reg_prev_sub)
1473 EMIT(NFA_CONCAT);
1474 }
1475 EMIT(NFA_NOPEN);
1476 break;
1477 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001478
Bram Moolenaar428e9872013-05-30 17:05:39 +02001479 case Magic('1'):
1480 case Magic('2'):
1481 case Magic('3'):
1482 case Magic('4'):
1483 case Magic('5'):
1484 case Magic('6'):
1485 case Magic('7'):
1486 case Magic('8'):
1487 case Magic('9'):
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001488 {
1489 int refnum = no_Magic(c) - '1';
1490
1491 if (!seen_endbrace(refnum + 1))
1492 return FAIL;
1493 EMIT(NFA_BACKREF1 + refnum);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001494 rex.nfa_has_backref = TRUE;
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001495 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02001496 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001497
1498 case Magic('z'):
1499 c = no_Magic(getchr());
1500 switch (c)
1501 {
1502 case 's':
1503 EMIT(NFA_ZSTART);
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001504 if (re_mult_next("\\zs") == FAIL)
1505 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001506 break;
1507 case 'e':
1508 EMIT(NFA_ZEND);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001509 rex.nfa_has_zend = TRUE;
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001510 if (re_mult_next("\\ze") == FAIL)
1511 return FAIL;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02001512 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001513#ifdef FEAT_SYN_HL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001514 case '1':
1515 case '2':
1516 case '3':
1517 case '4':
1518 case '5':
1519 case '6':
1520 case '7':
1521 case '8':
1522 case '9':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001523 // \z1...\z9
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001524 if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001525 EMSG_RET_FAIL(_(e_z1_z9_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001526 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001527 // No need to set rex.nfa_has_backref, the sub-matches don't
1528 // change when \z1 .. \z9 matches or not.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001529 re_has_z = REX_USE;
1530 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001531 case '(':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001532 // \z(
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001533 if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001534 EMSG_RET_FAIL(_(e_z_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001535 if (nfa_reg(REG_ZPAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001536 return FAIL; // cascaded error
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001537 re_has_z = REX_SET;
1538 break;
1539#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001540 default:
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001541 semsg(_(e_nfa_regexp_unknown_operator_z_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001542 return FAIL;
1543 }
1544 break;
1545
1546 case Magic('%'):
1547 c = no_Magic(getchr());
1548 switch (c)
1549 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001550 // () without a back reference
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001551 case '(':
1552 if (nfa_reg(REG_NPAREN) == FAIL)
1553 return FAIL;
1554 EMIT(NFA_NOPEN);
1555 break;
1556
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001557 case 'd': // %d123 decimal
1558 case 'o': // %o123 octal
1559 case 'x': // %xab hex 2
1560 case 'u': // %uabcd hex 4
1561 case 'U': // %U1234abcd hex 8
Bram Moolenaar47196582013-05-25 22:04:23 +02001562 {
Christian Brabandtf2b16982025-03-29 09:08:58 +01001563 vimlong_T nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001564
Bram Moolenaar47196582013-05-25 22:04:23 +02001565 switch (c)
1566 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001567 case 'd': nr = getdecchrs(); break;
1568 case 'o': nr = getoctchrs(); break;
1569 case 'x': nr = gethexchrs(2); break;
1570 case 'u': nr = gethexchrs(4); break;
1571 case 'U': nr = gethexchrs(8); break;
1572 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +02001573 }
1574
Bram Moolenaar527a2d82019-02-21 22:28:51 +01001575 if (nr < 0 || nr > INT_MAX)
Bram Moolenaara6f79292022-01-04 21:30:47 +00001576 EMSG2_RET_FAIL(_(e_invalid_character_after_str_2),
1577 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001578 // A NUL is stored in the text as NL
1579 // TODO: what if a composing character follows?
Christian Brabandtf2b16982025-03-29 09:08:58 +01001580 EMIT(nr == 0 ? 0x0a : (long)nr);
Bram Moolenaar47196582013-05-25 22:04:23 +02001581 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001582 break;
1583
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001584 // Catch \%^ and \%$ regardless of where they appear in the
1585 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001586 case '^':
1587 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001588 break;
1589
1590 case '$':
1591 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001592 break;
1593
1594 case '#':
Christian Brabandt360da402022-05-18 15:04:02 +01001595 if (regparse[0] == '=' && regparse[1] >= 48
1596 && regparse[1] <= 50)
1597 {
1598 // misplaced \%#=1
1599 semsg(_(e_atom_engine_must_be_at_start_of_pattern),
1600 regparse[1]);
1601 return FAIL;
1602 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001603 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001604 break;
1605
1606 case 'V':
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001607 EMIT(NFA_VISUAL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001608 break;
1609
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02001610 case 'C':
1611 EMIT(NFA_ANY_COMPOSING);
1612 break;
1613
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001614 case '[':
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001615 {
1616 int n;
1617
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001618 // \%[abc]
Bram Moolenaard7986252013-06-17 21:33:41 +02001619 for (n = 0; (c = peekchr()) != ']'; ++n)
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001620 {
1621 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001622 EMSG2_RET_FAIL(_(e_missing_sb_after_str),
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001623 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001624 // recursive call!
Bram Moolenaard7986252013-06-17 21:33:41 +02001625 if (nfa_regatom() == FAIL)
1626 return FAIL;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001627 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001628 getchr(); // get the ]
Bram Moolenaar2976c022013-06-05 21:30:37 +02001629 if (n == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001630 EMSG2_RET_FAIL(_(e_empty_str_brackets),
Bram Moolenaar2976c022013-06-05 21:30:37 +02001631 reg_magic == MAGIC_ALL);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001632 EMIT(NFA_OPT_CHARS);
1633 EMIT(n);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001634
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001635 // Emit as "\%(\%[abc]\)" to be able to handle
1636 // "\%[abc]*" which would cause the empty string to be
1637 // matched an unlimited number of times. NFA_NOPEN is
1638 // added only once at a position, while NFA_SPLIT is
1639 // added multiple times. This is more efficient than
1640 // not allowing NFA_SPLIT multiple times, it is used
1641 // a lot.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001642 EMIT(NFA_NOPEN);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001643 break;
1644 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02001645
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001646 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +02001647 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001648 long_u n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001649 int cmp = c;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001650 int cur = FALSE;
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001651 int got_digit = FALSE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001652
1653 if (c == '<' || c == '>')
1654 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001655 if (no_Magic(c) == '.')
1656 {
1657 cur = TRUE;
1658 c = getchr();
1659 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001660 while (VIM_ISDIGIT(c))
1661 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001662 long_u tmp;
1663
1664 if (cur)
Bram Moolenaarb10ff5c2022-03-19 11:31:38 +00001665 {
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001666 semsg(_(e_regexp_number_after_dot_pos_search_chr),
Bram Moolenaarb10ff5c2022-03-19 11:31:38 +00001667 no_Magic(c));
1668 return FAIL;
1669 }
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001670 tmp = n * 10 + (c - '0');
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001671
1672 if (tmp < n)
1673 {
1674 // overflow.
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001675 emsg(_(e_percent_value_too_large));
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001676 return FAIL;
1677 }
1678 n = tmp;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001679 c = getchr();
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001680 got_digit = TRUE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001681 }
1682 if (c == 'l' || c == 'c' || c == 'v')
1683 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001684 long_u limit = INT_MAX;
Bram Moolenaar9403a212019-02-13 18:35:06 +01001685
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001686 if (!cur && !got_digit)
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001687 {
1688 semsg(_(e_nfa_regexp_missing_value_in_chr),
1689 no_Magic(c));
1690 return FAIL;
1691 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001692 if (c == 'l')
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001693 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001694 if (cur)
1695 n = curwin->w_cursor.lnum;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001696 // \%{n}l \%{n}<l \%{n}>l
Bram Moolenaar423532e2013-05-29 21:14:42 +02001697 EMIT(cmp == '<' ? NFA_LNUM_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001698 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001699 if (save_prev_at_start)
1700 at_start = TRUE;
1701 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001702 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001703 {
1704 if (cur)
1705 {
1706 n = curwin->w_cursor.col;
1707 n++;
1708 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001709 // \%{n}c \%{n}<c \%{n}>c
Bram Moolenaar423532e2013-05-29 21:14:42 +02001710 EMIT(cmp == '<' ? NFA_COL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001711 cmp == '>' ? NFA_COL_GT : NFA_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001712 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001713 else
Bram Moolenaar9403a212019-02-13 18:35:06 +01001714 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001715 if (cur)
1716 {
1717 colnr_T vcol = 0;
1718
1719 getvvcol(curwin, &curwin->w_cursor,
1720 NULL, NULL, &vcol);
1721 n = ++vcol;
1722 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001723 // \%{n}v \%{n}<v \%{n}>v
Bram Moolenaar423532e2013-05-29 21:14:42 +02001724 EMIT(cmp == '<' ? NFA_VCOL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001725 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
Bram Moolenaar9403a212019-02-13 18:35:06 +01001726 limit = INT_MAX / MB_MAXBYTES;
1727 }
1728 if (n >= limit)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001729 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001730 emsg(_(e_percent_value_too_large));
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001731 return FAIL;
1732 }
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001733 EMIT((int)n);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001734 break;
1735 }
Julio B46fa3c72024-03-28 10:23:37 +01001736 else if (no_Magic(c) == '\'' && n == 0)
Bram Moolenaar044aa292013-06-04 21:27:38 +02001737 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001738 // \%'m \%<'m \%>'m
Bram Moolenaar044aa292013-06-04 21:27:38 +02001739 EMIT(cmp == '<' ? NFA_MARK_LT :
1740 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001741 EMIT(getchr());
Bram Moolenaar044aa292013-06-04 21:27:38 +02001742 break;
1743 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001744 }
Bram Moolenaarc96311b2022-11-25 21:13:47 +00001745 semsg(_(e_nfa_regexp_unknown_operator_percent_chr),
1746 no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001747 return FAIL;
1748 }
1749 break;
1750
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001751 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001752collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001753 /*
Bram Moolenaar417bad22013-06-07 14:08:30 +02001754 * [abc] uses NFA_START_COLL - NFA_END_COLL
1755 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1756 * Each character is produced as a regular state, using
1757 * NFA_CONCAT to bind them together.
1758 * Besides normal characters there can be:
1759 * - character classes NFA_CLASS_*
1760 * - ranges, two characters followed by NFA_RANGE.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001761 */
1762
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001763 p = regparse;
1764 endp = skip_anyof(p);
1765 if (*endp == ']')
1766 {
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01001767 int plen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001768 /*
1769 * Try to reverse engineer character classes. For example,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001770 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001771 * and perform the necessary substitutions in the NFA.
1772 */
1773 result = nfa_recognize_char_class(regparse, endp,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001774 extra == NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001775 if (result != FAIL)
1776 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001777 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001778 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001779 EMIT(result - NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001780 EMIT(NFA_NEWL);
1781 EMIT(NFA_OR);
1782 }
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001783 else
1784 EMIT(result);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001785 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001786 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001787 return OK;
1788 }
1789 /*
1790 * Failed to recognize a character class. Use the simple
1791 * version that turns [abc] into 'a' OR 'b' OR 'c'
1792 */
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001793 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001794 negated = FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001795 if (*regparse == '^') // negated range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001796 {
1797 negated = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001798 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001799 EMIT(NFA_START_NEG_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001800 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001801 else
1802 EMIT(NFA_START_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001803 if (*regparse == '-')
1804 {
1805 startc = '-';
1806 EMIT(startc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001807 EMIT(NFA_CONCAT);
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001808 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001809 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001810 // Emit the OR branches for each character in the []
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001811 emit_range = FALSE;
1812 while (regparse < endp)
1813 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001814 int oldstartc = startc;
1815
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001816 startc = -1;
1817 got_coll_char = FALSE;
1818 if (*regparse == '[')
1819 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001820 // Check for [: :], [= =], [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001821 equiclass = collclass = 0;
1822 charclass = get_char_class(&regparse);
1823 if (charclass == CLASS_NONE)
1824 {
1825 equiclass = get_equi_class(&regparse);
1826 if (equiclass == 0)
1827 collclass = get_coll_element(&regparse);
1828 }
1829
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001830 // Character class like [:alpha:]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001831 if (charclass != CLASS_NONE)
1832 {
1833 switch (charclass)
1834 {
1835 case CLASS_ALNUM:
1836 EMIT(NFA_CLASS_ALNUM);
1837 break;
1838 case CLASS_ALPHA:
1839 EMIT(NFA_CLASS_ALPHA);
1840 break;
1841 case CLASS_BLANK:
1842 EMIT(NFA_CLASS_BLANK);
1843 break;
1844 case CLASS_CNTRL:
1845 EMIT(NFA_CLASS_CNTRL);
1846 break;
1847 case CLASS_DIGIT:
1848 EMIT(NFA_CLASS_DIGIT);
1849 break;
1850 case CLASS_GRAPH:
1851 EMIT(NFA_CLASS_GRAPH);
1852 break;
1853 case CLASS_LOWER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001854 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001855 EMIT(NFA_CLASS_LOWER);
1856 break;
1857 case CLASS_PRINT:
1858 EMIT(NFA_CLASS_PRINT);
1859 break;
1860 case CLASS_PUNCT:
1861 EMIT(NFA_CLASS_PUNCT);
1862 break;
1863 case CLASS_SPACE:
1864 EMIT(NFA_CLASS_SPACE);
1865 break;
1866 case CLASS_UPPER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001867 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001868 EMIT(NFA_CLASS_UPPER);
1869 break;
1870 case CLASS_XDIGIT:
1871 EMIT(NFA_CLASS_XDIGIT);
1872 break;
1873 case CLASS_TAB:
1874 EMIT(NFA_CLASS_TAB);
1875 break;
1876 case CLASS_RETURN:
1877 EMIT(NFA_CLASS_RETURN);
1878 break;
1879 case CLASS_BACKSPACE:
1880 EMIT(NFA_CLASS_BACKSPACE);
1881 break;
1882 case CLASS_ESCAPE:
1883 EMIT(NFA_CLASS_ESCAPE);
1884 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001885 case CLASS_IDENT:
1886 EMIT(NFA_CLASS_IDENT);
1887 break;
1888 case CLASS_KEYWORD:
1889 EMIT(NFA_CLASS_KEYWORD);
1890 break;
1891 case CLASS_FNAME:
1892 EMIT(NFA_CLASS_FNAME);
1893 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001894 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001895 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001896 continue;
1897 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001898 // Try equivalence class [=a=] and the like
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001899 if (equiclass != 0)
1900 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02001901 result = nfa_emit_equi_class(equiclass);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001902 if (result == FAIL)
1903 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001904 // should never happen
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001905 EMSG_RET_FAIL(_(e_error_building_nfa_with_equivalence_class));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001906 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001907 continue;
1908 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001909 // Try collating class like [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001910 if (collclass != 0)
1911 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001912 startc = collclass; // allow [.a.]-x as a range
1913 // Will emit the proper atom at the end of the
1914 // while loop.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001915 }
1916 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001917 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1918 // start character.
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001919 if (*regparse == '-' && oldstartc != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001920 {
1921 emit_range = TRUE;
1922 startc = oldstartc;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001923 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001924 continue; // reading the end of the range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001925 }
1926
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001927 // Now handle simple and escaped characters.
1928 // Only "\]", "\^", "\-" and "\\" are special in Vi. Vim
1929 // accepts "\t", "\e", etc., but only when the 'l' flag in
1930 // 'cpoptions' is not included.
1931 // Posix doesn't recognize backslash at all.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001932 if (*regparse == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001933 && !reg_cpo_bsl
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001934 && regparse + 1 <= endp
1935 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001936 || (!reg_cpo_lit
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001937 && vim_strchr(REGEXP_ABBR, regparse[1])
1938 != NULL)
1939 )
1940 )
1941 {
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001942 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001943
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001944 if (*regparse == 'n')
Bram Moolenaara5483442019-02-17 20:17:02 +01001945 startc = (reg_string || emit_range
1946 || regparse[1] == '-') ? NL : NFA_NEWL;
Bram Moolenaarabab0b02019-03-30 18:47:01 +01001947 else if (*regparse == 'd'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001948 || *regparse == 'o'
1949 || *regparse == 'x'
1950 || *regparse == 'u'
1951 || *regparse == 'U'
1952 )
1953 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001954 // TODO(RE) This needs more testing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001955 startc = coll_get_char();
Christian Brabandtf2b16982025-03-29 09:08:58 +01001956 // max UTF-8 Codepoint is U+10FFFF,
1957 // but allow values until INT_MAX
1958 if (startc == INT_MAX)
1959 EMSG_RET_FAIL(_(e_unicode_val_too_large));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001960 got_coll_char = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001961 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001962 }
1963 else
1964 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001965 // \r,\t,\e,\b
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001966 startc = backslash_trans(*regparse);
1967 }
1968 }
1969
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001970 // Normal printable char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001971 if (startc == -1)
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001972 startc = PTR2CHAR(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001973
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001974 // Previous char was '-', so this char is end of range.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001975 if (emit_range)
1976 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001977 int endc = startc;
1978
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001979 startc = oldstartc;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001980 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001981 EMSG_RET_FAIL(_(e_reverse_range_in_character_class));
Bram Moolenaar417bad22013-06-07 14:08:30 +02001982
1983 if (endc > startc + 2)
1984 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001985 // Emit a range instead of the sequence of
1986 // individual characters.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001987 if (startc == 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001988 // \x00 is translated to \x0a, start at \x01.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001989 EMIT(1);
1990 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001991 --post_ptr; // remove NFA_CONCAT
Bram Moolenaar417bad22013-06-07 14:08:30 +02001992 EMIT(endc);
1993 EMIT(NFA_RANGE);
1994 EMIT(NFA_CONCAT);
1995 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001996 else if (has_mbyte && ((*mb_char2len)(startc) > 1
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001997 || (*mb_char2len)(endc) > 1))
1998 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001999 // Emit the characters in the range.
2000 // "startc" was already emitted, so skip it.
2001 //
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002002 for (c = startc + 1; c <= endc; c++)
2003 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002004 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002005 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002006 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002007 }
2008 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002009 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002010 // Emit the range. "startc" was already emitted, so
2011 // skip it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002012 for (c = startc + 1; c <= endc; c++)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00002013 {
2014 EMIT(c);
2015 EMIT(NFA_CONCAT);
2016 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002017 }
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002018 emit_range = FALSE;
2019 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002020 }
2021 else
2022 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002023 // This char (startc) is not part of a range. Just
2024 // emit it.
2025 // Normally, simply emit startc. But if we get char
2026 // code=0 from a collating char, then replace it with
2027 // 0x0a.
2028 // This is needed to completely mimic the behaviour of
2029 // the backtracking engine.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002030 if (startc == NFA_NEWL)
2031 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002032 // Line break can't be matched as part of the
2033 // collection, add an OR below. But not for negated
2034 // range.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002035 if (!negated)
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002036 extra = NFA_ADD_NL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002037 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002038 else
Bram Moolenaar417bad22013-06-07 14:08:30 +02002039 {
2040 if (got_coll_char == TRUE && startc == 0)
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01002041 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02002042 EMIT(0x0a);
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01002043 EMIT(NFA_CONCAT);
2044 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02002045 else
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01002046 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02002047 EMIT(startc);
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01002048 if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
2049 {
2050 EMIT(NFA_CONCAT);
2051 }
2052 }
Christian Brabandtca22fc32023-08-20 20:34:22 +02002053 }
Christian Brabandtca22fc32023-08-20 20:34:22 +02002054 }
Christian Brabandtbe07caa2023-08-20 22:26:15 +02002055
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01002056 if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
2057 {
2058 int i = utf_ptr2len(regparse);
2059
2060 c = utf_ptr2char(regparse + i);
2061
2062 // Add composing characters
2063 for (;;)
2064 {
2065 if (c == 0)
2066 // \x00 is translated to \x0a, start at \x01.
2067 EMIT(1);
2068 else
2069 EMIT(c);
2070 EMIT(NFA_CONCAT);
2071 if ((i += utf_char2len(c)) >= plen)
2072 break;
2073 c = utf_ptr2char(regparse + i);
2074 }
2075 EMIT(NFA_COMPOSING);
2076 EMIT(NFA_CONCAT);
2077 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002078 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002079 } // while (p < endp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002080
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002081 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002082 if (*regparse == '-') // if last, '-' is just a char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002083 {
2084 EMIT('-');
Bram Moolenaar417bad22013-06-07 14:08:30 +02002085 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002086 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002087
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002088 // skip the trailing ]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002089 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002090 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002091
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002092 // Mark end of the collection.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002093 if (negated == TRUE)
Bram Moolenaar417bad22013-06-07 14:08:30 +02002094 EMIT(NFA_END_NEG_COLL);
2095 else
2096 EMIT(NFA_END_COLL);
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002097
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002098 // \_[] also matches \n but it's not negated
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002099 if (extra == NFA_ADD_NL)
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002100 {
2101 EMIT(reg_string ? NL : NFA_NEWL);
2102 EMIT(NFA_OR);
2103 }
2104
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002105 return OK;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002106 } // if exists closing ]
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002107
2108 if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002109 EMSG_RET_FAIL(_(e_missing_rsb_after_str_lsb));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002110 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002111
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002112 default:
2113 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002114 int plen;
2115
2116nfa_do_multibyte:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002117 // plen is length of current char with composing chars
Bram Moolenaar47196582013-05-25 22:04:23 +02002118 if (enc_utf8 && ((*mb_char2len)(c)
Bram Moolenaarace95982017-03-29 17:30:27 +02002119 != (plen = utfc_ptr2len(old_regparse))
Bram Moolenaar47196582013-05-25 22:04:23 +02002120 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002121 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02002122 int i = 0;
2123
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002124 // A base character plus composing characters, or just one
2125 // or more composing characters.
2126 // This requires creating a separate atom as if enclosing
2127 // the characters in (), where NFA_COMPOSING is the ( and
2128 // NFA_END_COMPOSING is the ). Note that right now we are
2129 // building the postfix form, not the NFA itself;
2130 // a composing char could be: a, b, c, NFA_COMPOSING
2131 // where 'b' and 'c' are chars with codes > 256.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002132 for (;;)
2133 {
2134 EMIT(c);
2135 if (i > 0)
2136 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002137 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002138 break;
2139 c = utf_ptr2char(old_regparse + i);
2140 }
2141 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002142 regparse = old_regparse + plen;
2143 }
2144 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002145 {
2146 c = no_Magic(c);
2147 EMIT(c);
2148 }
2149 return OK;
2150 }
2151 }
2152
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002153 return OK;
2154}
2155
2156/*
2157 * Parse an atom followed by an optional multi (*, +, =, ?, @ or {).
2158 *
2159 * A piece is an atom, possibly followed by a multi, an indication of how many
2160 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2161 * characters: "", "a", "aa", etc.
2162 *
2163 * piece ::= atom
2164 * or atom multi
2165 */
2166 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002167nfa_regpiece(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002168{
2169 int i;
2170 int op;
2171 int ret;
2172 long minval, maxval;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002173 int greedy = TRUE; // Braces are prefixed with '-' ?
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002174 parse_state_T old_state;
2175 parse_state_T new_state;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01002176 long c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02002177 int old_post_pos;
2178 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002179 int quest;
2180
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002181 // Save the current parse state, so that we can use it if <atom>{m,n} is
2182 // next.
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002183 save_parse_state(&old_state);
2184
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002185 // store current pos in the postfix form, for \{m,n} involving 0s
Bram Moolenaar16299b52013-05-30 18:45:23 +02002186 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002187
2188 ret = nfa_regatom();
2189 if (ret == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002190 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002191
2192 op = peekchr();
2193 if (re_multi_type(op) == NOT_MULTI)
2194 return OK;
2195
2196 skipchr();
2197 switch (op)
2198 {
2199 case Magic('*'):
2200 EMIT(NFA_STAR);
2201 break;
2202
2203 case Magic('+'):
2204 /*
2205 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2206 * first and only submatch would be "aaa". But the backtracking
2207 * engine interprets the plus as "try matching one more time", and
2208 * a* matches a second time at the end of the input, the empty
2209 * string.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002210 * The submatch will be the empty string.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002211 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002212 * In order to be consistent with the old engine, we replace
2213 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002214 */
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002215 restore_parse_state(&old_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002216 curchr = -1;
2217 if (nfa_regatom() == FAIL)
2218 return FAIL;
2219 EMIT(NFA_STAR);
2220 EMIT(NFA_CONCAT);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002221 skipchr(); // skip the \+
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002222 break;
2223
2224 case Magic('@'):
Christian Brabandtf2b16982025-03-29 09:08:58 +01002225 c2 = (long)getdecchrs();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002226 op = no_Magic(getchr());
Bram Moolenaar61602c52013-06-01 19:54:43 +02002227 i = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002228 switch(op)
2229 {
2230 case '=':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002231 // \@=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002232 i = NFA_PREV_ATOM_NO_WIDTH;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002233 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002234 case '!':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002235 // \@!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002236 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002237 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002238 case '<':
Bram Moolenaar61602c52013-06-01 19:54:43 +02002239 op = no_Magic(getchr());
2240 if (op == '=')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002241 // \@<=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002242 i = NFA_PREV_ATOM_JUST_BEFORE;
2243 else if (op == '!')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002244 // \@<!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002245 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2246 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002247 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002248 // \@>
Bram Moolenaar87953742013-06-05 18:52:40 +02002249 i = NFA_PREV_ATOM_LIKE_PATTERN;
2250 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002251 }
Bram Moolenaar61602c52013-06-01 19:54:43 +02002252 if (i == 0)
2253 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002254 semsg(_(e_nfa_regexp_unknown_operator_at_chr), op);
Bram Moolenaar61602c52013-06-01 19:54:43 +02002255 return FAIL;
2256 }
2257 EMIT(i);
2258 if (i == NFA_PREV_ATOM_JUST_BEFORE
2259 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2260 EMIT(c2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002261 break;
2262
2263 case Magic('?'):
2264 case Magic('='):
2265 EMIT(NFA_QUEST);
2266 break;
2267
2268 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002269 // a{2,5} will expand to 'aaa?a?a?'
2270 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2271 // version of '?'
2272 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2273 // parenthesis have the same id
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002274
2275 greedy = TRUE;
2276 c2 = peekchr();
2277 if (c2 == '-' || c2 == Magic('-'))
2278 {
2279 skipchr();
2280 greedy = FALSE;
2281 }
2282 if (!read_limits(&minval, &maxval))
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002283 EMSG_RET_FAIL(_(e_nfa_regexp_error_reading_repetition_limits));
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002284
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002285 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2286 // <atom>*
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002287 if (minval == 0 && maxval == MAX_LIMIT)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002288 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002289 if (greedy) // { { (match the braces)
2290 // \{}, \{0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002291 EMIT(NFA_STAR);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002292 else // { { (match the braces)
2293 // \{-}, \{-0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002294 EMIT(NFA_STAR_NONGREEDY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002295 break;
2296 }
2297
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002298 // Special case: x{0} or x{-0}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002299 if (maxval == 0)
2300 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002301 // Ignore result of previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002302 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002303 // NFA_EMPTY is 0-length and works everywhere
Bram Moolenaar699c1202013-09-25 16:41:54 +02002304 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002305 return OK;
2306 }
2307
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002308 // The engine is very inefficient (uses too many states) when the
2309 // maximum is much larger than the minimum and when the maximum is
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002310 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2311 // will emit NFA_STAR.
2312 // Bail out if we can use the other engine, but only, when the
2313 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00002314 // does not work with characters > 8 bit with the BT engine)
Bram Moolenaara1d2c582015-02-10 18:18:17 +01002315 if ((nfa_re_flags & RE_AUTO)
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002316 && (maxval > 500 || maxval > minval + 200)
2317 && (maxval != MAX_LIMIT && minval < 200)
2318 && !wants_nfa)
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002319 return FAIL;
2320
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002321 // Ignore previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002322 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002323 // Save parse state after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002324 save_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002325
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002326 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2327 for (i = 0; i < maxval; i++)
2328 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002329 // Goto beginning of the repeated atom
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002330 restore_parse_state(&old_state);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002331 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002332 if (nfa_regatom() == FAIL)
2333 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002334 // after "minval" times, atoms are optional
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002335 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002336 {
2337 if (maxval == MAX_LIMIT)
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002338 {
2339 if (greedy)
2340 EMIT(NFA_STAR);
2341 else
2342 EMIT(NFA_STAR_NONGREEDY);
2343 }
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002344 else
2345 EMIT(quest);
2346 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02002347 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002348 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002349 if (i + 1 > minval && maxval == MAX_LIMIT)
2350 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002351 }
2352
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002353 // Go to just after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002354 restore_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002355 curchr = -1;
2356
2357 break;
2358
2359
2360 default:
2361 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002362 } // end switch
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002363
2364 if (re_multi_type(peekchr()) != NOT_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002365 // Can't have a multi follow a multi.
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002366 EMSG_RET_FAIL(_(e_nfa_regexp_cant_have_multi_follow_multi));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002367
2368 return OK;
2369}
2370
2371/*
2372 * Parse one or more pieces, concatenated. It matches a match for the
2373 * first piece, followed by a match for the second piece, etc. Example:
2374 * "f[0-9]b", first matches "f", then a digit and then "b".
2375 *
2376 * concat ::= piece
2377 * or piece piece
2378 * or piece piece piece
2379 * etc.
2380 */
2381 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002382nfa_regconcat(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002383{
2384 int cont = TRUE;
2385 int first = TRUE;
2386
2387 while (cont)
2388 {
2389 switch (peekchr())
2390 {
2391 case NUL:
2392 case Magic('|'):
2393 case Magic('&'):
2394 case Magic(')'):
2395 cont = FALSE;
2396 break;
2397
2398 case Magic('Z'):
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002399 regflags |= RF_ICOMBINE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002400 skipchr_keepstart();
2401 break;
2402 case Magic('c'):
2403 regflags |= RF_ICASE;
2404 skipchr_keepstart();
2405 break;
2406 case Magic('C'):
2407 regflags |= RF_NOICASE;
2408 skipchr_keepstart();
2409 break;
2410 case Magic('v'):
2411 reg_magic = MAGIC_ALL;
2412 skipchr_keepstart();
2413 curchr = -1;
2414 break;
2415 case Magic('m'):
2416 reg_magic = MAGIC_ON;
2417 skipchr_keepstart();
2418 curchr = -1;
2419 break;
2420 case Magic('M'):
2421 reg_magic = MAGIC_OFF;
2422 skipchr_keepstart();
2423 curchr = -1;
2424 break;
2425 case Magic('V'):
2426 reg_magic = MAGIC_NONE;
2427 skipchr_keepstart();
2428 curchr = -1;
2429 break;
2430
2431 default:
2432 if (nfa_regpiece() == FAIL)
2433 return FAIL;
2434 if (first == FALSE)
2435 EMIT(NFA_CONCAT);
2436 else
2437 first = FALSE;
2438 break;
2439 }
2440 }
2441
2442 return OK;
2443}
2444
2445/*
2446 * Parse a branch, one or more concats, separated by "\&". It matches the
2447 * last concat, but only if all the preceding concats also match at the same
2448 * position. Examples:
2449 * "foobeep\&..." matches "foo" in "foobeep".
2450 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2451 *
2452 * branch ::= concat
2453 * or concat \& concat
2454 * or concat \& concat \& concat
2455 * etc.
2456 */
2457 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002458nfa_regbranch(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002459{
Bram Moolenaar16299b52013-05-30 18:45:23 +02002460 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002461
Bram Moolenaar16299b52013-05-30 18:45:23 +02002462 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002463
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002464 // First branch, possibly the only one
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002465 if (nfa_regconcat() == FAIL)
2466 return FAIL;
2467
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002468 // Try next concats
Bram Moolenaar890dd052017-12-16 19:59:37 +01002469 while (peekchr() == Magic('&'))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002470 {
2471 skipchr();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002472 // if concat is empty do emit a node
Bram Moolenaar890dd052017-12-16 19:59:37 +01002473 if (old_post_pos == (int)(post_ptr - post_start))
2474 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002475 EMIT(NFA_NOPEN);
2476 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002477 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002478 if (nfa_regconcat() == FAIL)
2479 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002480 // if concat is empty do emit a node
Bram Moolenaar16299b52013-05-30 18:45:23 +02002481 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002482 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002483 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002484 }
2485
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002486 // if a branch is empty, emit one node for it
Bram Moolenaar16299b52013-05-30 18:45:23 +02002487 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002488 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002489
2490 return OK;
2491}
2492
2493/*
2494 * Parse a pattern, one or more branches, separated by "\|". It matches
2495 * anything that matches one of the branches. Example: "foo\|beep" matches
2496 * "foo" and matches "beep". If more than one branch matches, the first one
2497 * is used.
2498 *
2499 * pattern ::= branch
2500 * or branch \| branch
2501 * or branch \| branch \| branch
2502 * etc.
2503 */
2504 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002505nfa_reg(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002506 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002507{
2508 int parno = 0;
2509
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002510 if (paren == REG_PAREN)
2511 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002512 if (regnpar >= NSUBEXP) // Too many `('
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002513 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_parens));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002514 parno = regnpar++;
2515 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002516#ifdef FEAT_SYN_HL
2517 else if (paren == REG_ZPAREN)
2518 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002519 // Make a ZOPEN node.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002520 if (regnzpar >= NSUBEXP)
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002521 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_z));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002522 parno = regnzpar++;
2523 }
2524#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002525
2526 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002527 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002528
2529 while (peekchr() == Magic('|'))
2530 {
2531 skipchr();
2532 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002533 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002534 EMIT(NFA_OR);
2535 }
2536
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002537 // Check for proper termination.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002538 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2539 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002540 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002541 EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2542 reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002543 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002544 EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002545 }
2546 else if (paren == REG_NOPAREN && peekchr() != NUL)
2547 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002548 if (peekchr() == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002549 EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002550 else
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002551 EMSG_RET_FAIL(_(e_nfa_regexp_proper_termination_error));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002552 }
2553 /*
2554 * Here we set the flag allowing back references to this set of
2555 * parentheses.
2556 */
2557 if (paren == REG_PAREN)
2558 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002559 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002560 EMIT(NFA_MOPEN + parno);
2561 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002562#ifdef FEAT_SYN_HL
2563 else if (paren == REG_ZPAREN)
2564 EMIT(NFA_ZOPEN + parno);
2565#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002566
2567 return OK;
2568}
2569
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002570#ifdef DEBUG
2571static char_u code[50];
2572
2573 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002574nfa_set_code(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002575{
2576 int addnl = FALSE;
2577
2578 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2579 {
2580 addnl = TRUE;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002581 c -= NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002582 }
2583
2584 STRCPY(code, "");
2585 switch (c)
2586 {
2587 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2588 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2589 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2590 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2591 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2592 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2593
Bram Moolenaar5714b802013-05-28 22:03:20 +02002594 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2595 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2596 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2597 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2598 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2599 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2600 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2601 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2602 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002603#ifdef FEAT_SYN_HL
2604 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2605 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2606 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2607 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2608 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2609 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2610 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2611 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2612 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2613#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02002614 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2615
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002616 case NFA_PREV_ATOM_NO_WIDTH:
2617 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02002618 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2619 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002620 case NFA_PREV_ATOM_JUST_BEFORE:
2621 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2622 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2623 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002624 case NFA_PREV_ATOM_LIKE_PATTERN:
2625 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2626
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002627 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2628 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002629 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002630 case NFA_START_INVISIBLE_FIRST:
2631 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002632 case NFA_START_INVISIBLE_NEG:
2633 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002634 case NFA_START_INVISIBLE_NEG_FIRST:
2635 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002636 case NFA_START_INVISIBLE_BEFORE:
2637 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002638 case NFA_START_INVISIBLE_BEFORE_FIRST:
2639 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002640 case NFA_START_INVISIBLE_BEFORE_NEG:
2641 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002642 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2643 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002644 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002645 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002646 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002647 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002648
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002649 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2650 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02002651 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002652
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002653 case NFA_MOPEN:
2654 case NFA_MOPEN1:
2655 case NFA_MOPEN2:
2656 case NFA_MOPEN3:
2657 case NFA_MOPEN4:
2658 case NFA_MOPEN5:
2659 case NFA_MOPEN6:
2660 case NFA_MOPEN7:
2661 case NFA_MOPEN8:
2662 case NFA_MOPEN9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002663 STRCPY(code, "NFA_MOPEN(x)");
2664 code[10] = c - NFA_MOPEN + '0';
2665 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002666 case NFA_MCLOSE:
2667 case NFA_MCLOSE1:
2668 case NFA_MCLOSE2:
2669 case NFA_MCLOSE3:
2670 case NFA_MCLOSE4:
2671 case NFA_MCLOSE5:
2672 case NFA_MCLOSE6:
2673 case NFA_MCLOSE7:
2674 case NFA_MCLOSE8:
2675 case NFA_MCLOSE9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002676 STRCPY(code, "NFA_MCLOSE(x)");
2677 code[11] = c - NFA_MCLOSE + '0';
2678 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002679#ifdef FEAT_SYN_HL
2680 case NFA_ZOPEN:
2681 case NFA_ZOPEN1:
2682 case NFA_ZOPEN2:
2683 case NFA_ZOPEN3:
2684 case NFA_ZOPEN4:
2685 case NFA_ZOPEN5:
2686 case NFA_ZOPEN6:
2687 case NFA_ZOPEN7:
2688 case NFA_ZOPEN8:
2689 case NFA_ZOPEN9:
2690 STRCPY(code, "NFA_ZOPEN(x)");
2691 code[10] = c - NFA_ZOPEN + '0';
2692 break;
2693 case NFA_ZCLOSE:
2694 case NFA_ZCLOSE1:
2695 case NFA_ZCLOSE2:
2696 case NFA_ZCLOSE3:
2697 case NFA_ZCLOSE4:
2698 case NFA_ZCLOSE5:
2699 case NFA_ZCLOSE6:
2700 case NFA_ZCLOSE7:
2701 case NFA_ZCLOSE8:
2702 case NFA_ZCLOSE9:
2703 STRCPY(code, "NFA_ZCLOSE(x)");
2704 code[11] = c - NFA_ZCLOSE + '0';
2705 break;
2706#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002707 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2708 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2709 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2710 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02002711 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2712 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002713 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2714 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2715 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2716 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2717 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2718 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2719 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2720 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2721 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2722 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2723 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2724 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2725 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2726 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02002727 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002728
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002729 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002730 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2731 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2732 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
Bram Moolenaar699c1202013-09-25 16:41:54 +02002733 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002734 case NFA_OR: STRCPY(code, "NFA_OR"); break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002735
2736 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2737 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2738 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2739 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2740 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2741 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2742 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2743
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002744 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2745 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2746 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2747 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2748 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2749 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2750 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2751 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2752 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2753 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2754 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2755 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2756 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2757 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2758 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2759 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01002760 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2761 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2762 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002763
2764 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2765 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2766 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2767 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2768 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2769 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2770 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2771 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2772 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2773 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2774 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2775 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2776 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2777 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2778 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2779 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2780 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2781 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2782 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2783 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2784 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2785 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2786 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2787 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2788 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2789 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2790 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002791 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2792 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2793 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2794 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002795
2796 default:
2797 STRCPY(code, "CHAR(x)");
2798 code[5] = c;
2799 }
2800
2801 if (addnl == TRUE)
2802 STRCAT(code, " + NEWLINE ");
2803
2804}
2805
2806#ifdef ENABLE_LOG
2807static FILE *log_fd;
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002808static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002809
2810/*
2811 * Print the postfix notation of the current regexp.
2812 */
2813 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002814nfa_postfix_dump(char_u *expr, int retval)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002815{
2816 int *p;
2817 FILE *f;
2818
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002819 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002820 if (f == NULL)
2821 return;
2822
2823 fprintf(f, "\n-------------------------\n");
2824 if (retval == FAIL)
2825 fprintf(f, ">>> NFA engine failed... \n");
2826 else if (retval == OK)
2827 fprintf(f, ">>> NFA engine succeeded !\n");
2828 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
2829 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002830 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002831 nfa_set_code(*p);
2832 fprintf(f, "%s, ", code);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002833 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002834 fprintf(f, "\"\nPostfix notation (int): ");
2835 for (p = post_start; *p && p < post_ptr; p++)
2836 fprintf(f, "%d ", *p);
2837 fprintf(f, "\n\n");
2838 fclose(f);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002839}
2840
2841/*
2842 * Print the NFA starting with a root node "state".
2843 */
2844 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002845nfa_print_state(FILE *debugf, nfa_state_T *state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002846{
Bram Moolenaar152e7892013-05-25 12:28:11 +02002847 garray_T indent;
2848
2849 ga_init2(&indent, 1, 64);
2850 ga_append(&indent, '\0');
2851 nfa_print_state2(debugf, state, &indent);
2852 ga_clear(&indent);
2853}
2854
2855 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002856nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
Bram Moolenaar152e7892013-05-25 12:28:11 +02002857{
2858 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002859
2860 if (state == NULL)
2861 return;
2862
2863 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02002864
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002865 // Output indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002866 p = (char_u *)indent->ga_data;
2867 if (indent->ga_len >= 3)
2868 {
2869 int last = indent->ga_len - 3;
2870 char_u save[2];
2871
2872 STRNCPY(save, &p[last], 2);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00002873 memcpy(&p[last], "+-", 2);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002874 fprintf(debugf, " %s", p);
2875 STRNCPY(&p[last], save, 2);
2876 }
2877 else
2878 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002879
2880 nfa_set_code(state->c);
Bram Moolenaardecd9542013-06-07 16:31:50 +02002881 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
Bram Moolenaar417bad22013-06-07 14:08:30 +02002882 code,
2883 state->c,
2884 abs(state->id),
2885 state->val);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002886 if (state->id < 0)
2887 return;
2888
2889 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02002890
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002891 // grow indent for state->out
Bram Moolenaar152e7892013-05-25 12:28:11 +02002892 indent->ga_len -= 1;
2893 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002894 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002895 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002896 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002897 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002898
2899 nfa_print_state2(debugf, state->out, indent);
2900
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002901 // replace last part of indent for state->out1
Bram Moolenaar152e7892013-05-25 12:28:11 +02002902 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002903 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002904 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002905
2906 nfa_print_state2(debugf, state->out1, indent);
2907
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002908 // shrink indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002909 indent->ga_len -= 3;
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002910 ga_append(indent, NUL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002911}
2912
2913/*
2914 * Print the NFA state machine.
2915 */
2916 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002917nfa_dump(nfa_regprog_T *prog)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002918{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002919 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002920
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002921 if (debugf == NULL)
2922 return;
Bram Moolenaard89616e2013-06-06 18:46:06 +02002923
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002924 nfa_print_state(debugf, prog->start);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002925
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00002926 if (prog->reganch)
2927 fprintf(debugf, "reganch: %d\n", prog->reganch);
2928 if (prog->regstart != NUL)
2929 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2930 prog->regstart, prog->regstart);
2931 if (prog->match_text != NULL)
2932 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
2933
2934 fclose(debugf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002935}
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002936#endif // ENABLE_LOG
2937#endif // DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002938
2939/*
2940 * Parse r.e. @expr and convert it into postfix form.
2941 * Return the postfix string on success, NULL otherwise.
2942 */
2943 static int *
Bram Moolenaar05540972016-01-30 20:31:25 +01002944re2post(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002945{
2946 if (nfa_reg(REG_NOPAREN) == FAIL)
2947 return NULL;
2948 EMIT(NFA_MOPEN);
2949 return post_start;
2950}
2951
// NB. Some of the code below is inspired by Russ Cox's regexp implementation.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002953
2954/*
2955 * Represents an NFA state plus zero or one or two arrows exiting.
2956 * if c == MATCH, no arrows out; matching state.
2957 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2958 * If c < 256, labeled arrow with character c to out.
2959 */
2960
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002961static nfa_state_T *state_ptr; // points to nfa_prog->state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002962
2963/*
2964 * Allocate and initialize nfa_state_T.
2965 */
2966 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002967alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002968{
2969 nfa_state_T *s;
2970
2971 if (istate >= nstate)
2972 return NULL;
2973
2974 s = &state_ptr[istate++];
2975
2976 s->c = c;
2977 s->out = out;
2978 s->out1 = out1;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002979 s->val = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002980
2981 s->id = istate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02002982 s->lastlist[0] = 0;
2983 s->lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002984
2985 return s;
2986}
2987
2988/*
2989 * A partially built NFA without the matching state filled in.
2990 * Frag_T.start points at the start state.
2991 * Frag_T.out is a list of places that need to be set to the
2992 * next state for this fragment.
2993 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002994
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002995// Since the out pointers in the list are always
2996// uninitialized, we use the pointers themselves
2997// as storage for the Ptrlists.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002998typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002999union Ptrlist
3000{
3001 Ptrlist *next;
3002 nfa_state_T *s;
3003};
3004
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003005struct Frag
3006{
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003007 nfa_state_T *start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003008 Ptrlist *out;
3009};
3010typedef struct Frag Frag_T;
3011
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003012/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02003013 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003014 */
3015 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003016frag(nfa_state_T *start, Ptrlist *out)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003017{
Bram Moolenaar053bb602013-05-20 13:55:21 +02003018 Frag_T n;
3019
3020 n.start = start;
3021 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003022 return n;
3023}
3024
3025/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003026 * Create singleton list containing just outp.
3027 */
3028 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003029list1(
3030 nfa_state_T **outp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003031{
3032 Ptrlist *l;
3033
3034 l = (Ptrlist *)outp;
3035 l->next = NULL;
3036 return l;
3037}
3038
3039/*
3040 * Patch the list of states at out to point to start.
3041 */
3042 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003043patch(Ptrlist *l, nfa_state_T *s)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003044{
3045 Ptrlist *next;
3046
3047 for (; l; l = next)
3048 {
3049 next = l->next;
3050 l->s = s;
3051 }
3052}
3053
3054
3055/*
3056 * Join the two lists l1 and l2, returning the combination.
3057 */
3058 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003059append(Ptrlist *l1, Ptrlist *l2)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003060{
3061 Ptrlist *oldl1;
3062
3063 oldl1 = l1;
3064 while (l1->next)
3065 l1 = l1->next;
3066 l1->next = l2;
3067 return oldl1;
3068}
3069
3070/*
3071 * Stack used for transforming postfix form into NFA.
3072 */
3073static Frag_T empty;
3074
3075 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003076st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003077{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003078#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003079 FILE *df;
3080 int *p2;
3081
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003082 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003083 if (df)
3084 {
3085 fprintf(df, "Error popping the stack!\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003086# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003087 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003088# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003089 fprintf(df, "Postfix form is: ");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003090# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003091 for (p2 = postfix; p2 < end; p2++)
3092 {
3093 nfa_set_code(*p2);
3094 fprintf(df, "%s, ", code);
3095 }
3096 nfa_set_code(*p);
3097 fprintf(df, "\nCurrent position is: ");
3098 for (p2 = postfix; p2 <= p; p2 ++)
3099 {
3100 nfa_set_code(*p2);
3101 fprintf(df, "%s, ", code);
3102 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02003103# else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003104 for (p2 = postfix; p2 < end; p2++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003105 fprintf(df, "%d, ", *p2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003106 fprintf(df, "\nCurrent position is: ");
3107 for (p2 = postfix; p2 <= p; p2 ++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003108 fprintf(df, "%d, ", *p2);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003109# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003110 fprintf(df, "\n--------------------------\n");
3111 fclose(df);
3112 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003113#endif
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003114 emsg(_(e_nfa_regexp_could_not_pop_stack));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003115}
3116
3117/*
3118 * Push an item onto the stack.
3119 */
3120 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003121st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003122{
3123 Frag_T *stackp = *p;
3124
3125 if (stackp >= stack_end)
3126 return;
3127 *stackp = s;
3128 *p = *p + 1;
3129}
3130
3131/*
3132 * Pop an item from the stack.
3133 */
3134 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003135st_pop(Frag_T **p, Frag_T *stack)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003136{
3137 Frag_T *stackp;
3138
3139 *p = *p - 1;
3140 stackp = *p;
3141 if (stackp < stack)
3142 return empty;
3143 return **p;
3144}
3145
3146/*
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003147 * Estimate the maximum byte length of anything matching "state".
3148 * When unknown or unlimited return -1.
3149 */
3150 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01003151nfa_max_width(nfa_state_T *startstate, int depth)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003152{
3153 int l, r;
3154 nfa_state_T *state = startstate;
3155 int len = 0;
3156
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003157 // detect looping in a NFA_SPLIT
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003158 if (depth > 4)
3159 return -1;
3160
Bram Moolenaarfe70acb2013-06-21 18:31:23 +02003161 while (state != NULL)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003162 {
3163 switch (state->c)
3164 {
3165 case NFA_END_INVISIBLE:
3166 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003167 // the end, return what we have
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003168 return len;
3169
3170 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003171 // two alternatives, use the maximum
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003172 l = nfa_max_width(state->out, depth + 1);
3173 r = nfa_max_width(state->out1, depth + 1);
3174 if (l < 0 || r < 0)
3175 return -1;
3176 return len + (l > r ? l : r);
3177
3178 case NFA_ANY:
3179 case NFA_START_COLL:
3180 case NFA_START_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003181 // matches some character, including composing chars
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003182 if (enc_utf8)
3183 len += MB_MAXBYTES;
3184 else if (has_mbyte)
3185 len += 2;
3186 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003187 ++len;
3188 if (state->c != NFA_ANY)
3189 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003190 // skip over the characters
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003191 state = state->out1->out;
3192 continue;
3193 }
3194 break;
3195
3196 case NFA_DIGIT:
3197 case NFA_WHITE:
3198 case NFA_HEX:
3199 case NFA_OCTAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003200 // ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003201 ++len;
3202 break;
3203
3204 case NFA_IDENT:
3205 case NFA_SIDENT:
3206 case NFA_KWORD:
3207 case NFA_SKWORD:
3208 case NFA_FNAME:
3209 case NFA_SFNAME:
3210 case NFA_PRINT:
3211 case NFA_SPRINT:
3212 case NFA_NWHITE:
3213 case NFA_NDIGIT:
3214 case NFA_NHEX:
3215 case NFA_NOCTAL:
3216 case NFA_WORD:
3217 case NFA_NWORD:
3218 case NFA_HEAD:
3219 case NFA_NHEAD:
3220 case NFA_ALPHA:
3221 case NFA_NALPHA:
3222 case NFA_LOWER:
3223 case NFA_NLOWER:
3224 case NFA_UPPER:
3225 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02003226 case NFA_LOWER_IC:
3227 case NFA_NLOWER_IC:
3228 case NFA_UPPER_IC:
3229 case NFA_NUPPER_IC:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02003230 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003231 // possibly non-ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003232 if (has_mbyte)
3233 len += 3;
3234 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003235 ++len;
3236 break;
3237
3238 case NFA_START_INVISIBLE:
3239 case NFA_START_INVISIBLE_NEG:
3240 case NFA_START_INVISIBLE_BEFORE:
3241 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003242 // zero-width, out1 points to the END state
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003243 state = state->out1->out;
3244 continue;
3245
3246 case NFA_BACKREF1:
3247 case NFA_BACKREF2:
3248 case NFA_BACKREF3:
3249 case NFA_BACKREF4:
3250 case NFA_BACKREF5:
3251 case NFA_BACKREF6:
3252 case NFA_BACKREF7:
3253 case NFA_BACKREF8:
3254 case NFA_BACKREF9:
3255#ifdef FEAT_SYN_HL
3256 case NFA_ZREF1:
3257 case NFA_ZREF2:
3258 case NFA_ZREF3:
3259 case NFA_ZREF4:
3260 case NFA_ZREF5:
3261 case NFA_ZREF6:
3262 case NFA_ZREF7:
3263 case NFA_ZREF8:
3264 case NFA_ZREF9:
3265#endif
3266 case NFA_NEWL:
3267 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003268 // unknown width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003269 return -1;
3270
3271 case NFA_BOL:
3272 case NFA_EOL:
3273 case NFA_BOF:
3274 case NFA_EOF:
3275 case NFA_BOW:
3276 case NFA_EOW:
3277 case NFA_MOPEN:
3278 case NFA_MOPEN1:
3279 case NFA_MOPEN2:
3280 case NFA_MOPEN3:
3281 case NFA_MOPEN4:
3282 case NFA_MOPEN5:
3283 case NFA_MOPEN6:
3284 case NFA_MOPEN7:
3285 case NFA_MOPEN8:
3286 case NFA_MOPEN9:
3287#ifdef FEAT_SYN_HL
3288 case NFA_ZOPEN:
3289 case NFA_ZOPEN1:
3290 case NFA_ZOPEN2:
3291 case NFA_ZOPEN3:
3292 case NFA_ZOPEN4:
3293 case NFA_ZOPEN5:
3294 case NFA_ZOPEN6:
3295 case NFA_ZOPEN7:
3296 case NFA_ZOPEN8:
3297 case NFA_ZOPEN9:
3298 case NFA_ZCLOSE:
3299 case NFA_ZCLOSE1:
3300 case NFA_ZCLOSE2:
3301 case NFA_ZCLOSE3:
3302 case NFA_ZCLOSE4:
3303 case NFA_ZCLOSE5:
3304 case NFA_ZCLOSE6:
3305 case NFA_ZCLOSE7:
3306 case NFA_ZCLOSE8:
3307 case NFA_ZCLOSE9:
3308#endif
3309 case NFA_MCLOSE:
3310 case NFA_MCLOSE1:
3311 case NFA_MCLOSE2:
3312 case NFA_MCLOSE3:
3313 case NFA_MCLOSE4:
3314 case NFA_MCLOSE5:
3315 case NFA_MCLOSE6:
3316 case NFA_MCLOSE7:
3317 case NFA_MCLOSE8:
3318 case NFA_MCLOSE9:
3319 case NFA_NOPEN:
3320 case NFA_NCLOSE:
3321
3322 case NFA_LNUM_GT:
3323 case NFA_LNUM_LT:
3324 case NFA_COL_GT:
3325 case NFA_COL_LT:
3326 case NFA_VCOL_GT:
3327 case NFA_VCOL_LT:
3328 case NFA_MARK_GT:
3329 case NFA_MARK_LT:
3330 case NFA_VISUAL:
3331 case NFA_LNUM:
3332 case NFA_CURSOR:
3333 case NFA_COL:
3334 case NFA_VCOL:
3335 case NFA_MARK:
3336
3337 case NFA_ZSTART:
3338 case NFA_ZEND:
3339 case NFA_OPT_CHARS:
Bram Moolenaar699c1202013-09-25 16:41:54 +02003340 case NFA_EMPTY:
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003341 case NFA_START_PATTERN:
3342 case NFA_END_PATTERN:
3343 case NFA_COMPOSING:
3344 case NFA_END_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003345 // zero-width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003346 break;
3347
3348 default:
3349 if (state->c < 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003350 // don't know what this is
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003351 return -1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003352 // normal character
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003353 len += MB_CHAR2LEN(state->c);
3354 break;
3355 }
3356
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003357 // normal way to continue
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003358 state = state->out;
3359 }
3360
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003361 // unrecognized, "cannot happen"
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003362 return -1;
3363}
Bram Moolenaar1e02e662013-06-08 23:26:27 +02003364
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003365/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003366 * Convert a postfix form into its equivalent NFA.
3367 * Return the NFA start state on success, NULL otherwise.
3368 */
3369 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01003370post2nfa(int *postfix, int *end, int nfa_calc_size)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003371{
3372 int *p;
3373 int mopen;
3374 int mclose;
3375 Frag_T *stack = NULL;
3376 Frag_T *stackp = NULL;
3377 Frag_T *stack_end = NULL;
3378 Frag_T e1;
3379 Frag_T e2;
3380 Frag_T e;
3381 nfa_state_T *s;
3382 nfa_state_T *s1;
3383 nfa_state_T *matchstate;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003384 nfa_state_T *ret = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003385
3386 if (postfix == NULL)
3387 return NULL;
3388
Bram Moolenaar053bb602013-05-20 13:55:21 +02003389#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003390#define POP() st_pop(&stackp, stack); \
3391 if (stackp < stack) \
3392 { \
3393 st_error(postfix, end, p); \
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003394 vim_free(stack); \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003395 return NULL; \
3396 }
3397
3398 if (nfa_calc_size == FALSE)
3399 {
Bram Moolenaar32aa1022019-11-02 22:54:41 +01003400 // Allocate space for the stack. Max states on the stack: "nstate".
Bram Moolenaarc799fe22019-05-28 23:08:19 +02003401 stack = ALLOC_MULT(Frag_T, nstate + 1);
Bram Moolenaarc57463c2018-12-26 22:04:41 +01003402 if (stack == NULL)
3403 return NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003404 stackp = stack;
Bram Moolenaare3c7b862013-05-20 21:57:03 +02003405 stack_end = stack + (nstate + 1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003406 }
3407
3408 for (p = postfix; p < end; ++p)
3409 {
3410 switch (*p)
3411 {
3412 case NFA_CONCAT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003413 // Concatenation.
3414 // Pay attention: this operator does not exist in the r.e. itself
3415 // (it is implicit, really). It is added when r.e. is translated
3416 // to postfix form in re2post().
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003417 if (nfa_calc_size == TRUE)
3418 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003419 // nstate += 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003420 break;
3421 }
3422 e2 = POP();
3423 e1 = POP();
3424 patch(e1.out, e2.start);
3425 PUSH(frag(e1.start, e2.out));
3426 break;
3427
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003428 case NFA_OR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003429 // Alternation
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003430 if (nfa_calc_size == TRUE)
3431 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003432 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003433 break;
3434 }
3435 e2 = POP();
3436 e1 = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003437 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003438 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003439 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003440 PUSH(frag(s, append(e1.out, e2.out)));
3441 break;
3442
3443 case NFA_STAR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003444 // Zero or more, prefer more
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003445 if (nfa_calc_size == TRUE)
3446 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003447 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003448 break;
3449 }
3450 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003451 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003452 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003453 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003454 patch(e.out, s);
3455 PUSH(frag(s, list1(&s->out1)));
3456 break;
3457
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003458 case NFA_STAR_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003459 // Zero or more, prefer zero
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003460 if (nfa_calc_size == TRUE)
3461 {
3462 nstate++;
3463 break;
3464 }
3465 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003466 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003467 if (s == NULL)
3468 goto theend;
3469 patch(e.out, s);
3470 PUSH(frag(s, list1(&s->out)));
3471 break;
3472
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003473 case NFA_QUEST:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003474 // one or zero atoms=> greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003475 if (nfa_calc_size == TRUE)
3476 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003477 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003478 break;
3479 }
3480 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003481 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003482 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003483 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003484 PUSH(frag(s, append(e.out, list1(&s->out1))));
3485 break;
3486
3487 case NFA_QUEST_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003488 // zero or one atoms => non-greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003489 if (nfa_calc_size == TRUE)
3490 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003491 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003492 break;
3493 }
3494 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003495 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003496 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003497 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003498 PUSH(frag(s, append(e.out, list1(&s->out))));
3499 break;
3500
Bram Moolenaar417bad22013-06-07 14:08:30 +02003501 case NFA_END_COLL:
3502 case NFA_END_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003503 // On the stack is the sequence starting with NFA_START_COLL or
3504 // NFA_START_NEG_COLL and all possible characters. Patch it to
3505 // add the output to the start.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003506 if (nfa_calc_size == TRUE)
3507 {
3508 nstate++;
3509 break;
3510 }
3511 e = POP();
3512 s = alloc_state(NFA_END_COLL, NULL, NULL);
3513 if (s == NULL)
3514 goto theend;
3515 patch(e.out, s);
3516 e.start->out1 = s;
3517 PUSH(frag(e.start, list1(&s->out)));
3518 break;
3519
3520 case NFA_RANGE:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003521 // Before this are two characters, the low and high end of a
3522 // range. Turn them into two states with MIN and MAX.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003523 if (nfa_calc_size == TRUE)
3524 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003525 // nstate += 0;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003526 break;
3527 }
3528 e2 = POP();
3529 e1 = POP();
3530 e2.start->val = e2.start->c;
3531 e2.start->c = NFA_RANGE_MAX;
3532 e1.start->val = e1.start->c;
3533 e1.start->c = NFA_RANGE_MIN;
3534 patch(e1.out, e2.start);
3535 PUSH(frag(e1.start, e2.out));
3536 break;
3537
Bram Moolenaar699c1202013-09-25 16:41:54 +02003538 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003539 // 0-length, used in a repetition with max/min count of 0
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003540 if (nfa_calc_size == TRUE)
3541 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003542 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003543 break;
3544 }
Bram Moolenaar699c1202013-09-25 16:41:54 +02003545 s = alloc_state(NFA_EMPTY, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003546 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003547 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003548 PUSH(frag(s, list1(&s->out)));
3549 break;
3550
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003551 case NFA_OPT_CHARS:
3552 {
3553 int n;
3554
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003555 // \%[abc] implemented as:
3556 // NFA_SPLIT
3557 // +-CHAR(a)
3558 // | +-NFA_SPLIT
3559 // | +-CHAR(b)
3560 // | | +-NFA_SPLIT
3561 // | | +-CHAR(c)
3562 // | | | +-next
3563 // | | +- next
3564 // | +- next
3565 // +- next
3566 n = *++p; // get number of characters
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003567 if (nfa_calc_size == TRUE)
3568 {
3569 nstate += n;
3570 break;
3571 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003572 s = NULL; // avoid compiler warning
3573 e1.out = NULL; // stores list with out1's
3574 s1 = NULL; // previous NFA_SPLIT to connect to
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003575 while (n-- > 0)
3576 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003577 e = POP(); // get character
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003578 s = alloc_state(NFA_SPLIT, e.start, NULL);
3579 if (s == NULL)
3580 goto theend;
3581 if (e1.out == NULL)
3582 e1 = e;
3583 patch(e.out, s1);
3584 append(e1.out, list1(&s->out1));
3585 s1 = s;
3586 }
3587 PUSH(frag(s, e1.out));
3588 break;
3589 }
3590
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003591 case NFA_PREV_ATOM_NO_WIDTH:
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003592 case NFA_PREV_ATOM_NO_WIDTH_NEG:
Bram Moolenaar61602c52013-06-01 19:54:43 +02003593 case NFA_PREV_ATOM_JUST_BEFORE:
3594 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02003595 case NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003596 {
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003597 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3598 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
Bram Moolenaar87953742013-06-05 18:52:40 +02003599 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
Bram Moolenaardecd9542013-06-07 16:31:50 +02003600 int start_state;
3601 int end_state;
Bram Moolenaar87953742013-06-05 18:52:40 +02003602 int n = 0;
3603 nfa_state_T *zend;
3604 nfa_state_T *skip;
3605
Bram Moolenaardecd9542013-06-07 16:31:50 +02003606 switch (*p)
Bram Moolenaar87953742013-06-05 18:52:40 +02003607 {
Bram Moolenaardecd9542013-06-07 16:31:50 +02003608 case NFA_PREV_ATOM_NO_WIDTH:
3609 start_state = NFA_START_INVISIBLE;
3610 end_state = NFA_END_INVISIBLE;
3611 break;
3612 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3613 start_state = NFA_START_INVISIBLE_NEG;
3614 end_state = NFA_END_INVISIBLE_NEG;
3615 break;
3616 case NFA_PREV_ATOM_JUST_BEFORE:
3617 start_state = NFA_START_INVISIBLE_BEFORE;
3618 end_state = NFA_END_INVISIBLE;
3619 break;
3620 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3621 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3622 end_state = NFA_END_INVISIBLE_NEG;
3623 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003624 default: // NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaardecd9542013-06-07 16:31:50 +02003625 start_state = NFA_START_PATTERN;
3626 end_state = NFA_END_PATTERN;
3627 break;
Bram Moolenaar87953742013-06-05 18:52:40 +02003628 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003629
3630 if (before)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003631 n = *++p; // get the count
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003632
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003633 // The \@= operator: match the preceding atom with zero width.
3634 // The \@! operator: no match for the preceding atom.
3635 // The \@<= operator: match for the preceding atom.
3636 // The \@<! operator: no match for the preceding atom.
3637 // Surrounds the preceding atom with START_INVISIBLE and
3638 // END_INVISIBLE, similarly to MOPEN.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003639
3640 if (nfa_calc_size == TRUE)
3641 {
Bram Moolenaar87953742013-06-05 18:52:40 +02003642 nstate += pattern ? 4 : 2;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003643 break;
3644 }
3645 e = POP();
Bram Moolenaar87953742013-06-05 18:52:40 +02003646 s1 = alloc_state(end_state, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003647 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003648 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003649
Bram Moolenaar87953742013-06-05 18:52:40 +02003650 s = alloc_state(start_state, e.start, s1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003651 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003652 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003653 if (pattern)
3654 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003655 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02003656 skip = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003657 if (skip == NULL)
3658 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003659 zend = alloc_state(NFA_ZEND, s1, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003660 if (zend == NULL)
3661 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003662 s1->out= skip;
3663 patch(e.out, zend);
3664 PUSH(frag(s, list1(&skip->out)));
Bram Moolenaar61602c52013-06-01 19:54:43 +02003665 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003666 else
3667 {
3668 patch(e.out, s1);
3669 PUSH(frag(s, list1(&s1->out)));
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003670 if (before)
3671 {
3672 if (n <= 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003673 // See if we can guess the maximum width, it avoids a
3674 // lot of pointless tries.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003675 n = nfa_max_width(e.start, 0);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003676 s->val = n; // store the count
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003677 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003678 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003679 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003680 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003681
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003682 case NFA_COMPOSING: // char with composing char
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003683#if 0
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003684 // TODO
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003685 if (regflags & RF_ICOMBINE)
3686 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003687 // use the base character only
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003688 }
3689#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003690 // FALLTHROUGH
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003691
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003692 case NFA_MOPEN: // \( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003693 case NFA_MOPEN1:
3694 case NFA_MOPEN2:
3695 case NFA_MOPEN3:
3696 case NFA_MOPEN4:
3697 case NFA_MOPEN5:
3698 case NFA_MOPEN6:
3699 case NFA_MOPEN7:
3700 case NFA_MOPEN8:
3701 case NFA_MOPEN9:
3702#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003703 case NFA_ZOPEN: // \z( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003704 case NFA_ZOPEN1:
3705 case NFA_ZOPEN2:
3706 case NFA_ZOPEN3:
3707 case NFA_ZOPEN4:
3708 case NFA_ZOPEN5:
3709 case NFA_ZOPEN6:
3710 case NFA_ZOPEN7:
3711 case NFA_ZOPEN8:
3712 case NFA_ZOPEN9:
3713#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003714 case NFA_NOPEN: // \%( \) "Invisible Submatch"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003715 if (nfa_calc_size == TRUE)
3716 {
3717 nstate += 2;
3718 break;
3719 }
3720
3721 mopen = *p;
3722 switch (*p)
3723 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003724 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3725#ifdef FEAT_SYN_HL
3726 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3727 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3728 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3729 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3730 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3731 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3732 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3733 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3734 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3735 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3736#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003737 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003738 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003739 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003740 mclose = *p + NSUBEXP;
3741 break;
3742 }
3743
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003744 // Allow "NFA_MOPEN" as a valid postfix representation for
3745 // the empty regexp "". In this case, the NFA will be
3746 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3747 // empty groups of parenthesis, and empty mbyte chars
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003748 if (stackp == stack)
3749 {
Bram Moolenaar525666f2013-06-02 16:40:55 +02003750 s = alloc_state(mopen, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003751 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003752 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003753 s1 = alloc_state(mclose, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003754 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003755 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003756 patch(list1(&s->out), s1);
3757 PUSH(frag(s, list1(&s1->out)));
3758 break;
3759 }
3760
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003761 // At least one node was emitted before NFA_MOPEN, so
3762 // at least one node will be between NFA_MOPEN and NFA_MCLOSE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003763 e = POP();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003764 s = alloc_state(mopen, e.start, NULL); // `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003765 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003766 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003767
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003768 s1 = alloc_state(mclose, NULL, NULL); // `)'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003769 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003770 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003771 patch(e.out, s1);
3772
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003773 if (mopen == NFA_COMPOSING)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003774 // COMPOSING->out1 = END_COMPOSING
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003775 patch(list1(&s->out1), s1);
3776
3777 PUSH(frag(s, list1(&s1->out)));
3778 break;
3779
Bram Moolenaar5714b802013-05-28 22:03:20 +02003780 case NFA_BACKREF1:
3781 case NFA_BACKREF2:
3782 case NFA_BACKREF3:
3783 case NFA_BACKREF4:
3784 case NFA_BACKREF5:
3785 case NFA_BACKREF6:
3786 case NFA_BACKREF7:
3787 case NFA_BACKREF8:
3788 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003789#ifdef FEAT_SYN_HL
3790 case NFA_ZREF1:
3791 case NFA_ZREF2:
3792 case NFA_ZREF3:
3793 case NFA_ZREF4:
3794 case NFA_ZREF5:
3795 case NFA_ZREF6:
3796 case NFA_ZREF7:
3797 case NFA_ZREF8:
3798 case NFA_ZREF9:
3799#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02003800 if (nfa_calc_size == TRUE)
3801 {
3802 nstate += 2;
3803 break;
3804 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003805 s = alloc_state(*p, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003806 if (s == NULL)
3807 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003808 s1 = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003809 if (s1 == NULL)
3810 goto theend;
3811 patch(list1(&s->out), s1);
3812 PUSH(frag(s, list1(&s1->out)));
3813 break;
3814
Bram Moolenaar423532e2013-05-29 21:14:42 +02003815 case NFA_LNUM:
3816 case NFA_LNUM_GT:
3817 case NFA_LNUM_LT:
3818 case NFA_VCOL:
3819 case NFA_VCOL_GT:
3820 case NFA_VCOL_LT:
3821 case NFA_COL:
3822 case NFA_COL_GT:
3823 case NFA_COL_LT:
Bram Moolenaar044aa292013-06-04 21:27:38 +02003824 case NFA_MARK:
3825 case NFA_MARK_GT:
3826 case NFA_MARK_LT:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003827 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003828 int n = *++p; // lnum, col or mark name
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003829
Bram Moolenaar423532e2013-05-29 21:14:42 +02003830 if (nfa_calc_size == TRUE)
3831 {
3832 nstate += 1;
3833 break;
3834 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003835 s = alloc_state(p[-1], NULL, NULL);
Bram Moolenaar423532e2013-05-29 21:14:42 +02003836 if (s == NULL)
3837 goto theend;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003838 s->val = n;
Bram Moolenaar423532e2013-05-29 21:14:42 +02003839 PUSH(frag(s, list1(&s->out)));
3840 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003841 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02003842
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003843 case NFA_ZSTART:
3844 case NFA_ZEND:
3845 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003846 // Operands
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003847 if (nfa_calc_size == TRUE)
3848 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003849 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003850 break;
3851 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003852 s = alloc_state(*p, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003853 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003854 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003855 PUSH(frag(s, list1(&s->out)));
3856 break;
3857
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003858 } // switch(*p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003859
Bram Moolenaarc9471b12023-05-09 15:00:00 +01003860 } // for (p = postfix; *p; ++p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003861
3862 if (nfa_calc_size == TRUE)
3863 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003864 nstate++;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003865 goto theend; // Return value when counting size is ignored anyway
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003866 }
3867
3868 e = POP();
3869 if (stackp != stack)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003870 {
3871 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003872 EMSG_RET_NULL(_(e_nfa_regexp_while_converting_from_postfix_to_nfa_too_many_stats_left_on_stack));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003873 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003874
3875 if (istate >= nstate)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003876 {
3877 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003878 EMSG_RET_NULL(_(e_nfa_regexp_not_enough_space_to_store_whole_nfa));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003879 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003880
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003881 matchstate = &state_ptr[istate++]; // the match state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003882 matchstate->c = NFA_MATCH;
3883 matchstate->out = matchstate->out1 = NULL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003884 matchstate->id = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003885
3886 patch(e.out, matchstate);
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003887 ret = e.start;
3888
3889theend:
3890 vim_free(stack);
3891 return ret;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003892
3893#undef POP1
3894#undef PUSH1
3895#undef POP2
3896#undef PUSH2
3897#undef POP
3898#undef PUSH
3899}
3900
Bram Moolenaara2947e22013-06-11 22:44:09 +02003901/*
3902 * After building the NFA program, inspect it to add optimization hints.
3903 */
3904 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003905nfa_postprocess(nfa_regprog_T *prog)
Bram Moolenaara2947e22013-06-11 22:44:09 +02003906{
3907 int i;
3908 int c;
3909
3910 for (i = 0; i < prog->nstate; ++i)
3911 {
3912 c = prog->state[i].c;
3913 if (c == NFA_START_INVISIBLE
3914 || c == NFA_START_INVISIBLE_NEG
3915 || c == NFA_START_INVISIBLE_BEFORE
3916 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3917 {
3918 int directly;
3919
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003920 // Do it directly when what follows is possibly the end of the
3921 // match.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003922 if (match_follows(prog->state[i].out1->out, 0))
3923 directly = TRUE;
3924 else
3925 {
3926 int ch_invisible = failure_chance(prog->state[i].out, 0);
3927 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3928
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003929 // Postpone when the invisible match is expensive or has a
3930 // lower chance of failing.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003931 if (c == NFA_START_INVISIBLE_BEFORE
3932 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3933 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003934 // "before" matches are very expensive when
3935 // unbounded, always prefer what follows then,
3936 // unless what follows will always match.
3937 // Otherwise strongly prefer what follows.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003938 if (prog->state[i].val <= 0 && ch_follows > 0)
3939 directly = FALSE;
3940 else
3941 directly = ch_follows * 10 < ch_invisible;
3942 }
3943 else
3944 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003945 // normal invisible, first do the one with the
3946 // highest failure chance
Bram Moolenaara2947e22013-06-11 22:44:09 +02003947 directly = ch_follows < ch_invisible;
3948 }
3949 }
3950 if (directly)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003951 // switch to the _FIRST state
Bram Moolenaara2947e22013-06-11 22:44:09 +02003952 ++prog->state[i].c;
3953 }
3954 }
3955}
3956
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003957/////////////////////////////////////////////////////////////////
3958// NFA execution code.
3959/////////////////////////////////////////////////////////////////
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003960
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003961typedef struct
3962{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003963 int in_use; // number of subexpr with useful info
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003964
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003965 // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003966 union
3967 {
3968 struct multipos
3969 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01003970 linenr_T start_lnum;
3971 linenr_T end_lnum;
3972 colnr_T start_col;
3973 colnr_T end_col;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003974 } multi[NSUBEXP];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003975 struct linepos
3976 {
3977 char_u *start;
3978 char_u *end;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003979 } line[NSUBEXP];
3980 } list;
Bram Moolenaar79336e12022-12-11 14:18:31 +00003981 colnr_T orig_start_col; // list.multi[0].start_col without \zs
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003982} regsub_T;
3983
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003984typedef struct
3985{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003986 regsub_T norm; // \( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003987#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003988 regsub_T synt; // \z( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003989#endif
3990} regsubs_T;
3991
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003992// nfa_pim_T stores a Postponed Invisible Match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02003993typedef struct nfa_pim_S nfa_pim_T;
3994struct nfa_pim_S
3995{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003996 int result; // NFA_PIM_*, see below
3997 nfa_state_T *state; // the invisible match start state
3998 regsubs_T subs; // submatch info, only party used
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02003999 union
4000 {
4001 lpos_T pos;
4002 char_u *ptr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004003 } end; // where the match must end
Bram Moolenaara2d95102013-06-04 14:23:05 +02004004};
4005
// Values for "result" in nfa_pim_T.
#define NFA_PIM_UNUSED   0	// pim not used
#define NFA_PIM_TODO     1	// pim not done yet
#define NFA_PIM_MATCH    2	// pim executed, matches
#define NFA_PIM_NOMATCH  3	// pim executed, no match
Bram Moolenaara2d95102013-06-04 14:23:05 +02004011
4012
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004013// nfa_thread_T contains execution information of a NFA state
Bram Moolenaar4b417062013-05-25 20:19:50 +02004014typedef struct
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004015{
4016 nfa_state_T *state;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004017 int count;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004018 nfa_pim_T pim; // if pim.result != NFA_PIM_UNUSED: postponed
4019 // invisible match
4020 regsubs_T subs; // submatch info, only party used
Bram Moolenaar4b417062013-05-25 20:19:50 +02004021} nfa_thread_T;
4022
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004023// nfa_list_T contains the alternative NFA execution states.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004024typedef struct
4025{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004026 nfa_thread_T *t; // allocated array of states
4027 int n; // nr of states currently in "t"
4028 int len; // max nr of states in "t"
4029 int id; // ID of the list
4030 int has_pim; // TRUE when any state has a PIM
Bram Moolenaar4b417062013-05-25 20:19:50 +02004031} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004032
Bram Moolenaar5714b802013-05-28 22:03:20 +02004033#ifdef ENABLE_LOG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004034static void log_subexpr(regsub_T *sub);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004035
4036 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004037log_subsexpr(regsubs_T *subs)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004038{
4039 log_subexpr(&subs->norm);
4040# ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004041 if (rex.nfa_has_zsubexpr)
Bram Moolenaar6d3a5d72013-06-06 18:04:51 +02004042 log_subexpr(&subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004043# endif
4044}
4045
Bram Moolenaar5714b802013-05-28 22:03:20 +02004046 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004047log_subexpr(regsub_T *sub)
Bram Moolenaar5714b802013-05-28 22:03:20 +02004048{
4049 int j;
4050
4051 for (j = 0; j < sub->in_use; j++)
4052 if (REG_MULTI)
Bram Moolenaarc96311b2022-11-25 21:13:47 +00004053 fprintf(log_fd,
4054 "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004055 j,
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004056 sub->list.multi[j].start_col,
4057 (int)sub->list.multi[j].start_lnum,
4058 sub->list.multi[j].end_col,
4059 (int)sub->list.multi[j].end_lnum);
Bram Moolenaar5714b802013-05-28 22:03:20 +02004060 else
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004061 {
4062 char *s = (char *)sub->list.line[j].start;
4063 char *e = (char *)sub->list.line[j].end;
4064
Bram Moolenaar87953742013-06-05 18:52:40 +02004065 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004066 j,
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004067 s == NULL ? "NULL" : s,
4068 e == NULL ? "NULL" : e);
4069 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004070}
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004071
4072 static char *
Bram Moolenaar05540972016-01-30 20:31:25 +01004073pim_info(nfa_pim_T *pim)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004074{
4075 static char buf[30];
4076
4077 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4078 buf[0] = NUL;
4079 else
4080 {
4081 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
Bram Moolenaar0270f382018-07-17 05:43:58 +02004082 : (int)(pim->end.ptr - rex.input));
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004083 }
4084 return buf;
4085}
4086
Bram Moolenaar5714b802013-05-28 22:03:20 +02004087#endif
4088
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004089// Used during execution: whether a match has been found.
Bram Moolenaar2338c322018-07-08 19:07:19 +02004090static int nfa_match;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004091#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02004092static int *nfa_timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004093#endif
Bram Moolenaar4b417062013-05-25 20:19:50 +02004094
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004095static void copy_sub(regsub_T *to, regsub_T *from);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004096static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004097
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004098/*
4099 * Copy postponed invisible match info from "from" to "to".
4100 */
4101 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004102copy_pim(nfa_pim_T *to, nfa_pim_T *from)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004103{
4104 to->result = from->result;
4105 to->state = from->state;
4106 copy_sub(&to->subs.norm, &from->subs.norm);
4107#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004108 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004109 copy_sub(&to->subs.synt, &from->subs.synt);
4110#endif
4111 to->end = from->end;
4112}
4113
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004114 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004115clear_sub(regsub_T *sub)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004116{
4117 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004118 // Use 0xff to set lnum to -1
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004119 vim_memset(sub->list.multi, 0xff,
Bram Moolenaar0270f382018-07-17 05:43:58 +02004120 sizeof(struct multipos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004121 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004122 vim_memset(sub->list.line, 0,
4123 sizeof(struct linepos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004124 sub->in_use = 0;
4125}
4126
4127/*
4128 * Copy the submatches from "from" to "to".
4129 */
4130 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004131copy_sub(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004132{
4133 to->in_use = from->in_use;
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004134 if (from->in_use <= 0)
4135 return;
4136
4137 // Copy the match start and end positions.
4138 if (REG_MULTI)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004139 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004140 mch_memmove(&to->list.multi[0],
4141 &from->list.multi[0],
4142 sizeof(struct multipos) * from->in_use);
4143 to->orig_start_col = from->orig_start_col;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004144 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004145 else
4146 mch_memmove(&to->list.line[0],
4147 &from->list.line[0],
4148 sizeof(struct linepos) * from->in_use);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004149}
4150
4151/*
4152 * Like copy_sub() but exclude the main match.
4153 */
4154 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004155copy_sub_off(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004156{
4157 if (to->in_use < from->in_use)
4158 to->in_use = from->in_use;
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004159 if (from->in_use <= 1)
4160 return;
4161
4162 // Copy the match start and end positions.
4163 if (REG_MULTI)
4164 mch_memmove(&to->list.multi[1],
4165 &from->list.multi[1],
4166 sizeof(struct multipos) * (from->in_use - 1));
4167 else
4168 mch_memmove(&to->list.line[1],
4169 &from->list.line[1],
4170 sizeof(struct linepos) * (from->in_use - 1));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004171}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004172
Bram Moolenaar428e9872013-05-30 17:05:39 +02004173/*
Bram Moolenaarf2118842013-09-25 18:16:38 +02004174 * Like copy_sub() but only do the end of the main match if \ze is present.
4175 */
4176 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004177copy_ze_off(regsub_T *to, regsub_T *from)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004178{
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004179 if (!rex.nfa_has_zend)
4180 return;
4181
4182 if (REG_MULTI)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004183 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004184 if (from->list.multi[0].end_lnum >= 0)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004185 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004186 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4187 to->list.multi[0].end_col = from->list.multi[0].end_col;
Bram Moolenaarf2118842013-09-25 18:16:38 +02004188 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00004189 }
4190 else
4191 {
4192 if (from->list.line[0].end != NULL)
4193 to->list.line[0].end = from->list.line[0].end;
Bram Moolenaarf2118842013-09-25 18:16:38 +02004194 }
4195}
4196
4197/*
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004198 * Return TRUE if "sub1" and "sub2" have the same start positions.
Bram Moolenaaree482532014-05-13 15:56:51 +02004199 * When using back-references also check the end position.
Bram Moolenaar428e9872013-05-30 17:05:39 +02004200 */
4201 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004202sub_equal(regsub_T *sub1, regsub_T *sub2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004203{
4204 int i;
4205 int todo;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004206 linenr_T s1;
4207 linenr_T s2;
4208 char_u *sp1;
4209 char_u *sp2;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004210
4211 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4212 if (REG_MULTI)
4213 {
4214 for (i = 0; i < todo; ++i)
4215 {
4216 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004217 s1 = sub1->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004218 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004219 s1 = -1;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004220 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004221 s2 = sub2->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004222 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004223 s2 = -1;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004224 if (s1 != s2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004225 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004226 if (s1 != -1 && sub1->list.multi[i].start_col
4227 != sub2->list.multi[i].start_col)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004228 return FALSE;
Bram Moolenaaree482532014-05-13 15:56:51 +02004229
Bram Moolenaar0270f382018-07-17 05:43:58 +02004230 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004231 {
4232 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004233 s1 = sub1->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004234 else
4235 s1 = -1;
4236 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004237 s2 = sub2->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004238 else
4239 s2 = -1;
4240 if (s1 != s2)
4241 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004242 if (s1 != -1 && sub1->list.multi[i].end_col
4243 != sub2->list.multi[i].end_col)
Bram Moolenaaree482532014-05-13 15:56:51 +02004244 return FALSE;
4245 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004246 }
4247 }
4248 else
4249 {
4250 for (i = 0; i < todo; ++i)
4251 {
4252 if (i < sub1->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004253 sp1 = sub1->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004254 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004255 sp1 = NULL;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004256 if (i < sub2->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004257 sp2 = sub2->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004258 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004259 sp2 = NULL;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004260 if (sp1 != sp2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004261 return FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004262 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004263 {
4264 if (i < sub1->in_use)
4265 sp1 = sub1->list.line[i].end;
4266 else
4267 sp1 = NULL;
4268 if (i < sub2->in_use)
4269 sp2 = sub2->list.line[i].end;
4270 else
4271 sp2 = NULL;
4272 if (sp1 != sp2)
4273 return FALSE;
4274 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004275 }
4276 }
4277
4278 return TRUE;
4279}
4280
Bram Moolenaar616592e2022-06-17 15:17:10 +01004281#ifdef FEAT_RELTIME
4282/*
4283 * Check if we are past the time limit, if there is one.
4284 */
4285 static int
4286nfa_did_time_out(void)
4287{
4288 if (*timeout_flag)
4289 {
4290 if (nfa_timed_out != NULL)
4291 {
Bram Moolenaar4c5678f2022-11-30 18:12:19 +00004292# ifdef FEAT_EVAL
Bram Moolenaar616592e2022-06-17 15:17:10 +01004293 if (!*nfa_timed_out)
4294 ch_log(NULL, "NFA regexp timed out");
Bram Moolenaar509ce032022-06-20 11:23:01 +01004295# endif
Bram Moolenaar616592e2022-06-17 15:17:10 +01004296 *nfa_timed_out = TRUE;
4297 }
4298 return TRUE;
4299 }
4300 return FALSE;
4301}
4302#endif
4303
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004304#ifdef ENABLE_LOG
4305 static void
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004306open_debug_log(int result)
4307{
4308 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
4309 if (log_fd == NULL)
4310 {
4311 emsg(_(e_log_open_failed));
4312 log_fd = stderr;
4313 }
4314
4315 fprintf(log_fd, "****************************\n");
4316 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
4317 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : result == MAYBE
4318 ? "MAYBE" : "FALSE");
4319 fprintf(log_fd, "****************************\n");
4320}
4321
4322 static void
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004323report_state(char *action,
4324 regsub_T *sub,
4325 nfa_state_T *state,
4326 int lid,
4327 nfa_pim_T *pim)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004328{
4329 int col;
4330
4331 if (sub->in_use <= 0)
4332 col = -1;
4333 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004334 col = sub->list.multi[0].start_col;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004335 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004336 col = (int)(sub->list.line[0].start - rex.line);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004337 nfa_set_code(state->c);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004338 if (log_fd == NULL)
4339 open_debug_log(MAYBE);
4340
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004341 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
4342 action, abs(state->id), lid, state->c, code, col,
4343 pim_info(pim));
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004344}
4345#endif
4346
Bram Moolenaar43e02982013-06-07 17:31:29 +02004347/*
4348 * Return TRUE if the same state is already in list "l" with the same
4349 * positions as "subs".
4350 */
4351 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004352has_state_with_pos(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004353 nfa_list_T *l, // runtime state list
4354 nfa_state_T *state, // state to update
4355 regsubs_T *subs, // pointers to subexpressions
4356 nfa_pim_T *pim) // postponed match or NULL
Bram Moolenaar43e02982013-06-07 17:31:29 +02004357{
4358 nfa_thread_T *thread;
4359 int i;
4360
4361 for (i = 0; i < l->n; ++i)
4362 {
4363 thread = &l->t[i];
4364 if (thread->state->id == state->id
4365 && sub_equal(&thread->subs.norm, &subs->norm)
4366#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004367 && (!rex.nfa_has_zsubexpr
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004368 || sub_equal(&thread->subs.synt, &subs->synt))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004369#endif
Bram Moolenaar69b52452013-07-17 21:10:51 +02004370 && pim_equal(&thread->pim, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004371 return TRUE;
4372 }
4373 return FALSE;
4374}
4375
4376/*
Bram Moolenaar69b52452013-07-17 21:10:51 +02004377 * Return TRUE if "one" and "two" are equal. That includes when both are not
4378 * set.
4379 */
4380 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004381pim_equal(nfa_pim_T *one, nfa_pim_T *two)
Bram Moolenaar69b52452013-07-17 21:10:51 +02004382{
4383 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4384 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4385
4386 if (one_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004387 // one is unused: equal when two is also unused
Bram Moolenaar69b52452013-07-17 21:10:51 +02004388 return two_unused;
4389 if (two_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004390 // one is used and two is not: not equal
Bram Moolenaar69b52452013-07-17 21:10:51 +02004391 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004392 // compare the state id
Bram Moolenaar3f0df062013-08-14 13:34:25 +02004393 if (one->state->id != two->state->id)
4394 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004395 // compare the position
Bram Moolenaar69b52452013-07-17 21:10:51 +02004396 if (REG_MULTI)
4397 return one->end.pos.lnum == two->end.pos.lnum
4398 && one->end.pos.col == two->end.pos.col;
4399 return one->end.ptr == two->end.ptr;
4400}
4401
4402/*
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004403 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4404 */
4405 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004406match_follows(nfa_state_T *startstate, int depth)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004407{
4408 nfa_state_T *state = startstate;
4409
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004410 // avoid too much recursion
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004411 if (depth > 10)
4412 return FALSE;
4413
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004414 while (state != NULL)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004415 {
4416 switch (state->c)
4417 {
4418 case NFA_MATCH:
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004419 case NFA_MCLOSE:
4420 case NFA_END_INVISIBLE:
4421 case NFA_END_INVISIBLE_NEG:
4422 case NFA_END_PATTERN:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004423 return TRUE;
4424
4425 case NFA_SPLIT:
4426 return match_follows(state->out, depth + 1)
4427 || match_follows(state->out1, depth + 1);
4428
4429 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004430 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004431 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004432 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004433 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004434 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004435 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004436 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004437 case NFA_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004438 // skip ahead to next state
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004439 state = state->out1->out;
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004440 continue;
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004441
4442 case NFA_ANY:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02004443 case NFA_ANY_COMPOSING:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004444 case NFA_IDENT:
4445 case NFA_SIDENT:
4446 case NFA_KWORD:
4447 case NFA_SKWORD:
4448 case NFA_FNAME:
4449 case NFA_SFNAME:
4450 case NFA_PRINT:
4451 case NFA_SPRINT:
4452 case NFA_WHITE:
4453 case NFA_NWHITE:
4454 case NFA_DIGIT:
4455 case NFA_NDIGIT:
4456 case NFA_HEX:
4457 case NFA_NHEX:
4458 case NFA_OCTAL:
4459 case NFA_NOCTAL:
4460 case NFA_WORD:
4461 case NFA_NWORD:
4462 case NFA_HEAD:
4463 case NFA_NHEAD:
4464 case NFA_ALPHA:
4465 case NFA_NALPHA:
4466 case NFA_LOWER:
4467 case NFA_NLOWER:
4468 case NFA_UPPER:
4469 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02004470 case NFA_LOWER_IC:
4471 case NFA_NLOWER_IC:
4472 case NFA_UPPER_IC:
4473 case NFA_NUPPER_IC:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004474 case NFA_START_COLL:
4475 case NFA_START_NEG_COLL:
4476 case NFA_NEWL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004477 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004478 return FALSE;
4479
4480 default:
4481 if (state->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004482 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004483 return FALSE;
4484
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004485 // Others: zero-width or possibly zero-width, might still find
4486 // a match at the same position, keep looking.
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004487 break;
4488 }
4489 state = state->out;
4490 }
4491 return FALSE;
4492}
4493
4494
4495/*
Bram Moolenaar43e02982013-06-07 17:31:29 +02004496 * Return TRUE if "state" is already in list "l".
4497 */
4498 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004499state_in_list(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004500 nfa_list_T *l, // runtime state list
4501 nfa_state_T *state, // state to update
4502 regsubs_T *subs) // pointers to subexpressions
Bram Moolenaar43e02982013-06-07 17:31:29 +02004503{
4504 if (state->lastlist[nfa_ll_index] == l->id)
4505 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004506 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004507 return TRUE;
4508 }
4509 return FALSE;
4510}
4511
// Offset used for "off" by addstate_here().  addstate() takes an "off_arg"
// of minus this value or lower to mean "add here" and then gets the list
// index from it.
#define ADDSTATE_HERE_OFFSET 10
4514
Bram Moolenaard05bf562013-06-30 23:24:08 +02004515/*
 4516 * Add "state" and possibly what follows to state list "l".
4517 * Returns "subs_arg", possibly copied into temp_subs.
Bram Moolenaar616592e2022-06-17 15:17:10 +01004518 * Returns NULL when recursiveness is too deep or timed out.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004519 */
Bram Moolenaard05bf562013-06-30 23:24:08 +02004520 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004521addstate(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004522 nfa_list_T *l, // runtime state list
4523 nfa_state_T *state, // state to update
4524 regsubs_T *subs_arg, // pointers to subexpressions
4525 nfa_pim_T *pim, // postponed look-behind match
4526 int off_arg) // byte offset, when -1 go to next line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004527{
Bram Moolenaar963fee22013-05-26 21:47:28 +02004528 int subidx;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004529 int off = off_arg;
4530 int add_here = FALSE;
4531 int listindex = 0;
4532 int k;
4533 int found = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004534 nfa_thread_T *thread;
Bram Moolenaard5638832016-09-09 17:59:50 +02004535 struct multipos save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004536 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004537 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004538 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004539 regsub_T *sub;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004540 regsubs_T *subs = subs_arg;
4541 static regsubs_T temp_subs;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004542#ifdef ENABLE_LOG
4543 int did_print = FALSE;
4544#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004545 static int depth = 0;
4546
Bram Moolenaar616592e2022-06-17 15:17:10 +01004547#ifdef FEAT_RELTIME
4548 if (nfa_did_time_out())
4549 return NULL;
4550#endif
4551
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004552 // This function is called recursively. When the depth is too much we run
4553 // out of stack and crash, limit recursiveness here.
Bram Moolenaar5382f122019-02-13 01:18:38 +01004554 if (++depth >= 5000 || subs == NULL)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004555 {
4556 --depth;
4557 return NULL;
4558 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004559
Bram Moolenaar16b35782016-09-09 20:29:50 +02004560 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4561 {
4562 add_here = TRUE;
4563 off = 0;
4564 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4565 }
4566
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004567 switch (state->c)
4568 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004569 case NFA_NCLOSE:
4570 case NFA_MCLOSE:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004571 case NFA_MCLOSE1:
4572 case NFA_MCLOSE2:
4573 case NFA_MCLOSE3:
4574 case NFA_MCLOSE4:
4575 case NFA_MCLOSE5:
4576 case NFA_MCLOSE6:
4577 case NFA_MCLOSE7:
4578 case NFA_MCLOSE8:
4579 case NFA_MCLOSE9:
4580#ifdef FEAT_SYN_HL
4581 case NFA_ZCLOSE:
4582 case NFA_ZCLOSE1:
4583 case NFA_ZCLOSE2:
4584 case NFA_ZCLOSE3:
4585 case NFA_ZCLOSE4:
4586 case NFA_ZCLOSE5:
4587 case NFA_ZCLOSE6:
4588 case NFA_ZCLOSE7:
4589 case NFA_ZCLOSE8:
4590 case NFA_ZCLOSE9:
4591#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004592 case NFA_MOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004593 case NFA_ZEND:
Bram Moolenaar927d4a12013-06-09 17:25:34 +02004594 case NFA_SPLIT:
Bram Moolenaar699c1202013-09-25 16:41:54 +02004595 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004596 // These nodes are not added themselves but their "out" and/or
4597 // "out1" may be added below.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004598 break;
4599
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004600 case NFA_BOL:
4601 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004602 // "^" won't match past end-of-line, don't bother trying.
4603 // Except when at the end of the line, or when we are going to the
4604 // next line for a look-behind match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004605 if (rex.input > rex.line
4606 && *rex.input != NUL
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004607 && (nfa_endp == NULL
4608 || !REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02004609 || rex.lnum == nfa_endp->se_u.pos.lnum))
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004610 goto skip_add;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004611 // FALLTHROUGH
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004612
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004613 case NFA_MOPEN1:
4614 case NFA_MOPEN2:
4615 case NFA_MOPEN3:
4616 case NFA_MOPEN4:
4617 case NFA_MOPEN5:
4618 case NFA_MOPEN6:
4619 case NFA_MOPEN7:
4620 case NFA_MOPEN8:
4621 case NFA_MOPEN9:
4622#ifdef FEAT_SYN_HL
4623 case NFA_ZOPEN:
4624 case NFA_ZOPEN1:
4625 case NFA_ZOPEN2:
4626 case NFA_ZOPEN3:
4627 case NFA_ZOPEN4:
4628 case NFA_ZOPEN5:
4629 case NFA_ZOPEN6:
4630 case NFA_ZOPEN7:
4631 case NFA_ZOPEN8:
4632 case NFA_ZOPEN9:
4633#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004634 case NFA_NOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004635 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004636 // These nodes need to be added so that we can bail out when it
4637 // was added to this list before at the same position to avoid an
4638 // endless loop for "\(\)*"
Bram Moolenaar307aa162013-06-02 16:34:21 +02004639
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004640 default:
Bram Moolenaar272fb582013-11-21 16:03:40 +01004641 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004642 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004643 // This state is already in the list, don't add it again,
4644 // unless it is an MOPEN that is used for a backreference or
4645 // when there is a PIM. For NFA_MATCH check the position,
4646 // lower position is preferred.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004647 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
Bram Moolenaar9c235062014-05-13 16:44:29 +02004648 && state->c != NFA_MATCH)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004649 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004650 // When called from addstate_here() do insert before
4651 // existing states.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004652 if (add_here)
4653 {
4654 for (k = 0; k < l->n && k < listindex; ++k)
4655 if (l->t[k].state->id == state->id)
4656 {
4657 found = TRUE;
4658 break;
4659 }
4660 }
4661 if (!add_here || found)
4662 {
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004663skip_add:
4664#ifdef ENABLE_LOG
Bram Moolenaar16b35782016-09-09 20:29:50 +02004665 nfa_set_code(state->c);
4666 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4667 abs(state->id), l->id, state->c, code,
4668 pim == NULL ? "NULL" : "yes", l->has_pim, found);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004669#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004670 --depth;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004671 return subs;
4672 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004673 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004674
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004675 // Do not add the state again when it exists with the same
4676 // positions.
Bram Moolenaar69b52452013-07-17 21:10:51 +02004677 if (has_state_with_pos(l, state, subs, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004678 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004679 }
4680
Bram Moolenaar688b3982019-02-13 21:47:36 +01004681 // When there are backreferences or PIMs the number of states may
4682 // be (a lot) bigger than anticipated.
Bram Moolenaara0169122013-06-26 18:16:58 +02004683 if (l->n == l->len)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004684 {
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004685 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004686 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004687 nfa_thread_T *newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004688
Bram Moolenaar688b3982019-02-13 21:47:36 +01004689 if ((long)(newsize >> 10) >= p_mmp)
4690 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004691 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004692 --depth;
4693 return NULL;
4694 }
Bram Moolenaard05bf562013-06-30 23:24:08 +02004695 if (subs != &temp_subs)
4696 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004697 // "subs" may point into the current array, need to make a
4698 // copy before it becomes invalid.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004699 copy_sub(&temp_subs.norm, &subs->norm);
4700#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004701 if (rex.nfa_has_zsubexpr)
Bram Moolenaard05bf562013-06-30 23:24:08 +02004702 copy_sub(&temp_subs.synt, &subs->synt);
4703#endif
4704 subs = &temp_subs;
4705 }
4706
Bram Moolenaar688b3982019-02-13 21:47:36 +01004707 newt = vim_realloc(l->t, newsize);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004708 if (newt == NULL)
4709 {
4710 // out of memory
4711 --depth;
4712 return NULL;
4713 }
4714 l->t = newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004715 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004716 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004717
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004718 // add the state to the list
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02004719 state->lastlist[nfa_ll_index] = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004720 thread = &l->t[l->n++];
4721 thread->state = state;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004722 if (pim == NULL)
4723 thread->pim.result = NFA_PIM_UNUSED;
4724 else
Bram Moolenaar196ed142013-07-21 18:59:24 +02004725 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004726 copy_pim(&thread->pim, pim);
Bram Moolenaar196ed142013-07-21 18:59:24 +02004727 l->has_pim = TRUE;
4728 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004729 copy_sub(&thread->subs.norm, &subs->norm);
4730#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004731 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004732 copy_sub(&thread->subs.synt, &subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004733#endif
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004734#ifdef ENABLE_LOG
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004735 report_state("Adding", &thread->subs.norm, state, l->id, pim);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004736 did_print = TRUE;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004737#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004738 }
4739
4740#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004741 if (!did_print)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004742 report_state("Processing", &subs->norm, state, l->id, pim);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004743#endif
4744 switch (state->c)
4745 {
4746 case NFA_MATCH:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004747 break;
4748
4749 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004750 // order matters here
Bram Moolenaar16b35782016-09-09 20:29:50 +02004751 subs = addstate(l, state->out, subs, pim, off_arg);
4752 subs = addstate(l, state->out1, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004753 break;
4754
Bram Moolenaar699c1202013-09-25 16:41:54 +02004755 case NFA_EMPTY:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004756 case NFA_NOPEN:
4757 case NFA_NCLOSE:
Bram Moolenaar16b35782016-09-09 20:29:50 +02004758 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004759 break;
4760
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004761 case NFA_MOPEN:
4762 case NFA_MOPEN1:
4763 case NFA_MOPEN2:
4764 case NFA_MOPEN3:
4765 case NFA_MOPEN4:
4766 case NFA_MOPEN5:
4767 case NFA_MOPEN6:
4768 case NFA_MOPEN7:
4769 case NFA_MOPEN8:
4770 case NFA_MOPEN9:
4771#ifdef FEAT_SYN_HL
4772 case NFA_ZOPEN:
4773 case NFA_ZOPEN1:
4774 case NFA_ZOPEN2:
4775 case NFA_ZOPEN3:
4776 case NFA_ZOPEN4:
4777 case NFA_ZOPEN5:
4778 case NFA_ZOPEN6:
4779 case NFA_ZOPEN7:
4780 case NFA_ZOPEN8:
4781 case NFA_ZOPEN9:
4782#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004783 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004784 if (state->c == NFA_ZSTART)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004785 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004786 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004787 sub = &subs->norm;
4788 }
4789#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004790 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004791 {
4792 subidx = state->c - NFA_ZOPEN;
4793 sub = &subs->synt;
4794 }
4795#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004796 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004797 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004798 subidx = state->c - NFA_MOPEN;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004799 sub = &subs->norm;
4800 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004801
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004802 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004803 save_ptr = NULL;
Bram Moolenaara80faa82020-04-12 19:37:17 +02004804 CLEAR_FIELD(save_multipos);
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004805
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004806 // Set the position (with "off" added) in the subexpression. Save
4807 // and restore it when it was in use. Otherwise fill any gap.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004808 if (REG_MULTI)
4809 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004810 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004811 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004812 save_multipos = sub->list.multi[subidx];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004813 save_in_use = -1;
4814 }
4815 else
4816 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004817 save_in_use = sub->in_use;
4818 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004819 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004820 sub->list.multi[i].start_lnum = -1;
4821 sub->list.multi[i].end_lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004822 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004823 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004824 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02004825 if (off == -1)
4826 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004827 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004828 sub->list.multi[subidx].start_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004829 }
4830 else
4831 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004832 sub->list.multi[subidx].start_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004833 sub->list.multi[subidx].start_col =
Bram Moolenaarc96311b2022-11-25 21:13:47 +00004834 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004835 }
Bram Moolenaarc2b717e2015-09-29 15:06:14 +02004836 sub->list.multi[subidx].end_lnum = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004837 }
4838 else
4839 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004840 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004841 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004842 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004843 save_in_use = -1;
4844 }
4845 else
4846 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004847 save_in_use = sub->in_use;
4848 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004849 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004850 sub->list.line[i].start = NULL;
4851 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004852 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004853 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004854 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02004855 sub->list.line[subidx].start = rex.input + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004856 }
4857
Bram Moolenaar16b35782016-09-09 20:29:50 +02004858 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004859 if (subs == NULL)
4860 break;
4861 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004862#ifdef FEAT_SYN_HL
4863 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4864 sub = &subs->synt;
4865 else
4866#endif
4867 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004868
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004869 if (save_in_use == -1)
4870 {
4871 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004872 sub->list.multi[subidx] = save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004873 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004874 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004875 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004876 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02004877 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004878 break;
4879
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004880 case NFA_MCLOSE:
Bram Moolenaar0270f382018-07-17 05:43:58 +02004881 if (rex.nfa_has_zend && (REG_MULTI
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004882 ? subs->norm.list.multi[0].end_lnum >= 0
Bram Moolenaar9be44812013-09-05 21:15:44 +02004883 : subs->norm.list.line[0].end != NULL))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004884 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004885 // Do not overwrite the position set by \ze.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004886 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004887 break;
4888 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004889 // FALLTHROUGH
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004890 case NFA_MCLOSE1:
4891 case NFA_MCLOSE2:
4892 case NFA_MCLOSE3:
4893 case NFA_MCLOSE4:
4894 case NFA_MCLOSE5:
4895 case NFA_MCLOSE6:
4896 case NFA_MCLOSE7:
4897 case NFA_MCLOSE8:
4898 case NFA_MCLOSE9:
4899#ifdef FEAT_SYN_HL
4900 case NFA_ZCLOSE:
4901 case NFA_ZCLOSE1:
4902 case NFA_ZCLOSE2:
4903 case NFA_ZCLOSE3:
4904 case NFA_ZCLOSE4:
4905 case NFA_ZCLOSE5:
4906 case NFA_ZCLOSE6:
4907 case NFA_ZCLOSE7:
4908 case NFA_ZCLOSE8:
4909 case NFA_ZCLOSE9:
4910#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004911 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004912 if (state->c == NFA_ZEND)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004913 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004914 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004915 sub = &subs->norm;
4916 }
4917#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004918 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004919 {
4920 subidx = state->c - NFA_ZCLOSE;
4921 sub = &subs->synt;
4922 }
4923#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004924 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004925 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004926 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004927 sub = &subs->norm;
4928 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004929
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004930 // We don't fill in gaps here, there must have been an MOPEN that
4931 // has done that.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004932 save_in_use = sub->in_use;
4933 if (sub->in_use <= subidx)
4934 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004935 if (REG_MULTI)
4936 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004937 save_multipos = sub->list.multi[subidx];
Bram Moolenaar35b23862013-05-22 23:00:40 +02004938 if (off == -1)
4939 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004940 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004941 sub->list.multi[subidx].end_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004942 }
4943 else
4944 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004945 sub->list.multi[subidx].end_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004946 sub->list.multi[subidx].end_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004947 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004948 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004949 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004950 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004951 }
4952 else
4953 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004954 save_ptr = sub->list.line[subidx].end;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004955 sub->list.line[subidx].end = rex.input + off;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004956 // avoid compiler warnings
Bram Moolenaara80faa82020-04-12 19:37:17 +02004957 CLEAR_FIELD(save_multipos);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004958 }
4959
Bram Moolenaar16b35782016-09-09 20:29:50 +02004960 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004961 if (subs == NULL)
4962 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004963 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004964#ifdef FEAT_SYN_HL
4965 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4966 sub = &subs->synt;
4967 else
4968#endif
4969 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004970
4971 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004972 sub->list.multi[subidx] = save_multipos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004973 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004974 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004975 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004976 break;
4977 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004978 --depth;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004979 return subs;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004980}
4981
4982/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02004983 * Like addstate(), but the new state(s) are put at position "*ip".
4984 * Used for zero-width matches, next state to use is the added one.
4985 * This makes sure the order of states to be tried does not change, which
4986 * matters for alternatives.
4987 */
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004988 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004989addstate_here(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004990 nfa_list_T *l, // runtime state list
4991 nfa_state_T *state, // state to update
4992 regsubs_T *subs, // pointers to subexpressions
4993 nfa_pim_T *pim, // postponed look-behind match
Bram Moolenaar05540972016-01-30 20:31:25 +01004994 int *ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004995{
4996 int tlen = l->n;
4997 int count;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004998 int listidx = *ip;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004999 regsubs_T *r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005000
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005001 // First add the state(s) at the end, so that we know how many there are.
5002 // Pass the listidx as offset (avoids adding another argument to
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00005003 // addstate()).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005004 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
5005 if (r == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005006 return NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005007
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005008 // when "*ip" was at the end of the list, nothing to do
Bram Moolenaara2d95102013-06-04 14:23:05 +02005009 if (listidx + 1 == tlen)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005010 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005011
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005012 // re-order to put the new state at the current position
Bram Moolenaar4b417062013-05-25 20:19:50 +02005013 count = l->n - tlen;
Bram Moolenaara50d02d2013-06-16 15:43:50 +02005014 if (count == 0)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005015 return r; // no state got added
Bram Moolenaar428e9872013-05-30 17:05:39 +02005016 if (count == 1)
5017 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005018 // overwrite the current state
Bram Moolenaara2d95102013-06-04 14:23:05 +02005019 l->t[listidx] = l->t[l->n - 1];
Bram Moolenaar428e9872013-05-30 17:05:39 +02005020 }
5021 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02005022 {
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005023 if (l->n + count - 1 >= l->len)
5024 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005025 // not enough space to move the new states, reallocate the list
5026 // and move the states to the right position
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005027 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01005028 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005029 nfa_thread_T *newl;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005030
Bram Moolenaar688b3982019-02-13 21:47:36 +01005031 if ((long)(newsize >> 10) >= p_mmp)
5032 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00005033 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01005034 return NULL;
5035 }
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005036 newl = alloc(newsize);
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005037 if (newl == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005038 return NULL;
5039 l->len = newlen;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005040 mch_memmove(&(newl[0]),
5041 &(l->t[0]),
5042 sizeof(nfa_thread_T) * listidx);
5043 mch_memmove(&(newl[listidx]),
5044 &(l->t[l->n - count]),
5045 sizeof(nfa_thread_T) * count);
5046 mch_memmove(&(newl[listidx + count]),
5047 &(l->t[listidx + 1]),
5048 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
5049 vim_free(l->t);
5050 l->t = newl;
5051 }
5052 else
5053 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005054 // make space for new states, then move them from the
5055 // end to the current position
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005056 mch_memmove(&(l->t[listidx + count]),
5057 &(l->t[listidx + 1]),
5058 sizeof(nfa_thread_T) * (l->n - listidx - 1));
5059 mch_memmove(&(l->t[listidx]),
5060 &(l->t[l->n - 1]),
5061 sizeof(nfa_thread_T) * count);
5062 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005063 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005064 --l->n;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005065 *ip = listidx - 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005066
5067 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005068}
5069
5070/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005071 * Check character class "class" against current character c.
5072 */
5073 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005074check_char_class(int class, int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005075{
5076 switch (class)
5077 {
5078 case NFA_CLASS_ALNUM:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005079 if (c >= 1 && c < 128 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005080 return OK;
5081 break;
5082 case NFA_CLASS_ALPHA:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005083 if (c >= 1 && c < 128 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005084 return OK;
5085 break;
5086 case NFA_CLASS_BLANK:
5087 if (c == ' ' || c == '\t')
5088 return OK;
5089 break;
5090 case NFA_CLASS_CNTRL:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005091 if (c >= 1 && c <= 127 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005092 return OK;
5093 break;
5094 case NFA_CLASS_DIGIT:
5095 if (VIM_ISDIGIT(c))
5096 return OK;
5097 break;
5098 case NFA_CLASS_GRAPH:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005099 if (c >= 1 && c <= 127 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005100 return OK;
5101 break;
5102 case NFA_CLASS_LOWER:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005103 if (MB_ISLOWER(c) && c != 170 && c != 186)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005104 return OK;
5105 break;
5106 case NFA_CLASS_PRINT:
5107 if (vim_isprintc(c))
5108 return OK;
5109 break;
5110 case NFA_CLASS_PUNCT:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005111 if (c >= 1 && c < 128 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005112 return OK;
5113 break;
5114 case NFA_CLASS_SPACE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005115 if ((c >= 9 && c <= 13) || (c == ' '))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005116 return OK;
5117 break;
5118 case NFA_CLASS_UPPER:
5119 if (MB_ISUPPER(c))
5120 return OK;
5121 break;
5122 case NFA_CLASS_XDIGIT:
5123 if (vim_isxdigit(c))
5124 return OK;
5125 break;
5126 case NFA_CLASS_TAB:
5127 if (c == '\t')
5128 return OK;
5129 break;
5130 case NFA_CLASS_RETURN:
5131 if (c == '\r')
5132 return OK;
5133 break;
5134 case NFA_CLASS_BACKSPACE:
5135 if (c == '\b')
5136 return OK;
5137 break;
5138 case NFA_CLASS_ESCAPE:
5139 if (c == '\033')
5140 return OK;
5141 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01005142 case NFA_CLASS_IDENT:
5143 if (vim_isIDc(c))
5144 return OK;
5145 break;
5146 case NFA_CLASS_KEYWORD:
5147 if (reg_iswordc(c))
5148 return OK;
5149 break;
5150 case NFA_CLASS_FNAME:
5151 if (vim_isfilec(c))
5152 return OK;
5153 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005154
5155 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005156 // should not be here :P
Christian Brabandtb64cec22024-03-31 17:52:56 +02005157 siemsg(_(e_nfa_regexp_invalid_character_class_nr), class);
Bram Moolenaar417bad22013-06-07 14:08:30 +02005158 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005159 }
5160 return FAIL;
5161}
5162
Bram Moolenaar5714b802013-05-28 22:03:20 +02005163/*
5164 * Check for a match with subexpression "subidx".
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005165 * Return TRUE if it matches.
Bram Moolenaar5714b802013-05-28 22:03:20 +02005166 */
5167 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005168match_backref(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005169 regsub_T *sub, // pointers to subexpressions
Bram Moolenaar05540972016-01-30 20:31:25 +01005170 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005171 int *bytelen) // out: length of match in bytes
Bram Moolenaar5714b802013-05-28 22:03:20 +02005172{
5173 int len;
5174
5175 if (sub->in_use <= subidx)
5176 {
5177retempty:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005178 // backref was not set, match an empty string
Bram Moolenaar5714b802013-05-28 22:03:20 +02005179 *bytelen = 0;
5180 return TRUE;
5181 }
5182
5183 if (REG_MULTI)
5184 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005185 if (sub->list.multi[subidx].start_lnum < 0
5186 || sub->list.multi[subidx].end_lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005187 goto retempty;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005188 if (sub->list.multi[subidx].start_lnum == rex.lnum
5189 && sub->list.multi[subidx].end_lnum == rex.lnum)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005190 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005191 len = sub->list.multi[subidx].end_col
5192 - sub->list.multi[subidx].start_col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005193 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5194 rex.input, &len) == 0)
Bram Moolenaar580abea2013-06-14 20:31:28 +02005195 {
5196 *bytelen = len;
5197 return TRUE;
5198 }
5199 }
5200 else
5201 {
5202 if (match_with_backref(
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005203 sub->list.multi[subidx].start_lnum,
5204 sub->list.multi[subidx].start_col,
5205 sub->list.multi[subidx].end_lnum,
5206 sub->list.multi[subidx].end_col,
Bram Moolenaar580abea2013-06-14 20:31:28 +02005207 bytelen) == RA_MATCH)
5208 return TRUE;
Bram Moolenaar5714b802013-05-28 22:03:20 +02005209 }
5210 }
5211 else
5212 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005213 if (sub->list.line[subidx].start == NULL
5214 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005215 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005216 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005217 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005218 {
5219 *bytelen = len;
5220 return TRUE;
5221 }
5222 }
5223 return FALSE;
5224}
5225
#ifdef FEAT_SYN_HL

/*
 * Check whether the text at the current input position matches the string
 * stored for \z subexpression "subidx" in "re_extmatch_in".
 * When nothing is stored it matches an empty string.
 * Returns TRUE when it matches and sets "bytelen" to the length of the match
 * in bytes.  Returns FALSE otherwise.
 */
    static int
match_zref(
    int     subidx,
    int     *bytelen)   // out: length of match in bytes
{
    char_u  *ref;
    int     len;

    cleanup_zsubexpr();

    ref = re_extmatch_in == NULL ? NULL : re_extmatch_in->matches[subidx];
    if (ref == NULL)
    {
        // backref was not set, match an empty string
        *bytelen = 0;
        return TRUE;
    }

    len = (int)STRLEN(ref);
    if (cstrncmp(ref, rex.input, &len) != 0)
        return FALSE;

    *bytelen = len;
    return TRUE;
}
#endif
5256
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005257/*
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005258 * Save list IDs for all NFA states of "prog" into "list".
5259 * Also reset the IDs to zero.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005260 * Only used for the recursive value lastlist[1].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005261 */
5262 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005263nfa_save_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005264{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005265 int i;
5266 nfa_state_T *p;
5267
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005268 // Order in the list is reverse, it's a bit faster that way.
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005269 p = &prog->state[0];
5270 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005271 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005272 list[i] = p->lastlist[1];
5273 p->lastlist[1] = 0;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005274 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005275 }
5276}
5277
5278/*
5279 * Restore list IDs from "list" to all NFA states.
5280 */
5281 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005282nfa_restore_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005283{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005284 int i;
5285 nfa_state_T *p;
5286
5287 p = &prog->state[0];
5288 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005289 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005290 p->lastlist[1] = list[i];
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005291 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005292 }
5293}
5294
Bram Moolenaar423532e2013-05-29 21:14:42 +02005295 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005296nfa_re_num_cmp(long_u val, int op, long_u pos)
Bram Moolenaar423532e2013-05-29 21:14:42 +02005297{
5298 if (op == 1) return pos > val;
5299 if (op == 2) return pos < val;
5300 return val == pos;
5301}
5302
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01005303static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02005304
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005305/*
Bram Moolenaarf46da702013-06-02 22:37:42 +02005306 * Recursively call nfa_regmatch()
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005307 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5308 * position).
Bram Moolenaarf46da702013-06-02 22:37:42 +02005309 */
5310 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005311recursive_regmatch(
5312 nfa_state_T *state,
5313 nfa_pim_T *pim,
5314 nfa_regprog_T *prog,
5315 regsubs_T *submatch,
5316 regsubs_T *m,
Bram Moolenaar2338c322018-07-08 19:07:19 +02005317 int **listids,
5318 int *listids_len)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005319{
Bram Moolenaar0270f382018-07-17 05:43:58 +02005320 int save_reginput_col = (int)(rex.input - rex.line);
5321 int save_reglnum = rex.lnum;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005322 int save_nfa_match = nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005323 int save_nfa_listid = rex.nfa_listid;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005324 save_se_T *save_nfa_endp = nfa_endp;
5325 save_se_T endpos;
5326 save_se_T *endposp = NULL;
5327 int result;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005328 int need_restore = FALSE;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005329
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005330 if (pim != NULL)
5331 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005332 // start at the position where the postponed match was
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005333 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005334 rex.input = rex.line + pim->end.pos.col;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005335 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005336 rex.input = pim->end.ptr;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005337 }
5338
Bram Moolenaardecd9542013-06-07 16:31:50 +02005339 if (state->c == NFA_START_INVISIBLE_BEFORE
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01005340 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5341 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5342 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005343 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005344 // The recursive match must end at the current position. When "pim" is
5345 // not NULL it specifies the current position.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005346 endposp = &endpos;
5347 if (REG_MULTI)
5348 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005349 if (pim == NULL)
5350 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005351 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5352 endpos.se_u.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005353 }
5354 else
5355 endpos.se_u.pos = pim->end.pos;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005356 }
5357 else
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005358 {
5359 if (pim == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005360 endpos.se_u.ptr = rex.input;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005361 else
5362 endpos.se_u.ptr = pim->end.ptr;
5363 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005364
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005365 // Go back the specified number of bytes, or as far as the
5366 // start of the previous line, to try matching "\@<=" or
5367 // not matching "\@<!". This is very inefficient, limit the number of
5368 // bytes if possible.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005369 if (state->val <= 0)
5370 {
5371 if (REG_MULTI)
5372 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005373 rex.line = reg_getline(--rex.lnum);
5374 if (rex.line == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005375 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005376 rex.line = reg_getline(++rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005377 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005378 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005379 }
5380 else
5381 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005382 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005383 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005384 // Not enough bytes in this line, go to end of
5385 // previous line.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005386 rex.line = reg_getline(--rex.lnum);
5387 if (rex.line == NULL)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005388 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005389 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005390 rex.line = reg_getline(++rex.lnum);
5391 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005392 }
5393 else
John Marriott82792db2024-05-12 00:07:17 +02005394 rex.input = rex.line + reg_getline_len(rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005395 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005396 if ((int)(rex.input - rex.line) >= state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005397 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005398 rex.input -= state->val;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005399 if (has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005400 rex.input -= mb_head_off(rex.line, rex.input);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005401 }
5402 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005403 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005404 }
5405 }
5406
Bram Moolenaarf46da702013-06-02 22:37:42 +02005407#ifdef ENABLE_LOG
5408 if (log_fd != stderr)
5409 fclose(log_fd);
5410 log_fd = NULL;
5411#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005412 // Have to clear the lastlist field of the NFA nodes, so that
5413 // nfa_regmatch() and addstate() can run properly after recursion.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005414 if (nfa_ll_index == 1)
5415 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005416 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5417 // values and clear them.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005418 if (*listids == NULL || *listids_len < prog->nstate)
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005419 {
Bram Moolenaar2338c322018-07-08 19:07:19 +02005420 vim_free(*listids);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005421 *listids = ALLOC_MULT(int, prog->nstate);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005422 if (*listids == NULL)
5423 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00005424 emsg(_(e_nfa_regexp_could_not_allocate_memory_for_branch_traversal));
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005425 return 0;
5426 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005427 *listids_len = prog->nstate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005428 }
5429 nfa_save_listids(prog, *listids);
5430 need_restore = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005431 // any value of rex.nfa_listid will do
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005432 }
5433 else
5434 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005435 // First recursive nfa_regmatch() call, switch to the second lastlist
5436 // entry. Make sure rex.nfa_listid is different from a previous
5437 // recursive call, because some states may still have this ID.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005438 ++nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005439 if (rex.nfa_listid <= rex.nfa_alt_listid)
5440 rex.nfa_listid = rex.nfa_alt_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005441 }
5442
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005443 // Call nfa_regmatch() to check if the current concat matches at this
5444 // position. The concat ends with the node NFA_END_INVISIBLE
Bram Moolenaarf46da702013-06-02 22:37:42 +02005445 nfa_endp = endposp;
5446 result = nfa_regmatch(prog, state->out, submatch, m);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005447
5448 if (need_restore)
5449 nfa_restore_listids(prog, *listids);
5450 else
5451 {
5452 --nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005453 rex.nfa_alt_listid = rex.nfa_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005454 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005455
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005456 // restore position in input text
Bram Moolenaar0270f382018-07-17 05:43:58 +02005457 rex.lnum = save_reglnum;
Bram Moolenaar484d2412013-06-13 19:47:07 +02005458 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005459 rex.line = reg_getline(rex.lnum);
5460 rex.input = rex.line + save_reginput_col;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005461 if (result != NFA_TOO_EXPENSIVE)
5462 {
5463 nfa_match = save_nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005464 rex.nfa_listid = save_nfa_listid;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005465 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005466 nfa_endp = save_nfa_endp;
5467
5468#ifdef ENABLE_LOG
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005469 open_debug_log(result);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005470#endif
5471
5472 return result;
5473}
5474
Bram Moolenaara2d95102013-06-04 14:23:05 +02005475/*
5476 * Estimate the chance of a match with "state" failing.
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005477 * empty match: 0
Bram Moolenaara2d95102013-06-04 14:23:05 +02005478 * NFA_ANY: 1
5479 * specific character: 99
5480 */
5481 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005482failure_chance(nfa_state_T *state, int depth)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005483{
5484 int c = state->c;
5485 int l, r;
5486
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005487 // detect looping
Bram Moolenaara2d95102013-06-04 14:23:05 +02005488 if (depth > 4)
5489 return 1;
5490
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005491 switch (c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005492 {
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005493 case NFA_SPLIT:
5494 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005495 // avoid recursive stuff
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005496 return 1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005497 // two alternatives, use the lowest failure chance
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005498 l = failure_chance(state->out, depth + 1);
5499 r = failure_chance(state->out1, depth + 1);
5500 return l < r ? l : r;
5501
5502 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005503 // matches anything, unlikely to fail
Bram Moolenaara2d95102013-06-04 14:23:05 +02005504 return 1;
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005505
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005506 case NFA_MATCH:
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005507 case NFA_MCLOSE:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005508 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005509 // empty match works always
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005510 return 0;
5511
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005512 case NFA_START_INVISIBLE:
5513 case NFA_START_INVISIBLE_FIRST:
5514 case NFA_START_INVISIBLE_NEG:
5515 case NFA_START_INVISIBLE_NEG_FIRST:
5516 case NFA_START_INVISIBLE_BEFORE:
5517 case NFA_START_INVISIBLE_BEFORE_FIRST:
5518 case NFA_START_INVISIBLE_BEFORE_NEG:
5519 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5520 case NFA_START_PATTERN:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005521 // recursive regmatch is expensive, use low failure chance
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005522 return 5;
5523
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005524 case NFA_BOL:
5525 case NFA_EOL:
5526 case NFA_BOF:
5527 case NFA_EOF:
5528 case NFA_NEWL:
5529 return 99;
5530
5531 case NFA_BOW:
5532 case NFA_EOW:
5533 return 90;
5534
5535 case NFA_MOPEN:
5536 case NFA_MOPEN1:
5537 case NFA_MOPEN2:
5538 case NFA_MOPEN3:
5539 case NFA_MOPEN4:
5540 case NFA_MOPEN5:
5541 case NFA_MOPEN6:
5542 case NFA_MOPEN7:
5543 case NFA_MOPEN8:
5544 case NFA_MOPEN9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005545#ifdef FEAT_SYN_HL
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005546 case NFA_ZOPEN:
5547 case NFA_ZOPEN1:
5548 case NFA_ZOPEN2:
5549 case NFA_ZOPEN3:
5550 case NFA_ZOPEN4:
5551 case NFA_ZOPEN5:
5552 case NFA_ZOPEN6:
5553 case NFA_ZOPEN7:
5554 case NFA_ZOPEN8:
5555 case NFA_ZOPEN9:
5556 case NFA_ZCLOSE:
5557 case NFA_ZCLOSE1:
5558 case NFA_ZCLOSE2:
5559 case NFA_ZCLOSE3:
5560 case NFA_ZCLOSE4:
5561 case NFA_ZCLOSE5:
5562 case NFA_ZCLOSE6:
5563 case NFA_ZCLOSE7:
5564 case NFA_ZCLOSE8:
5565 case NFA_ZCLOSE9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005566#endif
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005567 case NFA_NOPEN:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005568 case NFA_MCLOSE1:
5569 case NFA_MCLOSE2:
5570 case NFA_MCLOSE3:
5571 case NFA_MCLOSE4:
5572 case NFA_MCLOSE5:
5573 case NFA_MCLOSE6:
5574 case NFA_MCLOSE7:
5575 case NFA_MCLOSE8:
5576 case NFA_MCLOSE9:
5577 case NFA_NCLOSE:
5578 return failure_chance(state->out, depth + 1);
5579
5580 case NFA_BACKREF1:
5581 case NFA_BACKREF2:
5582 case NFA_BACKREF3:
5583 case NFA_BACKREF4:
5584 case NFA_BACKREF5:
5585 case NFA_BACKREF6:
5586 case NFA_BACKREF7:
5587 case NFA_BACKREF8:
5588 case NFA_BACKREF9:
5589#ifdef FEAT_SYN_HL
5590 case NFA_ZREF1:
5591 case NFA_ZREF2:
5592 case NFA_ZREF3:
5593 case NFA_ZREF4:
5594 case NFA_ZREF5:
5595 case NFA_ZREF6:
5596 case NFA_ZREF7:
5597 case NFA_ZREF8:
5598 case NFA_ZREF9:
5599#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005600 // backreferences don't match in many places
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005601 return 94;
5602
5603 case NFA_LNUM_GT:
5604 case NFA_LNUM_LT:
5605 case NFA_COL_GT:
5606 case NFA_COL_LT:
5607 case NFA_VCOL_GT:
5608 case NFA_VCOL_LT:
5609 case NFA_MARK_GT:
5610 case NFA_MARK_LT:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005611 case NFA_VISUAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005612 // before/after positions don't match very often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005613 return 85;
5614
5615 case NFA_LNUM:
5616 return 90;
5617
5618 case NFA_CURSOR:
5619 case NFA_COL:
5620 case NFA_VCOL:
5621 case NFA_MARK:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005622 // specific positions rarely match
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005623 return 98;
5624
5625 case NFA_COMPOSING:
5626 return 95;
5627
5628 default:
5629 if (c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005630 // character match fails often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005631 return 95;
5632 }
5633
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005634 // something else, includes character classes
Bram Moolenaara2d95102013-06-04 14:23:05 +02005635 return 50;
5636}
5637
Bram Moolenaarf46da702013-06-02 22:37:42 +02005638/*
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005639 * Skip until the char "c" we know a match must start with.
5640 */
5641 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005642skip_to_start(int c, colnr_T *colp)
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005643{
5644 char_u *s;
5645
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005646 // Used often, do some work to avoid call overhead.
Bram Moolenaara12a1612019-01-24 16:39:02 +01005647 if (!rex.reg_ic && !has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005648 s = vim_strbyte(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005649 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005650 s = cstrchr(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005651 if (s == NULL)
5652 return FAIL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005653 *colp = (int)(s - rex.line);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005654 return OK;
5655}
5656
5657/*
Bram Moolenaar473de612013-06-08 18:19:48 +02005658 * Check for a match with match_text.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02005659 * Called after skip_to_start() has found regstart.
Bram Moolenaar473de612013-06-08 18:19:48 +02005660 * Returns zero for no match, 1 for a match.
5661 */
5662 static long
Bram Moolenaar79336e12022-12-11 14:18:31 +00005663find_match_text(colnr_T *startcol, int regstart, char_u *match_text)
Bram Moolenaar473de612013-06-08 18:19:48 +02005664{
Bram Moolenaar79336e12022-12-11 14:18:31 +00005665 colnr_T col = *startcol;
Bram Moolenaar473de612013-06-08 18:19:48 +02005666 int c1, c2;
5667 int len1, len2;
5668 int match;
5669
5670 for (;;)
5671 {
5672 match = TRUE;
Christian Brabandt22e8e122024-07-30 20:39:18 +02005673 // skip regstart
5674 len2 = MB_CHAR2LEN(regstart);
5675 if (enc_utf8 && len2 > 1 && MB_CHAR2LEN(PTR2CHAR(rex.line + col)) != len2)
5676 // because of case-folding of the previously matched text, we may need
5677 // to skip fewer bytes than mb_char2len(regstart)
5678 len2 = mb_char2len(utf_fold(regstart));
Bram Moolenaar473de612013-06-08 18:19:48 +02005679 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5680 {
5681 c1 = PTR2CHAR(match_text + len1);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005682 c2 = PTR2CHAR(rex.line + col + len2);
Bram Moolenaar59de4172020-06-09 19:34:54 +02005683 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
Bram Moolenaar473de612013-06-08 18:19:48 +02005684 {
5685 match = FALSE;
5686 break;
5687 }
Bram Moolenaar65b60562021-09-07 19:26:53 +02005688 len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5689 : MB_CHAR2LEN(c2);
Bram Moolenaar473de612013-06-08 18:19:48 +02005690 }
5691 if (match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005692 // check that no composing char follows
Bram Moolenaar473de612013-06-08 18:19:48 +02005693 && !(enc_utf8
Bram Moolenaara12a1612019-01-24 16:39:02 +01005694 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
Bram Moolenaar473de612013-06-08 18:19:48 +02005695 {
5696 cleanup_subexpr();
5697 if (REG_MULTI)
5698 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005699 rex.reg_startpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005700 rex.reg_startpos[0].col = col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005701 rex.reg_endpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005702 rex.reg_endpos[0].col = col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005703 }
5704 else
5705 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005706 rex.reg_startp[0] = rex.line + col;
5707 rex.reg_endp[0] = rex.line + col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005708 }
Bram Moolenaar79336e12022-12-11 14:18:31 +00005709 *startcol = col;
Bram Moolenaar473de612013-06-08 18:19:48 +02005710 return 1L;
5711 }
5712
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005713 // Try finding regstart after the current match.
5714 col += MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005715 if (skip_to_start(regstart, &col) == FAIL)
5716 break;
5717 }
Bram Moolenaar79336e12022-12-11 14:18:31 +00005718
5719 *startcol = col;
Bram Moolenaar473de612013-06-08 18:19:48 +02005720 return 0L;
5721}
5722
5723/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005724 * Main matching routine.
5725 *
Bram Moolenaar0270f382018-07-17 05:43:58 +02005726 * Run NFA to determine whether it matches rex.input.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005727 *
Bram Moolenaar307aa162013-06-02 16:34:21 +02005728 * When "nfa_endp" is not NULL it is a required end-of-match position.
Bram Moolenaar61602c52013-06-01 19:54:43 +02005729 *
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005730 * Return TRUE if there is a match, FALSE if there is no match,
5731 * NFA_TOO_EXPENSIVE if we end up with too many states.
Bram Moolenaarf2118842013-09-25 18:16:38 +02005732 * When there is a match "submatch" contains the positions.
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005733 *
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005734 * Note: Caller must ensure that: start != NULL.
5735 */
5736 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005737nfa_regmatch(
5738 nfa_regprog_T *prog,
5739 nfa_state_T *start,
5740 regsubs_T *submatch,
5741 regsubs_T *m)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005742{
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005743 int result = FALSE;
Bram Moolenaaraaf30472015-01-27 14:40:00 +01005744 size_t size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005745 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005746 int go_to_nextline = FALSE;
5747 nfa_thread_T *t;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005748 nfa_list_T list[2];
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005749 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005750 nfa_list_T *thislist;
5751 nfa_list_T *nextlist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005752 int *listids = NULL;
Bram Moolenaar2338c322018-07-08 19:07:19 +02005753 int listids_len = 0;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005754 nfa_state_T *add_state;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005755 int add_here;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005756 int add_count;
Bram Moolenaar4380d1e2013-06-09 20:51:00 +02005757 int add_off = 0;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005758 int toplevel = start->c == NFA_MOPEN;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005759 regsubs_T *r;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005760#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005761 FILE *debug;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005762#endif
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005763
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005764 // Some patterns may take a long time to match, especially when using
5765 // recursive_regmatch(). Allow interrupting them with CTRL-C.
Bram Moolenaar41f12052013-08-25 17:01:42 +02005766 fast_breakcheck();
5767 if (got_int)
5768 return FALSE;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005769#ifdef FEAT_RELTIME
Paul Ollis65745772022-06-05 16:55:54 +01005770 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005771 return FALSE;
5772#endif
Bram Moolenaar41f12052013-08-25 17:01:42 +02005773
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005774#ifdef NFA_REGEXP_DEBUG_LOG
5775 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5776 if (debug == NULL)
5777 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005778 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005779 return FALSE;
5780 }
5781#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02005782 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005783
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005784 // Allocate memory for the lists of nodes.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005785 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005786
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005787 list[0].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005788 list[0].len = prog->nstate + 1;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005789 list[1].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005790 list[1].len = prog->nstate + 1;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005791 if (list[0].t == NULL || list[1].t == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005792 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005793
5794#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02005795 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005796 if (log_fd == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005797 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005798 emsg(_(e_log_open_failed));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005799 log_fd = stderr;
5800 }
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005801 fprintf(log_fd, "**********************************\n");
5802 nfa_set_code(start->c);
5803 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5804 abs(start->id), code);
5805 fprintf(log_fd, "**********************************\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005806#endif
5807
5808 thislist = &list[0];
5809 thislist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005810 thislist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005811 nextlist = &list[1];
5812 nextlist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005813 nextlist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005814#ifdef ENABLE_LOG
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005815 fprintf(log_fd, "(---) STARTSTATE first\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005816#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02005817 thislist->id = rex.nfa_listid + 1;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005818
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005819 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5820 // it's the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005821 if (toplevel)
5822 {
5823 if (REG_MULTI)
5824 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005825 m->norm.list.multi[0].start_lnum = rex.lnum;
5826 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar79336e12022-12-11 14:18:31 +00005827 m->norm.orig_start_col = m->norm.list.multi[0].start_col;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005828 }
5829 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005830 m->norm.list.line[0].start = rex.input;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005831 m->norm.in_use = 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005832 r = addstate(thislist, start->out, m, NULL, 0);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005833 }
5834 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005835 r = addstate(thislist, start, m, NULL, 0);
5836 if (r == NULL)
5837 {
5838 nfa_match = NFA_TOO_EXPENSIVE;
5839 goto theend;
5840 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005841
Bram Moolenaarebfec1c2023-01-22 21:14:53 +00005842#define ADD_STATE_IF_MATCH(state) \
5843 if (result) \
5844 { \
5845 add_state = state->out; \
5846 add_off = clen; \
Bram Moolenaara2d95102013-06-04 14:23:05 +02005847 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005848
5849 /*
5850 * Run for each character.
5851 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02005852 for (;;)
5853 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005854 int curc;
5855 int clen;
5856
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005857 if (has_mbyte)
5858 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005859 curc = (*mb_ptr2char)(rex.input);
5860 clen = (*mb_ptr2len)(rex.input);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005861 }
5862 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005863 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005864 curc = *rex.input;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005865 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005866 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005867 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02005868 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005869 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005870 go_to_nextline = FALSE;
5871 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005872
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005873 // swap lists
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005874 thislist = &list[flag];
5875 nextlist = &list[flag ^= 1];
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005876 nextlist->n = 0; // clear nextlist
Bram Moolenaar196ed142013-07-21 18:59:24 +02005877 nextlist->has_pim = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005878 ++rex.nfa_listid;
Bram Moolenaarbcf94422018-06-23 14:21:42 +02005879 if (prog->re_engine == AUTOMATIC_ENGINE
Bram Moolenaar0270f382018-07-17 05:43:58 +02005880 && (rex.nfa_listid >= NFA_MAX_STATES
Bram Moolenaar5ec74142018-06-23 17:14:41 +02005881# ifdef FEAT_EVAL
5882 || nfa_fail_for_testing
5883# endif
5884 ))
Bram Moolenaarfda37292014-11-05 14:27:36 +01005885 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005886 // too many states, retry with old engine
Bram Moolenaarfda37292014-11-05 14:27:36 +01005887 nfa_match = NFA_TOO_EXPENSIVE;
5888 goto theend;
5889 }
5890
Bram Moolenaar0270f382018-07-17 05:43:58 +02005891 thislist->id = rex.nfa_listid;
5892 nextlist->id = rex.nfa_listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005893
5894#ifdef ENABLE_LOG
5895 fprintf(log_fd, "------------------------------------------\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02005896 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005897 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005898 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005899 {
5900 int i;
5901
5902 for (i = 0; i < thislist->n; i++)
5903 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5904 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005905 fprintf(log_fd, "\n");
5906#endif
5907
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005908#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005909 fprintf(debug, "\n-------------------\n");
5910#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005911 /*
5912 * If the state lists are empty we can stop.
5913 */
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005914 if (thislist->n == 0)
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005915 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005916
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005917 // compute nextlist
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005918 for (listidx = 0; listidx < thislist->n; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005919 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005920 // If the list gets very long there probably is something wrong.
5921 // At least allow interrupting with CTRL-C.
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005922 fast_breakcheck();
5923 if (got_int)
5924 break;
5925#ifdef FEAT_RELTIME
Paul Ollis65745772022-06-05 16:55:54 +01005926 if (nfa_did_time_out())
Bram Moolenaar305abc62022-05-28 11:08:40 +01005927 break;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005928#endif
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005929 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005930
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005931#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005932 nfa_set_code(t->state->c);
5933 fprintf(debug, "%s, ", code);
5934#endif
5935#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005936 {
5937 int col;
5938
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02005939 if (t->subs.norm.in_use <= 0)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005940 col = -1;
5941 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005942 col = t->subs.norm.list.multi[0].start_col;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005943 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005944 col = (int)(t->subs.norm.list.line[0].start - rex.line);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005945 nfa_set_code(t->state->c);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005946 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005947 abs(t->state->id), (int)t->state->c, code, col,
5948 pim_info(&t->pim));
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005949 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005950#endif
5951
5952 /*
5953 * Handle the possible codes of the current state.
5954 * The most important is NFA_MATCH.
5955 */
Bram Moolenaara2d95102013-06-04 14:23:05 +02005956 add_state = NULL;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005957 add_here = FALSE;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005958 add_count = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005959 switch (t->state->c)
5960 {
5961 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005962 {
Bram Moolenaaref2dff52020-12-21 14:54:32 +01005963 // If the match is not at the start of the line, ends before a
5964 // composing character and rex.reg_icombine is not set, that
5965 // is not really a match.
5966 if (enc_utf8 && !rex.reg_icombine
5967 && rex.input != rex.line && utf_iscomposing(curc))
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005968 break;
Bram Moolenaara12a1612019-01-24 16:39:02 +01005969
Bram Moolenaar963fee22013-05-26 21:47:28 +02005970 nfa_match = TRUE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005971 copy_sub(&submatch->norm, &t->subs.norm);
5972#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005973 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005974 copy_sub(&submatch->synt, &t->subs.synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005975#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005976#ifdef ENABLE_LOG
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005977 log_subsexpr(&t->subs);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005978#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005979 // Found the left-most longest match, do not look at any other
5980 // states at this position. When the list of states is going
5981 // to be empty quit without advancing, so that "rex.input" is
5982 // correct.
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005983 if (nextlist->n == 0)
Bram Moolenaar57a285b2013-05-26 16:57:28 +02005984 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005985 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005986 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005987
5988 case NFA_END_INVISIBLE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005989 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02005990 case NFA_END_PATTERN:
Bram Moolenaarf46da702013-06-02 22:37:42 +02005991 /*
5992 * This is only encountered after a NFA_START_INVISIBLE or
Bram Moolenaar61602c52013-06-01 19:54:43 +02005993 * NFA_START_INVISIBLE_BEFORE node.
5994 * They surround a zero-width group, used with "\@=", "\&",
5995 * "\@!", "\@<=" and "\@<!".
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005996 * If we got here, it means that the current "invisible" group
5997 * finished successfully, so return control to the parent
Bram Moolenaarf46da702013-06-02 22:37:42 +02005998 * nfa_regmatch(). For a look-behind match only when it ends
5999 * in the position in "nfa_endp".
6000 * Submatches are stored in *m, and used in the parent call.
6001 */
Bram Moolenaar61602c52013-06-01 19:54:43 +02006002#ifdef ENABLE_LOG
Bram Moolenaarf46da702013-06-02 22:37:42 +02006003 if (nfa_endp != NULL)
6004 {
6005 if (REG_MULTI)
6006 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02006007 (int)rex.lnum,
Bram Moolenaarf46da702013-06-02 22:37:42 +02006008 (int)nfa_endp->se_u.pos.lnum,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006009 (int)(rex.input - rex.line),
Bram Moolenaarf46da702013-06-02 22:37:42 +02006010 nfa_endp->se_u.pos.col);
6011 else
6012 fprintf(log_fd, "Current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02006013 (int)(rex.input - rex.line),
6014 (int)(nfa_endp->se_u.ptr - rex.input));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006015 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02006016#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006017 // If "nfa_endp" is set it's only a match if it ends at
6018 // "nfa_endp"
Bram Moolenaarf46da702013-06-02 22:37:42 +02006019 if (nfa_endp != NULL && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006020 ? (rex.lnum != nfa_endp->se_u.pos.lnum
6021 || (int)(rex.input - rex.line)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006022 != nfa_endp->se_u.pos.col)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006023 : rex.input != nfa_endp->se_u.ptr))
Bram Moolenaarf46da702013-06-02 22:37:42 +02006024 break;
6025
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006026 // do not set submatches for \@!
Bram Moolenaardecd9542013-06-07 16:31:50 +02006027 if (t->state->c != NFA_END_INVISIBLE_NEG)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006028 {
6029 copy_sub(&m->norm, &t->subs.norm);
6030#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006031 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006032 copy_sub(&m->synt, &t->subs.synt);
6033#endif
6034 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006035#ifdef ENABLE_LOG
6036 fprintf(log_fd, "Match found:\n");
6037 log_subsexpr(m);
6038#endif
Bram Moolenaarf46da702013-06-02 22:37:42 +02006039 nfa_match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006040 // See comment above at "goto nextchar".
Bram Moolenaar78c93e42013-09-05 16:05:36 +02006041 if (nextlist->n == 0)
6042 clen = 0;
6043 goto nextchar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006044
6045 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006046 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006047 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006048 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar61602c52013-06-01 19:54:43 +02006049 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006050 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006051 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006052 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006053 {
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02006054#ifdef ENABLE_LOG
6055 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
6056 failure_chance(t->state->out, 0),
6057 failure_chance(t->state->out1->out, 0));
Bram Moolenaarb76591e2013-06-04 21:42:22 +02006058#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006059 // Do it directly if there already is a PIM or when
6060 // nfa_postprocess() detected it will work better.
Bram Moolenaara2947e22013-06-11 22:44:09 +02006061 if (t->pim.result != NFA_PIM_UNUSED
6062 || t->state->c == NFA_START_INVISIBLE_FIRST
6063 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6064 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
6065 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006066 {
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006067 int in_use = m->norm.in_use;
6068
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006069 // Copy submatch info for the recursive call, opposite
6070 // of what happens on success below.
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006071 copy_sub_off(&m->norm, &t->subs.norm);
Bram Moolenaar699c1202013-09-25 16:41:54 +02006072#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006073 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006074 copy_sub_off(&m->synt, &t->subs.synt);
6075#endif
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006076
Bram Moolenaara2d95102013-06-04 14:23:05 +02006077 /*
6078 * First try matching the invisible match, then what
6079 * follows.
6080 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006081 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006082 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006083 if (result == NFA_TOO_EXPENSIVE)
6084 {
6085 nfa_match = result;
6086 goto theend;
6087 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006088
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006089 // for \@! and \@<! it is a match when the result is
6090 // FALSE
Bram Moolenaardecd9542013-06-07 16:31:50 +02006091 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006092 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6093 || t->state->c
6094 == NFA_START_INVISIBLE_BEFORE_NEG
6095 || t->state->c
6096 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006097 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006098 // Copy submatch info from the recursive call
Bram Moolenaara2d95102013-06-04 14:23:05 +02006099 copy_sub_off(&t->subs.norm, &m->norm);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006100#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006101 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006102 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006103#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006104 // If the pattern has \ze and it matched in the
6105 // sub pattern, use it.
Bram Moolenaarf2118842013-09-25 18:16:38 +02006106 copy_ze_off(&t->subs.norm, &m->norm);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02006107
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006108 // t->state->out1 is the corresponding
6109 // END_INVISIBLE node; Add its out to the current
6110 // list (zero-width match).
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006111 add_here = TRUE;
6112 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006113 }
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006114 m->norm.in_use = in_use;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006115 }
6116 else
6117 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006118 nfa_pim_T pim;
6119
Bram Moolenaara2d95102013-06-04 14:23:05 +02006120 /*
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006121 * First try matching what follows. Only if a match
6122 * is found verify the invisible match matches. Add a
6123 * nfa_pim_T to the following states, it contains info
6124 * about the invisible match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006125 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006126 pim.state = t->state;
6127 pim.result = NFA_PIM_TODO;
6128 pim.subs.norm.in_use = 0;
6129#ifdef FEAT_SYN_HL
6130 pim.subs.synt.in_use = 0;
6131#endif
6132 if (REG_MULTI)
6133 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006134 pim.end.pos.col = (int)(rex.input - rex.line);
6135 pim.end.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006136 }
6137 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006138 pim.end.ptr = rex.input;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006139
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006140 // t->state->out1 is the corresponding END_INVISIBLE
6141 // node; Add its out to the current list (zero-width
6142 // match).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006143 if (addstate_here(thislist, t->state->out1->out,
6144 &t->subs, &pim, &listidx) == NULL)
6145 {
6146 nfa_match = NFA_TOO_EXPENSIVE;
6147 goto theend;
6148 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006149 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006150 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006151 break;
6152
Bram Moolenaar87953742013-06-05 18:52:40 +02006153 case NFA_START_PATTERN:
Bram Moolenaar43e02982013-06-07 17:31:29 +02006154 {
6155 nfa_state_T *skip = NULL;
6156#ifdef ENABLE_LOG
6157 int skip_lid = 0;
6158#endif
6159
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006160 // There is no point in trying to match the pattern if the
6161 // output state is not going to be added to the list.
Bram Moolenaar43e02982013-06-07 17:31:29 +02006162 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6163 {
6164 skip = t->state->out1->out;
6165#ifdef ENABLE_LOG
6166 skip_lid = nextlist->id;
6167#endif
6168 }
6169 else if (state_in_list(nextlist,
6170 t->state->out1->out->out, &t->subs))
6171 {
6172 skip = t->state->out1->out->out;
6173#ifdef ENABLE_LOG
6174 skip_lid = nextlist->id;
6175#endif
6176 }
Bram Moolenaar44c71db2013-06-14 22:33:51 +02006177 else if (state_in_list(thislist,
Bram Moolenaar43e02982013-06-07 17:31:29 +02006178 t->state->out1->out->out, &t->subs))
6179 {
6180 skip = t->state->out1->out->out;
6181#ifdef ENABLE_LOG
6182 skip_lid = thislist->id;
6183#endif
6184 }
6185 if (skip != NULL)
6186 {
6187#ifdef ENABLE_LOG
6188 nfa_set_code(skip->c);
6189 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6190 abs(skip->id), skip_lid, skip->c, code);
6191#endif
6192 break;
6193 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006194 // Copy submatch info to the recursive call, opposite of what
6195 // happens afterwards.
Bram Moolenaar699c1202013-09-25 16:41:54 +02006196 copy_sub_off(&m->norm, &t->subs.norm);
6197#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006198 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006199 copy_sub_off(&m->synt, &t->subs.synt);
6200#endif
Bram Moolenaar43e02982013-06-07 17:31:29 +02006201
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006202 // First try matching the pattern.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006203 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006204 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006205 if (result == NFA_TOO_EXPENSIVE)
6206 {
6207 nfa_match = result;
6208 goto theend;
6209 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006210 if (result)
6211 {
6212 int bytelen;
6213
6214#ifdef ENABLE_LOG
6215 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6216 log_subsexpr(m);
6217#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006218 // Copy submatch info from the recursive call
Bram Moolenaar87953742013-06-05 18:52:40 +02006219 copy_sub_off(&t->subs.norm, &m->norm);
6220#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006221 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006222 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaar87953742013-06-05 18:52:40 +02006223#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006224 // Now we need to skip over the matched text and then
6225 // continue with what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02006226 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006227 // TODO: multi-line match
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01006228 bytelen = m->norm.list.multi[0].end_col
Bram Moolenaar0270f382018-07-17 05:43:58 +02006229 - (int)(rex.input - rex.line);
Bram Moolenaar87953742013-06-05 18:52:40 +02006230 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006231 bytelen = (int)(m->norm.list.line[0].end - rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02006232
6233#ifdef ENABLE_LOG
6234 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6235#endif
6236 if (bytelen == 0)
6237 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006238 // empty match, output of corresponding
6239 // NFA_END_PATTERN/NFA_SKIP to be used at current
6240 // position
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006241 add_here = TRUE;
6242 add_state = t->state->out1->out->out;
Bram Moolenaar87953742013-06-05 18:52:40 +02006243 }
6244 else if (bytelen <= clen)
6245 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006246 // match current character, output of corresponding
6247 // NFA_END_PATTERN to be used at next position.
Bram Moolenaar87953742013-06-05 18:52:40 +02006248 add_state = t->state->out1->out->out;
6249 add_off = clen;
6250 }
6251 else
6252 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006253 // skip over the matched characters, set character
6254 // count in NFA_SKIP
Bram Moolenaar87953742013-06-05 18:52:40 +02006255 add_state = t->state->out1->out;
6256 add_off = bytelen;
6257 add_count = bytelen - clen;
6258 }
6259 }
6260 break;
Bram Moolenaar43e02982013-06-07 17:31:29 +02006261 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006262
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006263 case NFA_BOL:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006264 if (rex.input == rex.line)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006265 {
6266 add_here = TRUE;
6267 add_state = t->state->out;
6268 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006269 break;
6270
6271 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006272 if (curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006273 {
6274 add_here = TRUE;
6275 add_state = t->state->out;
6276 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006277 break;
6278
6279 case NFA_BOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006280 result = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006281
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006282 if (curc == NUL)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006283 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006284 else if (has_mbyte)
6285 {
6286 int this_class;
6287
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006288 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006289 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006290 if (this_class <= 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006291 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006292 else if (reg_prev_class() == this_class)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006293 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006294 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006295 else if (!vim_iswordc_buf(curc, rex.reg_buf)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006296 || (rex.input > rex.line
Bram Moolenaarc96311b2022-11-25 21:13:47 +00006297 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006298 result = FALSE;
6299 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006300 {
6301 add_here = TRUE;
6302 add_state = t->state->out;
6303 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006304 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006305
6306 case NFA_EOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006307 result = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006308 if (rex.input == rex.line)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006309 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006310 else if (has_mbyte)
6311 {
6312 int this_class, prev_class;
6313
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006314 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006315 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006316 prev_class = reg_prev_class();
6317 if (this_class == prev_class
6318 || prev_class == 0 || prev_class == 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006319 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006320 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02006321 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6322 || (rex.input[0] != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006323 && vim_iswordc_buf(curc, rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006324 result = FALSE;
6325 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006326 {
6327 add_here = TRUE;
6328 add_state = t->state->out;
6329 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006330 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006331
Bram Moolenaar4b780632013-05-31 22:14:52 +02006332 case NFA_BOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006333 if (rex.lnum == 0 && rex.input == rex.line
Bram Moolenaar6100d022016-10-02 16:51:57 +02006334 && (!REG_MULTI || rex.reg_firstlnum == 1))
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006335 {
6336 add_here = TRUE;
6337 add_state = t->state->out;
6338 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006339 break;
6340
6341 case NFA_EOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006342 if (rex.lnum == rex.reg_maxline && curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006343 {
6344 add_here = TRUE;
6345 add_state = t->state->out;
6346 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006347 break;
6348
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006349 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006350 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006351 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02006352 int len = 0;
6353 nfa_state_T *end;
6354 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006355 int cchars[MAX_MCO];
6356 int ccount = 0;
6357 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006358
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006359 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006360 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006361 if (utf_iscomposing(sta->c))
6362 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006363 // Only match composing character(s), ignore base
6364 // character. Used for ".{composing}" and "{composing}"
6365 // (no preceding character).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006366 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006367 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006368 if (rex.reg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006369 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006370 // If \Z was present, then ignore composing characters.
6371 // When ignoring the base character this always matches.
Bram Moolenaardff72ba2018-02-08 22:45:17 +01006372 if (sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006373 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006374 else
6375 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006376 while (sta->c != NFA_END_COMPOSING)
6377 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006378 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006379
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006380 // Check base character matches first, unless ignored.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006381 else if (len > 0 || mc == sta->c)
6382 {
6383 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006384 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006385 len += mb_char2len(mc);
6386 sta = sta->out;
6387 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006388
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006389 // We don't care about the order of composing characters.
6390 // Get them into cchars[] first.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006391 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006392 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006393 mc = mb_ptr2char(rex.input + len);
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006394 cchars[ccount++] = mc;
6395 len += mb_char2len(mc);
6396 if (ccount == MAX_MCO)
6397 break;
6398 }
6399
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006400 // Check that each composing char in the pattern matches a
6401 // composing char in the text. We do not check if all
6402 // composing chars are matched.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006403 result = OK;
6404 while (sta->c != NFA_END_COMPOSING)
6405 {
6406 for (j = 0; j < ccount; ++j)
6407 if (cchars[j] == sta->c)
6408 break;
6409 if (j == ccount)
6410 {
6411 result = FAIL;
6412 break;
6413 }
6414 sta = sta->out;
6415 }
6416 }
6417 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02006418 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006419
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006420 end = t->state->out1; // NFA_END_COMPOSING
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006421 ADD_STATE_IF_MATCH(end);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006422 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006423 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006424
6425 case NFA_NEWL:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006426 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
Bram Moolenaarc96311b2022-11-25 21:13:47 +00006427 && rex.lnum <= rex.reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006428 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02006429 go_to_nextline = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006430 // Pass -1 for the offset, which means taking the position
6431 // at the start of the next line.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006432 add_state = t->state->out;
6433 add_off = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006434 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006435 else if (curc == '\n' && rex.reg_line_lbr)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006436 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006437 // match \n as if it is an ordinary character
Bram Moolenaara2d95102013-06-04 14:23:05 +02006438 add_state = t->state->out;
6439 add_off = 1;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006440 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006441 break;
6442
Bram Moolenaar417bad22013-06-07 14:08:30 +02006443 case NFA_START_COLL:
6444 case NFA_START_NEG_COLL:
6445 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006446 // What follows is a list of characters, until NFA_END_COLL.
6447 // One of them must match or none of them must match.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006448 nfa_state_T *state;
6449 int result_if_matched;
6450 int c1, c2;
6451
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006452 // Never match EOL. If it's part of the collection it is added
6453 // as a separate state with an OR.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006454 if (curc == NUL)
6455 break;
6456
6457 state = t->state->out;
6458 result_if_matched = (t->state->c == NFA_START_COLL);
6459 for (;;)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006460 {
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01006461 if (state->c == NFA_COMPOSING)
6462 {
6463 int mc = curc;
6464 int len = 0;
6465 nfa_state_T *end;
6466 nfa_state_T *sta;
6467 int cchars[MAX_MCO];
6468 int ccount = 0;
6469 int j;
6470
6471 sta = t->state->out->out;
6472 len = 0;
6473 if (utf_iscomposing(sta->c))
6474 {
6475 // Only match composing character(s), ignore base
6476 // character. Used for ".{composing}" and "{composing}"
6477 // (no preceding character).
6478 len += mb_char2len(mc);
6479 }
6480 if (rex.reg_icombine && len == 0)
6481 {
6482 // If \Z was present, then ignore composing characters.
6483 // When ignoring the base character this always matches.
6484 if (sta->c != curc)
6485 result = FAIL;
6486 else
6487 result = OK;
6488 while (sta->c != NFA_END_COMPOSING)
6489 sta = sta->out;
6490 }
6491 // Check base character matches first, unless ignored.
6492 else if (len > 0 || mc == sta->c)
6493// if (len > 0 || mc == sta->c)
6494 {
6495 if (len == 0)
6496 {
6497 len += mb_char2len(mc);
6498 sta = sta->out;
6499 }
6500
6501 // We don't care about the order of composing characters.
6502 // Get them into cchars[] first.
6503 while (len < clen)
6504 {
6505 mc = mb_ptr2char(rex.input + len);
6506 cchars[ccount++] = mc;
6507 len += mb_char2len(mc);
6508 if (ccount == MAX_MCO)
6509 break;
6510 }
6511
6512 // Check that each composing char in the pattern matches a
6513 // composing char in the text. We do not check if all
6514 // composing chars are matched.
6515 result = OK;
6516 while (sta->c != NFA_END_COMPOSING)
6517 {
6518 for (j = 0; j < ccount; ++j)
6519 if (cchars[j] == sta->c)
6520 break;
6521 if (j == ccount)
6522 {
6523 result = FAIL;
6524 break;
6525 }
6526 sta = sta->out;
6527 }
6528 }
6529 else
6530 result = FAIL;
6531
Christian Brabandtc3a02d72024-08-28 23:17:52 +02006532 if (t->state->out->out1 != NULL
6533 && t->state->out->out1->c == NFA_END_COMPOSING)
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01006534 {
6535 end = t->state->out->out1;
6536 ADD_STATE_IF_MATCH(end);
6537 }
6538 break;
6539 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02006540 if (state->c == NFA_END_COLL)
6541 {
6542 result = !result_if_matched;
6543 break;
6544 }
6545 if (state->c == NFA_RANGE_MIN)
6546 {
6547 c1 = state->val;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006548 state = state->out; // advance to NFA_RANGE_MAX
Bram Moolenaar417bad22013-06-07 14:08:30 +02006549 c2 = state->val;
6550#ifdef ENABLE_LOG
6551 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6552 curc, c1, c2);
6553#endif
6554 if (curc >= c1 && curc <= c2)
6555 {
6556 result = result_if_matched;
6557 break;
6558 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006559 if (rex.reg_ic)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006560 {
Bram Moolenaar59de4172020-06-09 19:34:54 +02006561 int curc_low = MB_CASEFOLD(curc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02006562 int done = FALSE;
6563
6564 for ( ; c1 <= c2; ++c1)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006565 if (MB_CASEFOLD(c1) == curc_low)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006566 {
6567 result = result_if_matched;
6568 done = TRUE;
6569 break;
6570 }
6571 if (done)
6572 break;
6573 }
6574 }
6575 else if (state->c < 0 ? check_char_class(state->c, curc)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01006576 : (curc == state->c
Bram Moolenaar59de4172020-06-09 19:34:54 +02006577 || (rex.reg_ic && MB_CASEFOLD(curc)
6578 == MB_CASEFOLD(state->c))))
Bram Moolenaar417bad22013-06-07 14:08:30 +02006579 {
6580 result = result_if_matched;
6581 break;
6582 }
6583 state = state->out;
6584 }
6585 if (result)
6586 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006587 // next state is in out of the NFA_END_COLL, out1 of
6588 // START points to the END state
Bram Moolenaar417bad22013-06-07 14:08:30 +02006589 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006590 add_off = clen;
6591 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006592 break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02006593 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006594
6595 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006596 // Any char matches, except '\0' (end of input).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006597 if (curc > 0)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006598 {
Bram Moolenaara2d95102013-06-04 14:23:05 +02006599 add_state = t->state->out;
6600 add_off = clen;
6601 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006602 break;
6603
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006604 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006605 // On a composing character skip over it. Otherwise do
6606 // nothing. Always matches.
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006607 if (enc_utf8 && utf_iscomposing(curc))
6608 {
6609 add_off = clen;
6610 }
6611 else
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006612 {
6613 add_here = TRUE;
6614 add_off = 0;
6615 }
6616 add_state = t->state->out;
6617 break;
6618
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006619 /*
6620 * Character classes like \a for alpha, \d for digit etc.
6621 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006622 case NFA_IDENT: // \i
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006623 result = vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006624 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006625 break;
6626
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006627 case NFA_SIDENT: // \I
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006628 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006629 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006630 break;
6631
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006632 case NFA_KWORD: // \k
Bram Moolenaar0270f382018-07-17 05:43:58 +02006633 result = vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006634 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006635 break;
6636
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006637 case NFA_SKWORD: // \K
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006638 result = !VIM_ISDIGIT(curc)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006639 && vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006640 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006641 break;
6642
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006643 case NFA_FNAME: // \f
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006644 result = vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006645 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006646 break;
6647
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006648 case NFA_SFNAME: // \F
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006649 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006650 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006651 break;
6652
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006653 case NFA_PRINT: // \p
Bram Moolenaar0270f382018-07-17 05:43:58 +02006654 result = vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006655 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006656 break;
6657
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006658 case NFA_SPRINT: // \P
Bram Moolenaar0270f382018-07-17 05:43:58 +02006659 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006660 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006661 break;
6662
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006663 case NFA_WHITE: // \s
Bram Moolenaar1c465442017-03-12 20:10:05 +01006664 result = VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006665 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006666 break;
6667
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006668 case NFA_NWHITE: // \S
Bram Moolenaar1c465442017-03-12 20:10:05 +01006669 result = curc != NUL && !VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006670 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006671 break;
6672
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006673 case NFA_DIGIT: // \d
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006674 result = ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006675 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006676 break;
6677
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006678 case NFA_NDIGIT: // \D
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006679 result = curc != NUL && !ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006680 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006681 break;
6682
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006683 case NFA_HEX: // \x
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006684 result = ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006685 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006686 break;
6687
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006688 case NFA_NHEX: // \X
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006689 result = curc != NUL && !ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006690 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006691 break;
6692
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006693 case NFA_OCTAL: // \o
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006694 result = ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006695 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006696 break;
6697
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006698 case NFA_NOCTAL: // \O
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006699 result = curc != NUL && !ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006700 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006701 break;
6702
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006703 case NFA_WORD: // \w
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006704 result = ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006705 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006706 break;
6707
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006708 case NFA_NWORD: // \W
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006709 result = curc != NUL && !ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006710 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006711 break;
6712
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006713 case NFA_HEAD: // \h
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006714 result = ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006715 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006716 break;
6717
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006718 case NFA_NHEAD: // \H
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006719 result = curc != NUL && !ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006720 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006721 break;
6722
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006723 case NFA_ALPHA: // \a
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006724 result = ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006725 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006726 break;
6727
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006728 case NFA_NALPHA: // \A
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006729 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006730 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006731 break;
6732
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006733 case NFA_LOWER: // \l
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006734 result = ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006735 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006736 break;
6737
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006738 case NFA_NLOWER: // \L
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006739 result = curc != NUL && !ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006740 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006741 break;
6742
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006743 case NFA_UPPER: // \u
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006744 result = ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006745 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006746 break;
6747
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006748 case NFA_NUPPER: // \U
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006749 result = curc != NUL && !ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006750 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006751 break;
6752
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006753 case NFA_LOWER_IC: // [a-z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006754 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006755 ADD_STATE_IF_MATCH(t->state);
6756 break;
6757
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006758 case NFA_NLOWER_IC: // [^a-z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006759 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006760 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006761 ADD_STATE_IF_MATCH(t->state);
6762 break;
6763
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006764 case NFA_UPPER_IC: // [A-Z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006765 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006766 ADD_STATE_IF_MATCH(t->state);
6767 break;
6768
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006769 case NFA_NUPPER_IC: // ^[A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006770 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006771 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006772 ADD_STATE_IF_MATCH(t->state);
6773 break;
6774
Bram Moolenaar5714b802013-05-28 22:03:20 +02006775 case NFA_BACKREF1:
6776 case NFA_BACKREF2:
6777 case NFA_BACKREF3:
6778 case NFA_BACKREF4:
6779 case NFA_BACKREF5:
6780 case NFA_BACKREF6:
6781 case NFA_BACKREF7:
6782 case NFA_BACKREF8:
6783 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006784#ifdef FEAT_SYN_HL
6785 case NFA_ZREF1:
6786 case NFA_ZREF2:
6787 case NFA_ZREF3:
6788 case NFA_ZREF4:
6789 case NFA_ZREF5:
6790 case NFA_ZREF6:
6791 case NFA_ZREF7:
6792 case NFA_ZREF8:
6793 case NFA_ZREF9:
6794#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006795 // \1 .. \9 \z1 .. \z9
Bram Moolenaar5714b802013-05-28 22:03:20 +02006796 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006797 int subidx;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006798 int bytelen;
6799
Bram Moolenaar1f761382023-03-25 11:31:32 +00006800#ifdef FEAT_SYN_HL
6801 if (t->state->c >= NFA_BACKREF1 && t->state->c <= NFA_BACKREF9)
6802#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006803 {
6804 subidx = t->state->c - NFA_BACKREF1 + 1;
6805 result = match_backref(&t->subs.norm, subidx, &bytelen);
6806 }
6807#ifdef FEAT_SYN_HL
6808 else
6809 {
6810 subidx = t->state->c - NFA_ZREF1 + 1;
6811 result = match_zref(subidx, &bytelen);
6812 }
6813#endif
6814
Bram Moolenaar5714b802013-05-28 22:03:20 +02006815 if (result)
6816 {
6817 if (bytelen == 0)
6818 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006819 // empty match always works, output of NFA_SKIP to be
6820 // used next
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006821 add_here = TRUE;
6822 add_state = t->state->out->out;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006823 }
6824 else if (bytelen <= clen)
6825 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006826 // match current character, jump ahead to out of
6827 // NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006828 add_state = t->state->out->out;
6829 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006830 }
6831 else
6832 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006833 // skip over the matched characters, set character
6834 // count in NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006835 add_state = t->state->out;
6836 add_off = bytelen;
6837 add_count = bytelen - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006838 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02006839 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02006840 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006841 }
6842 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006843 // character of previous matching \1 .. \9 or \@>
Bram Moolenaar5714b802013-05-28 22:03:20 +02006844 if (t->count - clen <= 0)
6845 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006846 // end of match, go to what follows
Bram Moolenaara2d95102013-06-04 14:23:05 +02006847 add_state = t->state->out;
6848 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006849 }
6850 else
6851 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006852 // add state again with decremented count
Bram Moolenaara2d95102013-06-04 14:23:05 +02006853 add_state = t->state;
6854 add_off = 0;
6855 add_count = t->count - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006856 }
6857 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006858
Bram Moolenaar423532e2013-05-29 21:14:42 +02006859 case NFA_LNUM:
6860 case NFA_LNUM_GT:
6861 case NFA_LNUM_LT:
6862 result = (REG_MULTI &&
6863 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006864 (long_u)(rex.lnum + rex.reg_firstlnum)));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006865 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006866 {
6867 add_here = TRUE;
6868 add_state = t->state->out;
6869 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006870 break;
6871
6872 case NFA_COL:
6873 case NFA_COL_GT:
6874 case NFA_COL_LT:
6875 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006876 (long_u)(rex.input - rex.line) + 1);
Bram Moolenaar423532e2013-05-29 21:14:42 +02006877 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006878 {
6879 add_here = TRUE;
6880 add_state = t->state->out;
6881 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006882 break;
6883
6884 case NFA_VCOL:
6885 case NFA_VCOL_GT:
6886 case NFA_VCOL_LT:
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006887 {
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006888 int op = t->state->c - NFA_VCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006889 colnr_T col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar6100d022016-10-02 16:51:57 +02006890 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006891
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006892 // Bail out quickly when there can't be a match, avoid the
6893 // overhead of win_linetabsize() on long lines.
Bram Moolenaar4f36dc32015-03-05 17:16:06 +01006894 if (op != 1 && col > t->state->val
Bram Moolenaara12a1612019-01-24 16:39:02 +01006895 * (has_mbyte ? MB_MAXBYTES : 1))
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006896 break;
Bram Moolenaaref795d12015-01-18 16:46:32 +01006897 result = FALSE;
6898 if (op == 1 && col - 1 > t->state->val && col > 100)
6899 {
6900 int ts = wp->w_buffer->b_p_ts;
6901
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006902 // Guess that a character won't use more columns than
6903 // 'tabstop', with a minimum of 4.
Bram Moolenaaref795d12015-01-18 16:46:32 +01006904 if (ts < 4)
6905 ts = 4;
6906 result = col > t->state->val * ts;
6907 }
6908 if (!result)
Bram Moolenaar13ed4942022-08-19 13:59:25 +01006909 {
Bram Moolenaar753aead2022-09-08 12:17:06 +01006910 linenr_T lnum = REG_MULTI
6911 ? rex.reg_firstlnum + rex.lnum : 1;
6912 long_u vcol;
Bram Moolenaar13ed4942022-08-19 13:59:25 +01006913
Bram Moolenaar753aead2022-09-08 12:17:06 +01006914 if (REG_MULTI && (lnum <= 0
6915 || lnum > wp->w_buffer->b_ml.ml_line_count))
6916 lnum = 1;
Bram Moolenaar88456cd2022-11-18 22:14:09 +00006917 vcol = (long_u)win_linetabsize(wp, lnum, rex.line, col);
Bram Moolenaar13ed4942022-08-19 13:59:25 +01006918 result = nfa_re_num_cmp(t->state->val, op, vcol + 1);
6919 }
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006920 if (result)
6921 {
6922 add_here = TRUE;
6923 add_state = t->state->out;
6924 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006925 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006926 break;
6927
Bram Moolenaar044aa292013-06-04 21:27:38 +02006928 case NFA_MARK:
6929 case NFA_MARK_GT:
6930 case NFA_MARK_LT:
6931 {
Bram Moolenaarb4ad3b02022-03-30 10:57:45 +01006932 pos_T *pos;
6933 size_t col = REG_MULTI ? rex.input - rex.line : 0;
6934
6935 pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
Bram Moolenaar044aa292013-06-04 21:27:38 +02006936
Bram Moolenaar64066b92021-11-17 18:22:56 +00006937 // Line may have been freed, get it again.
6938 if (REG_MULTI)
6939 {
6940 rex.line = reg_getline(rex.lnum);
6941 rex.input = rex.line + col;
6942 }
6943
Bram Moolenaar872bee52021-05-24 22:56:15 +02006944 // Compare the mark position to the match position, if the mark
6945 // exists and mark is set in reg_buf.
6946 if (pos != NULL && pos->lnum > 0)
6947 {
6948 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6949 && pos->col == MAXCOL
John Marriott82792db2024-05-12 00:07:17 +02006950 ? reg_getline_len(pos->lnum - rex.reg_firstlnum)
Bram Moolenaar872bee52021-05-24 22:56:15 +02006951 : pos->col;
6952
6953 result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6954 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006955 ? t->state->c == NFA_MARK
Bram Moolenaar872bee52021-05-24 22:56:15 +02006956 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006957 ? t->state->c == NFA_MARK_GT
6958 : t->state->c == NFA_MARK_LT))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006959 : (pos->lnum < rex.lnum + rex.reg_firstlnum
Bram Moolenaar044aa292013-06-04 21:27:38 +02006960 ? t->state->c == NFA_MARK_GT
Bram Moolenaar872bee52021-05-24 22:56:15 +02006961 : t->state->c == NFA_MARK_LT));
6962 if (result)
6963 {
6964 add_here = TRUE;
6965 add_state = t->state->out;
6966 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006967 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02006968 break;
6969 }
6970
Bram Moolenaar423532e2013-05-29 21:14:42 +02006971 case NFA_CURSOR:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006972 result = (rex.reg_win != NULL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006973 && (rex.lnum + rex.reg_firstlnum
Bram Moolenaar6100d022016-10-02 16:51:57 +02006974 == rex.reg_win->w_cursor.lnum)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006975 && ((colnr_T)(rex.input - rex.line)
Bram Moolenaar6100d022016-10-02 16:51:57 +02006976 == rex.reg_win->w_cursor.col));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006977 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006978 {
6979 add_here = TRUE;
6980 add_state = t->state->out;
6981 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006982 break;
6983
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006984 case NFA_VISUAL:
6985 result = reg_match_visual();
6986 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006987 {
6988 add_here = TRUE;
6989 add_state = t->state->out;
6990 }
Bram Moolenaar973fced2013-06-05 21:10:59 +02006991 break;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006992
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006993 case NFA_MOPEN1:
6994 case NFA_MOPEN2:
6995 case NFA_MOPEN3:
6996 case NFA_MOPEN4:
6997 case NFA_MOPEN5:
6998 case NFA_MOPEN6:
6999 case NFA_MOPEN7:
7000 case NFA_MOPEN8:
7001 case NFA_MOPEN9:
7002#ifdef FEAT_SYN_HL
7003 case NFA_ZOPEN:
7004 case NFA_ZOPEN1:
7005 case NFA_ZOPEN2:
7006 case NFA_ZOPEN3:
7007 case NFA_ZOPEN4:
7008 case NFA_ZOPEN5:
7009 case NFA_ZOPEN6:
7010 case NFA_ZOPEN7:
7011 case NFA_ZOPEN8:
7012 case NFA_ZOPEN9:
7013#endif
7014 case NFA_NOPEN:
7015 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007016 // These states are only added to be able to bail out when
7017 // they are added again, nothing is to be done.
Bram Moolenaar398d53d2013-08-01 15:45:52 +02007018 break;
7019
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007020 default: // regular character
Bram Moolenaarc4912e52013-05-26 19:19:52 +02007021 {
7022 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02007023
Bram Moolenaar398d53d2013-08-01 15:45:52 +02007024#ifdef DEBUG
Bram Moolenaardecd9542013-06-07 16:31:50 +02007025 if (c < 0)
Bram Moolenaar097c5372023-05-24 21:02:24 +01007026 siemsg("Negative state char: %ld", (long)c);
Bram Moolenaar398d53d2013-08-01 15:45:52 +02007027#endif
Bram Moolenaarc4912e52013-05-26 19:19:52 +02007028 result = (c == curc);
7029
Bram Moolenaar6100d022016-10-02 16:51:57 +02007030 if (!result && rex.reg_ic)
Bram Moolenaar59de4172020-06-09 19:34:54 +02007031 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007032 // If rex.reg_icombine is not set only skip over the character
7033 // itself. When it is set skip over composing characters.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007034 if (result && enc_utf8 && !rex.reg_icombine)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007035 clen = utf_ptr2len(rex.input);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007036 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007037 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02007038 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02007039
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007040 } // switch (t->state->c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02007041
7042 if (add_state != NULL)
7043 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007044 nfa_pim_T *pim;
Bram Moolenaara951e352013-10-06 15:46:11 +02007045 nfa_pim_T pim_copy;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007046
7047 if (t->pim.result == NFA_PIM_UNUSED)
7048 pim = NULL;
7049 else
7050 pim = &t->pim;
7051
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007052 // Handle the postponed invisible match if the match might end
7053 // without advancing and before the end of the line.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007054 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
Bram Moolenaara2d95102013-06-04 14:23:05 +02007055 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007056 if (pim->result == NFA_PIM_TODO)
Bram Moolenaara2d95102013-06-04 14:23:05 +02007057 {
7058#ifdef ENABLE_LOG
7059 fprintf(log_fd, "\n");
7060 fprintf(log_fd, "==================================\n");
7061 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
7062 fprintf(log_fd, "\n");
7063#endif
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007064 result = recursive_regmatch(pim->state, pim,
Bram Moolenaar2338c322018-07-08 19:07:19 +02007065 prog, submatch, m, &listids, &listids_len);
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007066 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007067 // for \@! and \@<! it is a match when the result is
7068 // FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007069 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02007070 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
7071 || pim->state->c
7072 == NFA_START_INVISIBLE_BEFORE_NEG
7073 || pim->state->c
7074 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02007075 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007076 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007077 copy_sub_off(&pim->subs.norm, &m->norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007078#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02007079 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007080 copy_sub_off(&pim->subs.synt, &m->synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007081#endif
7082 }
7083 }
7084 else
7085 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007086 result = (pim->result == NFA_PIM_MATCH);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007087#ifdef ENABLE_LOG
7088 fprintf(log_fd, "\n");
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007089 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007090 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
7091 fprintf(log_fd, "\n");
7092#endif
7093 }
7094
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007095 // for \@! and \@<! it is a match when result is FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007096 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02007097 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
7098 || pim->state->c
7099 == NFA_START_INVISIBLE_BEFORE_NEG
7100 || pim->state->c
7101 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02007102 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007103 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007104 copy_sub_off(&t->subs.norm, &pim->subs.norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007105#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02007106 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007107 copy_sub_off(&t->subs.synt, &pim->subs.synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007108#endif
7109 }
7110 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007111 // look-behind match failed, don't add the state
Bram Moolenaara2d95102013-06-04 14:23:05 +02007112 continue;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007113
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007114 // Postponed invisible match was handled, don't add it to
7115 // following states.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007116 pim = NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02007117 }
7118
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007119 // If "pim" points into l->t it will become invalid when
7120 // adding the state causes the list to be reallocated. Make a
7121 // local copy to avoid that.
Bram Moolenaara951e352013-10-06 15:46:11 +02007122 if (pim == &t->pim)
7123 {
7124 copy_pim(&pim_copy, pim);
7125 pim = &pim_copy;
7126 }
7127
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007128 if (add_here)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007129 r = addstate_here(thislist, add_state, &t->subs,
7130 pim, &listidx);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007131 else
7132 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007133 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007134 if (add_count > 0)
7135 nextlist->t[nextlist->n - 1].count = add_count;
7136 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007137 if (r == NULL)
7138 {
7139 nfa_match = NFA_TOO_EXPENSIVE;
7140 goto theend;
7141 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007142 }
7143
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007144 } // for (thislist = thislist; thislist->state; thislist++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007145
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007146 // Look for the start of a match in the current position by adding the
7147 // start state to the list of states.
7148 // The first found match is the leftmost one, thus the order of states
7149 // matters!
7150 // Do not add the start state in recursive calls of nfa_regmatch(),
7151 // because recursive calls should only start in the first position.
7152 // Unless "nfa_endp" is not NULL, then we match the end position.
7153 // Also don't start a match past the first line.
Bram Moolenaar61602c52013-06-01 19:54:43 +02007154 if (nfa_match == FALSE
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007155 && ((toplevel
Bram Moolenaar0270f382018-07-17 05:43:58 +02007156 && rex.lnum == 0
Bram Moolenaar61602c52013-06-01 19:54:43 +02007157 && clen != 0
Bram Moolenaar6100d022016-10-02 16:51:57 +02007158 && (rex.reg_maxcol == 0
Bram Moolenaarc96311b2022-11-25 21:13:47 +00007159 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
Bram Moolenaar307aa162013-06-02 16:34:21 +02007160 || (nfa_endp != NULL
Bram Moolenaar61602c52013-06-01 19:54:43 +02007161 && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007162 ? (rex.lnum < nfa_endp->se_u.pos.lnum
7163 || (rex.lnum == nfa_endp->se_u.pos.lnum
7164 && (int)(rex.input - rex.line)
Bram Moolenaar307aa162013-06-02 16:34:21 +02007165 < nfa_endp->se_u.pos.col))
Bram Moolenaar0270f382018-07-17 05:43:58 +02007166 : rex.input < nfa_endp->se_u.ptr))))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007167 {
7168#ifdef ENABLE_LOG
7169 fprintf(log_fd, "(---) STARTSTATE\n");
7170#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007171 // Inline optimized code for addstate() if we know the state is
7172 // the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007173 if (toplevel)
7174 {
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007175 int add = TRUE;
7176 int c;
7177
7178 if (prog->regstart != NUL && clen != 0)
7179 {
7180 if (nextlist->n == 0)
7181 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007182 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007183
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007184 // Nextlist is empty, we can skip ahead to the
7185 // character that must appear at the start.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007186 if (skip_to_start(prog->regstart, &col) == FAIL)
7187 break;
7188#ifdef ENABLE_LOG
7189 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02007190 col - ((colnr_T)(rex.input - rex.line) + clen));
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007191#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007192 rex.input = rex.line + col - clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007193 }
7194 else
7195 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007196 // Checking if the required start character matches is
7197 // cheaper than adding a state that won't match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02007198 c = PTR2CHAR(rex.input + clen);
Bram Moolenaar6100d022016-10-02 16:51:57 +02007199 if (c != prog->regstart && (!rex.reg_ic
Bram Moolenaar59de4172020-06-09 19:34:54 +02007200 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007201 {
7202#ifdef ENABLE_LOG
7203 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7204#endif
7205 add = FALSE;
7206 }
7207 }
7208 }
7209
7210 if (add)
7211 {
7212 if (REG_MULTI)
Bram Moolenaar79336e12022-12-11 14:18:31 +00007213 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007214 m->norm.list.multi[0].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02007215 (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar79336e12022-12-11 14:18:31 +00007216 m->norm.orig_start_col =
7217 m->norm.list.multi[0].start_col;
7218 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007219 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007220 m->norm.list.line[0].start = rex.input + clen;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007221 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7222 {
7223 nfa_match = NFA_TOO_EXPENSIVE;
7224 goto theend;
7225 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007226 }
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007227 }
7228 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007229 {
7230 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7231 {
7232 nfa_match = NFA_TOO_EXPENSIVE;
7233 goto theend;
7234 }
7235 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007236 }
7237
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007238#ifdef ENABLE_LOG
7239 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007240 {
7241 int i;
7242
7243 for (i = 0; i < thislist->n; i++)
7244 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7245 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007246 fprintf(log_fd, "\n");
7247#endif
7248
7249nextchar:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007250 // Advance to the next character, or advance to the next line, or
7251 // finish.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007252 if (clen != 0)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007253 rex.input += clen;
Bram Moolenaar307aa162013-06-02 16:34:21 +02007254 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007255 && rex.lnum < nfa_endp->se_u.pos.lnum))
Bram Moolenaar35b23862013-05-22 23:00:40 +02007256 reg_nextline();
7257 else
7258 break;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007259
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007260 // Allow interrupting with CTRL-C.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007261 line_breakcheck();
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007262 if (got_int)
7263 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007264#ifdef FEAT_RELTIME
Paul Ollis65745772022-06-05 16:55:54 +01007265 if (nfa_did_time_out())
Bram Moolenaar305abc62022-05-28 11:08:40 +01007266 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007267#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02007268 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007269
7270#ifdef ENABLE_LOG
7271 if (log_fd != stderr)
7272 fclose(log_fd);
7273 log_fd = NULL;
7274#endif
7275
7276theend:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007277 // Free memory
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007278 vim_free(list[0].t);
7279 vim_free(list[1].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02007280 vim_free(listids);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007281#undef ADD_STATE_IF_MATCH
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02007282#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007283 fclose(debug);
7284#endif
7285
Bram Moolenaar963fee22013-05-26 21:47:28 +02007286 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007287}
7288
7289/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02007290 * Try match of "prog" with at rex.line["col"].
Bram Moolenaar8c731502014-11-23 15:57:49 +01007291 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007292 */
7293 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007294nfa_regtry(
7295 nfa_regprog_T *prog,
7296 colnr_T col,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007297 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007298{
7299 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007300 regsubs_T subs, m;
7301 nfa_state_T *start = prog->start;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007302 int result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007303#ifdef ENABLE_LOG
7304 FILE *f;
7305#endif
7306
Bram Moolenaar0270f382018-07-17 05:43:58 +02007307 rex.input = rex.line + col;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007308#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007309 nfa_timed_out = timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007310#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007311
7312#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007313 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007314 if (f != NULL)
7315 {
Bram Moolenaar87953742013-06-05 18:52:40 +02007316 fprintf(f, "\n\n\t=======================================================\n");
Bram Moolenaar097c5372023-05-24 21:02:24 +01007317# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007318 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar097c5372023-05-24 21:02:24 +01007319# endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007320 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02007321 fprintf(f, "\t=======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02007322 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007323 fprintf(f, "\n\n");
7324 fclose(f);
7325 }
7326 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01007327 emsg("Could not open temporary log file for writing");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007328#endif
7329
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007330 clear_sub(&subs.norm);
7331 clear_sub(&m.norm);
7332#ifdef FEAT_SYN_HL
7333 clear_sub(&subs.synt);
7334 clear_sub(&m.synt);
7335#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007336
Bram Moolenaarfda37292014-11-05 14:27:36 +01007337 result = nfa_regmatch(prog, start, &subs, &m);
7338 if (result == FALSE)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007339 return 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007340 else if (result == NFA_TOO_EXPENSIVE)
7341 return result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007342
7343 cleanup_subexpr();
7344 if (REG_MULTI)
7345 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007346 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007347 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007348 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7349 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007350
Bram Moolenaar6100d022016-10-02 16:51:57 +02007351 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7352 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007353 }
Bram Moolenaar79336e12022-12-11 14:18:31 +00007354 if (rex.reg_mmatch != NULL)
7355 rex.reg_mmatch->rmm_matchcol = subs.norm.orig_start_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007356
Bram Moolenaar6100d022016-10-02 16:51:57 +02007357 if (rex.reg_startpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007358 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007359 rex.reg_startpos[0].lnum = 0;
7360 rex.reg_startpos[0].col = col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007361 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02007362 if (rex.reg_endpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007363 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007364 // pattern has a \ze but it didn't match, use current end
Bram Moolenaar0270f382018-07-17 05:43:58 +02007365 rex.reg_endpos[0].lnum = rex.lnum;
7366 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007367 }
7368 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007369 // Use line number of "\ze".
Bram Moolenaar0270f382018-07-17 05:43:58 +02007370 rex.lnum = rex.reg_endpos[0].lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007371 }
7372 else
7373 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007374 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007375 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007376 rex.reg_startp[i] = subs.norm.list.line[i].start;
7377 rex.reg_endp[i] = subs.norm.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007378 }
7379
Bram Moolenaar6100d022016-10-02 16:51:57 +02007380 if (rex.reg_startp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007381 rex.reg_startp[0] = rex.line + col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007382 if (rex.reg_endp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007383 rex.reg_endp[0] = rex.input;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007384 }
7385
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007386#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007387 // Package any found \z(...\) matches for export. Default is none.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007388 unref_extmatch(re_extmatch_out);
7389 re_extmatch_out = NULL;
7390
7391 if (prog->reghasz == REX_SET)
7392 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007393 cleanup_zsubexpr();
7394 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01007395 if (re_extmatch_out == NULL)
7396 return 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007397 // Loop over \z1, \z2, etc. There is no \z0.
Bram Moolenaar5ad075c2015-11-24 15:18:32 +01007398 for (i = 1; i < subs.synt.in_use; i++)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007399 {
7400 if (REG_MULTI)
7401 {
7402 struct multipos *mpos = &subs.synt.list.multi[i];
7403
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007404 // Only accept single line matches that are valid.
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007405 if (mpos->start_lnum >= 0
7406 && mpos->start_lnum == mpos->end_lnum
7407 && mpos->end_col >= mpos->start_col)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007408 re_extmatch_out->matches[i] =
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007409 vim_strnsave(reg_getline(mpos->start_lnum)
7410 + mpos->start_col,
7411 mpos->end_col - mpos->start_col);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007412 }
7413 else
7414 {
7415 struct linepos *lpos = &subs.synt.list.line[i];
7416
7417 if (lpos->start != NULL && lpos->end != NULL)
7418 re_extmatch_out->matches[i] =
Bram Moolenaar71ccd032020-06-12 22:59:11 +02007419 vim_strnsave(lpos->start, lpos->end - lpos->start);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007420 }
7421 }
7422 }
7423#endif
7424
Bram Moolenaar0270f382018-07-17 05:43:58 +02007425 return 1 + rex.lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007426}
7427
7428/*
7429 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02007430 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007431 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007432 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007433 */
7434 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007435nfa_regexec_both(
7436 char_u *line,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007437 colnr_T startcol, // column to start looking for match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007438 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007439{
7440 nfa_regprog_T *prog;
7441 long retval = 0L;
7442 int i;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007443 colnr_T col = startcol;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007444
7445 if (REG_MULTI)
7446 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007447 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007448 line = reg_getline((linenr_T)0); // relative to the cursor
Bram Moolenaar6100d022016-10-02 16:51:57 +02007449 rex.reg_startpos = rex.reg_mmatch->startpos;
7450 rex.reg_endpos = rex.reg_mmatch->endpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007451 }
7452 else
7453 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007454 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7455 rex.reg_startp = rex.reg_match->startp;
7456 rex.reg_endp = rex.reg_match->endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007457 }
7458
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007459 // Be paranoid...
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007460 if (prog == NULL || line == NULL)
7461 {
RestorerZ68ebcee2023-05-31 17:12:14 +01007462 iemsg(e_null_argument);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007463 goto theend;
7464 }
7465
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007466 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007467 if (prog->regflags & RF_ICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007468 rex.reg_ic = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007469 else if (prog->regflags & RF_NOICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007470 rex.reg_ic = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007471
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007472 // If pattern contains "\Z" overrule value of rex.reg_icombine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007473 if (prog->regflags & RF_ICOMBINE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007474 rex.reg_icombine = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007475
Bram Moolenaar0270f382018-07-17 05:43:58 +02007476 rex.line = line;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007477 rex.lnum = 0; // relative to line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007478
Bram Moolenaar0270f382018-07-17 05:43:58 +02007479 rex.nfa_has_zend = prog->has_zend;
7480 rex.nfa_has_backref = prog->has_backref;
7481 rex.nfa_nsubexpr = prog->nsubexp;
7482 rex.nfa_listid = 1;
7483 rex.nfa_alt_listid = 2;
7484#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007485 nfa_regengine.expr = prog->pattern;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007486#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007487
Bram Moolenaard89616e2013-06-06 18:46:06 +02007488 if (prog->reganch && col > 0)
7489 return 0L;
7490
Bram Moolenaar0270f382018-07-17 05:43:58 +02007491 rex.need_clear_subexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007492#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007493 // Clear the external match subpointers if necessary.
Bram Moolenaar473de612013-06-08 18:19:48 +02007494 if (prog->reghasz == REX_SET)
7495 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007496 rex.nfa_has_zsubexpr = TRUE;
7497 rex.need_clear_zsubexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007498 }
7499 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007500 {
7501 rex.nfa_has_zsubexpr = FALSE;
7502 rex.need_clear_zsubexpr = FALSE;
7503 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007504#endif
7505
Bram Moolenaard89616e2013-06-06 18:46:06 +02007506 if (prog->regstart != NUL)
Bram Moolenaar473de612013-06-08 18:19:48 +02007507 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007508 // Skip ahead until a character we know the match must start with.
7509 // When there is none there is no match.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007510 if (skip_to_start(prog->regstart, &col) == FAIL)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007511 return 0L;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007512
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007513 // If match_text is set it contains the full text that must match.
7514 // Nothing else to try. Doesn't handle combining chars well.
Christian Brabandt22e8e122024-07-30 20:39:18 +02007515 if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine)
Bram Moolenaar01105b32022-11-26 11:47:10 +00007516 {
Bram Moolenaar79336e12022-12-11 14:18:31 +00007517 retval = find_match_text(&col, prog->regstart, prog->match_text);
Bram Moolenaar01105b32022-11-26 11:47:10 +00007518 if (REG_MULTI)
7519 rex.reg_mmatch->rmm_matchcol = col;
7520 else
7521 rex.reg_match->rm_matchcol = col;
7522 return retval;
7523 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007524 }
7525
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007526 // If the start column is past the maximum column: no need to try.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007527 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007528 goto theend;
7529
Bram Moolenaar0270f382018-07-17 05:43:58 +02007530 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7531 // it's accidentally used during execution.
7532 nstate = 0;
7533 for (i = 0; i < prog->nstate; ++i)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007534 {
7535 prog->state[i].id = i;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02007536 prog->state[i].lastlist[0] = 0;
7537 prog->state[i].lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007538 }
7539
Paul Ollis65745772022-06-05 16:55:54 +01007540 retval = nfa_regtry(prog, col, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007541
Bram Moolenaar0270f382018-07-17 05:43:58 +02007542#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007543 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007544#endif
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007545
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007546theend:
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007547 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007548 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007549 // Make sure the end is never before the start. Can happen when \zs and
7550 // \ze are used.
7551 if (REG_MULTI)
7552 {
7553 lpos_T *start = &rex.reg_mmatch->startpos[0];
7554 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007555
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007556 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007557 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007558 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7559 }
7560 else
7561 {
7562 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7563 rex.reg_match->endp[0] = rex.reg_match->startp[0];
Bram Moolenaar01105b32022-11-26 11:47:10 +00007564
7565 // startpos[0] may be set by "\zs", also return the column where
7566 // the whole pattern matched.
7567 rex.reg_match->rm_matchcol = col;
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007568 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007569 }
7570
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007571 return retval;
7572}
7573
7574/*
7575 * Compile a regular expression into internal code for the NFA matcher.
7576 * Returns the program in allocated space. Returns NULL for an error.
7577 */
7578 static regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01007579nfa_regcomp(char_u *expr, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007580{
Bram Moolenaaraae48832013-05-25 21:18:34 +02007581 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02007582 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007583 int *postfix;
7584
7585 if (expr == NULL)
7586 return NULL;
7587
Bram Moolenaar0270f382018-07-17 05:43:58 +02007588#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007589 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007590#endif
Bram Moolenaare0ad3652015-01-27 12:59:55 +01007591 nfa_re_flags = re_flags;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007592
7593 init_class_tab();
7594
7595 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7596 return NULL;
7597
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007598 // Build postfix form of the regexp. Needed to build the NFA
7599 // (and count its size).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007600 postfix = re2post();
7601 if (postfix == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007602 goto fail; // Cascaded (syntax?) error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007603
7604 /*
7605 * In order to build the NFA, we parse the input regexp twice:
7606 * 1. first pass to count size (so we can allocate space)
7607 * 2. second to emit code
7608 */
7609#ifdef ENABLE_LOG
7610 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007611 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007612
7613 if (f != NULL)
7614 {
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02007615 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007616 fclose(f);
7617 }
7618 }
7619#endif
7620
7621 /*
7622 * PASS 1
7623 * Count number of NFA states in "nstate". Do not build the NFA.
7624 */
7625 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007626
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007627 // allocate the regprog with space for the compiled regexp
zeertzjq1b438a82023-02-01 13:11:15 +00007628 prog_size = offsetof(nfa_regprog_T, state) + sizeof(nfa_state_T) * nstate;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02007629 prog = alloc(prog_size);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007630 if (prog == NULL)
7631 goto fail;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007632 state_ptr = prog->state;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007633 prog->re_in_use = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007634
7635 /*
7636 * PASS 2
7637 * Build the NFA
7638 */
7639 prog->start = post2nfa(postfix, post_ptr, FALSE);
7640 if (prog->start == NULL)
7641 goto fail;
7642
7643 prog->regflags = regflags;
7644 prog->engine = &nfa_regengine;
7645 prog->nstate = nstate;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007646 prog->has_zend = rex.nfa_has_zend;
7647 prog->has_backref = rex.nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02007648 prog->nsubexp = regnpar;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007649
Bram Moolenaara2947e22013-06-11 22:44:09 +02007650 nfa_postprocess(prog);
7651
Bram Moolenaard89616e2013-06-06 18:46:06 +02007652 prog->reganch = nfa_get_reganch(prog->start, 0);
7653 prog->regstart = nfa_get_regstart(prog->start, 0);
Bram Moolenaar473de612013-06-08 18:19:48 +02007654 prog->match_text = nfa_get_match_text(prog->start);
7655
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007656#ifdef ENABLE_LOG
7657 nfa_postfix_dump(expr, OK);
7658 nfa_dump(prog);
7659#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007660#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007661 // Remember whether this pattern has any \z specials in it.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007662 prog->reghasz = re_has_z;
7663#endif
Bram Moolenaar473de612013-06-08 18:19:48 +02007664 prog->pattern = vim_strsave(expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02007665#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007666 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007667#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007668
7669out:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007670 VIM_CLEAR(post_start);
7671 post_ptr = post_end = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007672 state_ptr = NULL;
7673 return (regprog_T *)prog;
7674
7675fail:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007676 VIM_CLEAR(prog);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007677#ifdef ENABLE_LOG
7678 nfa_postfix_dump(expr, FAIL);
7679#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007680#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007681 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007682#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007683 goto out;
7684}
7685
Bram Moolenaar473de612013-06-08 18:19:48 +02007686/*
7687 * Free a compiled regexp program, returned by nfa_regcomp().
7688 */
7689 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01007690nfa_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02007691{
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00007692 if (prog == NULL)
7693 return;
7694
7695 vim_free(((nfa_regprog_T *)prog)->match_text);
7696 vim_free(((nfa_regprog_T *)prog)->pattern);
7697 vim_free(prog);
Bram Moolenaar473de612013-06-08 18:19:48 +02007698}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007699
7700/*
7701 * Match a regexp against a string.
7702 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7703 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaar2af78a12014-04-23 19:06:37 +02007704 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007705 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007706 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007707 */
7708 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01007709nfa_regexec_nl(
7710 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007711 char_u *line, // string to match against
7712 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01007713 int line_lbr)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007714{
Bram Moolenaar6100d022016-10-02 16:51:57 +02007715 rex.reg_match = rmp;
7716 rex.reg_mmatch = NULL;
7717 rex.reg_maxline = 0;
7718 rex.reg_line_lbr = line_lbr;
7719 rex.reg_buf = curbuf;
7720 rex.reg_win = NULL;
7721 rex.reg_ic = rmp->rm_ic;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007722 rex.reg_icombine = FALSE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007723 rex.reg_maxcol = 0;
Paul Ollis65745772022-06-05 16:55:54 +01007724 return nfa_regexec_both(line, col, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007725}
7726
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007727
7728/*
7729 * Match a regexp against multiple lines.
7730 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7731 * Uses curbuf for line count and 'iskeyword'.
7732 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007733 * Return <= 0 if there is no match. Return number of lines contained in the
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007734 * match otherwise.
7735 *
7736 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7737 *
7738 * ! Also NOTE : match may actually be in another line. e.g.:
7739 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7740 *
7741 * +-------------------------+
7742 * |a |
7743 * |b |
7744 * |c |
7745 * | |
7746 * +-------------------------+
7747 *
7748 * then nfa_regexec_multi() returns 3. while the original
7749 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7750 *
7751 * FIXME if this behavior is not compatible.
7752 */
7753 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007754nfa_regexec_multi(
7755 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007756 win_T *win, // window in which to search or NULL
7757 buf_T *buf, // buffer in which to search
7758 linenr_T lnum, // nr of line to start looking for match
7759 colnr_T col, // column to start looking for match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007760 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007761{
Bram Moolenaarf4140482020-02-15 23:06:45 +01007762 init_regexec_multi(rmp, win, buf, lnum);
Paul Ollis65745772022-06-05 16:55:54 +01007763 return nfa_regexec_both(NULL, col, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007764}
7765
7766#ifdef DEBUG
7767# undef ENABLE_LOG
7768#endif