blob: c8ac8d42a98640b9282ed1227633172bcbe700cd [file] [log] [blame]
Bram Moolenaaredf3f972016-08-29 22:49:24 +02001/* vi:set ts=8 sts=4 sw=4 noet:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002 *
3 * NFA regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 */
7
/*
 * Logging of the NFA engine.
 *
 * The NFA engine can write four log files:
 * - Error log: Contains the NFA engine's fatal errors.
 * - Dump log:  Contains information about the compiled NFA state machine.
 * - Run log:   Contains information about the matching procedure.
 * - Debug log: Contains detailed information about the matching procedure.
 *              Can be disabled by undefining NFA_REGEXP_DEBUG_LOG.
 * The first one can also be used without debug mode.
 * The last three are enabled when compiled in debug mode and can be disabled
 * individually by commenting them out.
 * The log files can get quite big!
 * To disable all of this when compiling Vim for debugging, undefine DEBUG in
 * regexp.c
 */
#ifdef DEBUG
// File name of the error log.
# define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
// Defined together with the log file names below whenever DEBUG is set.
# define ENABLE_LOG
// File names of the dump, run and debug logs.
# define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
#endif
31
// Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
// This must equal the number of enumerators from NFA_ANY up to and including
// NFA_NUPPER_IC (currently 31), so that the shifted codes
// NFA_FIRST_NL .. NFA_LAST_NL start right after NFA_NUPPER_IC and do not
// overlap any other item.
#define NFA_ADD_NL 31

/*
 * Codes for the items of the postfix form and for the states of the NFA.
 * Numbering starts at -1024 and every following enumerator is one higher.
 * Code in this file treats a state with "c > 0" as a literal character, so
 * the order and number of enumerators matters: do not insert or remove
 * entries without checking NFA_ADD_NL and nfa_classcodes[].
 */
enum
{
    NFA_SPLIT = -1024,              // state with two successors: "out" and
                                    // "out1"
    NFA_MATCH,
    NFA_EMPTY,                      // matches 0-length

    NFA_START_COLL,                 // [abc] start
    NFA_END_COLL,                   // [abc] end
    NFA_START_NEG_COLL,             // [^abc] start
    NFA_END_NEG_COLL,               // [^abc] end (postfix only)
    NFA_RANGE,                      // range of the two previous items
                                    // (postfix only)
    NFA_RANGE_MIN,                  // low end of a range
    NFA_RANGE_MAX,                  // high end of a range

    NFA_CONCAT,                     // concatenate two previous items (postfix
                                    // only)
    NFA_OR,                         // \| (postfix only)
    NFA_STAR,                       // greedy * (postfix only)
    NFA_STAR_NONGREEDY,             // non-greedy * (postfix only)
    NFA_QUEST,                      // greedy \? (postfix only)
    NFA_QUEST_NONGREEDY,            // non-greedy \? (postfix only)

    NFA_BOL,                        // ^ Begin line
    NFA_EOL,                        // $ End line
    NFA_BOW,                        // \< Begin word
    NFA_EOW,                        // \> End word
    NFA_BOF,                        // \%^ Begin file
    NFA_EOF,                        // \%$ End file
    NFA_NEWL,
    NFA_ZSTART,                     // Used for \zs
    NFA_ZEND,                       // Used for \ze
    NFA_NOPEN,                      // Start of subexpression marked with \%(
    NFA_NCLOSE,                     // End of subexpr. marked with \%( ... \)
    NFA_START_INVISIBLE,
    NFA_START_INVISIBLE_FIRST,
    NFA_START_INVISIBLE_NEG,
    NFA_START_INVISIBLE_NEG_FIRST,
    NFA_START_INVISIBLE_BEFORE,
    NFA_START_INVISIBLE_BEFORE_FIRST,
    NFA_START_INVISIBLE_BEFORE_NEG,
    NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
    NFA_START_PATTERN,
    NFA_END_INVISIBLE,
    NFA_END_INVISIBLE_NEG,
    NFA_END_PATTERN,
    NFA_COMPOSING,                  // Next nodes in NFA are part of the
                                    // composing multibyte char
    NFA_END_COMPOSING,              // End of a composing char in the NFA
    NFA_ANY_COMPOSING,              // \%C: Any composing characters.
    NFA_OPT_CHARS,                  // \%[abc]

    // The following are used only in the postfix form, not in the NFA
    NFA_PREV_ATOM_NO_WIDTH,         // Used for \@=
    NFA_PREV_ATOM_NO_WIDTH_NEG,     // Used for \@!
    NFA_PREV_ATOM_JUST_BEFORE,      // Used for \@<=
    NFA_PREV_ATOM_JUST_BEFORE_NEG,  // Used for \@<!
    NFA_PREV_ATOM_LIKE_PATTERN,     // Used for \@>

    NFA_BACKREF1,                   // \1
    NFA_BACKREF2,                   // \2
    NFA_BACKREF3,                   // \3
    NFA_BACKREF4,                   // \4
    NFA_BACKREF5,                   // \5
    NFA_BACKREF6,                   // \6
    NFA_BACKREF7,                   // \7
    NFA_BACKREF8,                   // \8
    NFA_BACKREF9,                   // \9
#ifdef FEAT_SYN_HL
    NFA_ZREF1,                      // \z1
    NFA_ZREF2,                      // \z2
    NFA_ZREF3,                      // \z3
    NFA_ZREF4,                      // \z4
    NFA_ZREF5,                      // \z5
    NFA_ZREF6,                      // \z6
    NFA_ZREF7,                      // \z7
    NFA_ZREF8,                      // \z8
    NFA_ZREF9,                      // \z9
#endif
    NFA_SKIP,                       // Skip characters

    // NFA_MOPEN is expected as the very first state by
    // nfa_get_match_text(); NFA_MOPEN1 - NFA_MOPEN9 presumably belong to the
    // numbered \( \) groups (TODO confirm).
    NFA_MOPEN,
    NFA_MOPEN1,
    NFA_MOPEN2,
    NFA_MOPEN3,
    NFA_MOPEN4,
    NFA_MOPEN5,
    NFA_MOPEN6,
    NFA_MOPEN7,
    NFA_MOPEN8,
    NFA_MOPEN9,

    NFA_MCLOSE,
    NFA_MCLOSE1,
    NFA_MCLOSE2,
    NFA_MCLOSE3,
    NFA_MCLOSE4,
    NFA_MCLOSE5,
    NFA_MCLOSE6,
    NFA_MCLOSE7,
    NFA_MCLOSE8,
    NFA_MCLOSE9,

#ifdef FEAT_SYN_HL
    NFA_ZOPEN,
    NFA_ZOPEN1,
    NFA_ZOPEN2,
    NFA_ZOPEN3,
    NFA_ZOPEN4,
    NFA_ZOPEN5,
    NFA_ZOPEN6,
    NFA_ZOPEN7,
    NFA_ZOPEN8,
    NFA_ZOPEN9,

    NFA_ZCLOSE,
    NFA_ZCLOSE1,
    NFA_ZCLOSE2,
    NFA_ZCLOSE3,
    NFA_ZCLOSE4,
    NFA_ZCLOSE5,
    NFA_ZCLOSE6,
    NFA_ZCLOSE7,
    NFA_ZCLOSE8,
    NFA_ZCLOSE9,
#endif

    // Character classes.  Adding NFA_ADD_NL to any item from NFA_ANY up to
    // NFA_NUPPER_IC gives the variant that also matches a NL; those shifted
    // codes occupy NFA_FIRST_NL .. NFA_LAST_NL below.
    NFA_ANY,                        // Match any one character.
    NFA_IDENT,                      // Match identifier char
    NFA_SIDENT,                     // Match identifier char but no digit
    NFA_KWORD,                      // Match keyword char
    NFA_SKWORD,                     // Match word char but no digit
    NFA_FNAME,                      // Match file name char
    NFA_SFNAME,                     // Match file name char but no digit
    NFA_PRINT,                      // Match printable char
    NFA_SPRINT,                     // Match printable char but no digit
    NFA_WHITE,                      // Match whitespace char
    NFA_NWHITE,                     // Match non-whitespace char
    NFA_DIGIT,                      // Match digit char
    NFA_NDIGIT,                     // Match non-digit char
    NFA_HEX,                        // Match hex char
    NFA_NHEX,                       // Match non-hex char
    NFA_OCTAL,                      // Match octal char
    NFA_NOCTAL,                     // Match non-octal char
    NFA_WORD,                       // Match word char
    NFA_NWORD,                      // Match non-word char
    NFA_HEAD,                       // Match head char
    NFA_NHEAD,                      // Match non-head char
    NFA_ALPHA,                      // Match alpha char
    NFA_NALPHA,                     // Match non-alpha char
    NFA_LOWER,                      // Match lowercase char
    NFA_NLOWER,                     // Match non-lowercase char
    NFA_UPPER,                      // Match uppercase char
    NFA_NUPPER,                     // Match non-uppercase char
    NFA_LOWER_IC,                   // Match [a-z]
    NFA_NLOWER_IC,                  // Match [^a-z]
    NFA_UPPER_IC,                   // Match [A-Z]
    NFA_NUPPER_IC,                  // Match [^A-Z]

    // First and last code of the "class plus NL" range.  The enumerators
    // that follow continue counting from NFA_LAST_NL + 1.
    NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
    NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

    NFA_CURSOR,                     // Match cursor pos
    NFA_LNUM,                       // Match line number
    NFA_LNUM_GT,                    // Match > line number
    NFA_LNUM_LT,                    // Match < line number
    NFA_COL,                        // Match cursor column
    NFA_COL_GT,                     // Match > cursor column
    NFA_COL_LT,                     // Match < cursor column
    NFA_VCOL,                       // Match cursor virtual column
    NFA_VCOL_GT,                    // Match > cursor virtual column
    NFA_VCOL_LT,                    // Match < cursor virtual column
    NFA_MARK,                       // Match mark
    NFA_MARK_GT,                    // Match > mark
    NFA_MARK_LT,                    // Match < mark
    NFA_VISUAL,                     // Match Visual area

    // Character classes [:alnum:] etc
    NFA_CLASS_ALNUM,
    NFA_CLASS_ALPHA,
    NFA_CLASS_BLANK,
    NFA_CLASS_CNTRL,
    NFA_CLASS_DIGIT,
    NFA_CLASS_GRAPH,
    NFA_CLASS_LOWER,
    NFA_CLASS_PRINT,
    NFA_CLASS_PUNCT,
    NFA_CLASS_SPACE,
    NFA_CLASS_UPPER,
    NFA_CLASS_XDIGIT,
    NFA_CLASS_TAB,
    NFA_CLASS_RETURN,
    NFA_CLASS_BACKSPACE,
    NFA_CLASS_ESCAPE,
    NFA_CLASS_IDENT,
    NFA_CLASS_KEYWORD,
    NFA_CLASS_FNAME
};
234
// Keep in sync with classchars.
// Lists the 27 class codes NFA_ANY .. NFA_NUPPER in the order in which they
// appear in the enum above.  The table presumably is indexed by the position
// of a class character in "classchars" (defined elsewhere) - verify there
// before changing the order.
static int nfa_classcodes[] = {
    NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
    NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
    NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
    NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
    NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
    NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
    NFA_UPPER, NFA_NUPPER
};
245
// Variables only used in nfa_regcomp() and descendants.
static int nfa_re_flags; // re_flags passed to nfa_regcomp()
static int *post_start; // holds the postfix form of r.e.
static int *post_end; // one past the last allocated item of post_start
static int *post_ptr; // next free item in post_start, advanced by EMIT()

// Set when the pattern should use the NFA engine.
// E.g. [[:upper:]] only allows 8bit characters for BT engine,
// while NFA engine handles multibyte characters correctly.
static int wants_nfa;

static int nstate; // Number of states in the NFA.
static int istate; // Index in the state vector, used in alloc_state()

// If not NULL match must end at this position
static save_se_T *nfa_endp = NULL;

// 0 for first call to nfa_regmatch(), 1 for recursive call.
static int nfa_ll_index = 0;

// Forward declarations of functions that are used before they are defined.
static int realloc_post_list(void);
static int nfa_reg(int paren);
#ifdef DEBUG
static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
#endif
static int match_follows(nfa_state_T *startstate, int depth);
static int failure_chance(nfa_state_T *state, int depth);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200273
// Helper used while parsing in re2post() ... regatom(): append item "c" to
// the postfix list.  When the list is full it is enlarged first with
// realloc_post_list(); if that fails the macro makes the *calling* function
// return FAIL, thus it can only be used in functions returning OK/FAIL.
#define EMIT(c) do { \
        if (post_ptr >= post_end) \
        { \
            if (realloc_post_list() == FAIL) \
                return FAIL; \
        } \
        *post_ptr++ = (c); \
    } while (0)
280
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200281/*
282 * Initialize internal variables before NFA compilation.
283 * Return OK on success, FAIL otherwise.
284 */
285 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100286nfa_regcomp_start(
287 char_u *expr,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100288 int re_flags) // see vim_regcomp()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200289{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200290 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200291 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200292
293 nstate = 0;
294 istate = 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295 // A reasonable estimation for maximum size
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200296 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200297
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100298 // Some items blow up in size, such as [A-z]. Add more space for that.
299 // When it is still not enough realloc_post_list() will be used.
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200300 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200301
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302 // Size for postfix representation of expr.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200303 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200304
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200305 post_start = alloc(postfix_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200306 if (post_start == NULL)
307 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200308 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200309 post_end = post_start + nstate_max;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100310 wants_nfa = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +0200311 rex.nfa_has_zend = FALSE;
312 rex.nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200313
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100314 // shared with BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200315 regcomp_start(expr, re_flags);
316
317 return OK;
318}
319
320/*
Bram Moolenaard89616e2013-06-06 18:46:06 +0200321 * Figure out if the NFA state list starts with an anchor, must match at start
322 * of the line.
323 */
324 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100325nfa_get_reganch(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200326{
327 nfa_state_T *p = start;
328
329 if (depth > 4)
330 return 0;
331
332 while (p != NULL)
333 {
334 switch (p->c)
335 {
336 case NFA_BOL:
337 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100338 return 1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200339
340 case NFA_ZSTART:
341 case NFA_ZEND:
342 case NFA_CURSOR:
343 case NFA_VISUAL:
344
345 case NFA_MOPEN:
346 case NFA_MOPEN1:
347 case NFA_MOPEN2:
348 case NFA_MOPEN3:
349 case NFA_MOPEN4:
350 case NFA_MOPEN5:
351 case NFA_MOPEN6:
352 case NFA_MOPEN7:
353 case NFA_MOPEN8:
354 case NFA_MOPEN9:
355 case NFA_NOPEN:
356#ifdef FEAT_SYN_HL
357 case NFA_ZOPEN:
358 case NFA_ZOPEN1:
359 case NFA_ZOPEN2:
360 case NFA_ZOPEN3:
361 case NFA_ZOPEN4:
362 case NFA_ZOPEN5:
363 case NFA_ZOPEN6:
364 case NFA_ZOPEN7:
365 case NFA_ZOPEN8:
366 case NFA_ZOPEN9:
367#endif
368 p = p->out;
369 break;
370
371 case NFA_SPLIT:
372 return nfa_get_reganch(p->out, depth + 1)
373 && nfa_get_reganch(p->out1, depth + 1);
374
375 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100376 return 0; // noooo
Bram Moolenaard89616e2013-06-06 18:46:06 +0200377 }
378 }
379 return 0;
380}
381
382/*
383 * Figure out if the NFA state list starts with a character which must match
384 * at start of the match.
385 */
386 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100387nfa_get_regstart(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200388{
389 nfa_state_T *p = start;
390
391 if (depth > 4)
392 return 0;
393
394 while (p != NULL)
395 {
396 switch (p->c)
397 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100398 // all kinds of zero-width matches
Bram Moolenaard89616e2013-06-06 18:46:06 +0200399 case NFA_BOL:
400 case NFA_BOF:
401 case NFA_BOW:
402 case NFA_EOW:
403 case NFA_ZSTART:
404 case NFA_ZEND:
405 case NFA_CURSOR:
406 case NFA_VISUAL:
407 case NFA_LNUM:
408 case NFA_LNUM_GT:
409 case NFA_LNUM_LT:
410 case NFA_COL:
411 case NFA_COL_GT:
412 case NFA_COL_LT:
413 case NFA_VCOL:
414 case NFA_VCOL_GT:
415 case NFA_VCOL_LT:
416 case NFA_MARK:
417 case NFA_MARK_GT:
418 case NFA_MARK_LT:
419
420 case NFA_MOPEN:
421 case NFA_MOPEN1:
422 case NFA_MOPEN2:
423 case NFA_MOPEN3:
424 case NFA_MOPEN4:
425 case NFA_MOPEN5:
426 case NFA_MOPEN6:
427 case NFA_MOPEN7:
428 case NFA_MOPEN8:
429 case NFA_MOPEN9:
430 case NFA_NOPEN:
431#ifdef FEAT_SYN_HL
432 case NFA_ZOPEN:
433 case NFA_ZOPEN1:
434 case NFA_ZOPEN2:
435 case NFA_ZOPEN3:
436 case NFA_ZOPEN4:
437 case NFA_ZOPEN5:
438 case NFA_ZOPEN6:
439 case NFA_ZOPEN7:
440 case NFA_ZOPEN8:
441 case NFA_ZOPEN9:
442#endif
443 p = p->out;
444 break;
445
446 case NFA_SPLIT:
447 {
448 int c1 = nfa_get_regstart(p->out, depth + 1);
449 int c2 = nfa_get_regstart(p->out1, depth + 1);
450
451 if (c1 == c2)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100452 return c1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200453 return 0;
454 }
455
456 default:
Bram Moolenaardecd9542013-06-07 16:31:50 +0200457 if (p->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100458 return p->c; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200459 return 0;
460 }
461 }
462 return 0;
463}
464
465/*
Bram Moolenaar473de612013-06-08 18:19:48 +0200466 * Figure out if the NFA state list contains just literal text and nothing
Bram Moolenaare7766ee2013-06-08 22:30:03 +0200467 * else. If so return a string in allocated memory with what must match after
468 * regstart. Otherwise return NULL.
Bram Moolenaar473de612013-06-08 18:19:48 +0200469 */
470 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100471nfa_get_match_text(nfa_state_T *start)
Bram Moolenaar473de612013-06-08 18:19:48 +0200472{
473 nfa_state_T *p = start;
474 int len = 0;
475 char_u *ret;
476 char_u *s;
477
478 if (p->c != NFA_MOPEN)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100479 return NULL; // just in case
Bram Moolenaar473de612013-06-08 18:19:48 +0200480 p = p->out;
481 while (p->c > 0)
482 {
483 len += MB_CHAR2LEN(p->c);
484 p = p->out;
485 }
486 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
487 return NULL;
488
489 ret = alloc(len);
490 if (ret != NULL)
491 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100492 p = start->out->out; // skip first char, it goes into regstart
Bram Moolenaar473de612013-06-08 18:19:48 +0200493 s = ret;
494 while (p->c > 0)
495 {
Bram Moolenaar473de612013-06-08 18:19:48 +0200496 if (has_mbyte)
497 s += (*mb_char2bytes)(p->c, s);
498 else
Bram Moolenaar473de612013-06-08 18:19:48 +0200499 *s++ = p->c;
500 p = p->out;
501 }
502 *s = NUL;
503 }
504 return ret;
505}
506
507/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200508 * Allocate more space for post_start. Called when
509 * running above the estimated number of states.
510 */
511 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100512realloc_post_list(void)
Bram Moolenaar16299b52013-05-30 18:45:23 +0200513{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200514 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100515 int new_max;
Bram Moolenaar16299b52013-05-30 18:45:23 +0200516 int *new_start;
517 int *old_start;
518
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100519 // For weird patterns the number of states can be very high. Increasing by
520 // 50% seems a reasonable compromise between memory use and speed.
521 new_max = nstate_max * 3 / 2;
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200522 new_start = ALLOC_MULT(int, new_max);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200523 if (new_start == NULL)
524 return FAIL;
525 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
Bram Moolenaar16299b52013-05-30 18:45:23 +0200526 old_start = post_start;
527 post_start = new_start;
528 post_ptr = new_start + (post_ptr - old_start);
529 post_end = post_start + new_max;
530 vim_free(old_start);
531 return OK;
532}
533
534/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200535 * Search between "start" and "end" and try to recognize a
536 * character class in expanded form. For example [0-9].
537 * On success, return the id the character class to be emitted.
538 * On failure, return 0 (=FAIL)
539 * Start points to the first char of the range, while end should point
540 * to the closing brace.
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200541 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
542 * need to be interpreted as [a-zA-Z].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200543 */
544 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100545nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200546{
Bram Moolenaarf8115092013-06-04 17:47:05 +0200547# define CLASS_not 0x80
548# define CLASS_af 0x40
549# define CLASS_AF 0x20
550# define CLASS_az 0x10
551# define CLASS_AZ 0x08
552# define CLASS_o7 0x04
553# define CLASS_o9 0x02
554# define CLASS_underscore 0x01
555
556 int newl = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200557 char_u *p;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200558 int config = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200559
560 if (extra_newl == TRUE)
561 newl = TRUE;
562
563 if (*end != ']')
564 return FAIL;
565 p = start;
566 if (*p == '^')
567 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200568 config |= CLASS_not;
Bram Moolenaar01d89dd2013-06-03 19:41:06 +0200569 p++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200570 }
571
572 while (p < end)
573 {
574 if (p + 2 < end && *(p + 1) == '-')
575 {
576 switch (*p)
577 {
578 case '0':
579 if (*(p + 2) == '9')
580 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200581 config |= CLASS_o9;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200582 break;
583 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200584 if (*(p + 2) == '7')
585 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200586 config |= CLASS_o7;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200587 break;
588 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200589 return FAIL;
590
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200591 case 'a':
592 if (*(p + 2) == 'z')
593 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200594 config |= CLASS_az;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200595 break;
596 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200597 if (*(p + 2) == 'f')
598 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200599 config |= CLASS_af;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200600 break;
601 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200602 return FAIL;
603
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200604 case 'A':
605 if (*(p + 2) == 'Z')
606 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200607 config |= CLASS_AZ;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200608 break;
609 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200610 if (*(p + 2) == 'F')
611 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200612 config |= CLASS_AF;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200613 break;
614 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200615 return FAIL;
616
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200617 default:
618 return FAIL;
619 }
620 p += 3;
621 }
622 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
623 {
624 newl = TRUE;
625 p += 2;
626 }
627 else if (*p == '_')
628 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200629 config |= CLASS_underscore;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200630 p ++;
631 }
632 else if (*p == '\n')
633 {
634 newl = TRUE;
635 p ++;
636 }
637 else
638 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100639 } // while (p < end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200640
641 if (p != end)
642 return FAIL;
643
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200644 if (newl == TRUE)
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200645 extra_newl = NFA_ADD_NL;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200646
647 switch (config)
648 {
649 case CLASS_o9:
650 return extra_newl + NFA_DIGIT;
651 case CLASS_not | CLASS_o9:
652 return extra_newl + NFA_NDIGIT;
653 case CLASS_af | CLASS_AF | CLASS_o9:
654 return extra_newl + NFA_HEX;
655 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
656 return extra_newl + NFA_NHEX;
657 case CLASS_o7:
658 return extra_newl + NFA_OCTAL;
659 case CLASS_not | CLASS_o7:
660 return extra_newl + NFA_NOCTAL;
661 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
662 return extra_newl + NFA_WORD;
663 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
664 return extra_newl + NFA_NWORD;
665 case CLASS_az | CLASS_AZ | CLASS_underscore:
666 return extra_newl + NFA_HEAD;
667 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
668 return extra_newl + NFA_NHEAD;
669 case CLASS_az | CLASS_AZ:
670 return extra_newl + NFA_ALPHA;
671 case CLASS_not | CLASS_az | CLASS_AZ:
672 return extra_newl + NFA_NALPHA;
673 case CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200674 return extra_newl + NFA_LOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200675 case CLASS_not | CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200676 return extra_newl + NFA_NLOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200677 case CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200678 return extra_newl + NFA_UPPER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200679 case CLASS_not | CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200680 return extra_newl + NFA_NUPPER_IC;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200681 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200682 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200683}
684
685/*
686 * Produce the bytes for equivalence class "c".
687 * Currently only handles latin1, latin9 and utf-8.
688 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
689 * equivalent to 'a OR b OR c'
690 *
691 * NOTE! When changing this function, also update reg_equi_class()
692 */
693 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100694nfa_emit_equi_class(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695{
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200696#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200697
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200698 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
699 || STRCMP(p_enc, "iso-8859-15") == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200700 {
Bram Moolenaar424bcae2022-01-31 14:59:41 +0000701#define A_grave 0xc0
702#define A_acute 0xc1
703#define A_circumflex 0xc2
704#define A_virguilla 0xc3
705#define A_diaeresis 0xc4
706#define A_ring 0xc5
707#define C_cedilla 0xc7
708#define E_grave 0xc8
709#define E_acute 0xc9
710#define E_circumflex 0xca
711#define E_diaeresis 0xcb
712#define I_grave 0xcc
713#define I_acute 0xcd
714#define I_circumflex 0xce
715#define I_diaeresis 0xcf
716#define N_virguilla 0xd1
717#define O_grave 0xd2
718#define O_acute 0xd3
719#define O_circumflex 0xd4
720#define O_virguilla 0xd5
721#define O_diaeresis 0xd6
722#define O_slash 0xd8
723#define U_grave 0xd9
724#define U_acute 0xda
725#define U_circumflex 0xdb
726#define U_diaeresis 0xdc
727#define Y_acute 0xdd
728#define a_grave 0xe0
729#define a_acute 0xe1
730#define a_circumflex 0xe2
731#define a_virguilla 0xe3
732#define a_diaeresis 0xe4
733#define a_ring 0xe5
734#define c_cedilla 0xe7
735#define e_grave 0xe8
736#define e_acute 0xe9
737#define e_circumflex 0xea
738#define e_diaeresis 0xeb
739#define i_grave 0xec
740#define i_acute 0xed
741#define i_circumflex 0xee
742#define i_diaeresis 0xef
743#define n_virguilla 0xf1
744#define o_grave 0xf2
745#define o_acute 0xf3
746#define o_circumflex 0xf4
747#define o_virguilla 0xf5
748#define o_diaeresis 0xf6
749#define o_slash 0xf8
750#define u_grave 0xf9
751#define u_acute 0xfa
752#define u_circumflex 0xfb
753#define u_diaeresis 0xfc
754#define y_acute 0xfd
755#define y_diaeresis 0xff
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200756 switch (c)
757 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200758 case 'A': case A_grave: case A_acute: case A_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200759 case A_virguilla: case A_diaeresis: case A_ring:
760 case 0x100: case 0x102: case 0x104: case 0x1cd:
761 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
762 case 0x202: case 0x226: case 0x23a: case 0x1e00:
763 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
764 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
765 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
766 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
767 EMIT2(A_circumflex) EMIT2(A_virguilla)
768 EMIT2(A_diaeresis) EMIT2(A_ring)
769 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
770 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
771 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
772 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
773 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
774 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
775 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
776 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200777 return OK;
778
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200779 case 'B': case 0x181: case 0x243: case 0x1e02:
780 case 0x1e04: case 0x1e06:
781 EMIT2('B')
782 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
783 EMIT2(0x1e04) EMIT2(0x1e06)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200784 return OK;
785
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200786 case 'C': case C_cedilla: case 0x106: case 0x108:
787 case 0x10a: case 0x10c: case 0x187: case 0x23b:
788 case 0x1e08: case 0xa792:
789 EMIT2('C') EMIT2(C_cedilla)
790 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
791 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
792 EMIT2(0x1e08) EMIT2(0xa792)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200793 return OK;
794
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200795 case 'D': case 0x10e: case 0x110: case 0x18a:
796 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
797 case 0x1e12:
798 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
799 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
800 EMIT2(0x1e10) EMIT2(0x1e12)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200801 return OK;
802
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200803 case 'E': case E_grave: case E_acute: case E_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200804 case E_diaeresis: case 0x112: case 0x114: case 0x116:
805 case 0x118: case 0x11a: case 0x204: case 0x206:
806 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
807 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
808 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
809 case 0x1ec2: case 0x1ec4: case 0x1ec6:
810 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
811 EMIT2(E_circumflex) EMIT2(E_diaeresis)
812 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
813 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
814 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
815 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
816 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
817 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
818 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
819 EMIT2(0x1ec6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200820 return OK;
821
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200822 case 'F': case 0x191: case 0x1e1e: case 0xa798:
823 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200824 return OK;
825
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200826 case 'G': case 0x11c: case 0x11e: case 0x120:
827 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
828 case 0x1f4: case 0x1e20: case 0xa7a0:
829 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
830 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
831 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
832 EMIT2(0xa7a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200833 return OK;
834
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200835 case 'H': case 0x124: case 0x126: case 0x21e:
836 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
837 case 0x1e2a: case 0x2c67:
838 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
839 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
840 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200841 return OK;
842
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200843 case 'I': case I_grave: case I_acute: case I_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200844 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
845 case 0x12e: case 0x130: case 0x197: case 0x1cf:
846 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
847 case 0x1ec8: case 0x1eca:
848 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
849 EMIT2(I_circumflex) EMIT2(I_diaeresis)
850 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
851 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
852 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
853 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
854 EMIT2(0x1eca)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200855 return OK;
856
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200857 case 'J': case 0x134: case 0x248:
858 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200859 return OK;
860
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200861 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
862 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
863 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
864 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
865 EMIT2(0x2c69) EMIT2(0xa740)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200866 return OK;
867
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200868 case 'L': case 0x139: case 0x13b: case 0x13d:
869 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
870 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
871 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
872 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
873 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
874 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200875 return OK;
876
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200877 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
878 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
879 EMIT2(0x1e42)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200880 return OK;
881
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200882 case 'N': case N_virguilla:
883 case 0x143: case 0x145: case 0x147: case 0x1f8:
884 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
885 case 0xa7a4:
886 EMIT2('N') EMIT2(N_virguilla)
887 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
888 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
889 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200890 return OK;
891
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200892 case 'O': case O_grave: case O_acute: case O_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200893 case O_virguilla: case O_diaeresis: case O_slash:
894 case 0x14c: case 0x14e: case 0x150: case 0x19f:
895 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
896 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
897 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
898 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
899 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
900 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
901 case 0x1ede: case 0x1ee0: case 0x1ee2:
902 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
903 EMIT2(O_circumflex) EMIT2(O_virguilla)
904 EMIT2(O_diaeresis) EMIT2(O_slash)
905 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
906 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
907 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
908 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
909 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
910 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
911 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
912 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
913 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
914 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
915 EMIT2(0x1ee2)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200916 return OK;
917
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200918 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
919 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
920 EMIT2(0x2c63)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200921 return OK;
922
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200923 case 'Q': case 0x24a:
924 EMIT2('Q') EMIT2(0x24a)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200925 return OK;
926
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200927 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
928 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
929 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
930 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
931 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
932 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
933 EMIT2(0xa7a6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200934 return OK;
935
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200936 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
937 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
938 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
939 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
940 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
941 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
942 EMIT2(0xa7a8)
943 return OK;
944
945 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
946 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
947 case 0x1e6e: case 0x1e70:
948 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
949 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
950 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200951 return OK;
952
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200953 case 'U': case U_grave: case U_acute: case U_diaeresis:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200954 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
955 case 0x16e: case 0x170: case 0x172: case 0x1af:
956 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
957 case 0x1db: case 0x214: case 0x216: case 0x244:
958 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
959 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
960 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
961 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
962 EMIT2(U_diaeresis) EMIT2(U_circumflex)
963 EMIT2(0x168) EMIT2(0x16a)
964 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
965 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
966 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
967 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
968 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
969 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
970 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
971 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
972 EMIT2(0x1ef0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200973 return OK;
974
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200975 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
976 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200977 return OK;
978
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200979 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
980 case 0x1e86: case 0x1e88:
981 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
982 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200983 return OK;
984
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 case 'X': case 0x1e8a: case 0x1e8c:
986 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200987 return OK;
988
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200989 case 'Y': case Y_acute: case 0x176: case 0x178:
990 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
991 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
992 EMIT2('Y') EMIT2(Y_acute)
993 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
994 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
995 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
996 EMIT2(0x1ef8)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200997 return OK;
998
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200999 case 'Z': case 0x179: case 0x17b: case 0x17d:
1000 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1001 case 0x2c6b:
1002 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1003 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1004 EMIT2(0x1e94) EMIT2(0x2c6b)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001005 return OK;
1006
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001007 case 'a': case a_grave: case a_acute: case a_circumflex:
1008 case a_virguilla: case a_diaeresis: case a_ring:
1009 case 0x101: case 0x103: case 0x105: case 0x1ce:
1010 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1011 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1012 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1013 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1014 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1015 case 0x1eb7: case 0x2c65:
1016 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1017 EMIT2(a_circumflex) EMIT2(a_virguilla)
1018 EMIT2(a_diaeresis) EMIT2(a_ring)
1019 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1020 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1021 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1022 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1023 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1024 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1025 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1026 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1027 EMIT2(0x1eb7) EMIT2(0x2c65)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001028 return OK;
1029
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001030 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1031 case 0x1e03: case 0x1e05: case 0x1e07:
1032 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1033 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001034 return OK;
1035
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001036 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1037 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1038 case 0xa794:
1039 EMIT2('c') EMIT2(c_cedilla)
1040 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1041 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1042 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001043 return OK;
1044
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001045 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1046 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1047 case 0x1e11: case 0x1e13:
1048 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1049 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1050 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1051 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001052 return OK;
1053
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001054 case 'e': case e_grave: case e_acute: case e_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001055 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1056 case 0x119: case 0x11b: case 0x205: case 0x207:
1057 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1058 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1059 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1060 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1061 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1062 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1063 EMIT2(0x113) EMIT2(0x115)
1064 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1065 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1066 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1067 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1068 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1069 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1070 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001071 return OK;
1072
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001073 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1074 case 0x1e1f: case 0xa799:
1075 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1076 EMIT2(0x1e1f) EMIT2(0xa799)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001077 return OK;
1078
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001079 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1080 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1081 case 0x1e21: case 0xa7a1:
1082 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1083 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1084 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1085 EMIT2(0x1e21) EMIT2(0xa7a1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001086 return OK;
1087
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001088 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1089 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1090 case 0x1e96: case 0x2c68: case 0xa795:
1091 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1092 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1093 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1094 EMIT2(0x2c68) EMIT2(0xa795)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001095 return OK;
1096
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001097 case 'i': case i_grave: case i_acute: case i_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001098 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1099 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1100 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1101 case 0x1ec9: case 0x1ecb:
1102 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1103 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1104 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1105 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1106 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1107 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1108 EMIT2(0x1ecb) EMIT2(0x1ecb)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001109 return OK;
1110
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001111 case 'j': case 0x135: case 0x1f0: case 0x249:
1112 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001113 return OK;
1114
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001115 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1116 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1117 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1118 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1119 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001120 return OK;
1121
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001122 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1123 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1124 case 0x1e3d: case 0x2c61:
1125 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1126 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1127 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1128 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001129 return OK;
1130
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001131 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1132 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1133 EMIT2(0x1e41) EMIT2(0x1e43)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001134 return OK;
1135
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001136 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1137 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1138 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1139 EMIT2('n') EMIT2(n_virguilla)
1140 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1141 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1142 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1143 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001144 return OK;
1145
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001146 case 'o': case o_grave: case o_acute: case o_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001147 case o_virguilla: case o_diaeresis: case o_slash:
1148 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1149 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1150 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1151 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1152 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1153 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1154 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1155 case 0x1edf: case 0x1ee1: case 0x1ee3:
1156 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1157 EMIT2(o_circumflex) EMIT2(o_virguilla)
1158 EMIT2(o_diaeresis) EMIT2(o_slash)
1159 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1160 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1161 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1162 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1163 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1164 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1165 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1166 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1167 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1168 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1169 EMIT2(0x1ee3)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001170 return OK;
1171
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001172 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1173 case 0x1e55: case 0x1e57:
1174 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1175 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001176 return OK;
1177
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001178 case 'q': case 0x24b: case 0x2a0:
1179 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001180 return OK;
1181
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001182 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1183 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1184 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1185 case 0xa7a7:
1186 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1187 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1188 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1189 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001190 return OK;
1191
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001192 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1193 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1194 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1195 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1196 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1197 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1198 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1199 return OK;
1200
1201 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1202 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1203 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1204 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1205 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1206 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1207 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001208 return OK;
1209
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001210 case 'u': case u_grave: case u_acute: case u_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001211 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1212 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1213 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1214 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1215 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1216 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1217 case 0x1eed: case 0x1eef: case 0x1ef1:
1218 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1219 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1220 EMIT2(0x169) EMIT2(0x16b)
1221 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1222 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1223 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1224 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1225 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1226 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1227 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1228 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1229 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001230 return OK;
1231
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001232 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1233 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1234 EMIT2(0x1e7f)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001235 return OK;
1236
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001237 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1238 case 0x1e87: case 0x1e89: case 0x1e98:
1239 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1240 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001241 return OK;
1242
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001243 case 'x': case 0x1e8b: case 0x1e8d:
1244 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001245 return OK;
1246
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001247 case 'y': case y_acute: case y_diaeresis: case 0x177:
1248 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1249 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1250 case 0x1ef9:
1251 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1252 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1253 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1254 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001255 return OK;
1256
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001257 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1258 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1259 case 0x1e95: case 0x2c6c:
1260 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1261 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1262 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001263 return OK;
1264
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001265 // default: character itself
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001266 }
1267 }
1268
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001269 EMIT2(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001270 return OK;
1271#undef EMIT2
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001272#undef EMIT2
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001273}
1274
1275/*
1276 * Code to parse a regular expression.
1277 *
1278 * We try to reuse parsing functions in regexp.c to
1279 * minimize surprise and keep the syntax consistent.
1280 */
1281
1282/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001283 * Parse the lowest level.
1284 *
1285 * An atom can be one of a long list of items. Many atoms match one character
1286 * in the text. It is often an ordinary character or a character class.
1287 * Parentheses can be used to make a pattern into an atom. The "\z(\)"
1288 * construct is only for syntax highlighting.
1289 *
1290 * atom ::= ordinary-atom
1291 * or \( pattern \)
1292 * or \%( pattern \)
1293 * or \z( pattern \)
1294 */
1295 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001296nfa_regatom(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001297{
1298 int c;
1299 int charclass;
1300 int equiclass;
1301 int collclass;
1302 int got_coll_char;
1303 char_u *p;
1304 char_u *endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001305 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001306 int extra = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001307 int emit_range;
1308 int negated;
1309 int result;
1310 int startc = -1;
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001311 int save_prev_at_start = prev_at_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001312
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001313 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001314 switch (c)
1315 {
Bram Moolenaar47196582013-05-25 22:04:23 +02001316 case NUL:
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001317 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar47196582013-05-25 22:04:23 +02001318
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001319 case Magic('^'):
1320 EMIT(NFA_BOL);
1321 break;
1322
1323 case Magic('$'):
1324 EMIT(NFA_EOL);
1325#if defined(FEAT_SYN_HL) || defined(PROTO)
1326 had_eol = TRUE;
1327#endif
1328 break;
1329
1330 case Magic('<'):
1331 EMIT(NFA_BOW);
1332 break;
1333
1334 case Magic('>'):
1335 EMIT(NFA_EOW);
1336 break;
1337
1338 case Magic('_'):
1339 c = no_Magic(getchr());
Bram Moolenaar174a8482013-11-28 14:20:17 +01001340 if (c == NUL)
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001341 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar174a8482013-11-28 14:20:17 +01001342
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001343 if (c == '^') // "\_^" is start-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001344 {
1345 EMIT(NFA_BOL);
1346 break;
1347 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001348 if (c == '$') // "\_$" is end-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001349 {
1350 EMIT(NFA_EOL);
1351#if defined(FEAT_SYN_HL) || defined(PROTO)
1352 had_eol = TRUE;
1353#endif
1354 break;
1355 }
1356
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001357 extra = NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001358
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001359 // "\_[" is collection plus newline
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001360 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001361 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001362
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001363 // "\_x" is character class plus newline
1364 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001365
1366 /*
1367 * Character classes.
1368 */
1369 case Magic('.'):
1370 case Magic('i'):
1371 case Magic('I'):
1372 case Magic('k'):
1373 case Magic('K'):
1374 case Magic('f'):
1375 case Magic('F'):
1376 case Magic('p'):
1377 case Magic('P'):
1378 case Magic('s'):
1379 case Magic('S'):
1380 case Magic('d'):
1381 case Magic('D'):
1382 case Magic('x'):
1383 case Magic('X'):
1384 case Magic('o'):
1385 case Magic('O'):
1386 case Magic('w'):
1387 case Magic('W'):
1388 case Magic('h'):
1389 case Magic('H'):
1390 case Magic('a'):
1391 case Magic('A'):
1392 case Magic('l'):
1393 case Magic('L'):
1394 case Magic('u'):
1395 case Magic('U'):
1396 p = vim_strchr(classchars, no_Magic(c));
1397 if (p == NULL)
1398 {
Bram Moolenaar174a8482013-11-28 14:20:17 +01001399 if (extra == NFA_ADD_NL)
1400 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001401 semsg(_(e_nfa_regexp_invalid_character_class_nr), c);
Bram Moolenaar174a8482013-11-28 14:20:17 +01001402 rc_did_emsg = TRUE;
1403 return FAIL;
1404 }
Bram Moolenaarb5443cc2019-01-15 20:19:40 +01001405 siemsg("INTERNAL: Unknown character class char: %d", c);
Bram Moolenaar5714b802013-05-28 22:03:20 +02001406 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001407 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001408
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001409 // When '.' is followed by a composing char ignore the dot, so that
1410 // the composing char is matched here.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001411 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1412 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001413 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001414 c = getchr();
1415 goto nfa_do_multibyte;
1416 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001417 EMIT(nfa_classcodes[p - classchars]);
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001418 if (extra == NFA_ADD_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001419 {
1420 EMIT(NFA_NEWL);
1421 EMIT(NFA_OR);
1422 regflags |= RF_HASNL;
1423 }
1424 break;
1425
1426 case Magic('n'):
1427 if (reg_string)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001428 // In a string "\n" matches a newline character.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001429 EMIT(NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001430 else
1431 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001432 // In buffer text "\n" matches the end of a line.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001433 EMIT(NFA_NEWL);
1434 regflags |= RF_HASNL;
1435 }
1436 break;
1437
1438 case Magic('('):
1439 if (nfa_reg(REG_PAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001440 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001441 break;
1442
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001443 case Magic('|'):
1444 case Magic('&'):
1445 case Magic(')'):
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001446 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001447 return FAIL;
1448
1449 case Magic('='):
1450 case Magic('?'):
1451 case Magic('+'):
1452 case Magic('@'):
1453 case Magic('*'):
1454 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001455 // these should follow an atom, not form an atom
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001456 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001457 return FAIL;
1458
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001459 case Magic('~'):
1460 {
1461 char_u *lp;
1462
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001463 // Previous substitute pattern.
1464 // Generated as "\%(pattern\)".
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001465 if (reg_prev_sub == NULL)
1466 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001467 emsg(_(e_no_previous_substitute_regular_expression));
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001468 return FAIL;
1469 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001470 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001471 {
1472 EMIT(PTR2CHAR(lp));
1473 if (lp != reg_prev_sub)
1474 EMIT(NFA_CONCAT);
1475 }
1476 EMIT(NFA_NOPEN);
1477 break;
1478 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001479
Bram Moolenaar428e9872013-05-30 17:05:39 +02001480 case Magic('1'):
1481 case Magic('2'):
1482 case Magic('3'):
1483 case Magic('4'):
1484 case Magic('5'):
1485 case Magic('6'):
1486 case Magic('7'):
1487 case Magic('8'):
1488 case Magic('9'):
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001489 {
1490 int refnum = no_Magic(c) - '1';
1491
1492 if (!seen_endbrace(refnum + 1))
1493 return FAIL;
1494 EMIT(NFA_BACKREF1 + refnum);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001495 rex.nfa_has_backref = TRUE;
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001496 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02001497 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001498
1499 case Magic('z'):
1500 c = no_Magic(getchr());
1501 switch (c)
1502 {
1503 case 's':
1504 EMIT(NFA_ZSTART);
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001505 if (re_mult_next("\\zs") == FAIL)
1506 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001507 break;
1508 case 'e':
1509 EMIT(NFA_ZEND);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001510 rex.nfa_has_zend = TRUE;
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001511 if (re_mult_next("\\ze") == FAIL)
1512 return FAIL;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02001513 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001514#ifdef FEAT_SYN_HL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001515 case '1':
1516 case '2':
1517 case '3':
1518 case '4':
1519 case '5':
1520 case '6':
1521 case '7':
1522 case '8':
1523 case '9':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001524 // \z1...\z9
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001525 if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001526 EMSG_RET_FAIL(_(e_z1_z9_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001527 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001528 // No need to set rex.nfa_has_backref, the sub-matches don't
1529 // change when \z1 .. \z9 matches or not.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001530 re_has_z = REX_USE;
1531 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001532 case '(':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001533 // \z(
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001534 if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001535 EMSG_RET_FAIL(_(e_z_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001536 if (nfa_reg(REG_ZPAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001537 return FAIL; // cascaded error
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001538 re_has_z = REX_SET;
1539 break;
1540#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001541 default:
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001542 semsg(_(e_nfa_regexp_unknown_operator_z_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001543 return FAIL;
1544 }
1545 break;
1546
1547 case Magic('%'):
1548 c = no_Magic(getchr());
1549 switch (c)
1550 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001551 // () without a back reference
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001552 case '(':
1553 if (nfa_reg(REG_NPAREN) == FAIL)
1554 return FAIL;
1555 EMIT(NFA_NOPEN);
1556 break;
1557
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001558 case 'd': // %d123 decimal
1559 case 'o': // %o123 octal
1560 case 'x': // %xab hex 2
1561 case 'u': // %uabcd hex 4
1562 case 'U': // %U1234abcd hex 8
Bram Moolenaar47196582013-05-25 22:04:23 +02001563 {
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001564 long nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001565
Bram Moolenaar47196582013-05-25 22:04:23 +02001566 switch (c)
1567 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001568 case 'd': nr = getdecchrs(); break;
1569 case 'o': nr = getoctchrs(); break;
1570 case 'x': nr = gethexchrs(2); break;
1571 case 'u': nr = gethexchrs(4); break;
1572 case 'U': nr = gethexchrs(8); break;
1573 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +02001574 }
1575
Bram Moolenaar527a2d82019-02-21 22:28:51 +01001576 if (nr < 0 || nr > INT_MAX)
Bram Moolenaara6f79292022-01-04 21:30:47 +00001577 EMSG2_RET_FAIL(_(e_invalid_character_after_str_2),
1578 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001579 // A NUL is stored in the text as NL
1580 // TODO: what if a composing character follows?
Bram Moolenaar595cad22013-09-22 13:57:24 +02001581 EMIT(nr == 0 ? 0x0a : nr);
Bram Moolenaar47196582013-05-25 22:04:23 +02001582 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001583 break;
1584
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001585 // Catch \%^ and \%$ regardless of where they appear in the
1586 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001587 case '^':
1588 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001589 break;
1590
1591 case '$':
1592 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001593 break;
1594
1595 case '#':
Bram Moolenaar423532e2013-05-29 21:14:42 +02001596 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001597 break;
1598
1599 case 'V':
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001600 EMIT(NFA_VISUAL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001601 break;
1602
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02001603 case 'C':
1604 EMIT(NFA_ANY_COMPOSING);
1605 break;
1606
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001607 case '[':
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001608 {
1609 int n;
1610
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001611 // \%[abc]
Bram Moolenaard7986252013-06-17 21:33:41 +02001612 for (n = 0; (c = peekchr()) != ']'; ++n)
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001613 {
1614 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001615 EMSG2_RET_FAIL(_(e_missing_sb_after_str),
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001616 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001617 // recursive call!
Bram Moolenaard7986252013-06-17 21:33:41 +02001618 if (nfa_regatom() == FAIL)
1619 return FAIL;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001620 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001621 getchr(); // get the ]
Bram Moolenaar2976c022013-06-05 21:30:37 +02001622 if (n == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001623 EMSG2_RET_FAIL(_(e_empty_str_brackets),
Bram Moolenaar2976c022013-06-05 21:30:37 +02001624 reg_magic == MAGIC_ALL);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001625 EMIT(NFA_OPT_CHARS);
1626 EMIT(n);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001627
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001628 // Emit as "\%(\%[abc]\)" to be able to handle
1629 // "\%[abc]*" which would cause the empty string to be
1630 // matched an unlimited number of times. NFA_NOPEN is
1631 // added only once at a position, while NFA_SPLIT is
1632 // added multiple times. This is more efficient than
1633 // not allowing NFA_SPLIT multiple times, it is used
1634 // a lot.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001635 EMIT(NFA_NOPEN);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001636 break;
1637 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02001638
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001639 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +02001640 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001641 long_u n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001642 int cmp = c;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001643 int cur = FALSE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001644
1645 if (c == '<' || c == '>')
1646 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001647 if (no_Magic(c) == '.')
1648 {
1649 cur = TRUE;
1650 c = getchr();
1651 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001652 while (VIM_ISDIGIT(c))
1653 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001654 long_u tmp;
1655
1656 if (cur)
1657 semsg(_(e_regexp_number_after_dot_pos_search),
1658 no_Magic(c));
1659 tmp = n * 10 + (c - '0');
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001660
1661 if (tmp < n)
1662 {
1663 // overflow.
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001664 emsg(_(e_percent_value_too_large));
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001665 return FAIL;
1666 }
1667 n = tmp;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001668 c = getchr();
1669 }
1670 if (c == 'l' || c == 'c' || c == 'v')
1671 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001672 long_u limit = INT_MAX;
Bram Moolenaar9403a212019-02-13 18:35:06 +01001673
Bram Moolenaar423532e2013-05-29 21:14:42 +02001674 if (c == 'l')
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001675 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001676 if (cur)
1677 n = curwin->w_cursor.lnum;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001678 // \%{n}l \%{n}<l \%{n}>l
Bram Moolenaar423532e2013-05-29 21:14:42 +02001679 EMIT(cmp == '<' ? NFA_LNUM_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001680 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001681 if (save_prev_at_start)
1682 at_start = TRUE;
1683 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001684 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001685 {
1686 if (cur)
1687 {
1688 n = curwin->w_cursor.col;
1689 n++;
1690 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001691 // \%{n}c \%{n}<c \%{n}>c
Bram Moolenaar423532e2013-05-29 21:14:42 +02001692 EMIT(cmp == '<' ? NFA_COL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001693 cmp == '>' ? NFA_COL_GT : NFA_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001694 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001695 else
Bram Moolenaar9403a212019-02-13 18:35:06 +01001696 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001697 if (cur)
1698 {
1699 colnr_T vcol = 0;
1700
1701 getvvcol(curwin, &curwin->w_cursor,
1702 NULL, NULL, &vcol);
1703 n = ++vcol;
1704 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001705 // \%{n}v \%{n}<v \%{n}>v
Bram Moolenaar423532e2013-05-29 21:14:42 +02001706 EMIT(cmp == '<' ? NFA_VCOL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001707 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
Bram Moolenaar9403a212019-02-13 18:35:06 +01001708 limit = INT_MAX / MB_MAXBYTES;
1709 }
1710 if (n >= limit)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001711 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001712 emsg(_(e_percent_value_too_large));
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001713 return FAIL;
1714 }
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001715 EMIT((int)n);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001716 break;
1717 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02001718 else if (c == '\'' && n == 0)
1719 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001720 // \%'m \%<'m \%>'m
Bram Moolenaar044aa292013-06-04 21:27:38 +02001721 EMIT(cmp == '<' ? NFA_MARK_LT :
1722 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001723 EMIT(getchr());
Bram Moolenaar044aa292013-06-04 21:27:38 +02001724 break;
1725 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001726 }
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001727 semsg(_(e_nfa_regexp_unknown_operator_percent_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001728 return FAIL;
1729 }
1730 break;
1731
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001732 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001733collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001734 /*
Bram Moolenaar417bad22013-06-07 14:08:30 +02001735 * [abc] uses NFA_START_COLL - NFA_END_COLL
1736 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1737 * Each character is produced as a regular state, using
1738 * NFA_CONCAT to bind them together.
1739 * Besides normal characters there can be:
1740 * - character classes NFA_CLASS_*
1741 * - ranges, two characters followed by NFA_RANGE.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001742 */
1743
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001744 p = regparse;
1745 endp = skip_anyof(p);
1746 if (*endp == ']')
1747 {
1748 /*
1749 * Try to reverse engineer character classes. For example,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001750 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001751 * and perform the necessary substitutions in the NFA.
1752 */
1753 result = nfa_recognize_char_class(regparse, endp,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001754 extra == NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001755 if (result != FAIL)
1756 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001757 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001758 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001759 EMIT(result - NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001760 EMIT(NFA_NEWL);
1761 EMIT(NFA_OR);
1762 }
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001763 else
1764 EMIT(result);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001765 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001766 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001767 return OK;
1768 }
1769 /*
1770 * Failed to recognize a character class. Use the simple
1771 * version that turns [abc] into 'a' OR 'b' OR 'c'
1772 */
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001773 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001774 negated = FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001775 if (*regparse == '^') // negated range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001776 {
1777 negated = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001778 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001779 EMIT(NFA_START_NEG_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001780 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001781 else
1782 EMIT(NFA_START_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001783 if (*regparse == '-')
1784 {
1785 startc = '-';
1786 EMIT(startc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001787 EMIT(NFA_CONCAT);
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001788 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001789 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001790 // Emit the OR branches for each character in the []
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001791 emit_range = FALSE;
1792 while (regparse < endp)
1793 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001794 int oldstartc = startc;
1795
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001796 startc = -1;
1797 got_coll_char = FALSE;
1798 if (*regparse == '[')
1799 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001800 // Check for [: :], [= =], [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001801 equiclass = collclass = 0;
1802 charclass = get_char_class(&regparse);
1803 if (charclass == CLASS_NONE)
1804 {
1805 equiclass = get_equi_class(&regparse);
1806 if (equiclass == 0)
1807 collclass = get_coll_element(&regparse);
1808 }
1809
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001810 // Character class like [:alpha:]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001811 if (charclass != CLASS_NONE)
1812 {
1813 switch (charclass)
1814 {
1815 case CLASS_ALNUM:
1816 EMIT(NFA_CLASS_ALNUM);
1817 break;
1818 case CLASS_ALPHA:
1819 EMIT(NFA_CLASS_ALPHA);
1820 break;
1821 case CLASS_BLANK:
1822 EMIT(NFA_CLASS_BLANK);
1823 break;
1824 case CLASS_CNTRL:
1825 EMIT(NFA_CLASS_CNTRL);
1826 break;
1827 case CLASS_DIGIT:
1828 EMIT(NFA_CLASS_DIGIT);
1829 break;
1830 case CLASS_GRAPH:
1831 EMIT(NFA_CLASS_GRAPH);
1832 break;
1833 case CLASS_LOWER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001834 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001835 EMIT(NFA_CLASS_LOWER);
1836 break;
1837 case CLASS_PRINT:
1838 EMIT(NFA_CLASS_PRINT);
1839 break;
1840 case CLASS_PUNCT:
1841 EMIT(NFA_CLASS_PUNCT);
1842 break;
1843 case CLASS_SPACE:
1844 EMIT(NFA_CLASS_SPACE);
1845 break;
1846 case CLASS_UPPER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001847 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001848 EMIT(NFA_CLASS_UPPER);
1849 break;
1850 case CLASS_XDIGIT:
1851 EMIT(NFA_CLASS_XDIGIT);
1852 break;
1853 case CLASS_TAB:
1854 EMIT(NFA_CLASS_TAB);
1855 break;
1856 case CLASS_RETURN:
1857 EMIT(NFA_CLASS_RETURN);
1858 break;
1859 case CLASS_BACKSPACE:
1860 EMIT(NFA_CLASS_BACKSPACE);
1861 break;
1862 case CLASS_ESCAPE:
1863 EMIT(NFA_CLASS_ESCAPE);
1864 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001865 case CLASS_IDENT:
1866 EMIT(NFA_CLASS_IDENT);
1867 break;
1868 case CLASS_KEYWORD:
1869 EMIT(NFA_CLASS_KEYWORD);
1870 break;
1871 case CLASS_FNAME:
1872 EMIT(NFA_CLASS_FNAME);
1873 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001874 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001875 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001876 continue;
1877 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001878 // Try equivalence class [=a=] and the like
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001879 if (equiclass != 0)
1880 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02001881 result = nfa_emit_equi_class(equiclass);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001882 if (result == FAIL)
1883 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001884 // should never happen
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001885 EMSG_RET_FAIL(_(e_error_building_nfa_with_equivalence_class));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001886 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001887 continue;
1888 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001889 // Try collating class like [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001890 if (collclass != 0)
1891 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001892 startc = collclass; // allow [.a.]-x as a range
1893 // Will emit the proper atom at the end of the
1894 // while loop.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001895 }
1896 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001897 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1898 // start character.
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001899 if (*regparse == '-' && oldstartc != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001900 {
1901 emit_range = TRUE;
1902 startc = oldstartc;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001903 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001904 continue; // reading the end of the range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001905 }
1906
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001907 // Now handle simple and escaped characters.
1908 // Only "\]", "\^", "\-" and "\\" are special in Vi. Vim
1909 // accepts "\t", "\e", etc., but only when the 'l' flag in
1910 // 'cpoptions' is not included.
1911 // Posix doesn't recognize backslash at all.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001912 if (*regparse == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001913 && !reg_cpo_bsl
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001914 && regparse + 1 <= endp
1915 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001916 || (!reg_cpo_lit
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001917 && vim_strchr(REGEXP_ABBR, regparse[1])
1918 != NULL)
1919 )
1920 )
1921 {
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001922 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001923
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001924 if (*regparse == 'n')
Bram Moolenaara5483442019-02-17 20:17:02 +01001925 startc = (reg_string || emit_range
1926 || regparse[1] == '-') ? NL : NFA_NEWL;
Bram Moolenaarabab0b02019-03-30 18:47:01 +01001927 else if (*regparse == 'd'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001928 || *regparse == 'o'
1929 || *regparse == 'x'
1930 || *regparse == 'u'
1931 || *regparse == 'U'
1932 )
1933 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001934 // TODO(RE) This needs more testing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001935 startc = coll_get_char();
1936 got_coll_char = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001937 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001938 }
1939 else
1940 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001941 // \r,\t,\e,\b
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001942 startc = backslash_trans(*regparse);
1943 }
1944 }
1945
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001946 // Normal printable char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001947 if (startc == -1)
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001948 startc = PTR2CHAR(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001949
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001950 // Previous char was '-', so this char is end of range.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001951 if (emit_range)
1952 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001953 int endc = startc;
1954
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001955 startc = oldstartc;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001956 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001957 EMSG_RET_FAIL(_(e_reverse_range_in_character_class));
Bram Moolenaar417bad22013-06-07 14:08:30 +02001958
1959 if (endc > startc + 2)
1960 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001961 // Emit a range instead of the sequence of
1962 // individual characters.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001963 if (startc == 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001964 // \x00 is translated to \x0a, start at \x01.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001965 EMIT(1);
1966 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001967 --post_ptr; // remove NFA_CONCAT
Bram Moolenaar417bad22013-06-07 14:08:30 +02001968 EMIT(endc);
1969 EMIT(NFA_RANGE);
1970 EMIT(NFA_CONCAT);
1971 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001972 else if (has_mbyte && ((*mb_char2len)(startc) > 1
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001973 || (*mb_char2len)(endc) > 1))
1974 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001975 // Emit the characters in the range.
1976 // "startc" was already emitted, so skip it.
1977 //
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001978 for (c = startc + 1; c <= endc; c++)
1979 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001980 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001981 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001982 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001983 }
1984 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001985 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001986 // Emit the range. "startc" was already emitted, so
1987 // skip it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001988 for (c = startc + 1; c <= endc; c++)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00001989 {
1990 EMIT(c);
1991 EMIT(NFA_CONCAT);
1992 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001993 }
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001994 emit_range = FALSE;
1995 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001996 }
1997 else
1998 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001999 // This char (startc) is not part of a range. Just
2000 // emit it.
2001 // Normally, simply emit startc. But if we get char
2002 // code=0 from a collating char, then replace it with
2003 // 0x0a.
2004 // This is needed to completely mimic the behaviour of
2005 // the backtracking engine.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002006 if (startc == NFA_NEWL)
2007 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002008 // Line break can't be matched as part of the
2009 // collection, add an OR below. But not for negated
2010 // range.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002011 if (!negated)
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002012 extra = NFA_ADD_NL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002013 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002014 else
Bram Moolenaar417bad22013-06-07 14:08:30 +02002015 {
2016 if (got_coll_char == TRUE && startc == 0)
2017 EMIT(0x0a);
2018 else
2019 EMIT(startc);
2020 EMIT(NFA_CONCAT);
2021 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002022 }
2023
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002024 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002025 } // while (p < endp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002026
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002027 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002028 if (*regparse == '-') // if last, '-' is just a char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002029 {
2030 EMIT('-');
Bram Moolenaar417bad22013-06-07 14:08:30 +02002031 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002032 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002033
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002034 // skip the trailing ]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002035 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002036 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002037
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002038 // Mark end of the collection.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002039 if (negated == TRUE)
Bram Moolenaar417bad22013-06-07 14:08:30 +02002040 EMIT(NFA_END_NEG_COLL);
2041 else
2042 EMIT(NFA_END_COLL);
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002043
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002044 // \_[] also matches \n but it's not negated
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002045 if (extra == NFA_ADD_NL)
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002046 {
2047 EMIT(reg_string ? NL : NFA_NEWL);
2048 EMIT(NFA_OR);
2049 }
2050
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002051 return OK;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002052 } // if exists closing ]
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002053
2054 if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002055 EMSG_RET_FAIL(_(e_missing_rsb_after_str_lsb));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002056 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002057
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002058 default:
2059 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002060 int plen;
2061
2062nfa_do_multibyte:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002063 // plen is length of current char with composing chars
Bram Moolenaar47196582013-05-25 22:04:23 +02002064 if (enc_utf8 && ((*mb_char2len)(c)
Bram Moolenaarace95982017-03-29 17:30:27 +02002065 != (plen = utfc_ptr2len(old_regparse))
Bram Moolenaar47196582013-05-25 22:04:23 +02002066 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002067 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02002068 int i = 0;
2069
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002070 // A base character plus composing characters, or just one
2071 // or more composing characters.
2072 // This requires creating a separate atom as if enclosing
2073 // the characters in (), where NFA_COMPOSING is the ( and
2074 // NFA_END_COMPOSING is the ). Note that right now we are
2075 // building the postfix form, not the NFA itself;
2076 // a composing char could be: a, b, c, NFA_COMPOSING
2077 // where 'b' and 'c' are chars with codes > 256.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002078 for (;;)
2079 {
2080 EMIT(c);
2081 if (i > 0)
2082 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002083 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002084 break;
2085 c = utf_ptr2char(old_regparse + i);
2086 }
2087 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002088 regparse = old_regparse + plen;
2089 }
2090 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002091 {
2092 c = no_Magic(c);
2093 EMIT(c);
2094 }
2095 return OK;
2096 }
2097 }
2098
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002099 return OK;
2100}
2101
2102/*
2103 * Parse something followed by possible [*+=].
2104 *
2105 * A piece is an atom, possibly followed by a multi, an indication of how many
2106 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2107 * characters: "", "a", "aa", etc.
2108 *
2109 * piece ::= atom
2110 * or atom multi
2111 */
2112 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002113nfa_regpiece(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002114{
2115 int i;
2116 int op;
2117 int ret;
2118 long minval, maxval;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002119 int greedy = TRUE; // Braces are prefixed with '-' ?
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002120 parse_state_T old_state;
2121 parse_state_T new_state;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01002122 long c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02002123 int old_post_pos;
2124 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002125 int quest;
2126
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002127 // Save the current parse state, so that we can use it if <atom>{m,n} is
2128 // next.
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002129 save_parse_state(&old_state);
2130
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002131 // store current pos in the postfix form, for \{m,n} involving 0s
Bram Moolenaar16299b52013-05-30 18:45:23 +02002132 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002133
2134 ret = nfa_regatom();
2135 if (ret == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002136 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002137
2138 op = peekchr();
2139 if (re_multi_type(op) == NOT_MULTI)
2140 return OK;
2141
2142 skipchr();
2143 switch (op)
2144 {
2145 case Magic('*'):
2146 EMIT(NFA_STAR);
2147 break;
2148
2149 case Magic('+'):
2150 /*
2151 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2152 * first and only submatch would be "aaa". But the backtracking
2153 * engine interprets the plus as "try matching one more time", and
2154 * a* matches a second time at the end of the input, the empty
2155 * string.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002156 * The submatch will be the empty string.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002157 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002158 * In order to be consistent with the old engine, we replace
2159 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002160 */
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002161 restore_parse_state(&old_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002162 curchr = -1;
2163 if (nfa_regatom() == FAIL)
2164 return FAIL;
2165 EMIT(NFA_STAR);
2166 EMIT(NFA_CONCAT);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002167 skipchr(); // skip the \+
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002168 break;
2169
2170 case Magic('@'):
Bram Moolenaar61602c52013-06-01 19:54:43 +02002171 c2 = getdecchrs();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002172 op = no_Magic(getchr());
Bram Moolenaar61602c52013-06-01 19:54:43 +02002173 i = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002174 switch(op)
2175 {
2176 case '=':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002177 // \@=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002178 i = NFA_PREV_ATOM_NO_WIDTH;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002179 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002180 case '!':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002181 // \@!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002182 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002183 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002184 case '<':
Bram Moolenaar61602c52013-06-01 19:54:43 +02002185 op = no_Magic(getchr());
2186 if (op == '=')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002187 // \@<=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002188 i = NFA_PREV_ATOM_JUST_BEFORE;
2189 else if (op == '!')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002190 // \@<!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002191 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2192 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002193 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002194 // \@>
Bram Moolenaar87953742013-06-05 18:52:40 +02002195 i = NFA_PREV_ATOM_LIKE_PATTERN;
2196 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002197 }
Bram Moolenaar61602c52013-06-01 19:54:43 +02002198 if (i == 0)
2199 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002200 semsg(_(e_nfa_regexp_unknown_operator_at_chr), op);
Bram Moolenaar61602c52013-06-01 19:54:43 +02002201 return FAIL;
2202 }
2203 EMIT(i);
2204 if (i == NFA_PREV_ATOM_JUST_BEFORE
2205 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2206 EMIT(c2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002207 break;
2208
2209 case Magic('?'):
2210 case Magic('='):
2211 EMIT(NFA_QUEST);
2212 break;
2213
2214 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002215 // a{2,5} will expand to 'aaa?a?a?'
2216 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2217 // version of '?'
2218 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2219 // parentheses have the same id
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002220
2221 greedy = TRUE;
2222 c2 = peekchr();
2223 if (c2 == '-' || c2 == Magic('-'))
2224 {
2225 skipchr();
2226 greedy = FALSE;
2227 }
2228 if (!read_limits(&minval, &maxval))
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002229 EMSG_RET_FAIL(_(e_nfa_regexp_error_reading_repetition_limits));
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002230
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002231 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2232 // <atom>*
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002233 if (minval == 0 && maxval == MAX_LIMIT)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002234 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002235 if (greedy) // { { (match the braces)
2236 // \{}, \{0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002237 EMIT(NFA_STAR);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002238 else // { { (match the braces)
2239 // \{-}, \{-0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002240 EMIT(NFA_STAR_NONGREEDY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002241 break;
2242 }
2243
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002244 // Special case: x{0} or x{-0}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002245 if (maxval == 0)
2246 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002247 // Ignore result of previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002248 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002249 // NFA_EMPTY is 0-length and works everywhere
Bram Moolenaar699c1202013-09-25 16:41:54 +02002250 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002251 return OK;
2252 }
2253
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002254 // The engine is very inefficient (uses too many states) when the
2255 // maximum is much larger than the minimum and when the maximum is
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002256 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2257 // will emit NFA_STAR.
2258 // Bail out if we can use the other engine, but only, when the
2259 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00002260 // does not work with characters > 8 bit with the BT engine)
Bram Moolenaara1d2c582015-02-10 18:18:17 +01002261 if ((nfa_re_flags & RE_AUTO)
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002262 && (maxval > 500 || maxval > minval + 200)
2263 && (maxval != MAX_LIMIT && minval < 200)
2264 && !wants_nfa)
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002265 return FAIL;
2266
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002267 // Ignore previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002268 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002269 // Save parse state after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002270 save_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002271
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002272 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2273 for (i = 0; i < maxval; i++)
2274 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002275 // Goto beginning of the repeated atom
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002276 restore_parse_state(&old_state);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002277 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002278 if (nfa_regatom() == FAIL)
2279 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002280 // after "minval" times, atoms are optional
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002281 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002282 {
2283 if (maxval == MAX_LIMIT)
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002284 {
2285 if (greedy)
2286 EMIT(NFA_STAR);
2287 else
2288 EMIT(NFA_STAR_NONGREEDY);
2289 }
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002290 else
2291 EMIT(quest);
2292 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02002293 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002294 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002295 if (i + 1 > minval && maxval == MAX_LIMIT)
2296 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002297 }
2298
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002299 // Go to just after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002300 restore_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002301 curchr = -1;
2302
2303 break;
2304
2305
2306 default:
2307 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002308 } // end switch
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002309
2310 if (re_multi_type(peekchr()) != NOT_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002311 // Can't have a multi follow a multi.
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002312 EMSG_RET_FAIL(_(e_nfa_regexp_cant_have_multi_follow_multi));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002313
2314 return OK;
2315}
2316
2317/*
2318 * Parse one or more pieces, concatenated. It matches a match for the
2319 * first piece, followed by a match for the second piece, etc. Example:
2320 * "f[0-9]b", first matches "f", then a digit and then "b".
2321 *
2322 * concat ::= piece
2323 * or piece piece
2324 * or piece piece piece
2325 * etc.
2326 */
2327 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002328nfa_regconcat(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002329{
2330 int cont = TRUE;
2331 int first = TRUE;
2332
2333 while (cont)
2334 {
2335 switch (peekchr())
2336 {
2337 case NUL:
2338 case Magic('|'):
2339 case Magic('&'):
2340 case Magic(')'):
2341 cont = FALSE;
2342 break;
2343
2344 case Magic('Z'):
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002345 regflags |= RF_ICOMBINE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002346 skipchr_keepstart();
2347 break;
2348 case Magic('c'):
2349 regflags |= RF_ICASE;
2350 skipchr_keepstart();
2351 break;
2352 case Magic('C'):
2353 regflags |= RF_NOICASE;
2354 skipchr_keepstart();
2355 break;
2356 case Magic('v'):
2357 reg_magic = MAGIC_ALL;
2358 skipchr_keepstart();
2359 curchr = -1;
2360 break;
2361 case Magic('m'):
2362 reg_magic = MAGIC_ON;
2363 skipchr_keepstart();
2364 curchr = -1;
2365 break;
2366 case Magic('M'):
2367 reg_magic = MAGIC_OFF;
2368 skipchr_keepstart();
2369 curchr = -1;
2370 break;
2371 case Magic('V'):
2372 reg_magic = MAGIC_NONE;
2373 skipchr_keepstart();
2374 curchr = -1;
2375 break;
2376
2377 default:
2378 if (nfa_regpiece() == FAIL)
2379 return FAIL;
2380 if (first == FALSE)
2381 EMIT(NFA_CONCAT);
2382 else
2383 first = FALSE;
2384 break;
2385 }
2386 }
2387
2388 return OK;
2389}
2390
2391/*
2392 * Parse a branch, one or more concats, separated by "\&". It matches the
2393 * last concat, but only if all the preceding concats also match at the same
2394 * position. Examples:
2395 * "foobeep\&..." matches "foo" in "foobeep".
2396 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2397 *
2398 * branch ::= concat
2399 * or concat \& concat
2400 * or concat \& concat \& concat
2401 * etc.
2402 */
2403 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002404nfa_regbranch(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002405{
Bram Moolenaar16299b52013-05-30 18:45:23 +02002406 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002407
Bram Moolenaar16299b52013-05-30 18:45:23 +02002408 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002409
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002410 // First branch, possibly the only one
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002411 if (nfa_regconcat() == FAIL)
2412 return FAIL;
2413
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002414 // Try next concats
Bram Moolenaar890dd052017-12-16 19:59:37 +01002415 while (peekchr() == Magic('&'))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002416 {
2417 skipchr();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002418 // if concat is empty do emit a node
Bram Moolenaar890dd052017-12-16 19:59:37 +01002419 if (old_post_pos == (int)(post_ptr - post_start))
2420 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002421 EMIT(NFA_NOPEN);
2422 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002423 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002424 if (nfa_regconcat() == FAIL)
2425 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002426 // if concat is empty do emit a node
Bram Moolenaar16299b52013-05-30 18:45:23 +02002427 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002428 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002429 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002430 }
2431
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002432 // if a branch is empty, emit one node for it
Bram Moolenaar16299b52013-05-30 18:45:23 +02002433 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002434 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002435
2436 return OK;
2437}
2438
2439/*
2440 * Parse a pattern, one or more branches, separated by "\|". It matches
2441 * anything that matches one of the branches. Example: "foo\|beep" matches
2442 * "foo" and matches "beep". If more than one branch matches, the first one
2443 * is used.
2444 *
2445 * pattern ::= branch
2446 * or branch \| branch
2447 * or branch \| branch \| branch
2448 * etc.
2449 */
2450 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002451nfa_reg(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002452 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002453{
2454 int parno = 0;
2455
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002456 if (paren == REG_PAREN)
2457 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002458 if (regnpar >= NSUBEXP) // Too many `('
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002459 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_parens));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002460 parno = regnpar++;
2461 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002462#ifdef FEAT_SYN_HL
2463 else if (paren == REG_ZPAREN)
2464 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002465 // Make a ZOPEN node.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002466 if (regnzpar >= NSUBEXP)
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002467 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_z));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002468 parno = regnzpar++;
2469 }
2470#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002471
2472 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002473 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002474
2475 while (peekchr() == Magic('|'))
2476 {
2477 skipchr();
2478 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002479 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002480 EMIT(NFA_OR);
2481 }
2482
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002483 // Check for proper termination.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002484 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2485 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002486 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002487 EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2488 reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002489 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002490 EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002491 }
2492 else if (paren == REG_NOPAREN && peekchr() != NUL)
2493 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002494 if (peekchr() == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002495 EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002496 else
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002497 EMSG_RET_FAIL(_(e_nfa_regexp_proper_termination_error));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002498 }
2499 /*
2500 * Here we set the flag allowing back references to this set of
2501 * parentheses.
2502 */
2503 if (paren == REG_PAREN)
2504 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002505 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002506 EMIT(NFA_MOPEN + parno);
2507 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002508#ifdef FEAT_SYN_HL
2509 else if (paren == REG_ZPAREN)
2510 EMIT(NFA_ZOPEN + parno);
2511#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002512
2513 return OK;
2514}
2515
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002516#ifdef DEBUG
2517static char_u code[50];
2518
2519 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002520nfa_set_code(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002521{
2522 int addnl = FALSE;
2523
2524 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2525 {
2526 addnl = TRUE;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002527 c -= NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002528 }
2529
2530 STRCPY(code, "");
2531 switch (c)
2532 {
2533 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2534 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2535 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2536 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2537 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2538 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2539
Bram Moolenaar5714b802013-05-28 22:03:20 +02002540 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2541 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2542 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2543 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2544 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2545 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2546 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2547 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2548 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002549#ifdef FEAT_SYN_HL
2550 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2551 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2552 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2553 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2554 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2555 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2556 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2557 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2558 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2559#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02002560 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2561
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002562 case NFA_PREV_ATOM_NO_WIDTH:
2563 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02002564 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2565 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002566 case NFA_PREV_ATOM_JUST_BEFORE:
2567 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2568 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2569 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002570 case NFA_PREV_ATOM_LIKE_PATTERN:
2571 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2572
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002573 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2574 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002575 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002576 case NFA_START_INVISIBLE_FIRST:
2577 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002578 case NFA_START_INVISIBLE_NEG:
2579 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002580 case NFA_START_INVISIBLE_NEG_FIRST:
2581 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002582 case NFA_START_INVISIBLE_BEFORE:
2583 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002584 case NFA_START_INVISIBLE_BEFORE_FIRST:
2585 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002586 case NFA_START_INVISIBLE_BEFORE_NEG:
2587 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002588 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2589 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002590 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002591 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002592 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002593 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002594
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002595 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2596 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02002597 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002598
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002599 case NFA_MOPEN:
2600 case NFA_MOPEN1:
2601 case NFA_MOPEN2:
2602 case NFA_MOPEN3:
2603 case NFA_MOPEN4:
2604 case NFA_MOPEN5:
2605 case NFA_MOPEN6:
2606 case NFA_MOPEN7:
2607 case NFA_MOPEN8:
2608 case NFA_MOPEN9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002609 STRCPY(code, "NFA_MOPEN(x)");
2610 code[10] = c - NFA_MOPEN + '0';
2611 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002612 case NFA_MCLOSE:
2613 case NFA_MCLOSE1:
2614 case NFA_MCLOSE2:
2615 case NFA_MCLOSE3:
2616 case NFA_MCLOSE4:
2617 case NFA_MCLOSE5:
2618 case NFA_MCLOSE6:
2619 case NFA_MCLOSE7:
2620 case NFA_MCLOSE8:
2621 case NFA_MCLOSE9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002622 STRCPY(code, "NFA_MCLOSE(x)");
2623 code[11] = c - NFA_MCLOSE + '0';
2624 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002625#ifdef FEAT_SYN_HL
2626 case NFA_ZOPEN:
2627 case NFA_ZOPEN1:
2628 case NFA_ZOPEN2:
2629 case NFA_ZOPEN3:
2630 case NFA_ZOPEN4:
2631 case NFA_ZOPEN5:
2632 case NFA_ZOPEN6:
2633 case NFA_ZOPEN7:
2634 case NFA_ZOPEN8:
2635 case NFA_ZOPEN9:
2636 STRCPY(code, "NFA_ZOPEN(x)");
2637 code[10] = c - NFA_ZOPEN + '0';
2638 break;
2639 case NFA_ZCLOSE:
2640 case NFA_ZCLOSE1:
2641 case NFA_ZCLOSE2:
2642 case NFA_ZCLOSE3:
2643 case NFA_ZCLOSE4:
2644 case NFA_ZCLOSE5:
2645 case NFA_ZCLOSE6:
2646 case NFA_ZCLOSE7:
2647 case NFA_ZCLOSE8:
2648 case NFA_ZCLOSE9:
2649 STRCPY(code, "NFA_ZCLOSE(x)");
2650 code[11] = c - NFA_ZCLOSE + '0';
2651 break;
2652#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002653 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2654 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2655 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2656 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02002657 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2658 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002659 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2660 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2661 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2662 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2663 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2664 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2665 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2666 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2667 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2668 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2669 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2670 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2671 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2672 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02002673 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002674
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002675 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002676 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2677 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2678 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
Bram Moolenaar699c1202013-09-25 16:41:54 +02002679 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002680 case NFA_OR: STRCPY(code, "NFA_OR"); break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002681
2682 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2683 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2684 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2685 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2686 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2687 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2688 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2689
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002690 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2691 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2692 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2693 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2694 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2695 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2696 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2697 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2698 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2699 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2700 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2701 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2702 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2703 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2704 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2705 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01002706 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2707 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2708 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002709
2710 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2711 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2712 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2713 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2714 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2715 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2716 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2717 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2718 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2719 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2720 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2721 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2722 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2723 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2724 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2725 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2726 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2727 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2728 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2729 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2730 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2731 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2732 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2733 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2734 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2735 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2736 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002737 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2738 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2739 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2740 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002741
2742 default:
2743 STRCPY(code, "CHAR(x)");
2744 code[5] = c;
2745 }
2746
2747 if (addnl == TRUE)
2748 STRCAT(code, " + NEWLINE ");
2749
2750}
2751
2752#ifdef ENABLE_LOG
2753static FILE *log_fd;
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002754static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002755
2756/*
2757 * Print the postfix notation of the current regexp.
2758 */
2759 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002760nfa_postfix_dump(char_u *expr, int retval)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002761{
2762 int *p;
2763 FILE *f;
2764
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002765 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002766 if (f != NULL)
2767 {
2768 fprintf(f, "\n-------------------------\n");
2769 if (retval == FAIL)
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002770 fprintf(f, ">>> NFA engine failed... \n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002771 else if (retval == OK)
2772 fprintf(f, ">>> NFA engine succeeded !\n");
2773 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002774 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002775 {
2776 nfa_set_code(*p);
2777 fprintf(f, "%s, ", code);
2778 }
2779 fprintf(f, "\"\nPostfix notation (int): ");
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002780 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002781 fprintf(f, "%d ", *p);
2782 fprintf(f, "\n\n");
2783 fclose(f);
2784 }
2785}
2786
2787/*
2788 * Print the NFA starting with a root node "state".
2789 */
2790 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002791nfa_print_state(FILE *debugf, nfa_state_T *state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002792{
Bram Moolenaar152e7892013-05-25 12:28:11 +02002793 garray_T indent;
2794
2795 ga_init2(&indent, 1, 64);
2796 ga_append(&indent, '\0');
2797 nfa_print_state2(debugf, state, &indent);
2798 ga_clear(&indent);
2799}
2800
2801 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002802nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
Bram Moolenaar152e7892013-05-25 12:28:11 +02002803{
2804 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002805
2806 if (state == NULL)
2807 return;
2808
2809 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02002810
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002811 // Output indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002812 p = (char_u *)indent->ga_data;
2813 if (indent->ga_len >= 3)
2814 {
2815 int last = indent->ga_len - 3;
2816 char_u save[2];
2817
2818 STRNCPY(save, &p[last], 2);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00002819 memcpy(&p[last], "+-", 2);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002820 fprintf(debugf, " %s", p);
2821 STRNCPY(&p[last], save, 2);
2822 }
2823 else
2824 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002825
2826 nfa_set_code(state->c);
Bram Moolenaardecd9542013-06-07 16:31:50 +02002827 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
Bram Moolenaar417bad22013-06-07 14:08:30 +02002828 code,
2829 state->c,
2830 abs(state->id),
2831 state->val);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002832 if (state->id < 0)
2833 return;
2834
2835 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02002836
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002837 // grow indent for state->out
Bram Moolenaar152e7892013-05-25 12:28:11 +02002838 indent->ga_len -= 1;
2839 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002840 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002841 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002842 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002843 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002844
2845 nfa_print_state2(debugf, state->out, indent);
2846
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002847 // replace last part of indent for state->out1
Bram Moolenaar152e7892013-05-25 12:28:11 +02002848 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002849 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002850 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002851
2852 nfa_print_state2(debugf, state->out1, indent);
2853
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002854 // shrink indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002855 indent->ga_len -= 3;
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002856 ga_append(indent, NUL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002857}
2858
2859/*
2860 * Print the NFA state machine.
2861 */
2862 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002863nfa_dump(nfa_regprog_T *prog)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002864{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002865 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002866
2867 if (debugf != NULL)
2868 {
Bram Moolenaar152e7892013-05-25 12:28:11 +02002869 nfa_print_state(debugf, prog->start);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002870
Bram Moolenaar473de612013-06-08 18:19:48 +02002871 if (prog->reganch)
2872 fprintf(debugf, "reganch: %d\n", prog->reganch);
2873 if (prog->regstart != NUL)
2874 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2875 prog->regstart, prog->regstart);
2876 if (prog->match_text != NULL)
2877 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002878
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002879 fclose(debugf);
2880 }
2881}
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002882#endif // ENABLE_LOG
2883#endif // DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002884
2885/*
2886 * Parse r.e. @expr and convert it into postfix form.
2887 * Return the postfix string on success, NULL otherwise.
2888 */
2889 static int *
Bram Moolenaar05540972016-01-30 20:31:25 +01002890re2post(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002891{
2892 if (nfa_reg(REG_NOPAREN) == FAIL)
2893 return NULL;
2894 EMIT(NFA_MOPEN);
2895 return post_start;
2896}
2897
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002898// NB. Some of the code below is inspired by Russ's.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002899
2900/*
2901 * Represents an NFA state plus zero or one or two arrows exiting.
2902 * if c == MATCH, no arrows out; matching state.
2903 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2904 * If c < 256, labeled arrow with character c to out.
2905 */
2906
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002907static nfa_state_T *state_ptr; // points to nfa_prog->state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002908
2909/*
2910 * Allocate and initialize nfa_state_T.
2911 */
2912 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002913alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002914{
2915 nfa_state_T *s;
2916
2917 if (istate >= nstate)
2918 return NULL;
2919
2920 s = &state_ptr[istate++];
2921
2922 s->c = c;
2923 s->out = out;
2924 s->out1 = out1;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002925 s->val = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002926
2927 s->id = istate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02002928 s->lastlist[0] = 0;
2929 s->lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002930
2931 return s;
2932}
2933
2934/*
2935 * A partially built NFA without the matching state filled in.
2936 * Frag_T.start points at the start state.
2937 * Frag_T.out is a list of places that need to be set to the
2938 * next state for this fragment.
2939 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002940
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002941// Since the out pointers in the list are always
2942// uninitialized, we use the pointers themselves
2943// as storage for the Ptrlists.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002944typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002945union Ptrlist
2946{
2947 Ptrlist *next;
2948 nfa_state_T *s;
2949};
2950
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002951struct Frag
2952{
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002953 nfa_state_T *start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002954 Ptrlist *out;
2955};
2956typedef struct Frag Frag_T;
2957
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002958/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02002959 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002960 */
2961 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01002962frag(nfa_state_T *start, Ptrlist *out)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002963{
Bram Moolenaar053bb602013-05-20 13:55:21 +02002964 Frag_T n;
2965
2966 n.start = start;
2967 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002968 return n;
2969}
2970
2971/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002972 * Create singleton list containing just outp.
2973 */
2974 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01002975list1(
2976 nfa_state_T **outp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002977{
2978 Ptrlist *l;
2979
2980 l = (Ptrlist *)outp;
2981 l->next = NULL;
2982 return l;
2983}
2984
2985/*
2986 * Patch the list of states at out to point to start.
2987 */
2988 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002989patch(Ptrlist *l, nfa_state_T *s)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002990{
2991 Ptrlist *next;
2992
2993 for (; l; l = next)
2994 {
2995 next = l->next;
2996 l->s = s;
2997 }
2998}
2999
3000
3001/*
3002 * Join the two lists l1 and l2, returning the combination.
3003 */
3004 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003005append(Ptrlist *l1, Ptrlist *l2)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003006{
3007 Ptrlist *oldl1;
3008
3009 oldl1 = l1;
3010 while (l1->next)
3011 l1 = l1->next;
3012 l1->next = l2;
3013 return oldl1;
3014}
3015
3016/*
3017 * Stack used for transforming postfix form into NFA.
3018 */
3019static Frag_T empty;
3020
3021 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003022st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003023{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003024#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003025 FILE *df;
3026 int *p2;
3027
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003028 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003029 if (df)
3030 {
3031 fprintf(df, "Error popping the stack!\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003032# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003033 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003034# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003035 fprintf(df, "Postfix form is: ");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003036# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003037 for (p2 = postfix; p2 < end; p2++)
3038 {
3039 nfa_set_code(*p2);
3040 fprintf(df, "%s, ", code);
3041 }
3042 nfa_set_code(*p);
3043 fprintf(df, "\nCurrent position is: ");
3044 for (p2 = postfix; p2 <= p; p2 ++)
3045 {
3046 nfa_set_code(*p2);
3047 fprintf(df, "%s, ", code);
3048 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02003049# else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003050 for (p2 = postfix; p2 < end; p2++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003051 fprintf(df, "%d, ", *p2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003052 fprintf(df, "\nCurrent position is: ");
3053 for (p2 = postfix; p2 <= p; p2 ++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003054 fprintf(df, "%d, ", *p2);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003055# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003056 fprintf(df, "\n--------------------------\n");
3057 fclose(df);
3058 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003059#endif
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003060 emsg(_(e_nfa_regexp_could_not_pop_stack));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003061}
3062
3063/*
3064 * Push an item onto the stack.
3065 */
3066 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003067st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003068{
3069 Frag_T *stackp = *p;
3070
3071 if (stackp >= stack_end)
3072 return;
3073 *stackp = s;
3074 *p = *p + 1;
3075}
3076
3077/*
3078 * Pop an item from the stack.
3079 */
3080 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003081st_pop(Frag_T **p, Frag_T *stack)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003082{
3083 Frag_T *stackp;
3084
3085 *p = *p - 1;
3086 stackp = *p;
3087 if (stackp < stack)
3088 return empty;
3089 return **p;
3090}
3091
3092/*
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003093 * Estimate the maximum byte length of anything matching "state".
3094 * When unknown or unlimited return -1.
3095 */
3096 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01003097nfa_max_width(nfa_state_T *startstate, int depth)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003098{
3099 int l, r;
3100 nfa_state_T *state = startstate;
3101 int len = 0;
3102
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003103 // detect looping in a NFA_SPLIT
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003104 if (depth > 4)
3105 return -1;
3106
Bram Moolenaarfe70acb2013-06-21 18:31:23 +02003107 while (state != NULL)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003108 {
3109 switch (state->c)
3110 {
3111 case NFA_END_INVISIBLE:
3112 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003113 // the end, return what we have
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003114 return len;
3115
3116 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003117 // two alternatives, use the maximum
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003118 l = nfa_max_width(state->out, depth + 1);
3119 r = nfa_max_width(state->out1, depth + 1);
3120 if (l < 0 || r < 0)
3121 return -1;
3122 return len + (l > r ? l : r);
3123
3124 case NFA_ANY:
3125 case NFA_START_COLL:
3126 case NFA_START_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003127 // matches some character, including composing chars
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003128 if (enc_utf8)
3129 len += MB_MAXBYTES;
3130 else if (has_mbyte)
3131 len += 2;
3132 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003133 ++len;
3134 if (state->c != NFA_ANY)
3135 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003136 // skip over the characters
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003137 state = state->out1->out;
3138 continue;
3139 }
3140 break;
3141
3142 case NFA_DIGIT:
3143 case NFA_WHITE:
3144 case NFA_HEX:
3145 case NFA_OCTAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003146 // ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003147 ++len;
3148 break;
3149
3150 case NFA_IDENT:
3151 case NFA_SIDENT:
3152 case NFA_KWORD:
3153 case NFA_SKWORD:
3154 case NFA_FNAME:
3155 case NFA_SFNAME:
3156 case NFA_PRINT:
3157 case NFA_SPRINT:
3158 case NFA_NWHITE:
3159 case NFA_NDIGIT:
3160 case NFA_NHEX:
3161 case NFA_NOCTAL:
3162 case NFA_WORD:
3163 case NFA_NWORD:
3164 case NFA_HEAD:
3165 case NFA_NHEAD:
3166 case NFA_ALPHA:
3167 case NFA_NALPHA:
3168 case NFA_LOWER:
3169 case NFA_NLOWER:
3170 case NFA_UPPER:
3171 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02003172 case NFA_LOWER_IC:
3173 case NFA_NLOWER_IC:
3174 case NFA_UPPER_IC:
3175 case NFA_NUPPER_IC:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02003176 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003177 // possibly non-ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003178 if (has_mbyte)
3179 len += 3;
3180 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003181 ++len;
3182 break;
3183
3184 case NFA_START_INVISIBLE:
3185 case NFA_START_INVISIBLE_NEG:
3186 case NFA_START_INVISIBLE_BEFORE:
3187 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003188 // zero-width, out1 points to the END state
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003189 state = state->out1->out;
3190 continue;
3191
3192 case NFA_BACKREF1:
3193 case NFA_BACKREF2:
3194 case NFA_BACKREF3:
3195 case NFA_BACKREF4:
3196 case NFA_BACKREF5:
3197 case NFA_BACKREF6:
3198 case NFA_BACKREF7:
3199 case NFA_BACKREF8:
3200 case NFA_BACKREF9:
3201#ifdef FEAT_SYN_HL
3202 case NFA_ZREF1:
3203 case NFA_ZREF2:
3204 case NFA_ZREF3:
3205 case NFA_ZREF4:
3206 case NFA_ZREF5:
3207 case NFA_ZREF6:
3208 case NFA_ZREF7:
3209 case NFA_ZREF8:
3210 case NFA_ZREF9:
3211#endif
3212 case NFA_NEWL:
3213 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003214 // unknown width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003215 return -1;
3216
3217 case NFA_BOL:
3218 case NFA_EOL:
3219 case NFA_BOF:
3220 case NFA_EOF:
3221 case NFA_BOW:
3222 case NFA_EOW:
3223 case NFA_MOPEN:
3224 case NFA_MOPEN1:
3225 case NFA_MOPEN2:
3226 case NFA_MOPEN3:
3227 case NFA_MOPEN4:
3228 case NFA_MOPEN5:
3229 case NFA_MOPEN6:
3230 case NFA_MOPEN7:
3231 case NFA_MOPEN8:
3232 case NFA_MOPEN9:
3233#ifdef FEAT_SYN_HL
3234 case NFA_ZOPEN:
3235 case NFA_ZOPEN1:
3236 case NFA_ZOPEN2:
3237 case NFA_ZOPEN3:
3238 case NFA_ZOPEN4:
3239 case NFA_ZOPEN5:
3240 case NFA_ZOPEN6:
3241 case NFA_ZOPEN7:
3242 case NFA_ZOPEN8:
3243 case NFA_ZOPEN9:
3244 case NFA_ZCLOSE:
3245 case NFA_ZCLOSE1:
3246 case NFA_ZCLOSE2:
3247 case NFA_ZCLOSE3:
3248 case NFA_ZCLOSE4:
3249 case NFA_ZCLOSE5:
3250 case NFA_ZCLOSE6:
3251 case NFA_ZCLOSE7:
3252 case NFA_ZCLOSE8:
3253 case NFA_ZCLOSE9:
3254#endif
3255 case NFA_MCLOSE:
3256 case NFA_MCLOSE1:
3257 case NFA_MCLOSE2:
3258 case NFA_MCLOSE3:
3259 case NFA_MCLOSE4:
3260 case NFA_MCLOSE5:
3261 case NFA_MCLOSE6:
3262 case NFA_MCLOSE7:
3263 case NFA_MCLOSE8:
3264 case NFA_MCLOSE9:
3265 case NFA_NOPEN:
3266 case NFA_NCLOSE:
3267
3268 case NFA_LNUM_GT:
3269 case NFA_LNUM_LT:
3270 case NFA_COL_GT:
3271 case NFA_COL_LT:
3272 case NFA_VCOL_GT:
3273 case NFA_VCOL_LT:
3274 case NFA_MARK_GT:
3275 case NFA_MARK_LT:
3276 case NFA_VISUAL:
3277 case NFA_LNUM:
3278 case NFA_CURSOR:
3279 case NFA_COL:
3280 case NFA_VCOL:
3281 case NFA_MARK:
3282
3283 case NFA_ZSTART:
3284 case NFA_ZEND:
3285 case NFA_OPT_CHARS:
Bram Moolenaar699c1202013-09-25 16:41:54 +02003286 case NFA_EMPTY:
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003287 case NFA_START_PATTERN:
3288 case NFA_END_PATTERN:
3289 case NFA_COMPOSING:
3290 case NFA_END_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003291 // zero-width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003292 break;
3293
3294 default:
3295 if (state->c < 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003296 // don't know what this is
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003297 return -1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003298 // normal character
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003299 len += MB_CHAR2LEN(state->c);
3300 break;
3301 }
3302
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003303 // normal way to continue
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003304 state = state->out;
3305 }
3306
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003307 // unrecognized, "cannot happen"
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003308 return -1;
3309}
Bram Moolenaar1e02e662013-06-08 23:26:27 +02003310
/*
 * Convert a postfix form into its equivalent NFA.
 * Return the NFA start state on success, NULL otherwise.
 *
 * "postfix" points to the first item, "end" to just after the last item.
 * When "nfa_calc_size" is TRUE no states are created, only "nstate" is
 * increased by the number of states that would be used; NULL is returned.
 * When "nfa_calc_size" is FALSE the states are obtained with alloc_state()
 * and a temporary stack of fragments is used, which is freed before
 * returning.
 */
    static nfa_state_T *
post2nfa(int *postfix, int *end, int nfa_calc_size)
{
    int		*p;
    int		mopen;
    int		mclose;
    Frag_T	*stack = NULL;
    Frag_T	*stackp = NULL;
    Frag_T	*stack_end = NULL;
    Frag_T	e1;
    Frag_T	e2;
    Frag_T	e;
    nfa_state_T	*s;
    nfa_state_T	*s1;
    nfa_state_T	*matchstate;
    nfa_state_T	*ret = NULL;

    if (postfix == NULL)
	return NULL;

// PUSH() stores nothing when the stack is full.
// POP() expands to more than one statement: when the stack underflows it
// reports the error, frees the stack and returns NULL from post2nfa().
#define PUSH(s)	    st_push((s), &stackp, stack_end)
#define POP()	    st_pop(&stackp, stack);		\
		    if (stackp < stack)			\
		    {					\
			st_error(postfix, end, p);	\
			vim_free(stack);		\
			return NULL;			\
		    }

    if (nfa_calc_size == FALSE)
    {
	// Allocate space for the stack. Max states on the stack: "nstate".
	stack = ALLOC_MULT(Frag_T, nstate + 1);
	if (stack == NULL)
	    return NULL;
	stackp = stack;
	stack_end = stack + (nstate + 1);
    }

    for (p = postfix; p < end; ++p)
    {
	switch (*p)
	{
	case NFA_CONCAT:
	    // Concatenation.
	    // Pay attention: this operator does not exist in the r.e. itself
	    // (it is implicit, really).  It is added when r.e. is translated
	    // to postfix form in re2post().
	    if (nfa_calc_size == TRUE)
	    {
		// nstate += 0;
		break;
	    }
	    // Connect the dangling pointers of the first fragment to the
	    // start of the second one.
	    e2 = POP();
	    e1 = POP();
	    patch(e1.out, e2.start);
	    PUSH(frag(e1.start, e2.out));
	    break;

	case NFA_OR:
	    // Alternation
	    if (nfa_calc_size == TRUE)
	    {
		nstate++;
		break;
	    }
	    // A new NFA_SPLIT state leads to both alternatives, the dangling
	    // pointers of both are joined.
	    e2 = POP();
	    e1 = POP();
	    s = alloc_state(NFA_SPLIT, e1.start, e2.start);
	    if (s == NULL)
		goto theend;
	    PUSH(frag(s, append(e1.out, e2.out)));
	    break;

	case NFA_STAR:
	    // Zero or more, prefer more
	    if (nfa_calc_size == TRUE)
	    {
		nstate++;
		break;
	    }
	    // "out" of the NFA_SPLIT enters the atom, the atom loops back to
	    // the NFA_SPLIT, "out1" is left dangling for what follows.
	    e = POP();
	    s = alloc_state(NFA_SPLIT, e.start, NULL);
	    if (s == NULL)
		goto theend;
	    patch(e.out, s);
	    PUSH(frag(s, list1(&s->out1)));
	    break;

	case NFA_STAR_NONGREEDY:
	    // Zero or more, prefer zero
	    if (nfa_calc_size == TRUE)
	    {
		nstate++;
		break;
	    }
	    // Like NFA_STAR with the roles of "out" and "out1" swapped.
	    e = POP();
	    s = alloc_state(NFA_SPLIT, NULL, e.start);
	    if (s == NULL)
		goto theend;
	    patch(e.out, s);
	    PUSH(frag(s, list1(&s->out)));
	    break;

	case NFA_QUEST:
	    // one or zero atoms=> greedy match
	    if (nfa_calc_size == TRUE)
	    {
		nstate++;
		break;
	    }
	    // "out" of the NFA_SPLIT enters the atom, both the end of the
	    // atom and "out1" are left dangling for what follows.
	    e = POP();
	    s = alloc_state(NFA_SPLIT, e.start, NULL);
	    if (s == NULL)
		goto theend;
	    PUSH(frag(s, append(e.out, list1(&s->out1))));
	    break;

	case NFA_QUEST_NONGREEDY:
	    // zero or one atoms => non-greedy match
	    if (nfa_calc_size == TRUE)
	    {
		nstate++;
		break;
	    }
	    // Like NFA_QUEST with the roles of "out" and "out1" swapped.
	    e = POP();
	    s = alloc_state(NFA_SPLIT, NULL, e.start);
	    if (s == NULL)
		goto theend;
	    PUSH(frag(s, append(e.out, list1(&s->out))));
	    break;

	case NFA_END_COLL:
	case NFA_END_NEG_COLL:
	    // On the stack is the sequence starting with NFA_START_COLL or
	    // NFA_START_NEG_COLL and all possible characters. Patch it to
	    // add the output to the start.
	    if (nfa_calc_size == TRUE)
	    {
		nstate++;
		break;
	    }
	    e = POP();
	    // Note: the new state gets NFA_END_COLL for both item types.
	    s = alloc_state(NFA_END_COLL, NULL, NULL);
	    if (s == NULL)
		goto theend;
	    patch(e.out, s);
	    // "out1" of the start state points to the end state.
	    e.start->out1 = s;
	    PUSH(frag(e.start, list1(&s->out)));
	    break;

	case NFA_RANGE:
	    // Before this are two characters, the low and high end of a
	    // range.  Turn them into two states with MIN and MAX.
	    if (nfa_calc_size == TRUE)
	    {
		// nstate += 0;
		break;
	    }
	    e2 = POP();
	    e1 = POP();
	    // The character moves to "val", "c" becomes the range opcode.
	    e2.start->val = e2.start->c;
	    e2.start->c = NFA_RANGE_MAX;
	    e1.start->val = e1.start->c;
	    e1.start->c = NFA_RANGE_MIN;
	    patch(e1.out, e2.start);
	    PUSH(frag(e1.start, e2.out));
	    break;

	case NFA_EMPTY:
	    // 0-length, used in a repetition with max/min count of 0
	    if (nfa_calc_size == TRUE)
	    {
		nstate++;
		break;
	    }
	    s = alloc_state(NFA_EMPTY, NULL, NULL);
	    if (s == NULL)
		goto theend;
	    PUSH(frag(s, list1(&s->out)));
	    break;

	case NFA_OPT_CHARS:
	  {
	    int    n;

	    // \%[abc] implemented as:
	    //    NFA_SPLIT
	    //    +-CHAR(a)
	    //    | +-NFA_SPLIT
	    //    |   +-CHAR(b)
	    //    |   | +-NFA_SPLIT
	    //    |   |   +-CHAR(c)
	    //    |   |   | +-next
	    //    |   |   +- next
	    //    |   +- next
	    //    +- next
	    n = *++p; // get number of characters
	    if (nfa_calc_size == TRUE)
	    {
		nstate += n;
		break;
	    }
	    s = NULL; // avoid compiler warning
	    e1.out = NULL; // stores list with out1's
	    s1 = NULL; // previous NFA_SPLIT to connect to
	    // The characters are popped in reverse order, thus the chain is
	    // built from the last character to the first.
	    while (n-- > 0)
	    {
		e = POP(); // get character
		s = alloc_state(NFA_SPLIT, e.start, NULL);
		if (s == NULL)
		    goto theend;
		// The first popped fragment provides the head of the list
		// with dangling pointers.
		if (e1.out == NULL)
		    e1 = e;
		patch(e.out, s1);
		append(e1.out, list1(&s->out1));
		s1 = s;
	    }
	    PUSH(frag(s, e1.out));
	    break;
	  }

	case NFA_PREV_ATOM_NO_WIDTH:
	case NFA_PREV_ATOM_NO_WIDTH_NEG:
	case NFA_PREV_ATOM_JUST_BEFORE:
	case NFA_PREV_ATOM_JUST_BEFORE_NEG:
	case NFA_PREV_ATOM_LIKE_PATTERN:
	  {
	    int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
		    || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
	    int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
	    int start_state;
	    int end_state;
	    int n = 0;
	    nfa_state_T *zend;
	    nfa_state_T *skip;

	    // Select the opcodes for the states that surround the atom.
	    switch (*p)
	    {
		case NFA_PREV_ATOM_NO_WIDTH:
		    start_state = NFA_START_INVISIBLE;
		    end_state = NFA_END_INVISIBLE;
		    break;
		case NFA_PREV_ATOM_NO_WIDTH_NEG:
		    start_state = NFA_START_INVISIBLE_NEG;
		    end_state = NFA_END_INVISIBLE_NEG;
		    break;
		case NFA_PREV_ATOM_JUST_BEFORE:
		    start_state = NFA_START_INVISIBLE_BEFORE;
		    end_state = NFA_END_INVISIBLE;
		    break;
		case NFA_PREV_ATOM_JUST_BEFORE_NEG:
		    start_state = NFA_START_INVISIBLE_BEFORE_NEG;
		    end_state = NFA_END_INVISIBLE_NEG;
		    break;
		default: // NFA_PREV_ATOM_LIKE_PATTERN:
		    start_state = NFA_START_PATTERN;
		    end_state = NFA_END_PATTERN;
		    break;
	    }

	    // The count must be consumed in both passes, thus do this before
	    // checking "nfa_calc_size".
	    if (before)
		n = *++p; // get the count

	    // The \@= operator: match the preceding atom with zero width.
	    // The \@! operator: no match for the preceding atom.
	    // The \@<= operator: match for the preceding atom.
	    // The \@<! operator: no match for the preceding atom.
	    // Surrounds the preceding atom with START_INVISIBLE and
	    // END_INVISIBLE, similarly to MOPEN.

	    if (nfa_calc_size == TRUE)
	    {
		nstate += pattern ? 4 : 2;
		break;
	    }
	    e = POP();
	    s1 = alloc_state(end_state, NULL, NULL);
	    if (s1 == NULL)
		goto theend;

	    // "out" of the start state enters the atom, "out1" points to the
	    // end state.
	    s = alloc_state(start_state, e.start, s1);
	    if (s == NULL)
		goto theend;
	    if (pattern)
	    {
		// NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
		skip = alloc_state(NFA_SKIP, NULL, NULL);
		if (skip == NULL)
		    goto theend;
		zend = alloc_state(NFA_ZEND, s1, NULL);
		if (zend == NULL)
		    goto theend;
		s1->out= skip;
		patch(e.out, zend);
		PUSH(frag(s, list1(&skip->out)));
	    }
	    else
	    {
		patch(e.out, s1);
		PUSH(frag(s, list1(&s1->out)));
		if (before)
		{
		    if (n <= 0)
			// See if we can guess the maximum width, it avoids a
			// lot of pointless tries.
			n = nfa_max_width(e.start, 0);
		    s->val = n; // store the count
		}
	    }
	    break;
	  }

	case NFA_COMPOSING:	// char with composing char
#if 0
	    // TODO
	    if (regflags & RF_ICOMBINE)
	    {
		// use the base character only
	    }
#endif
	    // FALLTHROUGH

	case NFA_MOPEN:	// \( \) Submatch
	case NFA_MOPEN1:
	case NFA_MOPEN2:
	case NFA_MOPEN3:
	case NFA_MOPEN4:
	case NFA_MOPEN5:
	case NFA_MOPEN6:
	case NFA_MOPEN7:
	case NFA_MOPEN8:
	case NFA_MOPEN9:
#ifdef FEAT_SYN_HL
	case NFA_ZOPEN:	// \z( \) Submatch
	case NFA_ZOPEN1:
	case NFA_ZOPEN2:
	case NFA_ZOPEN3:
	case NFA_ZOPEN4:
	case NFA_ZOPEN5:
	case NFA_ZOPEN6:
	case NFA_ZOPEN7:
	case NFA_ZOPEN8:
	case NFA_ZOPEN9:
#endif
	case NFA_NOPEN:	// \%( \) "Invisible Submatch"
	    if (nfa_calc_size == TRUE)
	    {
		nstate += 2;
		break;
	    }

	    // Find the closing opcode that belongs to the opening one.
	    mopen = *p;
	    switch (*p)
	    {
		case NFA_NOPEN: mclose = NFA_NCLOSE; break;
#ifdef FEAT_SYN_HL
		case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
		case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
		case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
		case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
		case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
		case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
		case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
		case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
		case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
		case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
#endif
		case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
		default:
		    // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
		    mclose = *p + NSUBEXP;
		    break;
	    }

	    // Allow "NFA_MOPEN" as a valid postfix representation for
	    // the empty regexp "". In this case, the NFA will be
	    // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
	    // empty groups of parenthesis, and empty mbyte chars
	    if (stackp == stack)
	    {
		s = alloc_state(mopen, NULL, NULL);
		if (s == NULL)
		    goto theend;
		s1 = alloc_state(mclose, NULL, NULL);
		if (s1 == NULL)
		    goto theend;
		patch(list1(&s->out), s1);
		PUSH(frag(s, list1(&s1->out)));
		break;
	    }

	    // At least one node was emitted before NFA_MOPEN, so
	    // at least one node will be between NFA_MOPEN and NFA_MCLOSE
	    e = POP();
	    s = alloc_state(mopen, e.start, NULL);   // `('
	    if (s == NULL)
		goto theend;

	    s1 = alloc_state(mclose, NULL, NULL);   // `)'
	    if (s1 == NULL)
		goto theend;
	    patch(e.out, s1);

	    if (mopen == NFA_COMPOSING)
		// COMPOSING->out1 = END_COMPOSING
		patch(list1(&s->out1), s1);

	    PUSH(frag(s, list1(&s1->out)));
	    break;

	case NFA_BACKREF1:
	case NFA_BACKREF2:
	case NFA_BACKREF3:
	case NFA_BACKREF4:
	case NFA_BACKREF5:
	case NFA_BACKREF6:
	case NFA_BACKREF7:
	case NFA_BACKREF8:
	case NFA_BACKREF9:
#ifdef FEAT_SYN_HL
	case NFA_ZREF1:
	case NFA_ZREF2:
	case NFA_ZREF3:
	case NFA_ZREF4:
	case NFA_ZREF5:
	case NFA_ZREF6:
	case NFA_ZREF7:
	case NFA_ZREF8:
	case NFA_ZREF9:
#endif
	    if (nfa_calc_size == TRUE)
	    {
		nstate += 2;
		break;
	    }
	    // The state for the reference is followed by an NFA_SKIP state.
	    s = alloc_state(*p, NULL, NULL);
	    if (s == NULL)
		goto theend;
	    s1 = alloc_state(NFA_SKIP, NULL, NULL);
	    if (s1 == NULL)
		goto theend;
	    patch(list1(&s->out), s1);
	    PUSH(frag(s, list1(&s1->out)));
	    break;

	case NFA_LNUM:
	case NFA_LNUM_GT:
	case NFA_LNUM_LT:
	case NFA_VCOL:
	case NFA_VCOL_GT:
	case NFA_VCOL_LT:
	case NFA_COL:
	case NFA_COL_GT:
	case NFA_COL_LT:
	case NFA_MARK:
	case NFA_MARK_GT:
	case NFA_MARK_LT:
	  {
	    // The argument must be consumed in both passes, thus do this
	    // before checking "nfa_calc_size".
	    int n = *++p; // lnum, col or mark name

	    if (nfa_calc_size == TRUE)
	    {
		nstate += 1;
		break;
	    }
	    // "p" was advanced to the argument, the opcode is at p[-1].
	    s = alloc_state(p[-1], NULL, NULL);
	    if (s == NULL)
		goto theend;
	    s->val = n;
	    PUSH(frag(s, list1(&s->out)));
	    break;
	  }

	case NFA_ZSTART:
	case NFA_ZEND:
	default:
	    // Operands
	    if (nfa_calc_size == TRUE)
	    {
		nstate++;
		break;
	    }
	    s = alloc_state(*p, NULL, NULL);
	    if (s == NULL)
		goto theend;
	    PUSH(frag(s, list1(&s->out)));
	    break;

	} // switch(*p)

    } // for (p = postfix; p < end; ++p)

    if (nfa_calc_size == TRUE)
    {
	// One more state is counted for the match state added below.
	nstate++;
	goto theend;	// Return value when counting size is ignored anyway
    }

    // Exactly one fragment must be left: the whole pattern.
    e = POP();
    if (stackp != stack)
    {
	vim_free(stack);
	EMSG_RET_NULL(_(e_nfa_regexp_while_converting_from_postfix_to_nfa_too_many_stats_left_on_stack));
    }

    if (istate >= nstate)
    {
	vim_free(stack);
	EMSG_RET_NULL(_(e_nfa_regexp_not_enough_space_to_store_whole_nfa));
    }

    // The match state is taken from the array directly instead of using
    // alloc_state(); its id is zero.
    matchstate = &state_ptr[istate++]; // the match state
    matchstate->c = NFA_MATCH;
    matchstate->out = matchstate->out1 = NULL;
    matchstate->id = 0;

    patch(e.out, matchstate);
    ret = e.start;

theend:
    // "stack" is NULL when only counting the size.
    vim_free(stack);
    return ret;

// Only PUSH and POP are defined in this function; the other names are not
// defined here, for them the #undef has no effect unless they were defined
// earlier in the file.
#undef POP1
#undef PUSH1
#undef POP2
#undef PUSH2
#undef POP
#undef PUSH
}
3846
Bram Moolenaara2947e22013-06-11 22:44:09 +02003847/*
3848 * After building the NFA program, inspect it to add optimization hints.
3849 */
3850 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003851nfa_postprocess(nfa_regprog_T *prog)
Bram Moolenaara2947e22013-06-11 22:44:09 +02003852{
3853 int i;
3854 int c;
3855
3856 for (i = 0; i < prog->nstate; ++i)
3857 {
3858 c = prog->state[i].c;
3859 if (c == NFA_START_INVISIBLE
3860 || c == NFA_START_INVISIBLE_NEG
3861 || c == NFA_START_INVISIBLE_BEFORE
3862 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3863 {
3864 int directly;
3865
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003866 // Do it directly when what follows is possibly the end of the
3867 // match.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003868 if (match_follows(prog->state[i].out1->out, 0))
3869 directly = TRUE;
3870 else
3871 {
3872 int ch_invisible = failure_chance(prog->state[i].out, 0);
3873 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3874
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003875 // Postpone when the invisible match is expensive or has a
3876 // lower chance of failing.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003877 if (c == NFA_START_INVISIBLE_BEFORE
3878 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3879 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003880 // "before" matches are very expensive when
3881 // unbounded, always prefer what follows then,
3882 // unless what follows will always match.
3883 // Otherwise strongly prefer what follows.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003884 if (prog->state[i].val <= 0 && ch_follows > 0)
3885 directly = FALSE;
3886 else
3887 directly = ch_follows * 10 < ch_invisible;
3888 }
3889 else
3890 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003891 // normal invisible, first do the one with the
3892 // highest failure chance
Bram Moolenaara2947e22013-06-11 22:44:09 +02003893 directly = ch_follows < ch_invisible;
3894 }
3895 }
3896 if (directly)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003897 // switch to the _FIRST state
Bram Moolenaara2947e22013-06-11 22:44:09 +02003898 ++prog->state[i].c;
3899 }
3900 }
3901}
3902
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003903/////////////////////////////////////////////////////////////////
3904// NFA execution code.
3905/////////////////////////////////////////////////////////////////
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003906
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003907typedef struct
3908{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003909 int in_use; // number of subexpr with useful info
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003910
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003911 // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003912 union
3913 {
3914 struct multipos
3915 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01003916 linenr_T start_lnum;
3917 linenr_T end_lnum;
3918 colnr_T start_col;
3919 colnr_T end_col;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003920 } multi[NSUBEXP];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003921 struct linepos
3922 {
3923 char_u *start;
3924 char_u *end;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003925 } line[NSUBEXP];
3926 } list;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003927} regsub_T;
3928
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003929typedef struct
3930{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003931 regsub_T norm; // \( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003932#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003933 regsub_T synt; // \z( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003934#endif
3935} regsubs_T;
3936
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003937// nfa_pim_T stores a Postponed Invisible Match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02003938typedef struct nfa_pim_S nfa_pim_T;
3939struct nfa_pim_S
3940{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003941 int result; // NFA_PIM_*, see below
3942 nfa_state_T *state; // the invisible match start state
3943 regsubs_T subs; // submatch info, only party used
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02003944 union
3945 {
3946 lpos_T pos;
3947 char_u *ptr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003948 } end; // where the match must end
Bram Moolenaara2d95102013-06-04 14:23:05 +02003949};
3950
// Values for "result" in nfa_pim_T.
#define NFA_PIM_UNUSED   0	// pim not used
#define NFA_PIM_TODO     1	// pim not done yet
#define NFA_PIM_MATCH    2	// pim executed, matches
#define NFA_PIM_NOMATCH  3	// pim executed, no match
Bram Moolenaara2d95102013-06-04 14:23:05 +02003956
3957
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003958// nfa_thread_T contains execution information of a NFA state
Bram Moolenaar4b417062013-05-25 20:19:50 +02003959typedef struct
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003960{
3961 nfa_state_T *state;
Bram Moolenaar5714b802013-05-28 22:03:20 +02003962 int count;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003963 nfa_pim_T pim; // if pim.result != NFA_PIM_UNUSED: postponed
3964 // invisible match
3965 regsubs_T subs; // submatch info, only party used
Bram Moolenaar4b417062013-05-25 20:19:50 +02003966} nfa_thread_T;
3967
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003968// nfa_list_T contains the alternative NFA execution states.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003969typedef struct
3970{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003971 nfa_thread_T *t; // allocated array of states
3972 int n; // nr of states currently in "t"
3973 int len; // max nr of states in "t"
3974 int id; // ID of the list
3975 int has_pim; // TRUE when any state has a PIM
Bram Moolenaar4b417062013-05-25 20:19:50 +02003976} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003977
#ifdef ENABLE_LOG
static void log_subexpr(regsub_T *sub);

/*
 * Write the submatches in "subs" to "log_fd": always the normal ones, and
 * the \z( .. \) ones only when rex.nfa_has_zsubexpr is set.
 */
    static void
log_subsexpr(regsubs_T *subs)
{
    log_subexpr(&subs->norm);
# ifdef FEAT_SYN_HL
    if (rex.nfa_has_zsubexpr)
	log_subexpr(&subs->synt);
# endif
}

/*
 * Write one line to "log_fd" for each of the "sub->in_use" groups in "sub".
 * With REG_MULTI the column and line number of start and end are written,
 * otherwise the text that the start and end pointers point to.
 */
    static void
log_subexpr(regsub_T *sub)
{
    int j;

    for (j = 0; j < sub->in_use; j++)
	if (REG_MULTI)
	    // All values are printed with "%d", cast each one to int.
	    fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
		    j,
		    (int)sub->list.multi[j].start_col,
		    (int)sub->list.multi[j].start_lnum,
		    (int)sub->list.multi[j].end_col,
		    (int)sub->list.multi[j].end_lnum);
	else
	{
	    char *s = (char *)sub->list.line[j].start;
	    char *e = (char *)sub->list.line[j].end;

	    fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
		    j,
		    s == NULL ? "NULL" : s,
		    e == NULL ? "NULL" : e);
	}
}

/*
 * Return a short description of "pim" for the log.
 * The result is an empty string when "pim" is NULL or unused.
 * The returned pointer refers to a static buffer that is overwritten by the
 * next call.
 */
    static char *
pim_info(nfa_pim_T *pim)
{
    // " PIM col " is 9 bytes; with a 32 bit int the number takes at most 11
    // bytes, thus together with the NUL this fits in the buffer.
    static char buf[30];

    if (pim == NULL || pim->result == NFA_PIM_UNUSED)
	buf[0] = NUL;
    else
    {
	sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
		: (int)(pim->end.ptr - rex.input));
    }
    return buf;
}

#endif
4032
// Used during execution: whether a match has been found.
static int	    nfa_match;
#ifdef FEAT_RELTIME
// Time limit bookkeeping, only available with FEAT_RELTIME.
static proftime_T  *nfa_time_limit;
static int	   *nfa_timed_out;
static int	    nfa_time_count;
#endif
Bram Moolenaar4b417062013-05-25 20:19:50 +02004040
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004041static void copy_sub(regsub_T *to, regsub_T *from);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004042static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004043
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004044/*
4045 * Copy postponed invisible match info from "from" to "to".
4046 */
4047 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004048copy_pim(nfa_pim_T *to, nfa_pim_T *from)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004049{
4050 to->result = from->result;
4051 to->state = from->state;
4052 copy_sub(&to->subs.norm, &from->subs.norm);
4053#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004054 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004055 copy_sub(&to->subs.synt, &from->subs.synt);
4056#endif
4057 to->end = from->end;
4058}
4059
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004060 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004061clear_sub(regsub_T *sub)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004062{
4063 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004064 // Use 0xff to set lnum to -1
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004065 vim_memset(sub->list.multi, 0xff,
Bram Moolenaar0270f382018-07-17 05:43:58 +02004066 sizeof(struct multipos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004067 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004068 vim_memset(sub->list.line, 0,
4069 sizeof(struct linepos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004070 sub->in_use = 0;
4071}
4072
4073/*
4074 * Copy the submatches from "from" to "to".
4075 */
4076 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004077copy_sub(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004078{
4079 to->in_use = from->in_use;
4080 if (from->in_use > 0)
4081 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004082 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004083 if (REG_MULTI)
4084 mch_memmove(&to->list.multi[0],
4085 &from->list.multi[0],
4086 sizeof(struct multipos) * from->in_use);
4087 else
4088 mch_memmove(&to->list.line[0],
4089 &from->list.line[0],
4090 sizeof(struct linepos) * from->in_use);
4091 }
4092}
4093
4094/*
4095 * Like copy_sub() but exclude the main match.
4096 */
4097 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004098copy_sub_off(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004099{
4100 if (to->in_use < from->in_use)
4101 to->in_use = from->in_use;
4102 if (from->in_use > 1)
4103 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004104 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004105 if (REG_MULTI)
4106 mch_memmove(&to->list.multi[1],
4107 &from->list.multi[1],
4108 sizeof(struct multipos) * (from->in_use - 1));
4109 else
4110 mch_memmove(&to->list.line[1],
4111 &from->list.line[1],
4112 sizeof(struct linepos) * (from->in_use - 1));
4113 }
4114}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004115
Bram Moolenaar428e9872013-05-30 17:05:39 +02004116/*
Bram Moolenaarf2118842013-09-25 18:16:38 +02004117 * Like copy_sub() but only do the end of the main match if \ze is present.
4118 */
4119 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004120copy_ze_off(regsub_T *to, regsub_T *from)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004121{
Bram Moolenaar0270f382018-07-17 05:43:58 +02004122 if (rex.nfa_has_zend)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004123 {
4124 if (REG_MULTI)
4125 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004126 if (from->list.multi[0].end_lnum >= 0)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004127 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004128 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4129 to->list.multi[0].end_col = from->list.multi[0].end_col;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004130 }
Bram Moolenaarf2118842013-09-25 18:16:38 +02004131 }
4132 else
4133 {
4134 if (from->list.line[0].end != NULL)
4135 to->list.line[0].end = from->list.line[0].end;
4136 }
4137 }
4138}
4139
4140/*
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004141 * Return TRUE if "sub1" and "sub2" have the same start positions.
Bram Moolenaaree482532014-05-13 15:56:51 +02004142 * When using back-references also check the end position.
Bram Moolenaar428e9872013-05-30 17:05:39 +02004143 */
4144 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004145sub_equal(regsub_T *sub1, regsub_T *sub2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004146{
4147 int i;
4148 int todo;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004149 linenr_T s1;
4150 linenr_T s2;
4151 char_u *sp1;
4152 char_u *sp2;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004153
4154 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4155 if (REG_MULTI)
4156 {
4157 for (i = 0; i < todo; ++i)
4158 {
4159 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004160 s1 = sub1->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004161 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004162 s1 = -1;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004163 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004164 s2 = sub2->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004165 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004166 s2 = -1;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004167 if (s1 != s2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004168 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004169 if (s1 != -1 && sub1->list.multi[i].start_col
4170 != sub2->list.multi[i].start_col)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004171 return FALSE;
Bram Moolenaaree482532014-05-13 15:56:51 +02004172
Bram Moolenaar0270f382018-07-17 05:43:58 +02004173 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004174 {
4175 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004176 s1 = sub1->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004177 else
4178 s1 = -1;
4179 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004180 s2 = sub2->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004181 else
4182 s2 = -1;
4183 if (s1 != s2)
4184 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004185 if (s1 != -1 && sub1->list.multi[i].end_col
4186 != sub2->list.multi[i].end_col)
Bram Moolenaaree482532014-05-13 15:56:51 +02004187 return FALSE;
4188 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004189 }
4190 }
4191 else
4192 {
4193 for (i = 0; i < todo; ++i)
4194 {
4195 if (i < sub1->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004196 sp1 = sub1->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004197 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004198 sp1 = NULL;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004199 if (i < sub2->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004200 sp2 = sub2->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004201 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004202 sp2 = NULL;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004203 if (sp1 != sp2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004204 return FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004205 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004206 {
4207 if (i < sub1->in_use)
4208 sp1 = sub1->list.line[i].end;
4209 else
4210 sp1 = NULL;
4211 if (i < sub2->in_use)
4212 sp2 = sub2->list.line[i].end;
4213 else
4214 sp2 = NULL;
4215 if (sp1 != sp2)
4216 return FALSE;
4217 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004218 }
4219 }
4220
4221 return TRUE;
4222}
4223
#ifdef ENABLE_LOG
/*
 * Open the run log for appending, store it in "log_fd" and write a header
 * that mentions "result" (TRUE, MAYBE or something else, which is reported
 * as FALSE).
 * When the file cannot be opened an error is given and stderr is used.
 */
    static void
open_debug_log(int result)
{
    log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
    if (log_fd == NULL)
    {
	emsg(_(e_log_open_failed));
	log_fd = stderr;
    }

    fprintf(log_fd, "****************************\n");
    fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
    fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : result == MAYBE
	    ? "MAYBE" : "FALSE");
    fprintf(log_fd, "****************************\n");
}

/*
 * Write a line to the log about doing "action" with "state" for the list
 * with ID "lid".  The start column of the main match in "sub" is included,
 * -1 when "sub" has no entries in use.  "pim" may be NULL.
 * Opens the log first when "log_fd" is NULL.
 */
    static void
report_state(char *action,
	     regsub_T *sub,
	     nfa_state_T *state,
	     int lid,
	     nfa_pim_T *pim)
{
    int col;

    if (sub->in_use <= 0)
	col = -1;
    else if (REG_MULTI)
	col = (int)sub->list.multi[0].start_col;
    else
	col = (int)(sub->list.line[0].start - rex.line);
    nfa_set_code(state->c);
    if (log_fd == NULL)
	open_debug_log(MAYBE);

    fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
	    action, abs(state->id), lid, state->c, code, col,
	    pim_info(pim));
}
#endif
4266
Bram Moolenaar43e02982013-06-07 17:31:29 +02004267/*
4268 * Return TRUE if the same state is already in list "l" with the same
4269 * positions as "subs".
4270 */
4271 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004272has_state_with_pos(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004273 nfa_list_T *l, // runtime state list
4274 nfa_state_T *state, // state to update
4275 regsubs_T *subs, // pointers to subexpressions
4276 nfa_pim_T *pim) // postponed match or NULL
Bram Moolenaar43e02982013-06-07 17:31:29 +02004277{
4278 nfa_thread_T *thread;
4279 int i;
4280
4281 for (i = 0; i < l->n; ++i)
4282 {
4283 thread = &l->t[i];
4284 if (thread->state->id == state->id
4285 && sub_equal(&thread->subs.norm, &subs->norm)
4286#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004287 && (!rex.nfa_has_zsubexpr
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004288 || sub_equal(&thread->subs.synt, &subs->synt))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004289#endif
Bram Moolenaar69b52452013-07-17 21:10:51 +02004290 && pim_equal(&thread->pim, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004291 return TRUE;
4292 }
4293 return FALSE;
4294}
4295
4296/*
Bram Moolenaar69b52452013-07-17 21:10:51 +02004297 * Return TRUE if "one" and "two" are equal. That includes when both are not
4298 * set.
4299 */
4300 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004301pim_equal(nfa_pim_T *one, nfa_pim_T *two)
Bram Moolenaar69b52452013-07-17 21:10:51 +02004302{
4303 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4304 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4305
4306 if (one_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004307 // one is unused: equal when two is also unused
Bram Moolenaar69b52452013-07-17 21:10:51 +02004308 return two_unused;
4309 if (two_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004310 // one is used and two is not: not equal
Bram Moolenaar69b52452013-07-17 21:10:51 +02004311 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004312 // compare the state id
Bram Moolenaar3f0df062013-08-14 13:34:25 +02004313 if (one->state->id != two->state->id)
4314 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004315 // compare the position
Bram Moolenaar69b52452013-07-17 21:10:51 +02004316 if (REG_MULTI)
4317 return one->end.pos.lnum == two->end.pos.lnum
4318 && one->end.pos.col == two->end.pos.col;
4319 return one->end.ptr == two->end.ptr;
4320}
4321
4322/*
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004323 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4324 */
4325 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004326match_follows(nfa_state_T *startstate, int depth)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004327{
4328 nfa_state_T *state = startstate;
4329
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004330 // avoid too much recursion
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004331 if (depth > 10)
4332 return FALSE;
4333
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004334 while (state != NULL)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004335 {
4336 switch (state->c)
4337 {
4338 case NFA_MATCH:
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004339 case NFA_MCLOSE:
4340 case NFA_END_INVISIBLE:
4341 case NFA_END_INVISIBLE_NEG:
4342 case NFA_END_PATTERN:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004343 return TRUE;
4344
4345 case NFA_SPLIT:
4346 return match_follows(state->out, depth + 1)
4347 || match_follows(state->out1, depth + 1);
4348
4349 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004350 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004351 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004352 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004353 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004354 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004355 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004356 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004357 case NFA_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004358 // skip ahead to next state
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004359 state = state->out1->out;
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004360 continue;
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004361
4362 case NFA_ANY:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02004363 case NFA_ANY_COMPOSING:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004364 case NFA_IDENT:
4365 case NFA_SIDENT:
4366 case NFA_KWORD:
4367 case NFA_SKWORD:
4368 case NFA_FNAME:
4369 case NFA_SFNAME:
4370 case NFA_PRINT:
4371 case NFA_SPRINT:
4372 case NFA_WHITE:
4373 case NFA_NWHITE:
4374 case NFA_DIGIT:
4375 case NFA_NDIGIT:
4376 case NFA_HEX:
4377 case NFA_NHEX:
4378 case NFA_OCTAL:
4379 case NFA_NOCTAL:
4380 case NFA_WORD:
4381 case NFA_NWORD:
4382 case NFA_HEAD:
4383 case NFA_NHEAD:
4384 case NFA_ALPHA:
4385 case NFA_NALPHA:
4386 case NFA_LOWER:
4387 case NFA_NLOWER:
4388 case NFA_UPPER:
4389 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02004390 case NFA_LOWER_IC:
4391 case NFA_NLOWER_IC:
4392 case NFA_UPPER_IC:
4393 case NFA_NUPPER_IC:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004394 case NFA_START_COLL:
4395 case NFA_START_NEG_COLL:
4396 case NFA_NEWL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004397 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004398 return FALSE;
4399
4400 default:
4401 if (state->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004402 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004403 return FALSE;
4404
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004405 // Others: zero-width or possibly zero-width, might still find
4406 // a match at the same position, keep looking.
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004407 break;
4408 }
4409 state = state->out;
4410 }
4411 return FALSE;
4412}
4413
4414
4415/*
Bram Moolenaar43e02982013-06-07 17:31:29 +02004416 * Return TRUE if "state" is already in list "l".
4417 */
4418 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004419state_in_list(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004420 nfa_list_T *l, // runtime state list
4421 nfa_state_T *state, // state to update
4422 regsubs_T *subs) // pointers to subexpressions
Bram Moolenaar43e02982013-06-07 17:31:29 +02004423{
4424 if (state->lastlist[nfa_ll_index] == l->id)
4425 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004426 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004427 return TRUE;
4428 }
4429 return FALSE;
4430}
4431
// Offset used for "off" by addstate_here().
#define ADDSTATE_HERE_OFFSET 10
4434
Bram Moolenaard05bf562013-06-30 23:24:08 +02004435/*
 * Add "state" and possibly what follows to state list "l".
4437 * Returns "subs_arg", possibly copied into temp_subs.
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004438 * Returns NULL when recursiveness is too deep.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004439 */
Bram Moolenaard05bf562013-06-30 23:24:08 +02004440 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004441addstate(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004442 nfa_list_T *l, // runtime state list
4443 nfa_state_T *state, // state to update
4444 regsubs_T *subs_arg, // pointers to subexpressions
4445 nfa_pim_T *pim, // postponed look-behind match
4446 int off_arg) // byte offset, when -1 go to next line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004447{
Bram Moolenaar963fee22013-05-26 21:47:28 +02004448 int subidx;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004449 int off = off_arg;
4450 int add_here = FALSE;
4451 int listindex = 0;
4452 int k;
4453 int found = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004454 nfa_thread_T *thread;
Bram Moolenaard5638832016-09-09 17:59:50 +02004455 struct multipos save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004456 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004457 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004458 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004459 regsub_T *sub;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004460 regsubs_T *subs = subs_arg;
4461 static regsubs_T temp_subs;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004462#ifdef ENABLE_LOG
4463 int did_print = FALSE;
4464#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004465 static int depth = 0;
4466
4467 // This function is called recursively. When the depth is too much we run
4468 // out of stack and crash, limit recursiveness here.
Bram Moolenaar5382f122019-02-13 01:18:38 +01004469 if (++depth >= 5000 || subs == NULL)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004470 {
4471 --depth;
4472 return NULL;
4473 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004474
Bram Moolenaar16b35782016-09-09 20:29:50 +02004475 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4476 {
4477 add_here = TRUE;
4478 off = 0;
4479 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4480 }
4481
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004482 switch (state->c)
4483 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004484 case NFA_NCLOSE:
4485 case NFA_MCLOSE:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004486 case NFA_MCLOSE1:
4487 case NFA_MCLOSE2:
4488 case NFA_MCLOSE3:
4489 case NFA_MCLOSE4:
4490 case NFA_MCLOSE5:
4491 case NFA_MCLOSE6:
4492 case NFA_MCLOSE7:
4493 case NFA_MCLOSE8:
4494 case NFA_MCLOSE9:
4495#ifdef FEAT_SYN_HL
4496 case NFA_ZCLOSE:
4497 case NFA_ZCLOSE1:
4498 case NFA_ZCLOSE2:
4499 case NFA_ZCLOSE3:
4500 case NFA_ZCLOSE4:
4501 case NFA_ZCLOSE5:
4502 case NFA_ZCLOSE6:
4503 case NFA_ZCLOSE7:
4504 case NFA_ZCLOSE8:
4505 case NFA_ZCLOSE9:
4506#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004507 case NFA_MOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004508 case NFA_ZEND:
Bram Moolenaar927d4a12013-06-09 17:25:34 +02004509 case NFA_SPLIT:
Bram Moolenaar699c1202013-09-25 16:41:54 +02004510 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004511 // These nodes are not added themselves but their "out" and/or
4512 // "out1" may be added below.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004513 break;
4514
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004515 case NFA_BOL:
4516 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004517 // "^" won't match past end-of-line, don't bother trying.
4518 // Except when at the end of the line, or when we are going to the
4519 // next line for a look-behind match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004520 if (rex.input > rex.line
4521 && *rex.input != NUL
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004522 && (nfa_endp == NULL
4523 || !REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02004524 || rex.lnum == nfa_endp->se_u.pos.lnum))
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004525 goto skip_add;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004526 // FALLTHROUGH
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004527
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004528 case NFA_MOPEN1:
4529 case NFA_MOPEN2:
4530 case NFA_MOPEN3:
4531 case NFA_MOPEN4:
4532 case NFA_MOPEN5:
4533 case NFA_MOPEN6:
4534 case NFA_MOPEN7:
4535 case NFA_MOPEN8:
4536 case NFA_MOPEN9:
4537#ifdef FEAT_SYN_HL
4538 case NFA_ZOPEN:
4539 case NFA_ZOPEN1:
4540 case NFA_ZOPEN2:
4541 case NFA_ZOPEN3:
4542 case NFA_ZOPEN4:
4543 case NFA_ZOPEN5:
4544 case NFA_ZOPEN6:
4545 case NFA_ZOPEN7:
4546 case NFA_ZOPEN8:
4547 case NFA_ZOPEN9:
4548#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004549 case NFA_NOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004550 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004551 // These nodes need to be added so that we can bail out when it
4552 // was added to this list before at the same position to avoid an
4553 // endless loop for "\(\)*"
Bram Moolenaar307aa162013-06-02 16:34:21 +02004554
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004555 default:
Bram Moolenaar272fb582013-11-21 16:03:40 +01004556 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004557 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004558 // This state is already in the list, don't add it again,
4559 // unless it is an MOPEN that is used for a backreference or
4560 // when there is a PIM. For NFA_MATCH check the position,
4561 // lower position is preferred.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004562 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
Bram Moolenaar9c235062014-05-13 16:44:29 +02004563 && state->c != NFA_MATCH)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004564 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004565 // When called from addstate_here() do insert before
4566 // existing states.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004567 if (add_here)
4568 {
4569 for (k = 0; k < l->n && k < listindex; ++k)
4570 if (l->t[k].state->id == state->id)
4571 {
4572 found = TRUE;
4573 break;
4574 }
4575 }
4576 if (!add_here || found)
4577 {
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004578skip_add:
4579#ifdef ENABLE_LOG
Bram Moolenaar16b35782016-09-09 20:29:50 +02004580 nfa_set_code(state->c);
4581 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4582 abs(state->id), l->id, state->c, code,
4583 pim == NULL ? "NULL" : "yes", l->has_pim, found);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004584#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004585 --depth;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004586 return subs;
4587 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004588 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004589
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004590 // Do not add the state again when it exists with the same
4591 // positions.
Bram Moolenaar69b52452013-07-17 21:10:51 +02004592 if (has_state_with_pos(l, state, subs, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004593 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004594 }
4595
Bram Moolenaar688b3982019-02-13 21:47:36 +01004596 // When there are backreferences or PIMs the number of states may
4597 // be (a lot) bigger than anticipated.
Bram Moolenaara0169122013-06-26 18:16:58 +02004598 if (l->n == l->len)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004599 {
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004600 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004601 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004602 nfa_thread_T *newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004603
Bram Moolenaar688b3982019-02-13 21:47:36 +01004604 if ((long)(newsize >> 10) >= p_mmp)
4605 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004606 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004607 --depth;
4608 return NULL;
4609 }
Bram Moolenaard05bf562013-06-30 23:24:08 +02004610 if (subs != &temp_subs)
4611 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004612 // "subs" may point into the current array, need to make a
4613 // copy before it becomes invalid.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004614 copy_sub(&temp_subs.norm, &subs->norm);
4615#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004616 if (rex.nfa_has_zsubexpr)
Bram Moolenaard05bf562013-06-30 23:24:08 +02004617 copy_sub(&temp_subs.synt, &subs->synt);
4618#endif
4619 subs = &temp_subs;
4620 }
4621
Bram Moolenaar688b3982019-02-13 21:47:36 +01004622 newt = vim_realloc(l->t, newsize);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004623 if (newt == NULL)
4624 {
4625 // out of memory
4626 --depth;
4627 return NULL;
4628 }
4629 l->t = newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004630 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004631 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004632
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004633 // add the state to the list
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02004634 state->lastlist[nfa_ll_index] = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004635 thread = &l->t[l->n++];
4636 thread->state = state;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004637 if (pim == NULL)
4638 thread->pim.result = NFA_PIM_UNUSED;
4639 else
Bram Moolenaar196ed142013-07-21 18:59:24 +02004640 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004641 copy_pim(&thread->pim, pim);
Bram Moolenaar196ed142013-07-21 18:59:24 +02004642 l->has_pim = TRUE;
4643 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004644 copy_sub(&thread->subs.norm, &subs->norm);
4645#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004646 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004647 copy_sub(&thread->subs.synt, &subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004648#endif
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004649#ifdef ENABLE_LOG
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004650 report_state("Adding", &thread->subs.norm, state, l->id, pim);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004651 did_print = TRUE;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004652#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004653 }
4654
4655#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004656 if (!did_print)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004657 report_state("Processing", &subs->norm, state, l->id, pim);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004658#endif
4659 switch (state->c)
4660 {
4661 case NFA_MATCH:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004662 break;
4663
4664 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004665 // order matters here
Bram Moolenaar16b35782016-09-09 20:29:50 +02004666 subs = addstate(l, state->out, subs, pim, off_arg);
4667 subs = addstate(l, state->out1, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004668 break;
4669
Bram Moolenaar699c1202013-09-25 16:41:54 +02004670 case NFA_EMPTY:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004671 case NFA_NOPEN:
4672 case NFA_NCLOSE:
Bram Moolenaar16b35782016-09-09 20:29:50 +02004673 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004674 break;
4675
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004676 case NFA_MOPEN:
4677 case NFA_MOPEN1:
4678 case NFA_MOPEN2:
4679 case NFA_MOPEN3:
4680 case NFA_MOPEN4:
4681 case NFA_MOPEN5:
4682 case NFA_MOPEN6:
4683 case NFA_MOPEN7:
4684 case NFA_MOPEN8:
4685 case NFA_MOPEN9:
4686#ifdef FEAT_SYN_HL
4687 case NFA_ZOPEN:
4688 case NFA_ZOPEN1:
4689 case NFA_ZOPEN2:
4690 case NFA_ZOPEN3:
4691 case NFA_ZOPEN4:
4692 case NFA_ZOPEN5:
4693 case NFA_ZOPEN6:
4694 case NFA_ZOPEN7:
4695 case NFA_ZOPEN8:
4696 case NFA_ZOPEN9:
4697#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004698 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004699 if (state->c == NFA_ZSTART)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004700 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004701 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004702 sub = &subs->norm;
4703 }
4704#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004705 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004706 {
4707 subidx = state->c - NFA_ZOPEN;
4708 sub = &subs->synt;
4709 }
4710#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004711 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004712 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004713 subidx = state->c - NFA_MOPEN;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004714 sub = &subs->norm;
4715 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004716
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004717 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004718 save_ptr = NULL;
Bram Moolenaara80faa82020-04-12 19:37:17 +02004719 CLEAR_FIELD(save_multipos);
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004720
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004721 // Set the position (with "off" added) in the subexpression. Save
4722 // and restore it when it was in use. Otherwise fill any gap.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004723 if (REG_MULTI)
4724 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004725 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004726 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004727 save_multipos = sub->list.multi[subidx];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004728 save_in_use = -1;
4729 }
4730 else
4731 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004732 save_in_use = sub->in_use;
4733 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004734 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004735 sub->list.multi[i].start_lnum = -1;
4736 sub->list.multi[i].end_lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004737 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004738 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004739 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02004740 if (off == -1)
4741 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004742 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004743 sub->list.multi[subidx].start_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004744 }
4745 else
4746 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004747 sub->list.multi[subidx].start_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004748 sub->list.multi[subidx].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004749 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004750 }
Bram Moolenaarc2b717e2015-09-29 15:06:14 +02004751 sub->list.multi[subidx].end_lnum = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004752 }
4753 else
4754 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004755 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004756 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004757 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004758 save_in_use = -1;
4759 }
4760 else
4761 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004762 save_in_use = sub->in_use;
4763 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004764 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004765 sub->list.line[i].start = NULL;
4766 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004767 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004768 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004769 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02004770 sub->list.line[subidx].start = rex.input + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004771 }
4772
Bram Moolenaar16b35782016-09-09 20:29:50 +02004773 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004774 if (subs == NULL)
4775 break;
4776 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004777#ifdef FEAT_SYN_HL
4778 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4779 sub = &subs->synt;
4780 else
4781#endif
4782 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004783
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004784 if (save_in_use == -1)
4785 {
4786 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004787 sub->list.multi[subidx] = save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004788 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004789 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004790 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004791 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02004792 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004793 break;
4794
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004795 case NFA_MCLOSE:
Bram Moolenaar0270f382018-07-17 05:43:58 +02004796 if (rex.nfa_has_zend && (REG_MULTI
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004797 ? subs->norm.list.multi[0].end_lnum >= 0
Bram Moolenaar9be44812013-09-05 21:15:44 +02004798 : subs->norm.list.line[0].end != NULL))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004799 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004800 // Do not overwrite the position set by \ze.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004801 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004802 break;
4803 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004804 // FALLTHROUGH
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004805 case NFA_MCLOSE1:
4806 case NFA_MCLOSE2:
4807 case NFA_MCLOSE3:
4808 case NFA_MCLOSE4:
4809 case NFA_MCLOSE5:
4810 case NFA_MCLOSE6:
4811 case NFA_MCLOSE7:
4812 case NFA_MCLOSE8:
4813 case NFA_MCLOSE9:
4814#ifdef FEAT_SYN_HL
4815 case NFA_ZCLOSE:
4816 case NFA_ZCLOSE1:
4817 case NFA_ZCLOSE2:
4818 case NFA_ZCLOSE3:
4819 case NFA_ZCLOSE4:
4820 case NFA_ZCLOSE5:
4821 case NFA_ZCLOSE6:
4822 case NFA_ZCLOSE7:
4823 case NFA_ZCLOSE8:
4824 case NFA_ZCLOSE9:
4825#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004826 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004827 if (state->c == NFA_ZEND)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004828 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004829 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004830 sub = &subs->norm;
4831 }
4832#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004833 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004834 {
4835 subidx = state->c - NFA_ZCLOSE;
4836 sub = &subs->synt;
4837 }
4838#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004839 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004840 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004841 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004842 sub = &subs->norm;
4843 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004844
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004845 // We don't fill in gaps here, there must have been an MOPEN that
4846 // has done that.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004847 save_in_use = sub->in_use;
4848 if (sub->in_use <= subidx)
4849 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004850 if (REG_MULTI)
4851 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004852 save_multipos = sub->list.multi[subidx];
Bram Moolenaar35b23862013-05-22 23:00:40 +02004853 if (off == -1)
4854 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004855 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004856 sub->list.multi[subidx].end_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004857 }
4858 else
4859 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004860 sub->list.multi[subidx].end_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004861 sub->list.multi[subidx].end_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004862 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004863 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004864 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004865 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004866 }
4867 else
4868 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004869 save_ptr = sub->list.line[subidx].end;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004870 sub->list.line[subidx].end = rex.input + off;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004871 // avoid compiler warnings
Bram Moolenaara80faa82020-04-12 19:37:17 +02004872 CLEAR_FIELD(save_multipos);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004873 }
4874
Bram Moolenaar16b35782016-09-09 20:29:50 +02004875 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004876 if (subs == NULL)
4877 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004878 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004879#ifdef FEAT_SYN_HL
4880 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4881 sub = &subs->synt;
4882 else
4883#endif
4884 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004885
4886 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004887 sub->list.multi[subidx] = save_multipos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004888 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004889 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004890 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004891 break;
4892 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004893 --depth;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004894 return subs;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004895}
4896
4897/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02004898 * Like addstate(), but the new state(s) are put at position "*ip".
4899 * Used for zero-width matches, next state to use is the added one.
4900 * This makes sure the order of states to be tried does not change, which
4901 * matters for alternatives.
4902 */
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004903 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004904addstate_here(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004905 nfa_list_T *l, // runtime state list
4906 nfa_state_T *state, // state to update
4907 regsubs_T *subs, // pointers to subexpressions
4908 nfa_pim_T *pim, // postponed look-behind match
Bram Moolenaar05540972016-01-30 20:31:25 +01004909 int *ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004910{
4911 int tlen = l->n;
4912 int count;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004913 int listidx = *ip;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004914 regsubs_T *r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004915
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004916 // First add the state(s) at the end, so that we know how many there are.
4917 // Pass the listidx as offset (avoids adding another argument to
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00004918 // addstate()).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004919 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4920 if (r == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004921 return NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004922
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004923 // when "*ip" was at the end of the list, nothing to do
Bram Moolenaara2d95102013-06-04 14:23:05 +02004924 if (listidx + 1 == tlen)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004925 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004926
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004927 // re-order to put the new state at the current position
Bram Moolenaar4b417062013-05-25 20:19:50 +02004928 count = l->n - tlen;
Bram Moolenaara50d02d2013-06-16 15:43:50 +02004929 if (count == 0)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004930 return r; // no state got added
Bram Moolenaar428e9872013-05-30 17:05:39 +02004931 if (count == 1)
4932 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004933 // overwrite the current state
Bram Moolenaara2d95102013-06-04 14:23:05 +02004934 l->t[listidx] = l->t[l->n - 1];
Bram Moolenaar428e9872013-05-30 17:05:39 +02004935 }
4936 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004937 {
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004938 if (l->n + count - 1 >= l->len)
4939 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004940 // not enough space to move the new states, reallocate the list
4941 // and move the states to the right position
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004942 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004943 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004944 nfa_thread_T *newl;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004945
Bram Moolenaar688b3982019-02-13 21:47:36 +01004946 if ((long)(newsize >> 10) >= p_mmp)
4947 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004948 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004949 return NULL;
4950 }
Bram Moolenaarc799fe22019-05-28 23:08:19 +02004951 newl = alloc(newsize);
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004952 if (newl == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004953 return NULL;
4954 l->len = newlen;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004955 mch_memmove(&(newl[0]),
4956 &(l->t[0]),
4957 sizeof(nfa_thread_T) * listidx);
4958 mch_memmove(&(newl[listidx]),
4959 &(l->t[l->n - count]),
4960 sizeof(nfa_thread_T) * count);
4961 mch_memmove(&(newl[listidx + count]),
4962 &(l->t[listidx + 1]),
4963 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
4964 vim_free(l->t);
4965 l->t = newl;
4966 }
4967 else
4968 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004969 // make space for new states, then move them from the
4970 // end to the current position
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004971 mch_memmove(&(l->t[listidx + count]),
4972 &(l->t[listidx + 1]),
4973 sizeof(nfa_thread_T) * (l->n - listidx - 1));
4974 mch_memmove(&(l->t[listidx]),
4975 &(l->t[l->n - 1]),
4976 sizeof(nfa_thread_T) * count);
4977 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02004978 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02004979 --l->n;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004980 *ip = listidx - 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004981
4982 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004983}
4984
4985/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004986 * Check character class "class" against current character c.
4987 */
4988 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004989check_char_class(int class, int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004990{
4991 switch (class)
4992 {
4993 case NFA_CLASS_ALNUM:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02004994 if (c >= 1 && c < 128 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004995 return OK;
4996 break;
4997 case NFA_CLASS_ALPHA:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02004998 if (c >= 1 && c < 128 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004999 return OK;
5000 break;
5001 case NFA_CLASS_BLANK:
5002 if (c == ' ' || c == '\t')
5003 return OK;
5004 break;
5005 case NFA_CLASS_CNTRL:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005006 if (c >= 1 && c <= 127 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005007 return OK;
5008 break;
5009 case NFA_CLASS_DIGIT:
5010 if (VIM_ISDIGIT(c))
5011 return OK;
5012 break;
5013 case NFA_CLASS_GRAPH:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005014 if (c >= 1 && c <= 127 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005015 return OK;
5016 break;
5017 case NFA_CLASS_LOWER:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005018 if (MB_ISLOWER(c) && c != 170 && c != 186)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005019 return OK;
5020 break;
5021 case NFA_CLASS_PRINT:
5022 if (vim_isprintc(c))
5023 return OK;
5024 break;
5025 case NFA_CLASS_PUNCT:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005026 if (c >= 1 && c < 128 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005027 return OK;
5028 break;
5029 case NFA_CLASS_SPACE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005030 if ((c >= 9 && c <= 13) || (c == ' '))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005031 return OK;
5032 break;
5033 case NFA_CLASS_UPPER:
5034 if (MB_ISUPPER(c))
5035 return OK;
5036 break;
5037 case NFA_CLASS_XDIGIT:
5038 if (vim_isxdigit(c))
5039 return OK;
5040 break;
5041 case NFA_CLASS_TAB:
5042 if (c == '\t')
5043 return OK;
5044 break;
5045 case NFA_CLASS_RETURN:
5046 if (c == '\r')
5047 return OK;
5048 break;
5049 case NFA_CLASS_BACKSPACE:
5050 if (c == '\b')
5051 return OK;
5052 break;
5053 case NFA_CLASS_ESCAPE:
5054 if (c == '\033')
5055 return OK;
5056 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01005057 case NFA_CLASS_IDENT:
5058 if (vim_isIDc(c))
5059 return OK;
5060 break;
5061 case NFA_CLASS_KEYWORD:
5062 if (reg_iswordc(c))
5063 return OK;
5064 break;
5065 case NFA_CLASS_FNAME:
5066 if (vim_isfilec(c))
5067 return OK;
5068 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005069
5070 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005071 // should not be here :P
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00005072 siemsg(_(e_nfa_regexp_invalid_character_class_nr), class);
Bram Moolenaar417bad22013-06-07 14:08:30 +02005073 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005074 }
5075 return FAIL;
5076}
5077
Bram Moolenaar5714b802013-05-28 22:03:20 +02005078/*
5079 * Check for a match with subexpression "subidx".
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005080 * Return TRUE if it matches.
Bram Moolenaar5714b802013-05-28 22:03:20 +02005081 */
5082 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005083match_backref(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005084 regsub_T *sub, // pointers to subexpressions
Bram Moolenaar05540972016-01-30 20:31:25 +01005085 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005086 int *bytelen) // out: length of match in bytes
Bram Moolenaar5714b802013-05-28 22:03:20 +02005087{
5088 int len;
5089
5090 if (sub->in_use <= subidx)
5091 {
5092retempty:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005093 // backref was not set, match an empty string
Bram Moolenaar5714b802013-05-28 22:03:20 +02005094 *bytelen = 0;
5095 return TRUE;
5096 }
5097
5098 if (REG_MULTI)
5099 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005100 if (sub->list.multi[subidx].start_lnum < 0
5101 || sub->list.multi[subidx].end_lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005102 goto retempty;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005103 if (sub->list.multi[subidx].start_lnum == rex.lnum
5104 && sub->list.multi[subidx].end_lnum == rex.lnum)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005105 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005106 len = sub->list.multi[subidx].end_col
5107 - sub->list.multi[subidx].start_col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005108 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5109 rex.input, &len) == 0)
Bram Moolenaar580abea2013-06-14 20:31:28 +02005110 {
5111 *bytelen = len;
5112 return TRUE;
5113 }
5114 }
5115 else
5116 {
5117 if (match_with_backref(
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005118 sub->list.multi[subidx].start_lnum,
5119 sub->list.multi[subidx].start_col,
5120 sub->list.multi[subidx].end_lnum,
5121 sub->list.multi[subidx].end_col,
Bram Moolenaar580abea2013-06-14 20:31:28 +02005122 bytelen) == RA_MATCH)
5123 return TRUE;
Bram Moolenaar5714b802013-05-28 22:03:20 +02005124 }
5125 }
5126 else
5127 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005128 if (sub->list.line[subidx].start == NULL
5129 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005130 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005131 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005132 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005133 {
5134 *bytelen = len;
5135 return TRUE;
5136 }
5137 }
5138 return FALSE;
5139}
5140
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005141#ifdef FEAT_SYN_HL
5142
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005143/*
5144 * Check for a match with \z subexpression "subidx".
5145 * Return TRUE if it matches.
5146 */
5147 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005148match_zref(
5149 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005150 int *bytelen) // out: length of match in bytes
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005151{
5152 int len;
5153
5154 cleanup_zsubexpr();
5155 if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL)
5156 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005157 // backref was not set, match an empty string
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005158 *bytelen = 0;
5159 return TRUE;
5160 }
5161
5162 len = (int)STRLEN(re_extmatch_in->matches[subidx]);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005163 if (cstrncmp(re_extmatch_in->matches[subidx], rex.input, &len) == 0)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005164 {
5165 *bytelen = len;
5166 return TRUE;
5167 }
5168 return FALSE;
5169}
5170#endif
5171
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005172/*
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005173 * Save list IDs for all NFA states of "prog" into "list".
5174 * Also reset the IDs to zero.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005175 * Only used for the recursive value lastlist[1].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005176 */
5177 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005178nfa_save_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005179{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005180 int i;
5181 nfa_state_T *p;
5182
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005183 // Order in the list is reverse, it's a bit faster that way.
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005184 p = &prog->state[0];
5185 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005186 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005187 list[i] = p->lastlist[1];
5188 p->lastlist[1] = 0;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005189 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005190 }
5191}
5192
5193/*
5194 * Restore list IDs from "list" to all NFA states.
5195 */
5196 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005197nfa_restore_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005198{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005199 int i;
5200 nfa_state_T *p;
5201
5202 p = &prog->state[0];
5203 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005204 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005205 p->lastlist[1] = list[i];
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005206 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005207 }
5208}
5209
Bram Moolenaar423532e2013-05-29 21:14:42 +02005210 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005211nfa_re_num_cmp(long_u val, int op, long_u pos)
Bram Moolenaar423532e2013-05-29 21:14:42 +02005212{
5213 if (op == 1) return pos > val;
5214 if (op == 2) return pos < val;
5215 return val == pos;
5216}
5217
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01005218static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02005219
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005220/*
Bram Moolenaarf46da702013-06-02 22:37:42 +02005221 * Recursively call nfa_regmatch()
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005222 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5223 * position).
Bram Moolenaarf46da702013-06-02 22:37:42 +02005224 */
5225 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005226recursive_regmatch(
5227 nfa_state_T *state,
5228 nfa_pim_T *pim,
5229 nfa_regprog_T *prog,
5230 regsubs_T *submatch,
5231 regsubs_T *m,
Bram Moolenaar2338c322018-07-08 19:07:19 +02005232 int **listids,
5233 int *listids_len)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005234{
Bram Moolenaar0270f382018-07-17 05:43:58 +02005235 int save_reginput_col = (int)(rex.input - rex.line);
5236 int save_reglnum = rex.lnum;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005237 int save_nfa_match = nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005238 int save_nfa_listid = rex.nfa_listid;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005239 save_se_T *save_nfa_endp = nfa_endp;
5240 save_se_T endpos;
5241 save_se_T *endposp = NULL;
5242 int result;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005243 int need_restore = FALSE;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005244
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005245 if (pim != NULL)
5246 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005247 // start at the position where the postponed match was
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005248 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005249 rex.input = rex.line + pim->end.pos.col;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005250 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005251 rex.input = pim->end.ptr;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005252 }
5253
Bram Moolenaardecd9542013-06-07 16:31:50 +02005254 if (state->c == NFA_START_INVISIBLE_BEFORE
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01005255 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5256 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5257 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005258 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005259 // The recursive match must end at the current position. When "pim" is
5260 // not NULL it specifies the current position.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005261 endposp = &endpos;
5262 if (REG_MULTI)
5263 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005264 if (pim == NULL)
5265 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005266 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5267 endpos.se_u.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005268 }
5269 else
5270 endpos.se_u.pos = pim->end.pos;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005271 }
5272 else
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005273 {
5274 if (pim == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005275 endpos.se_u.ptr = rex.input;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005276 else
5277 endpos.se_u.ptr = pim->end.ptr;
5278 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005279
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005280 // Go back the specified number of bytes, or as far as the
5281 // start of the previous line, to try matching "\@<=" or
5282 // not matching "\@<!". This is very inefficient, limit the number of
5283 // bytes if possible.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005284 if (state->val <= 0)
5285 {
5286 if (REG_MULTI)
5287 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005288 rex.line = reg_getline(--rex.lnum);
5289 if (rex.line == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005290 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005291 rex.line = reg_getline(++rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005292 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005293 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005294 }
5295 else
5296 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005297 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005298 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005299 // Not enough bytes in this line, go to end of
5300 // previous line.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005301 rex.line = reg_getline(--rex.lnum);
5302 if (rex.line == NULL)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005303 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005304 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005305 rex.line = reg_getline(++rex.lnum);
5306 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005307 }
5308 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005309 rex.input = rex.line + STRLEN(rex.line);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005310 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005311 if ((int)(rex.input - rex.line) >= state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005312 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005313 rex.input -= state->val;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005314 if (has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005315 rex.input -= mb_head_off(rex.line, rex.input);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005316 }
5317 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005318 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005319 }
5320 }
5321
Bram Moolenaarf46da702013-06-02 22:37:42 +02005322#ifdef ENABLE_LOG
5323 if (log_fd != stderr)
5324 fclose(log_fd);
5325 log_fd = NULL;
5326#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005327 // Have to clear the lastlist field of the NFA nodes, so that
5328 // nfa_regmatch() and addstate() can run properly after recursion.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005329 if (nfa_ll_index == 1)
5330 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005331 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5332 // values and clear them.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005333 if (*listids == NULL || *listids_len < prog->nstate)
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005334 {
Bram Moolenaar2338c322018-07-08 19:07:19 +02005335 vim_free(*listids);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005336 *listids = ALLOC_MULT(int, prog->nstate);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005337 if (*listids == NULL)
5338 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00005339 emsg(_(e_nfa_regexp_could_not_allocate_memory_for_branch_traversal));
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005340 return 0;
5341 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005342 *listids_len = prog->nstate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005343 }
5344 nfa_save_listids(prog, *listids);
5345 need_restore = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005346 // any value of rex.nfa_listid will do
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005347 }
5348 else
5349 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005350 // First recursive nfa_regmatch() call, switch to the second lastlist
5351 // entry. Make sure rex.nfa_listid is different from a previous
5352 // recursive call, because some states may still have this ID.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005353 ++nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005354 if (rex.nfa_listid <= rex.nfa_alt_listid)
5355 rex.nfa_listid = rex.nfa_alt_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005356 }
5357
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005358 // Call nfa_regmatch() to check if the current concat matches at this
5359 // position. The concat ends with the node NFA_END_INVISIBLE
Bram Moolenaarf46da702013-06-02 22:37:42 +02005360 nfa_endp = endposp;
5361 result = nfa_regmatch(prog, state->out, submatch, m);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005362
5363 if (need_restore)
5364 nfa_restore_listids(prog, *listids);
5365 else
5366 {
5367 --nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005368 rex.nfa_alt_listid = rex.nfa_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005369 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005370
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005371 // restore position in input text
Bram Moolenaar0270f382018-07-17 05:43:58 +02005372 rex.lnum = save_reglnum;
Bram Moolenaar484d2412013-06-13 19:47:07 +02005373 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005374 rex.line = reg_getline(rex.lnum);
5375 rex.input = rex.line + save_reginput_col;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005376 if (result != NFA_TOO_EXPENSIVE)
5377 {
5378 nfa_match = save_nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005379 rex.nfa_listid = save_nfa_listid;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005380 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005381 nfa_endp = save_nfa_endp;
5382
5383#ifdef ENABLE_LOG
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005384 open_debug_log(result);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005385#endif
5386
5387 return result;
5388}
5389
Bram Moolenaara2d95102013-06-04 14:23:05 +02005390/*
5391 * Estimate the chance of a match with "state" failing.
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005392 * empty match: 0
Bram Moolenaara2d95102013-06-04 14:23:05 +02005393 * NFA_ANY: 1
5394 * specific character: 99
5395 */
5396 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005397failure_chance(nfa_state_T *state, int depth)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005398{
5399 int c = state->c;
5400 int l, r;
5401
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005402 // detect looping
Bram Moolenaara2d95102013-06-04 14:23:05 +02005403 if (depth > 4)
5404 return 1;
5405
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005406 switch (c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005407 {
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005408 case NFA_SPLIT:
5409 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005410 // avoid recursive stuff
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005411 return 1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005412 // two alternatives, use the lowest failure chance
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005413 l = failure_chance(state->out, depth + 1);
5414 r = failure_chance(state->out1, depth + 1);
5415 return l < r ? l : r;
5416
5417 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005418 // matches anything, unlikely to fail
Bram Moolenaara2d95102013-06-04 14:23:05 +02005419 return 1;
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005420
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005421 case NFA_MATCH:
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005422 case NFA_MCLOSE:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005423 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005424 // empty match works always
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005425 return 0;
5426
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005427 case NFA_START_INVISIBLE:
5428 case NFA_START_INVISIBLE_FIRST:
5429 case NFA_START_INVISIBLE_NEG:
5430 case NFA_START_INVISIBLE_NEG_FIRST:
5431 case NFA_START_INVISIBLE_BEFORE:
5432 case NFA_START_INVISIBLE_BEFORE_FIRST:
5433 case NFA_START_INVISIBLE_BEFORE_NEG:
5434 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5435 case NFA_START_PATTERN:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005436 // recursive regmatch is expensive, use low failure chance
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005437 return 5;
5438
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005439 case NFA_BOL:
5440 case NFA_EOL:
5441 case NFA_BOF:
5442 case NFA_EOF:
5443 case NFA_NEWL:
5444 return 99;
5445
5446 case NFA_BOW:
5447 case NFA_EOW:
5448 return 90;
5449
5450 case NFA_MOPEN:
5451 case NFA_MOPEN1:
5452 case NFA_MOPEN2:
5453 case NFA_MOPEN3:
5454 case NFA_MOPEN4:
5455 case NFA_MOPEN5:
5456 case NFA_MOPEN6:
5457 case NFA_MOPEN7:
5458 case NFA_MOPEN8:
5459 case NFA_MOPEN9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005460#ifdef FEAT_SYN_HL
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005461 case NFA_ZOPEN:
5462 case NFA_ZOPEN1:
5463 case NFA_ZOPEN2:
5464 case NFA_ZOPEN3:
5465 case NFA_ZOPEN4:
5466 case NFA_ZOPEN5:
5467 case NFA_ZOPEN6:
5468 case NFA_ZOPEN7:
5469 case NFA_ZOPEN8:
5470 case NFA_ZOPEN9:
5471 case NFA_ZCLOSE:
5472 case NFA_ZCLOSE1:
5473 case NFA_ZCLOSE2:
5474 case NFA_ZCLOSE3:
5475 case NFA_ZCLOSE4:
5476 case NFA_ZCLOSE5:
5477 case NFA_ZCLOSE6:
5478 case NFA_ZCLOSE7:
5479 case NFA_ZCLOSE8:
5480 case NFA_ZCLOSE9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005481#endif
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005482 case NFA_NOPEN:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005483 case NFA_MCLOSE1:
5484 case NFA_MCLOSE2:
5485 case NFA_MCLOSE3:
5486 case NFA_MCLOSE4:
5487 case NFA_MCLOSE5:
5488 case NFA_MCLOSE6:
5489 case NFA_MCLOSE7:
5490 case NFA_MCLOSE8:
5491 case NFA_MCLOSE9:
5492 case NFA_NCLOSE:
5493 return failure_chance(state->out, depth + 1);
5494
5495 case NFA_BACKREF1:
5496 case NFA_BACKREF2:
5497 case NFA_BACKREF3:
5498 case NFA_BACKREF4:
5499 case NFA_BACKREF5:
5500 case NFA_BACKREF6:
5501 case NFA_BACKREF7:
5502 case NFA_BACKREF8:
5503 case NFA_BACKREF9:
5504#ifdef FEAT_SYN_HL
5505 case NFA_ZREF1:
5506 case NFA_ZREF2:
5507 case NFA_ZREF3:
5508 case NFA_ZREF4:
5509 case NFA_ZREF5:
5510 case NFA_ZREF6:
5511 case NFA_ZREF7:
5512 case NFA_ZREF8:
5513 case NFA_ZREF9:
5514#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005515 // backreferences don't match in many places
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005516 return 94;
5517
5518 case NFA_LNUM_GT:
5519 case NFA_LNUM_LT:
5520 case NFA_COL_GT:
5521 case NFA_COL_LT:
5522 case NFA_VCOL_GT:
5523 case NFA_VCOL_LT:
5524 case NFA_MARK_GT:
5525 case NFA_MARK_LT:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005526 case NFA_VISUAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005527 // before/after positions don't match very often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005528 return 85;
5529
5530 case NFA_LNUM:
5531 return 90;
5532
5533 case NFA_CURSOR:
5534 case NFA_COL:
5535 case NFA_VCOL:
5536 case NFA_MARK:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005537 // specific positions rarely match
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005538 return 98;
5539
5540 case NFA_COMPOSING:
5541 return 95;
5542
5543 default:
5544 if (c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005545 // character match fails often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005546 return 95;
5547 }
5548
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005549 // something else, includes character classes
Bram Moolenaara2d95102013-06-04 14:23:05 +02005550 return 50;
5551}
5552
Bram Moolenaarf46da702013-06-02 22:37:42 +02005553/*
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005554 * Skip until the char "c" we know a match must start with.
5555 */
5556 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005557skip_to_start(int c, colnr_T *colp)
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005558{
5559 char_u *s;
5560
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005561 // Used often, do some work to avoid call overhead.
Bram Moolenaara12a1612019-01-24 16:39:02 +01005562 if (!rex.reg_ic && !has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005563 s = vim_strbyte(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005564 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005565 s = cstrchr(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005566 if (s == NULL)
5567 return FAIL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005568 *colp = (int)(s - rex.line);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005569 return OK;
5570}
5571
5572/*
Bram Moolenaar473de612013-06-08 18:19:48 +02005573 * Check for a match with match_text.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02005574 * Called after skip_to_start() has found regstart.
Bram Moolenaar473de612013-06-08 18:19:48 +02005575 * Returns zero for no match, 1 for a match.
5576 */
5577 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01005578find_match_text(colnr_T startcol, int regstart, char_u *match_text)
Bram Moolenaar473de612013-06-08 18:19:48 +02005579{
5580 colnr_T col = startcol;
5581 int c1, c2;
5582 int len1, len2;
5583 int match;
5584
5585 for (;;)
5586 {
5587 match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005588 len2 = MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005589 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5590 {
5591 c1 = PTR2CHAR(match_text + len1);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005592 c2 = PTR2CHAR(rex.line + col + len2);
Bram Moolenaar59de4172020-06-09 19:34:54 +02005593 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
Bram Moolenaar473de612013-06-08 18:19:48 +02005594 {
5595 match = FALSE;
5596 break;
5597 }
Bram Moolenaar65b60562021-09-07 19:26:53 +02005598 len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5599 : MB_CHAR2LEN(c2);
Bram Moolenaar473de612013-06-08 18:19:48 +02005600 }
5601 if (match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005602 // check that no composing char follows
Bram Moolenaar473de612013-06-08 18:19:48 +02005603 && !(enc_utf8
Bram Moolenaara12a1612019-01-24 16:39:02 +01005604 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
Bram Moolenaar473de612013-06-08 18:19:48 +02005605 {
5606 cleanup_subexpr();
5607 if (REG_MULTI)
5608 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005609 rex.reg_startpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005610 rex.reg_startpos[0].col = col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005611 rex.reg_endpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005612 rex.reg_endpos[0].col = col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005613 }
5614 else
5615 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005616 rex.reg_startp[0] = rex.line + col;
5617 rex.reg_endp[0] = rex.line + col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005618 }
5619 return 1L;
5620 }
5621
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005622 // Try finding regstart after the current match.
5623 col += MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005624 if (skip_to_start(regstart, &col) == FAIL)
5625 break;
5626 }
5627 return 0L;
5628}
5629
#ifdef FEAT_RELTIME
/*
 * Check whether the time limit for matching was passed.
 * Returns TRUE when "nfa_time_limit" is set and profile_passed_limit()
 * reports it as passed; "*nfa_timed_out" is then also set to TRUE, unless
 * "nfa_timed_out" is NULL.
 * Returns FALSE otherwise.
 */
    static int
nfa_did_time_out(void)
{
    if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit))
    {
        if (nfa_timed_out != NULL)
            *nfa_timed_out = TRUE;
        return TRUE;
    }
    return FALSE;
}
#endif
5643
Bram Moolenaar473de612013-06-08 18:19:48 +02005644/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005645 * Main matching routine.
5646 *
Bram Moolenaar0270f382018-07-17 05:43:58 +02005647 * Run NFA to determine whether it matches rex.input.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005648 *
Bram Moolenaar307aa162013-06-02 16:34:21 +02005649 * When "nfa_endp" is not NULL it is a required end-of-match position.
Bram Moolenaar61602c52013-06-01 19:54:43 +02005650 *
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005651 * Return TRUE if there is a match, FALSE if there is no match,
5652 * NFA_TOO_EXPENSIVE if we end up with too many states.
Bram Moolenaarf2118842013-09-25 18:16:38 +02005653 * When there is a match "submatch" contains the positions.
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005654 *
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005655 * Note: Caller must ensure that: start != NULL.
5656 */
5657 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005658nfa_regmatch(
5659 nfa_regprog_T *prog,
5660 nfa_state_T *start,
5661 regsubs_T *submatch,
5662 regsubs_T *m)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005663{
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005664 int result = FALSE;
Bram Moolenaaraaf30472015-01-27 14:40:00 +01005665 size_t size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005666 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005667 int go_to_nextline = FALSE;
5668 nfa_thread_T *t;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005669 nfa_list_T list[2];
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005670 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005671 nfa_list_T *thislist;
5672 nfa_list_T *nextlist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005673 int *listids = NULL;
Bram Moolenaar2338c322018-07-08 19:07:19 +02005674 int listids_len = 0;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005675 nfa_state_T *add_state;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005676 int add_here;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005677 int add_count;
Bram Moolenaar4380d1e2013-06-09 20:51:00 +02005678 int add_off = 0;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005679 int toplevel = start->c == NFA_MOPEN;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005680 regsubs_T *r;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005681#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005682 FILE *debug;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005683#endif
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005684
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005685 // Some patterns may take a long time to match, especially when using
5686 // recursive_regmatch(). Allow interrupting them with CTRL-C.
Bram Moolenaar41f12052013-08-25 17:01:42 +02005687 fast_breakcheck();
5688 if (got_int)
5689 return FALSE;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005690#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005691 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005692 return FALSE;
5693#endif
Bram Moolenaar41f12052013-08-25 17:01:42 +02005694
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005695#ifdef NFA_REGEXP_DEBUG_LOG
5696 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5697 if (debug == NULL)
5698 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005699 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005700 return FALSE;
5701 }
5702#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02005703 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005704
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005705 // Allocate memory for the lists of nodes.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005706 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005707
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005708 list[0].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005709 list[0].len = prog->nstate + 1;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005710 list[1].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005711 list[1].len = prog->nstate + 1;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005712 if (list[0].t == NULL || list[1].t == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005713 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005714
5715#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02005716 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005717 if (log_fd == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005718 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005719 emsg(_(e_log_open_failed));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005720 log_fd = stderr;
5721 }
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005722 fprintf(log_fd, "**********************************\n");
5723 nfa_set_code(start->c);
5724 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5725 abs(start->id), code);
5726 fprintf(log_fd, "**********************************\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005727#endif
5728
5729 thislist = &list[0];
5730 thislist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005731 thislist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005732 nextlist = &list[1];
5733 nextlist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005734 nextlist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005735#ifdef ENABLE_LOG
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005736 fprintf(log_fd, "(---) STARTSTATE first\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005737#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02005738 thislist->id = rex.nfa_listid + 1;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005739
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005740 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5741 // it's the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005742 if (toplevel)
5743 {
5744 if (REG_MULTI)
5745 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005746 m->norm.list.multi[0].start_lnum = rex.lnum;
5747 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005748 }
5749 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005750 m->norm.list.line[0].start = rex.input;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005751 m->norm.in_use = 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005752 r = addstate(thislist, start->out, m, NULL, 0);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005753 }
5754 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005755 r = addstate(thislist, start, m, NULL, 0);
5756 if (r == NULL)
5757 {
5758 nfa_match = NFA_TOO_EXPENSIVE;
5759 goto theend;
5760 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005761
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005762#define ADD_STATE_IF_MATCH(state) \
5763 if (result) { \
Bram Moolenaara2d95102013-06-04 14:23:05 +02005764 add_state = state->out; \
5765 add_off = clen; \
5766 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005767
5768 /*
5769 * Run for each character.
5770 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02005771 for (;;)
5772 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005773 int curc;
5774 int clen;
5775
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005776 if (has_mbyte)
5777 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005778 curc = (*mb_ptr2char)(rex.input);
5779 clen = (*mb_ptr2len)(rex.input);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005780 }
5781 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005782 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005783 curc = *rex.input;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005784 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005785 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005786 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02005787 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005788 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005789 go_to_nextline = FALSE;
5790 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005791
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005792 // swap lists
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005793 thislist = &list[flag];
5794 nextlist = &list[flag ^= 1];
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005795 nextlist->n = 0; // clear nextlist
Bram Moolenaar196ed142013-07-21 18:59:24 +02005796 nextlist->has_pim = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005797 ++rex.nfa_listid;
Bram Moolenaarbcf94422018-06-23 14:21:42 +02005798 if (prog->re_engine == AUTOMATIC_ENGINE
Bram Moolenaar0270f382018-07-17 05:43:58 +02005799 && (rex.nfa_listid >= NFA_MAX_STATES
Bram Moolenaar5ec74142018-06-23 17:14:41 +02005800# ifdef FEAT_EVAL
5801 || nfa_fail_for_testing
5802# endif
5803 ))
Bram Moolenaarfda37292014-11-05 14:27:36 +01005804 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005805 // too many states, retry with old engine
Bram Moolenaarfda37292014-11-05 14:27:36 +01005806 nfa_match = NFA_TOO_EXPENSIVE;
5807 goto theend;
5808 }
5809
Bram Moolenaar0270f382018-07-17 05:43:58 +02005810 thislist->id = rex.nfa_listid;
5811 nextlist->id = rex.nfa_listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005812
5813#ifdef ENABLE_LOG
5814 fprintf(log_fd, "------------------------------------------\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02005815 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005816 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005817 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005818 {
5819 int i;
5820
5821 for (i = 0; i < thislist->n; i++)
5822 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5823 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005824 fprintf(log_fd, "\n");
5825#endif
5826
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005827#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005828 fprintf(debug, "\n-------------------\n");
5829#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005830 /*
5831 * If the state lists are empty we can stop.
5832 */
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005833 if (thislist->n == 0)
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005834 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005835
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005836 // compute nextlist
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005837 for (listidx = 0; listidx < thislist->n; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005838 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005839 // If the list gets very long there probably is something wrong.
5840 // At least allow interrupting with CTRL-C.
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005841 fast_breakcheck();
5842 if (got_int)
5843 break;
5844#ifdef FEAT_RELTIME
5845 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
5846 {
5847 nfa_time_count = 0;
5848 if (nfa_did_time_out())
5849 break;
5850 }
5851#endif
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005852 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005853
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005854#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005855 nfa_set_code(t->state->c);
5856 fprintf(debug, "%s, ", code);
5857#endif
5858#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005859 {
5860 int col;
5861
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02005862 if (t->subs.norm.in_use <= 0)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005863 col = -1;
5864 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005865 col = t->subs.norm.list.multi[0].start_col;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005866 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005867 col = (int)(t->subs.norm.list.line[0].start - rex.line);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005868 nfa_set_code(t->state->c);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005869 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005870 abs(t->state->id), (int)t->state->c, code, col,
5871 pim_info(&t->pim));
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005872 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005873#endif
5874
5875 /*
5876 * Handle the possible codes of the current state.
5877 * The most important is NFA_MATCH.
5878 */
Bram Moolenaara2d95102013-06-04 14:23:05 +02005879 add_state = NULL;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005880 add_here = FALSE;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005881 add_count = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005882 switch (t->state->c)
5883 {
5884 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005885 {
Bram Moolenaaref2dff52020-12-21 14:54:32 +01005886 // If the match is not at the start of the line, ends before a
5887 // composing characters and rex.reg_icombine is not set, that
5888 // is not really a match.
5889 if (enc_utf8 && !rex.reg_icombine
5890 && rex.input != rex.line && utf_iscomposing(curc))
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005891 break;
Bram Moolenaara12a1612019-01-24 16:39:02 +01005892
Bram Moolenaar963fee22013-05-26 21:47:28 +02005893 nfa_match = TRUE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005894 copy_sub(&submatch->norm, &t->subs.norm);
5895#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005896 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005897 copy_sub(&submatch->synt, &t->subs.synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005898#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005899#ifdef ENABLE_LOG
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005900 log_subsexpr(&t->subs);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005901#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005902 // Found the left-most longest match, do not look at any other
5903 // states at this position. When the list of states is going
5904 // to be empty quit without advancing, so that "rex.input" is
5905 // correct.
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005906 if (nextlist->n == 0)
Bram Moolenaar57a285b2013-05-26 16:57:28 +02005907 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005908 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005909 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005910
5911 case NFA_END_INVISIBLE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005912 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02005913 case NFA_END_PATTERN:
Bram Moolenaarf46da702013-06-02 22:37:42 +02005914 /*
5915 * This is only encountered after a NFA_START_INVISIBLE or
Bram Moolenaar61602c52013-06-01 19:54:43 +02005916 * NFA_START_INVISIBLE_BEFORE node.
5917 * They surround a zero-width group, used with "\@=", "\&",
5918 * "\@!", "\@<=" and "\@<!".
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005919 * If we got here, it means that the current "invisible" group
5920 * finished successfully, so return control to the parent
Bram Moolenaarf46da702013-06-02 22:37:42 +02005921 * nfa_regmatch(). For a look-behind match only when it ends
5922 * in the position in "nfa_endp".
5923 * Submatches are stored in *m, and used in the parent call.
5924 */
Bram Moolenaar61602c52013-06-01 19:54:43 +02005925#ifdef ENABLE_LOG
Bram Moolenaarf46da702013-06-02 22:37:42 +02005926 if (nfa_endp != NULL)
5927 {
5928 if (REG_MULTI)
5929 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005930 (int)rex.lnum,
Bram Moolenaarf46da702013-06-02 22:37:42 +02005931 (int)nfa_endp->se_u.pos.lnum,
Bram Moolenaar0270f382018-07-17 05:43:58 +02005932 (int)(rex.input - rex.line),
Bram Moolenaarf46da702013-06-02 22:37:42 +02005933 nfa_endp->se_u.pos.col);
5934 else
5935 fprintf(log_fd, "Current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005936 (int)(rex.input - rex.line),
5937 (int)(nfa_endp->se_u.ptr - rex.input));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005938 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005939#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005940 // If "nfa_endp" is set it's only a match if it ends at
5941 // "nfa_endp"
Bram Moolenaarf46da702013-06-02 22:37:42 +02005942 if (nfa_endp != NULL && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02005943 ? (rex.lnum != nfa_endp->se_u.pos.lnum
5944 || (int)(rex.input - rex.line)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005945 != nfa_endp->se_u.pos.col)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005946 : rex.input != nfa_endp->se_u.ptr))
Bram Moolenaarf46da702013-06-02 22:37:42 +02005947 break;
5948
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005949 // do not set submatches for \@!
Bram Moolenaardecd9542013-06-07 16:31:50 +02005950 if (t->state->c != NFA_END_INVISIBLE_NEG)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005951 {
5952 copy_sub(&m->norm, &t->subs.norm);
5953#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005954 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005955 copy_sub(&m->synt, &t->subs.synt);
5956#endif
5957 }
Bram Moolenaar87953742013-06-05 18:52:40 +02005958#ifdef ENABLE_LOG
5959 fprintf(log_fd, "Match found:\n");
5960 log_subsexpr(m);
5961#endif
Bram Moolenaarf46da702013-06-02 22:37:42 +02005962 nfa_match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005963 // See comment above at "goto nextchar".
Bram Moolenaar78c93e42013-09-05 16:05:36 +02005964 if (nextlist->n == 0)
5965 clen = 0;
5966 goto nextchar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005967
5968 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005969 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005970 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005971 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar61602c52013-06-01 19:54:43 +02005972 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005973 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005974 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005975 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005976 {
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005977#ifdef ENABLE_LOG
5978 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
5979 failure_chance(t->state->out, 0),
5980 failure_chance(t->state->out1->out, 0));
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005981#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005982 // Do it directly if there already is a PIM or when
5983 // nfa_postprocess() detected it will work better.
Bram Moolenaara2947e22013-06-11 22:44:09 +02005984 if (t->pim.result != NFA_PIM_UNUSED
5985 || t->state->c == NFA_START_INVISIBLE_FIRST
5986 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
5987 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5988 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005989 {
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02005990 int in_use = m->norm.in_use;
5991
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005992 // Copy submatch info for the recursive call, opposite
5993 // of what happens on success below.
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02005994 copy_sub_off(&m->norm, &t->subs.norm);
Bram Moolenaar699c1202013-09-25 16:41:54 +02005995#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005996 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02005997 copy_sub_off(&m->synt, &t->subs.synt);
5998#endif
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02005999
Bram Moolenaara2d95102013-06-04 14:23:05 +02006000 /*
6001 * First try matching the invisible match, then what
6002 * follows.
6003 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006004 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006005 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006006 if (result == NFA_TOO_EXPENSIVE)
6007 {
6008 nfa_match = result;
6009 goto theend;
6010 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006011
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006012 // for \@! and \@<! it is a match when the result is
6013 // FALSE
Bram Moolenaardecd9542013-06-07 16:31:50 +02006014 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006015 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6016 || t->state->c
6017 == NFA_START_INVISIBLE_BEFORE_NEG
6018 || t->state->c
6019 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006020 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006021 // Copy submatch info from the recursive call
Bram Moolenaara2d95102013-06-04 14:23:05 +02006022 copy_sub_off(&t->subs.norm, &m->norm);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006023#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006024 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006025 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006026#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006027 // If the pattern has \ze and it matched in the
6028 // sub pattern, use it.
Bram Moolenaarf2118842013-09-25 18:16:38 +02006029 copy_ze_off(&t->subs.norm, &m->norm);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02006030
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006031 // t->state->out1 is the corresponding
6032 // END_INVISIBLE node; Add its out to the current
6033 // list (zero-width match).
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006034 add_here = TRUE;
6035 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006036 }
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006037 m->norm.in_use = in_use;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006038 }
6039 else
6040 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006041 nfa_pim_T pim;
6042
Bram Moolenaara2d95102013-06-04 14:23:05 +02006043 /*
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006044 * First try matching what follows. Only if a match
6045 * is found verify the invisible match matches. Add a
6046 * nfa_pim_T to the following states, it contains info
6047 * about the invisible match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006048 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006049 pim.state = t->state;
6050 pim.result = NFA_PIM_TODO;
6051 pim.subs.norm.in_use = 0;
6052#ifdef FEAT_SYN_HL
6053 pim.subs.synt.in_use = 0;
6054#endif
6055 if (REG_MULTI)
6056 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006057 pim.end.pos.col = (int)(rex.input - rex.line);
6058 pim.end.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006059 }
6060 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006061 pim.end.ptr = rex.input;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006062
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006063 // t->state->out1 is the corresponding END_INVISIBLE
6064 // node; Add its out to the current list (zero-width
6065 // match).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006066 if (addstate_here(thislist, t->state->out1->out,
6067 &t->subs, &pim, &listidx) == NULL)
6068 {
6069 nfa_match = NFA_TOO_EXPENSIVE;
6070 goto theend;
6071 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006072 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006073 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006074 break;
6075
Bram Moolenaar87953742013-06-05 18:52:40 +02006076 case NFA_START_PATTERN:
Bram Moolenaar43e02982013-06-07 17:31:29 +02006077 {
6078 nfa_state_T *skip = NULL;
6079#ifdef ENABLE_LOG
6080 int skip_lid = 0;
6081#endif
6082
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006083 // There is no point in trying to match the pattern if the
6084 // output state is not going to be added to the list.
Bram Moolenaar43e02982013-06-07 17:31:29 +02006085 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6086 {
6087 skip = t->state->out1->out;
6088#ifdef ENABLE_LOG
6089 skip_lid = nextlist->id;
6090#endif
6091 }
6092 else if (state_in_list(nextlist,
6093 t->state->out1->out->out, &t->subs))
6094 {
6095 skip = t->state->out1->out->out;
6096#ifdef ENABLE_LOG
6097 skip_lid = nextlist->id;
6098#endif
6099 }
Bram Moolenaar44c71db2013-06-14 22:33:51 +02006100 else if (state_in_list(thislist,
Bram Moolenaar43e02982013-06-07 17:31:29 +02006101 t->state->out1->out->out, &t->subs))
6102 {
6103 skip = t->state->out1->out->out;
6104#ifdef ENABLE_LOG
6105 skip_lid = thislist->id;
6106#endif
6107 }
6108 if (skip != NULL)
6109 {
6110#ifdef ENABLE_LOG
6111 nfa_set_code(skip->c);
6112 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6113 abs(skip->id), skip_lid, skip->c, code);
6114#endif
6115 break;
6116 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006117 // Copy submatch info to the recursive call, opposite of what
6118 // happens afterwards.
Bram Moolenaar699c1202013-09-25 16:41:54 +02006119 copy_sub_off(&m->norm, &t->subs.norm);
6120#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006121 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006122 copy_sub_off(&m->synt, &t->subs.synt);
6123#endif
Bram Moolenaar43e02982013-06-07 17:31:29 +02006124
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006125 // First try matching the pattern.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006126 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006127 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006128 if (result == NFA_TOO_EXPENSIVE)
6129 {
6130 nfa_match = result;
6131 goto theend;
6132 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006133 if (result)
6134 {
6135 int bytelen;
6136
6137#ifdef ENABLE_LOG
6138 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6139 log_subsexpr(m);
6140#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006141 // Copy submatch info from the recursive call
Bram Moolenaar87953742013-06-05 18:52:40 +02006142 copy_sub_off(&t->subs.norm, &m->norm);
6143#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006144 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006145 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaar87953742013-06-05 18:52:40 +02006146#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006147 // Now we need to skip over the matched text and then
6148 // continue with what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02006149 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006150 // TODO: multi-line match
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01006151 bytelen = m->norm.list.multi[0].end_col
Bram Moolenaar0270f382018-07-17 05:43:58 +02006152 - (int)(rex.input - rex.line);
Bram Moolenaar87953742013-06-05 18:52:40 +02006153 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006154 bytelen = (int)(m->norm.list.line[0].end - rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02006155
6156#ifdef ENABLE_LOG
6157 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6158#endif
6159 if (bytelen == 0)
6160 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006161 // empty match, output of corresponding
6162 // NFA_END_PATTERN/NFA_SKIP to be used at current
6163 // position
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006164 add_here = TRUE;
6165 add_state = t->state->out1->out->out;
Bram Moolenaar87953742013-06-05 18:52:40 +02006166 }
6167 else if (bytelen <= clen)
6168 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006169 // match current character, output of corresponding
6170 // NFA_END_PATTERN to be used at next position.
Bram Moolenaar87953742013-06-05 18:52:40 +02006171 add_state = t->state->out1->out->out;
6172 add_off = clen;
6173 }
6174 else
6175 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006176 // skip over the matched characters, set character
6177 // count in NFA_SKIP
Bram Moolenaar87953742013-06-05 18:52:40 +02006178 add_state = t->state->out1->out;
6179 add_off = bytelen;
6180 add_count = bytelen - clen;
6181 }
6182 }
6183 break;
Bram Moolenaar43e02982013-06-07 17:31:29 +02006184 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006185
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006186 case NFA_BOL:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006187 if (rex.input == rex.line)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006188 {
6189 add_here = TRUE;
6190 add_state = t->state->out;
6191 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006192 break;
6193
6194 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006195 if (curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006196 {
6197 add_here = TRUE;
6198 add_state = t->state->out;
6199 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006200 break;
6201
6202 case NFA_BOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006203 result = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006204
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006205 if (curc == NUL)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006206 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006207 else if (has_mbyte)
6208 {
6209 int this_class;
6210
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006211 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006212 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006213 if (this_class <= 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006214 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006215 else if (reg_prev_class() == this_class)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006216 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006217 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006218 else if (!vim_iswordc_buf(curc, rex.reg_buf)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006219 || (rex.input > rex.line
6220 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006221 result = FALSE;
6222 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006223 {
6224 add_here = TRUE;
6225 add_state = t->state->out;
6226 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006227 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006228
6229 case NFA_EOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006230 result = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006231 if (rex.input == rex.line)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006232 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006233 else if (has_mbyte)
6234 {
6235 int this_class, prev_class;
6236
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006237 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006238 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006239 prev_class = reg_prev_class();
6240 if (this_class == prev_class
6241 || prev_class == 0 || prev_class == 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006242 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006243 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02006244 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6245 || (rex.input[0] != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006246 && vim_iswordc_buf(curc, rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006247 result = FALSE;
6248 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006249 {
6250 add_here = TRUE;
6251 add_state = t->state->out;
6252 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006253 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006254
Bram Moolenaar4b780632013-05-31 22:14:52 +02006255 case NFA_BOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006256 if (rex.lnum == 0 && rex.input == rex.line
Bram Moolenaar6100d022016-10-02 16:51:57 +02006257 && (!REG_MULTI || rex.reg_firstlnum == 1))
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006258 {
6259 add_here = TRUE;
6260 add_state = t->state->out;
6261 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006262 break;
6263
6264 case NFA_EOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006265 if (rex.lnum == rex.reg_maxline && curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006266 {
6267 add_here = TRUE;
6268 add_state = t->state->out;
6269 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006270 break;
6271
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006272 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006273 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006274 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02006275 int len = 0;
6276 nfa_state_T *end;
6277 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006278 int cchars[MAX_MCO];
6279 int ccount = 0;
6280 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006281
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006282 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006283 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006284 if (utf_iscomposing(sta->c))
6285 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006286 // Only match composing character(s), ignore base
6287 // character. Used for ".{composing}" and "{composing}"
6288 // (no preceding character).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006289 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006290 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006291 if (rex.reg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006292 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006293 // If \Z was present, then ignore composing characters.
6294 // When ignoring the base character this always matches.
Bram Moolenaardff72ba2018-02-08 22:45:17 +01006295 if (sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006296 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006297 else
6298 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006299 while (sta->c != NFA_END_COMPOSING)
6300 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006301 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006302
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006303 // Check base character matches first, unless ignored.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006304 else if (len > 0 || mc == sta->c)
6305 {
6306 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006307 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006308 len += mb_char2len(mc);
6309 sta = sta->out;
6310 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006311
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006312 // We don't care about the order of composing characters.
6313 // Get them into cchars[] first.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006314 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006315 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006316 mc = mb_ptr2char(rex.input + len);
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006317 cchars[ccount++] = mc;
6318 len += mb_char2len(mc);
6319 if (ccount == MAX_MCO)
6320 break;
6321 }
6322
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006323 // Check that each composing char in the pattern matches a
6324 // composing char in the text. We do not check if all
6325 // composing chars are matched.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006326 result = OK;
6327 while (sta->c != NFA_END_COMPOSING)
6328 {
6329 for (j = 0; j < ccount; ++j)
6330 if (cchars[j] == sta->c)
6331 break;
6332 if (j == ccount)
6333 {
6334 result = FAIL;
6335 break;
6336 }
6337 sta = sta->out;
6338 }
6339 }
6340 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02006341 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006342
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006343 end = t->state->out1; // NFA_END_COMPOSING
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006344 ADD_STATE_IF_MATCH(end);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006345 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006346 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006347
6348 case NFA_NEWL:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006349 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006350 && rex.lnum <= rex.reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006351 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02006352 go_to_nextline = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006353 // Pass -1 for the offset, which means taking the position
6354 // at the start of the next line.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006355 add_state = t->state->out;
6356 add_off = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006357 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006358 else if (curc == '\n' && rex.reg_line_lbr)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006359 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006360 // match \n as if it is an ordinary character
Bram Moolenaara2d95102013-06-04 14:23:05 +02006361 add_state = t->state->out;
6362 add_off = 1;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006363 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006364 break;
6365
Bram Moolenaar417bad22013-06-07 14:08:30 +02006366 case NFA_START_COLL:
6367 case NFA_START_NEG_COLL:
6368 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006369 // What follows is a list of characters, until NFA_END_COLL.
6370 // One of them must match or none of them must match.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006371 nfa_state_T *state;
6372 int result_if_matched;
6373 int c1, c2;
6374
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006375 // Never match EOL. If it's part of the collection it is added
6376 // as a separate state with an OR.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006377 if (curc == NUL)
6378 break;
6379
6380 state = t->state->out;
6381 result_if_matched = (t->state->c == NFA_START_COLL);
6382 for (;;)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006383 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02006384 if (state->c == NFA_END_COLL)
6385 {
6386 result = !result_if_matched;
6387 break;
6388 }
6389 if (state->c == NFA_RANGE_MIN)
6390 {
6391 c1 = state->val;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006392 state = state->out; // advance to NFA_RANGE_MAX
Bram Moolenaar417bad22013-06-07 14:08:30 +02006393 c2 = state->val;
6394#ifdef ENABLE_LOG
6395 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6396 curc, c1, c2);
6397#endif
6398 if (curc >= c1 && curc <= c2)
6399 {
6400 result = result_if_matched;
6401 break;
6402 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006403 if (rex.reg_ic)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006404 {
Bram Moolenaar59de4172020-06-09 19:34:54 +02006405 int curc_low = MB_CASEFOLD(curc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02006406 int done = FALSE;
6407
6408 for ( ; c1 <= c2; ++c1)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006409 if (MB_CASEFOLD(c1) == curc_low)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006410 {
6411 result = result_if_matched;
6412 done = TRUE;
6413 break;
6414 }
6415 if (done)
6416 break;
6417 }
6418 }
6419 else if (state->c < 0 ? check_char_class(state->c, curc)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01006420 : (curc == state->c
Bram Moolenaar59de4172020-06-09 19:34:54 +02006421 || (rex.reg_ic && MB_CASEFOLD(curc)
6422 == MB_CASEFOLD(state->c))))
Bram Moolenaar417bad22013-06-07 14:08:30 +02006423 {
6424 result = result_if_matched;
6425 break;
6426 }
6427 state = state->out;
6428 }
6429 if (result)
6430 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006431 // next state is in out of the NFA_END_COLL, out1 of
6432 // START points to the END state
Bram Moolenaar417bad22013-06-07 14:08:30 +02006433 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006434 add_off = clen;
6435 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006436 break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02006437 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006438
6439 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006440 // Matches any char except '\0'; NUL (end of input) does not match.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006441 if (curc > 0)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006442 {
Bram Moolenaara2d95102013-06-04 14:23:05 +02006443 add_state = t->state->out;
6444 add_off = clen;
6445 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006446 break;
6447
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006448 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006449 // On a composing character skip over it. Otherwise do
6450 // nothing. Always matches.
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006451 if (enc_utf8 && utf_iscomposing(curc))
6452 {
6453 add_off = clen;
6454 }
6455 else
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006456 {
6457 add_here = TRUE;
6458 add_off = 0;
6459 }
6460 add_state = t->state->out;
6461 break;
6462
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006463 /*
6464 * Character classes like \a for alpha, \d for digit etc.
6465 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006466 case NFA_IDENT: // \i
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006467 result = vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006468 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006469 break;
6470
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006471 case NFA_SIDENT: // \I
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006472 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006473 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006474 break;
6475
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006476 case NFA_KWORD: // \k
Bram Moolenaar0270f382018-07-17 05:43:58 +02006477 result = vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006478 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006479 break;
6480
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006481 case NFA_SKWORD: // \K
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006482 result = !VIM_ISDIGIT(curc)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006483 && vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006484 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006485 break;
6486
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006487 case NFA_FNAME: // \f
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006488 result = vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006489 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006490 break;
6491
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006492 case NFA_SFNAME: // \F
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006493 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006494 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006495 break;
6496
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006497 case NFA_PRINT: // \p
Bram Moolenaar0270f382018-07-17 05:43:58 +02006498 result = vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006499 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006500 break;
6501
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006502 case NFA_SPRINT: // \P
Bram Moolenaar0270f382018-07-17 05:43:58 +02006503 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006504 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006505 break;
6506
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006507 case NFA_WHITE: // \s
Bram Moolenaar1c465442017-03-12 20:10:05 +01006508 result = VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006509 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006510 break;
6511
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006512 case NFA_NWHITE: // \S
Bram Moolenaar1c465442017-03-12 20:10:05 +01006513 result = curc != NUL && !VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006514 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006515 break;
6516
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006517 case NFA_DIGIT: // \d
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006518 result = ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006519 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006520 break;
6521
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006522 case NFA_NDIGIT: // \D
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006523 result = curc != NUL && !ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006524 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006525 break;
6526
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006527 case NFA_HEX: // \x
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006528 result = ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006529 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006530 break;
6531
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006532 case NFA_NHEX: // \X
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006533 result = curc != NUL && !ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006534 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006535 break;
6536
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006537 case NFA_OCTAL: // \o
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006538 result = ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006539 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006540 break;
6541
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006542 case NFA_NOCTAL: // \O
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006543 result = curc != NUL && !ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006544 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006545 break;
6546
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006547 case NFA_WORD: // \w
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006548 result = ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006549 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006550 break;
6551
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006552 case NFA_NWORD: // \W
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006553 result = curc != NUL && !ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006554 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006555 break;
6556
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006557 case NFA_HEAD: // \h
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006558 result = ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006559 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006560 break;
6561
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006562 case NFA_NHEAD: // \H
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006563 result = curc != NUL && !ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006564 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006565 break;
6566
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006567 case NFA_ALPHA: // \a
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006568 result = ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006569 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006570 break;
6571
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006572 case NFA_NALPHA: // \A
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006573 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006574 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006575 break;
6576
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006577 case NFA_LOWER: // \l
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006578 result = ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006579 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006580 break;
6581
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006582 case NFA_NLOWER: // \L
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006583 result = curc != NUL && !ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006584 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006585 break;
6586
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006587 case NFA_UPPER: // \u
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006588 result = ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006589 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006590 break;
6591
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006592 case NFA_NUPPER: // \U
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006593 result = curc != NUL && !ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006594 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006595 break;
6596
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006597 case NFA_LOWER_IC: // [a-z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006598 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006599 ADD_STATE_IF_MATCH(t->state);
6600 break;
6601
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006602 case NFA_NLOWER_IC: // [^a-z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006603 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006604 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006605 ADD_STATE_IF_MATCH(t->state);
6606 break;
6607
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006608 case NFA_UPPER_IC: // [A-Z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006609 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006610 ADD_STATE_IF_MATCH(t->state);
6611 break;
6612
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006613 case NFA_NUPPER_IC: // ^[A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006614 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006615 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006616 ADD_STATE_IF_MATCH(t->state);
6617 break;
6618
Bram Moolenaar5714b802013-05-28 22:03:20 +02006619 case NFA_BACKREF1:
6620 case NFA_BACKREF2:
6621 case NFA_BACKREF3:
6622 case NFA_BACKREF4:
6623 case NFA_BACKREF5:
6624 case NFA_BACKREF6:
6625 case NFA_BACKREF7:
6626 case NFA_BACKREF8:
6627 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006628#ifdef FEAT_SYN_HL
6629 case NFA_ZREF1:
6630 case NFA_ZREF2:
6631 case NFA_ZREF3:
6632 case NFA_ZREF4:
6633 case NFA_ZREF5:
6634 case NFA_ZREF6:
6635 case NFA_ZREF7:
6636 case NFA_ZREF8:
6637 case NFA_ZREF9:
6638#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006639 // \1 .. \9 \z1 .. \z9
Bram Moolenaar5714b802013-05-28 22:03:20 +02006640 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006641 int subidx;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006642 int bytelen;
6643
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006644 if (t->state->c <= NFA_BACKREF9)
6645 {
6646 subidx = t->state->c - NFA_BACKREF1 + 1;
6647 result = match_backref(&t->subs.norm, subidx, &bytelen);
6648 }
6649#ifdef FEAT_SYN_HL
6650 else
6651 {
6652 subidx = t->state->c - NFA_ZREF1 + 1;
6653 result = match_zref(subidx, &bytelen);
6654 }
6655#endif
6656
Bram Moolenaar5714b802013-05-28 22:03:20 +02006657 if (result)
6658 {
6659 if (bytelen == 0)
6660 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006661 // empty match always works, output of NFA_SKIP to be
6662 // used next
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006663 add_here = TRUE;
6664 add_state = t->state->out->out;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006665 }
6666 else if (bytelen <= clen)
6667 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006668 // match current character, jump ahead to out of
6669 // NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006670 add_state = t->state->out->out;
6671 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006672 }
6673 else
6674 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006675 // skip over the matched characters, set character
6676 // count in NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006677 add_state = t->state->out;
6678 add_off = bytelen;
6679 add_count = bytelen - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006680 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02006681 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02006682 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006683 }
6684 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006685 // character of previous matching \1 .. \9 or \@>
Bram Moolenaar5714b802013-05-28 22:03:20 +02006686 if (t->count - clen <= 0)
6687 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006688 // end of match, go to what follows
Bram Moolenaara2d95102013-06-04 14:23:05 +02006689 add_state = t->state->out;
6690 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006691 }
6692 else
6693 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006694 // add state again with decremented count
Bram Moolenaara2d95102013-06-04 14:23:05 +02006695 add_state = t->state;
6696 add_off = 0;
6697 add_count = t->count - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006698 }
6699 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006700
Bram Moolenaar423532e2013-05-29 21:14:42 +02006701 case NFA_LNUM:
6702 case NFA_LNUM_GT:
6703 case NFA_LNUM_LT:
6704 result = (REG_MULTI &&
6705 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006706 (long_u)(rex.lnum + rex.reg_firstlnum)));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006707 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006708 {
6709 add_here = TRUE;
6710 add_state = t->state->out;
6711 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006712 break;
6713
6714 case NFA_COL:
6715 case NFA_COL_GT:
6716 case NFA_COL_LT:
6717 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006718 (long_u)(rex.input - rex.line) + 1);
Bram Moolenaar423532e2013-05-29 21:14:42 +02006719 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006720 {
6721 add_here = TRUE;
6722 add_state = t->state->out;
6723 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006724 break;
6725
6726 case NFA_VCOL:
6727 case NFA_VCOL_GT:
6728 case NFA_VCOL_LT:
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006729 {
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006730 int op = t->state->c - NFA_VCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006731 colnr_T col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar6100d022016-10-02 16:51:57 +02006732 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006733
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006734 // Bail out quickly when there can't be a match, avoid the
6735 // overhead of win_linetabsize() on long lines.
Bram Moolenaar4f36dc32015-03-05 17:16:06 +01006736 if (op != 1 && col > t->state->val
Bram Moolenaara12a1612019-01-24 16:39:02 +01006737 * (has_mbyte ? MB_MAXBYTES : 1))
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006738 break;
Bram Moolenaaref795d12015-01-18 16:46:32 +01006739 result = FALSE;
6740 if (op == 1 && col - 1 > t->state->val && col > 100)
6741 {
6742 int ts = wp->w_buffer->b_p_ts;
6743
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006744 // Guess that a character won't use more columns than
6745 // 'tabstop', with a minimum of 4.
Bram Moolenaaref795d12015-01-18 16:46:32 +01006746 if (ts < 4)
6747 ts = 4;
6748 result = col > t->state->val * ts;
6749 }
6750 if (!result)
6751 result = nfa_re_num_cmp(t->state->val, op,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006752 (long_u)win_linetabsize(wp, rex.line, col) + 1);
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006753 if (result)
6754 {
6755 add_here = TRUE;
6756 add_state = t->state->out;
6757 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006758 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006759 break;
6760
Bram Moolenaar044aa292013-06-04 21:27:38 +02006761 case NFA_MARK:
6762 case NFA_MARK_GT:
6763 case NFA_MARK_LT:
6764 {
Bram Moolenaar64066b92021-11-17 18:22:56 +00006765 size_t col = rex.input - rex.line;
Bram Moolenaar6100d022016-10-02 16:51:57 +02006766 pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
Bram Moolenaar044aa292013-06-04 21:27:38 +02006767
Bram Moolenaar64066b92021-11-17 18:22:56 +00006768 // Line may have been freed, get it again.
6769 if (REG_MULTI)
6770 {
6771 rex.line = reg_getline(rex.lnum);
6772 rex.input = rex.line + col;
6773 }
6774
Bram Moolenaar872bee52021-05-24 22:56:15 +02006775 // Compare the mark position to the match position, if the mark
6776 // exists and mark is set in reg_buf.
6777 if (pos != NULL && pos->lnum > 0)
6778 {
6779 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6780 && pos->col == MAXCOL
6781 ? (colnr_T)STRLEN(reg_getline(
6782 pos->lnum - rex.reg_firstlnum))
6783 : pos->col;
6784
6785 result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6786 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006787 ? t->state->c == NFA_MARK
Bram Moolenaar872bee52021-05-24 22:56:15 +02006788 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006789 ? t->state->c == NFA_MARK_GT
6790 : t->state->c == NFA_MARK_LT))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006791 : (pos->lnum < rex.lnum + rex.reg_firstlnum
Bram Moolenaar044aa292013-06-04 21:27:38 +02006792 ? t->state->c == NFA_MARK_GT
Bram Moolenaar872bee52021-05-24 22:56:15 +02006793 : t->state->c == NFA_MARK_LT));
6794 if (result)
6795 {
6796 add_here = TRUE;
6797 add_state = t->state->out;
6798 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006799 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02006800 break;
6801 }
6802
Bram Moolenaar423532e2013-05-29 21:14:42 +02006803 case NFA_CURSOR:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006804 result = (rex.reg_win != NULL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006805 && (rex.lnum + rex.reg_firstlnum
Bram Moolenaar6100d022016-10-02 16:51:57 +02006806 == rex.reg_win->w_cursor.lnum)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006807 && ((colnr_T)(rex.input - rex.line)
Bram Moolenaar6100d022016-10-02 16:51:57 +02006808 == rex.reg_win->w_cursor.col));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006809 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006810 {
6811 add_here = TRUE;
6812 add_state = t->state->out;
6813 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006814 break;
6815
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006816 case NFA_VISUAL:
6817 result = reg_match_visual();
6818 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006819 {
6820 add_here = TRUE;
6821 add_state = t->state->out;
6822 }
Bram Moolenaar973fced2013-06-05 21:10:59 +02006823 break;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006824
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006825 case NFA_MOPEN1:
6826 case NFA_MOPEN2:
6827 case NFA_MOPEN3:
6828 case NFA_MOPEN4:
6829 case NFA_MOPEN5:
6830 case NFA_MOPEN6:
6831 case NFA_MOPEN7:
6832 case NFA_MOPEN8:
6833 case NFA_MOPEN9:
6834#ifdef FEAT_SYN_HL
6835 case NFA_ZOPEN:
6836 case NFA_ZOPEN1:
6837 case NFA_ZOPEN2:
6838 case NFA_ZOPEN3:
6839 case NFA_ZOPEN4:
6840 case NFA_ZOPEN5:
6841 case NFA_ZOPEN6:
6842 case NFA_ZOPEN7:
6843 case NFA_ZOPEN8:
6844 case NFA_ZOPEN9:
6845#endif
6846 case NFA_NOPEN:
6847 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006848 // These states are only added to be able to bail out when
6849 // they are added again, nothing is to be done.
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006850 break;
6851
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006852 default: // regular character
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006853 {
6854 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006855
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006856#ifdef DEBUG
Bram Moolenaardecd9542013-06-07 16:31:50 +02006857 if (c < 0)
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00006858 siemsg("INTERNAL: Negative state char: %ld", (long)c);
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006859#endif
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006860 result = (c == curc);
6861
Bram Moolenaar6100d022016-10-02 16:51:57 +02006862 if (!result && rex.reg_ic)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006863 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006864 // If rex.reg_icombine is not set only skip over the character
6865 // itself. When it is set skip over composing characters.
Bram Moolenaar6100d022016-10-02 16:51:57 +02006866 if (result && enc_utf8 && !rex.reg_icombine)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006867 clen = utf_ptr2len(rex.input);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006868 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006869 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006870 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006871
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006872 } // switch (t->state->c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006873
6874 if (add_state != NULL)
6875 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006876 nfa_pim_T *pim;
Bram Moolenaara951e352013-10-06 15:46:11 +02006877 nfa_pim_T pim_copy;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006878
6879 if (t->pim.result == NFA_PIM_UNUSED)
6880 pim = NULL;
6881 else
6882 pim = &t->pim;
6883
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006884 // Handle the postponed invisible match if the match might end
6885 // without advancing and before the end of the line.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006886 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006887 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006888 if (pim->result == NFA_PIM_TODO)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006889 {
6890#ifdef ENABLE_LOG
6891 fprintf(log_fd, "\n");
6892 fprintf(log_fd, "==================================\n");
6893 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6894 fprintf(log_fd, "\n");
6895#endif
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006896 result = recursive_regmatch(pim->state, pim,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006897 prog, submatch, m, &listids, &listids_len);
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006898 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006899 // for \@! and \@<! it is a match when the result is
6900 // FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006901 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006902 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6903 || pim->state->c
6904 == NFA_START_INVISIBLE_BEFORE_NEG
6905 || pim->state->c
6906 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006907 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006908 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006909 copy_sub_off(&pim->subs.norm, &m->norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006910#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006911 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006912 copy_sub_off(&pim->subs.synt, &m->synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006913#endif
6914 }
6915 }
6916 else
6917 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006918 result = (pim->result == NFA_PIM_MATCH);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006919#ifdef ENABLE_LOG
6920 fprintf(log_fd, "\n");
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006921 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006922 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6923 fprintf(log_fd, "\n");
6924#endif
6925 }
6926
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006927 // for \@! and \@<! it is a match when result is FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006928 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006929 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6930 || pim->state->c
6931 == NFA_START_INVISIBLE_BEFORE_NEG
6932 || pim->state->c
6933 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006934 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006935 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006936 copy_sub_off(&t->subs.norm, &pim->subs.norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006937#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006938 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006939 copy_sub_off(&t->subs.synt, &pim->subs.synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006940#endif
6941 }
6942 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006943 // look-behind match failed, don't add the state
Bram Moolenaara2d95102013-06-04 14:23:05 +02006944 continue;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006945
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006946 // Postponed invisible match was handled, don't add it to
6947 // following states.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006948 pim = NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006949 }
6950
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006951 // If "pim" points into l->t it will become invalid when
6952 // adding the state causes the list to be reallocated. Make a
6953 // local copy to avoid that.
Bram Moolenaara951e352013-10-06 15:46:11 +02006954 if (pim == &t->pim)
6955 {
6956 copy_pim(&pim_copy, pim);
6957 pim = &pim_copy;
6958 }
6959
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006960 if (add_here)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006961 r = addstate_here(thislist, add_state, &t->subs,
6962 pim, &listidx);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006963 else
6964 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006965 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006966 if (add_count > 0)
6967 nextlist->t[nextlist->n - 1].count = add_count;
6968 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006969 if (r == NULL)
6970 {
6971 nfa_match = NFA_TOO_EXPENSIVE;
6972 goto theend;
6973 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006974 }
6975
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006976 } // for (thislist = thislist; thislist->state; thislist++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006977
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006978 // Look for the start of a match in the current position by adding the
6979 // start state to the list of states.
6980 // The first found match is the leftmost one, thus the order of states
6981 // matters!
6982 // Do not add the start state in recursive calls of nfa_regmatch(),
6983 // because recursive calls should only start in the first position.
6984 // Unless "nfa_endp" is not NULL, then we match the end position.
6985 // Also don't start a match past the first line.
Bram Moolenaar61602c52013-06-01 19:54:43 +02006986 if (nfa_match == FALSE
Bram Moolenaarf96d1092013-06-07 22:39:40 +02006987 && ((toplevel
Bram Moolenaar0270f382018-07-17 05:43:58 +02006988 && rex.lnum == 0
Bram Moolenaar61602c52013-06-01 19:54:43 +02006989 && clen != 0
Bram Moolenaar6100d022016-10-02 16:51:57 +02006990 && (rex.reg_maxcol == 0
Bram Moolenaar0270f382018-07-17 05:43:58 +02006991 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
Bram Moolenaar307aa162013-06-02 16:34:21 +02006992 || (nfa_endp != NULL
Bram Moolenaar61602c52013-06-01 19:54:43 +02006993 && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006994 ? (rex.lnum < nfa_endp->se_u.pos.lnum
6995 || (rex.lnum == nfa_endp->se_u.pos.lnum
6996 && (int)(rex.input - rex.line)
Bram Moolenaar307aa162013-06-02 16:34:21 +02006997 < nfa_endp->se_u.pos.col))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006998 : rex.input < nfa_endp->se_u.ptr))))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006999 {
7000#ifdef ENABLE_LOG
7001 fprintf(log_fd, "(---) STARTSTATE\n");
7002#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007003 // Inline optimized code for addstate() if we know the state is
7004 // the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007005 if (toplevel)
7006 {
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007007 int add = TRUE;
7008 int c;
7009
7010 if (prog->regstart != NUL && clen != 0)
7011 {
7012 if (nextlist->n == 0)
7013 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007014 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007015
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007016 // Nextlist is empty, we can skip ahead to the
7017 // character that must appear at the start.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007018 if (skip_to_start(prog->regstart, &col) == FAIL)
7019 break;
7020#ifdef ENABLE_LOG
7021 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02007022 col - ((colnr_T)(rex.input - rex.line) + clen));
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007023#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007024 rex.input = rex.line + col - clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007025 }
7026 else
7027 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007028 // Checking if the required start character matches is
7029 // cheaper than adding a state that won't match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02007030 c = PTR2CHAR(rex.input + clen);
Bram Moolenaar6100d022016-10-02 16:51:57 +02007031 if (c != prog->regstart && (!rex.reg_ic
Bram Moolenaar59de4172020-06-09 19:34:54 +02007032 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007033 {
7034#ifdef ENABLE_LOG
7035 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7036#endif
7037 add = FALSE;
7038 }
7039 }
7040 }
7041
7042 if (add)
7043 {
7044 if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007045 m->norm.list.multi[0].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02007046 (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007047 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007048 m->norm.list.line[0].start = rex.input + clen;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007049 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7050 {
7051 nfa_match = NFA_TOO_EXPENSIVE;
7052 goto theend;
7053 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007054 }
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007055 }
7056 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007057 {
7058 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7059 {
7060 nfa_match = NFA_TOO_EXPENSIVE;
7061 goto theend;
7062 }
7063 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007064 }
7065
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007066#ifdef ENABLE_LOG
7067 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007068 {
7069 int i;
7070
7071 for (i = 0; i < thislist->n; i++)
7072 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7073 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007074 fprintf(log_fd, "\n");
7075#endif
7076
7077nextchar:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007078 // Advance to the next character, or advance to the next line, or
7079 // finish.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007080 if (clen != 0)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007081 rex.input += clen;
Bram Moolenaar307aa162013-06-02 16:34:21 +02007082 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007083 && rex.lnum < nfa_endp->se_u.pos.lnum))
Bram Moolenaar35b23862013-05-22 23:00:40 +02007084 reg_nextline();
7085 else
7086 break;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007087
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007088 // Allow interrupting with CTRL-C.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007089 line_breakcheck();
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007090 if (got_int)
7091 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007092#ifdef FEAT_RELTIME
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007093 // Check for timeout only once every twenty iterations to avoid overhead.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007094 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
7095 {
7096 nfa_time_count = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007097 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007098 break;
7099 }
7100#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02007101 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007102
7103#ifdef ENABLE_LOG
7104 if (log_fd != stderr)
7105 fclose(log_fd);
7106 log_fd = NULL;
7107#endif
7108
7109theend:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007110 // Free memory
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007111 vim_free(list[0].t);
7112 vim_free(list[1].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02007113 vim_free(listids);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007114#undef ADD_STATE_IF_MATCH
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02007115#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007116 fclose(debug);
7117#endif
7118
Bram Moolenaar963fee22013-05-26 21:47:28 +02007119 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007120}
7121
7122/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02007123 * Try to match "prog" starting at rex.line["col"].
Bram Moolenaar8c731502014-11-23 15:57:49 +01007124 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007125 */
7126 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007127nfa_regtry(
7128 nfa_regprog_T *prog,
7129 colnr_T col,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007130 proftime_T *tm UNUSED, // timeout limit or NULL
7131 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007132{
7133 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007134 regsubs_T subs, m;
7135 nfa_state_T *start = prog->start;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007136 int result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007137#ifdef ENABLE_LOG
7138 FILE *f;
7139#endif
7140
Bram Moolenaar0270f382018-07-17 05:43:58 +02007141 rex.input = rex.line + col;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007142#ifdef FEAT_RELTIME
7143 nfa_time_limit = tm;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007144 nfa_timed_out = timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007145 nfa_time_count = 0;
7146#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007147
7148#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007149 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007150 if (f != NULL)
7151 {
Bram Moolenaar87953742013-06-05 18:52:40 +02007152 fprintf(f, "\n\n\t=======================================================\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007153#ifdef DEBUG
7154 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
7155#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007156 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02007157 fprintf(f, "\t=======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02007158 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007159 fprintf(f, "\n\n");
7160 fclose(f);
7161 }
7162 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01007163 emsg("Could not open temporary log file for writing");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007164#endif
7165
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007166 clear_sub(&subs.norm);
7167 clear_sub(&m.norm);
7168#ifdef FEAT_SYN_HL
7169 clear_sub(&subs.synt);
7170 clear_sub(&m.synt);
7171#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007172
Bram Moolenaarfda37292014-11-05 14:27:36 +01007173 result = nfa_regmatch(prog, start, &subs, &m);
7174 if (result == FALSE)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007175 return 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007176 else if (result == NFA_TOO_EXPENSIVE)
7177 return result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007178
7179 cleanup_subexpr();
7180 if (REG_MULTI)
7181 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007182 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007183 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007184 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7185 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007186
Bram Moolenaar6100d022016-10-02 16:51:57 +02007187 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7188 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007189 }
7190
Bram Moolenaar6100d022016-10-02 16:51:57 +02007191 if (rex.reg_startpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007192 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007193 rex.reg_startpos[0].lnum = 0;
7194 rex.reg_startpos[0].col = col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007195 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02007196 if (rex.reg_endpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007197 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007198 // pattern has a \ze but it didn't match, use current end
Bram Moolenaar0270f382018-07-17 05:43:58 +02007199 rex.reg_endpos[0].lnum = rex.lnum;
7200 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007201 }
7202 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007203 // Use line number of "\ze".
Bram Moolenaar0270f382018-07-17 05:43:58 +02007204 rex.lnum = rex.reg_endpos[0].lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007205 }
7206 else
7207 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007208 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007209 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007210 rex.reg_startp[i] = subs.norm.list.line[i].start;
7211 rex.reg_endp[i] = subs.norm.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007212 }
7213
Bram Moolenaar6100d022016-10-02 16:51:57 +02007214 if (rex.reg_startp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007215 rex.reg_startp[0] = rex.line + col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007216 if (rex.reg_endp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007217 rex.reg_endp[0] = rex.input;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007218 }
7219
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007220#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007221 // Package any found \z(...\) matches for export. Default is none.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007222 unref_extmatch(re_extmatch_out);
7223 re_extmatch_out = NULL;
7224
7225 if (prog->reghasz == REX_SET)
7226 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007227 cleanup_zsubexpr();
7228 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01007229 if (re_extmatch_out == NULL)
7230 return 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007231 // Loop over \z1, \z2, etc. There is no \z0.
Bram Moolenaar5ad075c2015-11-24 15:18:32 +01007232 for (i = 1; i < subs.synt.in_use; i++)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007233 {
7234 if (REG_MULTI)
7235 {
7236 struct multipos *mpos = &subs.synt.list.multi[i];
7237
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007238 // Only accept single line matches that are valid.
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007239 if (mpos->start_lnum >= 0
7240 && mpos->start_lnum == mpos->end_lnum
7241 && mpos->end_col >= mpos->start_col)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007242 re_extmatch_out->matches[i] =
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007243 vim_strnsave(reg_getline(mpos->start_lnum)
7244 + mpos->start_col,
7245 mpos->end_col - mpos->start_col);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007246 }
7247 else
7248 {
7249 struct linepos *lpos = &subs.synt.list.line[i];
7250
7251 if (lpos->start != NULL && lpos->end != NULL)
7252 re_extmatch_out->matches[i] =
Bram Moolenaar71ccd032020-06-12 22:59:11 +02007253 vim_strnsave(lpos->start, lpos->end - lpos->start);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007254 }
7255 }
7256 }
7257#endif
7258
Bram Moolenaar0270f382018-07-17 05:43:58 +02007259 return 1 + rex.lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007260}
7261
7262/*
7263 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02007264 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007265 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007266 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007267 */
7268 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007269nfa_regexec_both(
7270 char_u *line,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007271 colnr_T startcol, // column to start looking for match
7272 proftime_T *tm, // timeout limit or NULL
7273 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007274{
7275 nfa_regprog_T *prog;
7276 long retval = 0L;
7277 int i;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007278 colnr_T col = startcol;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007279
7280 if (REG_MULTI)
7281 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007282 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007283 line = reg_getline((linenr_T)0); // relative to the cursor
Bram Moolenaar6100d022016-10-02 16:51:57 +02007284 rex.reg_startpos = rex.reg_mmatch->startpos;
7285 rex.reg_endpos = rex.reg_mmatch->endpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007286 }
7287 else
7288 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007289 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7290 rex.reg_startp = rex.reg_match->startp;
7291 rex.reg_endp = rex.reg_match->endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007292 }
7293
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007294 // Be paranoid...
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007295 if (prog == NULL || line == NULL)
7296 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02007297 iemsg(_(e_null_argument));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007298 goto theend;
7299 }
7300
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007301 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007302 if (prog->regflags & RF_ICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007303 rex.reg_ic = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007304 else if (prog->regflags & RF_NOICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007305 rex.reg_ic = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007306
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007307 // If pattern contains "\Z" overrule value of rex.reg_icombine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007308 if (prog->regflags & RF_ICOMBINE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007309 rex.reg_icombine = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007310
Bram Moolenaar0270f382018-07-17 05:43:58 +02007311 rex.line = line;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007312 rex.lnum = 0; // relative to line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007313
Bram Moolenaar0270f382018-07-17 05:43:58 +02007314 rex.nfa_has_zend = prog->has_zend;
7315 rex.nfa_has_backref = prog->has_backref;
7316 rex.nfa_nsubexpr = prog->nsubexp;
7317 rex.nfa_listid = 1;
7318 rex.nfa_alt_listid = 2;
7319#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007320 nfa_regengine.expr = prog->pattern;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007321#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007322
Bram Moolenaard89616e2013-06-06 18:46:06 +02007323 if (prog->reganch && col > 0)
7324 return 0L;
7325
Bram Moolenaar0270f382018-07-17 05:43:58 +02007326 rex.need_clear_subexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007327#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007328 // Clear the external match subpointers if necessary.
Bram Moolenaar473de612013-06-08 18:19:48 +02007329 if (prog->reghasz == REX_SET)
7330 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007331 rex.nfa_has_zsubexpr = TRUE;
7332 rex.need_clear_zsubexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007333 }
7334 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007335 {
7336 rex.nfa_has_zsubexpr = FALSE;
7337 rex.need_clear_zsubexpr = FALSE;
7338 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007339#endif
7340
Bram Moolenaard89616e2013-06-06 18:46:06 +02007341 if (prog->regstart != NUL)
Bram Moolenaar473de612013-06-08 18:19:48 +02007342 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007343 // Skip ahead until a character we know the match must start with.
7344 // When there is none there is no match.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007345 if (skip_to_start(prog->regstart, &col) == FAIL)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007346 return 0L;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007347
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007348 // If match_text is set it contains the full text that must match.
7349 // Nothing else to try. Doesn't handle combining chars well.
Bram Moolenaara12a1612019-01-24 16:39:02 +01007350 if (prog->match_text != NULL && !rex.reg_icombine)
Bram Moolenaar473de612013-06-08 18:19:48 +02007351 return find_match_text(col, prog->regstart, prog->match_text);
7352 }
7353
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007354 // If the start column is past the maximum column: no need to try.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007355 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007356 goto theend;
7357
Bram Moolenaar0270f382018-07-17 05:43:58 +02007358 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7359 // it's accidentally used during execution.
7360 nstate = 0;
7361 for (i = 0; i < prog->nstate; ++i)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007362 {
7363 prog->state[i].id = i;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02007364 prog->state[i].lastlist[0] = 0;
7365 prog->state[i].lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007366 }
7367
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007368 retval = nfa_regtry(prog, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007369
Bram Moolenaar0270f382018-07-17 05:43:58 +02007370#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007371 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007372#endif
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007373
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007374theend:
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007375 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007376 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007377 // Make sure the end is never before the start. Can happen when \zs and
7378 // \ze are used.
7379 if (REG_MULTI)
7380 {
7381 lpos_T *start = &rex.reg_mmatch->startpos[0];
7382 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007383
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007384 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007385 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007386 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7387 }
7388 else
7389 {
7390 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7391 rex.reg_match->endp[0] = rex.reg_match->startp[0];
7392 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007393 }
7394
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007395 return retval;
7396}
7397
7398/*
7399 * Compile a regular expression into internal code for the NFA matcher.
7400 * Returns the program in allocated space. Returns NULL for an error.
7401 */
7402 static regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01007403nfa_regcomp(char_u *expr, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007404{
Bram Moolenaaraae48832013-05-25 21:18:34 +02007405 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02007406 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007407 int *postfix;
7408
7409 if (expr == NULL)
7410 return NULL;
7411
Bram Moolenaar0270f382018-07-17 05:43:58 +02007412#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007413 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007414#endif
Bram Moolenaare0ad3652015-01-27 12:59:55 +01007415 nfa_re_flags = re_flags;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007416
7417 init_class_tab();
7418
7419 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7420 return NULL;
7421
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007422 // Build postfix form of the regexp. Needed to build the NFA
7423 // (and count its size).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007424 postfix = re2post();
7425 if (postfix == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007426 goto fail; // Cascaded (syntax?) error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007427
7428 /*
7429 * In order to build the NFA, we parse the input regexp twice:
7430 * 1. first pass to count size (so we can allocate space)
7431 * 2. second to emit code
7432 */
7433#ifdef ENABLE_LOG
7434 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007435 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007436
7437 if (f != NULL)
7438 {
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02007439 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007440 fclose(f);
7441 }
7442 }
7443#endif
7444
7445 /*
7446 * PASS 1
7447 * Count number of NFA states in "nstate". Do not build the NFA.
7448 */
7449 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007450
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007451 // allocate the regprog with space for the compiled regexp
Bram Moolenaar16619a22013-06-11 18:42:36 +02007452 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02007453 prog = alloc(prog_size);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007454 if (prog == NULL)
7455 goto fail;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007456 state_ptr = prog->state;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007457 prog->re_in_use = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007458
7459 /*
7460 * PASS 2
7461 * Build the NFA
7462 */
7463 prog->start = post2nfa(postfix, post_ptr, FALSE);
7464 if (prog->start == NULL)
7465 goto fail;
7466
7467 prog->regflags = regflags;
7468 prog->engine = &nfa_regengine;
7469 prog->nstate = nstate;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007470 prog->has_zend = rex.nfa_has_zend;
7471 prog->has_backref = rex.nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02007472 prog->nsubexp = regnpar;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007473
Bram Moolenaara2947e22013-06-11 22:44:09 +02007474 nfa_postprocess(prog);
7475
Bram Moolenaard89616e2013-06-06 18:46:06 +02007476 prog->reganch = nfa_get_reganch(prog->start, 0);
7477 prog->regstart = nfa_get_regstart(prog->start, 0);
Bram Moolenaar473de612013-06-08 18:19:48 +02007478 prog->match_text = nfa_get_match_text(prog->start);
7479
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007480#ifdef ENABLE_LOG
7481 nfa_postfix_dump(expr, OK);
7482 nfa_dump(prog);
7483#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007484#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007485 // Remember whether this pattern has any \z specials in it.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007486 prog->reghasz = re_has_z;
7487#endif
Bram Moolenaar473de612013-06-08 18:19:48 +02007488 prog->pattern = vim_strsave(expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02007489#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007490 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007491#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007492
7493out:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007494 VIM_CLEAR(post_start);
7495 post_ptr = post_end = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007496 state_ptr = NULL;
7497 return (regprog_T *)prog;
7498
7499fail:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007500 VIM_CLEAR(prog);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007501#ifdef ENABLE_LOG
7502 nfa_postfix_dump(expr, FAIL);
7503#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007504#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007505 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007506#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007507 goto out;
7508}
7509
Bram Moolenaar473de612013-06-08 18:19:48 +02007510/*
7511 * Free a compiled regexp program, returned by nfa_regcomp().
7512 */
7513 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01007514nfa_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02007515{
7516 if (prog != NULL)
7517 {
7518 vim_free(((nfa_regprog_T *)prog)->match_text);
Bram Moolenaar473de612013-06-08 18:19:48 +02007519 vim_free(((nfa_regprog_T *)prog)->pattern);
Bram Moolenaar473de612013-06-08 18:19:48 +02007520 vim_free(prog);
7521 }
7522}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007523
7524/*
7525 * Match a regexp against a string.
7526 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7527 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaar2af78a12014-04-23 19:06:37 +02007528 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007529 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007530 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007531 */
7532 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01007533nfa_regexec_nl(
7534 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007535 char_u *line, // string to match against
7536 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01007537 int line_lbr)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007538{
Bram Moolenaar6100d022016-10-02 16:51:57 +02007539 rex.reg_match = rmp;
7540 rex.reg_mmatch = NULL;
7541 rex.reg_maxline = 0;
7542 rex.reg_line_lbr = line_lbr;
7543 rex.reg_buf = curbuf;
7544 rex.reg_win = NULL;
7545 rex.reg_ic = rmp->rm_ic;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007546 rex.reg_icombine = FALSE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007547 rex.reg_maxcol = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007548 return nfa_regexec_both(line, col, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007549}
7550
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007551
7552/*
7553 * Match a regexp against multiple lines.
7554 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7555 * Uses curbuf for line count and 'iskeyword'.
7556 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007557 * Return <= 0 if there is no match. Return number of lines contained in the
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007558 * match otherwise.
7559 *
7560 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7561 *
7562 * ! Also NOTE : match may actually be in another line. e.g.:
7563 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7564 *
7565 * +-------------------------+
7566 * |a |
7567 * |b |
7568 * |c |
7569 * | |
7570 * +-------------------------+
7571 *
7572 * then nfa_regexec_multi() returns 3. while the original
7573 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7574 *
7575 * FIXME if this behavior is not compatible.
7576 */
7577 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007578nfa_regexec_multi(
7579 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007580 win_T *win, // window in which to search or NULL
7581 buf_T *buf, // buffer in which to search
7582 linenr_T lnum, // nr of line to start looking for match
7583 colnr_T col, // column to start looking for match
7584 proftime_T *tm, // timeout limit or NULL
7585 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007586{
Bram Moolenaarf4140482020-02-15 23:06:45 +01007587 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007588 return nfa_regexec_both(NULL, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007589}
7590
7591#ifdef DEBUG
7592# undef ENABLE_LOG
7593#endif