blob: ed8efcfee7061b64c883dbb18be1fc6958fd5039 [file] [log] [blame]
/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * NFA regular expression implementation.
 *
 * This file is included in "regexp.c".
 */
7
/*
 * NFA engine logging.
 *
 * Up to four log files can be written:
 * - Error log: fatal errors of the NFA engine.
 * - Dump log:  information about the compiled NFA state machine.
 * - Run log:   information about the matching procedure.
 * - Debug log: detailed information about the matching procedure; it can be
 *		disabled by undefining NFA_REGEXP_DEBUG_LOG.
 * The error log can also be used without debug mode.  The other three are
 * enabled when compiling in debug mode; disable one individually by
 * commenting out its define below.
 * The log files can get quite big!
 * To turn all of this off when compiling Vim for debugging, undefine DEBUG in
 * regexp.c.
 */
#if defined(DEBUG)
# define NFA_REGEXP_ERROR_LOG	"nfa_regexp_error.log"
# define ENABLE_LOG
# define NFA_REGEXP_DUMP_LOG	"nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG	"nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG	"nfa_regexp_debug.log"
#endif
31
// Offset added to one of the codes NFA_ANY .. NFA_NUPPER_IC to get the
// variant of that class which also includes a NL.
#define NFA_ADD_NL	31

/*
 * State codes used in the postfix form and in the NFA.
 * The values are sequential starting at NFA_SPLIT, so the order of the
 * enumerators matters: do not reorder them.
 */
enum
{
    NFA_SPLIT = -1024,
    NFA_MATCH,
    NFA_EMPTY,				// 0-length match

    // Collections
    NFA_START_COLL,			// start of [abc]
    NFA_END_COLL,			// end of [abc]
    NFA_START_NEG_COLL,			// start of [^abc]
    NFA_END_NEG_COLL,			// end of [^abc] (postfix only)
    NFA_RANGE,				// range made of the two previous
					// items (postfix only)
    NFA_RANGE_MIN,			// lower bound of a range
    NFA_RANGE_MAX,			// upper bound of a range

    // Operators
    NFA_CONCAT,				// concatenation of the two previous
					// items (postfix only)
    NFA_OR,				// \| (postfix only)
    NFA_STAR,				// * greedy (postfix only)
    NFA_STAR_NONGREEDY,			// * non-greedy (postfix only)
    NFA_QUEST,				// \? greedy (postfix only)
    NFA_QUEST_NONGREEDY,		// \? non-greedy (postfix only)

    // Positions and grouping
    NFA_BOL,				// ^	begin of line
    NFA_EOL,				// $	end of line
    NFA_BOW,				// \<	begin of word
    NFA_EOW,				// \>	end of word
    NFA_BOF,				// \%^	begin of file
    NFA_EOF,				// \%$	end of file
    NFA_NEWL,
    NFA_ZSTART,				// for \zs
    NFA_ZEND,				// for \ze
    NFA_NOPEN,				// start of a \%( subexpression
    NFA_NCLOSE,				// end of a \%( ... \) subexpression
    NFA_START_INVISIBLE,
    NFA_START_INVISIBLE_FIRST,
    NFA_START_INVISIBLE_NEG,
    NFA_START_INVISIBLE_NEG_FIRST,
    NFA_START_INVISIBLE_BEFORE,
    NFA_START_INVISIBLE_BEFORE_FIRST,
    NFA_START_INVISIBLE_BEFORE_NEG,
    NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
    NFA_START_PATTERN,
    NFA_END_INVISIBLE,
    NFA_END_INVISIBLE_NEG,
    NFA_END_PATTERN,
    NFA_COMPOSING,			// the following NFA nodes belong to
					// a composing multibyte char
    NFA_END_COMPOSING,			// end of a composing char in the NFA
    NFA_ANY_COMPOSING,			// \%C: any composing characters
    NFA_OPT_CHARS,			// \%[abc]

    // These only appear in the postfix form, never in the NFA itself.
    NFA_PREV_ATOM_NO_WIDTH,		// for \@=
    NFA_PREV_ATOM_NO_WIDTH_NEG,		// for \@!
    NFA_PREV_ATOM_JUST_BEFORE,		// for \@<=
    NFA_PREV_ATOM_JUST_BEFORE_NEG,	// for \@<!
    NFA_PREV_ATOM_LIKE_PATTERN,		// for \@>

    // Back references
    NFA_BACKREF1,			// \1
    NFA_BACKREF2,			// \2
    NFA_BACKREF3,			// \3
    NFA_BACKREF4,			// \4
    NFA_BACKREF5,			// \5
    NFA_BACKREF6,			// \6
    NFA_BACKREF7,			// \7
    NFA_BACKREF8,			// \8
    NFA_BACKREF9,			// \9
#ifdef FEAT_SYN_HL
    NFA_ZREF1,				// \z1
    NFA_ZREF2,				// \z2
    NFA_ZREF3,				// \z3
    NFA_ZREF4,				// \z4
    NFA_ZREF5,				// \z5
    NFA_ZREF6,				// \z6
    NFA_ZREF7,				// \z7
    NFA_ZREF8,				// \z8
    NFA_ZREF9,				// \z9
#endif
    NFA_SKIP,				// skip characters

    NFA_MOPEN,
    NFA_MOPEN1,
    NFA_MOPEN2,
    NFA_MOPEN3,
    NFA_MOPEN4,
    NFA_MOPEN5,
    NFA_MOPEN6,
    NFA_MOPEN7,
    NFA_MOPEN8,
    NFA_MOPEN9,

    NFA_MCLOSE,
    NFA_MCLOSE1,
    NFA_MCLOSE2,
    NFA_MCLOSE3,
    NFA_MCLOSE4,
    NFA_MCLOSE5,
    NFA_MCLOSE6,
    NFA_MCLOSE7,
    NFA_MCLOSE8,
    NFA_MCLOSE9,

#ifdef FEAT_SYN_HL
    NFA_ZOPEN,
    NFA_ZOPEN1,
    NFA_ZOPEN2,
    NFA_ZOPEN3,
    NFA_ZOPEN4,
    NFA_ZOPEN5,
    NFA_ZOPEN6,
    NFA_ZOPEN7,
    NFA_ZOPEN8,
    NFA_ZOPEN9,

    NFA_ZCLOSE,
    NFA_ZCLOSE1,
    NFA_ZCLOSE2,
    NFA_ZCLOSE3,
    NFA_ZCLOSE4,
    NFA_ZCLOSE5,
    NFA_ZCLOSE6,
    NFA_ZCLOSE7,
    NFA_ZCLOSE8,
    NFA_ZCLOSE9,
#endif

    // Character classes.  NFA_ANY up to and including NFA_NUPPER_IC is the
    // range to which NFA_ADD_NL can be added.
    NFA_ANY,				// any single character
    NFA_IDENT,				// identifier char
    NFA_SIDENT,				// identifier char but no digit
    NFA_KWORD,				// keyword char
    NFA_SKWORD,				// keyword char but no digit
    NFA_FNAME,				// file name char
    NFA_SFNAME,				// file name char but no digit
    NFA_PRINT,				// printable char
    NFA_SPRINT,				// printable char but no digit
    NFA_WHITE,				// whitespace char
    NFA_NWHITE,				// non-whitespace char
    NFA_DIGIT,				// digit char
    NFA_NDIGIT,				// non-digit char
    NFA_HEX,				// hex char
    NFA_NHEX,				// non-hex char
    NFA_OCTAL,				// octal char
    NFA_NOCTAL,				// non-octal char
    NFA_WORD,				// word char
    NFA_NWORD,				// non-word char
    NFA_HEAD,				// head char
    NFA_NHEAD,				// non-head char
    NFA_ALPHA,				// alpha char
    NFA_NALPHA,				// non-alpha char
    NFA_LOWER,				// lowercase char
    NFA_NLOWER,				// non-lowercase char
    NFA_UPPER,				// uppercase char
    NFA_NUPPER,				// non-uppercase char
    NFA_LOWER_IC,			// [a-z]
    NFA_NLOWER_IC,			// [^a-z]
    NFA_UPPER_IC,			// [A-Z]
    NFA_NUPPER_IC,			// [^A-Z]

    // First and last code of the "class plus NL" range.
    NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
    NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

    // Position dependent items
    NFA_CURSOR,				// cursor position
    NFA_LNUM,				// line number
    NFA_LNUM_GT,			// > line number
    NFA_LNUM_LT,			// < line number
    NFA_COL,				// cursor column
    NFA_COL_GT,				// > cursor column
    NFA_COL_LT,				// < cursor column
    NFA_VCOL,				// cursor virtual column
    NFA_VCOL_GT,			// > cursor virtual column
    NFA_VCOL_LT,			// < cursor virtual column
    NFA_MARK,				// mark
    NFA_MARK_GT,			// > mark
    NFA_MARK_LT,			// < mark
    NFA_VISUAL,				// Visual area

    // Bracket character classes such as [:alnum:]
    NFA_CLASS_ALNUM,
    NFA_CLASS_ALPHA,
    NFA_CLASS_BLANK,
    NFA_CLASS_CNTRL,
    NFA_CLASS_DIGIT,
    NFA_CLASS_GRAPH,
    NFA_CLASS_LOWER,
    NFA_CLASS_PRINT,
    NFA_CLASS_PUNCT,
    NFA_CLASS_SPACE,
    NFA_CLASS_UPPER,
    NFA_CLASS_XDIGIT,
    NFA_CLASS_TAB,
    NFA_CLASS_RETURN,
    NFA_CLASS_BACKSPACE,
    NFA_CLASS_ESCAPE,
    NFA_CLASS_IDENT,
    NFA_CLASS_KEYWORD,
    NFA_CLASS_FNAME
};
234
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100235// Keep in sync with classchars.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200236static int nfa_classcodes[] = {
237 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243 NFA_UPPER, NFA_NUPPER
244};
245
Bram Moolenaar0270f382018-07-17 05:43:58 +0200246// Variables only used in nfa_regcomp() and descendants.
247static int nfa_re_flags; // re_flags passed to nfa_regcomp()
248static int *post_start; // holds the postfix form of r.e.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200249static int *post_end;
250static int *post_ptr;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100251
252// Set when the pattern should use the NFA engine.
253// E.g. [[:upper:]] only allows 8bit characters for BT engine,
254// while NFA engine handles multibyte characters correctly.
255static int wants_nfa;
256
Bram Moolenaar0270f382018-07-17 05:43:58 +0200257static int nstate; // Number of states in the NFA.
258static int istate; // Index in the state vector, used in alloc_state()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200259
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100260// If not NULL match must end at this position
Bram Moolenaar307aa162013-06-02 16:34:21 +0200261static save_se_T *nfa_endp = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200262
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100263// 0 for first call to nfa_regmatch(), 1 for recursive call.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +0200264static int nfa_ll_index = 0;
265
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100266static int realloc_post_list(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100267static int nfa_reg(int paren);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200268#ifdef DEBUG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100269static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200270#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100271static int match_follows(nfa_state_T *startstate, int depth);
272static int failure_chance(nfa_state_T *state, int depth);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200273
// Helper for the re2post() ... regatom() parsing functions: append "c" to the
// postfix list.  When the list is full it is first enlarged with
// realloc_post_list(); if that fails the *calling* function returns FAIL.
#define EMIT(c)	do { \
		    if (post_ptr < post_end || realloc_post_list() != FAIL) \
			*post_ptr++ = (c); \
		    else \
			return FAIL; \
		} while (0)
280
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200281/*
282 * Initialize internal variables before NFA compilation.
283 * Return OK on success, FAIL otherwise.
284 */
285 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100286nfa_regcomp_start(
287 char_u *expr,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100288 int re_flags) // see vim_regcomp()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200289{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200290 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200291 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200292
293 nstate = 0;
294 istate = 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295 // A reasonable estimation for maximum size
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200296 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200297
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100298 // Some items blow up in size, such as [A-z]. Add more space for that.
299 // When it is still not enough realloc_post_list() will be used.
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200300 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200301
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302 // Size for postfix representation of expr.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200303 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200304
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200305 post_start = alloc(postfix_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200306 if (post_start == NULL)
307 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200308 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200309 post_end = post_start + nstate_max;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100310 wants_nfa = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +0200311 rex.nfa_has_zend = FALSE;
312 rex.nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200313
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100314 // shared with BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200315 regcomp_start(expr, re_flags);
316
317 return OK;
318}
319
320/*
Bram Moolenaard89616e2013-06-06 18:46:06 +0200321 * Figure out if the NFA state list starts with an anchor, must match at start
322 * of the line.
323 */
324 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100325nfa_get_reganch(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200326{
327 nfa_state_T *p = start;
328
329 if (depth > 4)
330 return 0;
331
332 while (p != NULL)
333 {
334 switch (p->c)
335 {
336 case NFA_BOL:
337 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100338 return 1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200339
340 case NFA_ZSTART:
341 case NFA_ZEND:
342 case NFA_CURSOR:
343 case NFA_VISUAL:
344
345 case NFA_MOPEN:
346 case NFA_MOPEN1:
347 case NFA_MOPEN2:
348 case NFA_MOPEN3:
349 case NFA_MOPEN4:
350 case NFA_MOPEN5:
351 case NFA_MOPEN6:
352 case NFA_MOPEN7:
353 case NFA_MOPEN8:
354 case NFA_MOPEN9:
355 case NFA_NOPEN:
356#ifdef FEAT_SYN_HL
357 case NFA_ZOPEN:
358 case NFA_ZOPEN1:
359 case NFA_ZOPEN2:
360 case NFA_ZOPEN3:
361 case NFA_ZOPEN4:
362 case NFA_ZOPEN5:
363 case NFA_ZOPEN6:
364 case NFA_ZOPEN7:
365 case NFA_ZOPEN8:
366 case NFA_ZOPEN9:
367#endif
368 p = p->out;
369 break;
370
371 case NFA_SPLIT:
372 return nfa_get_reganch(p->out, depth + 1)
373 && nfa_get_reganch(p->out1, depth + 1);
374
375 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100376 return 0; // noooo
Bram Moolenaard89616e2013-06-06 18:46:06 +0200377 }
378 }
379 return 0;
380}
381
382/*
383 * Figure out if the NFA state list starts with a character which must match
384 * at start of the match.
385 */
386 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100387nfa_get_regstart(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200388{
389 nfa_state_T *p = start;
390
391 if (depth > 4)
392 return 0;
393
394 while (p != NULL)
395 {
396 switch (p->c)
397 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100398 // all kinds of zero-width matches
Bram Moolenaard89616e2013-06-06 18:46:06 +0200399 case NFA_BOL:
400 case NFA_BOF:
401 case NFA_BOW:
402 case NFA_EOW:
403 case NFA_ZSTART:
404 case NFA_ZEND:
405 case NFA_CURSOR:
406 case NFA_VISUAL:
407 case NFA_LNUM:
408 case NFA_LNUM_GT:
409 case NFA_LNUM_LT:
410 case NFA_COL:
411 case NFA_COL_GT:
412 case NFA_COL_LT:
413 case NFA_VCOL:
414 case NFA_VCOL_GT:
415 case NFA_VCOL_LT:
416 case NFA_MARK:
417 case NFA_MARK_GT:
418 case NFA_MARK_LT:
419
420 case NFA_MOPEN:
421 case NFA_MOPEN1:
422 case NFA_MOPEN2:
423 case NFA_MOPEN3:
424 case NFA_MOPEN4:
425 case NFA_MOPEN5:
426 case NFA_MOPEN6:
427 case NFA_MOPEN7:
428 case NFA_MOPEN8:
429 case NFA_MOPEN9:
430 case NFA_NOPEN:
431#ifdef FEAT_SYN_HL
432 case NFA_ZOPEN:
433 case NFA_ZOPEN1:
434 case NFA_ZOPEN2:
435 case NFA_ZOPEN3:
436 case NFA_ZOPEN4:
437 case NFA_ZOPEN5:
438 case NFA_ZOPEN6:
439 case NFA_ZOPEN7:
440 case NFA_ZOPEN8:
441 case NFA_ZOPEN9:
442#endif
443 p = p->out;
444 break;
445
446 case NFA_SPLIT:
447 {
448 int c1 = nfa_get_regstart(p->out, depth + 1);
449 int c2 = nfa_get_regstart(p->out1, depth + 1);
450
451 if (c1 == c2)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100452 return c1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200453 return 0;
454 }
455
456 default:
Bram Moolenaardecd9542013-06-07 16:31:50 +0200457 if (p->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100458 return p->c; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200459 return 0;
460 }
461 }
462 return 0;
463}
464
465/*
Bram Moolenaar473de612013-06-08 18:19:48 +0200466 * Figure out if the NFA state list contains just literal text and nothing
Bram Moolenaare7766ee2013-06-08 22:30:03 +0200467 * else. If so return a string in allocated memory with what must match after
468 * regstart. Otherwise return NULL.
Bram Moolenaar473de612013-06-08 18:19:48 +0200469 */
470 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100471nfa_get_match_text(nfa_state_T *start)
Bram Moolenaar473de612013-06-08 18:19:48 +0200472{
473 nfa_state_T *p = start;
474 int len = 0;
475 char_u *ret;
476 char_u *s;
477
478 if (p->c != NFA_MOPEN)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100479 return NULL; // just in case
Bram Moolenaar473de612013-06-08 18:19:48 +0200480 p = p->out;
481 while (p->c > 0)
482 {
483 len += MB_CHAR2LEN(p->c);
484 p = p->out;
485 }
486 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
487 return NULL;
488
489 ret = alloc(len);
490 if (ret != NULL)
491 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100492 p = start->out->out; // skip first char, it goes into regstart
Bram Moolenaar473de612013-06-08 18:19:48 +0200493 s = ret;
494 while (p->c > 0)
495 {
Bram Moolenaar473de612013-06-08 18:19:48 +0200496 if (has_mbyte)
497 s += (*mb_char2bytes)(p->c, s);
498 else
Bram Moolenaar473de612013-06-08 18:19:48 +0200499 *s++ = p->c;
500 p = p->out;
501 }
502 *s = NUL;
503 }
504 return ret;
505}
506
507/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200508 * Allocate more space for post_start. Called when
509 * running above the estimated number of states.
510 */
511 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100512realloc_post_list(void)
Bram Moolenaar16299b52013-05-30 18:45:23 +0200513{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200514 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100515 int new_max;
Bram Moolenaar16299b52013-05-30 18:45:23 +0200516 int *new_start;
517 int *old_start;
518
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100519 // For weird patterns the number of states can be very high. Increasing by
520 // 50% seems a reasonable compromise between memory use and speed.
521 new_max = nstate_max * 3 / 2;
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200522 new_start = ALLOC_MULT(int, new_max);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200523 if (new_start == NULL)
524 return FAIL;
525 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
Bram Moolenaar16299b52013-05-30 18:45:23 +0200526 old_start = post_start;
527 post_start = new_start;
528 post_ptr = new_start + (post_ptr - old_start);
529 post_end = post_start + new_max;
530 vim_free(old_start);
531 return OK;
532}
533
534/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200535 * Search between "start" and "end" and try to recognize a
536 * character class in expanded form. For example [0-9].
537 * On success, return the id the character class to be emitted.
538 * On failure, return 0 (=FAIL)
539 * Start points to the first char of the range, while end should point
540 * to the closing brace.
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200541 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
542 * need to be interpreted as [a-zA-Z].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200543 */
544 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100545nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200546{
Bram Moolenaarf8115092013-06-04 17:47:05 +0200547# define CLASS_not 0x80
548# define CLASS_af 0x40
549# define CLASS_AF 0x20
550# define CLASS_az 0x10
551# define CLASS_AZ 0x08
552# define CLASS_o7 0x04
553# define CLASS_o9 0x02
554# define CLASS_underscore 0x01
555
556 int newl = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200557 char_u *p;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200558 int config = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200559
560 if (extra_newl == TRUE)
561 newl = TRUE;
562
563 if (*end != ']')
564 return FAIL;
565 p = start;
566 if (*p == '^')
567 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200568 config |= CLASS_not;
Bram Moolenaar01d89dd2013-06-03 19:41:06 +0200569 p++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200570 }
571
572 while (p < end)
573 {
574 if (p + 2 < end && *(p + 1) == '-')
575 {
576 switch (*p)
577 {
578 case '0':
579 if (*(p + 2) == '9')
580 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200581 config |= CLASS_o9;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200582 break;
583 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200584 if (*(p + 2) == '7')
585 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200586 config |= CLASS_o7;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200587 break;
588 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200589 return FAIL;
590
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200591 case 'a':
592 if (*(p + 2) == 'z')
593 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200594 config |= CLASS_az;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200595 break;
596 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200597 if (*(p + 2) == 'f')
598 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200599 config |= CLASS_af;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200600 break;
601 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200602 return FAIL;
603
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200604 case 'A':
605 if (*(p + 2) == 'Z')
606 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200607 config |= CLASS_AZ;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200608 break;
609 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200610 if (*(p + 2) == 'F')
611 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200612 config |= CLASS_AF;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200613 break;
614 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200615 return FAIL;
616
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200617 default:
618 return FAIL;
619 }
620 p += 3;
621 }
622 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
623 {
624 newl = TRUE;
625 p += 2;
626 }
627 else if (*p == '_')
628 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200629 config |= CLASS_underscore;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200630 p ++;
631 }
632 else if (*p == '\n')
633 {
634 newl = TRUE;
635 p ++;
636 }
637 else
638 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100639 } // while (p < end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200640
641 if (p != end)
642 return FAIL;
643
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200644 if (newl == TRUE)
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200645 extra_newl = NFA_ADD_NL;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200646
647 switch (config)
648 {
649 case CLASS_o9:
650 return extra_newl + NFA_DIGIT;
651 case CLASS_not | CLASS_o9:
652 return extra_newl + NFA_NDIGIT;
653 case CLASS_af | CLASS_AF | CLASS_o9:
654 return extra_newl + NFA_HEX;
655 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
656 return extra_newl + NFA_NHEX;
657 case CLASS_o7:
658 return extra_newl + NFA_OCTAL;
659 case CLASS_not | CLASS_o7:
660 return extra_newl + NFA_NOCTAL;
661 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
662 return extra_newl + NFA_WORD;
663 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
664 return extra_newl + NFA_NWORD;
665 case CLASS_az | CLASS_AZ | CLASS_underscore:
666 return extra_newl + NFA_HEAD;
667 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
668 return extra_newl + NFA_NHEAD;
669 case CLASS_az | CLASS_AZ:
670 return extra_newl + NFA_ALPHA;
671 case CLASS_not | CLASS_az | CLASS_AZ:
672 return extra_newl + NFA_NALPHA;
673 case CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200674 return extra_newl + NFA_LOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200675 case CLASS_not | CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200676 return extra_newl + NFA_NLOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200677 case CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200678 return extra_newl + NFA_UPPER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200679 case CLASS_not | CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200680 return extra_newl + NFA_NUPPER_IC;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200681 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200682 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200683}
684
685/*
686 * Produce the bytes for equivalence class "c".
687 * Currently only handles latin1, latin9 and utf-8.
688 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
689 * equivalent to 'a OR b OR c'
690 *
691 * NOTE! When changing this function, also update reg_equi_class()
692 */
693 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100694nfa_emit_equi_class(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695{
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200696#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200697
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200698 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
699 || STRCMP(p_enc, "iso-8859-15") == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200700 {
Bram Moolenaar424bcae2022-01-31 14:59:41 +0000701#define A_grave 0xc0
702#define A_acute 0xc1
703#define A_circumflex 0xc2
704#define A_virguilla 0xc3
705#define A_diaeresis 0xc4
706#define A_ring 0xc5
707#define C_cedilla 0xc7
708#define E_grave 0xc8
709#define E_acute 0xc9
710#define E_circumflex 0xca
711#define E_diaeresis 0xcb
712#define I_grave 0xcc
713#define I_acute 0xcd
714#define I_circumflex 0xce
715#define I_diaeresis 0xcf
716#define N_virguilla 0xd1
717#define O_grave 0xd2
718#define O_acute 0xd3
719#define O_circumflex 0xd4
720#define O_virguilla 0xd5
721#define O_diaeresis 0xd6
722#define O_slash 0xd8
723#define U_grave 0xd9
724#define U_acute 0xda
725#define U_circumflex 0xdb
726#define U_diaeresis 0xdc
727#define Y_acute 0xdd
728#define a_grave 0xe0
729#define a_acute 0xe1
730#define a_circumflex 0xe2
731#define a_virguilla 0xe3
732#define a_diaeresis 0xe4
733#define a_ring 0xe5
734#define c_cedilla 0xe7
735#define e_grave 0xe8
736#define e_acute 0xe9
737#define e_circumflex 0xea
738#define e_diaeresis 0xeb
739#define i_grave 0xec
740#define i_acute 0xed
741#define i_circumflex 0xee
742#define i_diaeresis 0xef
743#define n_virguilla 0xf1
744#define o_grave 0xf2
745#define o_acute 0xf3
746#define o_circumflex 0xf4
747#define o_virguilla 0xf5
748#define o_diaeresis 0xf6
749#define o_slash 0xf8
750#define u_grave 0xf9
751#define u_acute 0xfa
752#define u_circumflex 0xfb
753#define u_diaeresis 0xfc
754#define y_acute 0xfd
755#define y_diaeresis 0xff
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200756 switch (c)
757 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200758 case 'A': case A_grave: case A_acute: case A_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200759 case A_virguilla: case A_diaeresis: case A_ring:
760 case 0x100: case 0x102: case 0x104: case 0x1cd:
761 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
762 case 0x202: case 0x226: case 0x23a: case 0x1e00:
763 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
764 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
765 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
766 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
767 EMIT2(A_circumflex) EMIT2(A_virguilla)
768 EMIT2(A_diaeresis) EMIT2(A_ring)
769 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
770 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
771 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
772 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
773 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
774 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
775 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
776 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200777 return OK;
778
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200779 case 'B': case 0x181: case 0x243: case 0x1e02:
780 case 0x1e04: case 0x1e06:
781 EMIT2('B')
782 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
783 EMIT2(0x1e04) EMIT2(0x1e06)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200784 return OK;
785
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200786 case 'C': case C_cedilla: case 0x106: case 0x108:
787 case 0x10a: case 0x10c: case 0x187: case 0x23b:
788 case 0x1e08: case 0xa792:
789 EMIT2('C') EMIT2(C_cedilla)
790 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
791 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
792 EMIT2(0x1e08) EMIT2(0xa792)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200793 return OK;
794
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200795 case 'D': case 0x10e: case 0x110: case 0x18a:
796 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
797 case 0x1e12:
798 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
799 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
800 EMIT2(0x1e10) EMIT2(0x1e12)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200801 return OK;
802
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200803 case 'E': case E_grave: case E_acute: case E_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200804 case E_diaeresis: case 0x112: case 0x114: case 0x116:
805 case 0x118: case 0x11a: case 0x204: case 0x206:
806 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
807 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
808 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
809 case 0x1ec2: case 0x1ec4: case 0x1ec6:
810 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
811 EMIT2(E_circumflex) EMIT2(E_diaeresis)
812 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
813 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
814 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
815 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
816 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
817 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
818 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
819 EMIT2(0x1ec6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200820 return OK;
821
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200822 case 'F': case 0x191: case 0x1e1e: case 0xa798:
823 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200824 return OK;
825
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200826 case 'G': case 0x11c: case 0x11e: case 0x120:
827 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
828 case 0x1f4: case 0x1e20: case 0xa7a0:
829 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
830 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
831 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
832 EMIT2(0xa7a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200833 return OK;
834
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200835 case 'H': case 0x124: case 0x126: case 0x21e:
836 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
837 case 0x1e2a: case 0x2c67:
838 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
839 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
840 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200841 return OK;
842
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200843 case 'I': case I_grave: case I_acute: case I_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200844 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
845 case 0x12e: case 0x130: case 0x197: case 0x1cf:
846 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
847 case 0x1ec8: case 0x1eca:
848 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
849 EMIT2(I_circumflex) EMIT2(I_diaeresis)
850 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
851 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
852 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
853 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
854 EMIT2(0x1eca)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200855 return OK;
856
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200857 case 'J': case 0x134: case 0x248:
858 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200859 return OK;
860
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200861 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
862 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
863 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
864 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
865 EMIT2(0x2c69) EMIT2(0xa740)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200866 return OK;
867
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200868 case 'L': case 0x139: case 0x13b: case 0x13d:
869 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
870 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
871 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
872 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
873 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
874 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200875 return OK;
876
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200877 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
878 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
879 EMIT2(0x1e42)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200880 return OK;
881
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200882 case 'N': case N_virguilla:
883 case 0x143: case 0x145: case 0x147: case 0x1f8:
884 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
885 case 0xa7a4:
886 EMIT2('N') EMIT2(N_virguilla)
887 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
888 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
889 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200890 return OK;
891
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200892 case 'O': case O_grave: case O_acute: case O_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200893 case O_virguilla: case O_diaeresis: case O_slash:
894 case 0x14c: case 0x14e: case 0x150: case 0x19f:
895 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
896 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
897 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
898 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
899 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
900 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
901 case 0x1ede: case 0x1ee0: case 0x1ee2:
902 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
903 EMIT2(O_circumflex) EMIT2(O_virguilla)
904 EMIT2(O_diaeresis) EMIT2(O_slash)
905 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
906 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
907 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
908 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
909 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
910 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
911 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
912 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
913 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
914 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
915 EMIT2(0x1ee2)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200916 return OK;
917
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200918 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
919 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
920 EMIT2(0x2c63)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200921 return OK;
922
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200923 case 'Q': case 0x24a:
924 EMIT2('Q') EMIT2(0x24a)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200925 return OK;
926
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200927 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
928 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
929 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
930 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
931 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
932 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
933 EMIT2(0xa7a6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200934 return OK;
935
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200936 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
937 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
938 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
939 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
940 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
941 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
942 EMIT2(0xa7a8)
943 return OK;
944
945 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
946 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
947 case 0x1e6e: case 0x1e70:
948 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
949 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
950 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200951 return OK;
952
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200953 case 'U': case U_grave: case U_acute: case U_diaeresis:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200954 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
955 case 0x16e: case 0x170: case 0x172: case 0x1af:
956 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
957 case 0x1db: case 0x214: case 0x216: case 0x244:
958 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
959 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
960 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
961 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
962 EMIT2(U_diaeresis) EMIT2(U_circumflex)
963 EMIT2(0x168) EMIT2(0x16a)
964 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
965 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
966 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
967 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
968 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
969 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
970 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
971 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
972 EMIT2(0x1ef0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200973 return OK;
974
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200975 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
976 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200977 return OK;
978
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200979 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
980 case 0x1e86: case 0x1e88:
981 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
982 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200983 return OK;
984
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 case 'X': case 0x1e8a: case 0x1e8c:
986 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200987 return OK;
988
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200989 case 'Y': case Y_acute: case 0x176: case 0x178:
990 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
991 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
992 EMIT2('Y') EMIT2(Y_acute)
993 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
994 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
995 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
996 EMIT2(0x1ef8)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200997 return OK;
998
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200999 case 'Z': case 0x179: case 0x17b: case 0x17d:
1000 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1001 case 0x2c6b:
1002 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1003 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1004 EMIT2(0x1e94) EMIT2(0x2c6b)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001005 return OK;
1006
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001007 case 'a': case a_grave: case a_acute: case a_circumflex:
1008 case a_virguilla: case a_diaeresis: case a_ring:
1009 case 0x101: case 0x103: case 0x105: case 0x1ce:
1010 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1011 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1012 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1013 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1014 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1015 case 0x1eb7: case 0x2c65:
1016 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1017 EMIT2(a_circumflex) EMIT2(a_virguilla)
1018 EMIT2(a_diaeresis) EMIT2(a_ring)
1019 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1020 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1021 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1022 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1023 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1024 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1025 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1026 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1027 EMIT2(0x1eb7) EMIT2(0x2c65)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001028 return OK;
1029
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001030 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1031 case 0x1e03: case 0x1e05: case 0x1e07:
1032 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1033 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001034 return OK;
1035
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001036 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1037 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1038 case 0xa794:
1039 EMIT2('c') EMIT2(c_cedilla)
1040 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1041 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1042 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001043 return OK;
1044
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001045 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1046 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1047 case 0x1e11: case 0x1e13:
1048 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1049 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1050 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1051 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001052 return OK;
1053
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001054 case 'e': case e_grave: case e_acute: case e_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001055 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1056 case 0x119: case 0x11b: case 0x205: case 0x207:
1057 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1058 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1059 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1060 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1061 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1062 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1063 EMIT2(0x113) EMIT2(0x115)
1064 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1065 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1066 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1067 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1068 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1069 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1070 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001071 return OK;
1072
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001073 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1074 case 0x1e1f: case 0xa799:
1075 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1076 EMIT2(0x1e1f) EMIT2(0xa799)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001077 return OK;
1078
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001079 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1080 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1081 case 0x1e21: case 0xa7a1:
1082 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1083 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1084 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1085 EMIT2(0x1e21) EMIT2(0xa7a1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001086 return OK;
1087
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001088 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1089 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1090 case 0x1e96: case 0x2c68: case 0xa795:
1091 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1092 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1093 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1094 EMIT2(0x2c68) EMIT2(0xa795)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001095 return OK;
1096
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001097 case 'i': case i_grave: case i_acute: case i_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001098 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1099 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1100 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1101 case 0x1ec9: case 0x1ecb:
1102 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1103 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1104 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1105 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1106 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1107 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1108 EMIT2(0x1ecb) EMIT2(0x1ecb)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001109 return OK;
1110
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001111 case 'j': case 0x135: case 0x1f0: case 0x249:
1112 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001113 return OK;
1114
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001115 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1116 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1117 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1118 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1119 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001120 return OK;
1121
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001122 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1123 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1124 case 0x1e3d: case 0x2c61:
1125 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1126 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1127 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1128 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001129 return OK;
1130
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001131 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1132 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1133 EMIT2(0x1e41) EMIT2(0x1e43)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001134 return OK;
1135
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001136 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1137 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1138 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1139 EMIT2('n') EMIT2(n_virguilla)
1140 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1141 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1142 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1143 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001144 return OK;
1145
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001146 case 'o': case o_grave: case o_acute: case o_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001147 case o_virguilla: case o_diaeresis: case o_slash:
1148 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1149 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1150 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1151 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1152 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1153 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1154 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1155 case 0x1edf: case 0x1ee1: case 0x1ee3:
1156 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1157 EMIT2(o_circumflex) EMIT2(o_virguilla)
1158 EMIT2(o_diaeresis) EMIT2(o_slash)
1159 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1160 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1161 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1162 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1163 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1164 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1165 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1166 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1167 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1168 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1169 EMIT2(0x1ee3)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001170 return OK;
1171
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001172 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1173 case 0x1e55: case 0x1e57:
1174 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1175 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001176 return OK;
1177
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001178 case 'q': case 0x24b: case 0x2a0:
1179 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001180 return OK;
1181
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001182 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1183 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1184 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1185 case 0xa7a7:
1186 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1187 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1188 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1189 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001190 return OK;
1191
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001192 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1193 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1194 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1195 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1196 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1197 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1198 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1199 return OK;
1200
1201 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1202 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1203 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1204 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1205 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1206 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1207 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001208 return OK;
1209
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001210 case 'u': case u_grave: case u_acute: case u_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001211 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1212 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1213 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1214 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1215 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1216 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1217 case 0x1eed: case 0x1eef: case 0x1ef1:
1218 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1219 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1220 EMIT2(0x169) EMIT2(0x16b)
1221 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1222 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1223 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1224 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1225 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1226 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1227 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1228 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1229 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001230 return OK;
1231
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001232 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1233 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1234 EMIT2(0x1e7f)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001235 return OK;
1236
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001237 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1238 case 0x1e87: case 0x1e89: case 0x1e98:
1239 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1240 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001241 return OK;
1242
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001243 case 'x': case 0x1e8b: case 0x1e8d:
1244 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001245 return OK;
1246
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001247 case 'y': case y_acute: case y_diaeresis: case 0x177:
1248 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1249 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1250 case 0x1ef9:
1251 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1252 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1253 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1254 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001255 return OK;
1256
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001257 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1258 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1259 case 0x1e95: case 0x2c6c:
1260 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1261 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1262 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001263 return OK;
1264
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001265 // default: character itself
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001266 }
1267 }
1268
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001269 EMIT2(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001270 return OK;
1271#undef EMIT2
1272}
1273
1274/*
1275 * Code to parse a regular expression.
1276 *
1277 * We try to reuse parsing functions in regexp.c to
1278 * minimize surprise and keep the syntax consistent.
1279 */
1280
1281/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001282 * Parse the lowest level.
1283 *
1284 * An atom can be one of a long list of items. Many atoms match one character
1285 * in the text. It is often an ordinary character or a character class.
1286 * Parentheses can be used to make a pattern into an atom. The "\z(\)"
1287 * construct is only for syntax highlighting.
1288 *
1289 * atom ::= ordinary-atom
1290 * or \( pattern \)
1291 * or \%( pattern \)
1292 * or \z( pattern \)
1293 */
1294 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001295nfa_regatom(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001296{
1297 int c;
1298 int charclass;
1299 int equiclass;
1300 int collclass;
1301 int got_coll_char;
1302 char_u *p;
1303 char_u *endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001304 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001305 int extra = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001306 int emit_range;
1307 int negated;
1308 int result;
1309 int startc = -1;
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001310 int save_prev_at_start = prev_at_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001311
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001312 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001313 switch (c)
1314 {
Bram Moolenaar47196582013-05-25 22:04:23 +02001315 case NUL:
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001316 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar47196582013-05-25 22:04:23 +02001317
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001318 case Magic('^'):
1319 EMIT(NFA_BOL);
1320 break;
1321
1322 case Magic('$'):
1323 EMIT(NFA_EOL);
1324#if defined(FEAT_SYN_HL) || defined(PROTO)
1325 had_eol = TRUE;
1326#endif
1327 break;
1328
1329 case Magic('<'):
1330 EMIT(NFA_BOW);
1331 break;
1332
1333 case Magic('>'):
1334 EMIT(NFA_EOW);
1335 break;
1336
1337 case Magic('_'):
1338 c = no_Magic(getchr());
Bram Moolenaar174a8482013-11-28 14:20:17 +01001339 if (c == NUL)
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001340 EMSG_RET_FAIL(_(e_nfa_regexp_end_encountered_prematurely));
Bram Moolenaar174a8482013-11-28 14:20:17 +01001341
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001342 if (c == '^') // "\_^" is start-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001343 {
1344 EMIT(NFA_BOL);
1345 break;
1346 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001347 if (c == '$') // "\_$" is end-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001348 {
1349 EMIT(NFA_EOL);
1350#if defined(FEAT_SYN_HL) || defined(PROTO)
1351 had_eol = TRUE;
1352#endif
1353 break;
1354 }
1355
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001356 extra = NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001357
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001358 // "\_[" is collection plus newline
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001359 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001360 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001361
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001362 // "\_x" is character class plus newline
1363 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001364
1365 /*
1366 * Character classes.
1367 */
1368 case Magic('.'):
1369 case Magic('i'):
1370 case Magic('I'):
1371 case Magic('k'):
1372 case Magic('K'):
1373 case Magic('f'):
1374 case Magic('F'):
1375 case Magic('p'):
1376 case Magic('P'):
1377 case Magic('s'):
1378 case Magic('S'):
1379 case Magic('d'):
1380 case Magic('D'):
1381 case Magic('x'):
1382 case Magic('X'):
1383 case Magic('o'):
1384 case Magic('O'):
1385 case Magic('w'):
1386 case Magic('W'):
1387 case Magic('h'):
1388 case Magic('H'):
1389 case Magic('a'):
1390 case Magic('A'):
1391 case Magic('l'):
1392 case Magic('L'):
1393 case Magic('u'):
1394 case Magic('U'):
1395 p = vim_strchr(classchars, no_Magic(c));
1396 if (p == NULL)
1397 {
Bram Moolenaar174a8482013-11-28 14:20:17 +01001398 if (extra == NFA_ADD_NL)
1399 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001400 semsg(_(e_nfa_regexp_invalid_character_class_nr), c);
Bram Moolenaar174a8482013-11-28 14:20:17 +01001401 rc_did_emsg = TRUE;
1402 return FAIL;
1403 }
Bram Moolenaarb5443cc2019-01-15 20:19:40 +01001404 siemsg("INTERNAL: Unknown character class char: %d", c);
Bram Moolenaar5714b802013-05-28 22:03:20 +02001405 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001406 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001407
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001408 // When '.' is followed by a composing char ignore the dot, so that
1409 // the composing char is matched here.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001410 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1411 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001412 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001413 c = getchr();
1414 goto nfa_do_multibyte;
1415 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001416 EMIT(nfa_classcodes[p - classchars]);
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001417 if (extra == NFA_ADD_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001418 {
1419 EMIT(NFA_NEWL);
1420 EMIT(NFA_OR);
1421 regflags |= RF_HASNL;
1422 }
1423 break;
1424
1425 case Magic('n'):
1426 if (reg_string)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001427 // In a string "\n" matches a newline character.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001428 EMIT(NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001429 else
1430 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001431 // In buffer text "\n" matches the end of a line.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001432 EMIT(NFA_NEWL);
1433 regflags |= RF_HASNL;
1434 }
1435 break;
1436
1437 case Magic('('):
1438 if (nfa_reg(REG_PAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001439 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001440 break;
1441
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001442 case Magic('|'):
1443 case Magic('&'):
1444 case Magic(')'):
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001445 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001446 return FAIL;
1447
1448 case Magic('='):
1449 case Magic('?'):
1450 case Magic('+'):
1451 case Magic('@'):
1452 case Magic('*'):
1453 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001454 // these should follow an atom, not form an atom
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001455 semsg(_(e_nfa_regexp_misplaced_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001456 return FAIL;
1457
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001458 case Magic('~'):
1459 {
1460 char_u *lp;
1461
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001462 // Previous substitute pattern.
1463 // Generated as "\%(pattern\)".
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001464 if (reg_prev_sub == NULL)
1465 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001466 emsg(_(e_no_previous_substitute_regular_expression));
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001467 return FAIL;
1468 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001469 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001470 {
1471 EMIT(PTR2CHAR(lp));
1472 if (lp != reg_prev_sub)
1473 EMIT(NFA_CONCAT);
1474 }
1475 EMIT(NFA_NOPEN);
1476 break;
1477 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001478
Bram Moolenaar428e9872013-05-30 17:05:39 +02001479 case Magic('1'):
1480 case Magic('2'):
1481 case Magic('3'):
1482 case Magic('4'):
1483 case Magic('5'):
1484 case Magic('6'):
1485 case Magic('7'):
1486 case Magic('8'):
1487 case Magic('9'):
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001488 {
1489 int refnum = no_Magic(c) - '1';
1490
1491 if (!seen_endbrace(refnum + 1))
1492 return FAIL;
1493 EMIT(NFA_BACKREF1 + refnum);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001494 rex.nfa_has_backref = TRUE;
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001495 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02001496 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001497
1498 case Magic('z'):
1499 c = no_Magic(getchr());
1500 switch (c)
1501 {
1502 case 's':
1503 EMIT(NFA_ZSTART);
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001504 if (re_mult_next("\\zs") == FAIL)
1505 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001506 break;
1507 case 'e':
1508 EMIT(NFA_ZEND);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001509 rex.nfa_has_zend = TRUE;
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001510 if (re_mult_next("\\ze") == FAIL)
1511 return FAIL;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02001512 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001513#ifdef FEAT_SYN_HL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001514 case '1':
1515 case '2':
1516 case '3':
1517 case '4':
1518 case '5':
1519 case '6':
1520 case '7':
1521 case '8':
1522 case '9':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001523 // \z1...\z9
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001524 if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001525 EMSG_RET_FAIL(_(e_z1_z9_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001526 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001527 // No need to set rex.nfa_has_backref, the sub-matches don't
1528 // change when \z1 .. \z9 matches or not.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001529 re_has_z = REX_USE;
1530 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001531 case '(':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001532 // \z(
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001533 if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001534 EMSG_RET_FAIL(_(e_z_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001535 if (nfa_reg(REG_ZPAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001536 return FAIL; // cascaded error
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001537 re_has_z = REX_SET;
1538 break;
1539#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001540 default:
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001541 semsg(_(e_nfa_regexp_unknown_operator_z_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001542 return FAIL;
1543 }
1544 break;
1545
1546 case Magic('%'):
1547 c = no_Magic(getchr());
1548 switch (c)
1549 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001550 // () without a back reference
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001551 case '(':
1552 if (nfa_reg(REG_NPAREN) == FAIL)
1553 return FAIL;
1554 EMIT(NFA_NOPEN);
1555 break;
1556
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001557 case 'd': // %d123 decimal
1558 case 'o': // %o123 octal
1559 case 'x': // %xab hex 2
1560 case 'u': // %uabcd hex 4
1561 case 'U': // %U1234abcd hex 8
Bram Moolenaar47196582013-05-25 22:04:23 +02001562 {
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001563 long nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001564
Bram Moolenaar47196582013-05-25 22:04:23 +02001565 switch (c)
1566 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001567 case 'd': nr = getdecchrs(); break;
1568 case 'o': nr = getoctchrs(); break;
1569 case 'x': nr = gethexchrs(2); break;
1570 case 'u': nr = gethexchrs(4); break;
1571 case 'U': nr = gethexchrs(8); break;
1572 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +02001573 }
1574
Bram Moolenaar527a2d82019-02-21 22:28:51 +01001575 if (nr < 0 || nr > INT_MAX)
Bram Moolenaara6f79292022-01-04 21:30:47 +00001576 EMSG2_RET_FAIL(_(e_invalid_character_after_str_2),
1577 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001578 // A NUL is stored in the text as NL
1579 // TODO: what if a composing character follows?
Bram Moolenaar595cad22013-09-22 13:57:24 +02001580 EMIT(nr == 0 ? 0x0a : nr);
Bram Moolenaar47196582013-05-25 22:04:23 +02001581 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001582 break;
1583
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001584 // Catch \%^ and \%$ regardless of where they appear in the
1585 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001586 case '^':
1587 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001588 break;
1589
1590 case '$':
1591 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001592 break;
1593
1594 case '#':
Bram Moolenaar423532e2013-05-29 21:14:42 +02001595 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001596 break;
1597
1598 case 'V':
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001599 EMIT(NFA_VISUAL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001600 break;
1601
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02001602 case 'C':
1603 EMIT(NFA_ANY_COMPOSING);
1604 break;
1605
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001606 case '[':
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001607 {
1608 int n;
1609
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001610 // \%[abc]
Bram Moolenaard7986252013-06-17 21:33:41 +02001611 for (n = 0; (c = peekchr()) != ']'; ++n)
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001612 {
1613 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001614 EMSG2_RET_FAIL(_(e_missing_sb_after_str),
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001615 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001616 // recursive call!
Bram Moolenaard7986252013-06-17 21:33:41 +02001617 if (nfa_regatom() == FAIL)
1618 return FAIL;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001619 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001620 getchr(); // get the ]
Bram Moolenaar2976c022013-06-05 21:30:37 +02001621 if (n == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001622 EMSG2_RET_FAIL(_(e_empty_str_brackets),
Bram Moolenaar2976c022013-06-05 21:30:37 +02001623 reg_magic == MAGIC_ALL);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001624 EMIT(NFA_OPT_CHARS);
1625 EMIT(n);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001626
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001627 // Emit as "\%(\%[abc]\)" to be able to handle
1628 // "\%[abc]*" which would cause the empty string to be
1629 // matched an unlimited number of times. NFA_NOPEN is
1630 // added only once at a position, while NFA_SPLIT is
1631 // added multiple times. This is more efficient than
1632 // not allowing NFA_SPLIT multiple times, it is used
1633 // a lot.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001634 EMIT(NFA_NOPEN);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001635 break;
1636 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02001637
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001638 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +02001639 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001640 long_u n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001641 int cmp = c;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001642 int cur = FALSE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001643
1644 if (c == '<' || c == '>')
1645 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001646 if (no_Magic(c) == '.')
1647 {
1648 cur = TRUE;
1649 c = getchr();
1650 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001651 while (VIM_ISDIGIT(c))
1652 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001653 long_u tmp;
1654
1655 if (cur)
1656 semsg(_(e_regexp_number_after_dot_pos_search),
1657 no_Magic(c));
1658 tmp = n * 10 + (c - '0');
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001659
1660 if (tmp < n)
1661 {
1662 // overflow.
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001663 emsg(_(e_percent_value_too_large));
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001664 return FAIL;
1665 }
1666 n = tmp;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001667 c = getchr();
1668 }
1669 if (c == 'l' || c == 'c' || c == 'v')
1670 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001671 long_u limit = INT_MAX;
Bram Moolenaar9403a212019-02-13 18:35:06 +01001672
Bram Moolenaar423532e2013-05-29 21:14:42 +02001673 if (c == 'l')
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001674 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001675 if (cur)
1676 n = curwin->w_cursor.lnum;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001677 // \%{n}l \%{n}<l \%{n}>l
Bram Moolenaar423532e2013-05-29 21:14:42 +02001678 EMIT(cmp == '<' ? NFA_LNUM_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001679 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001680 if (save_prev_at_start)
1681 at_start = TRUE;
1682 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001683 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001684 {
1685 if (cur)
1686 {
1687 n = curwin->w_cursor.col;
1688 n++;
1689 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001690 // \%{n}c \%{n}<c \%{n}>c
Bram Moolenaar423532e2013-05-29 21:14:42 +02001691 EMIT(cmp == '<' ? NFA_COL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001692 cmp == '>' ? NFA_COL_GT : NFA_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001693 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001694 else
Bram Moolenaar9403a212019-02-13 18:35:06 +01001695 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001696 if (cur)
1697 {
1698 colnr_T vcol = 0;
1699
1700 getvvcol(curwin, &curwin->w_cursor,
1701 NULL, NULL, &vcol);
1702 n = ++vcol;
1703 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001704 // \%{n}v \%{n}<v \%{n}>v
Bram Moolenaar423532e2013-05-29 21:14:42 +02001705 EMIT(cmp == '<' ? NFA_VCOL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001706 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
Bram Moolenaar9403a212019-02-13 18:35:06 +01001707 limit = INT_MAX / MB_MAXBYTES;
1708 }
1709 if (n >= limit)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001710 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001711 emsg(_(e_percent_value_too_large));
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001712 return FAIL;
1713 }
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001714 EMIT((int)n);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001715 break;
1716 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02001717 else if (c == '\'' && n == 0)
1718 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001719 // \%'m \%<'m \%>'m
Bram Moolenaar044aa292013-06-04 21:27:38 +02001720 EMIT(cmp == '<' ? NFA_MARK_LT :
1721 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001722 EMIT(getchr());
Bram Moolenaar044aa292013-06-04 21:27:38 +02001723 break;
1724 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001725 }
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001726 semsg(_(e_nfa_regexp_unknown_operator_percent_chr), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001727 return FAIL;
1728 }
1729 break;
1730
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001731 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001732collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001733 /*
Bram Moolenaar417bad22013-06-07 14:08:30 +02001734 * [abc] uses NFA_START_COLL - NFA_END_COLL
1735 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1736 * Each character is produced as a regular state, using
1737 * NFA_CONCAT to bind them together.
1738 * Besides normal characters there can be:
1739 * - character classes NFA_CLASS_*
1740 * - ranges, two characters followed by NFA_RANGE.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001741 */
1742
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001743 p = regparse;
1744 endp = skip_anyof(p);
1745 if (*endp == ']')
1746 {
1747 /*
1748 * Try to reverse engineer character classes. For example,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001749 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001750 * and perform the necessary substitutions in the NFA.
1751 */
1752 result = nfa_recognize_char_class(regparse, endp,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001753 extra == NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001754 if (result != FAIL)
1755 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001756 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001757 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001758 EMIT(result - NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001759 EMIT(NFA_NEWL);
1760 EMIT(NFA_OR);
1761 }
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001762 else
1763 EMIT(result);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001764 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001765 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001766 return OK;
1767 }
1768 /*
1769 * Failed to recognize a character class. Use the simple
1770 * version that turns [abc] into 'a' OR 'b' OR 'c'
1771 */
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001772 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001773 negated = FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001774 if (*regparse == '^') // negated range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001775 {
1776 negated = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001777 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001778 EMIT(NFA_START_NEG_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001779 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001780 else
1781 EMIT(NFA_START_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001782 if (*regparse == '-')
1783 {
1784 startc = '-';
1785 EMIT(startc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001786 EMIT(NFA_CONCAT);
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001787 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001788 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001789 // Emit the OR branches for each character in the []
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001790 emit_range = FALSE;
1791 while (regparse < endp)
1792 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001793 int oldstartc = startc;
1794
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001795 startc = -1;
1796 got_coll_char = FALSE;
1797 if (*regparse == '[')
1798 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001799 // Check for [: :], [= =], [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001800 equiclass = collclass = 0;
1801 charclass = get_char_class(&regparse);
1802 if (charclass == CLASS_NONE)
1803 {
1804 equiclass = get_equi_class(&regparse);
1805 if (equiclass == 0)
1806 collclass = get_coll_element(&regparse);
1807 }
1808
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001809 // Character class like [:alpha:]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001810 if (charclass != CLASS_NONE)
1811 {
1812 switch (charclass)
1813 {
1814 case CLASS_ALNUM:
1815 EMIT(NFA_CLASS_ALNUM);
1816 break;
1817 case CLASS_ALPHA:
1818 EMIT(NFA_CLASS_ALPHA);
1819 break;
1820 case CLASS_BLANK:
1821 EMIT(NFA_CLASS_BLANK);
1822 break;
1823 case CLASS_CNTRL:
1824 EMIT(NFA_CLASS_CNTRL);
1825 break;
1826 case CLASS_DIGIT:
1827 EMIT(NFA_CLASS_DIGIT);
1828 break;
1829 case CLASS_GRAPH:
1830 EMIT(NFA_CLASS_GRAPH);
1831 break;
1832 case CLASS_LOWER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001833 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001834 EMIT(NFA_CLASS_LOWER);
1835 break;
1836 case CLASS_PRINT:
1837 EMIT(NFA_CLASS_PRINT);
1838 break;
1839 case CLASS_PUNCT:
1840 EMIT(NFA_CLASS_PUNCT);
1841 break;
1842 case CLASS_SPACE:
1843 EMIT(NFA_CLASS_SPACE);
1844 break;
1845 case CLASS_UPPER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001846 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001847 EMIT(NFA_CLASS_UPPER);
1848 break;
1849 case CLASS_XDIGIT:
1850 EMIT(NFA_CLASS_XDIGIT);
1851 break;
1852 case CLASS_TAB:
1853 EMIT(NFA_CLASS_TAB);
1854 break;
1855 case CLASS_RETURN:
1856 EMIT(NFA_CLASS_RETURN);
1857 break;
1858 case CLASS_BACKSPACE:
1859 EMIT(NFA_CLASS_BACKSPACE);
1860 break;
1861 case CLASS_ESCAPE:
1862 EMIT(NFA_CLASS_ESCAPE);
1863 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001864 case CLASS_IDENT:
1865 EMIT(NFA_CLASS_IDENT);
1866 break;
1867 case CLASS_KEYWORD:
1868 EMIT(NFA_CLASS_KEYWORD);
1869 break;
1870 case CLASS_FNAME:
1871 EMIT(NFA_CLASS_FNAME);
1872 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001873 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001874 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001875 continue;
1876 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001877 // Try equivalence class [=a=] and the like
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001878 if (equiclass != 0)
1879 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02001880 result = nfa_emit_equi_class(equiclass);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001881 if (result == FAIL)
1882 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001883 // should never happen
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00001884 EMSG_RET_FAIL(_(e_error_building_nfa_with_equivalence_class));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001885 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001886 continue;
1887 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001888 // Try collating class like [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001889 if (collclass != 0)
1890 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001891 startc = collclass; // allow [.a.]-x as a range
1892 // Will emit the proper atom at the end of the
1893 // while loop.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001894 }
1895 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001896 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1897 // start character.
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001898 if (*regparse == '-' && oldstartc != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001899 {
1900 emit_range = TRUE;
1901 startc = oldstartc;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001902 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001903 continue; // reading the end of the range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001904 }
1905
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001906 // Now handle simple and escaped characters.
1907 // Only "\]", "\^", "\-" and "\\" are special in Vi. Vim
1908 // accepts "\t", "\e", etc., but only when the 'l' flag in
1909 // 'cpoptions' is not included.
1910 // Posix doesn't recognize backslash at all.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001911 if (*regparse == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001912 && !reg_cpo_bsl
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001913 && regparse + 1 <= endp
1914 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001915 || (!reg_cpo_lit
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001916 && vim_strchr(REGEXP_ABBR, regparse[1])
1917 != NULL)
1918 )
1919 )
1920 {
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001921 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001922
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001923 if (*regparse == 'n')
Bram Moolenaara5483442019-02-17 20:17:02 +01001924 startc = (reg_string || emit_range
1925 || regparse[1] == '-') ? NL : NFA_NEWL;
Bram Moolenaarabab0b02019-03-30 18:47:01 +01001926 else if (*regparse == 'd'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001927 || *regparse == 'o'
1928 || *regparse == 'x'
1929 || *regparse == 'u'
1930 || *regparse == 'U'
1931 )
1932 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001933 // TODO(RE) This needs more testing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001934 startc = coll_get_char();
1935 got_coll_char = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001936 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001937 }
1938 else
1939 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001940 // \r,\t,\e,\b
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001941 startc = backslash_trans(*regparse);
1942 }
1943 }
1944
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001945 // Normal printable char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001946 if (startc == -1)
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001947 startc = PTR2CHAR(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001948
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001949 // Previous char was '-', so this char is end of range.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001950 if (emit_range)
1951 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001952 int endc = startc;
1953
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001954 startc = oldstartc;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001955 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001956 EMSG_RET_FAIL(_(e_reverse_range_in_character_class));
Bram Moolenaar417bad22013-06-07 14:08:30 +02001957
1958 if (endc > startc + 2)
1959 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001960 // Emit a range instead of the sequence of
1961 // individual characters.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001962 if (startc == 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001963 // \x00 is translated to \x0a, start at \x01.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001964 EMIT(1);
1965 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001966 --post_ptr; // remove NFA_CONCAT
Bram Moolenaar417bad22013-06-07 14:08:30 +02001967 EMIT(endc);
1968 EMIT(NFA_RANGE);
1969 EMIT(NFA_CONCAT);
1970 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001971 else if (has_mbyte && ((*mb_char2len)(startc) > 1
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001972 || (*mb_char2len)(endc) > 1))
1973 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001974 // Emit the characters in the range.
1975 // "startc" was already emitted, so skip it.
1976 //
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001977 for (c = startc + 1; c <= endc; c++)
1978 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001979 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001980 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001981 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001982 }
1983 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001984 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001985 // Emit the range. "startc" was already emitted, so
1986 // skip it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001987 for (c = startc + 1; c <= endc; c++)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00001988 {
1989 EMIT(c);
1990 EMIT(NFA_CONCAT);
1991 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001992 }
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001993 emit_range = FALSE;
1994 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001995 }
1996 else
1997 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001998 // This char (startc) is not part of a range. Just
1999 // emit it.
2000 // Normally, simply emit startc. But if we get char
2001 // code=0 from a collating char, then replace it with
2002 // 0x0a.
2003 // This is needed to completely mimic the behaviour of
2004 // the backtracking engine.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002005 if (startc == NFA_NEWL)
2006 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002007 // Line break can't be matched as part of the
2008 // collection, add an OR below. But not for negated
2009 // range.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002010 if (!negated)
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002011 extra = NFA_ADD_NL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002012 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002013 else
Bram Moolenaar417bad22013-06-07 14:08:30 +02002014 {
2015 if (got_coll_char == TRUE && startc == 0)
2016 EMIT(0x0a);
2017 else
2018 EMIT(startc);
2019 EMIT(NFA_CONCAT);
2020 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002021 }
2022
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002023 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002024 } // while (regparse < endp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002025
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002026 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002027 if (*regparse == '-') // if last, '-' is just a char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002028 {
2029 EMIT('-');
Bram Moolenaar417bad22013-06-07 14:08:30 +02002030 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002031 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002032
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002033 // skip the trailing ]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002034 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002035 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002036
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002037 // Mark end of the collection.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002038 if (negated == TRUE)
Bram Moolenaar417bad22013-06-07 14:08:30 +02002039 EMIT(NFA_END_NEG_COLL);
2040 else
2041 EMIT(NFA_END_COLL);
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002042
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002043 // \_[] also matches \n but it's not negated
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002044 if (extra == NFA_ADD_NL)
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002045 {
2046 EMIT(reg_string ? NL : NFA_NEWL);
2047 EMIT(NFA_OR);
2048 }
2049
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002050 return OK;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002051 } // if exists closing ]
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002052
2053 if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002054 EMSG_RET_FAIL(_(e_missing_rsb_after_str_lsb));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002055 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002056
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002057 default:
2058 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002059 int plen;
2060
2061nfa_do_multibyte:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002062 // plen is length of current char with composing chars
Bram Moolenaar47196582013-05-25 22:04:23 +02002063 if (enc_utf8 && ((*mb_char2len)(c)
Bram Moolenaarace95982017-03-29 17:30:27 +02002064 != (plen = utfc_ptr2len(old_regparse))
Bram Moolenaar47196582013-05-25 22:04:23 +02002065 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002066 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02002067 int i = 0;
2068
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002069 // A base character plus composing characters, or just one
2070 // or more composing characters.
2071 // This requires creating a separate atom as if enclosing
2072 // the characters in (), where NFA_COMPOSING is the ( and
2073 // NFA_END_COMPOSING is the ). Note that right now we are
2074 // building the postfix form, not the NFA itself;
2075 // a composing char could be: a, b, c, NFA_COMPOSING
2076 // where 'b' and 'c' are chars with codes > 256.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002077 for (;;)
2078 {
2079 EMIT(c);
2080 if (i > 0)
2081 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002082 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002083 break;
2084 c = utf_ptr2char(old_regparse + i);
2085 }
2086 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002087 regparse = old_regparse + plen;
2088 }
2089 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002090 {
2091 c = no_Magic(c);
2092 EMIT(c);
2093 }
2094 return OK;
2095 }
2096 }
2097
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002098 return OK;
2099}
2100
2101/*
2102 * Parse something followed by possible [*+=].
2103 *
2104 * A piece is an atom, possibly followed by a multi, an indication of how many
2105 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2106 * characters: "", "a", "aa", etc.
2107 *
2108 * piece ::= atom
2109 * or atom multi
2110 */
2111 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002112nfa_regpiece(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002113{
2114 int i;
2115 int op;
2116 int ret;
2117 long minval, maxval;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002118 int greedy = TRUE; // Braces are prefixed with '-' ?
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002119 parse_state_T old_state;
2120 parse_state_T new_state;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01002121 long c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02002122 int old_post_pos;
2123 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002124 int quest;
2125
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002126 // Save the current parse state, so that we can use it if <atom>{m,n} is
2127 // next.
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002128 save_parse_state(&old_state);
2129
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002130 // store current pos in the postfix form, for \{m,n} involving 0s
Bram Moolenaar16299b52013-05-30 18:45:23 +02002131 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002132
2133 ret = nfa_regatom();
2134 if (ret == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002135 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002136
2137 op = peekchr();
2138 if (re_multi_type(op) == NOT_MULTI)
2139 return OK;
2140
2141 skipchr();
2142 switch (op)
2143 {
2144 case Magic('*'):
2145 EMIT(NFA_STAR);
2146 break;
2147
2148 case Magic('+'):
2149 /*
2150 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2151 * first and only submatch would be "aaa". But the backtracking
2152 * engine interprets the plus as "try matching one more time", and
2153 * a* matches a second time at the end of the input, the empty
2154 * string.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002155 * The submatch will be the empty string.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002156 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002157 * In order to be consistent with the old engine, we replace
2158 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002159 */
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002160 restore_parse_state(&old_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002161 curchr = -1;
2162 if (nfa_regatom() == FAIL)
2163 return FAIL;
2164 EMIT(NFA_STAR);
2165 EMIT(NFA_CONCAT);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002166 skipchr(); // skip the \+
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002167 break;
2168
2169 case Magic('@'):
Bram Moolenaar61602c52013-06-01 19:54:43 +02002170 c2 = getdecchrs();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002171 op = no_Magic(getchr());
Bram Moolenaar61602c52013-06-01 19:54:43 +02002172 i = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002173 switch(op)
2174 {
2175 case '=':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002176 // \@=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002177 i = NFA_PREV_ATOM_NO_WIDTH;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002178 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002179 case '!':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002180 // \@!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002181 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002182 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002183 case '<':
Bram Moolenaar61602c52013-06-01 19:54:43 +02002184 op = no_Magic(getchr());
2185 if (op == '=')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002186 // \@<=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002187 i = NFA_PREV_ATOM_JUST_BEFORE;
2188 else if (op == '!')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002189 // \@<!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002190 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2191 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002192 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002193 // \@>
Bram Moolenaar87953742013-06-05 18:52:40 +02002194 i = NFA_PREV_ATOM_LIKE_PATTERN;
2195 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002196 }
Bram Moolenaar61602c52013-06-01 19:54:43 +02002197 if (i == 0)
2198 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002199 semsg(_(e_nfa_regexp_unknown_operator_at_chr), op);
Bram Moolenaar61602c52013-06-01 19:54:43 +02002200 return FAIL;
2201 }
2202 EMIT(i);
2203 if (i == NFA_PREV_ATOM_JUST_BEFORE
2204 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2205 EMIT(c2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002206 break;
2207
2208 case Magic('?'):
2209 case Magic('='):
2210 EMIT(NFA_QUEST);
2211 break;
2212
2213 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002214 // a{2,5} will expand to 'aaa?a?a?'
2215 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2216 // version of '?'
2217 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2218 // parenthesis have the same id
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002219
2220 greedy = TRUE;
2221 c2 = peekchr();
2222 if (c2 == '-' || c2 == Magic('-'))
2223 {
2224 skipchr();
2225 greedy = FALSE;
2226 }
2227 if (!read_limits(&minval, &maxval))
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002228 EMSG_RET_FAIL(_(e_nfa_regexp_error_reading_repetition_limits));
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002229
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002230 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2231 // <atom>*
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002232 if (minval == 0 && maxval == MAX_LIMIT)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002233 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002234 if (greedy) // { { (match the braces)
2235 // \{}, \{0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002236 EMIT(NFA_STAR);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002237 else // { { (match the braces)
2238 // \{-}, \{-0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002239 EMIT(NFA_STAR_NONGREEDY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002240 break;
2241 }
2242
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002243 // Special case: x{0} or x{-0}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002244 if (maxval == 0)
2245 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002246 // Ignore result of previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002247 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002248 // NFA_EMPTY is 0-length and works everywhere
Bram Moolenaar699c1202013-09-25 16:41:54 +02002249 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002250 return OK;
2251 }
2252
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002253 // The engine is very inefficient (uses too many states) when the
2254 // maximum is much larger than the minimum and when the maximum is
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002255 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2256 // will emit NFA_STAR.
2257 // Bail out if we can use the other engine, but only, when the
2258 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00002259 // does not work with characters > 8 bit with the BT engine)
Bram Moolenaara1d2c582015-02-10 18:18:17 +01002260 if ((nfa_re_flags & RE_AUTO)
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002261 && (maxval > 500 || maxval > minval + 200)
2262 && (maxval != MAX_LIMIT && minval < 200)
2263 && !wants_nfa)
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002264 return FAIL;
2265
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002266 // Ignore previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002267 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002268 // Save parse state after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002269 save_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002270
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002271 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2272 for (i = 0; i < maxval; i++)
2273 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002274 // Goto beginning of the repeated atom
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002275 restore_parse_state(&old_state);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002276 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002277 if (nfa_regatom() == FAIL)
2278 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002279 // after "minval" times, atoms are optional
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002280 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002281 {
2282 if (maxval == MAX_LIMIT)
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002283 {
2284 if (greedy)
2285 EMIT(NFA_STAR);
2286 else
2287 EMIT(NFA_STAR_NONGREEDY);
2288 }
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002289 else
2290 EMIT(quest);
2291 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02002292 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002293 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002294 if (i + 1 > minval && maxval == MAX_LIMIT)
2295 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002296 }
2297
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002298 // Go to just after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002299 restore_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002300 curchr = -1;
2301
2302 break;
2303
2304
2305 default:
2306 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002307 } // end switch
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002308
2309 if (re_multi_type(peekchr()) != NOT_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002310 // Can't have a multi follow a multi.
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002311 EMSG_RET_FAIL(_(e_nfa_regexp_cant_have_multi_follow_multi));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002312
2313 return OK;
2314}
2315
2316/*
2317 * Parse one or more pieces, concatenated. It matches a match for the
2318 * first piece, followed by a match for the second piece, etc. Example:
2319 * "f[0-9]b", first matches "f", then a digit and then "b".
2320 *
2321 * concat ::= piece
2322 * or piece piece
2323 * or piece piece piece
2324 * etc.
2325 */
2326 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002327nfa_regconcat(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002328{
2329 int cont = TRUE;
2330 int first = TRUE;
2331
2332 while (cont)
2333 {
2334 switch (peekchr())
2335 {
2336 case NUL:
2337 case Magic('|'):
2338 case Magic('&'):
2339 case Magic(')'):
2340 cont = FALSE;
2341 break;
2342
2343 case Magic('Z'):
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002344 regflags |= RF_ICOMBINE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002345 skipchr_keepstart();
2346 break;
2347 case Magic('c'):
2348 regflags |= RF_ICASE;
2349 skipchr_keepstart();
2350 break;
2351 case Magic('C'):
2352 regflags |= RF_NOICASE;
2353 skipchr_keepstart();
2354 break;
2355 case Magic('v'):
2356 reg_magic = MAGIC_ALL;
2357 skipchr_keepstart();
2358 curchr = -1;
2359 break;
2360 case Magic('m'):
2361 reg_magic = MAGIC_ON;
2362 skipchr_keepstart();
2363 curchr = -1;
2364 break;
2365 case Magic('M'):
2366 reg_magic = MAGIC_OFF;
2367 skipchr_keepstart();
2368 curchr = -1;
2369 break;
2370 case Magic('V'):
2371 reg_magic = MAGIC_NONE;
2372 skipchr_keepstart();
2373 curchr = -1;
2374 break;
2375
2376 default:
2377 if (nfa_regpiece() == FAIL)
2378 return FAIL;
2379 if (first == FALSE)
2380 EMIT(NFA_CONCAT);
2381 else
2382 first = FALSE;
2383 break;
2384 }
2385 }
2386
2387 return OK;
2388}
2389
2390/*
2391 * Parse a branch, one or more concats, separated by "\&". It matches the
2392 * last concat, but only if all the preceding concats also match at the same
2393 * position. Examples:
2394 * "foobeep\&..." matches "foo" in "foobeep".
2395 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2396 *
2397 * branch ::= concat
2398 * or concat \& concat
2399 * or concat \& concat \& concat
2400 * etc.
2401 */
2402 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002403nfa_regbranch(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002404{
Bram Moolenaar16299b52013-05-30 18:45:23 +02002405 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002406
Bram Moolenaar16299b52013-05-30 18:45:23 +02002407 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002408
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002409 // First branch, possibly the only one
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002410 if (nfa_regconcat() == FAIL)
2411 return FAIL;
2412
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002413 // Try next concats
Bram Moolenaar890dd052017-12-16 19:59:37 +01002414 while (peekchr() == Magic('&'))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002415 {
2416 skipchr();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002417 // if concat is empty do emit a node
Bram Moolenaar890dd052017-12-16 19:59:37 +01002418 if (old_post_pos == (int)(post_ptr - post_start))
2419 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002420 EMIT(NFA_NOPEN);
2421 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002422 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002423 if (nfa_regconcat() == FAIL)
2424 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002425 // if concat is empty do emit a node
Bram Moolenaar16299b52013-05-30 18:45:23 +02002426 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002427 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002428 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002429 }
2430
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002431 // if a branch is empty, emit one node for it
Bram Moolenaar16299b52013-05-30 18:45:23 +02002432 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002433 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002434
2435 return OK;
2436}
2437
2438/*
2439 * Parse a pattern, one or more branches, separated by "\|". It matches
2440 * anything that matches one of the branches. Example: "foo\|beep" matches
2441 * "foo" and matches "beep". If more than one branch matches, the first one
2442 * is used.
2443 *
2444 * pattern ::= branch
2445 * or branch \| branch
2446 * or branch \| branch \| branch
2447 * etc.
2448 */
2449 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002450nfa_reg(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002451 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002452{
2453 int parno = 0;
2454
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002455 if (paren == REG_PAREN)
2456 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002457 if (regnpar >= NSUBEXP) // Too many `('
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002458 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_parens));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002459 parno = regnpar++;
2460 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002461#ifdef FEAT_SYN_HL
2462 else if (paren == REG_ZPAREN)
2463 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002464 // Make a ZOPEN node.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002465 if (regnzpar >= NSUBEXP)
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002466 EMSG_RET_FAIL(_(e_nfa_regexp_too_many_z));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002467 parno = regnzpar++;
2468 }
2469#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002470
2471 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002472 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002473
2474 while (peekchr() == Magic('|'))
2475 {
2476 skipchr();
2477 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002478 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002479 EMIT(NFA_OR);
2480 }
2481
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002482 // Check for proper termination.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002483 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2484 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002485 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002486 EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2487 reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002488 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002489 EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002490 }
2491 else if (paren == REG_NOPAREN && peekchr() != NUL)
2492 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002493 if (peekchr() == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002494 EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002495 else
Bram Moolenaard82a47d2022-01-05 20:24:39 +00002496 EMSG_RET_FAIL(_(e_nfa_regexp_proper_termination_error));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002497 }
2498 /*
2499 * Here we set the flag allowing back references to this set of
2500 * parentheses.
2501 */
2502 if (paren == REG_PAREN)
2503 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002504 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002505 EMIT(NFA_MOPEN + parno);
2506 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002507#ifdef FEAT_SYN_HL
2508 else if (paren == REG_ZPAREN)
2509 EMIT(NFA_ZOPEN + parno);
2510#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002511
2512 return OK;
2513}
2514
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002515#ifdef DEBUG
2516static char_u code[50];
2517
2518 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002519nfa_set_code(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002520{
2521 int addnl = FALSE;
2522
2523 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2524 {
2525 addnl = TRUE;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002526 c -= NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002527 }
2528
2529 STRCPY(code, "");
2530 switch (c)
2531 {
2532 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2533 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2534 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2535 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2536 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2537 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2538
Bram Moolenaar5714b802013-05-28 22:03:20 +02002539 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2540 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2541 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2542 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2543 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2544 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2545 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2546 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2547 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002548#ifdef FEAT_SYN_HL
2549 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2550 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2551 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2552 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2553 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2554 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2555 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2556 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2557 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2558#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02002559 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2560
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002561 case NFA_PREV_ATOM_NO_WIDTH:
2562 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02002563 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2564 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002565 case NFA_PREV_ATOM_JUST_BEFORE:
2566 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2567 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2568 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002569 case NFA_PREV_ATOM_LIKE_PATTERN:
2570 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2571
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002572 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2573 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002574 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002575 case NFA_START_INVISIBLE_FIRST:
2576 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002577 case NFA_START_INVISIBLE_NEG:
2578 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002579 case NFA_START_INVISIBLE_NEG_FIRST:
2580 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002581 case NFA_START_INVISIBLE_BEFORE:
2582 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002583 case NFA_START_INVISIBLE_BEFORE_FIRST:
2584 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002585 case NFA_START_INVISIBLE_BEFORE_NEG:
2586 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002587 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2588 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002589 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002590 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002591 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002592 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002593
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002594 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2595 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02002596 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002597
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002598 case NFA_MOPEN:
2599 case NFA_MOPEN1:
2600 case NFA_MOPEN2:
2601 case NFA_MOPEN3:
2602 case NFA_MOPEN4:
2603 case NFA_MOPEN5:
2604 case NFA_MOPEN6:
2605 case NFA_MOPEN7:
2606 case NFA_MOPEN8:
2607 case NFA_MOPEN9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002608 STRCPY(code, "NFA_MOPEN(x)");
2609 code[10] = c - NFA_MOPEN + '0';
2610 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002611 case NFA_MCLOSE:
2612 case NFA_MCLOSE1:
2613 case NFA_MCLOSE2:
2614 case NFA_MCLOSE3:
2615 case NFA_MCLOSE4:
2616 case NFA_MCLOSE5:
2617 case NFA_MCLOSE6:
2618 case NFA_MCLOSE7:
2619 case NFA_MCLOSE8:
2620 case NFA_MCLOSE9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002621 STRCPY(code, "NFA_MCLOSE(x)");
2622 code[11] = c - NFA_MCLOSE + '0';
2623 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002624#ifdef FEAT_SYN_HL
2625 case NFA_ZOPEN:
2626 case NFA_ZOPEN1:
2627 case NFA_ZOPEN2:
2628 case NFA_ZOPEN3:
2629 case NFA_ZOPEN4:
2630 case NFA_ZOPEN5:
2631 case NFA_ZOPEN6:
2632 case NFA_ZOPEN7:
2633 case NFA_ZOPEN8:
2634 case NFA_ZOPEN9:
2635 STRCPY(code, "NFA_ZOPEN(x)");
2636 code[10] = c - NFA_ZOPEN + '0';
2637 break;
2638 case NFA_ZCLOSE:
2639 case NFA_ZCLOSE1:
2640 case NFA_ZCLOSE2:
2641 case NFA_ZCLOSE3:
2642 case NFA_ZCLOSE4:
2643 case NFA_ZCLOSE5:
2644 case NFA_ZCLOSE6:
2645 case NFA_ZCLOSE7:
2646 case NFA_ZCLOSE8:
2647 case NFA_ZCLOSE9:
2648 STRCPY(code, "NFA_ZCLOSE(x)");
2649 code[11] = c - NFA_ZCLOSE + '0';
2650 break;
2651#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002652 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2653 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2654 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2655 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02002656 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2657 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002658 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2659 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2660 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2661 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2662 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2663 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2664 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2665 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2666 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2667 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2668 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2669 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2670 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2671 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02002672 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002673
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002674 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002675 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2676 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2677 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
Bram Moolenaar699c1202013-09-25 16:41:54 +02002678 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002679 case NFA_OR: STRCPY(code, "NFA_OR"); break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002680
2681 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2682 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2683 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2684 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2685 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2686 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2687 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2688
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002689 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2690 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2691 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2692 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2693 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2694 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2695 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2696 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2697 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2698 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2699 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2700 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2701 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2702 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2703 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2704 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01002705 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2706 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2707 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002708
2709 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2710 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2711 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2712 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2713 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2714 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2715 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2716 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2717 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2718 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2719 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2720 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2721 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2722 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2723 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2724 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2725 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2726 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2727 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2728 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2729 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2730 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2731 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2732 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2733 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2734 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2735 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002736 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2737 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2738 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2739 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002740
2741 default:
2742 STRCPY(code, "CHAR(x)");
2743 code[5] = c;
2744 }
2745
2746 if (addnl == TRUE)
2747 STRCAT(code, " + NEWLINE ");
2748
2749}
2750
2751#ifdef ENABLE_LOG
// NOTE(review): not referenced by the dump helpers in this section;
// presumably the stream used for the run/debug log - confirm at its users.
2752static FILE *log_fd;
// Message for when the temporary log file cannot be opened for writing and,
// going by its text, the output is shown on stderr instead.
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002753static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002754
2755/*
2756 * Print the postfix notation of the current regexp.
2757 */
2758 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002759nfa_postfix_dump(char_u *expr, int retval)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002760{
2761 int *p;
2762 FILE *f;
2763
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002764 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002765 if (f != NULL)
2766 {
2767 fprintf(f, "\n-------------------------\n");
2768 if (retval == FAIL)
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002769 fprintf(f, ">>> NFA engine failed... \n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002770 else if (retval == OK)
2771 fprintf(f, ">>> NFA engine succeeded !\n");
2772 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002773 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002774 {
2775 nfa_set_code(*p);
2776 fprintf(f, "%s, ", code);
2777 }
2778 fprintf(f, "\"\nPostfix notation (int): ");
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002779 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002780 fprintf(f, "%d ", *p);
2781 fprintf(f, "\n\n");
2782 fclose(f);
2783 }
2784}
2785
2786/*
2787 * Print the NFA starting with a root node "state".
2788 */
2789 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002790nfa_print_state(FILE *debugf, nfa_state_T *state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002791{
Bram Moolenaar152e7892013-05-25 12:28:11 +02002792 garray_T indent;
2793
2794 ga_init2(&indent, 1, 64);
2795 ga_append(&indent, '\0');
2796 nfa_print_state2(debugf, state, &indent);
2797 ga_clear(&indent);
2798}
2799
2800 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002801nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
Bram Moolenaar152e7892013-05-25 12:28:11 +02002802{
2803 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002804
2805 if (state == NULL)
2806 return;
2807
2808 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02002809
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002810 // Output indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002811 p = (char_u *)indent->ga_data;
2812 if (indent->ga_len >= 3)
2813 {
2814 int last = indent->ga_len - 3;
2815 char_u save[2];
2816
2817 STRNCPY(save, &p[last], 2);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00002818 memcpy(&p[last], "+-", 2);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002819 fprintf(debugf, " %s", p);
2820 STRNCPY(&p[last], save, 2);
2821 }
2822 else
2823 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002824
2825 nfa_set_code(state->c);
Bram Moolenaardecd9542013-06-07 16:31:50 +02002826 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
Bram Moolenaar417bad22013-06-07 14:08:30 +02002827 code,
2828 state->c,
2829 abs(state->id),
2830 state->val);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002831 if (state->id < 0)
2832 return;
2833
2834 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02002835
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002836 // grow indent for state->out
Bram Moolenaar152e7892013-05-25 12:28:11 +02002837 indent->ga_len -= 1;
2838 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002839 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002840 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002841 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002842 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002843
2844 nfa_print_state2(debugf, state->out, indent);
2845
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002846 // replace last part of indent for state->out1
Bram Moolenaar152e7892013-05-25 12:28:11 +02002847 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002848 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002849 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002850
2851 nfa_print_state2(debugf, state->out1, indent);
2852
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002853 // shrink indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002854 indent->ga_len -= 3;
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002855 ga_append(indent, NUL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002856}
2857
2858/*
2859 * Print the NFA state machine.
2860 */
2861 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002862nfa_dump(nfa_regprog_T *prog)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002863{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002864 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002865
2866 if (debugf != NULL)
2867 {
Bram Moolenaar152e7892013-05-25 12:28:11 +02002868 nfa_print_state(debugf, prog->start);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002869
Bram Moolenaar473de612013-06-08 18:19:48 +02002870 if (prog->reganch)
2871 fprintf(debugf, "reganch: %d\n", prog->reganch);
2872 if (prog->regstart != NUL)
2873 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2874 prog->regstart, prog->regstart);
2875 if (prog->match_text != NULL)
2876 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002877
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002878 fclose(debugf);
2879 }
2880}
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002881#endif // ENABLE_LOG
2882#endif // DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002883
2884/*
2885 * Parse r.e. @expr and convert it into postfix form.
2886 * Return the postfix string on success, NULL otherwise.
2887 */
2888 static int *
Bram Moolenaar05540972016-01-30 20:31:25 +01002889re2post(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002890{
2891 if (nfa_reg(REG_NOPAREN) == FAIL)
2892 return NULL;
2893 EMIT(NFA_MOPEN);
2894 return post_start;
2895}
2896
// NB. Some of the code below is inspired by Russ Cox's Thompson NFA implementation.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002898
2899/*
2900 * Represents an NFA state plus zero or one or two arrows exiting.
2901 * if c == MATCH, no arrows out; matching state.
2902 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2903 * If c < 256, labeled arrow with character c to out.
2904 */
2905
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002906static nfa_state_T *state_ptr; // points to nfa_prog->state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002907
2908/*
2909 * Allocate and initialize nfa_state_T.
2910 */
2911 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002912alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002913{
2914 nfa_state_T *s;
2915
2916 if (istate >= nstate)
2917 return NULL;
2918
2919 s = &state_ptr[istate++];
2920
2921 s->c = c;
2922 s->out = out;
2923 s->out1 = out1;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002924 s->val = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002925
2926 s->id = istate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02002927 s->lastlist[0] = 0;
2928 s->lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002929
2930 return s;
2931}
2932
2933/*
2934 * A partially built NFA without the matching state filled in.
2935 * Frag_T.start points at the start state.
2936 * Frag_T.out is a list of places that need to be set to the
2937 * next state for this fragment.
2938 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002939
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002940// Since the out pointers in the list are always
2941// uninitialized, we use the pointers themselves
2942// as storage for the Ptrlists.
// One link in a list of out pointers that still have to be connected.
// The link is stored in the pointer slot itself: "next" is used while the
// slot is on the list, "s" is what patch() finally writes into the slot.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002943typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002944union Ptrlist
2945{
// the next slot on the list, NULL at the end
2946 Ptrlist *next;
// the state the slot points to once it has been patched
2947 nfa_state_T *s;
2948};
2949
// A partially built NFA without the matching state filled in.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002950struct Frag
2951{
// the start state of the fragment
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002952 nfa_state_T *start;
// list of places that need to be set to the next state for this fragment
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002953 Ptrlist *out;
2954};
2955typedef struct Frag Frag_T;
2956
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002957/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02002958 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002959 */
2960 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01002961frag(nfa_state_T *start, Ptrlist *out)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002962{
Bram Moolenaar053bb602013-05-20 13:55:21 +02002963 Frag_T n;
2964
2965 n.start = start;
2966 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002967 return n;
2968}
2969
2970/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002971 * Create singleton list containing just outp.
2972 */
2973 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01002974list1(
2975 nfa_state_T **outp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002976{
2977 Ptrlist *l;
2978
2979 l = (Ptrlist *)outp;
2980 l->next = NULL;
2981 return l;
2982}
2983
2984/*
2985 * Patch the list of states at out to point to start.
2986 */
2987 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002988patch(Ptrlist *l, nfa_state_T *s)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002989{
2990 Ptrlist *next;
2991
2992 for (; l; l = next)
2993 {
2994 next = l->next;
2995 l->s = s;
2996 }
2997}
2998
2999
3000/*
3001 * Join the two lists l1 and l2, returning the combination.
3002 */
3003 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003004append(Ptrlist *l1, Ptrlist *l2)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003005{
3006 Ptrlist *oldl1;
3007
3008 oldl1 = l1;
3009 while (l1->next)
3010 l1 = l1->next;
3011 l1->next = l2;
3012 return oldl1;
3013}
3014
3015/*
3016 * Stack used for transforming postfix form into NFA.
3017 */
3018static Frag_T empty;
3019
3020 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003021st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003022{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003023#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003024 FILE *df;
3025 int *p2;
3026
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003027 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003028 if (df)
3029 {
3030 fprintf(df, "Error popping the stack!\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003031# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003032 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003033# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003034 fprintf(df, "Postfix form is: ");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003035# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003036 for (p2 = postfix; p2 < end; p2++)
3037 {
3038 nfa_set_code(*p2);
3039 fprintf(df, "%s, ", code);
3040 }
3041 nfa_set_code(*p);
3042 fprintf(df, "\nCurrent position is: ");
3043 for (p2 = postfix; p2 <= p; p2 ++)
3044 {
3045 nfa_set_code(*p2);
3046 fprintf(df, "%s, ", code);
3047 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02003048# else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003049 for (p2 = postfix; p2 < end; p2++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003050 fprintf(df, "%d, ", *p2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003051 fprintf(df, "\nCurrent position is: ");
3052 for (p2 = postfix; p2 <= p; p2 ++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003053 fprintf(df, "%d, ", *p2);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003054# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003055 fprintf(df, "\n--------------------------\n");
3056 fclose(df);
3057 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003058#endif
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003059 emsg(_(e_nfa_regexp_could_not_pop_stack));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003060}
3061
3062/*
3063 * Push an item onto the stack.
3064 */
3065 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003066st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003067{
3068 Frag_T *stackp = *p;
3069
3070 if (stackp >= stack_end)
3071 return;
3072 *stackp = s;
3073 *p = *p + 1;
3074}
3075
3076/*
3077 * Pop an item from the stack.
3078 */
3079 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003080st_pop(Frag_T **p, Frag_T *stack)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003081{
3082 Frag_T *stackp;
3083
3084 *p = *p - 1;
3085 stackp = *p;
3086 if (stackp < stack)
3087 return empty;
3088 return **p;
3089}
3090
3091/*
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003092 * Estimate the maximum byte length of anything matching "state".
3093 * When unknown or unlimited return -1.
3094 */
3095 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01003096nfa_max_width(nfa_state_T *startstate, int depth)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003097{
3098 int l, r;
3099 nfa_state_T *state = startstate;
3100 int len = 0;
3101
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003102 // detect looping in a NFA_SPLIT
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003103 if (depth > 4)
3104 return -1;
3105
Bram Moolenaarfe70acb2013-06-21 18:31:23 +02003106 while (state != NULL)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003107 {
3108 switch (state->c)
3109 {
3110 case NFA_END_INVISIBLE:
3111 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003112 // the end, return what we have
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003113 return len;
3114
3115 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003116 // two alternatives, use the maximum
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003117 l = nfa_max_width(state->out, depth + 1);
3118 r = nfa_max_width(state->out1, depth + 1);
3119 if (l < 0 || r < 0)
3120 return -1;
3121 return len + (l > r ? l : r);
3122
3123 case NFA_ANY:
3124 case NFA_START_COLL:
3125 case NFA_START_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003126 // matches some character, including composing chars
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003127 if (enc_utf8)
3128 len += MB_MAXBYTES;
3129 else if (has_mbyte)
3130 len += 2;
3131 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003132 ++len;
3133 if (state->c != NFA_ANY)
3134 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003135 // skip over the characters
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003136 state = state->out1->out;
3137 continue;
3138 }
3139 break;
3140
3141 case NFA_DIGIT:
3142 case NFA_WHITE:
3143 case NFA_HEX:
3144 case NFA_OCTAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003145 // ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003146 ++len;
3147 break;
3148
3149 case NFA_IDENT:
3150 case NFA_SIDENT:
3151 case NFA_KWORD:
3152 case NFA_SKWORD:
3153 case NFA_FNAME:
3154 case NFA_SFNAME:
3155 case NFA_PRINT:
3156 case NFA_SPRINT:
3157 case NFA_NWHITE:
3158 case NFA_NDIGIT:
3159 case NFA_NHEX:
3160 case NFA_NOCTAL:
3161 case NFA_WORD:
3162 case NFA_NWORD:
3163 case NFA_HEAD:
3164 case NFA_NHEAD:
3165 case NFA_ALPHA:
3166 case NFA_NALPHA:
3167 case NFA_LOWER:
3168 case NFA_NLOWER:
3169 case NFA_UPPER:
3170 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02003171 case NFA_LOWER_IC:
3172 case NFA_NLOWER_IC:
3173 case NFA_UPPER_IC:
3174 case NFA_NUPPER_IC:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02003175 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003176 // possibly non-ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003177 if (has_mbyte)
3178 len += 3;
3179 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003180 ++len;
3181 break;
3182
3183 case NFA_START_INVISIBLE:
3184 case NFA_START_INVISIBLE_NEG:
3185 case NFA_START_INVISIBLE_BEFORE:
3186 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003187 // zero-width, out1 points to the END state
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003188 state = state->out1->out;
3189 continue;
3190
3191 case NFA_BACKREF1:
3192 case NFA_BACKREF2:
3193 case NFA_BACKREF3:
3194 case NFA_BACKREF4:
3195 case NFA_BACKREF5:
3196 case NFA_BACKREF6:
3197 case NFA_BACKREF7:
3198 case NFA_BACKREF8:
3199 case NFA_BACKREF9:
3200#ifdef FEAT_SYN_HL
3201 case NFA_ZREF1:
3202 case NFA_ZREF2:
3203 case NFA_ZREF3:
3204 case NFA_ZREF4:
3205 case NFA_ZREF5:
3206 case NFA_ZREF6:
3207 case NFA_ZREF7:
3208 case NFA_ZREF8:
3209 case NFA_ZREF9:
3210#endif
3211 case NFA_NEWL:
3212 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003213 // unknown width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003214 return -1;
3215
3216 case NFA_BOL:
3217 case NFA_EOL:
3218 case NFA_BOF:
3219 case NFA_EOF:
3220 case NFA_BOW:
3221 case NFA_EOW:
3222 case NFA_MOPEN:
3223 case NFA_MOPEN1:
3224 case NFA_MOPEN2:
3225 case NFA_MOPEN3:
3226 case NFA_MOPEN4:
3227 case NFA_MOPEN5:
3228 case NFA_MOPEN6:
3229 case NFA_MOPEN7:
3230 case NFA_MOPEN8:
3231 case NFA_MOPEN9:
3232#ifdef FEAT_SYN_HL
3233 case NFA_ZOPEN:
3234 case NFA_ZOPEN1:
3235 case NFA_ZOPEN2:
3236 case NFA_ZOPEN3:
3237 case NFA_ZOPEN4:
3238 case NFA_ZOPEN5:
3239 case NFA_ZOPEN6:
3240 case NFA_ZOPEN7:
3241 case NFA_ZOPEN8:
3242 case NFA_ZOPEN9:
3243 case NFA_ZCLOSE:
3244 case NFA_ZCLOSE1:
3245 case NFA_ZCLOSE2:
3246 case NFA_ZCLOSE3:
3247 case NFA_ZCLOSE4:
3248 case NFA_ZCLOSE5:
3249 case NFA_ZCLOSE6:
3250 case NFA_ZCLOSE7:
3251 case NFA_ZCLOSE8:
3252 case NFA_ZCLOSE9:
3253#endif
3254 case NFA_MCLOSE:
3255 case NFA_MCLOSE1:
3256 case NFA_MCLOSE2:
3257 case NFA_MCLOSE3:
3258 case NFA_MCLOSE4:
3259 case NFA_MCLOSE5:
3260 case NFA_MCLOSE6:
3261 case NFA_MCLOSE7:
3262 case NFA_MCLOSE8:
3263 case NFA_MCLOSE9:
3264 case NFA_NOPEN:
3265 case NFA_NCLOSE:
3266
3267 case NFA_LNUM_GT:
3268 case NFA_LNUM_LT:
3269 case NFA_COL_GT:
3270 case NFA_COL_LT:
3271 case NFA_VCOL_GT:
3272 case NFA_VCOL_LT:
3273 case NFA_MARK_GT:
3274 case NFA_MARK_LT:
3275 case NFA_VISUAL:
3276 case NFA_LNUM:
3277 case NFA_CURSOR:
3278 case NFA_COL:
3279 case NFA_VCOL:
3280 case NFA_MARK:
3281
3282 case NFA_ZSTART:
3283 case NFA_ZEND:
3284 case NFA_OPT_CHARS:
Bram Moolenaar699c1202013-09-25 16:41:54 +02003285 case NFA_EMPTY:
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003286 case NFA_START_PATTERN:
3287 case NFA_END_PATTERN:
3288 case NFA_COMPOSING:
3289 case NFA_END_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003290 // zero-width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003291 break;
3292
3293 default:
3294 if (state->c < 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003295 // don't know what this is
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003296 return -1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003297 // normal character
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003298 len += MB_CHAR2LEN(state->c);
3299 break;
3300 }
3301
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003302 // normal way to continue
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003303 state = state->out;
3304 }
3305
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003306 // unrecognized, "cannot happen"
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003307 return -1;
3308}
Bram Moolenaar1e02e662013-06-08 23:26:27 +02003309
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003310/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003311 * Convert a postfix form into its equivalent NFA.
3312 * Return the NFA start state on success, NULL otherwise.
3313 */
3314 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01003315post2nfa(int *postfix, int *end, int nfa_calc_size)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003316{
3317 int *p;
3318 int mopen;
3319 int mclose;
3320 Frag_T *stack = NULL;
3321 Frag_T *stackp = NULL;
3322 Frag_T *stack_end = NULL;
3323 Frag_T e1;
3324 Frag_T e2;
3325 Frag_T e;
3326 nfa_state_T *s;
3327 nfa_state_T *s1;
3328 nfa_state_T *matchstate;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003329 nfa_state_T *ret = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003330
3331 if (postfix == NULL)
3332 return NULL;
3333
Bram Moolenaar053bb602013-05-20 13:55:21 +02003334#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003335#define POP() st_pop(&stackp, stack); \
3336 if (stackp < stack) \
3337 { \
3338 st_error(postfix, end, p); \
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003339 vim_free(stack); \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003340 return NULL; \
3341 }
3342
3343 if (nfa_calc_size == FALSE)
3344 {
Bram Moolenaar32aa1022019-11-02 22:54:41 +01003345 // Allocate space for the stack. Max states on the stack: "nstate".
Bram Moolenaarc799fe22019-05-28 23:08:19 +02003346 stack = ALLOC_MULT(Frag_T, nstate + 1);
Bram Moolenaarc57463c2018-12-26 22:04:41 +01003347 if (stack == NULL)
3348 return NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003349 stackp = stack;
Bram Moolenaare3c7b862013-05-20 21:57:03 +02003350 stack_end = stack + (nstate + 1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003351 }
3352
3353 for (p = postfix; p < end; ++p)
3354 {
3355 switch (*p)
3356 {
3357 case NFA_CONCAT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003358 // Concatenation.
3359 // Pay attention: this operator does not exist in the r.e. itself
3360 // (it is implicit, really). It is added when r.e. is translated
3361 // to postfix form in re2post().
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003362 if (nfa_calc_size == TRUE)
3363 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003364 // nstate += 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003365 break;
3366 }
3367 e2 = POP();
3368 e1 = POP();
3369 patch(e1.out, e2.start);
3370 PUSH(frag(e1.start, e2.out));
3371 break;
3372
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003373 case NFA_OR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003374 // Alternation
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003375 if (nfa_calc_size == TRUE)
3376 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003377 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003378 break;
3379 }
3380 e2 = POP();
3381 e1 = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003382 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003383 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003384 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003385 PUSH(frag(s, append(e1.out, e2.out)));
3386 break;
3387
3388 case NFA_STAR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003389 // Zero or more, prefer more
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003390 if (nfa_calc_size == TRUE)
3391 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003392 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003393 break;
3394 }
3395 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003396 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003397 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003398 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003399 patch(e.out, s);
3400 PUSH(frag(s, list1(&s->out1)));
3401 break;
3402
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003403 case NFA_STAR_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003404 // Zero or more, prefer zero
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003405 if (nfa_calc_size == TRUE)
3406 {
3407 nstate++;
3408 break;
3409 }
3410 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003411 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003412 if (s == NULL)
3413 goto theend;
3414 patch(e.out, s);
3415 PUSH(frag(s, list1(&s->out)));
3416 break;
3417
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003418 case NFA_QUEST:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003419 // one or zero atoms=> greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003420 if (nfa_calc_size == TRUE)
3421 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003422 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003423 break;
3424 }
3425 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003426 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003427 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003428 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003429 PUSH(frag(s, append(e.out, list1(&s->out1))));
3430 break;
3431
3432 case NFA_QUEST_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003433 // zero or one atoms => non-greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003434 if (nfa_calc_size == TRUE)
3435 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003436 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003437 break;
3438 }
3439 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003440 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003441 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003442 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003443 PUSH(frag(s, append(e.out, list1(&s->out))));
3444 break;
3445
Bram Moolenaar417bad22013-06-07 14:08:30 +02003446 case NFA_END_COLL:
3447 case NFA_END_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003448 // On the stack is the sequence starting with NFA_START_COLL or
3449 // NFA_START_NEG_COLL and all possible characters. Patch it to
3450 // add the output to the start.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003451 if (nfa_calc_size == TRUE)
3452 {
3453 nstate++;
3454 break;
3455 }
3456 e = POP();
3457 s = alloc_state(NFA_END_COLL, NULL, NULL);
3458 if (s == NULL)
3459 goto theend;
3460 patch(e.out, s);
3461 e.start->out1 = s;
3462 PUSH(frag(e.start, list1(&s->out)));
3463 break;
3464
3465 case NFA_RANGE:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003466 // Before this are two characters, the low and high end of a
3467 // range. Turn them into two states with MIN and MAX.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003468 if (nfa_calc_size == TRUE)
3469 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003470 // nstate += 0;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003471 break;
3472 }
3473 e2 = POP();
3474 e1 = POP();
3475 e2.start->val = e2.start->c;
3476 e2.start->c = NFA_RANGE_MAX;
3477 e1.start->val = e1.start->c;
3478 e1.start->c = NFA_RANGE_MIN;
3479 patch(e1.out, e2.start);
3480 PUSH(frag(e1.start, e2.out));
3481 break;
3482
Bram Moolenaar699c1202013-09-25 16:41:54 +02003483 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003484 // 0-length, used in a repetition with max/min count of 0
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003485 if (nfa_calc_size == TRUE)
3486 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003487 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003488 break;
3489 }
Bram Moolenaar699c1202013-09-25 16:41:54 +02003490 s = alloc_state(NFA_EMPTY, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003491 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003492 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003493 PUSH(frag(s, list1(&s->out)));
3494 break;
3495
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003496 case NFA_OPT_CHARS:
3497 {
3498 int n;
3499
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003500 // \%[abc] implemented as:
3501 // NFA_SPLIT
3502 // +-CHAR(a)
3503 // | +-NFA_SPLIT
3504 // | +-CHAR(b)
3505 // | | +-NFA_SPLIT
3506 // | | +-CHAR(c)
3507 // | | | +-next
3508 // | | +- next
3509 // | +- next
3510 // +- next
3511 n = *++p; // get number of characters
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003512 if (nfa_calc_size == TRUE)
3513 {
3514 nstate += n;
3515 break;
3516 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003517 s = NULL; // avoid compiler warning
3518 e1.out = NULL; // stores list with out1's
3519 s1 = NULL; // previous NFA_SPLIT to connect to
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003520 while (n-- > 0)
3521 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003522 e = POP(); // get character
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003523 s = alloc_state(NFA_SPLIT, e.start, NULL);
3524 if (s == NULL)
3525 goto theend;
3526 if (e1.out == NULL)
3527 e1 = e;
3528 patch(e.out, s1);
3529 append(e1.out, list1(&s->out1));
3530 s1 = s;
3531 }
3532 PUSH(frag(s, e1.out));
3533 break;
3534 }
3535
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003536 case NFA_PREV_ATOM_NO_WIDTH:
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003537 case NFA_PREV_ATOM_NO_WIDTH_NEG:
Bram Moolenaar61602c52013-06-01 19:54:43 +02003538 case NFA_PREV_ATOM_JUST_BEFORE:
3539 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02003540 case NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003541 {
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003542 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3543 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
Bram Moolenaar87953742013-06-05 18:52:40 +02003544 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
Bram Moolenaardecd9542013-06-07 16:31:50 +02003545 int start_state;
3546 int end_state;
Bram Moolenaar87953742013-06-05 18:52:40 +02003547 int n = 0;
3548 nfa_state_T *zend;
3549 nfa_state_T *skip;
3550
Bram Moolenaardecd9542013-06-07 16:31:50 +02003551 switch (*p)
Bram Moolenaar87953742013-06-05 18:52:40 +02003552 {
Bram Moolenaardecd9542013-06-07 16:31:50 +02003553 case NFA_PREV_ATOM_NO_WIDTH:
3554 start_state = NFA_START_INVISIBLE;
3555 end_state = NFA_END_INVISIBLE;
3556 break;
3557 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3558 start_state = NFA_START_INVISIBLE_NEG;
3559 end_state = NFA_END_INVISIBLE_NEG;
3560 break;
3561 case NFA_PREV_ATOM_JUST_BEFORE:
3562 start_state = NFA_START_INVISIBLE_BEFORE;
3563 end_state = NFA_END_INVISIBLE;
3564 break;
3565 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3566 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3567 end_state = NFA_END_INVISIBLE_NEG;
3568 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003569 default: // NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaardecd9542013-06-07 16:31:50 +02003570 start_state = NFA_START_PATTERN;
3571 end_state = NFA_END_PATTERN;
3572 break;
Bram Moolenaar87953742013-06-05 18:52:40 +02003573 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003574
3575 if (before)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003576 n = *++p; // get the count
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003577
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003578 // The \@= operator: match the preceding atom with zero width.
3579 // The \@! operator: no match for the preceding atom.
3580 // The \@<= operator: match for the preceding atom.
3581 // The \@<! operator: no match for the preceding atom.
3582 // Surrounds the preceding atom with START_INVISIBLE and
3583 // END_INVISIBLE, similarly to MOPEN.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003584
3585 if (nfa_calc_size == TRUE)
3586 {
Bram Moolenaar87953742013-06-05 18:52:40 +02003587 nstate += pattern ? 4 : 2;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003588 break;
3589 }
3590 e = POP();
Bram Moolenaar87953742013-06-05 18:52:40 +02003591 s1 = alloc_state(end_state, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003592 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003593 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003594
Bram Moolenaar87953742013-06-05 18:52:40 +02003595 s = alloc_state(start_state, e.start, s1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003596 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003597 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003598 if (pattern)
3599 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003600 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02003601 skip = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003602 if (skip == NULL)
3603 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003604 zend = alloc_state(NFA_ZEND, s1, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003605 if (zend == NULL)
3606 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003607 s1->out= skip;
3608 patch(e.out, zend);
3609 PUSH(frag(s, list1(&skip->out)));
Bram Moolenaar61602c52013-06-01 19:54:43 +02003610 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003611 else
3612 {
3613 patch(e.out, s1);
3614 PUSH(frag(s, list1(&s1->out)));
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003615 if (before)
3616 {
3617 if (n <= 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003618 // See if we can guess the maximum width, it avoids a
3619 // lot of pointless tries.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003620 n = nfa_max_width(e.start, 0);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003621 s->val = n; // store the count
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003622 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003623 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003624 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003625 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003626
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003627 case NFA_COMPOSING: // char with composing char
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003628#if 0
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003629 // TODO
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003630 if (regflags & RF_ICOMBINE)
3631 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003632 // use the base character only
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003633 }
3634#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003635 // FALLTHROUGH
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003636
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003637 case NFA_MOPEN: // \( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003638 case NFA_MOPEN1:
3639 case NFA_MOPEN2:
3640 case NFA_MOPEN3:
3641 case NFA_MOPEN4:
3642 case NFA_MOPEN5:
3643 case NFA_MOPEN6:
3644 case NFA_MOPEN7:
3645 case NFA_MOPEN8:
3646 case NFA_MOPEN9:
3647#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003648 case NFA_ZOPEN: // \z( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003649 case NFA_ZOPEN1:
3650 case NFA_ZOPEN2:
3651 case NFA_ZOPEN3:
3652 case NFA_ZOPEN4:
3653 case NFA_ZOPEN5:
3654 case NFA_ZOPEN6:
3655 case NFA_ZOPEN7:
3656 case NFA_ZOPEN8:
3657 case NFA_ZOPEN9:
3658#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003659 case NFA_NOPEN: // \%( \) "Invisible Submatch"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003660 if (nfa_calc_size == TRUE)
3661 {
3662 nstate += 2;
3663 break;
3664 }
3665
3666 mopen = *p;
3667 switch (*p)
3668 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003669 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3670#ifdef FEAT_SYN_HL
3671 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3672 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3673 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3674 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3675 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3676 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3677 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3678 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3679 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3680 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3681#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003682 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003683 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003684 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003685 mclose = *p + NSUBEXP;
3686 break;
3687 }
3688
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003689 // Allow "NFA_MOPEN" as a valid postfix representation for
3690 // the empty regexp "". In this case, the NFA will be
3691 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3692 // empty groups of parenthesis, and empty mbyte chars
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003693 if (stackp == stack)
3694 {
Bram Moolenaar525666f2013-06-02 16:40:55 +02003695 s = alloc_state(mopen, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003696 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003697 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003698 s1 = alloc_state(mclose, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003699 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003700 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003701 patch(list1(&s->out), s1);
3702 PUSH(frag(s, list1(&s1->out)));
3703 break;
3704 }
3705
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003706 // At least one node was emitted before NFA_MOPEN, so
3707 // at least one node will be between NFA_MOPEN and NFA_MCLOSE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003708 e = POP();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003709 s = alloc_state(mopen, e.start, NULL); // `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003710 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003711 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003712
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003713 s1 = alloc_state(mclose, NULL, NULL); // `)'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003714 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003715 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003716 patch(e.out, s1);
3717
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003718 if (mopen == NFA_COMPOSING)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003719 // COMPOSING->out1 = END_COMPOSING
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003720 patch(list1(&s->out1), s1);
3721
3722 PUSH(frag(s, list1(&s1->out)));
3723 break;
3724
Bram Moolenaar5714b802013-05-28 22:03:20 +02003725 case NFA_BACKREF1:
3726 case NFA_BACKREF2:
3727 case NFA_BACKREF3:
3728 case NFA_BACKREF4:
3729 case NFA_BACKREF5:
3730 case NFA_BACKREF6:
3731 case NFA_BACKREF7:
3732 case NFA_BACKREF8:
3733 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003734#ifdef FEAT_SYN_HL
3735 case NFA_ZREF1:
3736 case NFA_ZREF2:
3737 case NFA_ZREF3:
3738 case NFA_ZREF4:
3739 case NFA_ZREF5:
3740 case NFA_ZREF6:
3741 case NFA_ZREF7:
3742 case NFA_ZREF8:
3743 case NFA_ZREF9:
3744#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02003745 if (nfa_calc_size == TRUE)
3746 {
3747 nstate += 2;
3748 break;
3749 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003750 s = alloc_state(*p, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003751 if (s == NULL)
3752 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003753 s1 = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003754 if (s1 == NULL)
3755 goto theend;
3756 patch(list1(&s->out), s1);
3757 PUSH(frag(s, list1(&s1->out)));
3758 break;
3759
Bram Moolenaar423532e2013-05-29 21:14:42 +02003760 case NFA_LNUM:
3761 case NFA_LNUM_GT:
3762 case NFA_LNUM_LT:
3763 case NFA_VCOL:
3764 case NFA_VCOL_GT:
3765 case NFA_VCOL_LT:
3766 case NFA_COL:
3767 case NFA_COL_GT:
3768 case NFA_COL_LT:
Bram Moolenaar044aa292013-06-04 21:27:38 +02003769 case NFA_MARK:
3770 case NFA_MARK_GT:
3771 case NFA_MARK_LT:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003772 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003773 int n = *++p; // lnum, col or mark name
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003774
Bram Moolenaar423532e2013-05-29 21:14:42 +02003775 if (nfa_calc_size == TRUE)
3776 {
3777 nstate += 1;
3778 break;
3779 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003780 s = alloc_state(p[-1], NULL, NULL);
Bram Moolenaar423532e2013-05-29 21:14:42 +02003781 if (s == NULL)
3782 goto theend;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003783 s->val = n;
Bram Moolenaar423532e2013-05-29 21:14:42 +02003784 PUSH(frag(s, list1(&s->out)));
3785 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003786 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02003787
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003788 case NFA_ZSTART:
3789 case NFA_ZEND:
3790 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003791 // Operands
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003792 if (nfa_calc_size == TRUE)
3793 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003794 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003795 break;
3796 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003797 s = alloc_state(*p, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003798 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003799 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003800 PUSH(frag(s, list1(&s->out)));
3801 break;
3802
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003803 } // switch(*p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003804
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003805 } // for(p = postfix; *p; ++p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003806
3807 if (nfa_calc_size == TRUE)
3808 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003809 nstate++;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003810 goto theend; // Return value when counting size is ignored anyway
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003811 }
3812
3813 e = POP();
3814 if (stackp != stack)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003815 {
3816 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003817 EMSG_RET_NULL(_(e_nfa_regexp_while_converting_from_postfix_to_nfa_too_many_stats_left_on_stack));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003818 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003819
3820 if (istate >= nstate)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003821 {
3822 vim_free(stack);
Bram Moolenaard82a47d2022-01-05 20:24:39 +00003823 EMSG_RET_NULL(_(e_nfa_regexp_not_enough_space_to_store_whole_nfa));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003824 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003825
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003826 matchstate = &state_ptr[istate++]; // the match state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003827 matchstate->c = NFA_MATCH;
3828 matchstate->out = matchstate->out1 = NULL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003829 matchstate->id = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003830
3831 patch(e.out, matchstate);
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003832 ret = e.start;
3833
3834theend:
3835 vim_free(stack);
3836 return ret;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003837
3838#undef POP1
3839#undef PUSH1
3840#undef POP2
3841#undef PUSH2
3842#undef POP
3843#undef PUSH
3844}
3845
Bram Moolenaara2947e22013-06-11 22:44:09 +02003846/*
3847 * After building the NFA program, inspect it to add optimization hints.
3848 */
3849 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003850nfa_postprocess(nfa_regprog_T *prog)
Bram Moolenaara2947e22013-06-11 22:44:09 +02003851{
3852 int i;
3853 int c;
3854
3855 for (i = 0; i < prog->nstate; ++i)
3856 {
3857 c = prog->state[i].c;
3858 if (c == NFA_START_INVISIBLE
3859 || c == NFA_START_INVISIBLE_NEG
3860 || c == NFA_START_INVISIBLE_BEFORE
3861 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3862 {
3863 int directly;
3864
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003865 // Do it directly when what follows is possibly the end of the
3866 // match.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003867 if (match_follows(prog->state[i].out1->out, 0))
3868 directly = TRUE;
3869 else
3870 {
3871 int ch_invisible = failure_chance(prog->state[i].out, 0);
3872 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3873
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003874 // Postpone when the invisible match is expensive or has a
3875 // lower chance of failing.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003876 if (c == NFA_START_INVISIBLE_BEFORE
3877 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3878 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003879 // "before" matches are very expensive when
3880 // unbounded, always prefer what follows then,
3881 // unless what follows will always match.
3882 // Otherwise strongly prefer what follows.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003883 if (prog->state[i].val <= 0 && ch_follows > 0)
3884 directly = FALSE;
3885 else
3886 directly = ch_follows * 10 < ch_invisible;
3887 }
3888 else
3889 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003890 // normal invisible, first do the one with the
3891 // highest failure chance
Bram Moolenaara2947e22013-06-11 22:44:09 +02003892 directly = ch_follows < ch_invisible;
3893 }
3894 }
3895 if (directly)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003896 // switch to the _FIRST state
Bram Moolenaara2947e22013-06-11 22:44:09 +02003897 ++prog->state[i].c;
3898 }
3899 }
3900}
3901
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003902/////////////////////////////////////////////////////////////////
3903// NFA execution code.
3904/////////////////////////////////////////////////////////////////
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003905
// regsub_T holds the start and end positions of one set of submatches.
// Only the first "in_use" entries carry information; sub_equal() treats
// entries beyond that as unset (line number -1 or NULL pointer).
typedef struct
{
    int	    in_use; // number of subexpr with useful info

    // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
    union
    {
	// Positions as line number and column.  clear_sub() fills these with
	// 0xff bytes so that the line numbers read as -1.
	struct multipos
	{
	    linenr_T	start_lnum;
	    linenr_T	end_lnum;
	    colnr_T	start_col;
	    colnr_T	end_col;
	} multi[NSUBEXP];
	// Positions as pointers.  clear_sub() zeroes these.
	struct linepos
	{
	    char_u	*start;
	    char_u	*end;
	} line[NSUBEXP];
    } list;
} regsub_T;
3927
// regsubs_T bundles the submatch lists that travel together.  The "synt"
// member only exists when compiled with FEAT_SYN_HL; the functions in this
// file only touch it when rex.nfa_has_zsubexpr is set.
typedef struct
{
    regsub_T	norm; // \( .. \) matches
#ifdef FEAT_SYN_HL
    regsub_T	synt; // \z( .. \) matches
#endif
} regsubs_T;
3935
// nfa_pim_T stores a Postponed Invisible Match.
typedef struct nfa_pim_S nfa_pim_T;
struct nfa_pim_S
{
    int		result;		// NFA_PIM_*, see below
    nfa_state_T	*state;		// the invisible match start state
    regsubs_T	subs;		// submatch info, only partly used
    union
    {
	lpos_T	pos;		// compared by pim_equal() when REG_MULTI
	char_u	*ptr;		// compared by pim_equal() otherwise
    } end;			// where the match must end
};
3949
// Values for "result" in nfa_pim_T.
#define NFA_PIM_UNUSED   0	// pim not used
#define NFA_PIM_TODO     1	// pim not done yet
#define NFA_PIM_MATCH    2	// pim executed, matches
#define NFA_PIM_NOMATCH  3	// pim executed, no match
Bram Moolenaara2d95102013-06-04 14:23:05 +02003955
3956
// nfa_thread_T contains execution information of a NFA state
typedef struct
{
    nfa_state_T	*state;
    int		count;
    nfa_pim_T	pim;		// if pim.result != NFA_PIM_UNUSED: postponed
				// invisible match
    regsubs_T	subs;		// submatch info, only partly used
} nfa_thread_T;
3966
// nfa_list_T contains the alternative NFA execution states.
// A state records the "id" of the list it was last added to in its
// "lastlist" array; state_in_list() compares the two to detect duplicates.
typedef struct
{
    nfa_thread_T    *t;		// allocated array of states
    int		    n;		// nr of states currently in "t"
    int		    len;	// max nr of states in "t"
    int		    id;		// ID of the list
    int		    has_pim;	// TRUE when any state has a PIM
} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003976
Bram Moolenaar5714b802013-05-28 22:03:20 +02003977#ifdef ENABLE_LOG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01003978static void log_subexpr(regsub_T *sub);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003979
3980 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003981log_subsexpr(regsubs_T *subs)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003982{
3983 log_subexpr(&subs->norm);
3984# ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02003985 if (rex.nfa_has_zsubexpr)
Bram Moolenaar6d3a5d72013-06-06 18:04:51 +02003986 log_subexpr(&subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003987# endif
3988}
3989
Bram Moolenaar5714b802013-05-28 22:03:20 +02003990 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003991log_subexpr(regsub_T *sub)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003992{
3993 int j;
3994
3995 for (j = 0; j < sub->in_use; j++)
3996 if (REG_MULTI)
Bram Moolenaar87953742013-06-05 18:52:40 +02003997 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02003998 j,
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01003999 sub->list.multi[j].start_col,
4000 (int)sub->list.multi[j].start_lnum,
4001 sub->list.multi[j].end_col,
4002 (int)sub->list.multi[j].end_lnum);
Bram Moolenaar5714b802013-05-28 22:03:20 +02004003 else
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004004 {
4005 char *s = (char *)sub->list.line[j].start;
4006 char *e = (char *)sub->list.line[j].end;
4007
Bram Moolenaar87953742013-06-05 18:52:40 +02004008 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004009 j,
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004010 s == NULL ? "NULL" : s,
4011 e == NULL ? "NULL" : e);
4012 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004013}
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004014
4015 static char *
Bram Moolenaar05540972016-01-30 20:31:25 +01004016pim_info(nfa_pim_T *pim)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004017{
4018 static char buf[30];
4019
4020 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4021 buf[0] = NUL;
4022 else
4023 {
4024 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
Bram Moolenaar0270f382018-07-17 05:43:58 +02004025 : (int)(pim->end.ptr - rex.input));
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004026 }
4027 return buf;
4028}
4029
Bram Moolenaar5714b802013-05-28 22:03:20 +02004030#endif
4031
// Used during execution: whether a match has been found.
static int	    nfa_match;
#ifdef FEAT_RELTIME
// Only available with FEAT_RELTIME.  NOTE(review): judging by the names this
// is the time limit bookkeeping for matching; the code using these is not in
// this part of the file - confirm there.
static proftime_T  *nfa_time_limit;
static int	   *nfa_timed_out;
static int	    nfa_time_count;
#endif

// Forward declarations: copy_pim() and has_state_with_pos() call these before
// they are defined.
static void copy_sub(regsub_T *to, regsub_T *from);
static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004042
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004043/*
4044 * Copy postponed invisible match info from "from" to "to".
4045 */
4046 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004047copy_pim(nfa_pim_T *to, nfa_pim_T *from)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004048{
4049 to->result = from->result;
4050 to->state = from->state;
4051 copy_sub(&to->subs.norm, &from->subs.norm);
4052#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004053 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004054 copy_sub(&to->subs.synt, &from->subs.synt);
4055#endif
4056 to->end = from->end;
4057}
4058
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004059 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004060clear_sub(regsub_T *sub)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004061{
4062 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004063 // Use 0xff to set lnum to -1
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004064 vim_memset(sub->list.multi, 0xff,
Bram Moolenaar0270f382018-07-17 05:43:58 +02004065 sizeof(struct multipos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004066 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004067 vim_memset(sub->list.line, 0,
4068 sizeof(struct linepos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004069 sub->in_use = 0;
4070}
4071
4072/*
4073 * Copy the submatches from "from" to "to".
4074 */
4075 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004076copy_sub(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004077{
4078 to->in_use = from->in_use;
4079 if (from->in_use > 0)
4080 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004081 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004082 if (REG_MULTI)
4083 mch_memmove(&to->list.multi[0],
4084 &from->list.multi[0],
4085 sizeof(struct multipos) * from->in_use);
4086 else
4087 mch_memmove(&to->list.line[0],
4088 &from->list.line[0],
4089 sizeof(struct linepos) * from->in_use);
4090 }
4091}
4092
4093/*
4094 * Like copy_sub() but exclude the main match.
4095 */
4096 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004097copy_sub_off(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004098{
4099 if (to->in_use < from->in_use)
4100 to->in_use = from->in_use;
4101 if (from->in_use > 1)
4102 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004103 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004104 if (REG_MULTI)
4105 mch_memmove(&to->list.multi[1],
4106 &from->list.multi[1],
4107 sizeof(struct multipos) * (from->in_use - 1));
4108 else
4109 mch_memmove(&to->list.line[1],
4110 &from->list.line[1],
4111 sizeof(struct linepos) * (from->in_use - 1));
4112 }
4113}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004114
Bram Moolenaar428e9872013-05-30 17:05:39 +02004115/*
Bram Moolenaarf2118842013-09-25 18:16:38 +02004116 * Like copy_sub() but only do the end of the main match if \ze is present.
4117 */
4118 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004119copy_ze_off(regsub_T *to, regsub_T *from)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004120{
Bram Moolenaar0270f382018-07-17 05:43:58 +02004121 if (rex.nfa_has_zend)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004122 {
4123 if (REG_MULTI)
4124 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004125 if (from->list.multi[0].end_lnum >= 0)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004126 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004127 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4128 to->list.multi[0].end_col = from->list.multi[0].end_col;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004129 }
Bram Moolenaarf2118842013-09-25 18:16:38 +02004130 }
4131 else
4132 {
4133 if (from->list.line[0].end != NULL)
4134 to->list.line[0].end = from->list.line[0].end;
4135 }
4136 }
4137}
4138
4139/*
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004140 * Return TRUE if "sub1" and "sub2" have the same start positions.
Bram Moolenaaree482532014-05-13 15:56:51 +02004141 * When using back-references also check the end position.
Bram Moolenaar428e9872013-05-30 17:05:39 +02004142 */
4143 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004144sub_equal(regsub_T *sub1, regsub_T *sub2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004145{
4146 int i;
4147 int todo;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004148 linenr_T s1;
4149 linenr_T s2;
4150 char_u *sp1;
4151 char_u *sp2;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004152
4153 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4154 if (REG_MULTI)
4155 {
4156 for (i = 0; i < todo; ++i)
4157 {
4158 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004159 s1 = sub1->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004160 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004161 s1 = -1;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004162 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004163 s2 = sub2->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004164 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004165 s2 = -1;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004166 if (s1 != s2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004167 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004168 if (s1 != -1 && sub1->list.multi[i].start_col
4169 != sub2->list.multi[i].start_col)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004170 return FALSE;
Bram Moolenaaree482532014-05-13 15:56:51 +02004171
Bram Moolenaar0270f382018-07-17 05:43:58 +02004172 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004173 {
4174 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004175 s1 = sub1->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004176 else
4177 s1 = -1;
4178 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004179 s2 = sub2->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004180 else
4181 s2 = -1;
4182 if (s1 != s2)
4183 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004184 if (s1 != -1 && sub1->list.multi[i].end_col
4185 != sub2->list.multi[i].end_col)
Bram Moolenaaree482532014-05-13 15:56:51 +02004186 return FALSE;
4187 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004188 }
4189 }
4190 else
4191 {
4192 for (i = 0; i < todo; ++i)
4193 {
4194 if (i < sub1->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004195 sp1 = sub1->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004196 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004197 sp1 = NULL;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004198 if (i < sub2->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004199 sp2 = sub2->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004200 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004201 sp2 = NULL;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004202 if (sp1 != sp2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004203 return FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004204 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004205 {
4206 if (i < sub1->in_use)
4207 sp1 = sub1->list.line[i].end;
4208 else
4209 sp1 = NULL;
4210 if (i < sub2->in_use)
4211 sp2 = sub2->list.line[i].end;
4212 else
4213 sp2 = NULL;
4214 if (sp1 != sp2)
4215 return FALSE;
4216 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004217 }
4218 }
4219
4220 return TRUE;
4221}
4222
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004223#ifdef ENABLE_LOG
4224 static void
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004225open_debug_log(int result)
4226{
4227 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
4228 if (log_fd == NULL)
4229 {
4230 emsg(_(e_log_open_failed));
4231 log_fd = stderr;
4232 }
4233
4234 fprintf(log_fd, "****************************\n");
4235 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
4236 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : result == MAYBE
4237 ? "MAYBE" : "FALSE");
4238 fprintf(log_fd, "****************************\n");
4239}
4240
4241 static void
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004242report_state(char *action,
4243 regsub_T *sub,
4244 nfa_state_T *state,
4245 int lid,
4246 nfa_pim_T *pim)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004247{
4248 int col;
4249
4250 if (sub->in_use <= 0)
4251 col = -1;
4252 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004253 col = sub->list.multi[0].start_col;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004254 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004255 col = (int)(sub->list.line[0].start - rex.line);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004256 nfa_set_code(state->c);
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00004257 if (log_fd == NULL)
4258 open_debug_log(MAYBE);
4259
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004260 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
4261 action, abs(state->id), lid, state->c, code, col,
4262 pim_info(pim));
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004263}
4264#endif
4265
Bram Moolenaar43e02982013-06-07 17:31:29 +02004266/*
4267 * Return TRUE if the same state is already in list "l" with the same
4268 * positions as "subs".
4269 */
4270 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004271has_state_with_pos(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004272 nfa_list_T *l, // runtime state list
4273 nfa_state_T *state, // state to update
4274 regsubs_T *subs, // pointers to subexpressions
4275 nfa_pim_T *pim) // postponed match or NULL
Bram Moolenaar43e02982013-06-07 17:31:29 +02004276{
4277 nfa_thread_T *thread;
4278 int i;
4279
4280 for (i = 0; i < l->n; ++i)
4281 {
4282 thread = &l->t[i];
4283 if (thread->state->id == state->id
4284 && sub_equal(&thread->subs.norm, &subs->norm)
4285#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004286 && (!rex.nfa_has_zsubexpr
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004287 || sub_equal(&thread->subs.synt, &subs->synt))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004288#endif
Bram Moolenaar69b52452013-07-17 21:10:51 +02004289 && pim_equal(&thread->pim, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004290 return TRUE;
4291 }
4292 return FALSE;
4293}
4294
4295/*
Bram Moolenaar69b52452013-07-17 21:10:51 +02004296 * Return TRUE if "one" and "two" are equal. That includes when both are not
4297 * set.
4298 */
4299 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004300pim_equal(nfa_pim_T *one, nfa_pim_T *two)
Bram Moolenaar69b52452013-07-17 21:10:51 +02004301{
4302 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4303 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4304
4305 if (one_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004306 // one is unused: equal when two is also unused
Bram Moolenaar69b52452013-07-17 21:10:51 +02004307 return two_unused;
4308 if (two_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004309 // one is used and two is not: not equal
Bram Moolenaar69b52452013-07-17 21:10:51 +02004310 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004311 // compare the state id
Bram Moolenaar3f0df062013-08-14 13:34:25 +02004312 if (one->state->id != two->state->id)
4313 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004314 // compare the position
Bram Moolenaar69b52452013-07-17 21:10:51 +02004315 if (REG_MULTI)
4316 return one->end.pos.lnum == two->end.pos.lnum
4317 && one->end.pos.col == two->end.pos.col;
4318 return one->end.ptr == two->end.ptr;
4319}
4320
4321/*
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004322 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4323 */
4324 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004325match_follows(nfa_state_T *startstate, int depth)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004326{
4327 nfa_state_T *state = startstate;
4328
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004329 // avoid too much recursion
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004330 if (depth > 10)
4331 return FALSE;
4332
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004333 while (state != NULL)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004334 {
4335 switch (state->c)
4336 {
4337 case NFA_MATCH:
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004338 case NFA_MCLOSE:
4339 case NFA_END_INVISIBLE:
4340 case NFA_END_INVISIBLE_NEG:
4341 case NFA_END_PATTERN:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004342 return TRUE;
4343
4344 case NFA_SPLIT:
4345 return match_follows(state->out, depth + 1)
4346 || match_follows(state->out1, depth + 1);
4347
4348 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004349 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004350 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004351 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004352 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004353 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004354 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004355 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004356 case NFA_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004357 // skip ahead to next state
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004358 state = state->out1->out;
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004359 continue;
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004360
4361 case NFA_ANY:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02004362 case NFA_ANY_COMPOSING:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004363 case NFA_IDENT:
4364 case NFA_SIDENT:
4365 case NFA_KWORD:
4366 case NFA_SKWORD:
4367 case NFA_FNAME:
4368 case NFA_SFNAME:
4369 case NFA_PRINT:
4370 case NFA_SPRINT:
4371 case NFA_WHITE:
4372 case NFA_NWHITE:
4373 case NFA_DIGIT:
4374 case NFA_NDIGIT:
4375 case NFA_HEX:
4376 case NFA_NHEX:
4377 case NFA_OCTAL:
4378 case NFA_NOCTAL:
4379 case NFA_WORD:
4380 case NFA_NWORD:
4381 case NFA_HEAD:
4382 case NFA_NHEAD:
4383 case NFA_ALPHA:
4384 case NFA_NALPHA:
4385 case NFA_LOWER:
4386 case NFA_NLOWER:
4387 case NFA_UPPER:
4388 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02004389 case NFA_LOWER_IC:
4390 case NFA_NLOWER_IC:
4391 case NFA_UPPER_IC:
4392 case NFA_NUPPER_IC:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004393 case NFA_START_COLL:
4394 case NFA_START_NEG_COLL:
4395 case NFA_NEWL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004396 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004397 return FALSE;
4398
4399 default:
4400 if (state->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004401 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004402 return FALSE;
4403
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004404 // Others: zero-width or possibly zero-width, might still find
4405 // a match at the same position, keep looking.
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004406 break;
4407 }
4408 state = state->out;
4409 }
4410 return FALSE;
4411}
4412
4413
4414/*
Bram Moolenaar43e02982013-06-07 17:31:29 +02004415 * Return TRUE if "state" is already in list "l".
4416 */
4417 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004418state_in_list(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004419 nfa_list_T *l, // runtime state list
4420 nfa_state_T *state, // state to update
4421 regsubs_T *subs) // pointers to subexpressions
Bram Moolenaar43e02982013-06-07 17:31:29 +02004422{
4423 if (state->lastlist[nfa_ll_index] == l->id)
4424 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004425 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004426 return TRUE;
4427 }
4428 return FALSE;
4429}
4430
// Offset used for "off" by addstate_here().
// addstate() treats an "off_arg" of -ADDSTATE_HERE_OFFSET or lower as a
// request to add at a list index: the index is
// -(off_arg + ADDSTATE_HERE_OFFSET) and the byte offset becomes zero.
#define ADDSTATE_HERE_OFFSET 10
4433
Bram Moolenaard05bf562013-06-30 23:24:08 +02004434/*
4435 * Add "state" and possibly what follows to state list ".".
4436 * Returns "subs_arg", possibly copied into temp_subs.
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004437 * Returns NULL when recursiveness is too deep.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004438 */
Bram Moolenaard05bf562013-06-30 23:24:08 +02004439 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004440addstate(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004441 nfa_list_T *l, // runtime state list
4442 nfa_state_T *state, // state to update
4443 regsubs_T *subs_arg, // pointers to subexpressions
4444 nfa_pim_T *pim, // postponed look-behind match
4445 int off_arg) // byte offset, when -1 go to next line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004446{
Bram Moolenaar963fee22013-05-26 21:47:28 +02004447 int subidx;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004448 int off = off_arg;
4449 int add_here = FALSE;
4450 int listindex = 0;
4451 int k;
4452 int found = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004453 nfa_thread_T *thread;
Bram Moolenaard5638832016-09-09 17:59:50 +02004454 struct multipos save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004455 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004456 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004457 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004458 regsub_T *sub;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004459 regsubs_T *subs = subs_arg;
4460 static regsubs_T temp_subs;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004461#ifdef ENABLE_LOG
4462 int did_print = FALSE;
4463#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004464 static int depth = 0;
4465
4466 // This function is called recursively. When the depth is too much we run
4467 // out of stack and crash, limit recursiveness here.
Bram Moolenaar5382f122019-02-13 01:18:38 +01004468 if (++depth >= 5000 || subs == NULL)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004469 {
4470 --depth;
4471 return NULL;
4472 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004473
Bram Moolenaar16b35782016-09-09 20:29:50 +02004474 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4475 {
4476 add_here = TRUE;
4477 off = 0;
4478 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4479 }
4480
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004481 switch (state->c)
4482 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004483 case NFA_NCLOSE:
4484 case NFA_MCLOSE:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004485 case NFA_MCLOSE1:
4486 case NFA_MCLOSE2:
4487 case NFA_MCLOSE3:
4488 case NFA_MCLOSE4:
4489 case NFA_MCLOSE5:
4490 case NFA_MCLOSE6:
4491 case NFA_MCLOSE7:
4492 case NFA_MCLOSE8:
4493 case NFA_MCLOSE9:
4494#ifdef FEAT_SYN_HL
4495 case NFA_ZCLOSE:
4496 case NFA_ZCLOSE1:
4497 case NFA_ZCLOSE2:
4498 case NFA_ZCLOSE3:
4499 case NFA_ZCLOSE4:
4500 case NFA_ZCLOSE5:
4501 case NFA_ZCLOSE6:
4502 case NFA_ZCLOSE7:
4503 case NFA_ZCLOSE8:
4504 case NFA_ZCLOSE9:
4505#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004506 case NFA_MOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004507 case NFA_ZEND:
Bram Moolenaar927d4a12013-06-09 17:25:34 +02004508 case NFA_SPLIT:
Bram Moolenaar699c1202013-09-25 16:41:54 +02004509 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004510 // These nodes are not added themselves but their "out" and/or
4511 // "out1" may be added below.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004512 break;
4513
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004514 case NFA_BOL:
4515 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004516 // "^" won't match past end-of-line, don't bother trying.
4517 // Except when at the end of the line, or when we are going to the
4518 // next line for a look-behind match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004519 if (rex.input > rex.line
4520 && *rex.input != NUL
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004521 && (nfa_endp == NULL
4522 || !REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02004523 || rex.lnum == nfa_endp->se_u.pos.lnum))
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004524 goto skip_add;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004525 // FALLTHROUGH
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004526
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004527 case NFA_MOPEN1:
4528 case NFA_MOPEN2:
4529 case NFA_MOPEN3:
4530 case NFA_MOPEN4:
4531 case NFA_MOPEN5:
4532 case NFA_MOPEN6:
4533 case NFA_MOPEN7:
4534 case NFA_MOPEN8:
4535 case NFA_MOPEN9:
4536#ifdef FEAT_SYN_HL
4537 case NFA_ZOPEN:
4538 case NFA_ZOPEN1:
4539 case NFA_ZOPEN2:
4540 case NFA_ZOPEN3:
4541 case NFA_ZOPEN4:
4542 case NFA_ZOPEN5:
4543 case NFA_ZOPEN6:
4544 case NFA_ZOPEN7:
4545 case NFA_ZOPEN8:
4546 case NFA_ZOPEN9:
4547#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004548 case NFA_NOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004549 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004550 // These nodes need to be added so that we can bail out when it
4551 // was added to this list before at the same position to avoid an
4552 // endless loop for "\(\)*"
Bram Moolenaar307aa162013-06-02 16:34:21 +02004553
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004554 default:
Bram Moolenaar272fb582013-11-21 16:03:40 +01004555 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004556 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004557 // This state is already in the list, don't add it again,
4558 // unless it is an MOPEN that is used for a backreference or
4559 // when there is a PIM. For NFA_MATCH check the position,
4560 // lower position is preferred.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004561 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
Bram Moolenaar9c235062014-05-13 16:44:29 +02004562 && state->c != NFA_MATCH)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004563 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004564 // When called from addstate_here() do insert before
4565 // existing states.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004566 if (add_here)
4567 {
4568 for (k = 0; k < l->n && k < listindex; ++k)
4569 if (l->t[k].state->id == state->id)
4570 {
4571 found = TRUE;
4572 break;
4573 }
4574 }
4575 if (!add_here || found)
4576 {
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004577skip_add:
4578#ifdef ENABLE_LOG
Bram Moolenaar16b35782016-09-09 20:29:50 +02004579 nfa_set_code(state->c);
4580 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4581 abs(state->id), l->id, state->c, code,
4582 pim == NULL ? "NULL" : "yes", l->has_pim, found);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004583#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004584 --depth;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004585 return subs;
4586 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004587 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004588
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004589 // Do not add the state again when it exists with the same
4590 // positions.
Bram Moolenaar69b52452013-07-17 21:10:51 +02004591 if (has_state_with_pos(l, state, subs, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004592 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004593 }
4594
Bram Moolenaar688b3982019-02-13 21:47:36 +01004595 // When there are backreferences or PIMs the number of states may
4596 // be (a lot) bigger than anticipated.
Bram Moolenaara0169122013-06-26 18:16:58 +02004597 if (l->n == l->len)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004598 {
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004599 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004600 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004601 nfa_thread_T *newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004602
Bram Moolenaar688b3982019-02-13 21:47:36 +01004603 if ((long)(newsize >> 10) >= p_mmp)
4604 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004605 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004606 --depth;
4607 return NULL;
4608 }
Bram Moolenaard05bf562013-06-30 23:24:08 +02004609 if (subs != &temp_subs)
4610 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004611 // "subs" may point into the current array, need to make a
4612 // copy before it becomes invalid.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004613 copy_sub(&temp_subs.norm, &subs->norm);
4614#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004615 if (rex.nfa_has_zsubexpr)
Bram Moolenaard05bf562013-06-30 23:24:08 +02004616 copy_sub(&temp_subs.synt, &subs->synt);
4617#endif
4618 subs = &temp_subs;
4619 }
4620
Bram Moolenaar688b3982019-02-13 21:47:36 +01004621 newt = vim_realloc(l->t, newsize);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004622 if (newt == NULL)
4623 {
4624 // out of memory
4625 --depth;
4626 return NULL;
4627 }
4628 l->t = newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004629 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004630 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004631
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004632 // add the state to the list
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02004633 state->lastlist[nfa_ll_index] = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004634 thread = &l->t[l->n++];
4635 thread->state = state;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004636 if (pim == NULL)
4637 thread->pim.result = NFA_PIM_UNUSED;
4638 else
Bram Moolenaar196ed142013-07-21 18:59:24 +02004639 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004640 copy_pim(&thread->pim, pim);
Bram Moolenaar196ed142013-07-21 18:59:24 +02004641 l->has_pim = TRUE;
4642 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004643 copy_sub(&thread->subs.norm, &subs->norm);
4644#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004645 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004646 copy_sub(&thread->subs.synt, &subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004647#endif
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004648#ifdef ENABLE_LOG
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004649 report_state("Adding", &thread->subs.norm, state, l->id, pim);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004650 did_print = TRUE;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004651#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004652 }
4653
4654#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004655 if (!did_print)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004656 report_state("Processing", &subs->norm, state, l->id, pim);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004657#endif
4658 switch (state->c)
4659 {
4660 case NFA_MATCH:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004661 break;
4662
4663 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004664 // order matters here
Bram Moolenaar16b35782016-09-09 20:29:50 +02004665 subs = addstate(l, state->out, subs, pim, off_arg);
4666 subs = addstate(l, state->out1, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004667 break;
4668
Bram Moolenaar699c1202013-09-25 16:41:54 +02004669 case NFA_EMPTY:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004670 case NFA_NOPEN:
4671 case NFA_NCLOSE:
Bram Moolenaar16b35782016-09-09 20:29:50 +02004672 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004673 break;
4674
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004675 case NFA_MOPEN:
4676 case NFA_MOPEN1:
4677 case NFA_MOPEN2:
4678 case NFA_MOPEN3:
4679 case NFA_MOPEN4:
4680 case NFA_MOPEN5:
4681 case NFA_MOPEN6:
4682 case NFA_MOPEN7:
4683 case NFA_MOPEN8:
4684 case NFA_MOPEN9:
4685#ifdef FEAT_SYN_HL
4686 case NFA_ZOPEN:
4687 case NFA_ZOPEN1:
4688 case NFA_ZOPEN2:
4689 case NFA_ZOPEN3:
4690 case NFA_ZOPEN4:
4691 case NFA_ZOPEN5:
4692 case NFA_ZOPEN6:
4693 case NFA_ZOPEN7:
4694 case NFA_ZOPEN8:
4695 case NFA_ZOPEN9:
4696#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004697 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004698 if (state->c == NFA_ZSTART)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004699 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004700 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004701 sub = &subs->norm;
4702 }
4703#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004704 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004705 {
4706 subidx = state->c - NFA_ZOPEN;
4707 sub = &subs->synt;
4708 }
4709#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004710 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004711 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004712 subidx = state->c - NFA_MOPEN;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004713 sub = &subs->norm;
4714 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004715
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004716 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004717 save_ptr = NULL;
Bram Moolenaara80faa82020-04-12 19:37:17 +02004718 CLEAR_FIELD(save_multipos);
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004719
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004720 // Set the position (with "off" added) in the subexpression. Save
4721 // and restore it when it was in use. Otherwise fill any gap.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004722 if (REG_MULTI)
4723 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004724 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004725 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004726 save_multipos = sub->list.multi[subidx];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004727 save_in_use = -1;
4728 }
4729 else
4730 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004731 save_in_use = sub->in_use;
4732 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004733 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004734 sub->list.multi[i].start_lnum = -1;
4735 sub->list.multi[i].end_lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004736 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004737 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004738 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02004739 if (off == -1)
4740 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004741 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004742 sub->list.multi[subidx].start_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004743 }
4744 else
4745 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004746 sub->list.multi[subidx].start_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004747 sub->list.multi[subidx].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004748 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004749 }
Bram Moolenaarc2b717e2015-09-29 15:06:14 +02004750 sub->list.multi[subidx].end_lnum = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004751 }
4752 else
4753 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004754 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004755 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004756 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004757 save_in_use = -1;
4758 }
4759 else
4760 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004761 save_in_use = sub->in_use;
4762 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004763 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004764 sub->list.line[i].start = NULL;
4765 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004766 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004767 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004768 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02004769 sub->list.line[subidx].start = rex.input + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004770 }
4771
Bram Moolenaar16b35782016-09-09 20:29:50 +02004772 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004773 if (subs == NULL)
4774 break;
4775 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004776#ifdef FEAT_SYN_HL
4777 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4778 sub = &subs->synt;
4779 else
4780#endif
4781 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004782
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004783 if (save_in_use == -1)
4784 {
4785 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004786 sub->list.multi[subidx] = save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004787 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004788 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004789 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004790 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02004791 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004792 break;
4793
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004794 case NFA_MCLOSE:
Bram Moolenaar0270f382018-07-17 05:43:58 +02004795 if (rex.nfa_has_zend && (REG_MULTI
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004796 ? subs->norm.list.multi[0].end_lnum >= 0
Bram Moolenaar9be44812013-09-05 21:15:44 +02004797 : subs->norm.list.line[0].end != NULL))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004798 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004799 // Do not overwrite the position set by \ze.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004800 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004801 break;
4802 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004803 // FALLTHROUGH
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004804 case NFA_MCLOSE1:
4805 case NFA_MCLOSE2:
4806 case NFA_MCLOSE3:
4807 case NFA_MCLOSE4:
4808 case NFA_MCLOSE5:
4809 case NFA_MCLOSE6:
4810 case NFA_MCLOSE7:
4811 case NFA_MCLOSE8:
4812 case NFA_MCLOSE9:
4813#ifdef FEAT_SYN_HL
4814 case NFA_ZCLOSE:
4815 case NFA_ZCLOSE1:
4816 case NFA_ZCLOSE2:
4817 case NFA_ZCLOSE3:
4818 case NFA_ZCLOSE4:
4819 case NFA_ZCLOSE5:
4820 case NFA_ZCLOSE6:
4821 case NFA_ZCLOSE7:
4822 case NFA_ZCLOSE8:
4823 case NFA_ZCLOSE9:
4824#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004825 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004826 if (state->c == NFA_ZEND)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004827 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004828 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004829 sub = &subs->norm;
4830 }
4831#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004832 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004833 {
4834 subidx = state->c - NFA_ZCLOSE;
4835 sub = &subs->synt;
4836 }
4837#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004838 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004839 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004840 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004841 sub = &subs->norm;
4842 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004843
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004844 // We don't fill in gaps here, there must have been an MOPEN that
4845 // has done that.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004846 save_in_use = sub->in_use;
4847 if (sub->in_use <= subidx)
4848 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004849 if (REG_MULTI)
4850 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004851 save_multipos = sub->list.multi[subidx];
Bram Moolenaar35b23862013-05-22 23:00:40 +02004852 if (off == -1)
4853 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004854 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004855 sub->list.multi[subidx].end_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004856 }
4857 else
4858 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004859 sub->list.multi[subidx].end_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004860 sub->list.multi[subidx].end_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004861 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004862 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004863 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004864 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004865 }
4866 else
4867 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004868 save_ptr = sub->list.line[subidx].end;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004869 sub->list.line[subidx].end = rex.input + off;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004870 // avoid compiler warnings
Bram Moolenaara80faa82020-04-12 19:37:17 +02004871 CLEAR_FIELD(save_multipos);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004872 }
4873
Bram Moolenaar16b35782016-09-09 20:29:50 +02004874 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004875 if (subs == NULL)
4876 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004877 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004878#ifdef FEAT_SYN_HL
4879 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4880 sub = &subs->synt;
4881 else
4882#endif
4883 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004884
4885 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004886 sub->list.multi[subidx] = save_multipos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004887 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004888 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004889 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004890 break;
4891 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004892 --depth;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004893 return subs;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004894}
4895
4896/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02004897 * Like addstate(), but the new state(s) are put at position "*ip".
4898 * Used for zero-width matches, next state to use is the added one.
4899 * This makes sure the order of states to be tried does not change, which
4900 * matters for alternatives.
4901 */
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004902 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004903addstate_here(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004904 nfa_list_T *l, // runtime state list
4905 nfa_state_T *state, // state to update
4906 regsubs_T *subs, // pointers to subexpressions
4907 nfa_pim_T *pim, // postponed look-behind match
Bram Moolenaar05540972016-01-30 20:31:25 +01004908 int *ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004909{
4910 int tlen = l->n;
4911 int count;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004912 int listidx = *ip;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004913 regsubs_T *r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004914
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004915 // First add the state(s) at the end, so that we know how many there are.
4916 // Pass the listidx as offset (avoids adding another argument to
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00004917 // addstate()).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004918 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4919 if (r == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004920 return NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004921
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004922 // when "*ip" was at the end of the list, nothing to do
Bram Moolenaara2d95102013-06-04 14:23:05 +02004923 if (listidx + 1 == tlen)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004924 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004925
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004926 // re-order to put the new state at the current position
Bram Moolenaar4b417062013-05-25 20:19:50 +02004927 count = l->n - tlen;
Bram Moolenaara50d02d2013-06-16 15:43:50 +02004928 if (count == 0)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004929 return r; // no state got added
Bram Moolenaar428e9872013-05-30 17:05:39 +02004930 if (count == 1)
4931 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004932 // overwrite the current state
Bram Moolenaara2d95102013-06-04 14:23:05 +02004933 l->t[listidx] = l->t[l->n - 1];
Bram Moolenaar428e9872013-05-30 17:05:39 +02004934 }
4935 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004936 {
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004937 if (l->n + count - 1 >= l->len)
4938 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004939 // not enough space to move the new states, reallocate the list
4940 // and move the states to the right position
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004941 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004942 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004943 nfa_thread_T *newl;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004944
Bram Moolenaar688b3982019-02-13 21:47:36 +01004945 if ((long)(newsize >> 10) >= p_mmp)
4946 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004947 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004948 return NULL;
4949 }
Bram Moolenaarc799fe22019-05-28 23:08:19 +02004950 newl = alloc(newsize);
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004951 if (newl == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004952 return NULL;
4953 l->len = newlen;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004954 mch_memmove(&(newl[0]),
4955 &(l->t[0]),
4956 sizeof(nfa_thread_T) * listidx);
4957 mch_memmove(&(newl[listidx]),
4958 &(l->t[l->n - count]),
4959 sizeof(nfa_thread_T) * count);
4960 mch_memmove(&(newl[listidx + count]),
4961 &(l->t[listidx + 1]),
4962 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
4963 vim_free(l->t);
4964 l->t = newl;
4965 }
4966 else
4967 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004968 // make space for new states, then move them from the
4969 // end to the current position
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004970 mch_memmove(&(l->t[listidx + count]),
4971 &(l->t[listidx + 1]),
4972 sizeof(nfa_thread_T) * (l->n - listidx - 1));
4973 mch_memmove(&(l->t[listidx]),
4974 &(l->t[l->n - 1]),
4975 sizeof(nfa_thread_T) * count);
4976 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02004977 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02004978 --l->n;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004979 *ip = listidx - 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004980
4981 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004982}
4983
4984/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004985 * Check character class "class" against current character c.
4986 */
4987 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004988check_char_class(int class, int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004989{
4990 switch (class)
4991 {
4992 case NFA_CLASS_ALNUM:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02004993 if (c >= 1 && c < 128 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004994 return OK;
4995 break;
4996 case NFA_CLASS_ALPHA:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02004997 if (c >= 1 && c < 128 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004998 return OK;
4999 break;
5000 case NFA_CLASS_BLANK:
5001 if (c == ' ' || c == '\t')
5002 return OK;
5003 break;
5004 case NFA_CLASS_CNTRL:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005005 if (c >= 1 && c <= 127 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005006 return OK;
5007 break;
5008 case NFA_CLASS_DIGIT:
5009 if (VIM_ISDIGIT(c))
5010 return OK;
5011 break;
5012 case NFA_CLASS_GRAPH:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005013 if (c >= 1 && c <= 127 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005014 return OK;
5015 break;
5016 case NFA_CLASS_LOWER:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005017 if (MB_ISLOWER(c) && c != 170 && c != 186)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005018 return OK;
5019 break;
5020 case NFA_CLASS_PRINT:
5021 if (vim_isprintc(c))
5022 return OK;
5023 break;
5024 case NFA_CLASS_PUNCT:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005025 if (c >= 1 && c < 128 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005026 return OK;
5027 break;
5028 case NFA_CLASS_SPACE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005029 if ((c >= 9 && c <= 13) || (c == ' '))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005030 return OK;
5031 break;
5032 case NFA_CLASS_UPPER:
5033 if (MB_ISUPPER(c))
5034 return OK;
5035 break;
5036 case NFA_CLASS_XDIGIT:
5037 if (vim_isxdigit(c))
5038 return OK;
5039 break;
5040 case NFA_CLASS_TAB:
5041 if (c == '\t')
5042 return OK;
5043 break;
5044 case NFA_CLASS_RETURN:
5045 if (c == '\r')
5046 return OK;
5047 break;
5048 case NFA_CLASS_BACKSPACE:
5049 if (c == '\b')
5050 return OK;
5051 break;
5052 case NFA_CLASS_ESCAPE:
5053 if (c == '\033')
5054 return OK;
5055 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01005056 case NFA_CLASS_IDENT:
5057 if (vim_isIDc(c))
5058 return OK;
5059 break;
5060 case NFA_CLASS_KEYWORD:
5061 if (reg_iswordc(c))
5062 return OK;
5063 break;
5064 case NFA_CLASS_FNAME:
5065 if (vim_isfilec(c))
5066 return OK;
5067 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005068
5069 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005070 // should not be here :P
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00005071 siemsg(_(e_nfa_regexp_invalid_character_class_nr), class);
Bram Moolenaar417bad22013-06-07 14:08:30 +02005072 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005073 }
5074 return FAIL;
5075}
5076
Bram Moolenaar5714b802013-05-28 22:03:20 +02005077/*
5078 * Check for a match with subexpression "subidx".
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005079 * Return TRUE if it matches.
Bram Moolenaar5714b802013-05-28 22:03:20 +02005080 */
5081 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005082match_backref(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005083 regsub_T *sub, // pointers to subexpressions
Bram Moolenaar05540972016-01-30 20:31:25 +01005084 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005085 int *bytelen) // out: length of match in bytes
Bram Moolenaar5714b802013-05-28 22:03:20 +02005086{
5087 int len;
5088
5089 if (sub->in_use <= subidx)
5090 {
5091retempty:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005092 // backref was not set, match an empty string
Bram Moolenaar5714b802013-05-28 22:03:20 +02005093 *bytelen = 0;
5094 return TRUE;
5095 }
5096
5097 if (REG_MULTI)
5098 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005099 if (sub->list.multi[subidx].start_lnum < 0
5100 || sub->list.multi[subidx].end_lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005101 goto retempty;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005102 if (sub->list.multi[subidx].start_lnum == rex.lnum
5103 && sub->list.multi[subidx].end_lnum == rex.lnum)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005104 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005105 len = sub->list.multi[subidx].end_col
5106 - sub->list.multi[subidx].start_col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005107 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5108 rex.input, &len) == 0)
Bram Moolenaar580abea2013-06-14 20:31:28 +02005109 {
5110 *bytelen = len;
5111 return TRUE;
5112 }
5113 }
5114 else
5115 {
5116 if (match_with_backref(
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005117 sub->list.multi[subidx].start_lnum,
5118 sub->list.multi[subidx].start_col,
5119 sub->list.multi[subidx].end_lnum,
5120 sub->list.multi[subidx].end_col,
Bram Moolenaar580abea2013-06-14 20:31:28 +02005121 bytelen) == RA_MATCH)
5122 return TRUE;
Bram Moolenaar5714b802013-05-28 22:03:20 +02005123 }
5124 }
5125 else
5126 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005127 if (sub->list.line[subidx].start == NULL
5128 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005129 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005130 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005131 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005132 {
5133 *bytelen = len;
5134 return TRUE;
5135 }
5136 }
5137 return FALSE;
5138}
5139
#ifdef FEAT_SYN_HL

/*
 * Try to match the text of external \z subexpression "subidx", taken from
 * "re_extmatch_in", at the current input position.
 * When there is no such text the empty string matches.
 * Returns TRUE when it matches, with the length of the match in bytes stored
 * in "*bytelen".  Returns FALSE otherwise.
 */
    static int
match_zref(
    int	subidx,
    int	*bytelen)   // out: length of match in bytes
{
    int	len;

    cleanup_zsubexpr();

    if (re_extmatch_in != NULL && re_extmatch_in->matches[subidx] != NULL)
    {
	len = (int)STRLEN(re_extmatch_in->matches[subidx]);
	if (cstrncmp(re_extmatch_in->matches[subidx], rex.input, &len) != 0)
	    return FALSE;
	// "len" was passed by reference, report the value it has now
	*bytelen = len;
	return TRUE;
    }

    // backref was not set, match an empty string
    *bytelen = 0;
    return TRUE;
}
#endif
5170
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005171/*
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005172 * Save list IDs for all NFA states of "prog" into "list".
5173 * Also reset the IDs to zero.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005174 * Only used for the recursive value lastlist[1].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005175 */
5176 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005177nfa_save_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005178{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005179 int i;
5180 nfa_state_T *p;
5181
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005182 // Order in the list is reverse, it's a bit faster that way.
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005183 p = &prog->state[0];
5184 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005185 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005186 list[i] = p->lastlist[1];
5187 p->lastlist[1] = 0;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005188 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005189 }
5190}
5191
5192/*
5193 * Restore list IDs from "list" to all NFA states.
5194 */
5195 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005196nfa_restore_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005197{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005198 int i;
5199 nfa_state_T *p;
5200
5201 p = &prog->state[0];
5202 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005203 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005204 p->lastlist[1] = list[i];
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005205 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005206 }
5207}
5208
Bram Moolenaar423532e2013-05-29 21:14:42 +02005209 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005210nfa_re_num_cmp(long_u val, int op, long_u pos)
Bram Moolenaar423532e2013-05-29 21:14:42 +02005211{
5212 if (op == 1) return pos > val;
5213 if (op == 2) return pos < val;
5214 return val == pos;
5215}
5216
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01005217static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02005218
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005219/*
Bram Moolenaarf46da702013-06-02 22:37:42 +02005220 * Recursively call nfa_regmatch()
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005221 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5222 * position).
Bram Moolenaarf46da702013-06-02 22:37:42 +02005223 */
5224 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005225recursive_regmatch(
5226 nfa_state_T *state,
5227 nfa_pim_T *pim,
5228 nfa_regprog_T *prog,
5229 regsubs_T *submatch,
5230 regsubs_T *m,
Bram Moolenaar2338c322018-07-08 19:07:19 +02005231 int **listids,
5232 int *listids_len)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005233{
Bram Moolenaar0270f382018-07-17 05:43:58 +02005234 int save_reginput_col = (int)(rex.input - rex.line);
5235 int save_reglnum = rex.lnum;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005236 int save_nfa_match = nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005237 int save_nfa_listid = rex.nfa_listid;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005238 save_se_T *save_nfa_endp = nfa_endp;
5239 save_se_T endpos;
5240 save_se_T *endposp = NULL;
5241 int result;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005242 int need_restore = FALSE;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005243
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005244 if (pim != NULL)
5245 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005246 // start at the position where the postponed match was
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005247 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005248 rex.input = rex.line + pim->end.pos.col;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005249 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005250 rex.input = pim->end.ptr;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005251 }
5252
Bram Moolenaardecd9542013-06-07 16:31:50 +02005253 if (state->c == NFA_START_INVISIBLE_BEFORE
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01005254 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5255 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5256 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005257 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005258 // The recursive match must end at the current position. When "pim" is
5259 // not NULL it specifies the current position.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005260 endposp = &endpos;
5261 if (REG_MULTI)
5262 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005263 if (pim == NULL)
5264 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005265 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5266 endpos.se_u.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005267 }
5268 else
5269 endpos.se_u.pos = pim->end.pos;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005270 }
5271 else
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005272 {
5273 if (pim == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005274 endpos.se_u.ptr = rex.input;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005275 else
5276 endpos.se_u.ptr = pim->end.ptr;
5277 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005278
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005279 // Go back the specified number of bytes, or as far as the
5280 // start of the previous line, to try matching "\@<=" or
5281 // not matching "\@<!". This is very inefficient, limit the number of
5282 // bytes if possible.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005283 if (state->val <= 0)
5284 {
5285 if (REG_MULTI)
5286 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005287 rex.line = reg_getline(--rex.lnum);
5288 if (rex.line == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005289 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005290 rex.line = reg_getline(++rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005291 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005292 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005293 }
5294 else
5295 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005296 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005297 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005298 // Not enough bytes in this line, go to end of
5299 // previous line.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005300 rex.line = reg_getline(--rex.lnum);
5301 if (rex.line == NULL)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005302 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005303 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005304 rex.line = reg_getline(++rex.lnum);
5305 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005306 }
5307 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005308 rex.input = rex.line + STRLEN(rex.line);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005309 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005310 if ((int)(rex.input - rex.line) >= state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005311 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005312 rex.input -= state->val;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005313 if (has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005314 rex.input -= mb_head_off(rex.line, rex.input);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005315 }
5316 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005317 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005318 }
5319 }
5320
Bram Moolenaarf46da702013-06-02 22:37:42 +02005321#ifdef ENABLE_LOG
5322 if (log_fd != stderr)
5323 fclose(log_fd);
5324 log_fd = NULL;
5325#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005326 // Have to clear the lastlist field of the NFA nodes, so that
5327 // nfa_regmatch() and addstate() can run properly after recursion.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005328 if (nfa_ll_index == 1)
5329 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005330 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5331 // values and clear them.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005332 if (*listids == NULL || *listids_len < prog->nstate)
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005333 {
Bram Moolenaar2338c322018-07-08 19:07:19 +02005334 vim_free(*listids);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005335 *listids = ALLOC_MULT(int, prog->nstate);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005336 if (*listids == NULL)
5337 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00005338 emsg(_(e_nfa_regexp_could_not_allocate_memory_for_branch_traversal));
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005339 return 0;
5340 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005341 *listids_len = prog->nstate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005342 }
5343 nfa_save_listids(prog, *listids);
5344 need_restore = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005345 // any value of rex.nfa_listid will do
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005346 }
5347 else
5348 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005349 // First recursive nfa_regmatch() call, switch to the second lastlist
5350 // entry. Make sure rex.nfa_listid is different from a previous
5351 // recursive call, because some states may still have this ID.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005352 ++nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005353 if (rex.nfa_listid <= rex.nfa_alt_listid)
5354 rex.nfa_listid = rex.nfa_alt_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005355 }
5356
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005357 // Call nfa_regmatch() to check if the current concat matches at this
5358 // position. The concat ends with the node NFA_END_INVISIBLE
Bram Moolenaarf46da702013-06-02 22:37:42 +02005359 nfa_endp = endposp;
5360 result = nfa_regmatch(prog, state->out, submatch, m);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005361
5362 if (need_restore)
5363 nfa_restore_listids(prog, *listids);
5364 else
5365 {
5366 --nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005367 rex.nfa_alt_listid = rex.nfa_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005368 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005369
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005370 // restore position in input text
Bram Moolenaar0270f382018-07-17 05:43:58 +02005371 rex.lnum = save_reglnum;
Bram Moolenaar484d2412013-06-13 19:47:07 +02005372 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005373 rex.line = reg_getline(rex.lnum);
5374 rex.input = rex.line + save_reginput_col;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005375 if (result != NFA_TOO_EXPENSIVE)
5376 {
5377 nfa_match = save_nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005378 rex.nfa_listid = save_nfa_listid;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005379 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005380 nfa_endp = save_nfa_endp;
5381
5382#ifdef ENABLE_LOG
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005383 open_debug_log(result);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005384#endif
5385
5386 return result;
5387}
5388
Bram Moolenaara2d95102013-06-04 14:23:05 +02005389/*
5390 * Estimate the chance of a match with "state" failing.
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005391 * empty match: 0
Bram Moolenaara2d95102013-06-04 14:23:05 +02005392 * NFA_ANY: 1
5393 * specific character: 99
5394 */
5395 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005396failure_chance(nfa_state_T *state, int depth)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005397{
5398 int c = state->c;
5399 int l, r;
5400
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005401 // detect looping
Bram Moolenaara2d95102013-06-04 14:23:05 +02005402 if (depth > 4)
5403 return 1;
5404
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005405 switch (c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005406 {
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005407 case NFA_SPLIT:
5408 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005409 // avoid recursive stuff
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005410 return 1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005411 // two alternatives, use the lowest failure chance
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005412 l = failure_chance(state->out, depth + 1);
5413 r = failure_chance(state->out1, depth + 1);
5414 return l < r ? l : r;
5415
5416 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005417 // matches anything, unlikely to fail
Bram Moolenaara2d95102013-06-04 14:23:05 +02005418 return 1;
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005419
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005420 case NFA_MATCH:
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005421 case NFA_MCLOSE:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005422 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005423 // empty match works always
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005424 return 0;
5425
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005426 case NFA_START_INVISIBLE:
5427 case NFA_START_INVISIBLE_FIRST:
5428 case NFA_START_INVISIBLE_NEG:
5429 case NFA_START_INVISIBLE_NEG_FIRST:
5430 case NFA_START_INVISIBLE_BEFORE:
5431 case NFA_START_INVISIBLE_BEFORE_FIRST:
5432 case NFA_START_INVISIBLE_BEFORE_NEG:
5433 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5434 case NFA_START_PATTERN:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005435 // recursive regmatch is expensive, use low failure chance
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005436 return 5;
5437
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005438 case NFA_BOL:
5439 case NFA_EOL:
5440 case NFA_BOF:
5441 case NFA_EOF:
5442 case NFA_NEWL:
5443 return 99;
5444
5445 case NFA_BOW:
5446 case NFA_EOW:
5447 return 90;
5448
5449 case NFA_MOPEN:
5450 case NFA_MOPEN1:
5451 case NFA_MOPEN2:
5452 case NFA_MOPEN3:
5453 case NFA_MOPEN4:
5454 case NFA_MOPEN5:
5455 case NFA_MOPEN6:
5456 case NFA_MOPEN7:
5457 case NFA_MOPEN8:
5458 case NFA_MOPEN9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005459#ifdef FEAT_SYN_HL
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005460 case NFA_ZOPEN:
5461 case NFA_ZOPEN1:
5462 case NFA_ZOPEN2:
5463 case NFA_ZOPEN3:
5464 case NFA_ZOPEN4:
5465 case NFA_ZOPEN5:
5466 case NFA_ZOPEN6:
5467 case NFA_ZOPEN7:
5468 case NFA_ZOPEN8:
5469 case NFA_ZOPEN9:
5470 case NFA_ZCLOSE:
5471 case NFA_ZCLOSE1:
5472 case NFA_ZCLOSE2:
5473 case NFA_ZCLOSE3:
5474 case NFA_ZCLOSE4:
5475 case NFA_ZCLOSE5:
5476 case NFA_ZCLOSE6:
5477 case NFA_ZCLOSE7:
5478 case NFA_ZCLOSE8:
5479 case NFA_ZCLOSE9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005480#endif
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005481 case NFA_NOPEN:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005482 case NFA_MCLOSE1:
5483 case NFA_MCLOSE2:
5484 case NFA_MCLOSE3:
5485 case NFA_MCLOSE4:
5486 case NFA_MCLOSE5:
5487 case NFA_MCLOSE6:
5488 case NFA_MCLOSE7:
5489 case NFA_MCLOSE8:
5490 case NFA_MCLOSE9:
5491 case NFA_NCLOSE:
5492 return failure_chance(state->out, depth + 1);
5493
5494 case NFA_BACKREF1:
5495 case NFA_BACKREF2:
5496 case NFA_BACKREF3:
5497 case NFA_BACKREF4:
5498 case NFA_BACKREF5:
5499 case NFA_BACKREF6:
5500 case NFA_BACKREF7:
5501 case NFA_BACKREF8:
5502 case NFA_BACKREF9:
5503#ifdef FEAT_SYN_HL
5504 case NFA_ZREF1:
5505 case NFA_ZREF2:
5506 case NFA_ZREF3:
5507 case NFA_ZREF4:
5508 case NFA_ZREF5:
5509 case NFA_ZREF6:
5510 case NFA_ZREF7:
5511 case NFA_ZREF8:
5512 case NFA_ZREF9:
5513#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005514 // backreferences don't match in many places
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005515 return 94;
5516
5517 case NFA_LNUM_GT:
5518 case NFA_LNUM_LT:
5519 case NFA_COL_GT:
5520 case NFA_COL_LT:
5521 case NFA_VCOL_GT:
5522 case NFA_VCOL_LT:
5523 case NFA_MARK_GT:
5524 case NFA_MARK_LT:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005525 case NFA_VISUAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005526 // before/after positions don't match very often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005527 return 85;
5528
5529 case NFA_LNUM:
5530 return 90;
5531
5532 case NFA_CURSOR:
5533 case NFA_COL:
5534 case NFA_VCOL:
5535 case NFA_MARK:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005536 // specific positions rarely match
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005537 return 98;
5538
5539 case NFA_COMPOSING:
5540 return 95;
5541
5542 default:
5543 if (c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005544 // character match fails often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005545 return 95;
5546 }
5547
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005548 // something else, includes character classes
Bram Moolenaara2d95102013-06-04 14:23:05 +02005549 return 50;
5550}
5551
Bram Moolenaarf46da702013-06-02 22:37:42 +02005552/*
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005553 * Skip until the char "c" we know a match must start with.
5554 */
5555 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005556skip_to_start(int c, colnr_T *colp)
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005557{
5558 char_u *s;
5559
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005560 // Used often, do some work to avoid call overhead.
Bram Moolenaara12a1612019-01-24 16:39:02 +01005561 if (!rex.reg_ic && !has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005562 s = vim_strbyte(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005563 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005564 s = cstrchr(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005565 if (s == NULL)
5566 return FAIL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005567 *colp = (int)(s - rex.line);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005568 return OK;
5569}
5570
5571/*
Bram Moolenaar473de612013-06-08 18:19:48 +02005572 * Check for a match with match_text.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02005573 * Called after skip_to_start() has found regstart.
Bram Moolenaar473de612013-06-08 18:19:48 +02005574 * Returns zero for no match, 1 for a match.
5575 */
5576 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01005577find_match_text(colnr_T startcol, int regstart, char_u *match_text)
Bram Moolenaar473de612013-06-08 18:19:48 +02005578{
5579 colnr_T col = startcol;
5580 int c1, c2;
5581 int len1, len2;
5582 int match;
5583
5584 for (;;)
5585 {
5586 match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005587 len2 = MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005588 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5589 {
5590 c1 = PTR2CHAR(match_text + len1);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005591 c2 = PTR2CHAR(rex.line + col + len2);
Bram Moolenaar59de4172020-06-09 19:34:54 +02005592 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
Bram Moolenaar473de612013-06-08 18:19:48 +02005593 {
5594 match = FALSE;
5595 break;
5596 }
Bram Moolenaar65b60562021-09-07 19:26:53 +02005597 len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5598 : MB_CHAR2LEN(c2);
Bram Moolenaar473de612013-06-08 18:19:48 +02005599 }
5600 if (match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005601 // check that no composing char follows
Bram Moolenaar473de612013-06-08 18:19:48 +02005602 && !(enc_utf8
Bram Moolenaara12a1612019-01-24 16:39:02 +01005603 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
Bram Moolenaar473de612013-06-08 18:19:48 +02005604 {
5605 cleanup_subexpr();
5606 if (REG_MULTI)
5607 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005608 rex.reg_startpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005609 rex.reg_startpos[0].col = col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005610 rex.reg_endpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005611 rex.reg_endpos[0].col = col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005612 }
5613 else
5614 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005615 rex.reg_startp[0] = rex.line + col;
5616 rex.reg_endp[0] = rex.line + col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005617 }
5618 return 1L;
5619 }
5620
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005621 // Try finding regstart after the current match.
5622 col += MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005623 if (skip_to_start(regstart, &col) == FAIL)
5624 break;
5625 }
5626 return 0L;
5627}
5628
#ifdef FEAT_RELTIME
/*
 * Check whether the time limit for matching has passed.
 * Returns TRUE when "nfa_time_limit" is set and has been passed; in that
 * case "*nfa_timed_out" is set to TRUE, unless "nfa_timed_out" is NULL.
 * Returns FALSE otherwise.
 */
    static int
nfa_did_time_out(void)
{
    if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit))
    {
        if (nfa_timed_out != NULL)
            *nfa_timed_out = TRUE;
        return TRUE;
    }
    return FALSE;
}
#endif
5642
Bram Moolenaar473de612013-06-08 18:19:48 +02005643/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005644 * Main matching routine.
5645 *
Bram Moolenaar0270f382018-07-17 05:43:58 +02005646 * Run NFA to determine whether it matches rex.input.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005647 *
Bram Moolenaar307aa162013-06-02 16:34:21 +02005648 * When "nfa_endp" is not NULL it is a required end-of-match position.
Bram Moolenaar61602c52013-06-01 19:54:43 +02005649 *
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005650 * Return TRUE if there is a match, FALSE if there is no match,
5651 * NFA_TOO_EXPENSIVE if we end up with too many states.
Bram Moolenaarf2118842013-09-25 18:16:38 +02005652 * When there is a match "submatch" contains the positions.
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005653 *
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005654 * Note: Caller must ensure that: start != NULL.
5655 */
5656 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005657nfa_regmatch(
5658 nfa_regprog_T *prog,
5659 nfa_state_T *start,
5660 regsubs_T *submatch,
5661 regsubs_T *m)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005662{
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005663 int result = FALSE;
Bram Moolenaaraaf30472015-01-27 14:40:00 +01005664 size_t size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005665 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005666 int go_to_nextline = FALSE;
5667 nfa_thread_T *t;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005668 nfa_list_T list[2];
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005669 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005670 nfa_list_T *thislist;
5671 nfa_list_T *nextlist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005672 int *listids = NULL;
Bram Moolenaar2338c322018-07-08 19:07:19 +02005673 int listids_len = 0;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005674 nfa_state_T *add_state;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005675 int add_here;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005676 int add_count;
Bram Moolenaar4380d1e2013-06-09 20:51:00 +02005677 int add_off = 0;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005678 int toplevel = start->c == NFA_MOPEN;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005679 regsubs_T *r;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005680#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005681 FILE *debug;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005682#endif
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005683
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005684 // Some patterns may take a long time to match, especially when using
5685 // recursive_regmatch(). Allow interrupting them with CTRL-C.
Bram Moolenaar41f12052013-08-25 17:01:42 +02005686 fast_breakcheck();
5687 if (got_int)
5688 return FALSE;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005689#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005690 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005691 return FALSE;
5692#endif
Bram Moolenaar41f12052013-08-25 17:01:42 +02005693
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005694#ifdef NFA_REGEXP_DEBUG_LOG
5695 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5696 if (debug == NULL)
5697 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005698 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005699 return FALSE;
5700 }
5701#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02005702 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005703
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005704 // Allocate memory for the lists of nodes.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005705 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005706
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005707 list[0].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005708 list[0].len = prog->nstate + 1;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005709 list[1].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005710 list[1].len = prog->nstate + 1;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005711 if (list[0].t == NULL || list[1].t == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005712 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005713
5714#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02005715 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005716 if (log_fd == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005717 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005718 emsg(_(e_log_open_failed));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005719 log_fd = stderr;
5720 }
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00005721 fprintf(log_fd, "**********************************\n");
5722 nfa_set_code(start->c);
5723 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5724 abs(start->id), code);
5725 fprintf(log_fd, "**********************************\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005726#endif
5727
5728 thislist = &list[0];
5729 thislist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005730 thislist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005731 nextlist = &list[1];
5732 nextlist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005733 nextlist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005734#ifdef ENABLE_LOG
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005735 fprintf(log_fd, "(---) STARTSTATE first\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005736#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02005737 thislist->id = rex.nfa_listid + 1;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005738
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005739 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5740 // it's the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005741 if (toplevel)
5742 {
5743 if (REG_MULTI)
5744 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005745 m->norm.list.multi[0].start_lnum = rex.lnum;
5746 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005747 }
5748 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005749 m->norm.list.line[0].start = rex.input;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005750 m->norm.in_use = 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005751 r = addstate(thislist, start->out, m, NULL, 0);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005752 }
5753 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005754 r = addstate(thislist, start, m, NULL, 0);
5755 if (r == NULL)
5756 {
5757 nfa_match = NFA_TOO_EXPENSIVE;
5758 goto theend;
5759 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005760
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005761#define ADD_STATE_IF_MATCH(state) \
5762 if (result) { \
Bram Moolenaara2d95102013-06-04 14:23:05 +02005763 add_state = state->out; \
5764 add_off = clen; \
5765 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005766
5767 /*
5768 * Run for each character.
5769 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02005770 for (;;)
5771 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005772 int curc;
5773 int clen;
5774
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005775 if (has_mbyte)
5776 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005777 curc = (*mb_ptr2char)(rex.input);
5778 clen = (*mb_ptr2len)(rex.input);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005779 }
5780 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005781 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005782 curc = *rex.input;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005783 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005784 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005785 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02005786 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005787 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005788 go_to_nextline = FALSE;
5789 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005790
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005791 // swap lists
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005792 thislist = &list[flag];
5793 nextlist = &list[flag ^= 1];
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005794 nextlist->n = 0; // clear nextlist
Bram Moolenaar196ed142013-07-21 18:59:24 +02005795 nextlist->has_pim = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005796 ++rex.nfa_listid;
Bram Moolenaarbcf94422018-06-23 14:21:42 +02005797 if (prog->re_engine == AUTOMATIC_ENGINE
Bram Moolenaar0270f382018-07-17 05:43:58 +02005798 && (rex.nfa_listid >= NFA_MAX_STATES
Bram Moolenaar5ec74142018-06-23 17:14:41 +02005799# ifdef FEAT_EVAL
5800 || nfa_fail_for_testing
5801# endif
5802 ))
Bram Moolenaarfda37292014-11-05 14:27:36 +01005803 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005804 // too many states, retry with old engine
Bram Moolenaarfda37292014-11-05 14:27:36 +01005805 nfa_match = NFA_TOO_EXPENSIVE;
5806 goto theend;
5807 }
5808
Bram Moolenaar0270f382018-07-17 05:43:58 +02005809 thislist->id = rex.nfa_listid;
5810 nextlist->id = rex.nfa_listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005811
5812#ifdef ENABLE_LOG
5813 fprintf(log_fd, "------------------------------------------\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02005814 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005815 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005816 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005817 {
5818 int i;
5819
5820 for (i = 0; i < thislist->n; i++)
5821 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5822 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005823 fprintf(log_fd, "\n");
5824#endif
5825
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005826#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005827 fprintf(debug, "\n-------------------\n");
5828#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005829 /*
5830 * If the state lists are empty we can stop.
5831 */
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005832 if (thislist->n == 0)
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005833 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005834
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005835 // compute nextlist
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005836 for (listidx = 0; listidx < thislist->n; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005837 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005838 // If the list gets very long there probably is something wrong.
5839 // At least allow interrupting with CTRL-C.
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005840 fast_breakcheck();
5841 if (got_int)
5842 break;
5843#ifdef FEAT_RELTIME
5844 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
5845 {
5846 nfa_time_count = 0;
5847 if (nfa_did_time_out())
5848 break;
5849 }
5850#endif
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005851 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005852
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005853#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005854 nfa_set_code(t->state->c);
5855 fprintf(debug, "%s, ", code);
5856#endif
5857#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005858 {
5859 int col;
5860
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02005861 if (t->subs.norm.in_use <= 0)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005862 col = -1;
5863 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005864 col = t->subs.norm.list.multi[0].start_col;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005865 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005866 col = (int)(t->subs.norm.list.line[0].start - rex.line);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005867 nfa_set_code(t->state->c);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005868 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005869 abs(t->state->id), (int)t->state->c, code, col,
5870 pim_info(&t->pim));
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005871 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005872#endif
5873
5874 /*
5875 * Handle the possible codes of the current state.
5876 * The most important is NFA_MATCH.
5877 */
Bram Moolenaara2d95102013-06-04 14:23:05 +02005878 add_state = NULL;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005879 add_here = FALSE;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005880 add_count = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005881 switch (t->state->c)
5882 {
5883 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005884 {
Bram Moolenaaref2dff52020-12-21 14:54:32 +01005885 // If the match is not at the start of the line, ends before a
5886 // composing characters and rex.reg_icombine is not set, that
5887 // is not really a match.
5888 if (enc_utf8 && !rex.reg_icombine
5889 && rex.input != rex.line && utf_iscomposing(curc))
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005890 break;
Bram Moolenaara12a1612019-01-24 16:39:02 +01005891
Bram Moolenaar963fee22013-05-26 21:47:28 +02005892 nfa_match = TRUE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005893 copy_sub(&submatch->norm, &t->subs.norm);
5894#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005895 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005896 copy_sub(&submatch->synt, &t->subs.synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005897#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005898#ifdef ENABLE_LOG
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005899 log_subsexpr(&t->subs);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005900#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005901 // Found the left-most longest match, do not look at any other
5902 // states at this position. When the list of states is going
5903 // to be empty quit without advancing, so that "rex.input" is
5904 // correct.
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005905 if (nextlist->n == 0)
Bram Moolenaar57a285b2013-05-26 16:57:28 +02005906 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005907 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005908 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005909
5910 case NFA_END_INVISIBLE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005911 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02005912 case NFA_END_PATTERN:
Bram Moolenaarf46da702013-06-02 22:37:42 +02005913 /*
5914 * This is only encountered after a NFA_START_INVISIBLE or
Bram Moolenaar61602c52013-06-01 19:54:43 +02005915 * NFA_START_INVISIBLE_BEFORE node.
5916 * They surround a zero-width group, used with "\@=", "\&",
5917 * "\@!", "\@<=" and "\@<!".
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005918 * If we got here, it means that the current "invisible" group
5919 * finished successfully, so return control to the parent
Bram Moolenaarf46da702013-06-02 22:37:42 +02005920 * nfa_regmatch(). For a look-behind match only when it ends
5921 * in the position in "nfa_endp".
5922 * Submatches are stored in *m, and used in the parent call.
5923 */
Bram Moolenaar61602c52013-06-01 19:54:43 +02005924#ifdef ENABLE_LOG
Bram Moolenaarf46da702013-06-02 22:37:42 +02005925 if (nfa_endp != NULL)
5926 {
5927 if (REG_MULTI)
5928 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005929 (int)rex.lnum,
Bram Moolenaarf46da702013-06-02 22:37:42 +02005930 (int)nfa_endp->se_u.pos.lnum,
Bram Moolenaar0270f382018-07-17 05:43:58 +02005931 (int)(rex.input - rex.line),
Bram Moolenaarf46da702013-06-02 22:37:42 +02005932 nfa_endp->se_u.pos.col);
5933 else
5934 fprintf(log_fd, "Current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005935 (int)(rex.input - rex.line),
5936 (int)(nfa_endp->se_u.ptr - rex.input));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005937 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005938#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005939 // If "nfa_endp" is set it's only a match if it ends at
5940 // "nfa_endp"
Bram Moolenaarf46da702013-06-02 22:37:42 +02005941 if (nfa_endp != NULL && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02005942 ? (rex.lnum != nfa_endp->se_u.pos.lnum
5943 || (int)(rex.input - rex.line)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005944 != nfa_endp->se_u.pos.col)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005945 : rex.input != nfa_endp->se_u.ptr))
Bram Moolenaarf46da702013-06-02 22:37:42 +02005946 break;
5947
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005948 // do not set submatches for \@!
Bram Moolenaardecd9542013-06-07 16:31:50 +02005949 if (t->state->c != NFA_END_INVISIBLE_NEG)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005950 {
5951 copy_sub(&m->norm, &t->subs.norm);
5952#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005953 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005954 copy_sub(&m->synt, &t->subs.synt);
5955#endif
5956 }
Bram Moolenaar87953742013-06-05 18:52:40 +02005957#ifdef ENABLE_LOG
5958 fprintf(log_fd, "Match found:\n");
5959 log_subsexpr(m);
5960#endif
Bram Moolenaarf46da702013-06-02 22:37:42 +02005961 nfa_match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005962 // See comment above at "goto nextchar".
Bram Moolenaar78c93e42013-09-05 16:05:36 +02005963 if (nextlist->n == 0)
5964 clen = 0;
5965 goto nextchar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005966
5967 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005968 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005969 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005970 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar61602c52013-06-01 19:54:43 +02005971 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005972 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005973 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02005974 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005975 {
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005976#ifdef ENABLE_LOG
5977 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
5978 failure_chance(t->state->out, 0),
5979 failure_chance(t->state->out1->out, 0));
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005980#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005981 // Do it directly if there already is a PIM or when
5982 // nfa_postprocess() detected it will work better.
Bram Moolenaara2947e22013-06-11 22:44:09 +02005983 if (t->pim.result != NFA_PIM_UNUSED
5984 || t->state->c == NFA_START_INVISIBLE_FIRST
5985 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
5986 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5987 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005988 {
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02005989 int in_use = m->norm.in_use;
5990
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005991 // Copy submatch info for the recursive call, opposite
5992 // of what happens on success below.
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02005993 copy_sub_off(&m->norm, &t->subs.norm);
Bram Moolenaar699c1202013-09-25 16:41:54 +02005994#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005995 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02005996 copy_sub_off(&m->synt, &t->subs.synt);
5997#endif
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02005998
Bram Moolenaara2d95102013-06-04 14:23:05 +02005999 /*
6000 * First try matching the invisible match, then what
6001 * follows.
6002 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006003 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006004 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006005 if (result == NFA_TOO_EXPENSIVE)
6006 {
6007 nfa_match = result;
6008 goto theend;
6009 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006010
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006011 // for \@! and \@<! it is a match when the result is
6012 // FALSE
Bram Moolenaardecd9542013-06-07 16:31:50 +02006013 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006014 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6015 || t->state->c
6016 == NFA_START_INVISIBLE_BEFORE_NEG
6017 || t->state->c
6018 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006019 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006020 // Copy submatch info from the recursive call
Bram Moolenaara2d95102013-06-04 14:23:05 +02006021 copy_sub_off(&t->subs.norm, &m->norm);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006022#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006023 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006024 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006025#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006026 // If the pattern has \ze and it matched in the
6027 // sub pattern, use it.
Bram Moolenaarf2118842013-09-25 18:16:38 +02006028 copy_ze_off(&t->subs.norm, &m->norm);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02006029
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006030 // t->state->out1 is the corresponding
6031 // END_INVISIBLE node; Add its out to the current
6032 // list (zero-width match).
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006033 add_here = TRUE;
6034 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006035 }
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006036 m->norm.in_use = in_use;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006037 }
6038 else
6039 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006040 nfa_pim_T pim;
6041
Bram Moolenaara2d95102013-06-04 14:23:05 +02006042 /*
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006043 * First try matching what follows. Only if a match
6044 * is found verify the invisible match matches. Add a
6045 * nfa_pim_T to the following states, it contains info
6046 * about the invisible match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006047 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006048 pim.state = t->state;
6049 pim.result = NFA_PIM_TODO;
6050 pim.subs.norm.in_use = 0;
6051#ifdef FEAT_SYN_HL
6052 pim.subs.synt.in_use = 0;
6053#endif
6054 if (REG_MULTI)
6055 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006056 pim.end.pos.col = (int)(rex.input - rex.line);
6057 pim.end.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006058 }
6059 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006060 pim.end.ptr = rex.input;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006061
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006062 // t->state->out1 is the corresponding END_INVISIBLE
6063 // node; Add its out to the current list (zero-width
6064 // match).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006065 if (addstate_here(thislist, t->state->out1->out,
6066 &t->subs, &pim, &listidx) == NULL)
6067 {
6068 nfa_match = NFA_TOO_EXPENSIVE;
6069 goto theend;
6070 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006071 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006072 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006073 break;
6074
Bram Moolenaar87953742013-06-05 18:52:40 +02006075 case NFA_START_PATTERN:
Bram Moolenaar43e02982013-06-07 17:31:29 +02006076 {
6077 nfa_state_T *skip = NULL;
6078#ifdef ENABLE_LOG
6079 int skip_lid = 0;
6080#endif
6081
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006082 // There is no point in trying to match the pattern if the
6083 // output state is not going to be added to the list.
Bram Moolenaar43e02982013-06-07 17:31:29 +02006084 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6085 {
6086 skip = t->state->out1->out;
6087#ifdef ENABLE_LOG
6088 skip_lid = nextlist->id;
6089#endif
6090 }
6091 else if (state_in_list(nextlist,
6092 t->state->out1->out->out, &t->subs))
6093 {
6094 skip = t->state->out1->out->out;
6095#ifdef ENABLE_LOG
6096 skip_lid = nextlist->id;
6097#endif
6098 }
Bram Moolenaar44c71db2013-06-14 22:33:51 +02006099 else if (state_in_list(thislist,
Bram Moolenaar43e02982013-06-07 17:31:29 +02006100 t->state->out1->out->out, &t->subs))
6101 {
6102 skip = t->state->out1->out->out;
6103#ifdef ENABLE_LOG
6104 skip_lid = thislist->id;
6105#endif
6106 }
6107 if (skip != NULL)
6108 {
6109#ifdef ENABLE_LOG
6110 nfa_set_code(skip->c);
6111 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6112 abs(skip->id), skip_lid, skip->c, code);
6113#endif
6114 break;
6115 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006116 // Copy submatch info to the recursive call, opposite of what
6117 // happens afterwards.
Bram Moolenaar699c1202013-09-25 16:41:54 +02006118 copy_sub_off(&m->norm, &t->subs.norm);
6119#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006120 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006121 copy_sub_off(&m->synt, &t->subs.synt);
6122#endif
Bram Moolenaar43e02982013-06-07 17:31:29 +02006123
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006124 // First try matching the pattern.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006125 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006126 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006127 if (result == NFA_TOO_EXPENSIVE)
6128 {
6129 nfa_match = result;
6130 goto theend;
6131 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006132 if (result)
6133 {
6134 int bytelen;
6135
6136#ifdef ENABLE_LOG
6137 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6138 log_subsexpr(m);
6139#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006140 // Copy submatch info from the recursive call
Bram Moolenaar87953742013-06-05 18:52:40 +02006141 copy_sub_off(&t->subs.norm, &m->norm);
6142#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006143 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006144 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaar87953742013-06-05 18:52:40 +02006145#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006146 // Now we need to skip over the matched text and then
6147 // continue with what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02006148 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006149 // TODO: multi-line match
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01006150 bytelen = m->norm.list.multi[0].end_col
Bram Moolenaar0270f382018-07-17 05:43:58 +02006151 - (int)(rex.input - rex.line);
Bram Moolenaar87953742013-06-05 18:52:40 +02006152 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006153 bytelen = (int)(m->norm.list.line[0].end - rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02006154
6155#ifdef ENABLE_LOG
6156 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6157#endif
6158 if (bytelen == 0)
6159 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006160 // empty match, output of corresponding
6161 // NFA_END_PATTERN/NFA_SKIP to be used at current
6162 // position
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006163 add_here = TRUE;
6164 add_state = t->state->out1->out->out;
Bram Moolenaar87953742013-06-05 18:52:40 +02006165 }
6166 else if (bytelen <= clen)
6167 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006168 // match current character, output of corresponding
6169 // NFA_END_PATTERN to be used at next position.
Bram Moolenaar87953742013-06-05 18:52:40 +02006170 add_state = t->state->out1->out->out;
6171 add_off = clen;
6172 }
6173 else
6174 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006175 // skip over the matched characters, set character
6176 // count in NFA_SKIP
Bram Moolenaar87953742013-06-05 18:52:40 +02006177 add_state = t->state->out1->out;
6178 add_off = bytelen;
6179 add_count = bytelen - clen;
6180 }
6181 }
6182 break;
Bram Moolenaar43e02982013-06-07 17:31:29 +02006183 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006184
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006185 case NFA_BOL:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006186 if (rex.input == rex.line)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006187 {
6188 add_here = TRUE;
6189 add_state = t->state->out;
6190 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006191 break;
6192
6193 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006194 if (curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006195 {
6196 add_here = TRUE;
6197 add_state = t->state->out;
6198 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006199 break;
6200
6201 case NFA_BOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006202 result = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006203
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006204 if (curc == NUL)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006205 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006206 else if (has_mbyte)
6207 {
6208 int this_class;
6209
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006210 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006211 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006212 if (this_class <= 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006213 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006214 else if (reg_prev_class() == this_class)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006215 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006216 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006217 else if (!vim_iswordc_buf(curc, rex.reg_buf)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006218 || (rex.input > rex.line
6219 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006220 result = FALSE;
6221 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006222 {
6223 add_here = TRUE;
6224 add_state = t->state->out;
6225 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006226 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006227
6228 case NFA_EOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006229 result = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006230 if (rex.input == rex.line)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006231 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006232 else if (has_mbyte)
6233 {
6234 int this_class, prev_class;
6235
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006236 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006237 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006238 prev_class = reg_prev_class();
6239 if (this_class == prev_class
6240 || prev_class == 0 || prev_class == 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006241 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006242 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02006243 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6244 || (rex.input[0] != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006245 && vim_iswordc_buf(curc, rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006246 result = FALSE;
6247 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006248 {
6249 add_here = TRUE;
6250 add_state = t->state->out;
6251 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006252 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006253
Bram Moolenaar4b780632013-05-31 22:14:52 +02006254 case NFA_BOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006255 if (rex.lnum == 0 && rex.input == rex.line
Bram Moolenaar6100d022016-10-02 16:51:57 +02006256 && (!REG_MULTI || rex.reg_firstlnum == 1))
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006257 {
6258 add_here = TRUE;
6259 add_state = t->state->out;
6260 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006261 break;
6262
6263 case NFA_EOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006264 if (rex.lnum == rex.reg_maxline && curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006265 {
6266 add_here = TRUE;
6267 add_state = t->state->out;
6268 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006269 break;
6270
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006271 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006272 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006273 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02006274 int len = 0;
6275 nfa_state_T *end;
6276 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006277 int cchars[MAX_MCO];
6278 int ccount = 0;
6279 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006280
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006281 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006282 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006283 if (utf_iscomposing(sta->c))
6284 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006285 // Only match composing character(s), ignore base
6286 // character. Used for ".{composing}" and "{composing}"
6287 // (no preceding character).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006288 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006289 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006290 if (rex.reg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006291 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006292 // If \Z was present, then ignore composing characters.
6293 // When ignoring the base character this always matches.
Bram Moolenaardff72ba2018-02-08 22:45:17 +01006294 if (sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006295 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006296 else
6297 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006298 while (sta->c != NFA_END_COMPOSING)
6299 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006300 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006301
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006302 // Check base character matches first, unless ignored.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006303 else if (len > 0 || mc == sta->c)
6304 {
6305 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006306 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006307 len += mb_char2len(mc);
6308 sta = sta->out;
6309 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006310
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006311 // We don't care about the order of composing characters.
6312 // Get them into cchars[] first.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006313 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006314 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006315 mc = mb_ptr2char(rex.input + len);
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006316 cchars[ccount++] = mc;
6317 len += mb_char2len(mc);
6318 if (ccount == MAX_MCO)
6319 break;
6320 }
6321
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006322 // Check that each composing char in the pattern matches a
6323 // composing char in the text. We do not check if all
6324 // composing chars are matched.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006325 result = OK;
6326 while (sta->c != NFA_END_COMPOSING)
6327 {
6328 for (j = 0; j < ccount; ++j)
6329 if (cchars[j] == sta->c)
6330 break;
6331 if (j == ccount)
6332 {
6333 result = FAIL;
6334 break;
6335 }
6336 sta = sta->out;
6337 }
6338 }
6339 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02006340 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006341
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006342 end = t->state->out1; // NFA_END_COMPOSING
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006343 ADD_STATE_IF_MATCH(end);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006344 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006345 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006346
6347 case NFA_NEWL:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006348 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006349 && rex.lnum <= rex.reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006350 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02006351 go_to_nextline = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006352 // Pass -1 for the offset, which means taking the position
6353 // at the start of the next line.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006354 add_state = t->state->out;
6355 add_off = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006356 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006357 else if (curc == '\n' && rex.reg_line_lbr)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006358 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006359 // match \n as if it is an ordinary character
Bram Moolenaara2d95102013-06-04 14:23:05 +02006360 add_state = t->state->out;
6361 add_off = 1;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006362 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006363 break;
6364
Bram Moolenaar417bad22013-06-07 14:08:30 +02006365 case NFA_START_COLL:
6366 case NFA_START_NEG_COLL:
6367 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006368 // What follows is a list of characters, until NFA_END_COLL.
6369 // One of them must match or none of them must match.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006370 nfa_state_T *state;
6371 int result_if_matched;
6372 int c1, c2;
6373
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006374 // Never match EOL. If it's part of the collection it is added
6375 // as a separate state with an OR.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006376 if (curc == NUL)
6377 break;
6378
6379 state = t->state->out;
6380 result_if_matched = (t->state->c == NFA_START_COLL);
6381 for (;;)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006382 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02006383 if (state->c == NFA_END_COLL)
6384 {
6385 result = !result_if_matched;
6386 break;
6387 }
6388 if (state->c == NFA_RANGE_MIN)
6389 {
6390 c1 = state->val;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006391 state = state->out; // advance to NFA_RANGE_MAX
Bram Moolenaar417bad22013-06-07 14:08:30 +02006392 c2 = state->val;
6393#ifdef ENABLE_LOG
6394 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6395 curc, c1, c2);
6396#endif
6397 if (curc >= c1 && curc <= c2)
6398 {
6399 result = result_if_matched;
6400 break;
6401 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006402 if (rex.reg_ic)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006403 {
Bram Moolenaar59de4172020-06-09 19:34:54 +02006404 int curc_low = MB_CASEFOLD(curc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02006405 int done = FALSE;
6406
6407 for ( ; c1 <= c2; ++c1)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006408 if (MB_CASEFOLD(c1) == curc_low)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006409 {
6410 result = result_if_matched;
6411 done = TRUE;
6412 break;
6413 }
6414 if (done)
6415 break;
6416 }
6417 }
6418 else if (state->c < 0 ? check_char_class(state->c, curc)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01006419 : (curc == state->c
Bram Moolenaar59de4172020-06-09 19:34:54 +02006420 || (rex.reg_ic && MB_CASEFOLD(curc)
6421 == MB_CASEFOLD(state->c))))
Bram Moolenaar417bad22013-06-07 14:08:30 +02006422 {
6423 result = result_if_matched;
6424 break;
6425 }
6426 state = state->out;
6427 }
6428 if (result)
6429 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006430 // next state is in out of the NFA_END_COLL, out1 of
6431 // START points to the END state
Bram Moolenaar417bad22013-06-07 14:08:30 +02006432 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006433 add_off = clen;
6434 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006435 break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02006436 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006437
6438 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006439 // Matches any char except '\0' (end of input).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006440 if (curc > 0)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006441 {
Bram Moolenaara2d95102013-06-04 14:23:05 +02006442 add_state = t->state->out;
6443 add_off = clen;
6444 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006445 break;
6446
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006447 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006448 // On a composing character skip over it. Otherwise do
6449 // nothing. Always matches.
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006450 if (enc_utf8 && utf_iscomposing(curc))
6451 {
6452 add_off = clen;
6453 }
6454 else
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006455 {
6456 add_here = TRUE;
6457 add_off = 0;
6458 }
6459 add_state = t->state->out;
6460 break;
6461
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006462 /*
6463 * Character classes like \a for alpha, \d for digit etc.
6464 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006465 case NFA_IDENT: // \i
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006466 result = vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006467 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006468 break;
6469
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006470 case NFA_SIDENT: // \I
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006471 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006472 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006473 break;
6474
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006475 case NFA_KWORD: // \k
Bram Moolenaar0270f382018-07-17 05:43:58 +02006476 result = vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006477 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006478 break;
6479
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006480 case NFA_SKWORD: // \K
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006481 result = !VIM_ISDIGIT(curc)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006482 && vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006483 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006484 break;
6485
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006486 case NFA_FNAME: // \f
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006487 result = vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006488 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006489 break;
6490
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006491 case NFA_SFNAME: // \F
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006492 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006493 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006494 break;
6495
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006496 case NFA_PRINT: // \p
Bram Moolenaar0270f382018-07-17 05:43:58 +02006497 result = vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006498 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006499 break;
6500
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006501 case NFA_SPRINT: // \P
Bram Moolenaar0270f382018-07-17 05:43:58 +02006502 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006503 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006504 break;
6505
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006506 case NFA_WHITE: // \s
Bram Moolenaar1c465442017-03-12 20:10:05 +01006507 result = VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006508 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006509 break;
6510
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006511 case NFA_NWHITE: // \S
Bram Moolenaar1c465442017-03-12 20:10:05 +01006512 result = curc != NUL && !VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006513 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006514 break;
6515
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006516 case NFA_DIGIT: // \d
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006517 result = ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006518 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006519 break;
6520
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006521 case NFA_NDIGIT: // \D
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006522 result = curc != NUL && !ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006523 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006524 break;
6525
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006526 case NFA_HEX: // \x
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006527 result = ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006528 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006529 break;
6530
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006531 case NFA_NHEX: // \X
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006532 result = curc != NUL && !ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006533 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006534 break;
6535
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006536 case NFA_OCTAL: // \o
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006537 result = ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006538 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006539 break;
6540
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006541 case NFA_NOCTAL: // \O
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006542 result = curc != NUL && !ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006543 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006544 break;
6545
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006546 case NFA_WORD: // \w
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006547 result = ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006548 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006549 break;
6550
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006551 case NFA_NWORD: // \W
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006552 result = curc != NUL && !ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006553 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006554 break;
6555
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006556 case NFA_HEAD: // \h
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006557 result = ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006558 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006559 break;
6560
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006561 case NFA_NHEAD: // \H
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006562 result = curc != NUL && !ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006563 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006564 break;
6565
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006566 case NFA_ALPHA: // \a
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006567 result = ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006568 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006569 break;
6570
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006571 case NFA_NALPHA: // \A
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006572 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006573 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006574 break;
6575
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006576 case NFA_LOWER: // \l
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006577 result = ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006578 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006579 break;
6580
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006581 case NFA_NLOWER: // \L
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006582 result = curc != NUL && !ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006583 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006584 break;
6585
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006586 case NFA_UPPER: // \u
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006587 result = ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006588 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006589 break;
6590
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006591 case NFA_NUPPER: // \U
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006592 result = curc != NUL && !ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006593 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006594 break;
6595
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006596 case NFA_LOWER_IC: // [a-z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006597 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006598 ADD_STATE_IF_MATCH(t->state);
6599 break;
6600
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006601 case NFA_NLOWER_IC: // [^a-z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006602 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006603 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006604 ADD_STATE_IF_MATCH(t->state);
6605 break;
6606
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006607 case NFA_UPPER_IC: // [A-Z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006608 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006609 ADD_STATE_IF_MATCH(t->state);
6610 break;
6611
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006612 case NFA_NUPPER_IC: // ^[A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006613 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006614 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006615 ADD_STATE_IF_MATCH(t->state);
6616 break;
6617
Bram Moolenaar5714b802013-05-28 22:03:20 +02006618 case NFA_BACKREF1:
6619 case NFA_BACKREF2:
6620 case NFA_BACKREF3:
6621 case NFA_BACKREF4:
6622 case NFA_BACKREF5:
6623 case NFA_BACKREF6:
6624 case NFA_BACKREF7:
6625 case NFA_BACKREF8:
6626 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006627#ifdef FEAT_SYN_HL
6628 case NFA_ZREF1:
6629 case NFA_ZREF2:
6630 case NFA_ZREF3:
6631 case NFA_ZREF4:
6632 case NFA_ZREF5:
6633 case NFA_ZREF6:
6634 case NFA_ZREF7:
6635 case NFA_ZREF8:
6636 case NFA_ZREF9:
6637#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006638 // \1 .. \9 \z1 .. \z9
Bram Moolenaar5714b802013-05-28 22:03:20 +02006639 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006640 int subidx;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006641 int bytelen;
6642
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006643 if (t->state->c <= NFA_BACKREF9)
6644 {
6645 subidx = t->state->c - NFA_BACKREF1 + 1;
6646 result = match_backref(&t->subs.norm, subidx, &bytelen);
6647 }
6648#ifdef FEAT_SYN_HL
6649 else
6650 {
6651 subidx = t->state->c - NFA_ZREF1 + 1;
6652 result = match_zref(subidx, &bytelen);
6653 }
6654#endif
6655
Bram Moolenaar5714b802013-05-28 22:03:20 +02006656 if (result)
6657 {
6658 if (bytelen == 0)
6659 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006660 // empty match always works, output of NFA_SKIP to be
6661 // used next
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006662 add_here = TRUE;
6663 add_state = t->state->out->out;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006664 }
6665 else if (bytelen <= clen)
6666 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006667 // match current character, jump ahead to out of
6668 // NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006669 add_state = t->state->out->out;
6670 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006671 }
6672 else
6673 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006674 // skip over the matched characters, set character
6675 // count in NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006676 add_state = t->state->out;
6677 add_off = bytelen;
6678 add_count = bytelen - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006679 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02006680 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02006681 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006682 }
6683 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006684 // character of previous matching \1 .. \9 or \@>
Bram Moolenaar5714b802013-05-28 22:03:20 +02006685 if (t->count - clen <= 0)
6686 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006687 // end of match, go to what follows
Bram Moolenaara2d95102013-06-04 14:23:05 +02006688 add_state = t->state->out;
6689 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006690 }
6691 else
6692 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006693 // add state again with decremented count
Bram Moolenaara2d95102013-06-04 14:23:05 +02006694 add_state = t->state;
6695 add_off = 0;
6696 add_count = t->count - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006697 }
6698 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006699
Bram Moolenaar423532e2013-05-29 21:14:42 +02006700 case NFA_LNUM:
6701 case NFA_LNUM_GT:
6702 case NFA_LNUM_LT:
6703 result = (REG_MULTI &&
6704 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006705 (long_u)(rex.lnum + rex.reg_firstlnum)));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006706 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006707 {
6708 add_here = TRUE;
6709 add_state = t->state->out;
6710 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006711 break;
6712
6713 case NFA_COL:
6714 case NFA_COL_GT:
6715 case NFA_COL_LT:
6716 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006717 (long_u)(rex.input - rex.line) + 1);
Bram Moolenaar423532e2013-05-29 21:14:42 +02006718 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006719 {
6720 add_here = TRUE;
6721 add_state = t->state->out;
6722 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006723 break;
6724
6725 case NFA_VCOL:
6726 case NFA_VCOL_GT:
6727 case NFA_VCOL_LT:
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006728 {
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006729 int op = t->state->c - NFA_VCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006730 colnr_T col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar6100d022016-10-02 16:51:57 +02006731 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006732
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006733 // Bail out quickly when there can't be a match, avoid the
6734 // overhead of win_linetabsize() on long lines.
Bram Moolenaar4f36dc32015-03-05 17:16:06 +01006735 if (op != 1 && col > t->state->val
Bram Moolenaara12a1612019-01-24 16:39:02 +01006736 * (has_mbyte ? MB_MAXBYTES : 1))
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006737 break;
Bram Moolenaaref795d12015-01-18 16:46:32 +01006738 result = FALSE;
6739 if (op == 1 && col - 1 > t->state->val && col > 100)
6740 {
6741 int ts = wp->w_buffer->b_p_ts;
6742
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006743 // Guess that a character won't use more columns than
6744 // 'tabstop', with a minimum of 4.
Bram Moolenaaref795d12015-01-18 16:46:32 +01006745 if (ts < 4)
6746 ts = 4;
6747 result = col > t->state->val * ts;
6748 }
6749 if (!result)
6750 result = nfa_re_num_cmp(t->state->val, op,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006751 (long_u)win_linetabsize(wp, rex.line, col) + 1);
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006752 if (result)
6753 {
6754 add_here = TRUE;
6755 add_state = t->state->out;
6756 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006757 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006758 break;
6759
Bram Moolenaar044aa292013-06-04 21:27:38 +02006760 case NFA_MARK:
6761 case NFA_MARK_GT:
6762 case NFA_MARK_LT:
6763 {
Bram Moolenaar64066b92021-11-17 18:22:56 +00006764 size_t col = rex.input - rex.line;
Bram Moolenaar6100d022016-10-02 16:51:57 +02006765 pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
Bram Moolenaar044aa292013-06-04 21:27:38 +02006766
Bram Moolenaar64066b92021-11-17 18:22:56 +00006767 // Line may have been freed, get it again.
6768 if (REG_MULTI)
6769 {
6770 rex.line = reg_getline(rex.lnum);
6771 rex.input = rex.line + col;
6772 }
6773
Bram Moolenaar872bee52021-05-24 22:56:15 +02006774 // Compare the mark position to the match position, if the mark
6775 // exists and mark is set in reg_buf.
6776 if (pos != NULL && pos->lnum > 0)
6777 {
6778 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6779 && pos->col == MAXCOL
6780 ? (colnr_T)STRLEN(reg_getline(
6781 pos->lnum - rex.reg_firstlnum))
6782 : pos->col;
6783
6784 result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6785 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006786 ? t->state->c == NFA_MARK
Bram Moolenaar872bee52021-05-24 22:56:15 +02006787 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006788 ? t->state->c == NFA_MARK_GT
6789 : t->state->c == NFA_MARK_LT))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006790 : (pos->lnum < rex.lnum + rex.reg_firstlnum
Bram Moolenaar044aa292013-06-04 21:27:38 +02006791 ? t->state->c == NFA_MARK_GT
Bram Moolenaar872bee52021-05-24 22:56:15 +02006792 : t->state->c == NFA_MARK_LT));
6793 if (result)
6794 {
6795 add_here = TRUE;
6796 add_state = t->state->out;
6797 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006798 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02006799 break;
6800 }
6801
Bram Moolenaar423532e2013-05-29 21:14:42 +02006802 case NFA_CURSOR:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006803 result = (rex.reg_win != NULL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006804 && (rex.lnum + rex.reg_firstlnum
Bram Moolenaar6100d022016-10-02 16:51:57 +02006805 == rex.reg_win->w_cursor.lnum)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006806 && ((colnr_T)(rex.input - rex.line)
Bram Moolenaar6100d022016-10-02 16:51:57 +02006807 == rex.reg_win->w_cursor.col));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006808 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006809 {
6810 add_here = TRUE;
6811 add_state = t->state->out;
6812 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006813 break;
6814
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006815 case NFA_VISUAL:
6816 result = reg_match_visual();
6817 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006818 {
6819 add_here = TRUE;
6820 add_state = t->state->out;
6821 }
Bram Moolenaar973fced2013-06-05 21:10:59 +02006822 break;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006823
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006824 case NFA_MOPEN1:
6825 case NFA_MOPEN2:
6826 case NFA_MOPEN3:
6827 case NFA_MOPEN4:
6828 case NFA_MOPEN5:
6829 case NFA_MOPEN6:
6830 case NFA_MOPEN7:
6831 case NFA_MOPEN8:
6832 case NFA_MOPEN9:
6833#ifdef FEAT_SYN_HL
6834 case NFA_ZOPEN:
6835 case NFA_ZOPEN1:
6836 case NFA_ZOPEN2:
6837 case NFA_ZOPEN3:
6838 case NFA_ZOPEN4:
6839 case NFA_ZOPEN5:
6840 case NFA_ZOPEN6:
6841 case NFA_ZOPEN7:
6842 case NFA_ZOPEN8:
6843 case NFA_ZOPEN9:
6844#endif
6845 case NFA_NOPEN:
6846 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006847 // These states are only added to be able to bail out when
6848 // they are added again, nothing is to be done.
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006849 break;
6850
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006851 default: // regular character
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006852 {
6853 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006854
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006855#ifdef DEBUG
Bram Moolenaardecd9542013-06-07 16:31:50 +02006856 if (c < 0)
Bram Moolenaarb2d85e32022-01-07 16:55:32 +00006857 siemsg("INTERNAL: Negative state char: %ld", (long)c);
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006858#endif
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006859 result = (c == curc);
6860
Bram Moolenaar6100d022016-10-02 16:51:57 +02006861 if (!result && rex.reg_ic)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006862 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006863 // If rex.reg_icombine is not set only skip over the character
6864 // itself. When it is set skip over composing characters.
Bram Moolenaar6100d022016-10-02 16:51:57 +02006865 if (result && enc_utf8 && !rex.reg_icombine)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006866 clen = utf_ptr2len(rex.input);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006867 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006868 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006869 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006870
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006871 } // switch (t->state->c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006872
6873 if (add_state != NULL)
6874 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006875 nfa_pim_T *pim;
Bram Moolenaara951e352013-10-06 15:46:11 +02006876 nfa_pim_T pim_copy;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006877
6878 if (t->pim.result == NFA_PIM_UNUSED)
6879 pim = NULL;
6880 else
6881 pim = &t->pim;
6882
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006883 // Handle the postponed invisible match if the match might end
6884 // without advancing and before the end of the line.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006885 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006886 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006887 if (pim->result == NFA_PIM_TODO)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006888 {
6889#ifdef ENABLE_LOG
6890 fprintf(log_fd, "\n");
6891 fprintf(log_fd, "==================================\n");
6892 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6893 fprintf(log_fd, "\n");
6894#endif
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006895 result = recursive_regmatch(pim->state, pim,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006896 prog, submatch, m, &listids, &listids_len);
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006897 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006898 // for \@! and \@<! it is a match when the result is
6899 // FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006900 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006901 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6902 || pim->state->c
6903 == NFA_START_INVISIBLE_BEFORE_NEG
6904 || pim->state->c
6905 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006906 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006907 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006908 copy_sub_off(&pim->subs.norm, &m->norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006909#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006910 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006911 copy_sub_off(&pim->subs.synt, &m->synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006912#endif
6913 }
6914 }
6915 else
6916 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006917 result = (pim->result == NFA_PIM_MATCH);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006918#ifdef ENABLE_LOG
6919 fprintf(log_fd, "\n");
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006920 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006921 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6922 fprintf(log_fd, "\n");
6923#endif
6924 }
6925
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006926 // for \@! and \@<! it is a match when result is FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006927 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006928 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6929 || pim->state->c
6930 == NFA_START_INVISIBLE_BEFORE_NEG
6931 || pim->state->c
6932 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006933 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006934 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006935 copy_sub_off(&t->subs.norm, &pim->subs.norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006936#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006937 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006938 copy_sub_off(&t->subs.synt, &pim->subs.synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006939#endif
6940 }
6941 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006942 // look-behind match failed, don't add the state
Bram Moolenaara2d95102013-06-04 14:23:05 +02006943 continue;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006944
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006945 // Postponed invisible match was handled, don't add it to
6946 // following states.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006947 pim = NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006948 }
6949
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006950 // If "pim" points into l->t it will become invalid when
6951 // adding the state causes the list to be reallocated. Make a
6952 // local copy to avoid that.
Bram Moolenaara951e352013-10-06 15:46:11 +02006953 if (pim == &t->pim)
6954 {
6955 copy_pim(&pim_copy, pim);
6956 pim = &pim_copy;
6957 }
6958
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006959 if (add_here)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006960 r = addstate_here(thislist, add_state, &t->subs,
6961 pim, &listidx);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006962 else
6963 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006964 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006965 if (add_count > 0)
6966 nextlist->t[nextlist->n - 1].count = add_count;
6967 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006968 if (r == NULL)
6969 {
6970 nfa_match = NFA_TOO_EXPENSIVE;
6971 goto theend;
6972 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006973 }
6974
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006975 } // for (thislist = thislist; thislist->state; thislist++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006976
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006977 // Look for the start of a match in the current position by adding the
6978 // start state to the list of states.
6979 // The first found match is the leftmost one, thus the order of states
6980 // matters!
6981 // Do not add the start state in recursive calls of nfa_regmatch(),
6982 // because recursive calls should only start in the first position.
6983 // Unless "nfa_endp" is not NULL, then we match the end position.
6984 // Also don't start a match past the first line.
Bram Moolenaar61602c52013-06-01 19:54:43 +02006985 if (nfa_match == FALSE
Bram Moolenaarf96d1092013-06-07 22:39:40 +02006986 && ((toplevel
Bram Moolenaar0270f382018-07-17 05:43:58 +02006987 && rex.lnum == 0
Bram Moolenaar61602c52013-06-01 19:54:43 +02006988 && clen != 0
Bram Moolenaar6100d022016-10-02 16:51:57 +02006989 && (rex.reg_maxcol == 0
Bram Moolenaar0270f382018-07-17 05:43:58 +02006990 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
Bram Moolenaar307aa162013-06-02 16:34:21 +02006991 || (nfa_endp != NULL
Bram Moolenaar61602c52013-06-01 19:54:43 +02006992 && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006993 ? (rex.lnum < nfa_endp->se_u.pos.lnum
6994 || (rex.lnum == nfa_endp->se_u.pos.lnum
6995 && (int)(rex.input - rex.line)
Bram Moolenaar307aa162013-06-02 16:34:21 +02006996 < nfa_endp->se_u.pos.col))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006997 : rex.input < nfa_endp->se_u.ptr))))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006998 {
6999#ifdef ENABLE_LOG
7000 fprintf(log_fd, "(---) STARTSTATE\n");
7001#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007002 // Inline optimized code for addstate() if we know the state is
7003 // the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007004 if (toplevel)
7005 {
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007006 int add = TRUE;
7007 int c;
7008
7009 if (prog->regstart != NUL && clen != 0)
7010 {
7011 if (nextlist->n == 0)
7012 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007013 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007014
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007015 // Nextlist is empty, we can skip ahead to the
7016 // character that must appear at the start.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007017 if (skip_to_start(prog->regstart, &col) == FAIL)
7018 break;
7019#ifdef ENABLE_LOG
7020 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02007021 col - ((colnr_T)(rex.input - rex.line) + clen));
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007022#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007023 rex.input = rex.line + col - clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007024 }
7025 else
7026 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007027 // Checking if the required start character matches is
7028 // cheaper than adding a state that won't match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02007029 c = PTR2CHAR(rex.input + clen);
Bram Moolenaar6100d022016-10-02 16:51:57 +02007030 if (c != prog->regstart && (!rex.reg_ic
Bram Moolenaar59de4172020-06-09 19:34:54 +02007031 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007032 {
7033#ifdef ENABLE_LOG
7034 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7035#endif
7036 add = FALSE;
7037 }
7038 }
7039 }
7040
7041 if (add)
7042 {
7043 if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007044 m->norm.list.multi[0].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02007045 (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007046 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007047 m->norm.list.line[0].start = rex.input + clen;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007048 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7049 {
7050 nfa_match = NFA_TOO_EXPENSIVE;
7051 goto theend;
7052 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007053 }
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007054 }
7055 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007056 {
7057 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7058 {
7059 nfa_match = NFA_TOO_EXPENSIVE;
7060 goto theend;
7061 }
7062 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007063 }
7064
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007065#ifdef ENABLE_LOG
7066 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007067 {
7068 int i;
7069
7070 for (i = 0; i < thislist->n; i++)
7071 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7072 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007073 fprintf(log_fd, "\n");
7074#endif
7075
7076nextchar:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007077 // Advance to the next character, or advance to the next line, or
7078 // finish.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007079 if (clen != 0)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007080 rex.input += clen;
Bram Moolenaar307aa162013-06-02 16:34:21 +02007081 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007082 && rex.lnum < nfa_endp->se_u.pos.lnum))
Bram Moolenaar35b23862013-05-22 23:00:40 +02007083 reg_nextline();
7084 else
7085 break;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007086
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007087 // Allow interrupting with CTRL-C.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007088 line_breakcheck();
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007089 if (got_int)
7090 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007091#ifdef FEAT_RELTIME
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007092 // Check for timeout once every twenty times to avoid overhead.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007093 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
7094 {
7095 nfa_time_count = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007096 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007097 break;
7098 }
7099#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02007100 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007101
7102#ifdef ENABLE_LOG
7103 if (log_fd != stderr)
7104 fclose(log_fd);
7105 log_fd = NULL;
7106#endif
7107
7108theend:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007109 // Free memory
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007110 vim_free(list[0].t);
7111 vim_free(list[1].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02007112 vim_free(listids);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007113#undef ADD_STATE_IF_MATCH
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02007114#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007115 fclose(debug);
7116#endif
7117
Bram Moolenaar963fee22013-05-26 21:47:28 +02007118 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007119}
7120
7121/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02007122 * Try match of "prog" with at rex.line["col"].
Bram Moolenaar8c731502014-11-23 15:57:49 +01007123 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007124 */
7125 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007126nfa_regtry(
7127 nfa_regprog_T *prog,
7128 colnr_T col,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007129 proftime_T *tm UNUSED, // timeout limit or NULL
7130 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007131{
7132 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007133 regsubs_T subs, m;
7134 nfa_state_T *start = prog->start;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007135 int result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007136#ifdef ENABLE_LOG
7137 FILE *f;
7138#endif
7139
Bram Moolenaar0270f382018-07-17 05:43:58 +02007140 rex.input = rex.line + col;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007141#ifdef FEAT_RELTIME
7142 nfa_time_limit = tm;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007143 nfa_timed_out = timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007144 nfa_time_count = 0;
7145#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007146
7147#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007148 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007149 if (f != NULL)
7150 {
Bram Moolenaar87953742013-06-05 18:52:40 +02007151 fprintf(f, "\n\n\t=======================================================\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007152#ifdef DEBUG
7153 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
7154#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007155 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02007156 fprintf(f, "\t=======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02007157 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007158 fprintf(f, "\n\n");
7159 fclose(f);
7160 }
7161 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01007162 emsg("Could not open temporary log file for writing");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007163#endif
7164
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007165 clear_sub(&subs.norm);
7166 clear_sub(&m.norm);
7167#ifdef FEAT_SYN_HL
7168 clear_sub(&subs.synt);
7169 clear_sub(&m.synt);
7170#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007171
Bram Moolenaarfda37292014-11-05 14:27:36 +01007172 result = nfa_regmatch(prog, start, &subs, &m);
7173 if (result == FALSE)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007174 return 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007175 else if (result == NFA_TOO_EXPENSIVE)
7176 return result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007177
7178 cleanup_subexpr();
7179 if (REG_MULTI)
7180 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007181 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007182 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007183 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7184 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007185
Bram Moolenaar6100d022016-10-02 16:51:57 +02007186 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7187 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007188 }
7189
Bram Moolenaar6100d022016-10-02 16:51:57 +02007190 if (rex.reg_startpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007191 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007192 rex.reg_startpos[0].lnum = 0;
7193 rex.reg_startpos[0].col = col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007194 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02007195 if (rex.reg_endpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007196 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007197 // pattern has a \ze but it didn't match, use current end
Bram Moolenaar0270f382018-07-17 05:43:58 +02007198 rex.reg_endpos[0].lnum = rex.lnum;
7199 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007200 }
7201 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007202 // Use line number of "\ze".
Bram Moolenaar0270f382018-07-17 05:43:58 +02007203 rex.lnum = rex.reg_endpos[0].lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007204 }
7205 else
7206 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007207 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007208 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007209 rex.reg_startp[i] = subs.norm.list.line[i].start;
7210 rex.reg_endp[i] = subs.norm.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007211 }
7212
Bram Moolenaar6100d022016-10-02 16:51:57 +02007213 if (rex.reg_startp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007214 rex.reg_startp[0] = rex.line + col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007215 if (rex.reg_endp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007216 rex.reg_endp[0] = rex.input;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007217 }
7218
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007219#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007220 // Package any found \z(...\) matches for export. Default is none.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007221 unref_extmatch(re_extmatch_out);
7222 re_extmatch_out = NULL;
7223
7224 if (prog->reghasz == REX_SET)
7225 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007226 cleanup_zsubexpr();
7227 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01007228 if (re_extmatch_out == NULL)
7229 return 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007230 // Loop over \z1, \z2, etc. There is no \z0.
Bram Moolenaar5ad075c2015-11-24 15:18:32 +01007231 for (i = 1; i < subs.synt.in_use; i++)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007232 {
7233 if (REG_MULTI)
7234 {
7235 struct multipos *mpos = &subs.synt.list.multi[i];
7236
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007237 // Only accept single line matches that are valid.
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007238 if (mpos->start_lnum >= 0
7239 && mpos->start_lnum == mpos->end_lnum
7240 && mpos->end_col >= mpos->start_col)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007241 re_extmatch_out->matches[i] =
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007242 vim_strnsave(reg_getline(mpos->start_lnum)
7243 + mpos->start_col,
7244 mpos->end_col - mpos->start_col);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007245 }
7246 else
7247 {
7248 struct linepos *lpos = &subs.synt.list.line[i];
7249
7250 if (lpos->start != NULL && lpos->end != NULL)
7251 re_extmatch_out->matches[i] =
Bram Moolenaar71ccd032020-06-12 22:59:11 +02007252 vim_strnsave(lpos->start, lpos->end - lpos->start);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007253 }
7254 }
7255 }
7256#endif
7257
Bram Moolenaar0270f382018-07-17 05:43:58 +02007258 return 1 + rex.lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007259}
7260
7261/*
7262 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02007263 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007264 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007265 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007266 */
7267 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007268nfa_regexec_both(
7269 char_u *line,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007270 colnr_T startcol, // column to start looking for match
7271 proftime_T *tm, // timeout limit or NULL
7272 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007273{
7274 nfa_regprog_T *prog;
7275 long retval = 0L;
7276 int i;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007277 colnr_T col = startcol;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007278
7279 if (REG_MULTI)
7280 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007281 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007282 line = reg_getline((linenr_T)0); // relative to the cursor
Bram Moolenaar6100d022016-10-02 16:51:57 +02007283 rex.reg_startpos = rex.reg_mmatch->startpos;
7284 rex.reg_endpos = rex.reg_mmatch->endpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007285 }
7286 else
7287 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007288 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7289 rex.reg_startp = rex.reg_match->startp;
7290 rex.reg_endp = rex.reg_match->endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007291 }
7292
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007293 // Be paranoid...
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007294 if (prog == NULL || line == NULL)
7295 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02007296 iemsg(_(e_null_argument));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007297 goto theend;
7298 }
7299
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007300 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007301 if (prog->regflags & RF_ICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007302 rex.reg_ic = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007303 else if (prog->regflags & RF_NOICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007304 rex.reg_ic = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007305
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007306 // If pattern contains "\Z" overrule value of rex.reg_icombine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007307 if (prog->regflags & RF_ICOMBINE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007308 rex.reg_icombine = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007309
Bram Moolenaar0270f382018-07-17 05:43:58 +02007310 rex.line = line;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007311 rex.lnum = 0; // relative to line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007312
Bram Moolenaar0270f382018-07-17 05:43:58 +02007313 rex.nfa_has_zend = prog->has_zend;
7314 rex.nfa_has_backref = prog->has_backref;
7315 rex.nfa_nsubexpr = prog->nsubexp;
7316 rex.nfa_listid = 1;
7317 rex.nfa_alt_listid = 2;
7318#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007319 nfa_regengine.expr = prog->pattern;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007320#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007321
Bram Moolenaard89616e2013-06-06 18:46:06 +02007322 if (prog->reganch && col > 0)
7323 return 0L;
7324
Bram Moolenaar0270f382018-07-17 05:43:58 +02007325 rex.need_clear_subexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007326#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007327 // Clear the external match subpointers if necessary.
Bram Moolenaar473de612013-06-08 18:19:48 +02007328 if (prog->reghasz == REX_SET)
7329 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007330 rex.nfa_has_zsubexpr = TRUE;
7331 rex.need_clear_zsubexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007332 }
7333 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007334 {
7335 rex.nfa_has_zsubexpr = FALSE;
7336 rex.need_clear_zsubexpr = FALSE;
7337 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007338#endif
7339
Bram Moolenaard89616e2013-06-06 18:46:06 +02007340 if (prog->regstart != NUL)
Bram Moolenaar473de612013-06-08 18:19:48 +02007341 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007342 // Skip ahead until a character we know the match must start with.
7343 // When there is none there is no match.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007344 if (skip_to_start(prog->regstart, &col) == FAIL)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007345 return 0L;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007346
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007347 // If match_text is set it contains the full text that must match.
7348 // Nothing else to try. Doesn't handle combining chars well.
Bram Moolenaara12a1612019-01-24 16:39:02 +01007349 if (prog->match_text != NULL && !rex.reg_icombine)
Bram Moolenaar473de612013-06-08 18:19:48 +02007350 return find_match_text(col, prog->regstart, prog->match_text);
7351 }
7352
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007353 // If the start column is past the maximum column: no need to try.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007354 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007355 goto theend;
7356
Bram Moolenaar0270f382018-07-17 05:43:58 +02007357 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7358 // it's accidentally used during execution.
7359 nstate = 0;
7360 for (i = 0; i < prog->nstate; ++i)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007361 {
7362 prog->state[i].id = i;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02007363 prog->state[i].lastlist[0] = 0;
7364 prog->state[i].lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007365 }
7366
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007367 retval = nfa_regtry(prog, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007368
Bram Moolenaar0270f382018-07-17 05:43:58 +02007369#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007370 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007371#endif
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007372
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007373theend:
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007374 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007375 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007376 // Make sure the end is never before the start. Can happen when \zs and
7377 // \ze are used.
7378 if (REG_MULTI)
7379 {
7380 lpos_T *start = &rex.reg_mmatch->startpos[0];
7381 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007382
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007383 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007384 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007385 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7386 }
7387 else
7388 {
7389 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7390 rex.reg_match->endp[0] = rex.reg_match->startp[0];
7391 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007392 }
7393
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007394 return retval;
7395}
7396
7397/*
7398 * Compile a regular expression into internal code for the NFA matcher.
7399 * Returns the program in allocated space. Returns NULL for an error.
7400 */
7401 static regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01007402nfa_regcomp(char_u *expr, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007403{
Bram Moolenaaraae48832013-05-25 21:18:34 +02007404 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02007405 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007406 int *postfix;
7407
7408 if (expr == NULL)
7409 return NULL;
7410
Bram Moolenaar0270f382018-07-17 05:43:58 +02007411#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007412 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007413#endif
Bram Moolenaare0ad3652015-01-27 12:59:55 +01007414 nfa_re_flags = re_flags;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007415
7416 init_class_tab();
7417
7418 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7419 return NULL;
7420
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007421 // Build postfix form of the regexp. Needed to build the NFA
7422 // (and count its size).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007423 postfix = re2post();
7424 if (postfix == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007425 goto fail; // Cascaded (syntax?) error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007426
7427 /*
7428 * In order to build the NFA, we parse the input regexp twice:
7429 * 1. first pass to count size (so we can allocate space)
7430 * 2. second to emit code
7431 */
7432#ifdef ENABLE_LOG
7433 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007434 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007435
7436 if (f != NULL)
7437 {
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02007438 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007439 fclose(f);
7440 }
7441 }
7442#endif
7443
7444 /*
7445 * PASS 1
7446 * Count number of NFA states in "nstate". Do not build the NFA.
7447 */
7448 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007449
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007450 // allocate the regprog with space for the compiled regexp
Bram Moolenaar16619a22013-06-11 18:42:36 +02007451 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02007452 prog = alloc(prog_size);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007453 if (prog == NULL)
7454 goto fail;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007455 state_ptr = prog->state;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007456 prog->re_in_use = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007457
7458 /*
7459 * PASS 2
7460 * Build the NFA
7461 */
7462 prog->start = post2nfa(postfix, post_ptr, FALSE);
7463 if (prog->start == NULL)
7464 goto fail;
7465
7466 prog->regflags = regflags;
7467 prog->engine = &nfa_regengine;
7468 prog->nstate = nstate;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007469 prog->has_zend = rex.nfa_has_zend;
7470 prog->has_backref = rex.nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02007471 prog->nsubexp = regnpar;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007472
Bram Moolenaara2947e22013-06-11 22:44:09 +02007473 nfa_postprocess(prog);
7474
Bram Moolenaard89616e2013-06-06 18:46:06 +02007475 prog->reganch = nfa_get_reganch(prog->start, 0);
7476 prog->regstart = nfa_get_regstart(prog->start, 0);
Bram Moolenaar473de612013-06-08 18:19:48 +02007477 prog->match_text = nfa_get_match_text(prog->start);
7478
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007479#ifdef ENABLE_LOG
7480 nfa_postfix_dump(expr, OK);
7481 nfa_dump(prog);
7482#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007483#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007484 // Remember whether this pattern has any \z specials in it.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007485 prog->reghasz = re_has_z;
7486#endif
Bram Moolenaar473de612013-06-08 18:19:48 +02007487 prog->pattern = vim_strsave(expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02007488#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007489 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007490#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007491
7492out:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007493 VIM_CLEAR(post_start);
7494 post_ptr = post_end = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007495 state_ptr = NULL;
7496 return (regprog_T *)prog;
7497
7498fail:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007499 VIM_CLEAR(prog);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007500#ifdef ENABLE_LOG
7501 nfa_postfix_dump(expr, FAIL);
7502#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007503#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007504 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007505#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007506 goto out;
7507}
7508
Bram Moolenaar473de612013-06-08 18:19:48 +02007509/*
7510 * Free a compiled regexp program, returned by nfa_regcomp().
7511 */
7512 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01007513nfa_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02007514{
7515 if (prog != NULL)
7516 {
7517 vim_free(((nfa_regprog_T *)prog)->match_text);
Bram Moolenaar473de612013-06-08 18:19:48 +02007518 vim_free(((nfa_regprog_T *)prog)->pattern);
Bram Moolenaar473de612013-06-08 18:19:48 +02007519 vim_free(prog);
7520 }
7521}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007522
7523/*
7524 * Match a regexp against a string.
7525 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7526 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaar2af78a12014-04-23 19:06:37 +02007527 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007528 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007529 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007530 */
7531 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01007532nfa_regexec_nl(
7533 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007534 char_u *line, // string to match against
7535 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01007536 int line_lbr)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007537{
Bram Moolenaar6100d022016-10-02 16:51:57 +02007538 rex.reg_match = rmp;
7539 rex.reg_mmatch = NULL;
7540 rex.reg_maxline = 0;
7541 rex.reg_line_lbr = line_lbr;
7542 rex.reg_buf = curbuf;
7543 rex.reg_win = NULL;
7544 rex.reg_ic = rmp->rm_ic;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007545 rex.reg_icombine = FALSE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007546 rex.reg_maxcol = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007547 return nfa_regexec_both(line, col, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007548}
7549
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007550
7551/*
7552 * Match a regexp against multiple lines.
7553 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7554 * Uses curbuf for line count and 'iskeyword'.
7555 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007556 * Return <= 0 if there is no match. Return number of lines contained in the
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007557 * match otherwise.
7558 *
7559 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7560 *
7561 * ! Also NOTE : match may actually be in another line. e.g.:
7562 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7563 *
7564 * +-------------------------+
7565 * |a |
7566 * |b |
7567 * |c |
7568 * | |
7569 * +-------------------------+
7570 *
7571 * then nfa_regexec_multi() returns 3. while the original
7572 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7573 *
7574 * FIXME if this behavior is not compatible.
7575 */
7576 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007577nfa_regexec_multi(
7578 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007579 win_T *win, // window in which to search or NULL
7580 buf_T *buf, // buffer in which to search
7581 linenr_T lnum, // nr of line to start looking for match
7582 colnr_T col, // column to start looking for match
7583 proftime_T *tm, // timeout limit or NULL
7584 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007585{
Bram Moolenaarf4140482020-02-15 23:06:45 +01007586 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007587 return nfa_regexec_both(NULL, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007588}
7589
// ENABLE_LOG is defined at the top of this file for DEBUG builds only.  This
// file is included in "regexp.c", thus undefine it again to avoid that it
// leaks into the code that follows the include.
#if defined(DEBUG)
# undef ENABLE_LOG
#endif