blob: e90f98d474b5d76934efa4a86db734965515c20f [file] [log] [blame]
/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * NFA regular expression implementation.
 *
 * This file is included in "regexp.c".
 */
7
/*
 * Logging of NFA engine.
 *
 * The NFA engine can write four log files:
 * - Error log: Contains NFA engine's fatal errors.
 * - Dump log: Contains compiled NFA state machine's information.
 * - Run log: Contains information of matching procedure.
 * - Debug log: Contains detailed information of matching procedure. Can be
 *   disabled by undefining NFA_REGEXP_DEBUG_LOG.
 * The first one can also be used without debug mode.
 * The last three are enabled when compiled as debug mode and individually
 * disabled by commenting them out.
 * The log files can get quite big!
 * To disable all of this when compiling Vim for debugging, undefine DEBUG in
 * regexp.c
 */
// Names of the NFA engine log files.  They are only defined, and logging is
// only enabled through ENABLE_LOG, when DEBUG is defined.
#ifdef DEBUG
# define NFA_REGEXP_ERROR_LOG   "nfa_regexp_error.log"
# define ENABLE_LOG
# define NFA_REGEXP_DUMP_LOG    "nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG     "nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG   "nfa_regexp_debug.log"
#endif
31
// Added to NFA_ANY - NFA_NUPPER_IC to include a NL.
// Must equal the number of codes in the NFA_ANY .. NFA_NUPPER_IC range, so
// that NFA_FIRST_NL directly follows NFA_NUPPER_IC.
#define NFA_ADD_NL              31

/*
 * Codes for the items of the postfix expression and the NFA states.
 * Numbering starts at -1024 and counts upward, which keeps these codes apart
 * from the positive values that other code in this file treats as literal
 * characters (see the "c > 0" checks).
 */
enum
{
    NFA_SPLIT = -1024,
    NFA_MATCH,
    NFA_EMPTY,                      // matches 0-length

    NFA_START_COLL,                 // [abc] start
    NFA_END_COLL,                   // [abc] end
    NFA_START_NEG_COLL,             // [^abc] start
    NFA_END_NEG_COLL,               // [^abc] end (postfix only)
    NFA_RANGE,                      // range of the two previous items
                                    // (postfix only)
    NFA_RANGE_MIN,                  // low end of a range
    NFA_RANGE_MAX,                  // high end of a range

    NFA_CONCAT,                     // concatenate two previous items (postfix
                                    // only)
    NFA_OR,                         // \| (postfix only)
    NFA_STAR,                       // greedy * (postfix only)
    NFA_STAR_NONGREEDY,             // non-greedy * (postfix only)
    NFA_QUEST,                      // greedy \? (postfix only)
    NFA_QUEST_NONGREEDY,            // non-greedy \? (postfix only)

    NFA_BOL,                        // ^    Begin line
    NFA_EOL,                        // $    End line
    NFA_BOW,                        // \<   Begin word
    NFA_EOW,                        // \>   End word
    NFA_BOF,                        // \%^  Begin file
    NFA_EOF,                        // \%$  End file
    NFA_NEWL,
    NFA_ZSTART,                     // Used for \zs
    NFA_ZEND,                       // Used for \ze
    NFA_NOPEN,                      // Start of subexpression marked with \%(
    NFA_NCLOSE,                     // End of subexpr. marked with \%( ... \)
    NFA_START_INVISIBLE,
    NFA_START_INVISIBLE_FIRST,
    NFA_START_INVISIBLE_NEG,
    NFA_START_INVISIBLE_NEG_FIRST,
    NFA_START_INVISIBLE_BEFORE,
    NFA_START_INVISIBLE_BEFORE_FIRST,
    NFA_START_INVISIBLE_BEFORE_NEG,
    NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
    NFA_START_PATTERN,
    NFA_END_INVISIBLE,
    NFA_END_INVISIBLE_NEG,
    NFA_END_PATTERN,
    NFA_COMPOSING,                  // Next nodes in NFA are part of the
                                    // composing multibyte char
    NFA_END_COMPOSING,              // End of a composing char in the NFA
    NFA_ANY_COMPOSING,              // \%C: Any composing characters.
    NFA_OPT_CHARS,                  // \%[abc]

    // The following are used only in the postfix form, not in the NFA
    NFA_PREV_ATOM_NO_WIDTH,         // Used for \@=
    NFA_PREV_ATOM_NO_WIDTH_NEG,     // Used for \@!
    NFA_PREV_ATOM_JUST_BEFORE,      // Used for \@<=
    NFA_PREV_ATOM_JUST_BEFORE_NEG,  // Used for \@<!
    NFA_PREV_ATOM_LIKE_PATTERN,     // Used for \@>

    NFA_BACKREF1,                   // \1
    NFA_BACKREF2,                   // \2
    NFA_BACKREF3,                   // \3
    NFA_BACKREF4,                   // \4
    NFA_BACKREF5,                   // \5
    NFA_BACKREF6,                   // \6
    NFA_BACKREF7,                   // \7
    NFA_BACKREF8,                   // \8
    NFA_BACKREF9,                   // \9
#ifdef FEAT_SYN_HL
    NFA_ZREF1,                      // \z1
    NFA_ZREF2,                      // \z2
    NFA_ZREF3,                      // \z3
    NFA_ZREF4,                      // \z4
    NFA_ZREF5,                      // \z5
    NFA_ZREF6,                      // \z6
    NFA_ZREF7,                      // \z7
    NFA_ZREF8,                      // \z8
    NFA_ZREF9,                      // \z9
#endif
    NFA_SKIP,                       // Skip characters

    // Start of sub-match; NFA_MOPEN + N is the code for group N.
    NFA_MOPEN,
    NFA_MOPEN1,
    NFA_MOPEN2,
    NFA_MOPEN3,
    NFA_MOPEN4,
    NFA_MOPEN5,
    NFA_MOPEN6,
    NFA_MOPEN7,
    NFA_MOPEN8,
    NFA_MOPEN9,

    // End of sub-match, same order as the NFA_MOPEN codes.
    NFA_MCLOSE,
    NFA_MCLOSE1,
    NFA_MCLOSE2,
    NFA_MCLOSE3,
    NFA_MCLOSE4,
    NFA_MCLOSE5,
    NFA_MCLOSE6,
    NFA_MCLOSE7,
    NFA_MCLOSE8,
    NFA_MCLOSE9,

#ifdef FEAT_SYN_HL
    NFA_ZOPEN,
    NFA_ZOPEN1,
    NFA_ZOPEN2,
    NFA_ZOPEN3,
    NFA_ZOPEN4,
    NFA_ZOPEN5,
    NFA_ZOPEN6,
    NFA_ZOPEN7,
    NFA_ZOPEN8,
    NFA_ZOPEN9,

    NFA_ZCLOSE,
    NFA_ZCLOSE1,
    NFA_ZCLOSE2,
    NFA_ZCLOSE3,
    NFA_ZCLOSE4,
    NFA_ZCLOSE5,
    NFA_ZCLOSE6,
    NFA_ZCLOSE7,
    NFA_ZCLOSE8,
    NFA_ZCLOSE9,
#endif

    // NFA_FIRST_NL
    // The NFA_ANY .. NFA_NUPPER_IC range must hold exactly NFA_ADD_NL codes;
    // adding NFA_ADD_NL to one of them gives the variant that includes a NL.
    NFA_ANY,                        // Match any one character.
    NFA_IDENT,                      // Match identifier char
    NFA_SIDENT,                     // Match identifier char but no digit
    NFA_KWORD,                      // Match keyword char
    NFA_SKWORD,                     // Match word char but no digit
    NFA_FNAME,                      // Match file name char
    NFA_SFNAME,                     // Match file name char but no digit
    NFA_PRINT,                      // Match printable char
    NFA_SPRINT,                     // Match printable char but no digit
    NFA_WHITE,                      // Match whitespace char
    NFA_NWHITE,                     // Match non-whitespace char
    NFA_DIGIT,                      // Match digit char
    NFA_NDIGIT,                     // Match non-digit char
    NFA_HEX,                        // Match hex char
    NFA_NHEX,                       // Match non-hex char
    NFA_OCTAL,                      // Match octal char
    NFA_NOCTAL,                     // Match non-octal char
    NFA_WORD,                       // Match word char
    NFA_NWORD,                      // Match non-word char
    NFA_HEAD,                       // Match head char
    NFA_NHEAD,                      // Match non-head char
    NFA_ALPHA,                      // Match alpha char
    NFA_NALPHA,                     // Match non-alpha char
    NFA_LOWER,                      // Match lowercase char
    NFA_NLOWER,                     // Match non-lowercase char
    NFA_UPPER,                      // Match uppercase char
    NFA_NUPPER,                     // Match non-uppercase char
    NFA_LOWER_IC,                   // Match [a-z]
    NFA_NLOWER_IC,                  // Match [^a-z]
    NFA_UPPER_IC,                   // Match [A-Z]
    NFA_NUPPER_IC,                  // Match [^A-Z]

    NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
    NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

    NFA_CURSOR,                     // Match cursor pos
    NFA_LNUM,                       // Match line number
    NFA_LNUM_GT,                    // Match > line number
    NFA_LNUM_LT,                    // Match < line number
    NFA_COL,                        // Match cursor column
    NFA_COL_GT,                     // Match > cursor column
    NFA_COL_LT,                     // Match < cursor column
    NFA_VCOL,                       // Match cursor virtual column
    NFA_VCOL_GT,                    // Match > cursor virtual column
    NFA_VCOL_LT,                    // Match < cursor virtual column
    NFA_MARK,                       // Match mark
    NFA_MARK_GT,                    // Match > mark
    NFA_MARK_LT,                    // Match < mark
    NFA_VISUAL,                     // Match Visual area

    // Character classes [:alnum:] etc
    NFA_CLASS_ALNUM,
    NFA_CLASS_ALPHA,
    NFA_CLASS_BLANK,
    NFA_CLASS_CNTRL,
    NFA_CLASS_DIGIT,
    NFA_CLASS_GRAPH,
    NFA_CLASS_LOWER,
    NFA_CLASS_PRINT,
    NFA_CLASS_PUNCT,
    NFA_CLASS_SPACE,
    NFA_CLASS_UPPER,
    NFA_CLASS_XDIGIT,
    NFA_CLASS_TAB,
    NFA_CLASS_RETURN,
    NFA_CLASS_BACKSPACE,
    NFA_CLASS_ESCAPE,
    NFA_CLASS_IDENT,
    NFA_CLASS_KEYWORD,
    NFA_CLASS_FNAME
};
234
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100235// Keep in sync with classchars.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200236static int nfa_classcodes[] = {
237 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243 NFA_UPPER, NFA_NUPPER
244};
245
Bram Moolenaar174a8482013-11-28 14:20:17 +0100246static char_u e_nul_found[] = N_("E865: (NFA) Regexp end encountered prematurely");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200247static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
Bram Moolenaara5483442019-02-17 20:17:02 +0100248static char_u e_ill_char_class[] = N_("E877: (NFA regexp) Invalid character class: %d");
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +0200249static char_u e_value_too_large[] = N_("E951: \\% value too large");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200250
Bram Moolenaar0270f382018-07-17 05:43:58 +0200251// Variables only used in nfa_regcomp() and descendants.
252static int nfa_re_flags; // re_flags passed to nfa_regcomp()
253static int *post_start; // holds the postfix form of r.e.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200254static int *post_end;
255static int *post_ptr;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100256
257// Set when the pattern should use the NFA engine.
258// E.g. [[:upper:]] only allows 8bit characters for BT engine,
259// while NFA engine handles multibyte characters correctly.
260static int wants_nfa;
261
Bram Moolenaar0270f382018-07-17 05:43:58 +0200262static int nstate; // Number of states in the NFA.
263static int istate; // Index in the state vector, used in alloc_state()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200264
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100265// If not NULL match must end at this position
Bram Moolenaar307aa162013-06-02 16:34:21 +0200266static save_se_T *nfa_endp = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200267
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100268// 0 for first call to nfa_regmatch(), 1 for recursive call.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +0200269static int nfa_ll_index = 0;
270
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100271static int realloc_post_list(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100272static int nfa_reg(int paren);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200273#ifdef DEBUG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100274static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200275#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100276static int match_follows(nfa_state_T *startstate, int depth);
277static int failure_chance(nfa_state_T *state, int depth);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200278
// helper functions used when doing re2post() ... regatom() parsing
//
// EMIT(c) appends item "c" to the postfix buffer.  When the buffer is full it
// is first enlarged with realloc_post_list().
// NOTE: when enlarging fails this makes the *calling* function return FAIL,
// thus it can only be used in functions that return OK/FAIL.
#define EMIT(c) do {                                                    \
                    if (post_ptr >= post_end                            \
                                    && realloc_post_list() == FAIL)     \
                        return FAIL;                                    \
                    *post_ptr++ = (c);                                  \
                } while (0)
285
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200286/*
287 * Initialize internal variables before NFA compilation.
288 * Return OK on success, FAIL otherwise.
289 */
290 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100291nfa_regcomp_start(
292 char_u *expr,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100293 int re_flags) // see vim_regcomp()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200294{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200295 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200296 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200297
298 nstate = 0;
299 istate = 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100300 // A reasonable estimation for maximum size
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200301 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200302
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100303 // Some items blow up in size, such as [A-z]. Add more space for that.
304 // When it is still not enough realloc_post_list() will be used.
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200305 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200306
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100307 // Size for postfix representation of expr.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200308 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200309
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200310 post_start = alloc(postfix_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200311 if (post_start == NULL)
312 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200313 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200314 post_end = post_start + nstate_max;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100315 wants_nfa = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +0200316 rex.nfa_has_zend = FALSE;
317 rex.nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200318
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100319 // shared with BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200320 regcomp_start(expr, re_flags);
321
322 return OK;
323}
324
325/*
Bram Moolenaard89616e2013-06-06 18:46:06 +0200326 * Figure out if the NFA state list starts with an anchor, must match at start
327 * of the line.
328 */
329 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100330nfa_get_reganch(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200331{
332 nfa_state_T *p = start;
333
334 if (depth > 4)
335 return 0;
336
337 while (p != NULL)
338 {
339 switch (p->c)
340 {
341 case NFA_BOL:
342 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100343 return 1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200344
345 case NFA_ZSTART:
346 case NFA_ZEND:
347 case NFA_CURSOR:
348 case NFA_VISUAL:
349
350 case NFA_MOPEN:
351 case NFA_MOPEN1:
352 case NFA_MOPEN2:
353 case NFA_MOPEN3:
354 case NFA_MOPEN4:
355 case NFA_MOPEN5:
356 case NFA_MOPEN6:
357 case NFA_MOPEN7:
358 case NFA_MOPEN8:
359 case NFA_MOPEN9:
360 case NFA_NOPEN:
361#ifdef FEAT_SYN_HL
362 case NFA_ZOPEN:
363 case NFA_ZOPEN1:
364 case NFA_ZOPEN2:
365 case NFA_ZOPEN3:
366 case NFA_ZOPEN4:
367 case NFA_ZOPEN5:
368 case NFA_ZOPEN6:
369 case NFA_ZOPEN7:
370 case NFA_ZOPEN8:
371 case NFA_ZOPEN9:
372#endif
373 p = p->out;
374 break;
375
376 case NFA_SPLIT:
377 return nfa_get_reganch(p->out, depth + 1)
378 && nfa_get_reganch(p->out1, depth + 1);
379
380 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100381 return 0; // noooo
Bram Moolenaard89616e2013-06-06 18:46:06 +0200382 }
383 }
384 return 0;
385}
386
387/*
388 * Figure out if the NFA state list starts with a character which must match
389 * at start of the match.
390 */
391 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100392nfa_get_regstart(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200393{
394 nfa_state_T *p = start;
395
396 if (depth > 4)
397 return 0;
398
399 while (p != NULL)
400 {
401 switch (p->c)
402 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100403 // all kinds of zero-width matches
Bram Moolenaard89616e2013-06-06 18:46:06 +0200404 case NFA_BOL:
405 case NFA_BOF:
406 case NFA_BOW:
407 case NFA_EOW:
408 case NFA_ZSTART:
409 case NFA_ZEND:
410 case NFA_CURSOR:
411 case NFA_VISUAL:
412 case NFA_LNUM:
413 case NFA_LNUM_GT:
414 case NFA_LNUM_LT:
415 case NFA_COL:
416 case NFA_COL_GT:
417 case NFA_COL_LT:
418 case NFA_VCOL:
419 case NFA_VCOL_GT:
420 case NFA_VCOL_LT:
421 case NFA_MARK:
422 case NFA_MARK_GT:
423 case NFA_MARK_LT:
424
425 case NFA_MOPEN:
426 case NFA_MOPEN1:
427 case NFA_MOPEN2:
428 case NFA_MOPEN3:
429 case NFA_MOPEN4:
430 case NFA_MOPEN5:
431 case NFA_MOPEN6:
432 case NFA_MOPEN7:
433 case NFA_MOPEN8:
434 case NFA_MOPEN9:
435 case NFA_NOPEN:
436#ifdef FEAT_SYN_HL
437 case NFA_ZOPEN:
438 case NFA_ZOPEN1:
439 case NFA_ZOPEN2:
440 case NFA_ZOPEN3:
441 case NFA_ZOPEN4:
442 case NFA_ZOPEN5:
443 case NFA_ZOPEN6:
444 case NFA_ZOPEN7:
445 case NFA_ZOPEN8:
446 case NFA_ZOPEN9:
447#endif
448 p = p->out;
449 break;
450
451 case NFA_SPLIT:
452 {
453 int c1 = nfa_get_regstart(p->out, depth + 1);
454 int c2 = nfa_get_regstart(p->out1, depth + 1);
455
456 if (c1 == c2)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100457 return c1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200458 return 0;
459 }
460
461 default:
Bram Moolenaardecd9542013-06-07 16:31:50 +0200462 if (p->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100463 return p->c; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200464 return 0;
465 }
466 }
467 return 0;
468}
469
470/*
Bram Moolenaar473de612013-06-08 18:19:48 +0200471 * Figure out if the NFA state list contains just literal text and nothing
Bram Moolenaare7766ee2013-06-08 22:30:03 +0200472 * else. If so return a string in allocated memory with what must match after
473 * regstart. Otherwise return NULL.
Bram Moolenaar473de612013-06-08 18:19:48 +0200474 */
475 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100476nfa_get_match_text(nfa_state_T *start)
Bram Moolenaar473de612013-06-08 18:19:48 +0200477{
478 nfa_state_T *p = start;
479 int len = 0;
480 char_u *ret;
481 char_u *s;
482
483 if (p->c != NFA_MOPEN)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100484 return NULL; // just in case
Bram Moolenaar473de612013-06-08 18:19:48 +0200485 p = p->out;
486 while (p->c > 0)
487 {
488 len += MB_CHAR2LEN(p->c);
489 p = p->out;
490 }
491 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
492 return NULL;
493
494 ret = alloc(len);
495 if (ret != NULL)
496 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100497 p = start->out->out; // skip first char, it goes into regstart
Bram Moolenaar473de612013-06-08 18:19:48 +0200498 s = ret;
499 while (p->c > 0)
500 {
Bram Moolenaar473de612013-06-08 18:19:48 +0200501 if (has_mbyte)
502 s += (*mb_char2bytes)(p->c, s);
503 else
Bram Moolenaar473de612013-06-08 18:19:48 +0200504 *s++ = p->c;
505 p = p->out;
506 }
507 *s = NUL;
508 }
509 return ret;
510}
511
512/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200513 * Allocate more space for post_start. Called when
514 * running above the estimated number of states.
515 */
516 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100517realloc_post_list(void)
Bram Moolenaar16299b52013-05-30 18:45:23 +0200518{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200519 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100520 int new_max;
Bram Moolenaar16299b52013-05-30 18:45:23 +0200521 int *new_start;
522 int *old_start;
523
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100524 // For weird patterns the number of states can be very high. Increasing by
525 // 50% seems a reasonable compromise between memory use and speed.
526 new_max = nstate_max * 3 / 2;
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200527 new_start = ALLOC_MULT(int, new_max);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200528 if (new_start == NULL)
529 return FAIL;
530 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
Bram Moolenaar16299b52013-05-30 18:45:23 +0200531 old_start = post_start;
532 post_start = new_start;
533 post_ptr = new_start + (post_ptr - old_start);
534 post_end = post_start + new_max;
535 vim_free(old_start);
536 return OK;
537}
538
539/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200540 * Search between "start" and "end" and try to recognize a
541 * character class in expanded form. For example [0-9].
542 * On success, return the id the character class to be emitted.
543 * On failure, return 0 (=FAIL)
544 * Start points to the first char of the range, while end should point
545 * to the closing brace.
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200546 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
547 * need to be interpreted as [a-zA-Z].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200548 */
549 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100550nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200551{
Bram Moolenaarf8115092013-06-04 17:47:05 +0200552# define CLASS_not 0x80
553# define CLASS_af 0x40
554# define CLASS_AF 0x20
555# define CLASS_az 0x10
556# define CLASS_AZ 0x08
557# define CLASS_o7 0x04
558# define CLASS_o9 0x02
559# define CLASS_underscore 0x01
560
561 int newl = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200562 char_u *p;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200563 int config = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200564
565 if (extra_newl == TRUE)
566 newl = TRUE;
567
568 if (*end != ']')
569 return FAIL;
570 p = start;
571 if (*p == '^')
572 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200573 config |= CLASS_not;
Bram Moolenaar01d89dd2013-06-03 19:41:06 +0200574 p++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200575 }
576
577 while (p < end)
578 {
579 if (p + 2 < end && *(p + 1) == '-')
580 {
581 switch (*p)
582 {
583 case '0':
584 if (*(p + 2) == '9')
585 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200586 config |= CLASS_o9;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200587 break;
588 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200589 if (*(p + 2) == '7')
590 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200591 config |= CLASS_o7;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200592 break;
593 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200594 return FAIL;
595
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200596 case 'a':
597 if (*(p + 2) == 'z')
598 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200599 config |= CLASS_az;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200600 break;
601 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200602 if (*(p + 2) == 'f')
603 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200604 config |= CLASS_af;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200605 break;
606 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200607 return FAIL;
608
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200609 case 'A':
610 if (*(p + 2) == 'Z')
611 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200612 config |= CLASS_AZ;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200613 break;
614 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200615 if (*(p + 2) == 'F')
616 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200617 config |= CLASS_AF;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200618 break;
619 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200620 return FAIL;
621
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200622 default:
623 return FAIL;
624 }
625 p += 3;
626 }
627 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
628 {
629 newl = TRUE;
630 p += 2;
631 }
632 else if (*p == '_')
633 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200634 config |= CLASS_underscore;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200635 p ++;
636 }
637 else if (*p == '\n')
638 {
639 newl = TRUE;
640 p ++;
641 }
642 else
643 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100644 } // while (p < end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200645
646 if (p != end)
647 return FAIL;
648
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200649 if (newl == TRUE)
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200650 extra_newl = NFA_ADD_NL;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200651
652 switch (config)
653 {
654 case CLASS_o9:
655 return extra_newl + NFA_DIGIT;
656 case CLASS_not | CLASS_o9:
657 return extra_newl + NFA_NDIGIT;
658 case CLASS_af | CLASS_AF | CLASS_o9:
659 return extra_newl + NFA_HEX;
660 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
661 return extra_newl + NFA_NHEX;
662 case CLASS_o7:
663 return extra_newl + NFA_OCTAL;
664 case CLASS_not | CLASS_o7:
665 return extra_newl + NFA_NOCTAL;
666 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
667 return extra_newl + NFA_WORD;
668 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
669 return extra_newl + NFA_NWORD;
670 case CLASS_az | CLASS_AZ | CLASS_underscore:
671 return extra_newl + NFA_HEAD;
672 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
673 return extra_newl + NFA_NHEAD;
674 case CLASS_az | CLASS_AZ:
675 return extra_newl + NFA_ALPHA;
676 case CLASS_not | CLASS_az | CLASS_AZ:
677 return extra_newl + NFA_NALPHA;
678 case CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200679 return extra_newl + NFA_LOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200680 case CLASS_not | CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200681 return extra_newl + NFA_NLOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200682 case CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200683 return extra_newl + NFA_UPPER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200684 case CLASS_not | CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200685 return extra_newl + NFA_NUPPER_IC;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200686 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200687 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200688}
689
690/*
691 * Produce the bytes for equivalence class "c".
692 * Currently only handles latin1, latin9 and utf-8.
693 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
694 * equivalent to 'a OR b OR c'
695 *
696 * NOTE! When changing this function, also update reg_equi_class()
697 */
698 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100699nfa_emit_equi_class(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200700{
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200701#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200702
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200703 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
704 || STRCMP(p_enc, "iso-8859-15") == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200705 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200706#ifdef EBCDIC
707# define A_circumflex 0x62
708# define A_diaeresis 0x63
709# define A_grave 0x64
710# define A_acute 0x65
711# define A_virguilla 0x66
712# define A_ring 0x67
713# define C_cedilla 0x68
714# define E_acute 0x71
715# define E_circumflex 0x72
716# define E_diaeresis 0x73
717# define E_grave 0x74
718# define I_acute 0x75
719# define I_circumflex 0x76
720# define I_diaeresis 0x77
721# define I_grave 0x78
722# define N_virguilla 0x69
723# define O_circumflex 0xeb
724# define O_diaeresis 0xec
725# define O_grave 0xed
726# define O_acute 0xee
727# define O_virguilla 0xef
728# define O_slash 0x80
729# define U_circumflex 0xfb
730# define U_diaeresis 0xfc
731# define U_grave 0xfd
732# define U_acute 0xfe
733# define Y_acute 0xba
734# define a_grave 0x42
735# define a_acute 0x43
736# define a_circumflex 0x44
737# define a_virguilla 0x45
738# define a_diaeresis 0x46
739# define a_ring 0x47
740# define c_cedilla 0x48
741# define e_grave 0x51
742# define e_acute 0x52
743# define e_circumflex 0x53
744# define e_diaeresis 0x54
745# define i_grave 0x55
746# define i_acute 0x56
747# define i_circumflex 0x57
748# define i_diaeresis 0x58
749# define n_virguilla 0x49
750# define o_grave 0xcb
751# define o_acute 0xcc
752# define o_circumflex 0xcd
753# define o_virguilla 0xce
754# define o_diaeresis 0xcf
755# define o_slash 0x70
756# define u_grave 0xdb
757# define u_acute 0xdc
758# define u_circumflex 0xdd
759# define u_diaeresis 0xde
760# define y_acute 0x8d
761# define y_diaeresis 0xdf
762#else
763# define A_grave 0xc0
764# define A_acute 0xc1
765# define A_circumflex 0xc2
766# define A_virguilla 0xc3
767# define A_diaeresis 0xc4
768# define A_ring 0xc5
769# define C_cedilla 0xc7
770# define E_grave 0xc8
771# define E_acute 0xc9
772# define E_circumflex 0xca
773# define E_diaeresis 0xcb
774# define I_grave 0xcc
775# define I_acute 0xcd
776# define I_circumflex 0xce
777# define I_diaeresis 0xcf
778# define N_virguilla 0xd1
779# define O_grave 0xd2
780# define O_acute 0xd3
781# define O_circumflex 0xd4
782# define O_virguilla 0xd5
783# define O_diaeresis 0xd6
784# define O_slash 0xd8
785# define U_grave 0xd9
786# define U_acute 0xda
787# define U_circumflex 0xdb
788# define U_diaeresis 0xdc
789# define Y_acute 0xdd
790# define a_grave 0xe0
791# define a_acute 0xe1
792# define a_circumflex 0xe2
793# define a_virguilla 0xe3
794# define a_diaeresis 0xe4
795# define a_ring 0xe5
796# define c_cedilla 0xe7
797# define e_grave 0xe8
798# define e_acute 0xe9
799# define e_circumflex 0xea
800# define e_diaeresis 0xeb
801# define i_grave 0xec
802# define i_acute 0xed
803# define i_circumflex 0xee
804# define i_diaeresis 0xef
805# define n_virguilla 0xf1
806# define o_grave 0xf2
807# define o_acute 0xf3
808# define o_circumflex 0xf4
809# define o_virguilla 0xf5
810# define o_diaeresis 0xf6
811# define o_slash 0xf8
812# define u_grave 0xf9
813# define u_acute 0xfa
814# define u_circumflex 0xfb
815# define u_diaeresis 0xfc
816# define y_acute 0xfd
817# define y_diaeresis 0xff
818#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200819 switch (c)
820 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200821 case 'A': case A_grave: case A_acute: case A_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200822 case A_virguilla: case A_diaeresis: case A_ring:
823 case 0x100: case 0x102: case 0x104: case 0x1cd:
824 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
825 case 0x202: case 0x226: case 0x23a: case 0x1e00:
826 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
827 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
828 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
829 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
830 EMIT2(A_circumflex) EMIT2(A_virguilla)
831 EMIT2(A_diaeresis) EMIT2(A_ring)
832 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
833 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
834 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
835 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
836 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
837 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
838 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
839 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200840 return OK;
841
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200842 case 'B': case 0x181: case 0x243: case 0x1e02:
843 case 0x1e04: case 0x1e06:
844 EMIT2('B')
845 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
846 EMIT2(0x1e04) EMIT2(0x1e06)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200847 return OK;
848
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200849 case 'C': case C_cedilla: case 0x106: case 0x108:
850 case 0x10a: case 0x10c: case 0x187: case 0x23b:
851 case 0x1e08: case 0xa792:
852 EMIT2('C') EMIT2(C_cedilla)
853 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
854 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
855 EMIT2(0x1e08) EMIT2(0xa792)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200856 return OK;
857
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200858 case 'D': case 0x10e: case 0x110: case 0x18a:
859 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
860 case 0x1e12:
861 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
862 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
863 EMIT2(0x1e10) EMIT2(0x1e12)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200864 return OK;
865
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200866 case 'E': case E_grave: case E_acute: case E_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200867 case E_diaeresis: case 0x112: case 0x114: case 0x116:
868 case 0x118: case 0x11a: case 0x204: case 0x206:
869 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
870 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
871 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
872 case 0x1ec2: case 0x1ec4: case 0x1ec6:
873 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
874 EMIT2(E_circumflex) EMIT2(E_diaeresis)
875 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
876 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
877 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
878 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
879 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
880 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
881 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
882 EMIT2(0x1ec6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200883 return OK;
884
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200885 case 'F': case 0x191: case 0x1e1e: case 0xa798:
886 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200887 return OK;
888
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200889 case 'G': case 0x11c: case 0x11e: case 0x120:
890 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
891 case 0x1f4: case 0x1e20: case 0xa7a0:
892 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
893 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
894 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
895 EMIT2(0xa7a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200896 return OK;
897
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200898 case 'H': case 0x124: case 0x126: case 0x21e:
899 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
900 case 0x1e2a: case 0x2c67:
901 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
902 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
903 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200904 return OK;
905
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200906 case 'I': case I_grave: case I_acute: case I_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200907 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
908 case 0x12e: case 0x130: case 0x197: case 0x1cf:
909 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
910 case 0x1ec8: case 0x1eca:
911 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
912 EMIT2(I_circumflex) EMIT2(I_diaeresis)
913 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
914 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
915 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
916 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
917 EMIT2(0x1eca)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200918 return OK;
919
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200920 case 'J': case 0x134: case 0x248:
921 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200922 return OK;
923
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200924 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
925 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
926 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
927 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
928 EMIT2(0x2c69) EMIT2(0xa740)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200929 return OK;
930
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200931 case 'L': case 0x139: case 0x13b: case 0x13d:
932 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
933 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
934 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
935 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
936 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
937 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200938 return OK;
939
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200940 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
941 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
942 EMIT2(0x1e42)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200943 return OK;
944
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200945 case 'N': case N_virguilla:
946 case 0x143: case 0x145: case 0x147: case 0x1f8:
947 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
948 case 0xa7a4:
949 EMIT2('N') EMIT2(N_virguilla)
950 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
951 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
952 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200953 return OK;
954
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200955 case 'O': case O_grave: case O_acute: case O_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200956 case O_virguilla: case O_diaeresis: case O_slash:
957 case 0x14c: case 0x14e: case 0x150: case 0x19f:
958 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
959 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
960 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
961 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
962 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
963 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
964 case 0x1ede: case 0x1ee0: case 0x1ee2:
965 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
966 EMIT2(O_circumflex) EMIT2(O_virguilla)
967 EMIT2(O_diaeresis) EMIT2(O_slash)
968 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
969 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
970 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
971 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
972 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
973 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
974 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
975 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
976 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
977 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
978 EMIT2(0x1ee2)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200979 return OK;
980
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200981 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
982 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
983 EMIT2(0x2c63)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200984 return OK;
985
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200986 case 'Q': case 0x24a:
987 EMIT2('Q') EMIT2(0x24a)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200988 return OK;
989
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200990 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
991 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
992 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
993 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
994 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
995 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
996 EMIT2(0xa7a6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200997 return OK;
998
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200999 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
1000 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
1001 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
1002 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
1003 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
1004 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
1005 EMIT2(0xa7a8)
1006 return OK;
1007
1008 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
1009 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
1010 case 0x1e6e: case 0x1e70:
1011 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
1012 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
1013 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001014 return OK;
1015
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001016 case 'U': case U_grave: case U_acute: case U_diaeresis:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001017 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
1018 case 0x16e: case 0x170: case 0x172: case 0x1af:
1019 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
1020 case 0x1db: case 0x214: case 0x216: case 0x244:
1021 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
1022 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
1023 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
1024 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
1025 EMIT2(U_diaeresis) EMIT2(U_circumflex)
1026 EMIT2(0x168) EMIT2(0x16a)
1027 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
1028 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
1029 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
1030 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
1031 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
1032 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
1033 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
1034 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
1035 EMIT2(0x1ef0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001036 return OK;
1037
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001038 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
1039 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001040 return OK;
1041
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001042 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
1043 case 0x1e86: case 0x1e88:
1044 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
1045 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001046 return OK;
1047
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001048 case 'X': case 0x1e8a: case 0x1e8c:
1049 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001050 return OK;
1051
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001052 case 'Y': case Y_acute: case 0x176: case 0x178:
1053 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
1054 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
1055 EMIT2('Y') EMIT2(Y_acute)
1056 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
1057 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
1058 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
1059 EMIT2(0x1ef8)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001060 return OK;
1061
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001062 case 'Z': case 0x179: case 0x17b: case 0x17d:
1063 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1064 case 0x2c6b:
1065 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1066 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1067 EMIT2(0x1e94) EMIT2(0x2c6b)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001068 return OK;
1069
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001070 case 'a': case a_grave: case a_acute: case a_circumflex:
1071 case a_virguilla: case a_diaeresis: case a_ring:
1072 case 0x101: case 0x103: case 0x105: case 0x1ce:
1073 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1074 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1075 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1076 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1077 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1078 case 0x1eb7: case 0x2c65:
1079 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1080 EMIT2(a_circumflex) EMIT2(a_virguilla)
1081 EMIT2(a_diaeresis) EMIT2(a_ring)
1082 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1083 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1084 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1085 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1086 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1087 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1088 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1089 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1090 EMIT2(0x1eb7) EMIT2(0x2c65)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001091 return OK;
1092
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001093 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1094 case 0x1e03: case 0x1e05: case 0x1e07:
1095 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1096 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001097 return OK;
1098
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001099 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1100 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1101 case 0xa794:
1102 EMIT2('c') EMIT2(c_cedilla)
1103 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1104 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1105 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001106 return OK;
1107
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001108 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1109 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1110 case 0x1e11: case 0x1e13:
1111 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1112 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1113 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1114 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001115 return OK;
1116
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001117 case 'e': case e_grave: case e_acute: case e_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001118 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1119 case 0x119: case 0x11b: case 0x205: case 0x207:
1120 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1121 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1122 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1123 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1124 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1125 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1126 EMIT2(0x113) EMIT2(0x115)
1127 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1128 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1129 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1130 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1131 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1132 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1133 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001134 return OK;
1135
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001136 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1137 case 0x1e1f: case 0xa799:
1138 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1139 EMIT2(0x1e1f) EMIT2(0xa799)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001140 return OK;
1141
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001142 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1143 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1144 case 0x1e21: case 0xa7a1:
1145 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1146 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1147 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1148 EMIT2(0x1e21) EMIT2(0xa7a1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001149 return OK;
1150
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001151 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1152 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1153 case 0x1e96: case 0x2c68: case 0xa795:
1154 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1155 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1156 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1157 EMIT2(0x2c68) EMIT2(0xa795)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001158 return OK;
1159
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001160 case 'i': case i_grave: case i_acute: case i_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001161 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1162 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1163 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1164 case 0x1ec9: case 0x1ecb:
1165 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1166 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1167 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1168 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1169 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1170 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1171 EMIT2(0x1ecb) EMIT2(0x1ecb)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001172 return OK;
1173
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001174 case 'j': case 0x135: case 0x1f0: case 0x249:
1175 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001176 return OK;
1177
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001178 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1179 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1180 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1181 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1182 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001183 return OK;
1184
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001185 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1186 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1187 case 0x1e3d: case 0x2c61:
1188 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1189 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1190 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1191 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001192 return OK;
1193
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001194 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1195 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1196 EMIT2(0x1e41) EMIT2(0x1e43)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001197 return OK;
1198
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001199 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1200 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1201 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1202 EMIT2('n') EMIT2(n_virguilla)
1203 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1204 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1205 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1206 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001207 return OK;
1208
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001209 case 'o': case o_grave: case o_acute: case o_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001210 case o_virguilla: case o_diaeresis: case o_slash:
1211 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1212 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1213 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1214 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1215 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1216 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1217 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1218 case 0x1edf: case 0x1ee1: case 0x1ee3:
1219 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1220 EMIT2(o_circumflex) EMIT2(o_virguilla)
1221 EMIT2(o_diaeresis) EMIT2(o_slash)
1222 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1223 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1224 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1225 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1226 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1227 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1228 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1229 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1230 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1231 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1232 EMIT2(0x1ee3)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001233 return OK;
1234
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001235 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1236 case 0x1e55: case 0x1e57:
1237 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1238 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001239 return OK;
1240
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001241 case 'q': case 0x24b: case 0x2a0:
1242 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001243 return OK;
1244
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001245 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1246 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1247 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1248 case 0xa7a7:
1249 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1250 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1251 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1252 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001253 return OK;
1254
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001255 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1256 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1257 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1258 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1259 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1260 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1261 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1262 return OK;
1263
1264 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1265 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1266 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1267 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1268 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1269 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1270 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001271 return OK;
1272
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001273 case 'u': case u_grave: case u_acute: case u_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001274 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1275 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1276 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1277 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1278 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1279 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1280 case 0x1eed: case 0x1eef: case 0x1ef1:
1281 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1282 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1283 EMIT2(0x169) EMIT2(0x16b)
1284 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1285 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1286 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1287 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1288 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1289 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1290 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1291 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1292 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001293 return OK;
1294
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001295 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1296 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1297 EMIT2(0x1e7f)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001298 return OK;
1299
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001300 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1301 case 0x1e87: case 0x1e89: case 0x1e98:
1302 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1303 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001304 return OK;
1305
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001306 case 'x': case 0x1e8b: case 0x1e8d:
1307 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001308 return OK;
1309
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001310 case 'y': case y_acute: case y_diaeresis: case 0x177:
1311 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1312 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1313 case 0x1ef9:
1314 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1315 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1316 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1317 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001318 return OK;
1319
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001320 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1321 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1322 case 0x1e95: case 0x2c6c:
1323 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1324 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1325 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001326 return OK;
1327
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001328 // default: character itself
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001329 }
1330 }
1331
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001332 EMIT2(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001333 return OK;
1334#undef EMIT2
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001335#undef EMIT2
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001336}
1337
1338/*
1339 * Code to parse a regular expression.
1340 *
1341 * We try to reuse parsing functions in regexp.c to
1342 * minimize surprise and keep the syntax consistent.
1343 */
1344
1345/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001346 * Parse the lowest level.
1347 *
1348 * An atom can be one of a long list of items. Many atoms match one character
1349 * in the text. It is often an ordinary character or a character class.
1350 * Parentheses can be used to make a pattern into an atom. The "\z(\)"
1351 * construct is only for syntax highlighting.
1352 *
1353 * atom ::= ordinary-atom
1354 * or \( pattern \)
1355 * or \%( pattern \)
1356 * or \z( pattern \)
1357 */
1358 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001359nfa_regatom(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001360{
1361 int c;
1362 int charclass;
1363 int equiclass;
1364 int collclass;
1365 int got_coll_char;
1366 char_u *p;
1367 char_u *endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001368 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001369 int extra = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001370 int emit_range;
1371 int negated;
1372 int result;
1373 int startc = -1;
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001374 int save_prev_at_start = prev_at_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001375
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001376 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001377 switch (c)
1378 {
Bram Moolenaar47196582013-05-25 22:04:23 +02001379 case NUL:
Bram Moolenaar174a8482013-11-28 14:20:17 +01001380 EMSG_RET_FAIL(_(e_nul_found));
Bram Moolenaar47196582013-05-25 22:04:23 +02001381
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001382 case Magic('^'):
1383 EMIT(NFA_BOL);
1384 break;
1385
1386 case Magic('$'):
1387 EMIT(NFA_EOL);
1388#if defined(FEAT_SYN_HL) || defined(PROTO)
1389 had_eol = TRUE;
1390#endif
1391 break;
1392
1393 case Magic('<'):
1394 EMIT(NFA_BOW);
1395 break;
1396
1397 case Magic('>'):
1398 EMIT(NFA_EOW);
1399 break;
1400
1401 case Magic('_'):
1402 c = no_Magic(getchr());
Bram Moolenaar174a8482013-11-28 14:20:17 +01001403 if (c == NUL)
1404 EMSG_RET_FAIL(_(e_nul_found));
1405
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001406 if (c == '^') // "\_^" is start-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001407 {
1408 EMIT(NFA_BOL);
1409 break;
1410 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001411 if (c == '$') // "\_$" is end-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001412 {
1413 EMIT(NFA_EOL);
1414#if defined(FEAT_SYN_HL) || defined(PROTO)
1415 had_eol = TRUE;
1416#endif
1417 break;
1418 }
1419
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001420 extra = NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001421
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001422 // "\_[" is collection plus newline
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001423 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001424 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001425
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001426 // "\_x" is character class plus newline
1427 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001428
1429 /*
1430 * Character classes.
1431 */
1432 case Magic('.'):
1433 case Magic('i'):
1434 case Magic('I'):
1435 case Magic('k'):
1436 case Magic('K'):
1437 case Magic('f'):
1438 case Magic('F'):
1439 case Magic('p'):
1440 case Magic('P'):
1441 case Magic('s'):
1442 case Magic('S'):
1443 case Magic('d'):
1444 case Magic('D'):
1445 case Magic('x'):
1446 case Magic('X'):
1447 case Magic('o'):
1448 case Magic('O'):
1449 case Magic('w'):
1450 case Magic('W'):
1451 case Magic('h'):
1452 case Magic('H'):
1453 case Magic('a'):
1454 case Magic('A'):
1455 case Magic('l'):
1456 case Magic('L'):
1457 case Magic('u'):
1458 case Magic('U'):
1459 p = vim_strchr(classchars, no_Magic(c));
1460 if (p == NULL)
1461 {
Bram Moolenaar174a8482013-11-28 14:20:17 +01001462 if (extra == NFA_ADD_NL)
1463 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001464 semsg(_(e_ill_char_class), c);
Bram Moolenaar174a8482013-11-28 14:20:17 +01001465 rc_did_emsg = TRUE;
1466 return FAIL;
1467 }
Bram Moolenaarb5443cc2019-01-15 20:19:40 +01001468 siemsg("INTERNAL: Unknown character class char: %d", c);
Bram Moolenaar5714b802013-05-28 22:03:20 +02001469 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001470 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001471
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001472 // When '.' is followed by a composing char ignore the dot, so that
1473 // the composing char is matched here.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001474 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1475 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001476 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001477 c = getchr();
1478 goto nfa_do_multibyte;
1479 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001480 EMIT(nfa_classcodes[p - classchars]);
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001481 if (extra == NFA_ADD_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001482 {
1483 EMIT(NFA_NEWL);
1484 EMIT(NFA_OR);
1485 regflags |= RF_HASNL;
1486 }
1487 break;
1488
1489 case Magic('n'):
1490 if (reg_string)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001491 // In a string "\n" matches a newline character.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001492 EMIT(NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001493 else
1494 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001495 // In buffer text "\n" matches the end of a line.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001496 EMIT(NFA_NEWL);
1497 regflags |= RF_HASNL;
1498 }
1499 break;
1500
1501 case Magic('('):
1502 if (nfa_reg(REG_PAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001503 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001504 break;
1505
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001506 case Magic('|'):
1507 case Magic('&'):
1508 case Magic(')'):
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001509 semsg(_(e_misplaced), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001510 return FAIL;
1511
1512 case Magic('='):
1513 case Magic('?'):
1514 case Magic('+'):
1515 case Magic('@'):
1516 case Magic('*'):
1517 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001518 // these should follow an atom, not form an atom
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001519 semsg(_(e_misplaced), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001520 return FAIL;
1521
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001522 case Magic('~'):
1523 {
1524 char_u *lp;
1525
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001526 // Previous substitute pattern.
1527 // Generated as "\%(pattern\)".
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001528 if (reg_prev_sub == NULL)
1529 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001530 emsg(_(e_no_previous_substitute_regular_expression));
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001531 return FAIL;
1532 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001533 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001534 {
1535 EMIT(PTR2CHAR(lp));
1536 if (lp != reg_prev_sub)
1537 EMIT(NFA_CONCAT);
1538 }
1539 EMIT(NFA_NOPEN);
1540 break;
1541 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001542
Bram Moolenaar428e9872013-05-30 17:05:39 +02001543 case Magic('1'):
1544 case Magic('2'):
1545 case Magic('3'):
1546 case Magic('4'):
1547 case Magic('5'):
1548 case Magic('6'):
1549 case Magic('7'):
1550 case Magic('8'):
1551 case Magic('9'):
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001552 {
1553 int refnum = no_Magic(c) - '1';
1554
1555 if (!seen_endbrace(refnum + 1))
1556 return FAIL;
1557 EMIT(NFA_BACKREF1 + refnum);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001558 rex.nfa_has_backref = TRUE;
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001559 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02001560 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001561
1562 case Magic('z'):
1563 c = no_Magic(getchr());
1564 switch (c)
1565 {
1566 case 's':
1567 EMIT(NFA_ZSTART);
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001568 if (re_mult_next("\\zs") == FAIL)
1569 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001570 break;
1571 case 'e':
1572 EMIT(NFA_ZEND);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001573 rex.nfa_has_zend = TRUE;
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001574 if (re_mult_next("\\ze") == FAIL)
1575 return FAIL;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02001576 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001577#ifdef FEAT_SYN_HL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001578 case '1':
1579 case '2':
1580 case '3':
1581 case '4':
1582 case '5':
1583 case '6':
1584 case '7':
1585 case '8':
1586 case '9':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001587 // \z1...\z9
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001588 if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001589 EMSG_RET_FAIL(_(e_z1_z9_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001590 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001591 // No need to set rex.nfa_has_backref, the sub-matches don't
1592 // change when \z1 .. \z9 matches or not.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001593 re_has_z = REX_USE;
1594 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001595 case '(':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001596 // \z(
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001597 if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001598 EMSG_RET_FAIL(_(e_z_not_allowed_here));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001599 if (nfa_reg(REG_ZPAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001600 return FAIL; // cascaded error
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001601 re_has_z = REX_SET;
1602 break;
1603#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001604 default:
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001605 semsg(_("E867: (NFA) Unknown operator '\\z%c'"),
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001606 no_Magic(c));
1607 return FAIL;
1608 }
1609 break;
1610
1611 case Magic('%'):
1612 c = no_Magic(getchr());
1613 switch (c)
1614 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001615 // () without a back reference
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001616 case '(':
1617 if (nfa_reg(REG_NPAREN) == FAIL)
1618 return FAIL;
1619 EMIT(NFA_NOPEN);
1620 break;
1621
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001622 case 'd': // %d123 decimal
1623 case 'o': // %o123 octal
1624 case 'x': // %xab hex 2
1625 case 'u': // %uabcd hex 4
1626 case 'U': // %U1234abcd hex 8
Bram Moolenaar47196582013-05-25 22:04:23 +02001627 {
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001628 long nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001629
Bram Moolenaar47196582013-05-25 22:04:23 +02001630 switch (c)
1631 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001632 case 'd': nr = getdecchrs(); break;
1633 case 'o': nr = getoctchrs(); break;
1634 case 'x': nr = gethexchrs(2); break;
1635 case 'u': nr = gethexchrs(4); break;
1636 case 'U': nr = gethexchrs(8); break;
1637 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +02001638 }
1639
Bram Moolenaar527a2d82019-02-21 22:28:51 +01001640 if (nr < 0 || nr > INT_MAX)
Bram Moolenaara6f79292022-01-04 21:30:47 +00001641 EMSG2_RET_FAIL(_(e_invalid_character_after_str_2),
1642 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001643 // A NUL is stored in the text as NL
1644 // TODO: what if a composing character follows?
Bram Moolenaar595cad22013-09-22 13:57:24 +02001645 EMIT(nr == 0 ? 0x0a : nr);
Bram Moolenaar47196582013-05-25 22:04:23 +02001646 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001647 break;
1648
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001649 // Catch \%^ and \%$ regardless of where they appear in the
1650 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001651 case '^':
1652 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001653 break;
1654
1655 case '$':
1656 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001657 break;
1658
1659 case '#':
Bram Moolenaar423532e2013-05-29 21:14:42 +02001660 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001661 break;
1662
1663 case 'V':
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001664 EMIT(NFA_VISUAL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001665 break;
1666
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02001667 case 'C':
1668 EMIT(NFA_ANY_COMPOSING);
1669 break;
1670
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001671 case '[':
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001672 {
1673 int n;
1674
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001675 // \%[abc]
Bram Moolenaard7986252013-06-17 21:33:41 +02001676 for (n = 0; (c = peekchr()) != ']'; ++n)
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001677 {
1678 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001679 EMSG2_RET_FAIL(_(e_missing_sb_after_str),
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001680 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001681 // recursive call!
Bram Moolenaard7986252013-06-17 21:33:41 +02001682 if (nfa_regatom() == FAIL)
1683 return FAIL;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001684 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001685 getchr(); // get the ]
Bram Moolenaar2976c022013-06-05 21:30:37 +02001686 if (n == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001687 EMSG2_RET_FAIL(_(e_empty_str_brackets),
Bram Moolenaar2976c022013-06-05 21:30:37 +02001688 reg_magic == MAGIC_ALL);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001689 EMIT(NFA_OPT_CHARS);
1690 EMIT(n);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001691
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001692 // Emit as "\%(\%[abc]\)" to be able to handle
1693 // "\%[abc]*" which would cause the empty string to be
1694 // matched an unlimited number of times. NFA_NOPEN is
1695 // added only once at a position, while NFA_SPLIT is
1696 // added multiple times. This is more efficient than
1697 // not allowing NFA_SPLIT multiple times, it is used
1698 // a lot.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001699 EMIT(NFA_NOPEN);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001700 break;
1701 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02001702
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001703 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +02001704 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001705 long_u n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001706 int cmp = c;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001707 int cur = FALSE;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001708
1709 if (c == '<' || c == '>')
1710 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001711 if (no_Magic(c) == '.')
1712 {
1713 cur = TRUE;
1714 c = getchr();
1715 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001716 while (VIM_ISDIGIT(c))
1717 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001718 long_u tmp;
1719
1720 if (cur)
1721 semsg(_(e_regexp_number_after_dot_pos_search),
1722 no_Magic(c));
1723 tmp = n * 10 + (c - '0');
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001724
1725 if (tmp < n)
1726 {
1727 // overflow.
1728 emsg(_(e_value_too_large));
1729 return FAIL;
1730 }
1731 n = tmp;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001732 c = getchr();
1733 }
1734 if (c == 'l' || c == 'c' || c == 'v')
1735 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001736 long_u limit = INT_MAX;
Bram Moolenaar9403a212019-02-13 18:35:06 +01001737
Bram Moolenaar423532e2013-05-29 21:14:42 +02001738 if (c == 'l')
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001739 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001740 if (cur)
1741 n = curwin->w_cursor.lnum;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001742 // \%{n}l \%{n}<l \%{n}>l
Bram Moolenaar423532e2013-05-29 21:14:42 +02001743 EMIT(cmp == '<' ? NFA_LNUM_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001744 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001745 if (save_prev_at_start)
1746 at_start = TRUE;
1747 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001748 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001749 {
1750 if (cur)
1751 {
1752 n = curwin->w_cursor.col;
1753 n++;
1754 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001755 // \%{n}c \%{n}<c \%{n}>c
Bram Moolenaar423532e2013-05-29 21:14:42 +02001756 EMIT(cmp == '<' ? NFA_COL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001757 cmp == '>' ? NFA_COL_GT : NFA_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001758 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001759 else
Bram Moolenaar9403a212019-02-13 18:35:06 +01001760 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001761 if (cur)
1762 {
1763 colnr_T vcol = 0;
1764
1765 getvvcol(curwin, &curwin->w_cursor,
1766 NULL, NULL, &vcol);
1767 n = ++vcol;
1768 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001769 // \%{n}v \%{n}<v \%{n}>v
Bram Moolenaar423532e2013-05-29 21:14:42 +02001770 EMIT(cmp == '<' ? NFA_VCOL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001771 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
Bram Moolenaar9403a212019-02-13 18:35:06 +01001772 limit = INT_MAX / MB_MAXBYTES;
1773 }
1774 if (n >= limit)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001775 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001776 emsg(_(e_value_too_large));
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001777 return FAIL;
1778 }
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001779 EMIT((int)n);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001780 break;
1781 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02001782 else if (c == '\'' && n == 0)
1783 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001784 // \%'m \%<'m \%>'m
Bram Moolenaar044aa292013-06-04 21:27:38 +02001785 EMIT(cmp == '<' ? NFA_MARK_LT :
1786 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001787 EMIT(getchr());
Bram Moolenaar044aa292013-06-04 21:27:38 +02001788 break;
1789 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001790 }
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001791 semsg(_("E867: (NFA) Unknown operator '\\%%%c'"),
Bram Moolenaar5714b802013-05-28 22:03:20 +02001792 no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001793 return FAIL;
1794 }
1795 break;
1796
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001797 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001798collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001799 /*
Bram Moolenaar417bad22013-06-07 14:08:30 +02001800 * [abc] uses NFA_START_COLL - NFA_END_COLL
1801 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1802 * Each character is produced as a regular state, using
1803 * NFA_CONCAT to bind them together.
1804 * Besides normal characters there can be:
1805 * - character classes NFA_CLASS_*
1806 * - ranges, two characters followed by NFA_RANGE.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001807 */
1808
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001809 p = regparse;
1810 endp = skip_anyof(p);
1811 if (*endp == ']')
1812 {
1813 /*
1814 * Try to reverse engineer character classes. For example,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001815 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001816 * and perform the necessary substitutions in the NFA.
1817 */
1818 result = nfa_recognize_char_class(regparse, endp,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001819 extra == NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001820 if (result != FAIL)
1821 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001822 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001823 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001824 EMIT(result - NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001825 EMIT(NFA_NEWL);
1826 EMIT(NFA_OR);
1827 }
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001828 else
1829 EMIT(result);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001830 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001831 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001832 return OK;
1833 }
1834 /*
1835 * Failed to recognize a character class. Use the simple
1836 * version that turns [abc] into 'a' OR 'b' OR 'c'
1837 */
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001838 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001839 negated = FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001840 if (*regparse == '^') // negated range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001841 {
1842 negated = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001843 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001844 EMIT(NFA_START_NEG_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001845 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001846 else
1847 EMIT(NFA_START_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001848 if (*regparse == '-')
1849 {
1850 startc = '-';
1851 EMIT(startc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001852 EMIT(NFA_CONCAT);
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001853 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001854 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001855 // Emit the OR branches for each character in the []
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001856 emit_range = FALSE;
1857 while (regparse < endp)
1858 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02001859 int oldstartc = startc;
1860
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001861 startc = -1;
1862 got_coll_char = FALSE;
1863 if (*regparse == '[')
1864 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001865 // Check for [: :], [= =], [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001866 equiclass = collclass = 0;
1867 charclass = get_char_class(&regparse);
1868 if (charclass == CLASS_NONE)
1869 {
1870 equiclass = get_equi_class(&regparse);
1871 if (equiclass == 0)
1872 collclass = get_coll_element(&regparse);
1873 }
1874
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001875 // Character class like [:alpha:]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001876 if (charclass != CLASS_NONE)
1877 {
1878 switch (charclass)
1879 {
1880 case CLASS_ALNUM:
1881 EMIT(NFA_CLASS_ALNUM);
1882 break;
1883 case CLASS_ALPHA:
1884 EMIT(NFA_CLASS_ALPHA);
1885 break;
1886 case CLASS_BLANK:
1887 EMIT(NFA_CLASS_BLANK);
1888 break;
1889 case CLASS_CNTRL:
1890 EMIT(NFA_CLASS_CNTRL);
1891 break;
1892 case CLASS_DIGIT:
1893 EMIT(NFA_CLASS_DIGIT);
1894 break;
1895 case CLASS_GRAPH:
1896 EMIT(NFA_CLASS_GRAPH);
1897 break;
1898 case CLASS_LOWER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001899 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001900 EMIT(NFA_CLASS_LOWER);
1901 break;
1902 case CLASS_PRINT:
1903 EMIT(NFA_CLASS_PRINT);
1904 break;
1905 case CLASS_PUNCT:
1906 EMIT(NFA_CLASS_PUNCT);
1907 break;
1908 case CLASS_SPACE:
1909 EMIT(NFA_CLASS_SPACE);
1910 break;
1911 case CLASS_UPPER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001912 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001913 EMIT(NFA_CLASS_UPPER);
1914 break;
1915 case CLASS_XDIGIT:
1916 EMIT(NFA_CLASS_XDIGIT);
1917 break;
1918 case CLASS_TAB:
1919 EMIT(NFA_CLASS_TAB);
1920 break;
1921 case CLASS_RETURN:
1922 EMIT(NFA_CLASS_RETURN);
1923 break;
1924 case CLASS_BACKSPACE:
1925 EMIT(NFA_CLASS_BACKSPACE);
1926 break;
1927 case CLASS_ESCAPE:
1928 EMIT(NFA_CLASS_ESCAPE);
1929 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001930 case CLASS_IDENT:
1931 EMIT(NFA_CLASS_IDENT);
1932 break;
1933 case CLASS_KEYWORD:
1934 EMIT(NFA_CLASS_KEYWORD);
1935 break;
1936 case CLASS_FNAME:
1937 EMIT(NFA_CLASS_FNAME);
1938 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001939 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001940 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001941 continue;
1942 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001943 // Try equivalence class [=a=] and the like
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001944 if (equiclass != 0)
1945 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02001946 result = nfa_emit_equi_class(equiclass);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001947 if (result == FAIL)
1948 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001949 // should never happen
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001950 EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!"));
1951 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001952 continue;
1953 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001954 // Try collating class like [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001955 if (collclass != 0)
1956 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001957 startc = collclass; // allow [.a.]-x as a range
1958 // Will emit the proper atom at the end of the
1959 // while loop.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001960 }
1961 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001962 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1963 // start character.
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001964 if (*regparse == '-' && oldstartc != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001965 {
1966 emit_range = TRUE;
1967 startc = oldstartc;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001968 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001969 continue; // reading the end of the range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001970 }
1971
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001972 // Now handle simple and escaped characters.
1973 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1974 // accepts "\t", "\e", etc., but only when the 'l' flag in
1975 // 'cpoptions' is not included.
1976 // Posix doesn't recognize backslash at all.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001977 if (*regparse == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001978 && !reg_cpo_bsl
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001979 && regparse + 1 <= endp
1980 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001981 || (!reg_cpo_lit
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001982 && vim_strchr(REGEXP_ABBR, regparse[1])
1983 != NULL)
1984 )
1985 )
1986 {
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001987 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001988
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001989 if (*regparse == 'n')
Bram Moolenaara5483442019-02-17 20:17:02 +01001990 startc = (reg_string || emit_range
1991 || regparse[1] == '-') ? NL : NFA_NEWL;
Bram Moolenaarabab0b02019-03-30 18:47:01 +01001992 else if (*regparse == 'd'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001993 || *regparse == 'o'
1994 || *regparse == 'x'
1995 || *regparse == 'u'
1996 || *regparse == 'U'
1997 )
1998 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001999 // TODO(RE) This needs more testing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002000 startc = coll_get_char();
2001 got_coll_char = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002002 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002003 }
2004 else
2005 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002006 // \r,\t,\e,\b
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002007 startc = backslash_trans(*regparse);
2008 }
2009 }
2010
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002011 // Normal printable char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002012 if (startc == -1)
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002013 startc = PTR2CHAR(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002014
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002015 // Previous char was '-', so this char is end of range.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002016 if (emit_range)
2017 {
=?UTF-8?q?Dundar=20G=C3=B6c?=09f688c2021-07-08 18:05:00 +02002018 int endc = startc;
2019
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002020 startc = oldstartc;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002021 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002022 EMSG_RET_FAIL(_(e_reverse_range_in_character_class));
Bram Moolenaar417bad22013-06-07 14:08:30 +02002023
2024 if (endc > startc + 2)
2025 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002026 // Emit a range instead of the sequence of
2027 // individual characters.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002028 if (startc == 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002029 // \x00 is translated to \x0a, start at \x01.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002030 EMIT(1);
2031 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002032 --post_ptr; // remove NFA_CONCAT
Bram Moolenaar417bad22013-06-07 14:08:30 +02002033 EMIT(endc);
2034 EMIT(NFA_RANGE);
2035 EMIT(NFA_CONCAT);
2036 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002037 else if (has_mbyte && ((*mb_char2len)(startc) > 1
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002038 || (*mb_char2len)(endc) > 1))
2039 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002040 // Emit the characters in the range.
2041 // "startc" was already emitted, so skip it.
2042 //
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002043 for (c = startc + 1; c <= endc; c++)
2044 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002045 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002046 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002047 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002048 }
2049 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002050 {
2051#ifdef EBCDIC
2052 int alpha_only = FALSE;
2053
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002054 // for alphabetical range skip the gaps
2055 // 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002056 if (isalpha(startc) && isalpha(endc))
2057 alpha_only = TRUE;
2058#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002059 // Emit the range. "startc" was already emitted, so
2060 // skip it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002061 for (c = startc + 1; c <= endc; c++)
2062#ifdef EBCDIC
2063 if (!alpha_only || isalpha(startc))
2064#endif
2065 {
2066 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002067 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002068 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002069 }
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002070 emit_range = FALSE;
2071 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002072 }
2073 else
2074 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002075 // This char (startc) is not part of a range. Just
2076 // emit it.
2077 // Normally, simply emit startc. But if we get char
2078 // code=0 from a collating char, then replace it with
2079 // 0x0a.
2080 // This is needed to completely mimic the behaviour of
2081 // the backtracking engine.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002082 if (startc == NFA_NEWL)
2083 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002084 // Line break can't be matched as part of the
2085 // collection, add an OR below. But not for negated
2086 // range.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002087 if (!negated)
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002088 extra = NFA_ADD_NL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002089 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002090 else
Bram Moolenaar417bad22013-06-07 14:08:30 +02002091 {
2092 if (got_coll_char == TRUE && startc == 0)
2093 EMIT(0x0a);
2094 else
2095 EMIT(startc);
2096 EMIT(NFA_CONCAT);
2097 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002098 }
2099
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002100 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002101 } // while (regparse < endp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002102
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002103 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002104 if (*regparse == '-') // if last, '-' is just a char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002105 {
2106 EMIT('-');
Bram Moolenaar417bad22013-06-07 14:08:30 +02002107 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002108 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002109
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002110 // skip the trailing ]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002111 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002112 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002113
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002114 // Mark end of the collection.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002115 if (negated == TRUE)
Bram Moolenaar417bad22013-06-07 14:08:30 +02002116 EMIT(NFA_END_NEG_COLL);
2117 else
2118 EMIT(NFA_END_COLL);
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002119
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002120 // \_[] also matches \n but it's not negated
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002121 if (extra == NFA_ADD_NL)
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002122 {
2123 EMIT(reg_string ? NL : NFA_NEWL);
2124 EMIT(NFA_OR);
2125 }
2126
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002127 return OK;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002128 } // if exists closing ]
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002129
2130 if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002131 EMSG_RET_FAIL(_(e_missing_rsb_after_str_lsb));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002132 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002133
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002134 default:
2135 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002136 int plen;
2137
2138nfa_do_multibyte:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002139 // plen is length of current char with composing chars
Bram Moolenaar47196582013-05-25 22:04:23 +02002140 if (enc_utf8 && ((*mb_char2len)(c)
Bram Moolenaarace95982017-03-29 17:30:27 +02002141 != (plen = utfc_ptr2len(old_regparse))
Bram Moolenaar47196582013-05-25 22:04:23 +02002142 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002143 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02002144 int i = 0;
2145
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002146 // A base character plus composing characters, or just one
2147 // or more composing characters.
2148 // This requires creating a separate atom as if enclosing
2149 // the characters in (), where NFA_COMPOSING is the ( and
2150 // NFA_END_COMPOSING is the ). Note that right now we are
2151 // building the postfix form, not the NFA itself;
2152 // a composing char could be: a, b, c, NFA_COMPOSING
2153 // where 'b' and 'c' are chars with codes > 256.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002154 for (;;)
2155 {
2156 EMIT(c);
2157 if (i > 0)
2158 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002159 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002160 break;
2161 c = utf_ptr2char(old_regparse + i);
2162 }
2163 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002164 regparse = old_regparse + plen;
2165 }
2166 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002167 {
2168 c = no_Magic(c);
2169 EMIT(c);
2170 }
2171 return OK;
2172 }
2173 }
2174
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002175 return OK;
2176}
2177
2178/*
2179 * Parse an atom, optionally followed by a multi: "*", "+", "=", "?", "@" or "{".
2180 *
2181 * A piece is an atom, possibly followed by a multi, an indication of how many
2182 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2183 * characters: "", "a", "aa", etc.
2184 *
2185 * piece ::= atom
2186 * or atom multi
2187 */
2188 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002189nfa_regpiece(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002190{
2191 int i;
2192 int op;
2193 int ret;
2194 long minval, maxval;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002195 int greedy = TRUE; // FALSE when the braces are prefixed with '-'
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002196 parse_state_T old_state;
2197 parse_state_T new_state;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01002198 long c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02002199 int old_post_pos;
2200 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002201 int quest;
2202
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002203 // Save the current parse state, so that we can use it if <atom>{m,n} is
2204 // next.
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002205 save_parse_state(&old_state);
2206
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002207 // store current pos in the postfix form, for \{m,n} involving 0s
Bram Moolenaar16299b52013-05-30 18:45:23 +02002208 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002209
2210 ret = nfa_regatom();
2211 if (ret == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002212 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002213
2214 op = peekchr();
2215 if (re_multi_type(op) == NOT_MULTI)
2216 return OK;
2217
2218 skipchr();
2219 switch (op)
2220 {
2221 case Magic('*'):
2222 EMIT(NFA_STAR);
2223 break;
2224
2225 case Magic('+'):
2226 /*
2227 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2228 * first and only submatch would be "aaa". But the backtracking
2229 * engine interprets the plus as "try matching one more time", and
2230 * a* matches a second time at the end of the input, the empty
2231 * string.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002232 * The submatch will be the empty string.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002233 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002234 * In order to be consistent with the old engine, we replace
2235 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002236 */
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002237 restore_parse_state(&old_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002238 curchr = -1;
2239 if (nfa_regatom() == FAIL)
2240 return FAIL;
2241 EMIT(NFA_STAR);
2242 EMIT(NFA_CONCAT);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002243 skipchr(); // skip the \+
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002244 break;
2245
2246 case Magic('@'):
Bram Moolenaar61602c52013-06-01 19:54:43 +02002247 c2 = getdecchrs();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002248 op = no_Magic(getchr());
Bram Moolenaar61602c52013-06-01 19:54:43 +02002249 i = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002250 switch(op)
2251 {
2252 case '=':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002253 // \@=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002254 i = NFA_PREV_ATOM_NO_WIDTH;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002255 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002256 case '!':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002257 // \@!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002258 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002259 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002260 case '<':
Bram Moolenaar61602c52013-06-01 19:54:43 +02002261 op = no_Magic(getchr());
2262 if (op == '=')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002263 // \@<=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002264 i = NFA_PREV_ATOM_JUST_BEFORE;
2265 else if (op == '!')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002266 // \@<!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002267 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2268 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002269 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002270 // \@>
Bram Moolenaar87953742013-06-05 18:52:40 +02002271 i = NFA_PREV_ATOM_LIKE_PATTERN;
2272 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002273 }
Bram Moolenaar61602c52013-06-01 19:54:43 +02002274 if (i == 0)
2275 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002276 semsg(_("E869: (NFA) Unknown operator '\\@%c'"), op);
Bram Moolenaar61602c52013-06-01 19:54:43 +02002277 return FAIL;
2278 }
2279 EMIT(i);
2280 if (i == NFA_PREV_ATOM_JUST_BEFORE
2281 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2282 EMIT(c2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002283 break;
2284
2285 case Magic('?'):
2286 case Magic('='):
2287 EMIT(NFA_QUEST);
2288 break;
2289
2290 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002291 // a{2,5} will expand to 'aaa?a?a?'
2292 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2293 // version of '?'
2294 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2295 // parenthesis have the same id
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002296
2297 greedy = TRUE;
2298 c2 = peekchr();
2299 if (c2 == '-' || c2 == Magic('-'))
2300 {
2301 skipchr();
2302 greedy = FALSE;
2303 }
2304 if (!read_limits(&minval, &maxval))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002305 EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002306
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002307 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2308 // <atom>*
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002309 if (minval == 0 && maxval == MAX_LIMIT)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002310 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002311 if (greedy) // { { (match the braces)
2312 // \{}, \{0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002313 EMIT(NFA_STAR);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002314 else // { { (match the braces)
2315 // \{-}, \{-0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002316 EMIT(NFA_STAR_NONGREEDY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002317 break;
2318 }
2319
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002320 // Special case: x{0} or x{-0}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002321 if (maxval == 0)
2322 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002323 // Ignore result of previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002324 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002325 // NFA_EMPTY is 0-length and works everywhere
Bram Moolenaar699c1202013-09-25 16:41:54 +02002326 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002327 return OK;
2328 }
2329
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002330 // The engine is very inefficient (uses too many states) when the
2331 // maximum is much larger than the minimum and when the maximum is
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002332 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2333 // will emit NFA_STAR.
2334 // Bail out if we can use the other engine, but only, when the
2335 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00002336 // does not work with characters > 8 bit with the BT engine)
Bram Moolenaara1d2c582015-02-10 18:18:17 +01002337 if ((nfa_re_flags & RE_AUTO)
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002338 && (maxval > 500 || maxval > minval + 200)
2339 && (maxval != MAX_LIMIT && minval < 200)
2340 && !wants_nfa)
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002341 return FAIL;
2342
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002343 // Ignore previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002344 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002345 // Save parse state after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002346 save_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002347
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002348 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2349 for (i = 0; i < maxval; i++)
2350 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002351 // Goto beginning of the repeated atom
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002352 restore_parse_state(&old_state);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002353 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002354 if (nfa_regatom() == FAIL)
2355 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002356 // after "minval" times, atoms are optional
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002357 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002358 {
2359 if (maxval == MAX_LIMIT)
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002360 {
2361 if (greedy)
2362 EMIT(NFA_STAR);
2363 else
2364 EMIT(NFA_STAR_NONGREEDY);
2365 }
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002366 else
2367 EMIT(quest);
2368 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02002369 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002370 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002371 if (i + 1 > minval && maxval == MAX_LIMIT)
2372 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002373 }
2374
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002375 // Go to just after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002376 restore_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002377 curchr = -1;
2378
2379 break;
2380
2381
2382 default:
2383 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002384 } // end switch
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002385
2386 if (re_multi_type(peekchr()) != NOT_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002387 // Can't have a multi follow a multi.
Bram Moolenaar3c867da2018-06-23 14:34:28 +02002388 EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi"));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002389
2390 return OK;
2391}
2392
2393/*
2394 * Parse one or more pieces, concatenated. It matches a match for the
2395 * first piece, followed by a match for the second piece, etc. Example:
2396 * "f[0-9]b", first matches "f", then a digit and then "b".
2397 *
2398 * concat ::= piece
2399 * or piece piece
2400 * or piece piece piece
2401 * etc.
2402 */
2403 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002404nfa_regconcat(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002405{
2406 int cont = TRUE;
2407 int first = TRUE;
2408
2409 while (cont)
2410 {
2411 switch (peekchr())
2412 {
2413 case NUL:
2414 case Magic('|'):
2415 case Magic('&'):
2416 case Magic(')'):
2417 cont = FALSE;
2418 break;
2419
2420 case Magic('Z'):
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002421 regflags |= RF_ICOMBINE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002422 skipchr_keepstart();
2423 break;
2424 case Magic('c'):
2425 regflags |= RF_ICASE;
2426 skipchr_keepstart();
2427 break;
2428 case Magic('C'):
2429 regflags |= RF_NOICASE;
2430 skipchr_keepstart();
2431 break;
2432 case Magic('v'):
2433 reg_magic = MAGIC_ALL;
2434 skipchr_keepstart();
2435 curchr = -1;
2436 break;
2437 case Magic('m'):
2438 reg_magic = MAGIC_ON;
2439 skipchr_keepstart();
2440 curchr = -1;
2441 break;
2442 case Magic('M'):
2443 reg_magic = MAGIC_OFF;
2444 skipchr_keepstart();
2445 curchr = -1;
2446 break;
2447 case Magic('V'):
2448 reg_magic = MAGIC_NONE;
2449 skipchr_keepstart();
2450 curchr = -1;
2451 break;
2452
2453 default:
2454 if (nfa_regpiece() == FAIL)
2455 return FAIL;
2456 if (first == FALSE)
2457 EMIT(NFA_CONCAT);
2458 else
2459 first = FALSE;
2460 break;
2461 }
2462 }
2463
2464 return OK;
2465}
2466
2467/*
2468 * Parse a branch, one or more concats, separated by "\&". It matches the
2469 * last concat, but only if all the preceding concats also match at the same
2470 * position. Examples:
2471 * "foobeep\&..." matches "foo" in "foobeep".
2472 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2473 *
2474 * branch ::= concat
2475 * or concat \& concat
2476 * or concat \& concat \& concat
2477 * etc.
2478 */
2479 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002480nfa_regbranch(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002481{
Bram Moolenaar16299b52013-05-30 18:45:23 +02002482 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002483
Bram Moolenaar16299b52013-05-30 18:45:23 +02002484 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002485
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002486 // First branch, possibly the only one
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002487 if (nfa_regconcat() == FAIL)
2488 return FAIL;
2489
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002490 // Try next concats
Bram Moolenaar890dd052017-12-16 19:59:37 +01002491 while (peekchr() == Magic('&'))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002492 {
2493 skipchr();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002494 // if concat is empty do emit a node
Bram Moolenaar890dd052017-12-16 19:59:37 +01002495 if (old_post_pos == (int)(post_ptr - post_start))
2496 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002497 EMIT(NFA_NOPEN);
2498 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002499 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002500 if (nfa_regconcat() == FAIL)
2501 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002502 // if concat is empty do emit a node
Bram Moolenaar16299b52013-05-30 18:45:23 +02002503 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002504 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002505 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002506 }
2507
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002508 // if a branch is empty, emit one node for it
Bram Moolenaar16299b52013-05-30 18:45:23 +02002509 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002510 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002511
2512 return OK;
2513}
2514
2515/*
2516 * Parse a pattern, one or more branches, separated by "\|". It matches
2517 * anything that matches one of the branches. Example: "foo\|beep" matches
2518 * "foo" and matches "beep". If more than one branch matches, the first one
2519 * is used.
2520 *
2521 * pattern ::= branch
2522 * or branch \| branch
2523 * or branch \| branch \| branch
2524 * etc.
2525 */
2526 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002527nfa_reg(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002528 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002529{
2530 int parno = 0;
2531
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002532 if (paren == REG_PAREN)
2533 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002534 if (regnpar >= NSUBEXP) // Too many `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002535 EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002536 parno = regnpar++;
2537 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002538#ifdef FEAT_SYN_HL
2539 else if (paren == REG_ZPAREN)
2540 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002541 // Make a ZOPEN node.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002542 if (regnzpar >= NSUBEXP)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002543 EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z("));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002544 parno = regnzpar++;
2545 }
2546#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002547
2548 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002549 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002550
2551 while (peekchr() == Magic('|'))
2552 {
2553 skipchr();
2554 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002555 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002556 EMIT(NFA_OR);
2557 }
2558
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002559 // Check for proper termination.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002560 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2561 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002562 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002563 EMSG2_RET_FAIL(_(e_unmatched_str_percent_open),
2564 reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002565 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002566 EMSG2_RET_FAIL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002567 }
2568 else if (paren == REG_NOPAREN && peekchr() != NUL)
2569 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002570 if (peekchr() == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002571 EMSG2_RET_FAIL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002572 else
2573 EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
2574 }
2575 /*
2576 * Here we set the flag allowing back references to this set of
2577 * parentheses.
2578 */
2579 if (paren == REG_PAREN)
2580 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002581 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002582 EMIT(NFA_MOPEN + parno);
2583 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002584#ifdef FEAT_SYN_HL
2585 else if (paren == REG_ZPAREN)
2586 EMIT(NFA_ZOPEN + parno);
2587#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002588
2589 return OK;
2590}
2591
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002592#ifdef DEBUG
2593static char_u code[50];
2594
2595 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002596nfa_set_code(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002597{
2598 int addnl = FALSE;
2599
2600 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2601 {
2602 addnl = TRUE;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002603 c -= NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002604 }
2605
2606 STRCPY(code, "");
2607 switch (c)
2608 {
2609 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2610 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2611 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2612 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2613 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2614 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2615
Bram Moolenaar5714b802013-05-28 22:03:20 +02002616 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2617 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2618 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2619 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2620 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2621 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2622 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2623 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2624 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002625#ifdef FEAT_SYN_HL
2626 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2627 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2628 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2629 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2630 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2631 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2632 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2633 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2634 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2635#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02002636 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2637
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002638 case NFA_PREV_ATOM_NO_WIDTH:
2639 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02002640 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2641 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002642 case NFA_PREV_ATOM_JUST_BEFORE:
2643 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2644 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2645 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002646 case NFA_PREV_ATOM_LIKE_PATTERN:
2647 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2648
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002649 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2650 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002651 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002652 case NFA_START_INVISIBLE_FIRST:
2653 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002654 case NFA_START_INVISIBLE_NEG:
2655 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002656 case NFA_START_INVISIBLE_NEG_FIRST:
2657 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002658 case NFA_START_INVISIBLE_BEFORE:
2659 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002660 case NFA_START_INVISIBLE_BEFORE_FIRST:
2661 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002662 case NFA_START_INVISIBLE_BEFORE_NEG:
2663 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002664 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2665 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002666 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002667 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002668 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002669 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002670
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002671 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2672 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02002673 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002674
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002675 case NFA_MOPEN:
2676 case NFA_MOPEN1:
2677 case NFA_MOPEN2:
2678 case NFA_MOPEN3:
2679 case NFA_MOPEN4:
2680 case NFA_MOPEN5:
2681 case NFA_MOPEN6:
2682 case NFA_MOPEN7:
2683 case NFA_MOPEN8:
2684 case NFA_MOPEN9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002685 STRCPY(code, "NFA_MOPEN(x)");
2686 code[10] = c - NFA_MOPEN + '0';
2687 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002688 case NFA_MCLOSE:
2689 case NFA_MCLOSE1:
2690 case NFA_MCLOSE2:
2691 case NFA_MCLOSE3:
2692 case NFA_MCLOSE4:
2693 case NFA_MCLOSE5:
2694 case NFA_MCLOSE6:
2695 case NFA_MCLOSE7:
2696 case NFA_MCLOSE8:
2697 case NFA_MCLOSE9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002698 STRCPY(code, "NFA_MCLOSE(x)");
2699 code[11] = c - NFA_MCLOSE + '0';
2700 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002701#ifdef FEAT_SYN_HL
2702 case NFA_ZOPEN:
2703 case NFA_ZOPEN1:
2704 case NFA_ZOPEN2:
2705 case NFA_ZOPEN3:
2706 case NFA_ZOPEN4:
2707 case NFA_ZOPEN5:
2708 case NFA_ZOPEN6:
2709 case NFA_ZOPEN7:
2710 case NFA_ZOPEN8:
2711 case NFA_ZOPEN9:
2712 STRCPY(code, "NFA_ZOPEN(x)");
2713 code[10] = c - NFA_ZOPEN + '0';
2714 break;
2715 case NFA_ZCLOSE:
2716 case NFA_ZCLOSE1:
2717 case NFA_ZCLOSE2:
2718 case NFA_ZCLOSE3:
2719 case NFA_ZCLOSE4:
2720 case NFA_ZCLOSE5:
2721 case NFA_ZCLOSE6:
2722 case NFA_ZCLOSE7:
2723 case NFA_ZCLOSE8:
2724 case NFA_ZCLOSE9:
2725 STRCPY(code, "NFA_ZCLOSE(x)");
2726 code[11] = c - NFA_ZCLOSE + '0';
2727 break;
2728#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002729 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2730 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2731 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2732 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02002733 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2734 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002735 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2736 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2737 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2738 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2739 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2740 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2741 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2742 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2743 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2744 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2745 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2746 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2747 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2748 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02002749 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002750
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002751 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002752 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2753 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2754 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
Bram Moolenaar699c1202013-09-25 16:41:54 +02002755 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002756 case NFA_OR: STRCPY(code, "NFA_OR"); break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002757
2758 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2759 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2760 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2761 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2762 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2763 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2764 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2765
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002766 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2767 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2768 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2769 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2770 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2771 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2772 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2773 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2774 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2775 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2776 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2777 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2778 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2779 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2780 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2781 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01002782 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2783 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2784 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002785
2786 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2787 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2788 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2789 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2790 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2791 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2792 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2793 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2794 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2795 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2796 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2797 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2798 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2799 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2800 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2801 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2802 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2803 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2804 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2805 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2806 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2807 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2808 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2809 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2810 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2811 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2812 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002813 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2814 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2815 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2816 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002817
2818 default:
2819 STRCPY(code, "CHAR(x)");
2820 code[5] = c;
2821 }
2822
2823 if (addnl == TRUE)
2824 STRCAT(code, " + NEWLINE ");
2825
2826}
2827
2828#ifdef ENABLE_LOG
2829static FILE *log_fd;
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002830static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002831
2832/*
2833 * Print the postfix notation of the current regexp.
2834 */
2835 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002836nfa_postfix_dump(char_u *expr, int retval)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002837{
2838 int *p;
2839 FILE *f;
2840
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002841 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002842 if (f != NULL)
2843 {
2844 fprintf(f, "\n-------------------------\n");
2845 if (retval == FAIL)
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002846 fprintf(f, ">>> NFA engine failed... \n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002847 else if (retval == OK)
2848 fprintf(f, ">>> NFA engine succeeded !\n");
2849 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002850 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002851 {
2852 nfa_set_code(*p);
2853 fprintf(f, "%s, ", code);
2854 }
2855 fprintf(f, "\"\nPostfix notation (int): ");
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002856 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002857 fprintf(f, "%d ", *p);
2858 fprintf(f, "\n\n");
2859 fclose(f);
2860 }
2861}
2862
2863/*
2864 * Print the NFA starting with a root node "state".
2865 */
2866 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002867nfa_print_state(FILE *debugf, nfa_state_T *state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002868{
Bram Moolenaar152e7892013-05-25 12:28:11 +02002869 garray_T indent;
2870
2871 ga_init2(&indent, 1, 64);
2872 ga_append(&indent, '\0');
2873 nfa_print_state2(debugf, state, &indent);
2874 ga_clear(&indent);
2875}
2876
2877 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002878nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
Bram Moolenaar152e7892013-05-25 12:28:11 +02002879{
2880 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002881
2882 if (state == NULL)
2883 return;
2884
2885 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02002886
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002887 // Output indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002888 p = (char_u *)indent->ga_data;
2889 if (indent->ga_len >= 3)
2890 {
2891 int last = indent->ga_len - 3;
2892 char_u save[2];
2893
2894 STRNCPY(save, &p[last], 2);
2895 STRNCPY(&p[last], "+-", 2);
2896 fprintf(debugf, " %s", p);
2897 STRNCPY(&p[last], save, 2);
2898 }
2899 else
2900 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002901
2902 nfa_set_code(state->c);
Bram Moolenaardecd9542013-06-07 16:31:50 +02002903 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
Bram Moolenaar417bad22013-06-07 14:08:30 +02002904 code,
2905 state->c,
2906 abs(state->id),
2907 state->val);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002908 if (state->id < 0)
2909 return;
2910
2911 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02002912
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002913 // grow indent for state->out
Bram Moolenaar152e7892013-05-25 12:28:11 +02002914 indent->ga_len -= 1;
2915 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002916 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002917 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002918 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002919 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002920
2921 nfa_print_state2(debugf, state->out, indent);
2922
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002923 // replace last part of indent for state->out1
Bram Moolenaar152e7892013-05-25 12:28:11 +02002924 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002925 ga_concat(indent, (char_u *)" ");
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002926 ga_append(indent, NUL);
Bram Moolenaar152e7892013-05-25 12:28:11 +02002927
2928 nfa_print_state2(debugf, state->out1, indent);
2929
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002930 // shrink indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002931 indent->ga_len -= 3;
Yegappan Lakshmananbc404bf2021-12-19 19:19:31 +00002932 ga_append(indent, NUL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002933}
2934
2935/*
2936 * Print the NFA state machine.
2937 */
2938 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002939nfa_dump(nfa_regprog_T *prog)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002940{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002941 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002942
2943 if (debugf != NULL)
2944 {
Bram Moolenaar152e7892013-05-25 12:28:11 +02002945 nfa_print_state(debugf, prog->start);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002946
Bram Moolenaar473de612013-06-08 18:19:48 +02002947 if (prog->reganch)
2948 fprintf(debugf, "reganch: %d\n", prog->reganch);
2949 if (prog->regstart != NUL)
2950 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2951 prog->regstart, prog->regstart);
2952 if (prog->match_text != NULL)
2953 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002954
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002955 fclose(debugf);
2956 }
2957}
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002958#endif // ENABLE_LOG
2959#endif // DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002960
2961/*
2962 * Parse r.e. @expr and convert it into postfix form.
2963 * Return the postfix string on success, NULL otherwise.
2964 */
2965 static int *
Bram Moolenaar05540972016-01-30 20:31:25 +01002966re2post(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002967{
2968 if (nfa_reg(REG_NOPAREN) == FAIL)
2969 return NULL;
2970 EMIT(NFA_MOPEN);
2971 return post_start;
2972}
2973
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002974// NB. Some of the code below is inspired by Russ's.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002975
/*
 * Represents an NFA state plus zero or one or two arrows exiting.
 * If c == NFA_MATCH, no arrows out; matching state.
 * If c == NFA_SPLIT, unlabeled arrows to out and out1 (if != NULL).
 * If c < 256, labeled arrow with character c to out.
 */
2982
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002983static nfa_state_T *state_ptr; // points to nfa_prog->state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002984
2985/*
2986 * Allocate and initialize nfa_state_T.
2987 */
2988 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002989alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002990{
2991 nfa_state_T *s;
2992
2993 if (istate >= nstate)
2994 return NULL;
2995
2996 s = &state_ptr[istate++];
2997
2998 s->c = c;
2999 s->out = out;
3000 s->out1 = out1;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003001 s->val = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003002
3003 s->id = istate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02003004 s->lastlist[0] = 0;
3005 s->lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003006
3007 return s;
3008}
3009
3010/*
3011 * A partially built NFA without the matching state filled in.
3012 * Frag_T.start points at the start state.
3013 * Frag_T.out is a list of places that need to be set to the
3014 * next state for this fragment.
3015 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003016
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003017// Since the out pointers in the list are always
3018// uninitialized, we use the pointers themselves
3019// as storage for the Ptrlists.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003020typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003021union Ptrlist
3022{
3023 Ptrlist *next;
3024 nfa_state_T *s;
3025};
3026
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003027struct Frag
3028{
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003029 nfa_state_T *start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003030 Ptrlist *out;
3031};
3032typedef struct Frag Frag_T;
3033
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003034/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02003035 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003036 */
3037 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003038frag(nfa_state_T *start, Ptrlist *out)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003039{
Bram Moolenaar053bb602013-05-20 13:55:21 +02003040 Frag_T n;
3041
3042 n.start = start;
3043 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003044 return n;
3045}
3046
3047/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003048 * Create singleton list containing just outp.
3049 */
3050 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003051list1(
3052 nfa_state_T **outp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003053{
3054 Ptrlist *l;
3055
3056 l = (Ptrlist *)outp;
3057 l->next = NULL;
3058 return l;
3059}
3060
3061/*
3062 * Patch the list of states at out to point to start.
3063 */
3064 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003065patch(Ptrlist *l, nfa_state_T *s)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003066{
3067 Ptrlist *next;
3068
3069 for (; l; l = next)
3070 {
3071 next = l->next;
3072 l->s = s;
3073 }
3074}
3075
3076
3077/*
3078 * Join the two lists l1 and l2, returning the combination.
3079 */
3080 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003081append(Ptrlist *l1, Ptrlist *l2)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003082{
3083 Ptrlist *oldl1;
3084
3085 oldl1 = l1;
3086 while (l1->next)
3087 l1 = l1->next;
3088 l1->next = l2;
3089 return oldl1;
3090}
3091
3092/*
3093 * Stack used for transforming postfix form into NFA.
3094 */
3095static Frag_T empty;
3096
3097 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003098st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003099{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003100#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003101 FILE *df;
3102 int *p2;
3103
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003104 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003105 if (df)
3106 {
3107 fprintf(df, "Error popping the stack!\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003108# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003109 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003110# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003111 fprintf(df, "Postfix form is: ");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003112# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003113 for (p2 = postfix; p2 < end; p2++)
3114 {
3115 nfa_set_code(*p2);
3116 fprintf(df, "%s, ", code);
3117 }
3118 nfa_set_code(*p);
3119 fprintf(df, "\nCurrent position is: ");
3120 for (p2 = postfix; p2 <= p; p2 ++)
3121 {
3122 nfa_set_code(*p2);
3123 fprintf(df, "%s, ", code);
3124 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02003125# else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003126 for (p2 = postfix; p2 < end; p2++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003127 fprintf(df, "%d, ", *p2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003128 fprintf(df, "\nCurrent position is: ");
3129 for (p2 = postfix; p2 <= p; p2 ++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003130 fprintf(df, "%d, ", *p2);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003131# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003132 fprintf(df, "\n--------------------------\n");
3133 fclose(df);
3134 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003135#endif
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01003136 emsg(_("E874: (NFA) Could not pop the stack!"));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003137}
3138
3139/*
3140 * Push an item onto the stack.
3141 */
3142 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003143st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003144{
3145 Frag_T *stackp = *p;
3146
3147 if (stackp >= stack_end)
3148 return;
3149 *stackp = s;
3150 *p = *p + 1;
3151}
3152
3153/*
3154 * Pop an item from the stack.
3155 */
3156 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003157st_pop(Frag_T **p, Frag_T *stack)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003158{
3159 Frag_T *stackp;
3160
3161 *p = *p - 1;
3162 stackp = *p;
3163 if (stackp < stack)
3164 return empty;
3165 return **p;
3166}
3167
3168/*
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003169 * Estimate the maximum byte length of anything matching "state".
3170 * When unknown or unlimited return -1.
3171 */
3172 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01003173nfa_max_width(nfa_state_T *startstate, int depth)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003174{
3175 int l, r;
3176 nfa_state_T *state = startstate;
3177 int len = 0;
3178
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003179 // detect looping in a NFA_SPLIT
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003180 if (depth > 4)
3181 return -1;
3182
Bram Moolenaarfe70acb2013-06-21 18:31:23 +02003183 while (state != NULL)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003184 {
3185 switch (state->c)
3186 {
3187 case NFA_END_INVISIBLE:
3188 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003189 // the end, return what we have
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003190 return len;
3191
3192 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003193 // two alternatives, use the maximum
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003194 l = nfa_max_width(state->out, depth + 1);
3195 r = nfa_max_width(state->out1, depth + 1);
3196 if (l < 0 || r < 0)
3197 return -1;
3198 return len + (l > r ? l : r);
3199
3200 case NFA_ANY:
3201 case NFA_START_COLL:
3202 case NFA_START_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003203 // matches some character, including composing chars
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003204 if (enc_utf8)
3205 len += MB_MAXBYTES;
3206 else if (has_mbyte)
3207 len += 2;
3208 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003209 ++len;
3210 if (state->c != NFA_ANY)
3211 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003212 // skip over the characters
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003213 state = state->out1->out;
3214 continue;
3215 }
3216 break;
3217
3218 case NFA_DIGIT:
3219 case NFA_WHITE:
3220 case NFA_HEX:
3221 case NFA_OCTAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003222 // ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003223 ++len;
3224 break;
3225
3226 case NFA_IDENT:
3227 case NFA_SIDENT:
3228 case NFA_KWORD:
3229 case NFA_SKWORD:
3230 case NFA_FNAME:
3231 case NFA_SFNAME:
3232 case NFA_PRINT:
3233 case NFA_SPRINT:
3234 case NFA_NWHITE:
3235 case NFA_NDIGIT:
3236 case NFA_NHEX:
3237 case NFA_NOCTAL:
3238 case NFA_WORD:
3239 case NFA_NWORD:
3240 case NFA_HEAD:
3241 case NFA_NHEAD:
3242 case NFA_ALPHA:
3243 case NFA_NALPHA:
3244 case NFA_LOWER:
3245 case NFA_NLOWER:
3246 case NFA_UPPER:
3247 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02003248 case NFA_LOWER_IC:
3249 case NFA_NLOWER_IC:
3250 case NFA_UPPER_IC:
3251 case NFA_NUPPER_IC:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02003252 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003253 // possibly non-ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003254 if (has_mbyte)
3255 len += 3;
3256 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003257 ++len;
3258 break;
3259
3260 case NFA_START_INVISIBLE:
3261 case NFA_START_INVISIBLE_NEG:
3262 case NFA_START_INVISIBLE_BEFORE:
3263 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003264 // zero-width, out1 points to the END state
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003265 state = state->out1->out;
3266 continue;
3267
3268 case NFA_BACKREF1:
3269 case NFA_BACKREF2:
3270 case NFA_BACKREF3:
3271 case NFA_BACKREF4:
3272 case NFA_BACKREF5:
3273 case NFA_BACKREF6:
3274 case NFA_BACKREF7:
3275 case NFA_BACKREF8:
3276 case NFA_BACKREF9:
3277#ifdef FEAT_SYN_HL
3278 case NFA_ZREF1:
3279 case NFA_ZREF2:
3280 case NFA_ZREF3:
3281 case NFA_ZREF4:
3282 case NFA_ZREF5:
3283 case NFA_ZREF6:
3284 case NFA_ZREF7:
3285 case NFA_ZREF8:
3286 case NFA_ZREF9:
3287#endif
3288 case NFA_NEWL:
3289 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003290 // unknown width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003291 return -1;
3292
3293 case NFA_BOL:
3294 case NFA_EOL:
3295 case NFA_BOF:
3296 case NFA_EOF:
3297 case NFA_BOW:
3298 case NFA_EOW:
3299 case NFA_MOPEN:
3300 case NFA_MOPEN1:
3301 case NFA_MOPEN2:
3302 case NFA_MOPEN3:
3303 case NFA_MOPEN4:
3304 case NFA_MOPEN5:
3305 case NFA_MOPEN6:
3306 case NFA_MOPEN7:
3307 case NFA_MOPEN8:
3308 case NFA_MOPEN9:
3309#ifdef FEAT_SYN_HL
3310 case NFA_ZOPEN:
3311 case NFA_ZOPEN1:
3312 case NFA_ZOPEN2:
3313 case NFA_ZOPEN3:
3314 case NFA_ZOPEN4:
3315 case NFA_ZOPEN5:
3316 case NFA_ZOPEN6:
3317 case NFA_ZOPEN7:
3318 case NFA_ZOPEN8:
3319 case NFA_ZOPEN9:
3320 case NFA_ZCLOSE:
3321 case NFA_ZCLOSE1:
3322 case NFA_ZCLOSE2:
3323 case NFA_ZCLOSE3:
3324 case NFA_ZCLOSE4:
3325 case NFA_ZCLOSE5:
3326 case NFA_ZCLOSE6:
3327 case NFA_ZCLOSE7:
3328 case NFA_ZCLOSE8:
3329 case NFA_ZCLOSE9:
3330#endif
3331 case NFA_MCLOSE:
3332 case NFA_MCLOSE1:
3333 case NFA_MCLOSE2:
3334 case NFA_MCLOSE3:
3335 case NFA_MCLOSE4:
3336 case NFA_MCLOSE5:
3337 case NFA_MCLOSE6:
3338 case NFA_MCLOSE7:
3339 case NFA_MCLOSE8:
3340 case NFA_MCLOSE9:
3341 case NFA_NOPEN:
3342 case NFA_NCLOSE:
3343
3344 case NFA_LNUM_GT:
3345 case NFA_LNUM_LT:
3346 case NFA_COL_GT:
3347 case NFA_COL_LT:
3348 case NFA_VCOL_GT:
3349 case NFA_VCOL_LT:
3350 case NFA_MARK_GT:
3351 case NFA_MARK_LT:
3352 case NFA_VISUAL:
3353 case NFA_LNUM:
3354 case NFA_CURSOR:
3355 case NFA_COL:
3356 case NFA_VCOL:
3357 case NFA_MARK:
3358
3359 case NFA_ZSTART:
3360 case NFA_ZEND:
3361 case NFA_OPT_CHARS:
Bram Moolenaar699c1202013-09-25 16:41:54 +02003362 case NFA_EMPTY:
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003363 case NFA_START_PATTERN:
3364 case NFA_END_PATTERN:
3365 case NFA_COMPOSING:
3366 case NFA_END_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003367 // zero-width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003368 break;
3369
3370 default:
3371 if (state->c < 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003372 // don't know what this is
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003373 return -1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003374 // normal character
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003375 len += MB_CHAR2LEN(state->c);
3376 break;
3377 }
3378
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003379 // normal way to continue
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003380 state = state->out;
3381 }
3382
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003383 // unrecognized, "cannot happen"
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003384 return -1;
3385}
Bram Moolenaar1e02e662013-06-08 23:26:27 +02003386
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003387/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003388 * Convert a postfix form into its equivalent NFA.
3389 * Return the NFA start state on success, NULL otherwise.
3390 */
3391 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01003392post2nfa(int *postfix, int *end, int nfa_calc_size)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003393{
3394 int *p;
3395 int mopen;
3396 int mclose;
3397 Frag_T *stack = NULL;
3398 Frag_T *stackp = NULL;
3399 Frag_T *stack_end = NULL;
3400 Frag_T e1;
3401 Frag_T e2;
3402 Frag_T e;
3403 nfa_state_T *s;
3404 nfa_state_T *s1;
3405 nfa_state_T *matchstate;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003406 nfa_state_T *ret = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003407
3408 if (postfix == NULL)
3409 return NULL;
3410
Bram Moolenaar053bb602013-05-20 13:55:21 +02003411#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003412#define POP() st_pop(&stackp, stack); \
3413 if (stackp < stack) \
3414 { \
3415 st_error(postfix, end, p); \
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003416 vim_free(stack); \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003417 return NULL; \
3418 }
3419
3420 if (nfa_calc_size == FALSE)
3421 {
Bram Moolenaar32aa1022019-11-02 22:54:41 +01003422 // Allocate space for the stack. Max states on the stack: "nstate".
Bram Moolenaarc799fe22019-05-28 23:08:19 +02003423 stack = ALLOC_MULT(Frag_T, nstate + 1);
Bram Moolenaarc57463c2018-12-26 22:04:41 +01003424 if (stack == NULL)
3425 return NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003426 stackp = stack;
Bram Moolenaare3c7b862013-05-20 21:57:03 +02003427 stack_end = stack + (nstate + 1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003428 }
3429
3430 for (p = postfix; p < end; ++p)
3431 {
3432 switch (*p)
3433 {
3434 case NFA_CONCAT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003435 // Concatenation.
3436 // Pay attention: this operator does not exist in the r.e. itself
3437 // (it is implicit, really). It is added when r.e. is translated
3438 // to postfix form in re2post().
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003439 if (nfa_calc_size == TRUE)
3440 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003441 // nstate += 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003442 break;
3443 }
3444 e2 = POP();
3445 e1 = POP();
3446 patch(e1.out, e2.start);
3447 PUSH(frag(e1.start, e2.out));
3448 break;
3449
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003450 case NFA_OR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003451 // Alternation
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003452 if (nfa_calc_size == TRUE)
3453 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003454 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003455 break;
3456 }
3457 e2 = POP();
3458 e1 = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003459 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003460 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003461 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003462 PUSH(frag(s, append(e1.out, e2.out)));
3463 break;
3464
3465 case NFA_STAR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003466 // Zero or more, prefer more
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003467 if (nfa_calc_size == TRUE)
3468 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003469 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003470 break;
3471 }
3472 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003473 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003474 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003475 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003476 patch(e.out, s);
3477 PUSH(frag(s, list1(&s->out1)));
3478 break;
3479
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003480 case NFA_STAR_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003481 // Zero or more, prefer zero
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003482 if (nfa_calc_size == TRUE)
3483 {
3484 nstate++;
3485 break;
3486 }
3487 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003488 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003489 if (s == NULL)
3490 goto theend;
3491 patch(e.out, s);
3492 PUSH(frag(s, list1(&s->out)));
3493 break;
3494
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003495 case NFA_QUEST:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003496 // one or zero atoms=> greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003497 if (nfa_calc_size == TRUE)
3498 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003499 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003500 break;
3501 }
3502 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003503 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003504 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003505 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003506 PUSH(frag(s, append(e.out, list1(&s->out1))));
3507 break;
3508
3509 case NFA_QUEST_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003510 // zero or one atoms => non-greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003511 if (nfa_calc_size == TRUE)
3512 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003513 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003514 break;
3515 }
3516 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003517 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003518 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003519 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003520 PUSH(frag(s, append(e.out, list1(&s->out))));
3521 break;
3522
Bram Moolenaar417bad22013-06-07 14:08:30 +02003523 case NFA_END_COLL:
3524 case NFA_END_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003525 // On the stack is the sequence starting with NFA_START_COLL or
3526 // NFA_START_NEG_COLL and all possible characters. Patch it to
3527 // add the output to the start.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003528 if (nfa_calc_size == TRUE)
3529 {
3530 nstate++;
3531 break;
3532 }
3533 e = POP();
3534 s = alloc_state(NFA_END_COLL, NULL, NULL);
3535 if (s == NULL)
3536 goto theend;
3537 patch(e.out, s);
3538 e.start->out1 = s;
3539 PUSH(frag(e.start, list1(&s->out)));
3540 break;
3541
3542 case NFA_RANGE:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003543 // Before this are two characters, the low and high end of a
3544 // range. Turn them into two states with MIN and MAX.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003545 if (nfa_calc_size == TRUE)
3546 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003547 // nstate += 0;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003548 break;
3549 }
3550 e2 = POP();
3551 e1 = POP();
3552 e2.start->val = e2.start->c;
3553 e2.start->c = NFA_RANGE_MAX;
3554 e1.start->val = e1.start->c;
3555 e1.start->c = NFA_RANGE_MIN;
3556 patch(e1.out, e2.start);
3557 PUSH(frag(e1.start, e2.out));
3558 break;
3559
Bram Moolenaar699c1202013-09-25 16:41:54 +02003560 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003561 // 0-length, used in a repetition with max/min count of 0
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003562 if (nfa_calc_size == TRUE)
3563 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003564 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003565 break;
3566 }
Bram Moolenaar699c1202013-09-25 16:41:54 +02003567 s = alloc_state(NFA_EMPTY, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003568 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003569 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003570 PUSH(frag(s, list1(&s->out)));
3571 break;
3572
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003573 case NFA_OPT_CHARS:
3574 {
3575 int n;
3576
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003577 // \%[abc] implemented as:
3578 // NFA_SPLIT
3579 // +-CHAR(a)
3580 // | +-NFA_SPLIT
3581 // | +-CHAR(b)
3582 // | | +-NFA_SPLIT
3583 // | | +-CHAR(c)
3584 // | | | +-next
3585 // | | +- next
3586 // | +- next
3587 // +- next
3588 n = *++p; // get number of characters
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003589 if (nfa_calc_size == TRUE)
3590 {
3591 nstate += n;
3592 break;
3593 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003594 s = NULL; // avoid compiler warning
3595 e1.out = NULL; // stores list with out1's
3596 s1 = NULL; // previous NFA_SPLIT to connect to
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003597 while (n-- > 0)
3598 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003599 e = POP(); // get character
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003600 s = alloc_state(NFA_SPLIT, e.start, NULL);
3601 if (s == NULL)
3602 goto theend;
3603 if (e1.out == NULL)
3604 e1 = e;
3605 patch(e.out, s1);
3606 append(e1.out, list1(&s->out1));
3607 s1 = s;
3608 }
3609 PUSH(frag(s, e1.out));
3610 break;
3611 }
3612
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003613 case NFA_PREV_ATOM_NO_WIDTH:
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003614 case NFA_PREV_ATOM_NO_WIDTH_NEG:
Bram Moolenaar61602c52013-06-01 19:54:43 +02003615 case NFA_PREV_ATOM_JUST_BEFORE:
3616 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02003617 case NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003618 {
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003619 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3620 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
Bram Moolenaar87953742013-06-05 18:52:40 +02003621 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
Bram Moolenaardecd9542013-06-07 16:31:50 +02003622 int start_state;
3623 int end_state;
Bram Moolenaar87953742013-06-05 18:52:40 +02003624 int n = 0;
3625 nfa_state_T *zend;
3626 nfa_state_T *skip;
3627
Bram Moolenaardecd9542013-06-07 16:31:50 +02003628 switch (*p)
Bram Moolenaar87953742013-06-05 18:52:40 +02003629 {
Bram Moolenaardecd9542013-06-07 16:31:50 +02003630 case NFA_PREV_ATOM_NO_WIDTH:
3631 start_state = NFA_START_INVISIBLE;
3632 end_state = NFA_END_INVISIBLE;
3633 break;
3634 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3635 start_state = NFA_START_INVISIBLE_NEG;
3636 end_state = NFA_END_INVISIBLE_NEG;
3637 break;
3638 case NFA_PREV_ATOM_JUST_BEFORE:
3639 start_state = NFA_START_INVISIBLE_BEFORE;
3640 end_state = NFA_END_INVISIBLE;
3641 break;
3642 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3643 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3644 end_state = NFA_END_INVISIBLE_NEG;
3645 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003646 default: // NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaardecd9542013-06-07 16:31:50 +02003647 start_state = NFA_START_PATTERN;
3648 end_state = NFA_END_PATTERN;
3649 break;
Bram Moolenaar87953742013-06-05 18:52:40 +02003650 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003651
3652 if (before)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003653 n = *++p; // get the count
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003654
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003655 // The \@= operator: match the preceding atom with zero width.
3656 // The \@! operator: no match for the preceding atom.
3657 // The \@<= operator: match for the preceding atom.
3658 // The \@<! operator: no match for the preceding atom.
3659 // Surrounds the preceding atom with START_INVISIBLE and
3660 // END_INVISIBLE, similarly to MOPEN.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003661
3662 if (nfa_calc_size == TRUE)
3663 {
Bram Moolenaar87953742013-06-05 18:52:40 +02003664 nstate += pattern ? 4 : 2;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003665 break;
3666 }
3667 e = POP();
Bram Moolenaar87953742013-06-05 18:52:40 +02003668 s1 = alloc_state(end_state, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003669 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003670 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003671
Bram Moolenaar87953742013-06-05 18:52:40 +02003672 s = alloc_state(start_state, e.start, s1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003673 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003674 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003675 if (pattern)
3676 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003677 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02003678 skip = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003679 if (skip == NULL)
3680 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003681 zend = alloc_state(NFA_ZEND, s1, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003682 if (zend == NULL)
3683 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003684 s1->out= skip;
3685 patch(e.out, zend);
3686 PUSH(frag(s, list1(&skip->out)));
Bram Moolenaar61602c52013-06-01 19:54:43 +02003687 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003688 else
3689 {
3690 patch(e.out, s1);
3691 PUSH(frag(s, list1(&s1->out)));
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003692 if (before)
3693 {
3694 if (n <= 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003695 // See if we can guess the maximum width, it avoids a
3696 // lot of pointless tries.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003697 n = nfa_max_width(e.start, 0);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003698 s->val = n; // store the count
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003699 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003700 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003701 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003702 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003703
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003704 case NFA_COMPOSING: // char with composing char
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003705#if 0
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003706 // TODO
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003707 if (regflags & RF_ICOMBINE)
3708 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003709 // use the base character only
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003710 }
3711#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003712 // FALLTHROUGH
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003713
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003714 case NFA_MOPEN: // \( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003715 case NFA_MOPEN1:
3716 case NFA_MOPEN2:
3717 case NFA_MOPEN3:
3718 case NFA_MOPEN4:
3719 case NFA_MOPEN5:
3720 case NFA_MOPEN6:
3721 case NFA_MOPEN7:
3722 case NFA_MOPEN8:
3723 case NFA_MOPEN9:
3724#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003725 case NFA_ZOPEN: // \z( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003726 case NFA_ZOPEN1:
3727 case NFA_ZOPEN2:
3728 case NFA_ZOPEN3:
3729 case NFA_ZOPEN4:
3730 case NFA_ZOPEN5:
3731 case NFA_ZOPEN6:
3732 case NFA_ZOPEN7:
3733 case NFA_ZOPEN8:
3734 case NFA_ZOPEN9:
3735#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003736 case NFA_NOPEN: // \%( \) "Invisible Submatch"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003737 if (nfa_calc_size == TRUE)
3738 {
3739 nstate += 2;
3740 break;
3741 }
3742
3743 mopen = *p;
3744 switch (*p)
3745 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003746 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3747#ifdef FEAT_SYN_HL
3748 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3749 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3750 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3751 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3752 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3753 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3754 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3755 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3756 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3757 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3758#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003759 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003760 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003761 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003762 mclose = *p + NSUBEXP;
3763 break;
3764 }
3765
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003766 // Allow "NFA_MOPEN" as a valid postfix representation for
3767 // the empty regexp "". In this case, the NFA will be
3768 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3769 // empty groups of parenthesis, and empty mbyte chars
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003770 if (stackp == stack)
3771 {
Bram Moolenaar525666f2013-06-02 16:40:55 +02003772 s = alloc_state(mopen, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003773 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003774 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003775 s1 = alloc_state(mclose, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003776 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003777 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003778 patch(list1(&s->out), s1);
3779 PUSH(frag(s, list1(&s1->out)));
3780 break;
3781 }
3782
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003783 // At least one node was emitted before NFA_MOPEN, so
3784 // at least one node will be between NFA_MOPEN and NFA_MCLOSE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003785 e = POP();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003786 s = alloc_state(mopen, e.start, NULL); // `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003787 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003788 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003789
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003790 s1 = alloc_state(mclose, NULL, NULL); // `)'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003791 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003792 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003793 patch(e.out, s1);
3794
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003795 if (mopen == NFA_COMPOSING)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003796 // COMPOSING->out1 = END_COMPOSING
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003797 patch(list1(&s->out1), s1);
3798
3799 PUSH(frag(s, list1(&s1->out)));
3800 break;
3801
Bram Moolenaar5714b802013-05-28 22:03:20 +02003802 case NFA_BACKREF1:
3803 case NFA_BACKREF2:
3804 case NFA_BACKREF3:
3805 case NFA_BACKREF4:
3806 case NFA_BACKREF5:
3807 case NFA_BACKREF6:
3808 case NFA_BACKREF7:
3809 case NFA_BACKREF8:
3810 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003811#ifdef FEAT_SYN_HL
3812 case NFA_ZREF1:
3813 case NFA_ZREF2:
3814 case NFA_ZREF3:
3815 case NFA_ZREF4:
3816 case NFA_ZREF5:
3817 case NFA_ZREF6:
3818 case NFA_ZREF7:
3819 case NFA_ZREF8:
3820 case NFA_ZREF9:
3821#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02003822 if (nfa_calc_size == TRUE)
3823 {
3824 nstate += 2;
3825 break;
3826 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003827 s = alloc_state(*p, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003828 if (s == NULL)
3829 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003830 s1 = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003831 if (s1 == NULL)
3832 goto theend;
3833 patch(list1(&s->out), s1);
3834 PUSH(frag(s, list1(&s1->out)));
3835 break;
3836
Bram Moolenaar423532e2013-05-29 21:14:42 +02003837 case NFA_LNUM:
3838 case NFA_LNUM_GT:
3839 case NFA_LNUM_LT:
3840 case NFA_VCOL:
3841 case NFA_VCOL_GT:
3842 case NFA_VCOL_LT:
3843 case NFA_COL:
3844 case NFA_COL_GT:
3845 case NFA_COL_LT:
Bram Moolenaar044aa292013-06-04 21:27:38 +02003846 case NFA_MARK:
3847 case NFA_MARK_GT:
3848 case NFA_MARK_LT:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003849 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003850 int n = *++p; // lnum, col or mark name
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003851
Bram Moolenaar423532e2013-05-29 21:14:42 +02003852 if (nfa_calc_size == TRUE)
3853 {
3854 nstate += 1;
3855 break;
3856 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003857 s = alloc_state(p[-1], NULL, NULL);
Bram Moolenaar423532e2013-05-29 21:14:42 +02003858 if (s == NULL)
3859 goto theend;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003860 s->val = n;
Bram Moolenaar423532e2013-05-29 21:14:42 +02003861 PUSH(frag(s, list1(&s->out)));
3862 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003863 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02003864
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003865 case NFA_ZSTART:
3866 case NFA_ZEND:
3867 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003868 // Operands
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003869 if (nfa_calc_size == TRUE)
3870 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003871 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003872 break;
3873 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003874 s = alloc_state(*p, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003875 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003876 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003877 PUSH(frag(s, list1(&s->out)));
3878 break;
3879
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003880 } // switch(*p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003881
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003882 } // for(p = postfix; *p; ++p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003883
3884 if (nfa_calc_size == TRUE)
3885 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003886 nstate++;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003887 goto theend; // Return value when counting size is ignored anyway
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003888 }
3889
3890 e = POP();
3891 if (stackp != stack)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003892 {
3893 vim_free(stack);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003894 EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack"));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003895 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003896
3897 if (istate >= nstate)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003898 {
3899 vim_free(stack);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003900 EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA "));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003901 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003902
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003903 matchstate = &state_ptr[istate++]; // the match state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003904 matchstate->c = NFA_MATCH;
3905 matchstate->out = matchstate->out1 = NULL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003906 matchstate->id = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003907
3908 patch(e.out, matchstate);
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003909 ret = e.start;
3910
3911theend:
3912 vim_free(stack);
3913 return ret;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003914
3915#undef POP1
3916#undef PUSH1
3917#undef POP2
3918#undef PUSH2
3919#undef POP
3920#undef PUSH
3921}
3922
Bram Moolenaara2947e22013-06-11 22:44:09 +02003923/*
3924 * After building the NFA program, inspect it to add optimization hints.
3925 */
3926 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003927nfa_postprocess(nfa_regprog_T *prog)
Bram Moolenaara2947e22013-06-11 22:44:09 +02003928{
3929 int i;
3930 int c;
3931
3932 for (i = 0; i < prog->nstate; ++i)
3933 {
3934 c = prog->state[i].c;
3935 if (c == NFA_START_INVISIBLE
3936 || c == NFA_START_INVISIBLE_NEG
3937 || c == NFA_START_INVISIBLE_BEFORE
3938 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3939 {
3940 int directly;
3941
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003942 // Do it directly when what follows is possibly the end of the
3943 // match.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003944 if (match_follows(prog->state[i].out1->out, 0))
3945 directly = TRUE;
3946 else
3947 {
3948 int ch_invisible = failure_chance(prog->state[i].out, 0);
3949 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3950
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003951 // Postpone when the invisible match is expensive or has a
3952 // lower chance of failing.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003953 if (c == NFA_START_INVISIBLE_BEFORE
3954 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3955 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003956 // "before" matches are very expensive when
3957 // unbounded, always prefer what follows then,
3958 // unless what follows will always match.
3959 // Otherwise strongly prefer what follows.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003960 if (prog->state[i].val <= 0 && ch_follows > 0)
3961 directly = FALSE;
3962 else
3963 directly = ch_follows * 10 < ch_invisible;
3964 }
3965 else
3966 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003967 // normal invisible, first do the one with the
3968 // highest failure chance
Bram Moolenaara2947e22013-06-11 22:44:09 +02003969 directly = ch_follows < ch_invisible;
3970 }
3971 }
3972 if (directly)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003973 // switch to the _FIRST state
Bram Moolenaara2947e22013-06-11 22:44:09 +02003974 ++prog->state[i].c;
3975 }
3976 }
3977}
3978
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003979/////////////////////////////////////////////////////////////////
3980// NFA execution code.
3981/////////////////////////////////////////////////////////////////
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003982
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003983typedef struct
3984{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003985 int in_use; // number of subexpr with useful info
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003986
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003987 // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003988 union
3989 {
3990 struct multipos
3991 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01003992 linenr_T start_lnum;
3993 linenr_T end_lnum;
3994 colnr_T start_col;
3995 colnr_T end_col;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003996 } multi[NSUBEXP];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003997 struct linepos
3998 {
3999 char_u *start;
4000 char_u *end;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004001 } line[NSUBEXP];
4002 } list;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004003} regsub_T;
4004
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004005typedef struct
4006{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004007 regsub_T norm; // \( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004008#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004009 regsub_T synt; // \z( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004010#endif
4011} regsubs_T;
4012
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004013// nfa_pim_T stores a Postponed Invisible Match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02004014typedef struct nfa_pim_S nfa_pim_T;
4015struct nfa_pim_S
4016{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004017 int result; // NFA_PIM_*, see below
4018 nfa_state_T *state; // the invisible match start state
4019 regsubs_T subs; // submatch info, only party used
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004020 union
4021 {
4022 lpos_T pos;
4023 char_u *ptr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004024 } end; // where the match must end
Bram Moolenaara2d95102013-06-04 14:23:05 +02004025};
4026
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004027// Values for done in nfa_pim_T.
4028#define NFA_PIM_UNUSED 0 // pim not used
4029#define NFA_PIM_TODO 1 // pim not done yet
4030#define NFA_PIM_MATCH 2 // pim executed, matches
4031#define NFA_PIM_NOMATCH 3 // pim executed, no match
Bram Moolenaara2d95102013-06-04 14:23:05 +02004032
4033
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004034// nfa_thread_T contains execution information of a NFA state
Bram Moolenaar4b417062013-05-25 20:19:50 +02004035typedef struct
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004036{
4037 nfa_state_T *state;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004038 int count;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004039 nfa_pim_T pim; // if pim.result != NFA_PIM_UNUSED: postponed
4040 // invisible match
4041 regsubs_T subs; // submatch info, only party used
Bram Moolenaar4b417062013-05-25 20:19:50 +02004042} nfa_thread_T;
4043
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004044// nfa_list_T contains the alternative NFA execution states.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004045typedef struct
4046{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004047 nfa_thread_T *t; // allocated array of states
4048 int n; // nr of states currently in "t"
4049 int len; // max nr of states in "t"
4050 int id; // ID of the list
4051 int has_pim; // TRUE when any state has a PIM
Bram Moolenaar4b417062013-05-25 20:19:50 +02004052} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004053
Bram Moolenaar5714b802013-05-28 22:03:20 +02004054#ifdef ENABLE_LOG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004055static void log_subexpr(regsub_T *sub);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004056
4057 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004058log_subsexpr(regsubs_T *subs)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004059{
4060 log_subexpr(&subs->norm);
4061# ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004062 if (rex.nfa_has_zsubexpr)
Bram Moolenaar6d3a5d72013-06-06 18:04:51 +02004063 log_subexpr(&subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004064# endif
4065}
4066
Bram Moolenaar5714b802013-05-28 22:03:20 +02004067 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004068log_subexpr(regsub_T *sub)
Bram Moolenaar5714b802013-05-28 22:03:20 +02004069{
4070 int j;
4071
4072 for (j = 0; j < sub->in_use; j++)
4073 if (REG_MULTI)
Bram Moolenaar87953742013-06-05 18:52:40 +02004074 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004075 j,
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004076 sub->list.multi[j].start_col,
4077 (int)sub->list.multi[j].start_lnum,
4078 sub->list.multi[j].end_col,
4079 (int)sub->list.multi[j].end_lnum);
Bram Moolenaar5714b802013-05-28 22:03:20 +02004080 else
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004081 {
4082 char *s = (char *)sub->list.line[j].start;
4083 char *e = (char *)sub->list.line[j].end;
4084
Bram Moolenaar87953742013-06-05 18:52:40 +02004085 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004086 j,
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004087 s == NULL ? "NULL" : s,
4088 e == NULL ? "NULL" : e);
4089 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004090}
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004091
4092 static char *
Bram Moolenaar05540972016-01-30 20:31:25 +01004093pim_info(nfa_pim_T *pim)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004094{
4095 static char buf[30];
4096
4097 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4098 buf[0] = NUL;
4099 else
4100 {
4101 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
Bram Moolenaar0270f382018-07-17 05:43:58 +02004102 : (int)(pim->end.ptr - rex.input));
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004103 }
4104 return buf;
4105}
4106
Bram Moolenaar5714b802013-05-28 22:03:20 +02004107#endif
4108
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004109// Used during execution: whether a match has been found.
Bram Moolenaar2338c322018-07-08 19:07:19 +02004110static int nfa_match;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004111#ifdef FEAT_RELTIME
4112static proftime_T *nfa_time_limit;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02004113static int *nfa_timed_out;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004114static int nfa_time_count;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004115#endif
Bram Moolenaar4b417062013-05-25 20:19:50 +02004116
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004117static void copy_sub(regsub_T *to, regsub_T *from);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004118static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004119
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004120/*
4121 * Copy postponed invisible match info from "from" to "to".
4122 */
4123 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004124copy_pim(nfa_pim_T *to, nfa_pim_T *from)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004125{
4126 to->result = from->result;
4127 to->state = from->state;
4128 copy_sub(&to->subs.norm, &from->subs.norm);
4129#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004130 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004131 copy_sub(&to->subs.synt, &from->subs.synt);
4132#endif
4133 to->end = from->end;
4134}
4135
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004136 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004137clear_sub(regsub_T *sub)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004138{
4139 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004140 // Use 0xff to set lnum to -1
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004141 vim_memset(sub->list.multi, 0xff,
Bram Moolenaar0270f382018-07-17 05:43:58 +02004142 sizeof(struct multipos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004143 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004144 vim_memset(sub->list.line, 0,
4145 sizeof(struct linepos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004146 sub->in_use = 0;
4147}
4148
4149/*
4150 * Copy the submatches from "from" to "to".
4151 */
4152 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004153copy_sub(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004154{
4155 to->in_use = from->in_use;
4156 if (from->in_use > 0)
4157 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004158 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004159 if (REG_MULTI)
4160 mch_memmove(&to->list.multi[0],
4161 &from->list.multi[0],
4162 sizeof(struct multipos) * from->in_use);
4163 else
4164 mch_memmove(&to->list.line[0],
4165 &from->list.line[0],
4166 sizeof(struct linepos) * from->in_use);
4167 }
4168}
4169
4170/*
4171 * Like copy_sub() but exclude the main match.
4172 */
4173 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004174copy_sub_off(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004175{
4176 if (to->in_use < from->in_use)
4177 to->in_use = from->in_use;
4178 if (from->in_use > 1)
4179 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004180 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004181 if (REG_MULTI)
4182 mch_memmove(&to->list.multi[1],
4183 &from->list.multi[1],
4184 sizeof(struct multipos) * (from->in_use - 1));
4185 else
4186 mch_memmove(&to->list.line[1],
4187 &from->list.line[1],
4188 sizeof(struct linepos) * (from->in_use - 1));
4189 }
4190}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004191
Bram Moolenaar428e9872013-05-30 17:05:39 +02004192/*
Bram Moolenaarf2118842013-09-25 18:16:38 +02004193 * Like copy_sub() but only do the end of the main match if \ze is present.
4194 */
4195 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004196copy_ze_off(regsub_T *to, regsub_T *from)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004197{
Bram Moolenaar0270f382018-07-17 05:43:58 +02004198 if (rex.nfa_has_zend)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004199 {
4200 if (REG_MULTI)
4201 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004202 if (from->list.multi[0].end_lnum >= 0)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004203 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004204 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4205 to->list.multi[0].end_col = from->list.multi[0].end_col;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004206 }
Bram Moolenaarf2118842013-09-25 18:16:38 +02004207 }
4208 else
4209 {
4210 if (from->list.line[0].end != NULL)
4211 to->list.line[0].end = from->list.line[0].end;
4212 }
4213 }
4214}
4215
4216/*
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004217 * Return TRUE if "sub1" and "sub2" have the same start positions.
Bram Moolenaaree482532014-05-13 15:56:51 +02004218 * When using back-references also check the end position.
Bram Moolenaar428e9872013-05-30 17:05:39 +02004219 */
4220 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004221sub_equal(regsub_T *sub1, regsub_T *sub2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004222{
4223 int i;
4224 int todo;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004225 linenr_T s1;
4226 linenr_T s2;
4227 char_u *sp1;
4228 char_u *sp2;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004229
4230 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4231 if (REG_MULTI)
4232 {
4233 for (i = 0; i < todo; ++i)
4234 {
4235 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004236 s1 = sub1->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004237 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004238 s1 = -1;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004239 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004240 s2 = sub2->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004241 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004242 s2 = -1;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004243 if (s1 != s2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004244 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004245 if (s1 != -1 && sub1->list.multi[i].start_col
4246 != sub2->list.multi[i].start_col)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004247 return FALSE;
Bram Moolenaaree482532014-05-13 15:56:51 +02004248
Bram Moolenaar0270f382018-07-17 05:43:58 +02004249 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004250 {
4251 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004252 s1 = sub1->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004253 else
4254 s1 = -1;
4255 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004256 s2 = sub2->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004257 else
4258 s2 = -1;
4259 if (s1 != s2)
4260 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004261 if (s1 != -1 && sub1->list.multi[i].end_col
4262 != sub2->list.multi[i].end_col)
Bram Moolenaaree482532014-05-13 15:56:51 +02004263 return FALSE;
4264 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004265 }
4266 }
4267 else
4268 {
4269 for (i = 0; i < todo; ++i)
4270 {
4271 if (i < sub1->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004272 sp1 = sub1->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004273 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004274 sp1 = NULL;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004275 if (i < sub2->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004276 sp2 = sub2->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004277 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004278 sp2 = NULL;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004279 if (sp1 != sp2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004280 return FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004281 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004282 {
4283 if (i < sub1->in_use)
4284 sp1 = sub1->list.line[i].end;
4285 else
4286 sp1 = NULL;
4287 if (i < sub2->in_use)
4288 sp2 = sub2->list.line[i].end;
4289 else
4290 sp2 = NULL;
4291 if (sp1 != sp2)
4292 return FALSE;
4293 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004294 }
4295 }
4296
4297 return TRUE;
4298}
4299
#ifdef ENABLE_LOG
/*
 * Write a line to the log file about doing "action" with "state" for the
 * list with ID "lid".  The start column of the main match in "sub" is
 * included, -1 when "sub" has nothing in use.  Info about "pim" is appended.
 */
    static void
report_state(char *action,
             regsub_T *sub,
             nfa_state_T *state,
             int lid,
             nfa_pim_T *pim)
{
    int         col = -1;

    if (sub->in_use > 0)
    {
        if (REG_MULTI)
            col = sub->list.multi[0].start_col;
        else
            col = (int)(sub->list.line[0].start - rex.line);
    }

    // Must come before the fprintf(), it uses "code".
    nfa_set_code(state->c);
    fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
            action, abs(state->id), lid, state->c, code, col,
            pim_info(pim));
}
#endif
4322
Bram Moolenaar43e02982013-06-07 17:31:29 +02004323/*
4324 * Return TRUE if the same state is already in list "l" with the same
4325 * positions as "subs".
4326 */
4327 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004328has_state_with_pos(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004329 nfa_list_T *l, // runtime state list
4330 nfa_state_T *state, // state to update
4331 regsubs_T *subs, // pointers to subexpressions
4332 nfa_pim_T *pim) // postponed match or NULL
Bram Moolenaar43e02982013-06-07 17:31:29 +02004333{
4334 nfa_thread_T *thread;
4335 int i;
4336
4337 for (i = 0; i < l->n; ++i)
4338 {
4339 thread = &l->t[i];
4340 if (thread->state->id == state->id
4341 && sub_equal(&thread->subs.norm, &subs->norm)
4342#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004343 && (!rex.nfa_has_zsubexpr
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004344 || sub_equal(&thread->subs.synt, &subs->synt))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004345#endif
Bram Moolenaar69b52452013-07-17 21:10:51 +02004346 && pim_equal(&thread->pim, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004347 return TRUE;
4348 }
4349 return FALSE;
4350}
4351
4352/*
Bram Moolenaar69b52452013-07-17 21:10:51 +02004353 * Return TRUE if "one" and "two" are equal. That includes when both are not
4354 * set.
4355 */
4356 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004357pim_equal(nfa_pim_T *one, nfa_pim_T *two)
Bram Moolenaar69b52452013-07-17 21:10:51 +02004358{
4359 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4360 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4361
4362 if (one_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004363 // one is unused: equal when two is also unused
Bram Moolenaar69b52452013-07-17 21:10:51 +02004364 return two_unused;
4365 if (two_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004366 // one is used and two is not: not equal
Bram Moolenaar69b52452013-07-17 21:10:51 +02004367 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004368 // compare the state id
Bram Moolenaar3f0df062013-08-14 13:34:25 +02004369 if (one->state->id != two->state->id)
4370 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004371 // compare the position
Bram Moolenaar69b52452013-07-17 21:10:51 +02004372 if (REG_MULTI)
4373 return one->end.pos.lnum == two->end.pos.lnum
4374 && one->end.pos.col == two->end.pos.col;
4375 return one->end.ptr == two->end.ptr;
4376}
4377
4378/*
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004379 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4380 */
4381 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004382match_follows(nfa_state_T *startstate, int depth)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004383{
4384 nfa_state_T *state = startstate;
4385
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004386 // avoid too much recursion
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004387 if (depth > 10)
4388 return FALSE;
4389
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004390 while (state != NULL)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004391 {
4392 switch (state->c)
4393 {
4394 case NFA_MATCH:
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004395 case NFA_MCLOSE:
4396 case NFA_END_INVISIBLE:
4397 case NFA_END_INVISIBLE_NEG:
4398 case NFA_END_PATTERN:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004399 return TRUE;
4400
4401 case NFA_SPLIT:
4402 return match_follows(state->out, depth + 1)
4403 || match_follows(state->out1, depth + 1);
4404
4405 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004406 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004407 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004408 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004409 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004410 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004411 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004412 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004413 case NFA_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004414 // skip ahead to next state
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004415 state = state->out1->out;
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004416 continue;
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004417
4418 case NFA_ANY:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02004419 case NFA_ANY_COMPOSING:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004420 case NFA_IDENT:
4421 case NFA_SIDENT:
4422 case NFA_KWORD:
4423 case NFA_SKWORD:
4424 case NFA_FNAME:
4425 case NFA_SFNAME:
4426 case NFA_PRINT:
4427 case NFA_SPRINT:
4428 case NFA_WHITE:
4429 case NFA_NWHITE:
4430 case NFA_DIGIT:
4431 case NFA_NDIGIT:
4432 case NFA_HEX:
4433 case NFA_NHEX:
4434 case NFA_OCTAL:
4435 case NFA_NOCTAL:
4436 case NFA_WORD:
4437 case NFA_NWORD:
4438 case NFA_HEAD:
4439 case NFA_NHEAD:
4440 case NFA_ALPHA:
4441 case NFA_NALPHA:
4442 case NFA_LOWER:
4443 case NFA_NLOWER:
4444 case NFA_UPPER:
4445 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02004446 case NFA_LOWER_IC:
4447 case NFA_NLOWER_IC:
4448 case NFA_UPPER_IC:
4449 case NFA_NUPPER_IC:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004450 case NFA_START_COLL:
4451 case NFA_START_NEG_COLL:
4452 case NFA_NEWL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004453 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004454 return FALSE;
4455
4456 default:
4457 if (state->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004458 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004459 return FALSE;
4460
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004461 // Others: zero-width or possibly zero-width, might still find
4462 // a match at the same position, keep looking.
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004463 break;
4464 }
4465 state = state->out;
4466 }
4467 return FALSE;
4468}
4469
4470
4471/*
Bram Moolenaar43e02982013-06-07 17:31:29 +02004472 * Return TRUE if "state" is already in list "l".
4473 */
4474 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004475state_in_list(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004476 nfa_list_T *l, // runtime state list
4477 nfa_state_T *state, // state to update
4478 regsubs_T *subs) // pointers to subexpressions
Bram Moolenaar43e02982013-06-07 17:31:29 +02004479{
4480 if (state->lastlist[nfa_ll_index] == l->id)
4481 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004482 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004483 return TRUE;
4484 }
4485 return FALSE;
4486}
4487
// Offset used for "off" by addstate_here().  When addstate() gets an offset
// of -ADDSTATE_HERE_OFFSET or less it sets "add_here" and uses
// -(offset + ADDSTATE_HERE_OFFSET) as the list index.
#define ADDSTATE_HERE_OFFSET 10
4490
Bram Moolenaard05bf562013-06-30 23:24:08 +02004491/*
 * Add "state" and possibly what follows to state list "l".
4493 * Returns "subs_arg", possibly copied into temp_subs.
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004494 * Returns NULL when recursiveness is too deep.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004495 */
Bram Moolenaard05bf562013-06-30 23:24:08 +02004496 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004497addstate(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004498 nfa_list_T *l, // runtime state list
4499 nfa_state_T *state, // state to update
4500 regsubs_T *subs_arg, // pointers to subexpressions
4501 nfa_pim_T *pim, // postponed look-behind match
4502 int off_arg) // byte offset, when -1 go to next line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004503{
Bram Moolenaar963fee22013-05-26 21:47:28 +02004504 int subidx;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004505 int off = off_arg;
4506 int add_here = FALSE;
4507 int listindex = 0;
4508 int k;
4509 int found = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004510 nfa_thread_T *thread;
Bram Moolenaard5638832016-09-09 17:59:50 +02004511 struct multipos save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004512 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004513 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004514 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004515 regsub_T *sub;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004516 regsubs_T *subs = subs_arg;
4517 static regsubs_T temp_subs;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004518#ifdef ENABLE_LOG
4519 int did_print = FALSE;
4520#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004521 static int depth = 0;
4522
4523 // This function is called recursively. When the depth is too much we run
4524 // out of stack and crash, limit recursiveness here.
Bram Moolenaar5382f122019-02-13 01:18:38 +01004525 if (++depth >= 5000 || subs == NULL)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004526 {
4527 --depth;
4528 return NULL;
4529 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004530
Bram Moolenaar16b35782016-09-09 20:29:50 +02004531 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4532 {
4533 add_here = TRUE;
4534 off = 0;
4535 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4536 }
4537
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004538 switch (state->c)
4539 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004540 case NFA_NCLOSE:
4541 case NFA_MCLOSE:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004542 case NFA_MCLOSE1:
4543 case NFA_MCLOSE2:
4544 case NFA_MCLOSE3:
4545 case NFA_MCLOSE4:
4546 case NFA_MCLOSE5:
4547 case NFA_MCLOSE6:
4548 case NFA_MCLOSE7:
4549 case NFA_MCLOSE8:
4550 case NFA_MCLOSE9:
4551#ifdef FEAT_SYN_HL
4552 case NFA_ZCLOSE:
4553 case NFA_ZCLOSE1:
4554 case NFA_ZCLOSE2:
4555 case NFA_ZCLOSE3:
4556 case NFA_ZCLOSE4:
4557 case NFA_ZCLOSE5:
4558 case NFA_ZCLOSE6:
4559 case NFA_ZCLOSE7:
4560 case NFA_ZCLOSE8:
4561 case NFA_ZCLOSE9:
4562#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004563 case NFA_MOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004564 case NFA_ZEND:
Bram Moolenaar927d4a12013-06-09 17:25:34 +02004565 case NFA_SPLIT:
Bram Moolenaar699c1202013-09-25 16:41:54 +02004566 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004567 // These nodes are not added themselves but their "out" and/or
4568 // "out1" may be added below.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004569 break;
4570
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004571 case NFA_BOL:
4572 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004573 // "^" won't match past end-of-line, don't bother trying.
4574 // Except when at the end of the line, or when we are going to the
4575 // next line for a look-behind match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004576 if (rex.input > rex.line
4577 && *rex.input != NUL
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004578 && (nfa_endp == NULL
4579 || !REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02004580 || rex.lnum == nfa_endp->se_u.pos.lnum))
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004581 goto skip_add;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004582 // FALLTHROUGH
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004583
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004584 case NFA_MOPEN1:
4585 case NFA_MOPEN2:
4586 case NFA_MOPEN3:
4587 case NFA_MOPEN4:
4588 case NFA_MOPEN5:
4589 case NFA_MOPEN6:
4590 case NFA_MOPEN7:
4591 case NFA_MOPEN8:
4592 case NFA_MOPEN9:
4593#ifdef FEAT_SYN_HL
4594 case NFA_ZOPEN:
4595 case NFA_ZOPEN1:
4596 case NFA_ZOPEN2:
4597 case NFA_ZOPEN3:
4598 case NFA_ZOPEN4:
4599 case NFA_ZOPEN5:
4600 case NFA_ZOPEN6:
4601 case NFA_ZOPEN7:
4602 case NFA_ZOPEN8:
4603 case NFA_ZOPEN9:
4604#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004605 case NFA_NOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004606 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004607 // These nodes need to be added so that we can bail out when it
4608 // was added to this list before at the same position to avoid an
4609 // endless loop for "\(\)*"
Bram Moolenaar307aa162013-06-02 16:34:21 +02004610
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004611 default:
Bram Moolenaar272fb582013-11-21 16:03:40 +01004612 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004613 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004614 // This state is already in the list, don't add it again,
4615 // unless it is an MOPEN that is used for a backreference or
4616 // when there is a PIM. For NFA_MATCH check the position,
4617 // lower position is preferred.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004618 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
Bram Moolenaar9c235062014-05-13 16:44:29 +02004619 && state->c != NFA_MATCH)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004620 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004621 // When called from addstate_here() do insert before
4622 // existing states.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004623 if (add_here)
4624 {
4625 for (k = 0; k < l->n && k < listindex; ++k)
4626 if (l->t[k].state->id == state->id)
4627 {
4628 found = TRUE;
4629 break;
4630 }
4631 }
4632 if (!add_here || found)
4633 {
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004634skip_add:
4635#ifdef ENABLE_LOG
Bram Moolenaar16b35782016-09-09 20:29:50 +02004636 nfa_set_code(state->c);
4637 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4638 abs(state->id), l->id, state->c, code,
4639 pim == NULL ? "NULL" : "yes", l->has_pim, found);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004640#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004641 --depth;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004642 return subs;
4643 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004644 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004645
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004646 // Do not add the state again when it exists with the same
4647 // positions.
Bram Moolenaar69b52452013-07-17 21:10:51 +02004648 if (has_state_with_pos(l, state, subs, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004649 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004650 }
4651
Bram Moolenaar688b3982019-02-13 21:47:36 +01004652 // When there are backreferences or PIMs the number of states may
4653 // be (a lot) bigger than anticipated.
Bram Moolenaara0169122013-06-26 18:16:58 +02004654 if (l->n == l->len)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004655 {
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004656 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004657 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004658 nfa_thread_T *newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004659
Bram Moolenaar688b3982019-02-13 21:47:36 +01004660 if ((long)(newsize >> 10) >= p_mmp)
4661 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004662 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01004663 --depth;
4664 return NULL;
4665 }
Bram Moolenaard05bf562013-06-30 23:24:08 +02004666 if (subs != &temp_subs)
4667 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004668 // "subs" may point into the current array, need to make a
4669 // copy before it becomes invalid.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004670 copy_sub(&temp_subs.norm, &subs->norm);
4671#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004672 if (rex.nfa_has_zsubexpr)
Bram Moolenaard05bf562013-06-30 23:24:08 +02004673 copy_sub(&temp_subs.synt, &subs->synt);
4674#endif
4675 subs = &temp_subs;
4676 }
4677
Bram Moolenaar688b3982019-02-13 21:47:36 +01004678 newt = vim_realloc(l->t, newsize);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004679 if (newt == NULL)
4680 {
4681 // out of memory
4682 --depth;
4683 return NULL;
4684 }
4685 l->t = newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004686 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004687 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004688
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004689 // add the state to the list
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02004690 state->lastlist[nfa_ll_index] = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004691 thread = &l->t[l->n++];
4692 thread->state = state;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004693 if (pim == NULL)
4694 thread->pim.result = NFA_PIM_UNUSED;
4695 else
Bram Moolenaar196ed142013-07-21 18:59:24 +02004696 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004697 copy_pim(&thread->pim, pim);
Bram Moolenaar196ed142013-07-21 18:59:24 +02004698 l->has_pim = TRUE;
4699 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004700 copy_sub(&thread->subs.norm, &subs->norm);
4701#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004702 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004703 copy_sub(&thread->subs.synt, &subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004704#endif
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004705#ifdef ENABLE_LOG
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004706 report_state("Adding", &thread->subs.norm, state, l->id, pim);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004707 did_print = TRUE;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004708#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004709 }
4710
4711#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004712 if (!did_print)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004713 report_state("Processing", &subs->norm, state, l->id, pim);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004714#endif
4715 switch (state->c)
4716 {
4717 case NFA_MATCH:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004718 break;
4719
4720 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004721 // order matters here
Bram Moolenaar16b35782016-09-09 20:29:50 +02004722 subs = addstate(l, state->out, subs, pim, off_arg);
4723 subs = addstate(l, state->out1, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004724 break;
4725
Bram Moolenaar699c1202013-09-25 16:41:54 +02004726 case NFA_EMPTY:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004727 case NFA_NOPEN:
4728 case NFA_NCLOSE:
Bram Moolenaar16b35782016-09-09 20:29:50 +02004729 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004730 break;
4731
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004732 case NFA_MOPEN:
4733 case NFA_MOPEN1:
4734 case NFA_MOPEN2:
4735 case NFA_MOPEN3:
4736 case NFA_MOPEN4:
4737 case NFA_MOPEN5:
4738 case NFA_MOPEN6:
4739 case NFA_MOPEN7:
4740 case NFA_MOPEN8:
4741 case NFA_MOPEN9:
4742#ifdef FEAT_SYN_HL
4743 case NFA_ZOPEN:
4744 case NFA_ZOPEN1:
4745 case NFA_ZOPEN2:
4746 case NFA_ZOPEN3:
4747 case NFA_ZOPEN4:
4748 case NFA_ZOPEN5:
4749 case NFA_ZOPEN6:
4750 case NFA_ZOPEN7:
4751 case NFA_ZOPEN8:
4752 case NFA_ZOPEN9:
4753#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004754 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004755 if (state->c == NFA_ZSTART)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004756 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004757 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004758 sub = &subs->norm;
4759 }
4760#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004761 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004762 {
4763 subidx = state->c - NFA_ZOPEN;
4764 sub = &subs->synt;
4765 }
4766#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004767 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004768 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004769 subidx = state->c - NFA_MOPEN;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004770 sub = &subs->norm;
4771 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004772
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004773 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004774 save_ptr = NULL;
Bram Moolenaara80faa82020-04-12 19:37:17 +02004775 CLEAR_FIELD(save_multipos);
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004776
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004777 // Set the position (with "off" added) in the subexpression. Save
4778 // and restore it when it was in use. Otherwise fill any gap.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004779 if (REG_MULTI)
4780 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004781 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004782 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004783 save_multipos = sub->list.multi[subidx];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004784 save_in_use = -1;
4785 }
4786 else
4787 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004788 save_in_use = sub->in_use;
4789 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004790 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004791 sub->list.multi[i].start_lnum = -1;
4792 sub->list.multi[i].end_lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004793 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004794 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004795 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02004796 if (off == -1)
4797 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004798 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004799 sub->list.multi[subidx].start_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004800 }
4801 else
4802 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004803 sub->list.multi[subidx].start_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004804 sub->list.multi[subidx].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004805 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004806 }
Bram Moolenaarc2b717e2015-09-29 15:06:14 +02004807 sub->list.multi[subidx].end_lnum = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004808 }
4809 else
4810 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004811 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004812 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004813 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004814 save_in_use = -1;
4815 }
4816 else
4817 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004818 save_in_use = sub->in_use;
4819 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004820 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004821 sub->list.line[i].start = NULL;
4822 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004823 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004824 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004825 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02004826 sub->list.line[subidx].start = rex.input + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004827 }
4828
Bram Moolenaar16b35782016-09-09 20:29:50 +02004829 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004830 if (subs == NULL)
4831 break;
4832 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004833#ifdef FEAT_SYN_HL
4834 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4835 sub = &subs->synt;
4836 else
4837#endif
4838 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004839
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004840 if (save_in_use == -1)
4841 {
4842 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004843 sub->list.multi[subidx] = save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004844 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004845 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004846 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004847 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02004848 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004849 break;
4850
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004851 case NFA_MCLOSE:
Bram Moolenaar0270f382018-07-17 05:43:58 +02004852 if (rex.nfa_has_zend && (REG_MULTI
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004853 ? subs->norm.list.multi[0].end_lnum >= 0
Bram Moolenaar9be44812013-09-05 21:15:44 +02004854 : subs->norm.list.line[0].end != NULL))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004855 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004856 // Do not overwrite the position set by \ze.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004857 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004858 break;
4859 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004860 // FALLTHROUGH
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004861 case NFA_MCLOSE1:
4862 case NFA_MCLOSE2:
4863 case NFA_MCLOSE3:
4864 case NFA_MCLOSE4:
4865 case NFA_MCLOSE5:
4866 case NFA_MCLOSE6:
4867 case NFA_MCLOSE7:
4868 case NFA_MCLOSE8:
4869 case NFA_MCLOSE9:
4870#ifdef FEAT_SYN_HL
4871 case NFA_ZCLOSE:
4872 case NFA_ZCLOSE1:
4873 case NFA_ZCLOSE2:
4874 case NFA_ZCLOSE3:
4875 case NFA_ZCLOSE4:
4876 case NFA_ZCLOSE5:
4877 case NFA_ZCLOSE6:
4878 case NFA_ZCLOSE7:
4879 case NFA_ZCLOSE8:
4880 case NFA_ZCLOSE9:
4881#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004882 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004883 if (state->c == NFA_ZEND)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004884 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004885 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004886 sub = &subs->norm;
4887 }
4888#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004889 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004890 {
4891 subidx = state->c - NFA_ZCLOSE;
4892 sub = &subs->synt;
4893 }
4894#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004895 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004896 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004897 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004898 sub = &subs->norm;
4899 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004900
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004901 // We don't fill in gaps here, there must have been an MOPEN that
4902 // has done that.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004903 save_in_use = sub->in_use;
4904 if (sub->in_use <= subidx)
4905 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004906 if (REG_MULTI)
4907 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004908 save_multipos = sub->list.multi[subidx];
Bram Moolenaar35b23862013-05-22 23:00:40 +02004909 if (off == -1)
4910 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004911 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004912 sub->list.multi[subidx].end_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004913 }
4914 else
4915 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004916 sub->list.multi[subidx].end_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004917 sub->list.multi[subidx].end_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004918 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004919 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004920 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004921 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004922 }
4923 else
4924 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004925 save_ptr = sub->list.line[subidx].end;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004926 sub->list.line[subidx].end = rex.input + off;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004927 // avoid compiler warnings
Bram Moolenaara80faa82020-04-12 19:37:17 +02004928 CLEAR_FIELD(save_multipos);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004929 }
4930
Bram Moolenaar16b35782016-09-09 20:29:50 +02004931 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004932 if (subs == NULL)
4933 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004934 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004935#ifdef FEAT_SYN_HL
4936 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4937 sub = &subs->synt;
4938 else
4939#endif
4940 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004941
4942 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004943 sub->list.multi[subidx] = save_multipos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004944 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004945 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004946 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004947 break;
4948 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004949 --depth;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004950 return subs;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004951}
4952
4953/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02004954 * Like addstate(), but the new state(s) are put at position "*ip".
4955 * Used for zero-width matches, next state to use is the added one.
4956 * This makes sure the order of states to be tried does not change, which
4957 * matters for alternatives.
4958 */
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004959 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004960addstate_here(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004961 nfa_list_T *l, // runtime state list
4962 nfa_state_T *state, // state to update
4963 regsubs_T *subs, // pointers to subexpressions
4964 nfa_pim_T *pim, // postponed look-behind match
Bram Moolenaar05540972016-01-30 20:31:25 +01004965 int *ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004966{
4967 int tlen = l->n;
4968 int count;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004969 int listidx = *ip;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004970 regsubs_T *r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004971
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004972 // First add the state(s) at the end, so that we know how many there are.
4973 // Pass the listidx as offset (avoids adding another argument to
Dominique Pelleaf4a61a2021-12-27 17:21:41 +00004974 // addstate()).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004975 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4976 if (r == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004977 return NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004978
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004979 // when "*ip" was at the end of the list, nothing to do
Bram Moolenaara2d95102013-06-04 14:23:05 +02004980 if (listidx + 1 == tlen)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004981 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004982
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004983 // re-order to put the new state at the current position
Bram Moolenaar4b417062013-05-25 20:19:50 +02004984 count = l->n - tlen;
Bram Moolenaara50d02d2013-06-16 15:43:50 +02004985 if (count == 0)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004986 return r; // no state got added
Bram Moolenaar428e9872013-05-30 17:05:39 +02004987 if (count == 1)
4988 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004989 // overwrite the current state
Bram Moolenaara2d95102013-06-04 14:23:05 +02004990 l->t[listidx] = l->t[l->n - 1];
Bram Moolenaar428e9872013-05-30 17:05:39 +02004991 }
4992 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004993 {
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004994 if (l->n + count - 1 >= l->len)
4995 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004996 // not enough space to move the new states, reallocate the list
4997 // and move the states to the right position
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004998 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004999 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005000 nfa_thread_T *newl;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005001
Bram Moolenaar688b3982019-02-13 21:47:36 +01005002 if ((long)(newsize >> 10) >= p_mmp)
5003 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00005004 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar688b3982019-02-13 21:47:36 +01005005 return NULL;
5006 }
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005007 newl = alloc(newsize);
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005008 if (newl == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01005009 return NULL;
5010 l->len = newlen;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005011 mch_memmove(&(newl[0]),
5012 &(l->t[0]),
5013 sizeof(nfa_thread_T) * listidx);
5014 mch_memmove(&(newl[listidx]),
5015 &(l->t[l->n - count]),
5016 sizeof(nfa_thread_T) * count);
5017 mch_memmove(&(newl[listidx + count]),
5018 &(l->t[listidx + 1]),
5019 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
5020 vim_free(l->t);
5021 l->t = newl;
5022 }
5023 else
5024 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005025 // make space for new states, then move them from the
5026 // end to the current position
Bram Moolenaar55480dc2013-06-30 13:17:24 +02005027 mch_memmove(&(l->t[listidx + count]),
5028 &(l->t[listidx + 1]),
5029 sizeof(nfa_thread_T) * (l->n - listidx - 1));
5030 mch_memmove(&(l->t[listidx]),
5031 &(l->t[l->n - 1]),
5032 sizeof(nfa_thread_T) * count);
5033 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005034 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005035 --l->n;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005036 *ip = listidx - 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005037
5038 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005039}
5040
5041/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005042 * Check character class "class" against current character c.
5043 */
5044 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005045check_char_class(int class, int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005046{
5047 switch (class)
5048 {
5049 case NFA_CLASS_ALNUM:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005050 if (c >= 1 && c < 128 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005051 return OK;
5052 break;
5053 case NFA_CLASS_ALPHA:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005054 if (c >= 1 && c < 128 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005055 return OK;
5056 break;
5057 case NFA_CLASS_BLANK:
5058 if (c == ' ' || c == '\t')
5059 return OK;
5060 break;
5061 case NFA_CLASS_CNTRL:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005062 if (c >= 1 && c <= 127 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005063 return OK;
5064 break;
5065 case NFA_CLASS_DIGIT:
5066 if (VIM_ISDIGIT(c))
5067 return OK;
5068 break;
5069 case NFA_CLASS_GRAPH:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005070 if (c >= 1 && c <= 127 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005071 return OK;
5072 break;
5073 case NFA_CLASS_LOWER:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005074 if (MB_ISLOWER(c) && c != 170 && c != 186)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005075 return OK;
5076 break;
5077 case NFA_CLASS_PRINT:
5078 if (vim_isprintc(c))
5079 return OK;
5080 break;
5081 case NFA_CLASS_PUNCT:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005082 if (c >= 1 && c < 128 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005083 return OK;
5084 break;
5085 case NFA_CLASS_SPACE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005086 if ((c >= 9 && c <= 13) || (c == ' '))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005087 return OK;
5088 break;
5089 case NFA_CLASS_UPPER:
5090 if (MB_ISUPPER(c))
5091 return OK;
5092 break;
5093 case NFA_CLASS_XDIGIT:
5094 if (vim_isxdigit(c))
5095 return OK;
5096 break;
5097 case NFA_CLASS_TAB:
5098 if (c == '\t')
5099 return OK;
5100 break;
5101 case NFA_CLASS_RETURN:
5102 if (c == '\r')
5103 return OK;
5104 break;
5105 case NFA_CLASS_BACKSPACE:
5106 if (c == '\b')
5107 return OK;
5108 break;
5109 case NFA_CLASS_ESCAPE:
5110 if (c == '\033')
5111 return OK;
5112 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01005113 case NFA_CLASS_IDENT:
5114 if (vim_isIDc(c))
5115 return OK;
5116 break;
5117 case NFA_CLASS_KEYWORD:
5118 if (reg_iswordc(c))
5119 return OK;
5120 break;
5121 case NFA_CLASS_FNAME:
5122 if (vim_isfilec(c))
5123 return OK;
5124 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005125
5126 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005127 // should not be here :P
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005128 siemsg(_(e_ill_char_class), class);
Bram Moolenaar417bad22013-06-07 14:08:30 +02005129 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005130 }
5131 return FAIL;
5132}
5133
Bram Moolenaar5714b802013-05-28 22:03:20 +02005134/*
5135 * Check for a match with subexpression "subidx".
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005136 * Return TRUE if it matches.
Bram Moolenaar5714b802013-05-28 22:03:20 +02005137 */
5138 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005139match_backref(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005140 regsub_T *sub, // pointers to subexpressions
Bram Moolenaar05540972016-01-30 20:31:25 +01005141 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005142 int *bytelen) // out: length of match in bytes
Bram Moolenaar5714b802013-05-28 22:03:20 +02005143{
5144 int len;
5145
5146 if (sub->in_use <= subidx)
5147 {
5148retempty:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005149 // backref was not set, match an empty string
Bram Moolenaar5714b802013-05-28 22:03:20 +02005150 *bytelen = 0;
5151 return TRUE;
5152 }
5153
5154 if (REG_MULTI)
5155 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005156 if (sub->list.multi[subidx].start_lnum < 0
5157 || sub->list.multi[subidx].end_lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005158 goto retempty;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005159 if (sub->list.multi[subidx].start_lnum == rex.lnum
5160 && sub->list.multi[subidx].end_lnum == rex.lnum)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005161 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005162 len = sub->list.multi[subidx].end_col
5163 - sub->list.multi[subidx].start_col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005164 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5165 rex.input, &len) == 0)
Bram Moolenaar580abea2013-06-14 20:31:28 +02005166 {
5167 *bytelen = len;
5168 return TRUE;
5169 }
5170 }
5171 else
5172 {
5173 if (match_with_backref(
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005174 sub->list.multi[subidx].start_lnum,
5175 sub->list.multi[subidx].start_col,
5176 sub->list.multi[subidx].end_lnum,
5177 sub->list.multi[subidx].end_col,
Bram Moolenaar580abea2013-06-14 20:31:28 +02005178 bytelen) == RA_MATCH)
5179 return TRUE;
Bram Moolenaar5714b802013-05-28 22:03:20 +02005180 }
5181 }
5182 else
5183 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005184 if (sub->list.line[subidx].start == NULL
5185 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005186 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005187 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005188 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005189 {
5190 *bytelen = len;
5191 return TRUE;
5192 }
5193 }
5194 return FALSE;
5195}
5196
#ifdef FEAT_SYN_HL

/*
 * Check for a match with \z subexpression "subidx".
 * Return TRUE if it matches.
 * When the \z subexpression was not set it matches the empty string:
 * "bytelen" is set to zero and TRUE is returned.
 */
    static int
match_zref(
    int subidx,
    int *bytelen)   // out: length of match in bytes
{
    int	    len = 0;

    cleanup_zsubexpr();
    if (re_extmatch_in != NULL && re_extmatch_in->matches[subidx] != NULL)
    {
	// The external match is set, its text must be found at the input.
	len = (int)STRLEN(re_extmatch_in->matches[subidx]);
	if (cstrncmp(re_extmatch_in->matches[subidx], rex.input, &len) != 0)
	    return FALSE;
    }

    // Either the text matched, or the backref was not set and "len" is
    // still zero.
    *bytelen = len;
    return TRUE;
}
#endif
5227
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005228/*
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005229 * Save list IDs for all NFA states of "prog" into "list".
5230 * Also reset the IDs to zero.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005231 * Only used for the recursive value lastlist[1].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005232 */
5233 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005234nfa_save_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005235{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005236 int i;
5237 nfa_state_T *p;
5238
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005239 // Order in the list is reverse, it's a bit faster that way.
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005240 p = &prog->state[0];
5241 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005242 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005243 list[i] = p->lastlist[1];
5244 p->lastlist[1] = 0;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005245 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005246 }
5247}
5248
5249/*
5250 * Restore list IDs from "list" to all NFA states.
5251 */
5252 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005253nfa_restore_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005254{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005255 int i;
5256 nfa_state_T *p;
5257
5258 p = &prog->state[0];
5259 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005260 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005261 p->lastlist[1] = list[i];
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005262 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005263 }
5264}
5265
Bram Moolenaar423532e2013-05-29 21:14:42 +02005266 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005267nfa_re_num_cmp(long_u val, int op, long_u pos)
Bram Moolenaar423532e2013-05-29 21:14:42 +02005268{
5269 if (op == 1) return pos > val;
5270 if (op == 2) return pos < val;
5271 return val == pos;
5272}
5273
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01005274static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02005275
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005276/*
Bram Moolenaarf46da702013-06-02 22:37:42 +02005277 * Recursively call nfa_regmatch()
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005278 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5279 * position).
Bram Moolenaarf46da702013-06-02 22:37:42 +02005280 */
5281 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005282recursive_regmatch(
5283 nfa_state_T *state,
5284 nfa_pim_T *pim,
5285 nfa_regprog_T *prog,
5286 regsubs_T *submatch,
5287 regsubs_T *m,
Bram Moolenaar2338c322018-07-08 19:07:19 +02005288 int **listids,
5289 int *listids_len)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005290{
Bram Moolenaar0270f382018-07-17 05:43:58 +02005291 int save_reginput_col = (int)(rex.input - rex.line);
5292 int save_reglnum = rex.lnum;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005293 int save_nfa_match = nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005294 int save_nfa_listid = rex.nfa_listid;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005295 save_se_T *save_nfa_endp = nfa_endp;
5296 save_se_T endpos;
5297 save_se_T *endposp = NULL;
5298 int result;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005299 int need_restore = FALSE;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005300
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005301 if (pim != NULL)
5302 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005303 // start at the position where the postponed match was
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005304 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005305 rex.input = rex.line + pim->end.pos.col;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005306 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005307 rex.input = pim->end.ptr;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005308 }
5309
Bram Moolenaardecd9542013-06-07 16:31:50 +02005310 if (state->c == NFA_START_INVISIBLE_BEFORE
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01005311 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5312 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5313 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005314 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005315 // The recursive match must end at the current position. When "pim" is
5316 // not NULL it specifies the current position.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005317 endposp = &endpos;
5318 if (REG_MULTI)
5319 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005320 if (pim == NULL)
5321 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005322 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5323 endpos.se_u.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005324 }
5325 else
5326 endpos.se_u.pos = pim->end.pos;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005327 }
5328 else
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005329 {
5330 if (pim == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005331 endpos.se_u.ptr = rex.input;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005332 else
5333 endpos.se_u.ptr = pim->end.ptr;
5334 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005335
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005336 // Go back the specified number of bytes, or as far as the
5337 // start of the previous line, to try matching "\@<=" or
5338 // not matching "\@<!". This is very inefficient, limit the number of
5339 // bytes if possible.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005340 if (state->val <= 0)
5341 {
5342 if (REG_MULTI)
5343 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005344 rex.line = reg_getline(--rex.lnum);
5345 if (rex.line == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005346 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005347 rex.line = reg_getline(++rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005348 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005349 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005350 }
5351 else
5352 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005353 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005354 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005355 // Not enough bytes in this line, go to end of
5356 // previous line.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005357 rex.line = reg_getline(--rex.lnum);
5358 if (rex.line == NULL)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005359 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005360 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005361 rex.line = reg_getline(++rex.lnum);
5362 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005363 }
5364 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005365 rex.input = rex.line + STRLEN(rex.line);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005366 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005367 if ((int)(rex.input - rex.line) >= state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005368 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005369 rex.input -= state->val;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005370 if (has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005371 rex.input -= mb_head_off(rex.line, rex.input);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005372 }
5373 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005374 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005375 }
5376 }
5377
Bram Moolenaarf46da702013-06-02 22:37:42 +02005378#ifdef ENABLE_LOG
5379 if (log_fd != stderr)
5380 fclose(log_fd);
5381 log_fd = NULL;
5382#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005383 // Have to clear the lastlist field of the NFA nodes, so that
5384 // nfa_regmatch() and addstate() can run properly after recursion.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005385 if (nfa_ll_index == 1)
5386 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005387 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5388 // values and clear them.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005389 if (*listids == NULL || *listids_len < prog->nstate)
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005390 {
Bram Moolenaar2338c322018-07-08 19:07:19 +02005391 vim_free(*listids);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005392 *listids = ALLOC_MULT(int, prog->nstate);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005393 if (*listids == NULL)
5394 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005395 emsg(_("E878: (NFA) Could not allocate memory for branch traversal!"));
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005396 return 0;
5397 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005398 *listids_len = prog->nstate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005399 }
5400 nfa_save_listids(prog, *listids);
5401 need_restore = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005402 // any value of rex.nfa_listid will do
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005403 }
5404 else
5405 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005406 // First recursive nfa_regmatch() call, switch to the second lastlist
5407 // entry. Make sure rex.nfa_listid is different from a previous
5408 // recursive call, because some states may still have this ID.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005409 ++nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005410 if (rex.nfa_listid <= rex.nfa_alt_listid)
5411 rex.nfa_listid = rex.nfa_alt_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005412 }
5413
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005414 // Call nfa_regmatch() to check if the current concat matches at this
5415 // position. The concat ends with the node NFA_END_INVISIBLE
Bram Moolenaarf46da702013-06-02 22:37:42 +02005416 nfa_endp = endposp;
5417 result = nfa_regmatch(prog, state->out, submatch, m);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005418
5419 if (need_restore)
5420 nfa_restore_listids(prog, *listids);
5421 else
5422 {
5423 --nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005424 rex.nfa_alt_listid = rex.nfa_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005425 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005426
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005427 // restore position in input text
Bram Moolenaar0270f382018-07-17 05:43:58 +02005428 rex.lnum = save_reglnum;
Bram Moolenaar484d2412013-06-13 19:47:07 +02005429 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005430 rex.line = reg_getline(rex.lnum);
5431 rex.input = rex.line + save_reginput_col;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005432 if (result != NFA_TOO_EXPENSIVE)
5433 {
5434 nfa_match = save_nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005435 rex.nfa_listid = save_nfa_listid;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005436 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005437 nfa_endp = save_nfa_endp;
5438
5439#ifdef ENABLE_LOG
5440 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
5441 if (log_fd != NULL)
5442 {
5443 fprintf(log_fd, "****************************\n");
5444 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
5445 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
5446 fprintf(log_fd, "****************************\n");
5447 }
5448 else
5449 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005450 emsg(_(e_log_open_failed));
Bram Moolenaarf46da702013-06-02 22:37:42 +02005451 log_fd = stderr;
5452 }
5453#endif
5454
5455 return result;
5456}
5457
Bram Moolenaara2d95102013-06-04 14:23:05 +02005458/*
5459 * Estimate the chance of a match with "state" failing.
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005460 * empty match: 0
Bram Moolenaara2d95102013-06-04 14:23:05 +02005461 * NFA_ANY: 1
5462 * specific character: 99
5463 */
5464 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005465failure_chance(nfa_state_T *state, int depth)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005466{
5467 int c = state->c;
5468 int l, r;
5469
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005470 // detect looping
Bram Moolenaara2d95102013-06-04 14:23:05 +02005471 if (depth > 4)
5472 return 1;
5473
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005474 switch (c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005475 {
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005476 case NFA_SPLIT:
5477 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005478 // avoid recursive stuff
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005479 return 1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005480 // two alternatives, use the lowest failure chance
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005481 l = failure_chance(state->out, depth + 1);
5482 r = failure_chance(state->out1, depth + 1);
5483 return l < r ? l : r;
5484
5485 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005486 // matches anything, unlikely to fail
Bram Moolenaara2d95102013-06-04 14:23:05 +02005487 return 1;
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005488
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005489 case NFA_MATCH:
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005490 case NFA_MCLOSE:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005491 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005492 // empty match works always
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005493 return 0;
5494
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005495 case NFA_START_INVISIBLE:
5496 case NFA_START_INVISIBLE_FIRST:
5497 case NFA_START_INVISIBLE_NEG:
5498 case NFA_START_INVISIBLE_NEG_FIRST:
5499 case NFA_START_INVISIBLE_BEFORE:
5500 case NFA_START_INVISIBLE_BEFORE_FIRST:
5501 case NFA_START_INVISIBLE_BEFORE_NEG:
5502 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5503 case NFA_START_PATTERN:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005504 // recursive regmatch is expensive, use low failure chance
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005505 return 5;
5506
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005507 case NFA_BOL:
5508 case NFA_EOL:
5509 case NFA_BOF:
5510 case NFA_EOF:
5511 case NFA_NEWL:
5512 return 99;
5513
5514 case NFA_BOW:
5515 case NFA_EOW:
5516 return 90;
5517
5518 case NFA_MOPEN:
5519 case NFA_MOPEN1:
5520 case NFA_MOPEN2:
5521 case NFA_MOPEN3:
5522 case NFA_MOPEN4:
5523 case NFA_MOPEN5:
5524 case NFA_MOPEN6:
5525 case NFA_MOPEN7:
5526 case NFA_MOPEN8:
5527 case NFA_MOPEN9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005528#ifdef FEAT_SYN_HL
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005529 case NFA_ZOPEN:
5530 case NFA_ZOPEN1:
5531 case NFA_ZOPEN2:
5532 case NFA_ZOPEN3:
5533 case NFA_ZOPEN4:
5534 case NFA_ZOPEN5:
5535 case NFA_ZOPEN6:
5536 case NFA_ZOPEN7:
5537 case NFA_ZOPEN8:
5538 case NFA_ZOPEN9:
5539 case NFA_ZCLOSE:
5540 case NFA_ZCLOSE1:
5541 case NFA_ZCLOSE2:
5542 case NFA_ZCLOSE3:
5543 case NFA_ZCLOSE4:
5544 case NFA_ZCLOSE5:
5545 case NFA_ZCLOSE6:
5546 case NFA_ZCLOSE7:
5547 case NFA_ZCLOSE8:
5548 case NFA_ZCLOSE9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005549#endif
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005550 case NFA_NOPEN:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005551 case NFA_MCLOSE1:
5552 case NFA_MCLOSE2:
5553 case NFA_MCLOSE3:
5554 case NFA_MCLOSE4:
5555 case NFA_MCLOSE5:
5556 case NFA_MCLOSE6:
5557 case NFA_MCLOSE7:
5558 case NFA_MCLOSE8:
5559 case NFA_MCLOSE9:
5560 case NFA_NCLOSE:
5561 return failure_chance(state->out, depth + 1);
5562
5563 case NFA_BACKREF1:
5564 case NFA_BACKREF2:
5565 case NFA_BACKREF3:
5566 case NFA_BACKREF4:
5567 case NFA_BACKREF5:
5568 case NFA_BACKREF6:
5569 case NFA_BACKREF7:
5570 case NFA_BACKREF8:
5571 case NFA_BACKREF9:
5572#ifdef FEAT_SYN_HL
5573 case NFA_ZREF1:
5574 case NFA_ZREF2:
5575 case NFA_ZREF3:
5576 case NFA_ZREF4:
5577 case NFA_ZREF5:
5578 case NFA_ZREF6:
5579 case NFA_ZREF7:
5580 case NFA_ZREF8:
5581 case NFA_ZREF9:
5582#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005583 // backreferences don't match in many places
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005584 return 94;
5585
5586 case NFA_LNUM_GT:
5587 case NFA_LNUM_LT:
5588 case NFA_COL_GT:
5589 case NFA_COL_LT:
5590 case NFA_VCOL_GT:
5591 case NFA_VCOL_LT:
5592 case NFA_MARK_GT:
5593 case NFA_MARK_LT:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005594 case NFA_VISUAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005595 // before/after positions don't match very often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005596 return 85;
5597
5598 case NFA_LNUM:
5599 return 90;
5600
5601 case NFA_CURSOR:
5602 case NFA_COL:
5603 case NFA_VCOL:
5604 case NFA_MARK:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005605 // specific positions rarely match
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005606 return 98;
5607
5608 case NFA_COMPOSING:
5609 return 95;
5610
5611 default:
5612 if (c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005613 // character match fails often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005614 return 95;
5615 }
5616
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005617 // something else, includes character classes
Bram Moolenaara2d95102013-06-04 14:23:05 +02005618 return 50;
5619}
5620
Bram Moolenaarf46da702013-06-02 22:37:42 +02005621/*
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005622 * Skip until the char "c" we know a match must start with.
5623 */
5624 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005625skip_to_start(int c, colnr_T *colp)
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005626{
5627 char_u *s;
5628
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005629 // Used often, do some work to avoid call overhead.
Bram Moolenaara12a1612019-01-24 16:39:02 +01005630 if (!rex.reg_ic && !has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005631 s = vim_strbyte(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005632 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005633 s = cstrchr(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005634 if (s == NULL)
5635 return FAIL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005636 *colp = (int)(s - rex.line);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005637 return OK;
5638}
5639
5640/*
Bram Moolenaar473de612013-06-08 18:19:48 +02005641 * Check for a match with match_text.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02005642 * Called after skip_to_start() has found regstart.
Bram Moolenaar473de612013-06-08 18:19:48 +02005643 * Returns zero for no match, 1 for a match.
5644 */
5645 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01005646find_match_text(colnr_T startcol, int regstart, char_u *match_text)
Bram Moolenaar473de612013-06-08 18:19:48 +02005647{
5648 colnr_T col = startcol;
5649 int c1, c2;
5650 int len1, len2;
5651 int match;
5652
5653 for (;;)
5654 {
5655 match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005656 len2 = MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005657 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5658 {
5659 c1 = PTR2CHAR(match_text + len1);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005660 c2 = PTR2CHAR(rex.line + col + len2);
Bram Moolenaar59de4172020-06-09 19:34:54 +02005661 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
Bram Moolenaar473de612013-06-08 18:19:48 +02005662 {
5663 match = FALSE;
5664 break;
5665 }
Bram Moolenaar65b60562021-09-07 19:26:53 +02005666 len2 += enc_utf8 ? utf_ptr2len(rex.line + col + len2)
5667 : MB_CHAR2LEN(c2);
Bram Moolenaar473de612013-06-08 18:19:48 +02005668 }
5669 if (match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005670 // check that no composing char follows
Bram Moolenaar473de612013-06-08 18:19:48 +02005671 && !(enc_utf8
Bram Moolenaara12a1612019-01-24 16:39:02 +01005672 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
Bram Moolenaar473de612013-06-08 18:19:48 +02005673 {
5674 cleanup_subexpr();
5675 if (REG_MULTI)
5676 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005677 rex.reg_startpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005678 rex.reg_startpos[0].col = col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005679 rex.reg_endpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005680 rex.reg_endpos[0].col = col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005681 }
5682 else
5683 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005684 rex.reg_startp[0] = rex.line + col;
5685 rex.reg_endp[0] = rex.line + col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005686 }
5687 return 1L;
5688 }
5689
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005690 // Try finding regstart after the current match.
5691 col += MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005692 if (skip_to_start(regstart, &col) == FAIL)
5693 break;
5694 }
5695 return 0L;
5696}
5697
#ifdef FEAT_RELTIME
/*
 * Check whether the time limit for matching was passed.
 * Returns TRUE when "nfa_time_limit" is set and profile_passed_limit()
 * reports it as passed; in that case "*nfa_timed_out" is also set to TRUE,
 * unless "nfa_timed_out" is NULL.
 * Returns FALSE otherwise.
 */
    static int
nfa_did_time_out(void)
{
    if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit))
    {
	if (nfa_timed_out != NULL)
	    *nfa_timed_out = TRUE;
	return TRUE;
    }
    return FALSE;
}
#endif
5711
Bram Moolenaar473de612013-06-08 18:19:48 +02005712/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005713 * Main matching routine.
5714 *
Bram Moolenaar0270f382018-07-17 05:43:58 +02005715 * Run NFA to determine whether it matches rex.input.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005716 *
Bram Moolenaar307aa162013-06-02 16:34:21 +02005717 * When "nfa_endp" is not NULL it is a required end-of-match position.
Bram Moolenaar61602c52013-06-01 19:54:43 +02005718 *
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005719 * Return TRUE if there is a match, FALSE if there is no match,
5720 * NFA_TOO_EXPENSIVE if we end up with too many states.
Bram Moolenaarf2118842013-09-25 18:16:38 +02005721 * When there is a match "submatch" contains the positions.
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005722 *
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005723 * Note: Caller must ensure that: start != NULL.
5724 */
5725 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005726nfa_regmatch(
5727 nfa_regprog_T *prog,
5728 nfa_state_T *start,
5729 regsubs_T *submatch,
5730 regsubs_T *m)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005731{
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005732 int result = FALSE;
Bram Moolenaaraaf30472015-01-27 14:40:00 +01005733 size_t size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005734 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005735 int go_to_nextline = FALSE;
5736 nfa_thread_T *t;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005737 nfa_list_T list[2];
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005738 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005739 nfa_list_T *thislist;
5740 nfa_list_T *nextlist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005741 int *listids = NULL;
Bram Moolenaar2338c322018-07-08 19:07:19 +02005742 int listids_len = 0;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005743 nfa_state_T *add_state;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005744 int add_here;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005745 int add_count;
Bram Moolenaar4380d1e2013-06-09 20:51:00 +02005746 int add_off = 0;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005747 int toplevel = start->c == NFA_MOPEN;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005748 regsubs_T *r;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005749#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005750 FILE *debug;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005751#endif
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005752
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005753 // Some patterns may take a long time to match, especially when using
5754 // recursive_regmatch(). Allow interrupting them with CTRL-C.
Bram Moolenaar41f12052013-08-25 17:01:42 +02005755 fast_breakcheck();
5756 if (got_int)
5757 return FALSE;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005758#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005759 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005760 return FALSE;
5761#endif
Bram Moolenaar41f12052013-08-25 17:01:42 +02005762
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005763#ifdef NFA_REGEXP_DEBUG_LOG
5764 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5765 if (debug == NULL)
5766 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005767 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005768 return FALSE;
5769 }
5770#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02005771 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005772
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005773 // Allocate memory for the lists of nodes.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005774 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005775
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005776 list[0].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005777 list[0].len = prog->nstate + 1;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005778 list[1].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005779 list[1].len = prog->nstate + 1;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005780 if (list[0].t == NULL || list[1].t == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005781 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005782
5783#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02005784 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005785 if (log_fd != NULL)
5786 {
5787 fprintf(log_fd, "**********************************\n");
5788 nfa_set_code(start->c);
5789 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5790 abs(start->id), code);
5791 fprintf(log_fd, "**********************************\n");
5792 }
5793 else
5794 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005795 emsg(_(e_log_open_failed));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005796 log_fd = stderr;
5797 }
5798#endif
5799
5800 thislist = &list[0];
5801 thislist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005802 thislist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005803 nextlist = &list[1];
5804 nextlist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005805 nextlist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005806#ifdef ENABLE_LOG
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005807 fprintf(log_fd, "(---) STARTSTATE first\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005808#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02005809 thislist->id = rex.nfa_listid + 1;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005810
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005811 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5812 // it's the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005813 if (toplevel)
5814 {
5815 if (REG_MULTI)
5816 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005817 m->norm.list.multi[0].start_lnum = rex.lnum;
5818 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005819 }
5820 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005821 m->norm.list.line[0].start = rex.input;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005822 m->norm.in_use = 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005823 r = addstate(thislist, start->out, m, NULL, 0);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005824 }
5825 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005826 r = addstate(thislist, start, m, NULL, 0);
5827 if (r == NULL)
5828 {
5829 nfa_match = NFA_TOO_EXPENSIVE;
5830 goto theend;
5831 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005832
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005833#define ADD_STATE_IF_MATCH(state) \
5834 if (result) { \
Bram Moolenaara2d95102013-06-04 14:23:05 +02005835 add_state = state->out; \
5836 add_off = clen; \
5837 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005838
5839 /*
5840 * Run for each character.
5841 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02005842 for (;;)
5843 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005844 int curc;
5845 int clen;
5846
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005847 if (has_mbyte)
5848 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005849 curc = (*mb_ptr2char)(rex.input);
5850 clen = (*mb_ptr2len)(rex.input);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005851 }
5852 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005853 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005854 curc = *rex.input;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005855 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005856 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005857 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02005858 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005859 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005860 go_to_nextline = FALSE;
5861 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005862
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005863 // swap lists
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005864 thislist = &list[flag];
5865 nextlist = &list[flag ^= 1];
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005866 nextlist->n = 0; // clear nextlist
Bram Moolenaar196ed142013-07-21 18:59:24 +02005867 nextlist->has_pim = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005868 ++rex.nfa_listid;
Bram Moolenaarbcf94422018-06-23 14:21:42 +02005869 if (prog->re_engine == AUTOMATIC_ENGINE
Bram Moolenaar0270f382018-07-17 05:43:58 +02005870 && (rex.nfa_listid >= NFA_MAX_STATES
Bram Moolenaar5ec74142018-06-23 17:14:41 +02005871# ifdef FEAT_EVAL
5872 || nfa_fail_for_testing
5873# endif
5874 ))
Bram Moolenaarfda37292014-11-05 14:27:36 +01005875 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005876 // too many states, retry with old engine
Bram Moolenaarfda37292014-11-05 14:27:36 +01005877 nfa_match = NFA_TOO_EXPENSIVE;
5878 goto theend;
5879 }
5880
Bram Moolenaar0270f382018-07-17 05:43:58 +02005881 thislist->id = rex.nfa_listid;
5882 nextlist->id = rex.nfa_listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005883
5884#ifdef ENABLE_LOG
5885 fprintf(log_fd, "------------------------------------------\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02005886 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005887 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005888 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005889 {
5890 int i;
5891
5892 for (i = 0; i < thislist->n; i++)
5893 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5894 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005895 fprintf(log_fd, "\n");
5896#endif
5897
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005898#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005899 fprintf(debug, "\n-------------------\n");
5900#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005901 /*
5902 * If the state lists are empty we can stop.
5903 */
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005904 if (thislist->n == 0)
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005905 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005906
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005907 // compute nextlist
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005908 for (listidx = 0; listidx < thislist->n; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005909 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005910 // If the list gets very long there probably is something wrong.
5911 // At least allow interrupting with CTRL-C.
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005912 fast_breakcheck();
5913 if (got_int)
5914 break;
5915#ifdef FEAT_RELTIME
5916 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
5917 {
5918 nfa_time_count = 0;
5919 if (nfa_did_time_out())
5920 break;
5921 }
5922#endif
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005923 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005924
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005925#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005926 nfa_set_code(t->state->c);
5927 fprintf(debug, "%s, ", code);
5928#endif
5929#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005930 {
5931 int col;
5932
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02005933 if (t->subs.norm.in_use <= 0)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005934 col = -1;
5935 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005936 col = t->subs.norm.list.multi[0].start_col;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005937 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005938 col = (int)(t->subs.norm.list.line[0].start - rex.line);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005939 nfa_set_code(t->state->c);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005940 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005941 abs(t->state->id), (int)t->state->c, code, col,
5942 pim_info(&t->pim));
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005943 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005944#endif
5945
5946 /*
5947 * Handle the possible codes of the current state.
5948 * The most important is NFA_MATCH.
5949 */
Bram Moolenaara2d95102013-06-04 14:23:05 +02005950 add_state = NULL;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005951 add_here = FALSE;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005952 add_count = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005953 switch (t->state->c)
5954 {
5955 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005956 {
Bram Moolenaaref2dff52020-12-21 14:54:32 +01005957 // If the match is not at the start of the line, ends before a
5958 // composing character and rex.reg_icombine is not set, that
5959 // is not really a match.
5960 if (enc_utf8 && !rex.reg_icombine
5961 && rex.input != rex.line && utf_iscomposing(curc))
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005962 break;
Bram Moolenaara12a1612019-01-24 16:39:02 +01005963
Bram Moolenaar963fee22013-05-26 21:47:28 +02005964 nfa_match = TRUE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005965 copy_sub(&submatch->norm, &t->subs.norm);
5966#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005967 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005968 copy_sub(&submatch->synt, &t->subs.synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005969#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005970#ifdef ENABLE_LOG
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005971 log_subsexpr(&t->subs);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005972#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005973 // Found the left-most longest match, do not look at any other
5974 // states at this position. When the list of states is going
5975 // to be empty quit without advancing, so that "rex.input" is
5976 // correct.
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005977 if (nextlist->n == 0)
Bram Moolenaar57a285b2013-05-26 16:57:28 +02005978 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005979 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005980 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005981
5982 case NFA_END_INVISIBLE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005983 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02005984 case NFA_END_PATTERN:
Bram Moolenaarf46da702013-06-02 22:37:42 +02005985 /*
5986 * This is only encountered after a NFA_START_INVISIBLE or
Bram Moolenaar61602c52013-06-01 19:54:43 +02005987 * NFA_START_INVISIBLE_BEFORE node.
5988 * They surround a zero-width group, used with "\@=", "\&",
5989 * "\@!", "\@<=" and "\@<!".
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005990 * If we got here, it means that the current "invisible" group
5991 * finished successfully, so return control to the parent
Bram Moolenaarf46da702013-06-02 22:37:42 +02005992 * nfa_regmatch(). For a look-behind match only when it ends
5993 * in the position in "nfa_endp".
5994 * Submatches are stored in *m, and used in the parent call.
5995 */
Bram Moolenaar61602c52013-06-01 19:54:43 +02005996#ifdef ENABLE_LOG
Bram Moolenaarf46da702013-06-02 22:37:42 +02005997 if (nfa_endp != NULL)
5998 {
5999 if (REG_MULTI)
6000 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02006001 (int)rex.lnum,
Bram Moolenaarf46da702013-06-02 22:37:42 +02006002 (int)nfa_endp->se_u.pos.lnum,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006003 (int)(rex.input - rex.line),
Bram Moolenaarf46da702013-06-02 22:37:42 +02006004 nfa_endp->se_u.pos.col);
6005 else
6006 fprintf(log_fd, "Current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02006007 (int)(rex.input - rex.line),
6008 (int)(nfa_endp->se_u.ptr - rex.input));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006009 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02006010#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006011 // If "nfa_endp" is set it's only a match if it ends at
6012 // "nfa_endp"
Bram Moolenaarf46da702013-06-02 22:37:42 +02006013 if (nfa_endp != NULL && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006014 ? (rex.lnum != nfa_endp->se_u.pos.lnum
6015 || (int)(rex.input - rex.line)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006016 != nfa_endp->se_u.pos.col)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006017 : rex.input != nfa_endp->se_u.ptr))
Bram Moolenaarf46da702013-06-02 22:37:42 +02006018 break;
6019
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006020 // do not set submatches for \@!
Bram Moolenaardecd9542013-06-07 16:31:50 +02006021 if (t->state->c != NFA_END_INVISIBLE_NEG)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006022 {
6023 copy_sub(&m->norm, &t->subs.norm);
6024#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006025 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf46da702013-06-02 22:37:42 +02006026 copy_sub(&m->synt, &t->subs.synt);
6027#endif
6028 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006029#ifdef ENABLE_LOG
6030 fprintf(log_fd, "Match found:\n");
6031 log_subsexpr(m);
6032#endif
Bram Moolenaarf46da702013-06-02 22:37:42 +02006033 nfa_match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006034 // See comment above at "goto nextchar".
Bram Moolenaar78c93e42013-09-05 16:05:36 +02006035 if (nextlist->n == 0)
6036 clen = 0;
6037 goto nextchar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006038
6039 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006040 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006041 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006042 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar61602c52013-06-01 19:54:43 +02006043 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006044 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006045 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006046 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006047 {
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02006048#ifdef ENABLE_LOG
6049 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
6050 failure_chance(t->state->out, 0),
6051 failure_chance(t->state->out1->out, 0));
Bram Moolenaarb76591e2013-06-04 21:42:22 +02006052#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006053 // Do it directly if there already is a PIM or when
6054 // nfa_postprocess() detected it will work better.
Bram Moolenaara2947e22013-06-11 22:44:09 +02006055 if (t->pim.result != NFA_PIM_UNUSED
6056 || t->state->c == NFA_START_INVISIBLE_FIRST
6057 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6058 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
6059 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006060 {
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006061 int in_use = m->norm.in_use;
6062
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006063 // Copy submatch info for the recursive call, opposite
6064 // of what happens on success below.
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006065 copy_sub_off(&m->norm, &t->subs.norm);
Bram Moolenaar699c1202013-09-25 16:41:54 +02006066#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006067 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006068 copy_sub_off(&m->synt, &t->subs.synt);
6069#endif
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006070
Bram Moolenaara2d95102013-06-04 14:23:05 +02006071 /*
6072 * First try matching the invisible match, then what
6073 * follows.
6074 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006075 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006076 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006077 if (result == NFA_TOO_EXPENSIVE)
6078 {
6079 nfa_match = result;
6080 goto theend;
6081 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006082
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006083 // for \@! and \@<! it is a match when the result is
6084 // FALSE
Bram Moolenaardecd9542013-06-07 16:31:50 +02006085 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006086 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6087 || t->state->c
6088 == NFA_START_INVISIBLE_BEFORE_NEG
6089 || t->state->c
6090 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006091 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006092 // Copy submatch info from the recursive call
Bram Moolenaara2d95102013-06-04 14:23:05 +02006093 copy_sub_off(&t->subs.norm, &m->norm);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006094#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006095 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006096 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006097#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006098 // If the pattern has \ze and it matched in the
6099 // sub pattern, use it.
Bram Moolenaarf2118842013-09-25 18:16:38 +02006100 copy_ze_off(&t->subs.norm, &m->norm);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02006101
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006102 // t->state->out1 is the corresponding
6103 // END_INVISIBLE node; Add its out to the current
6104 // list (zero-width match).
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006105 add_here = TRUE;
6106 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006107 }
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006108 m->norm.in_use = in_use;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006109 }
6110 else
6111 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006112 nfa_pim_T pim;
6113
Bram Moolenaara2d95102013-06-04 14:23:05 +02006114 /*
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006115 * First try matching what follows. Only if a match
6116 * is found verify the invisible match matches. Add a
6117 * nfa_pim_T to the following states, it contains info
6118 * about the invisible match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006119 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006120 pim.state = t->state;
6121 pim.result = NFA_PIM_TODO;
6122 pim.subs.norm.in_use = 0;
6123#ifdef FEAT_SYN_HL
6124 pim.subs.synt.in_use = 0;
6125#endif
6126 if (REG_MULTI)
6127 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006128 pim.end.pos.col = (int)(rex.input - rex.line);
6129 pim.end.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006130 }
6131 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006132 pim.end.ptr = rex.input;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006133
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006134 // t->state->out1 is the corresponding END_INVISIBLE
6135 // node; Add its out to the current list (zero-width
6136 // match).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006137 if (addstate_here(thislist, t->state->out1->out,
6138 &t->subs, &pim, &listidx) == NULL)
6139 {
6140 nfa_match = NFA_TOO_EXPENSIVE;
6141 goto theend;
6142 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006143 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006144 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006145 break;
6146
Bram Moolenaar87953742013-06-05 18:52:40 +02006147 case NFA_START_PATTERN:
Bram Moolenaar43e02982013-06-07 17:31:29 +02006148 {
6149 nfa_state_T *skip = NULL;
6150#ifdef ENABLE_LOG
6151 int skip_lid = 0;
6152#endif
6153
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006154 // There is no point in trying to match the pattern if the
6155 // output state is not going to be added to the list.
Bram Moolenaar43e02982013-06-07 17:31:29 +02006156 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6157 {
6158 skip = t->state->out1->out;
6159#ifdef ENABLE_LOG
6160 skip_lid = nextlist->id;
6161#endif
6162 }
6163 else if (state_in_list(nextlist,
6164 t->state->out1->out->out, &t->subs))
6165 {
6166 skip = t->state->out1->out->out;
6167#ifdef ENABLE_LOG
6168 skip_lid = nextlist->id;
6169#endif
6170 }
Bram Moolenaar44c71db2013-06-14 22:33:51 +02006171 else if (state_in_list(thislist,
Bram Moolenaar43e02982013-06-07 17:31:29 +02006172 t->state->out1->out->out, &t->subs))
6173 {
6174 skip = t->state->out1->out->out;
6175#ifdef ENABLE_LOG
6176 skip_lid = thislist->id;
6177#endif
6178 }
6179 if (skip != NULL)
6180 {
6181#ifdef ENABLE_LOG
6182 nfa_set_code(skip->c);
6183 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6184 abs(skip->id), skip_lid, skip->c, code);
6185#endif
6186 break;
6187 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006188 // Copy submatch info to the recursive call, opposite of what
6189 // happens afterwards.
Bram Moolenaar699c1202013-09-25 16:41:54 +02006190 copy_sub_off(&m->norm, &t->subs.norm);
6191#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006192 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006193 copy_sub_off(&m->synt, &t->subs.synt);
6194#endif
Bram Moolenaar43e02982013-06-07 17:31:29 +02006195
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006196 // First try matching the pattern.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006197 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006198 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006199 if (result == NFA_TOO_EXPENSIVE)
6200 {
6201 nfa_match = result;
6202 goto theend;
6203 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006204 if (result)
6205 {
6206 int bytelen;
6207
6208#ifdef ENABLE_LOG
6209 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6210 log_subsexpr(m);
6211#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006212 // Copy submatch info from the recursive call
Bram Moolenaar87953742013-06-05 18:52:40 +02006213 copy_sub_off(&t->subs.norm, &m->norm);
6214#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006215 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006216 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaar87953742013-06-05 18:52:40 +02006217#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006218 // Now we need to skip over the matched text and then
6219 // continue with what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02006220 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006221 // TODO: multi-line match
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01006222 bytelen = m->norm.list.multi[0].end_col
Bram Moolenaar0270f382018-07-17 05:43:58 +02006223 - (int)(rex.input - rex.line);
Bram Moolenaar87953742013-06-05 18:52:40 +02006224 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006225 bytelen = (int)(m->norm.list.line[0].end - rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02006226
6227#ifdef ENABLE_LOG
6228 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6229#endif
6230 if (bytelen == 0)
6231 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006232 // empty match, output of corresponding
6233 // NFA_END_PATTERN/NFA_SKIP to be used at current
6234 // position
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006235 add_here = TRUE;
6236 add_state = t->state->out1->out->out;
Bram Moolenaar87953742013-06-05 18:52:40 +02006237 }
6238 else if (bytelen <= clen)
6239 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006240 // match current character, output of corresponding
6241 // NFA_END_PATTERN to be used at next position.
Bram Moolenaar87953742013-06-05 18:52:40 +02006242 add_state = t->state->out1->out->out;
6243 add_off = clen;
6244 }
6245 else
6246 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006247 // skip over the matched characters, set character
6248 // count in NFA_SKIP
Bram Moolenaar87953742013-06-05 18:52:40 +02006249 add_state = t->state->out1->out;
6250 add_off = bytelen;
6251 add_count = bytelen - clen;
6252 }
6253 }
6254 break;
Bram Moolenaar43e02982013-06-07 17:31:29 +02006255 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006256
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006257 case NFA_BOL:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006258 if (rex.input == rex.line)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006259 {
6260 add_here = TRUE;
6261 add_state = t->state->out;
6262 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006263 break;
6264
6265 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006266 if (curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006267 {
6268 add_here = TRUE;
6269 add_state = t->state->out;
6270 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006271 break;
6272
6273 case NFA_BOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006274 result = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006275
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006276 if (curc == NUL)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006277 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006278 else if (has_mbyte)
6279 {
6280 int this_class;
6281
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006282 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006283 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006284 if (this_class <= 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006285 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006286 else if (reg_prev_class() == this_class)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006287 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006288 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006289 else if (!vim_iswordc_buf(curc, rex.reg_buf)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006290 || (rex.input > rex.line
6291 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006292 result = FALSE;
6293 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006294 {
6295 add_here = TRUE;
6296 add_state = t->state->out;
6297 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006298 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006299
6300 case NFA_EOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006301 result = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006302 if (rex.input == rex.line)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006303 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006304 else if (has_mbyte)
6305 {
6306 int this_class, prev_class;
6307
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006308 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006309 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006310 prev_class = reg_prev_class();
6311 if (this_class == prev_class
6312 || prev_class == 0 || prev_class == 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006313 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006314 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02006315 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6316 || (rex.input[0] != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006317 && vim_iswordc_buf(curc, rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006318 result = FALSE;
6319 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006320 {
6321 add_here = TRUE;
6322 add_state = t->state->out;
6323 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006324 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006325
Bram Moolenaar4b780632013-05-31 22:14:52 +02006326 case NFA_BOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006327 if (rex.lnum == 0 && rex.input == rex.line
Bram Moolenaar6100d022016-10-02 16:51:57 +02006328 && (!REG_MULTI || rex.reg_firstlnum == 1))
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006329 {
6330 add_here = TRUE;
6331 add_state = t->state->out;
6332 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006333 break;
6334
6335 case NFA_EOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006336 if (rex.lnum == rex.reg_maxline && curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006337 {
6338 add_here = TRUE;
6339 add_state = t->state->out;
6340 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006341 break;
6342
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006343 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006344 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006345 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02006346 int len = 0;
6347 nfa_state_T *end;
6348 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006349 int cchars[MAX_MCO];
6350 int ccount = 0;
6351 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006352
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006353 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006354 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006355 if (utf_iscomposing(sta->c))
6356 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006357 // Only match composing character(s), ignore base
6358 // character. Used for ".{composing}" and "{composing}"
6359 // (no preceding character).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006360 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006361 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006362 if (rex.reg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006363 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006364 // If \Z was present, then ignore composing characters.
6365 // When ignoring the base character this always matches.
Bram Moolenaardff72ba2018-02-08 22:45:17 +01006366 if (sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006367 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006368 else
6369 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006370 while (sta->c != NFA_END_COMPOSING)
6371 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006372 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006373
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006374 // Check base character matches first, unless ignored.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006375 else if (len > 0 || mc == sta->c)
6376 {
6377 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006378 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006379 len += mb_char2len(mc);
6380 sta = sta->out;
6381 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006382
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006383 // We don't care about the order of composing characters.
6384 // Get them into cchars[] first.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006385 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006386 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006387 mc = mb_ptr2char(rex.input + len);
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006388 cchars[ccount++] = mc;
6389 len += mb_char2len(mc);
6390 if (ccount == MAX_MCO)
6391 break;
6392 }
6393
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006394 // Check that each composing char in the pattern matches a
6395 // composing char in the text. We do not check if all
6396 // composing chars are matched.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006397 result = OK;
6398 while (sta->c != NFA_END_COMPOSING)
6399 {
6400 for (j = 0; j < ccount; ++j)
6401 if (cchars[j] == sta->c)
6402 break;
6403 if (j == ccount)
6404 {
6405 result = FAIL;
6406 break;
6407 }
6408 sta = sta->out;
6409 }
6410 }
6411 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02006412 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006413
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006414 end = t->state->out1; // NFA_END_COMPOSING
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006415 ADD_STATE_IF_MATCH(end);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006416 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006417 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006418
6419 case NFA_NEWL:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006420 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006421 && rex.lnum <= rex.reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006422 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02006423 go_to_nextline = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006424 // Pass -1 for the offset, which means taking the position
6425 // at the start of the next line.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006426 add_state = t->state->out;
6427 add_off = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006428 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006429 else if (curc == '\n' && rex.reg_line_lbr)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006430 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006431 // match \n as if it is an ordinary character
Bram Moolenaara2d95102013-06-04 14:23:05 +02006432 add_state = t->state->out;
6433 add_off = 1;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006434 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006435 break;
6436
Bram Moolenaar417bad22013-06-07 14:08:30 +02006437 case NFA_START_COLL:
6438 case NFA_START_NEG_COLL:
6439 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006440 // What follows is a list of characters, until NFA_END_COLL.
6441 // One of them must match or none of them must match.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006442 nfa_state_T *state;
6443 int result_if_matched;
6444 int c1, c2;
6445
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006446 // Never match EOL. If it's part of the collection it is added
6447 // as a separate state with an OR.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006448 if (curc == NUL)
6449 break;
6450
6451 state = t->state->out;
6452 result_if_matched = (t->state->c == NFA_START_COLL);
6453 for (;;)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006454 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02006455 if (state->c == NFA_END_COLL)
6456 {
6457 result = !result_if_matched;
6458 break;
6459 }
6460 if (state->c == NFA_RANGE_MIN)
6461 {
6462 c1 = state->val;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006463 state = state->out; // advance to NFA_RANGE_MAX
Bram Moolenaar417bad22013-06-07 14:08:30 +02006464 c2 = state->val;
6465#ifdef ENABLE_LOG
6466 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6467 curc, c1, c2);
6468#endif
6469 if (curc >= c1 && curc <= c2)
6470 {
6471 result = result_if_matched;
6472 break;
6473 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006474 if (rex.reg_ic)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006475 {
Bram Moolenaar59de4172020-06-09 19:34:54 +02006476 int curc_low = MB_CASEFOLD(curc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02006477 int done = FALSE;
6478
6479 for ( ; c1 <= c2; ++c1)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006480 if (MB_CASEFOLD(c1) == curc_low)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006481 {
6482 result = result_if_matched;
6483 done = TRUE;
6484 break;
6485 }
6486 if (done)
6487 break;
6488 }
6489 }
6490 else if (state->c < 0 ? check_char_class(state->c, curc)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01006491 : (curc == state->c
Bram Moolenaar59de4172020-06-09 19:34:54 +02006492 || (rex.reg_ic && MB_CASEFOLD(curc)
6493 == MB_CASEFOLD(state->c))))
Bram Moolenaar417bad22013-06-07 14:08:30 +02006494 {
6495 result = result_if_matched;
6496 break;
6497 }
6498 state = state->out;
6499 }
6500 if (result)
6501 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006502 // next state is in out of the NFA_END_COLL, out1 of
6503 // START points to the END state
Bram Moolenaar417bad22013-06-07 14:08:30 +02006504 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006505 add_off = clen;
6506 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006507 break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02006508 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006509
6510 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006511 // Any char except '\0' matches; '\0' (end of input) does not match.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006512 if (curc > 0)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006513 {
Bram Moolenaara2d95102013-06-04 14:23:05 +02006514 add_state = t->state->out;
6515 add_off = clen;
6516 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006517 break;
6518
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006519 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006520 // On a composing character skip over it. Otherwise do
6521 // nothing. Always matches.
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006522 if (enc_utf8 && utf_iscomposing(curc))
6523 {
6524 add_off = clen;
6525 }
6526 else
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006527 {
6528 add_here = TRUE;
6529 add_off = 0;
6530 }
6531 add_state = t->state->out;
6532 break;
6533
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006534 /*
6535 * Character classes like \a for alpha, \d for digit etc.
6536 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006537 case NFA_IDENT: // \i
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006538 result = vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006539 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006540 break;
6541
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006542 case NFA_SIDENT: // \I
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006543 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006544 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006545 break;
6546
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006547 case NFA_KWORD: // \k
Bram Moolenaar0270f382018-07-17 05:43:58 +02006548 result = vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006549 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006550 break;
6551
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006552 case NFA_SKWORD: // \K
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006553 result = !VIM_ISDIGIT(curc)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006554 && vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006555 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006556 break;
6557
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006558 case NFA_FNAME: // \f
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006559 result = vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006560 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006561 break;
6562
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006563 case NFA_SFNAME: // \F
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006564 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006565 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006566 break;
6567
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006568 case NFA_PRINT: // \p
Bram Moolenaar0270f382018-07-17 05:43:58 +02006569 result = vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006570 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006571 break;
6572
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006573 case NFA_SPRINT: // \P
Bram Moolenaar0270f382018-07-17 05:43:58 +02006574 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006575 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006576 break;
6577
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006578 case NFA_WHITE: // \s
Bram Moolenaar1c465442017-03-12 20:10:05 +01006579 result = VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006580 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006581 break;
6582
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006583 case NFA_NWHITE: // \S
Bram Moolenaar1c465442017-03-12 20:10:05 +01006584 result = curc != NUL && !VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006585 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006586 break;
6587
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006588 case NFA_DIGIT: // \d
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006589 result = ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006590 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006591 break;
6592
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006593 case NFA_NDIGIT: // \D
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006594 result = curc != NUL && !ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006595 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006596 break;
6597
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006598 case NFA_HEX: // \x
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006599 result = ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006600 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006601 break;
6602
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006603 case NFA_NHEX: // \X
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006604 result = curc != NUL && !ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006605 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006606 break;
6607
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006608 case NFA_OCTAL: // \o
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006609 result = ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006610 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006611 break;
6612
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006613 case NFA_NOCTAL: // \O
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006614 result = curc != NUL && !ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006615 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006616 break;
6617
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006618 case NFA_WORD: // \w
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006619 result = ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006620 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006621 break;
6622
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006623 case NFA_NWORD: // \W
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006624 result = curc != NUL && !ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006625 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006626 break;
6627
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006628 case NFA_HEAD: // \h
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006629 result = ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006630 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006631 break;
6632
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006633 case NFA_NHEAD: // \H
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006634 result = curc != NUL && !ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006635 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006636 break;
6637
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006638 case NFA_ALPHA: // \a
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006639 result = ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006640 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006641 break;
6642
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006643 case NFA_NALPHA: // \A
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006644 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006645 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006646 break;
6647
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006648 case NFA_LOWER: // \l
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006649 result = ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006650 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006651 break;
6652
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006653 case NFA_NLOWER: // \L
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006654 result = curc != NUL && !ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006655 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006656 break;
6657
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006658 case NFA_UPPER: // \u
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006659 result = ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006660 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006661 break;
6662
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006663 case NFA_NUPPER: // \U
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006664 result = curc != NUL && !ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006665 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006666 break;
6667
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006668 case NFA_LOWER_IC: // [a-z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006669 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006670 ADD_STATE_IF_MATCH(t->state);
6671 break;
6672
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006673 case NFA_NLOWER_IC: // [^a-z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006674 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006675 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006676 ADD_STATE_IF_MATCH(t->state);
6677 break;
6678
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006679 case NFA_UPPER_IC: // [A-Z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006680 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006681 ADD_STATE_IF_MATCH(t->state);
6682 break;
6683
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006684 case NFA_NUPPER_IC: // ^[A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006685 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006686 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006687 ADD_STATE_IF_MATCH(t->state);
6688 break;
6689
Bram Moolenaar5714b802013-05-28 22:03:20 +02006690 case NFA_BACKREF1:
6691 case NFA_BACKREF2:
6692 case NFA_BACKREF3:
6693 case NFA_BACKREF4:
6694 case NFA_BACKREF5:
6695 case NFA_BACKREF6:
6696 case NFA_BACKREF7:
6697 case NFA_BACKREF8:
6698 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006699#ifdef FEAT_SYN_HL
6700 case NFA_ZREF1:
6701 case NFA_ZREF2:
6702 case NFA_ZREF3:
6703 case NFA_ZREF4:
6704 case NFA_ZREF5:
6705 case NFA_ZREF6:
6706 case NFA_ZREF7:
6707 case NFA_ZREF8:
6708 case NFA_ZREF9:
6709#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006710 // \1 .. \9 \z1 .. \z9
Bram Moolenaar5714b802013-05-28 22:03:20 +02006711 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006712 int subidx;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006713 int bytelen;
6714
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006715 if (t->state->c <= NFA_BACKREF9)
6716 {
6717 subidx = t->state->c - NFA_BACKREF1 + 1;
6718 result = match_backref(&t->subs.norm, subidx, &bytelen);
6719 }
6720#ifdef FEAT_SYN_HL
6721 else
6722 {
6723 subidx = t->state->c - NFA_ZREF1 + 1;
6724 result = match_zref(subidx, &bytelen);
6725 }
6726#endif
6727
Bram Moolenaar5714b802013-05-28 22:03:20 +02006728 if (result)
6729 {
6730 if (bytelen == 0)
6731 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006732 // empty match always works, output of NFA_SKIP to be
6733 // used next
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006734 add_here = TRUE;
6735 add_state = t->state->out->out;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006736 }
6737 else if (bytelen <= clen)
6738 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006739 // match current character, jump ahead to out of
6740 // NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006741 add_state = t->state->out->out;
6742 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006743 }
6744 else
6745 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006746 // skip over the matched characters, set character
6747 // count in NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006748 add_state = t->state->out;
6749 add_off = bytelen;
6750 add_count = bytelen - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006751 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02006752 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02006753 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006754 }
6755 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006756 // character of previous matching \1 .. \9 or \@>
Bram Moolenaar5714b802013-05-28 22:03:20 +02006757 if (t->count - clen <= 0)
6758 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006759 // end of match, go to what follows
Bram Moolenaara2d95102013-06-04 14:23:05 +02006760 add_state = t->state->out;
6761 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006762 }
6763 else
6764 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006765 // add state again with decremented count
Bram Moolenaara2d95102013-06-04 14:23:05 +02006766 add_state = t->state;
6767 add_off = 0;
6768 add_count = t->count - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006769 }
6770 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006771
Bram Moolenaar423532e2013-05-29 21:14:42 +02006772 case NFA_LNUM:
6773 case NFA_LNUM_GT:
6774 case NFA_LNUM_LT:
6775 result = (REG_MULTI &&
6776 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006777 (long_u)(rex.lnum + rex.reg_firstlnum)));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006778 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006779 {
6780 add_here = TRUE;
6781 add_state = t->state->out;
6782 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006783 break;
6784
6785 case NFA_COL:
6786 case NFA_COL_GT:
6787 case NFA_COL_LT:
6788 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006789 (long_u)(rex.input - rex.line) + 1);
Bram Moolenaar423532e2013-05-29 21:14:42 +02006790 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006791 {
6792 add_here = TRUE;
6793 add_state = t->state->out;
6794 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006795 break;
6796
6797 case NFA_VCOL:
6798 case NFA_VCOL_GT:
6799 case NFA_VCOL_LT:
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006800 {
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006801 int op = t->state->c - NFA_VCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006802 colnr_T col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar6100d022016-10-02 16:51:57 +02006803 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006804
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006805 // Bail out quickly when there can't be a match, avoid the
6806 // overhead of win_linetabsize() on long lines.
Bram Moolenaar4f36dc32015-03-05 17:16:06 +01006807 if (op != 1 && col > t->state->val
Bram Moolenaara12a1612019-01-24 16:39:02 +01006808 * (has_mbyte ? MB_MAXBYTES : 1))
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006809 break;
Bram Moolenaaref795d12015-01-18 16:46:32 +01006810 result = FALSE;
6811 if (op == 1 && col - 1 > t->state->val && col > 100)
6812 {
6813 int ts = wp->w_buffer->b_p_ts;
6814
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006815 // Guess that a character won't use more columns than
6816 // 'tabstop', with a minimum of 4.
Bram Moolenaaref795d12015-01-18 16:46:32 +01006817 if (ts < 4)
6818 ts = 4;
6819 result = col > t->state->val * ts;
6820 }
6821 if (!result)
6822 result = nfa_re_num_cmp(t->state->val, op,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006823 (long_u)win_linetabsize(wp, rex.line, col) + 1);
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006824 if (result)
6825 {
6826 add_here = TRUE;
6827 add_state = t->state->out;
6828 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006829 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006830 break;
6831
Bram Moolenaar044aa292013-06-04 21:27:38 +02006832 case NFA_MARK:
6833 case NFA_MARK_GT:
6834 case NFA_MARK_LT:
6835 {
Bram Moolenaar64066b92021-11-17 18:22:56 +00006836 size_t col = rex.input - rex.line;
Bram Moolenaar6100d022016-10-02 16:51:57 +02006837 pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
Bram Moolenaar044aa292013-06-04 21:27:38 +02006838
Bram Moolenaar64066b92021-11-17 18:22:56 +00006839 // Line may have been freed, get it again.
6840 if (REG_MULTI)
6841 {
6842 rex.line = reg_getline(rex.lnum);
6843 rex.input = rex.line + col;
6844 }
6845
Bram Moolenaar872bee52021-05-24 22:56:15 +02006846 // Compare the mark position to the match position, if the mark
6847 // exists and mark is set in reg_buf.
6848 if (pos != NULL && pos->lnum > 0)
6849 {
6850 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
6851 && pos->col == MAXCOL
6852 ? (colnr_T)STRLEN(reg_getline(
6853 pos->lnum - rex.reg_firstlnum))
6854 : pos->col;
6855
6856 result = (pos->lnum == rex.lnum + rex.reg_firstlnum
6857 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006858 ? t->state->c == NFA_MARK
Bram Moolenaar872bee52021-05-24 22:56:15 +02006859 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006860 ? t->state->c == NFA_MARK_GT
6861 : t->state->c == NFA_MARK_LT))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006862 : (pos->lnum < rex.lnum + rex.reg_firstlnum
Bram Moolenaar044aa292013-06-04 21:27:38 +02006863 ? t->state->c == NFA_MARK_GT
Bram Moolenaar872bee52021-05-24 22:56:15 +02006864 : t->state->c == NFA_MARK_LT));
6865 if (result)
6866 {
6867 add_here = TRUE;
6868 add_state = t->state->out;
6869 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006870 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02006871 break;
6872 }
6873
Bram Moolenaar423532e2013-05-29 21:14:42 +02006874 case NFA_CURSOR:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006875 result = (rex.reg_win != NULL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006876 && (rex.lnum + rex.reg_firstlnum
Bram Moolenaar6100d022016-10-02 16:51:57 +02006877 == rex.reg_win->w_cursor.lnum)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006878 && ((colnr_T)(rex.input - rex.line)
Bram Moolenaar6100d022016-10-02 16:51:57 +02006879 == rex.reg_win->w_cursor.col));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006880 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006881 {
6882 add_here = TRUE;
6883 add_state = t->state->out;
6884 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006885 break;
6886
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006887 case NFA_VISUAL:
6888 result = reg_match_visual();
6889 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006890 {
6891 add_here = TRUE;
6892 add_state = t->state->out;
6893 }
Bram Moolenaar973fced2013-06-05 21:10:59 +02006894 break;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006895
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006896 case NFA_MOPEN1:
6897 case NFA_MOPEN2:
6898 case NFA_MOPEN3:
6899 case NFA_MOPEN4:
6900 case NFA_MOPEN5:
6901 case NFA_MOPEN6:
6902 case NFA_MOPEN7:
6903 case NFA_MOPEN8:
6904 case NFA_MOPEN9:
6905#ifdef FEAT_SYN_HL
6906 case NFA_ZOPEN:
6907 case NFA_ZOPEN1:
6908 case NFA_ZOPEN2:
6909 case NFA_ZOPEN3:
6910 case NFA_ZOPEN4:
6911 case NFA_ZOPEN5:
6912 case NFA_ZOPEN6:
6913 case NFA_ZOPEN7:
6914 case NFA_ZOPEN8:
6915 case NFA_ZOPEN9:
6916#endif
6917 case NFA_NOPEN:
6918 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006919 // These states are only added to be able to bail out when
6920 // they are added again, nothing is to be done.
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006921 break;
6922
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006923 default: // regular character
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006924 {
6925 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006926
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006927#ifdef DEBUG
Bram Moolenaardecd9542013-06-07 16:31:50 +02006928 if (c < 0)
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01006929 siemsg("INTERNAL: Negative state char: %ld", c);
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006930#endif
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006931 result = (c == curc);
6932
Bram Moolenaar6100d022016-10-02 16:51:57 +02006933 if (!result && rex.reg_ic)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006934 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006935 // If rex.reg_icombine is not set only skip over the character
6936 // itself. When it is set skip over composing characters.
Bram Moolenaar6100d022016-10-02 16:51:57 +02006937 if (result && enc_utf8 && !rex.reg_icombine)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006938 clen = utf_ptr2len(rex.input);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006939 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006940 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006941 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006942
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006943 } // switch (t->state->c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006944
6945 if (add_state != NULL)
6946 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006947 nfa_pim_T *pim;
Bram Moolenaara951e352013-10-06 15:46:11 +02006948 nfa_pim_T pim_copy;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006949
6950 if (t->pim.result == NFA_PIM_UNUSED)
6951 pim = NULL;
6952 else
6953 pim = &t->pim;
6954
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006955 // Handle the postponed invisible match if the match might end
6956 // without advancing and before the end of the line.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006957 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006958 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006959 if (pim->result == NFA_PIM_TODO)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006960 {
6961#ifdef ENABLE_LOG
6962 fprintf(log_fd, "\n");
6963 fprintf(log_fd, "==================================\n");
6964 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6965 fprintf(log_fd, "\n");
6966#endif
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006967 result = recursive_regmatch(pim->state, pim,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006968 prog, submatch, m, &listids, &listids_len);
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006969 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006970 // for \@! and \@<! it is a match when the result is
6971 // FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006972 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006973 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6974 || pim->state->c
6975 == NFA_START_INVISIBLE_BEFORE_NEG
6976 || pim->state->c
6977 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006978 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006979 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006980 copy_sub_off(&pim->subs.norm, &m->norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006981#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006982 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006983 copy_sub_off(&pim->subs.synt, &m->synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006984#endif
6985 }
6986 }
6987 else
6988 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006989 result = (pim->result == NFA_PIM_MATCH);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006990#ifdef ENABLE_LOG
6991 fprintf(log_fd, "\n");
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006992 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006993 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6994 fprintf(log_fd, "\n");
6995#endif
6996 }
6997
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006998 // for \@! and \@<! it is a match when result is FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006999 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02007000 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
7001 || pim->state->c
7002 == NFA_START_INVISIBLE_BEFORE_NEG
7003 || pim->state->c
7004 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02007005 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007006 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007007 copy_sub_off(&t->subs.norm, &pim->subs.norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007008#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02007009 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007010 copy_sub_off(&t->subs.synt, &pim->subs.synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02007011#endif
7012 }
7013 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007014 // look-behind match failed, don't add the state
Bram Moolenaara2d95102013-06-04 14:23:05 +02007015 continue;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007016
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007017 // Postponed invisible match was handled, don't add it to
7018 // following states.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02007019 pim = NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02007020 }
7021
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007022 // If "pim" points into l->t it will become invalid when
7023 // adding the state causes the list to be reallocated. Make a
7024 // local copy to avoid that.
Bram Moolenaara951e352013-10-06 15:46:11 +02007025 if (pim == &t->pim)
7026 {
7027 copy_pim(&pim_copy, pim);
7028 pim = &pim_copy;
7029 }
7030
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007031 if (add_here)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007032 r = addstate_here(thislist, add_state, &t->subs,
7033 pim, &listidx);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007034 else
7035 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007036 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02007037 if (add_count > 0)
7038 nextlist->t[nextlist->n - 1].count = add_count;
7039 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007040 if (r == NULL)
7041 {
7042 nfa_match = NFA_TOO_EXPENSIVE;
7043 goto theend;
7044 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007045 }
7046
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007047 } // for (thislist = thislist; thislist->state; thislist++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007048
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007049 // Look for the start of a match in the current position by adding the
7050 // start state to the list of states.
7051 // The first found match is the leftmost one, thus the order of states
7052 // matters!
7053 // Do not add the start state in recursive calls of nfa_regmatch(),
7054 // because recursive calls should only start in the first position.
7055 // Unless "nfa_endp" is not NULL, then we match the end position.
7056 // Also don't start a match past the first line.
Bram Moolenaar61602c52013-06-01 19:54:43 +02007057 if (nfa_match == FALSE
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007058 && ((toplevel
Bram Moolenaar0270f382018-07-17 05:43:58 +02007059 && rex.lnum == 0
Bram Moolenaar61602c52013-06-01 19:54:43 +02007060 && clen != 0
Bram Moolenaar6100d022016-10-02 16:51:57 +02007061 && (rex.reg_maxcol == 0
Bram Moolenaar0270f382018-07-17 05:43:58 +02007062 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
Bram Moolenaar307aa162013-06-02 16:34:21 +02007063 || (nfa_endp != NULL
Bram Moolenaar61602c52013-06-01 19:54:43 +02007064 && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007065 ? (rex.lnum < nfa_endp->se_u.pos.lnum
7066 || (rex.lnum == nfa_endp->se_u.pos.lnum
7067 && (int)(rex.input - rex.line)
Bram Moolenaar307aa162013-06-02 16:34:21 +02007068 < nfa_endp->se_u.pos.col))
Bram Moolenaar0270f382018-07-17 05:43:58 +02007069 : rex.input < nfa_endp->se_u.ptr))))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007070 {
7071#ifdef ENABLE_LOG
7072 fprintf(log_fd, "(---) STARTSTATE\n");
7073#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007074 // Inline optimized code for addstate() if we know the state is
7075 // the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007076 if (toplevel)
7077 {
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007078 int add = TRUE;
7079 int c;
7080
7081 if (prog->regstart != NUL && clen != 0)
7082 {
7083 if (nextlist->n == 0)
7084 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007085 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007086
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007087 // Nextlist is empty, we can skip ahead to the
7088 // character that must appear at the start.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007089 if (skip_to_start(prog->regstart, &col) == FAIL)
7090 break;
7091#ifdef ENABLE_LOG
7092 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02007093 col - ((colnr_T)(rex.input - rex.line) + clen));
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007094#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007095 rex.input = rex.line + col - clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007096 }
7097 else
7098 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007099 // Checking if the required start character matches is
7100 // cheaper than adding a state that won't match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02007101 c = PTR2CHAR(rex.input + clen);
Bram Moolenaar6100d022016-10-02 16:51:57 +02007102 if (c != prog->regstart && (!rex.reg_ic
Bram Moolenaar59de4172020-06-09 19:34:54 +02007103 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007104 {
7105#ifdef ENABLE_LOG
7106 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7107#endif
7108 add = FALSE;
7109 }
7110 }
7111 }
7112
7113 if (add)
7114 {
7115 if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007116 m->norm.list.multi[0].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02007117 (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007118 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007119 m->norm.list.line[0].start = rex.input + clen;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007120 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7121 {
7122 nfa_match = NFA_TOO_EXPENSIVE;
7123 goto theend;
7124 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007125 }
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007126 }
7127 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007128 {
7129 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7130 {
7131 nfa_match = NFA_TOO_EXPENSIVE;
7132 goto theend;
7133 }
7134 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007135 }
7136
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007137#ifdef ENABLE_LOG
7138 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007139 {
7140 int i;
7141
7142 for (i = 0; i < thislist->n; i++)
7143 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7144 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007145 fprintf(log_fd, "\n");
7146#endif
7147
7148nextchar:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007149 // Advance to the next character, or advance to the next line, or
7150 // finish.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007151 if (clen != 0)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007152 rex.input += clen;
Bram Moolenaar307aa162013-06-02 16:34:21 +02007153 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007154 && rex.lnum < nfa_endp->se_u.pos.lnum))
Bram Moolenaar35b23862013-05-22 23:00:40 +02007155 reg_nextline();
7156 else
7157 break;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007158
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007159 // Allow interrupting with CTRL-C.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007160 line_breakcheck();
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007161 if (got_int)
7162 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007163#ifdef FEAT_RELTIME
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007164 // Check for timeout once every twenty times to avoid overhead.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007165 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
7166 {
7167 nfa_time_count = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007168 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007169 break;
7170 }
7171#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02007172 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007173
7174#ifdef ENABLE_LOG
7175 if (log_fd != stderr)
7176 fclose(log_fd);
7177 log_fd = NULL;
7178#endif
7179
7180theend:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007181 // Free memory
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007182 vim_free(list[0].t);
7183 vim_free(list[1].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02007184 vim_free(listids);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007185#undef ADD_STATE_IF_MATCH
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02007186#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007187 fclose(debug);
7188#endif
7189
Bram Moolenaar963fee22013-05-26 21:47:28 +02007190 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007191}
7192
7193/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02007194 * Try to match "prog" starting at rex.line["col"].
Bram Moolenaar8c731502014-11-23 15:57:49 +01007195 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007196 */
7197 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007198nfa_regtry(
7199 nfa_regprog_T *prog,
7200 colnr_T col,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007201 proftime_T *tm UNUSED, // timeout limit or NULL
7202 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007203{
7204 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007205 regsubs_T subs, m;
7206 nfa_state_T *start = prog->start;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007207 int result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007208#ifdef ENABLE_LOG
7209 FILE *f;
7210#endif
7211
Bram Moolenaar0270f382018-07-17 05:43:58 +02007212 rex.input = rex.line + col;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007213#ifdef FEAT_RELTIME
7214 nfa_time_limit = tm;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007215 nfa_timed_out = timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007216 nfa_time_count = 0;
7217#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007218
7219#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007220 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007221 if (f != NULL)
7222 {
Bram Moolenaar87953742013-06-05 18:52:40 +02007223 fprintf(f, "\n\n\t=======================================================\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007224#ifdef DEBUG
7225 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
7226#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007227 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02007228 fprintf(f, "\t=======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02007229 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007230 fprintf(f, "\n\n");
7231 fclose(f);
7232 }
7233 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01007234 emsg("Could not open temporary log file for writing");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007235#endif
7236
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007237 clear_sub(&subs.norm);
7238 clear_sub(&m.norm);
7239#ifdef FEAT_SYN_HL
7240 clear_sub(&subs.synt);
7241 clear_sub(&m.synt);
7242#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007243
Bram Moolenaarfda37292014-11-05 14:27:36 +01007244 result = nfa_regmatch(prog, start, &subs, &m);
7245 if (result == FALSE)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007246 return 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007247 else if (result == NFA_TOO_EXPENSIVE)
7248 return result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007249
7250 cleanup_subexpr();
7251 if (REG_MULTI)
7252 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007253 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007254 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007255 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7256 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007257
Bram Moolenaar6100d022016-10-02 16:51:57 +02007258 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7259 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007260 }
7261
Bram Moolenaar6100d022016-10-02 16:51:57 +02007262 if (rex.reg_startpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007263 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007264 rex.reg_startpos[0].lnum = 0;
7265 rex.reg_startpos[0].col = col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007266 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02007267 if (rex.reg_endpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007268 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007269 // pattern has a \ze but it didn't match, use current end
Bram Moolenaar0270f382018-07-17 05:43:58 +02007270 rex.reg_endpos[0].lnum = rex.lnum;
7271 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007272 }
7273 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007274 // Use line number of "\ze".
Bram Moolenaar0270f382018-07-17 05:43:58 +02007275 rex.lnum = rex.reg_endpos[0].lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007276 }
7277 else
7278 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007279 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007280 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007281 rex.reg_startp[i] = subs.norm.list.line[i].start;
7282 rex.reg_endp[i] = subs.norm.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007283 }
7284
Bram Moolenaar6100d022016-10-02 16:51:57 +02007285 if (rex.reg_startp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007286 rex.reg_startp[0] = rex.line + col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007287 if (rex.reg_endp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007288 rex.reg_endp[0] = rex.input;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007289 }
7290
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007291#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007292 // Package any found \z(...\) matches for export. Default is none.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007293 unref_extmatch(re_extmatch_out);
7294 re_extmatch_out = NULL;
7295
7296 if (prog->reghasz == REX_SET)
7297 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007298 cleanup_zsubexpr();
7299 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01007300 if (re_extmatch_out == NULL)
7301 return 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007302 // Loop over \z1, \z2, etc. There is no \z0.
Bram Moolenaar5ad075c2015-11-24 15:18:32 +01007303 for (i = 1; i < subs.synt.in_use; i++)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007304 {
7305 if (REG_MULTI)
7306 {
7307 struct multipos *mpos = &subs.synt.list.multi[i];
7308
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007309 // Only accept single line matches that are valid.
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007310 if (mpos->start_lnum >= 0
7311 && mpos->start_lnum == mpos->end_lnum
7312 && mpos->end_col >= mpos->start_col)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007313 re_extmatch_out->matches[i] =
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007314 vim_strnsave(reg_getline(mpos->start_lnum)
7315 + mpos->start_col,
7316 mpos->end_col - mpos->start_col);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007317 }
7318 else
7319 {
7320 struct linepos *lpos = &subs.synt.list.line[i];
7321
7322 if (lpos->start != NULL && lpos->end != NULL)
7323 re_extmatch_out->matches[i] =
Bram Moolenaar71ccd032020-06-12 22:59:11 +02007324 vim_strnsave(lpos->start, lpos->end - lpos->start);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007325 }
7326 }
7327 }
7328#endif
7329
Bram Moolenaar0270f382018-07-17 05:43:58 +02007330 return 1 + rex.lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007331}
7332
7333/*
7334 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02007335 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007336 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007337 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007338 */
7339 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007340nfa_regexec_both(
7341 char_u *line,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007342 colnr_T startcol, // column to start looking for match
7343 proftime_T *tm, // timeout limit or NULL
7344 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007345{
7346 nfa_regprog_T *prog;
7347 long retval = 0L;
7348 int i;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007349 colnr_T col = startcol;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007350
7351 if (REG_MULTI)
7352 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007353 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007354 line = reg_getline((linenr_T)0); // relative to the cursor
Bram Moolenaar6100d022016-10-02 16:51:57 +02007355 rex.reg_startpos = rex.reg_mmatch->startpos;
7356 rex.reg_endpos = rex.reg_mmatch->endpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007357 }
7358 else
7359 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007360 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7361 rex.reg_startp = rex.reg_match->startp;
7362 rex.reg_endp = rex.reg_match->endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007363 }
7364
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007365 // Be paranoid...
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007366 if (prog == NULL || line == NULL)
7367 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02007368 iemsg(_(e_null_argument));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007369 goto theend;
7370 }
7371
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007372 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007373 if (prog->regflags & RF_ICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007374 rex.reg_ic = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007375 else if (prog->regflags & RF_NOICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007376 rex.reg_ic = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007377
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007378 // If pattern contains "\Z" overrule value of rex.reg_icombine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007379 if (prog->regflags & RF_ICOMBINE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007380 rex.reg_icombine = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007381
Bram Moolenaar0270f382018-07-17 05:43:58 +02007382 rex.line = line;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007383 rex.lnum = 0; // relative to line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007384
Bram Moolenaar0270f382018-07-17 05:43:58 +02007385 rex.nfa_has_zend = prog->has_zend;
7386 rex.nfa_has_backref = prog->has_backref;
7387 rex.nfa_nsubexpr = prog->nsubexp;
7388 rex.nfa_listid = 1;
7389 rex.nfa_alt_listid = 2;
7390#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007391 nfa_regengine.expr = prog->pattern;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007392#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007393
Bram Moolenaard89616e2013-06-06 18:46:06 +02007394 if (prog->reganch && col > 0)
7395 return 0L;
7396
Bram Moolenaar0270f382018-07-17 05:43:58 +02007397 rex.need_clear_subexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007398#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007399 // Clear the external match subpointers if necessary.
Bram Moolenaar473de612013-06-08 18:19:48 +02007400 if (prog->reghasz == REX_SET)
7401 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007402 rex.nfa_has_zsubexpr = TRUE;
7403 rex.need_clear_zsubexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007404 }
7405 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007406 {
7407 rex.nfa_has_zsubexpr = FALSE;
7408 rex.need_clear_zsubexpr = FALSE;
7409 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007410#endif
7411
Bram Moolenaard89616e2013-06-06 18:46:06 +02007412 if (prog->regstart != NUL)
Bram Moolenaar473de612013-06-08 18:19:48 +02007413 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007414 // Skip ahead until a character we know the match must start with.
7415 // When there is none there is no match.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007416 if (skip_to_start(prog->regstart, &col) == FAIL)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007417 return 0L;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007418
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007419 // If match_text is set it contains the full text that must match.
7420 // Nothing else to try. Doesn't handle combining chars well.
Bram Moolenaara12a1612019-01-24 16:39:02 +01007421 if (prog->match_text != NULL && !rex.reg_icombine)
Bram Moolenaar473de612013-06-08 18:19:48 +02007422 return find_match_text(col, prog->regstart, prog->match_text);
7423 }
7424
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007425 // If the start column is past the maximum column: no need to try.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007426 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007427 goto theend;
7428
Bram Moolenaar0270f382018-07-17 05:43:58 +02007429 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7430 // it's accidentally used during execution.
7431 nstate = 0;
7432 for (i = 0; i < prog->nstate; ++i)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007433 {
7434 prog->state[i].id = i;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02007435 prog->state[i].lastlist[0] = 0;
7436 prog->state[i].lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007437 }
7438
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007439 retval = nfa_regtry(prog, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007440
Bram Moolenaar0270f382018-07-17 05:43:58 +02007441#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007442 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007443#endif
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007444
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007445theend:
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007446 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007447 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007448 // Make sure the end is never before the start. Can happen when \zs and
7449 // \ze are used.
7450 if (REG_MULTI)
7451 {
7452 lpos_T *start = &rex.reg_mmatch->startpos[0];
7453 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007454
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007455 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007456 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007457 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7458 }
7459 else
7460 {
7461 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7462 rex.reg_match->endp[0] = rex.reg_match->startp[0];
7463 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007464 }
7465
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007466 return retval;
7467}
7468
7469/*
7470 * Compile a regular expression into internal code for the NFA matcher.
7471 * Returns the program in allocated space. Returns NULL for an error.
7472 */
7473 static regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01007474nfa_regcomp(char_u *expr, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007475{
Bram Moolenaaraae48832013-05-25 21:18:34 +02007476 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02007477 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007478 int *postfix;
7479
7480 if (expr == NULL)
7481 return NULL;
7482
Bram Moolenaar0270f382018-07-17 05:43:58 +02007483#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007484 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007485#endif
Bram Moolenaare0ad3652015-01-27 12:59:55 +01007486 nfa_re_flags = re_flags;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007487
7488 init_class_tab();
7489
7490 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7491 return NULL;
7492
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007493 // Build postfix form of the regexp. Needed to build the NFA
7494 // (and count its size).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007495 postfix = re2post();
7496 if (postfix == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007497 goto fail; // Cascaded (syntax?) error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007498
7499 /*
7500 * In order to build the NFA, we parse the input regexp twice:
7501 * 1. first pass to count size (so we can allocate space)
7502 * 2. second to emit code
7503 */
7504#ifdef ENABLE_LOG
7505 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007506 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007507
7508 if (f != NULL)
7509 {
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02007510 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007511 fclose(f);
7512 }
7513 }
7514#endif
7515
7516 /*
7517 * PASS 1
7518 * Count number of NFA states in "nstate". Do not build the NFA.
7519 */
7520 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007521
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007522 // allocate the regprog with space for the compiled regexp
Bram Moolenaar16619a22013-06-11 18:42:36 +02007523 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02007524 prog = alloc(prog_size);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007525 if (prog == NULL)
7526 goto fail;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007527 state_ptr = prog->state;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007528 prog->re_in_use = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007529
7530 /*
7531 * PASS 2
7532 * Build the NFA
7533 */
7534 prog->start = post2nfa(postfix, post_ptr, FALSE);
7535 if (prog->start == NULL)
7536 goto fail;
7537
7538 prog->regflags = regflags;
7539 prog->engine = &nfa_regengine;
7540 prog->nstate = nstate;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007541 prog->has_zend = rex.nfa_has_zend;
7542 prog->has_backref = rex.nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02007543 prog->nsubexp = regnpar;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007544
Bram Moolenaara2947e22013-06-11 22:44:09 +02007545 nfa_postprocess(prog);
7546
Bram Moolenaard89616e2013-06-06 18:46:06 +02007547 prog->reganch = nfa_get_reganch(prog->start, 0);
7548 prog->regstart = nfa_get_regstart(prog->start, 0);
Bram Moolenaar473de612013-06-08 18:19:48 +02007549 prog->match_text = nfa_get_match_text(prog->start);
7550
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007551#ifdef ENABLE_LOG
7552 nfa_postfix_dump(expr, OK);
7553 nfa_dump(prog);
7554#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007555#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007556 // Remember whether this pattern has any \z specials in it.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007557 prog->reghasz = re_has_z;
7558#endif
Bram Moolenaar473de612013-06-08 18:19:48 +02007559 prog->pattern = vim_strsave(expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02007560#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007561 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007562#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007563
7564out:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007565 VIM_CLEAR(post_start);
7566 post_ptr = post_end = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007567 state_ptr = NULL;
7568 return (regprog_T *)prog;
7569
7570fail:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007571 VIM_CLEAR(prog);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007572#ifdef ENABLE_LOG
7573 nfa_postfix_dump(expr, FAIL);
7574#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007575#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007576 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007577#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007578 goto out;
7579}
7580
Bram Moolenaar473de612013-06-08 18:19:48 +02007581/*
7582 * Free a compiled regexp program, returned by nfa_regcomp().
7583 */
7584 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01007585nfa_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02007586{
7587 if (prog != NULL)
7588 {
7589 vim_free(((nfa_regprog_T *)prog)->match_text);
Bram Moolenaar473de612013-06-08 18:19:48 +02007590 vim_free(((nfa_regprog_T *)prog)->pattern);
Bram Moolenaar473de612013-06-08 18:19:48 +02007591 vim_free(prog);
7592 }
7593}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007594
7595/*
7596 * Match a regexp against a string.
7597 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7598 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaar2af78a12014-04-23 19:06:37 +02007599 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007600 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007601 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007602 */
7603 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01007604nfa_regexec_nl(
7605 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007606 char_u *line, // string to match against
7607 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01007608 int line_lbr)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007609{
Bram Moolenaar6100d022016-10-02 16:51:57 +02007610 rex.reg_match = rmp;
7611 rex.reg_mmatch = NULL;
7612 rex.reg_maxline = 0;
7613 rex.reg_line_lbr = line_lbr;
7614 rex.reg_buf = curbuf;
7615 rex.reg_win = NULL;
7616 rex.reg_ic = rmp->rm_ic;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007617 rex.reg_icombine = FALSE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007618 rex.reg_maxcol = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007619 return nfa_regexec_both(line, col, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007620}
7621
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007622
7623/*
7624 * Match a regexp against multiple lines.
7625 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7626 * Uses curbuf for line count and 'iskeyword'.
7627 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007628 * Return <= 0 if there is no match. Return number of lines contained in the
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007629 * match otherwise.
7630 *
7631 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7632 *
7633 * ! Also NOTE : match may actually be in another line. e.g.:
7634 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7635 *
7636 * +-------------------------+
7637 * |a |
7638 * |b |
7639 * |c |
7640 * | |
7641 * +-------------------------+
7642 *
7643 * then nfa_regexec_multi() returns 3. while the original
7644 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7645 *
7646 * FIXME if this behavior is not compatible.
7647 */
7648 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007649nfa_regexec_multi(
7650 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007651 win_T *win, // window in which to search or NULL
7652 buf_T *buf, // buffer in which to search
7653 linenr_T lnum, // nr of line to start looking for match
7654 colnr_T col, // column to start looking for match
7655 proftime_T *tm, // timeout limit or NULL
7656 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007657{
Bram Moolenaarf4140482020-02-15 23:06:45 +01007658 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007659 return nfa_regexec_both(NULL, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007660}
7661
7662#ifdef DEBUG
7663# undef ENABLE_LOG
7664#endif