blob: 9edca8191286c347589c7be565c84404a47c79c3 [file] [log] [blame]
Bram Moolenaaredf3f972016-08-29 22:49:24 +02001/* vi:set ts=8 sts=4 sw=4 noet:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002 *
3 * NFA regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 */
7
/*
 * Logging of NFA engine.
 *
 * The NFA engine can write four log files:
 * - Error log: Contains NFA engine's fatal errors.
 * - Dump log: Contains compiled NFA state machine's information.
 * - Run log: Contains information of matching procedure.
 * - Debug log: Contains detailed information of matching procedure. Can be
 *   disabled by undefining NFA_REGEXP_DEBUG_LOG.
 * The first one can also be used without debug mode.
 * The last three are enabled when compiled as debug mode and individually
 * disabled by commenting them out.
 * The log files can get quite big!
 * To disable all of this when compiling Vim for debugging, undefine DEBUG in
 * regexp.c
 */
#ifdef DEBUG
// File names of the NFA engine logs.  Each log can be switched off on its own
// by commenting out its define.
# define NFA_REGEXP_ERROR_LOG	"nfa_regexp_error.log"
// NOTE(review): ENABLE_LOG is only defined here; its users are outside this
// part of the file.
# define ENABLE_LOG
# define NFA_REGEXP_DUMP_LOG	"nfa_regexp_dump.log"
# define NFA_REGEXP_RUN_LOG	"nfa_regexp_run.log"
# define NFA_REGEXP_DEBUG_LOG	"nfa_regexp_debug.log"
#endif

// Added to one of the codes NFA_ANY .. NFA_NUPPER_IC to get the variant of
// that code that also includes a NL.  The range holds exactly 31 codes, so
// the shifted codes (NFA_FIRST_NL .. NFA_LAST_NL) follow it directly.
#define NFA_ADD_NL		31
34
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020035enum
36{
37 NFA_SPLIT = -1024,
38 NFA_MATCH,
Bram Moolenaar63d9e732019-12-05 21:10:38 +010039 NFA_EMPTY, // matches 0-length
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020040
Bram Moolenaar63d9e732019-12-05 21:10:38 +010041 NFA_START_COLL, // [abc] start
42 NFA_END_COLL, // [abc] end
43 NFA_START_NEG_COLL, // [^abc] start
44 NFA_END_NEG_COLL, // [^abc] end (postfix only)
45 NFA_RANGE, // range of the two previous items
46 // (postfix only)
47 NFA_RANGE_MIN, // low end of a range
48 NFA_RANGE_MAX, // high end of a range
Bram Moolenaar417bad22013-06-07 14:08:30 +020049
Bram Moolenaar63d9e732019-12-05 21:10:38 +010050 NFA_CONCAT, // concatenate two previous items (postfix
51 // only)
52 NFA_OR, // \| (postfix only)
53 NFA_STAR, // greedy * (postfix only)
54 NFA_STAR_NONGREEDY, // non-greedy * (postfix only)
55 NFA_QUEST, // greedy \? (postfix only)
56 NFA_QUEST_NONGREEDY, // non-greedy \? (postfix only)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020057
Bram Moolenaar63d9e732019-12-05 21:10:38 +010058 NFA_BOL, // ^ Begin line
59 NFA_EOL, // $ End line
60 NFA_BOW, // \< Begin word
61 NFA_EOW, // \> End word
62 NFA_BOF, // \%^ Begin file
63 NFA_EOF, // \%$ End file
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020064 NFA_NEWL,
Bram Moolenaar63d9e732019-12-05 21:10:38 +010065 NFA_ZSTART, // Used for \zs
66 NFA_ZEND, // Used for \ze
67 NFA_NOPEN, // Start of subexpression marked with \%(
68 NFA_NCLOSE, // End of subexpr. marked with \%( ... \)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020069 NFA_START_INVISIBLE,
Bram Moolenaara2947e22013-06-11 22:44:09 +020070 NFA_START_INVISIBLE_FIRST,
Bram Moolenaardecd9542013-06-07 16:31:50 +020071 NFA_START_INVISIBLE_NEG,
Bram Moolenaara2947e22013-06-11 22:44:09 +020072 NFA_START_INVISIBLE_NEG_FIRST,
Bram Moolenaar61602c52013-06-01 19:54:43 +020073 NFA_START_INVISIBLE_BEFORE,
Bram Moolenaara2947e22013-06-11 22:44:09 +020074 NFA_START_INVISIBLE_BEFORE_FIRST,
Bram Moolenaardecd9542013-06-07 16:31:50 +020075 NFA_START_INVISIBLE_BEFORE_NEG,
Bram Moolenaara2947e22013-06-11 22:44:09 +020076 NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
Bram Moolenaar87953742013-06-05 18:52:40 +020077 NFA_START_PATTERN,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020078 NFA_END_INVISIBLE,
Bram Moolenaardecd9542013-06-07 16:31:50 +020079 NFA_END_INVISIBLE_NEG,
Bram Moolenaar87953742013-06-05 18:52:40 +020080 NFA_END_PATTERN,
Bram Moolenaar63d9e732019-12-05 21:10:38 +010081 NFA_COMPOSING, // Next nodes in NFA are part of the
82 // composing multibyte char
83 NFA_END_COMPOSING, // End of a composing char in the NFA
84 NFA_ANY_COMPOSING, // \%C: Any composing characters.
85 NFA_OPT_CHARS, // \%[abc]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020086
Bram Moolenaar63d9e732019-12-05 21:10:38 +010087 // The following are used only in the postfix form, not in the NFA
88 NFA_PREV_ATOM_NO_WIDTH, // Used for \@=
89 NFA_PREV_ATOM_NO_WIDTH_NEG, // Used for \@!
90 NFA_PREV_ATOM_JUST_BEFORE, // Used for \@<=
91 NFA_PREV_ATOM_JUST_BEFORE_NEG, // Used for \@<!
92 NFA_PREV_ATOM_LIKE_PATTERN, // Used for \@>
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020093
Bram Moolenaar63d9e732019-12-05 21:10:38 +010094 NFA_BACKREF1, // \1
95 NFA_BACKREF2, // \2
96 NFA_BACKREF3, // \3
97 NFA_BACKREF4, // \4
98 NFA_BACKREF5, // \5
99 NFA_BACKREF6, // \6
100 NFA_BACKREF7, // \7
101 NFA_BACKREF8, // \8
102 NFA_BACKREF9, // \9
Bram Moolenaarefb23f22013-06-01 23:02:54 +0200103#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100104 NFA_ZREF1, // \z1
105 NFA_ZREF2, // \z2
106 NFA_ZREF3, // \z3
107 NFA_ZREF4, // \z4
108 NFA_ZREF5, // \z5
109 NFA_ZREF6, // \z6
110 NFA_ZREF7, // \z7
111 NFA_ZREF8, // \z8
112 NFA_ZREF9, // \z9
Bram Moolenaarefb23f22013-06-01 23:02:54 +0200113#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100114 NFA_SKIP, // Skip characters
Bram Moolenaar5714b802013-05-28 22:03:20 +0200115
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200116 NFA_MOPEN,
Bram Moolenaarefb23f22013-06-01 23:02:54 +0200117 NFA_MOPEN1,
118 NFA_MOPEN2,
119 NFA_MOPEN3,
120 NFA_MOPEN4,
121 NFA_MOPEN5,
122 NFA_MOPEN6,
123 NFA_MOPEN7,
124 NFA_MOPEN8,
125 NFA_MOPEN9,
126
127 NFA_MCLOSE,
128 NFA_MCLOSE1,
129 NFA_MCLOSE2,
130 NFA_MCLOSE3,
131 NFA_MCLOSE4,
132 NFA_MCLOSE5,
133 NFA_MCLOSE6,
134 NFA_MCLOSE7,
135 NFA_MCLOSE8,
136 NFA_MCLOSE9,
137
138#ifdef FEAT_SYN_HL
139 NFA_ZOPEN,
140 NFA_ZOPEN1,
141 NFA_ZOPEN2,
142 NFA_ZOPEN3,
143 NFA_ZOPEN4,
144 NFA_ZOPEN5,
145 NFA_ZOPEN6,
146 NFA_ZOPEN7,
147 NFA_ZOPEN8,
148 NFA_ZOPEN9,
149
150 NFA_ZCLOSE,
151 NFA_ZCLOSE1,
152 NFA_ZCLOSE2,
153 NFA_ZCLOSE3,
154 NFA_ZCLOSE4,
155 NFA_ZCLOSE5,
156 NFA_ZCLOSE6,
157 NFA_ZCLOSE7,
158 NFA_ZCLOSE8,
159 NFA_ZCLOSE9,
160#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200161
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100162 // NFA_FIRST_NL
163 NFA_ANY, // Match any one character.
164 NFA_IDENT, // Match identifier char
165 NFA_SIDENT, // Match identifier char but no digit
166 NFA_KWORD, // Match keyword char
167 NFA_SKWORD, // Match word char but no digit
168 NFA_FNAME, // Match file name char
169 NFA_SFNAME, // Match file name char but no digit
170 NFA_PRINT, // Match printable char
171 NFA_SPRINT, // Match printable char but no digit
172 NFA_WHITE, // Match whitespace char
173 NFA_NWHITE, // Match non-whitespace char
174 NFA_DIGIT, // Match digit char
175 NFA_NDIGIT, // Match non-digit char
176 NFA_HEX, // Match hex char
177 NFA_NHEX, // Match non-hex char
178 NFA_OCTAL, // Match octal char
179 NFA_NOCTAL, // Match non-octal char
180 NFA_WORD, // Match word char
181 NFA_NWORD, // Match non-word char
182 NFA_HEAD, // Match head char
183 NFA_NHEAD, // Match non-head char
184 NFA_ALPHA, // Match alpha char
185 NFA_NALPHA, // Match non-alpha char
186 NFA_LOWER, // Match lowercase char
187 NFA_NLOWER, // Match non-lowercase char
188 NFA_UPPER, // Match uppercase char
189 NFA_NUPPER, // Match non-uppercase char
190 NFA_LOWER_IC, // Match [a-z]
191 NFA_NLOWER_IC, // Match [^a-z]
192 NFA_UPPER_IC, // Match [A-Z]
193 NFA_NUPPER_IC, // Match [^A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200194
195 NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
196 NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,
Bram Moolenaar423532e2013-05-29 21:14:42 +0200197
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100198 NFA_CURSOR, // Match cursor pos
199 NFA_LNUM, // Match line number
200 NFA_LNUM_GT, // Match > line number
201 NFA_LNUM_LT, // Match < line number
202 NFA_COL, // Match cursor column
203 NFA_COL_GT, // Match > cursor column
204 NFA_COL_LT, // Match < cursor column
205 NFA_VCOL, // Match cursor virtual column
206 NFA_VCOL_GT, // Match > cursor virtual column
207 NFA_VCOL_LT, // Match < cursor virtual column
208 NFA_MARK, // Match mark
209 NFA_MARK_GT, // Match > mark
210 NFA_MARK_LT, // Match < mark
211 NFA_VISUAL, // Match Visual area
Bram Moolenaar423532e2013-05-29 21:14:42 +0200212
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100213 // Character classes [:alnum:] etc
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200214 NFA_CLASS_ALNUM,
215 NFA_CLASS_ALPHA,
216 NFA_CLASS_BLANK,
217 NFA_CLASS_CNTRL,
218 NFA_CLASS_DIGIT,
219 NFA_CLASS_GRAPH,
220 NFA_CLASS_LOWER,
221 NFA_CLASS_PRINT,
222 NFA_CLASS_PUNCT,
223 NFA_CLASS_SPACE,
224 NFA_CLASS_UPPER,
225 NFA_CLASS_XDIGIT,
226 NFA_CLASS_TAB,
227 NFA_CLASS_RETURN,
228 NFA_CLASS_BACKSPACE,
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100229 NFA_CLASS_ESCAPE,
230 NFA_CLASS_IDENT,
231 NFA_CLASS_KEYWORD,
232 NFA_CLASS_FNAME
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200233};
234
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100235// Keep in sync with classchars.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200236static int nfa_classcodes[] = {
237 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
238 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
239 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
240 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
241 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
242 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
243 NFA_UPPER, NFA_NUPPER
244};
245
Bram Moolenaar174a8482013-11-28 14:20:17 +0100246static char_u e_nul_found[] = N_("E865: (NFA) Regexp end encountered prematurely");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200247static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
Bram Moolenaara5483442019-02-17 20:17:02 +0100248static char_u e_ill_char_class[] = N_("E877: (NFA regexp) Invalid character class: %d");
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +0200249static char_u e_value_too_large[] = N_("E951: \\% value too large");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200250
Bram Moolenaar0270f382018-07-17 05:43:58 +0200251// Variables only used in nfa_regcomp() and descendants.
252static int nfa_re_flags; // re_flags passed to nfa_regcomp()
253static int *post_start; // holds the postfix form of r.e.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200254static int *post_end;
255static int *post_ptr;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100256
257// Set when the pattern should use the NFA engine.
258// E.g. [[:upper:]] only allows 8bit characters for BT engine,
259// while NFA engine handles multibyte characters correctly.
260static int wants_nfa;
261
Bram Moolenaar0270f382018-07-17 05:43:58 +0200262static int nstate; // Number of states in the NFA.
263static int istate; // Index in the state vector, used in alloc_state()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200264
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100265// If not NULL match must end at this position
Bram Moolenaar307aa162013-06-02 16:34:21 +0200266static save_se_T *nfa_endp = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200267
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100268// 0 for first call to nfa_regmatch(), 1 for recursive call.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +0200269static int nfa_ll_index = 0;
270
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100271static int realloc_post_list(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100272static int nfa_reg(int paren);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200273#ifdef DEBUG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100274static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200275#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100276static int match_follows(nfa_state_T *startstate, int depth);
277static int failure_chance(nfa_state_T *state, int depth);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200278
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100279// helper functions used when doing re2post() ... regatom() parsing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200280#define EMIT(c) do { \
Bram Moolenaar16299b52013-05-30 18:45:23 +0200281 if (post_ptr >= post_end && realloc_post_list() == FAIL) \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200282 return FAIL; \
283 *post_ptr++ = c; \
284 } while (0)
285
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200286/*
287 * Initialize internal variables before NFA compilation.
288 * Return OK on success, FAIL otherwise.
289 */
290 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100291nfa_regcomp_start(
292 char_u *expr,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100293 int re_flags) // see vim_regcomp()
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200294{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200295 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200296 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200297
298 nstate = 0;
299 istate = 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100300 // A reasonable estimation for maximum size
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200301 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200302
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100303 // Some items blow up in size, such as [A-z]. Add more space for that.
304 // When it is still not enough realloc_post_list() will be used.
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200305 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200306
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100307 // Size for postfix representation of expr.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200308 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200309
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200310 post_start = alloc(postfix_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200311 if (post_start == NULL)
312 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200313 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200314 post_end = post_start + nstate_max;
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100315 wants_nfa = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +0200316 rex.nfa_has_zend = FALSE;
317 rex.nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200318
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100319 // shared with BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200320 regcomp_start(expr, re_flags);
321
322 return OK;
323}
324
325/*
Bram Moolenaard89616e2013-06-06 18:46:06 +0200326 * Figure out if the NFA state list starts with an anchor, must match at start
327 * of the line.
328 */
329 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100330nfa_get_reganch(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200331{
332 nfa_state_T *p = start;
333
334 if (depth > 4)
335 return 0;
336
337 while (p != NULL)
338 {
339 switch (p->c)
340 {
341 case NFA_BOL:
342 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100343 return 1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200344
345 case NFA_ZSTART:
346 case NFA_ZEND:
347 case NFA_CURSOR:
348 case NFA_VISUAL:
349
350 case NFA_MOPEN:
351 case NFA_MOPEN1:
352 case NFA_MOPEN2:
353 case NFA_MOPEN3:
354 case NFA_MOPEN4:
355 case NFA_MOPEN5:
356 case NFA_MOPEN6:
357 case NFA_MOPEN7:
358 case NFA_MOPEN8:
359 case NFA_MOPEN9:
360 case NFA_NOPEN:
361#ifdef FEAT_SYN_HL
362 case NFA_ZOPEN:
363 case NFA_ZOPEN1:
364 case NFA_ZOPEN2:
365 case NFA_ZOPEN3:
366 case NFA_ZOPEN4:
367 case NFA_ZOPEN5:
368 case NFA_ZOPEN6:
369 case NFA_ZOPEN7:
370 case NFA_ZOPEN8:
371 case NFA_ZOPEN9:
372#endif
373 p = p->out;
374 break;
375
376 case NFA_SPLIT:
377 return nfa_get_reganch(p->out, depth + 1)
378 && nfa_get_reganch(p->out1, depth + 1);
379
380 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100381 return 0; // noooo
Bram Moolenaard89616e2013-06-06 18:46:06 +0200382 }
383 }
384 return 0;
385}
386
387/*
388 * Figure out if the NFA state list starts with a character which must match
389 * at start of the match.
390 */
391 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100392nfa_get_regstart(nfa_state_T *start, int depth)
Bram Moolenaard89616e2013-06-06 18:46:06 +0200393{
394 nfa_state_T *p = start;
395
396 if (depth > 4)
397 return 0;
398
399 while (p != NULL)
400 {
401 switch (p->c)
402 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100403 // all kinds of zero-width matches
Bram Moolenaard89616e2013-06-06 18:46:06 +0200404 case NFA_BOL:
405 case NFA_BOF:
406 case NFA_BOW:
407 case NFA_EOW:
408 case NFA_ZSTART:
409 case NFA_ZEND:
410 case NFA_CURSOR:
411 case NFA_VISUAL:
412 case NFA_LNUM:
413 case NFA_LNUM_GT:
414 case NFA_LNUM_LT:
415 case NFA_COL:
416 case NFA_COL_GT:
417 case NFA_COL_LT:
418 case NFA_VCOL:
419 case NFA_VCOL_GT:
420 case NFA_VCOL_LT:
421 case NFA_MARK:
422 case NFA_MARK_GT:
423 case NFA_MARK_LT:
424
425 case NFA_MOPEN:
426 case NFA_MOPEN1:
427 case NFA_MOPEN2:
428 case NFA_MOPEN3:
429 case NFA_MOPEN4:
430 case NFA_MOPEN5:
431 case NFA_MOPEN6:
432 case NFA_MOPEN7:
433 case NFA_MOPEN8:
434 case NFA_MOPEN9:
435 case NFA_NOPEN:
436#ifdef FEAT_SYN_HL
437 case NFA_ZOPEN:
438 case NFA_ZOPEN1:
439 case NFA_ZOPEN2:
440 case NFA_ZOPEN3:
441 case NFA_ZOPEN4:
442 case NFA_ZOPEN5:
443 case NFA_ZOPEN6:
444 case NFA_ZOPEN7:
445 case NFA_ZOPEN8:
446 case NFA_ZOPEN9:
447#endif
448 p = p->out;
449 break;
450
451 case NFA_SPLIT:
452 {
453 int c1 = nfa_get_regstart(p->out, depth + 1);
454 int c2 = nfa_get_regstart(p->out1, depth + 1);
455
456 if (c1 == c2)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100457 return c1; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200458 return 0;
459 }
460
461 default:
Bram Moolenaardecd9542013-06-07 16:31:50 +0200462 if (p->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100463 return p->c; // yes!
Bram Moolenaard89616e2013-06-06 18:46:06 +0200464 return 0;
465 }
466 }
467 return 0;
468}
469
470/*
Bram Moolenaar473de612013-06-08 18:19:48 +0200471 * Figure out if the NFA state list contains just literal text and nothing
Bram Moolenaare7766ee2013-06-08 22:30:03 +0200472 * else. If so return a string in allocated memory with what must match after
473 * regstart. Otherwise return NULL.
Bram Moolenaar473de612013-06-08 18:19:48 +0200474 */
475 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100476nfa_get_match_text(nfa_state_T *start)
Bram Moolenaar473de612013-06-08 18:19:48 +0200477{
478 nfa_state_T *p = start;
479 int len = 0;
480 char_u *ret;
481 char_u *s;
482
483 if (p->c != NFA_MOPEN)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100484 return NULL; // just in case
Bram Moolenaar473de612013-06-08 18:19:48 +0200485 p = p->out;
486 while (p->c > 0)
487 {
488 len += MB_CHAR2LEN(p->c);
489 p = p->out;
490 }
491 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
492 return NULL;
493
494 ret = alloc(len);
495 if (ret != NULL)
496 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100497 p = start->out->out; // skip first char, it goes into regstart
Bram Moolenaar473de612013-06-08 18:19:48 +0200498 s = ret;
499 while (p->c > 0)
500 {
Bram Moolenaar473de612013-06-08 18:19:48 +0200501 if (has_mbyte)
502 s += (*mb_char2bytes)(p->c, s);
503 else
Bram Moolenaar473de612013-06-08 18:19:48 +0200504 *s++ = p->c;
505 p = p->out;
506 }
507 *s = NUL;
508 }
509 return ret;
510}
511
512/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200513 * Allocate more space for post_start. Called when
514 * running above the estimated number of states.
515 */
516 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100517realloc_post_list(void)
Bram Moolenaar16299b52013-05-30 18:45:23 +0200518{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200519 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100520 int new_max;
Bram Moolenaar16299b52013-05-30 18:45:23 +0200521 int *new_start;
522 int *old_start;
523
Bram Moolenaar38f08e72019-02-20 22:04:32 +0100524 // For weird patterns the number of states can be very high. Increasing by
525 // 50% seems a reasonable compromise between memory use and speed.
526 new_max = nstate_max * 3 / 2;
Bram Moolenaarc799fe22019-05-28 23:08:19 +0200527 new_start = ALLOC_MULT(int, new_max);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200528 if (new_start == NULL)
529 return FAIL;
530 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
Bram Moolenaar16299b52013-05-30 18:45:23 +0200531 old_start = post_start;
532 post_start = new_start;
533 post_ptr = new_start + (post_ptr - old_start);
534 post_end = post_start + new_max;
535 vim_free(old_start);
536 return OK;
537}
538
539/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200540 * Search between "start" and "end" and try to recognize a
541 * character class in expanded form. For example [0-9].
542 * On success, return the id the character class to be emitted.
543 * On failure, return 0 (=FAIL)
544 * Start points to the first char of the range, while end should point
545 * to the closing brace.
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200546 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
547 * need to be interpreted as [a-zA-Z].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200548 */
549 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100550nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200551{
Bram Moolenaarf8115092013-06-04 17:47:05 +0200552# define CLASS_not 0x80
553# define CLASS_af 0x40
554# define CLASS_AF 0x20
555# define CLASS_az 0x10
556# define CLASS_AZ 0x08
557# define CLASS_o7 0x04
558# define CLASS_o9 0x02
559# define CLASS_underscore 0x01
560
561 int newl = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200562 char_u *p;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200563 int config = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200564
565 if (extra_newl == TRUE)
566 newl = TRUE;
567
568 if (*end != ']')
569 return FAIL;
570 p = start;
571 if (*p == '^')
572 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200573 config |= CLASS_not;
Bram Moolenaar01d89dd2013-06-03 19:41:06 +0200574 p++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200575 }
576
577 while (p < end)
578 {
579 if (p + 2 < end && *(p + 1) == '-')
580 {
581 switch (*p)
582 {
583 case '0':
584 if (*(p + 2) == '9')
585 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200586 config |= CLASS_o9;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200587 break;
588 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200589 if (*(p + 2) == '7')
590 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200591 config |= CLASS_o7;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200592 break;
593 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200594 return FAIL;
595
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200596 case 'a':
597 if (*(p + 2) == 'z')
598 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200599 config |= CLASS_az;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200600 break;
601 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200602 if (*(p + 2) == 'f')
603 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200604 config |= CLASS_af;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200605 break;
606 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200607 return FAIL;
608
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200609 case 'A':
610 if (*(p + 2) == 'Z')
611 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200612 config |= CLASS_AZ;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200613 break;
614 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200615 if (*(p + 2) == 'F')
616 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200617 config |= CLASS_AF;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200618 break;
619 }
Bram Moolenaarbb7943b2017-06-05 13:30:06 +0200620 return FAIL;
621
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200622 default:
623 return FAIL;
624 }
625 p += 3;
626 }
627 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
628 {
629 newl = TRUE;
630 p += 2;
631 }
632 else if (*p == '_')
633 {
Bram Moolenaarf8115092013-06-04 17:47:05 +0200634 config |= CLASS_underscore;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200635 p ++;
636 }
637 else if (*p == '\n')
638 {
639 newl = TRUE;
640 p ++;
641 }
642 else
643 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100644 } // while (p < end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200645
646 if (p != end)
647 return FAIL;
648
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200649 if (newl == TRUE)
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200650 extra_newl = NFA_ADD_NL;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200651
652 switch (config)
653 {
654 case CLASS_o9:
655 return extra_newl + NFA_DIGIT;
656 case CLASS_not | CLASS_o9:
657 return extra_newl + NFA_NDIGIT;
658 case CLASS_af | CLASS_AF | CLASS_o9:
659 return extra_newl + NFA_HEX;
660 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
661 return extra_newl + NFA_NHEX;
662 case CLASS_o7:
663 return extra_newl + NFA_OCTAL;
664 case CLASS_not | CLASS_o7:
665 return extra_newl + NFA_NOCTAL;
666 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
667 return extra_newl + NFA_WORD;
668 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
669 return extra_newl + NFA_NWORD;
670 case CLASS_az | CLASS_AZ | CLASS_underscore:
671 return extra_newl + NFA_HEAD;
672 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
673 return extra_newl + NFA_NHEAD;
674 case CLASS_az | CLASS_AZ:
675 return extra_newl + NFA_ALPHA;
676 case CLASS_not | CLASS_az | CLASS_AZ:
677 return extra_newl + NFA_NALPHA;
678 case CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200679 return extra_newl + NFA_LOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200680 case CLASS_not | CLASS_az:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200681 return extra_newl + NFA_NLOWER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200682 case CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200683 return extra_newl + NFA_UPPER_IC;
Bram Moolenaarf8115092013-06-04 17:47:05 +0200684 case CLASS_not | CLASS_AZ:
Bram Moolenaar1cfad522013-08-14 12:06:49 +0200685 return extra_newl + NFA_NUPPER_IC;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200686 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200687 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200688}
689
690/*
691 * Produce the bytes for equivalence class "c".
692 * Currently only handles latin1, latin9 and utf-8.
693 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
694 * equivalent to 'a OR b OR c'
695 *
696 * NOTE! When changing this function, also update reg_equi_class()
697 */
698 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100699nfa_emit_equi_class(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200700{
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200701#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200702
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200703 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
704 || STRCMP(p_enc, "iso-8859-15") == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200705 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200706#ifdef EBCDIC
707# define A_circumflex 0x62
708# define A_diaeresis 0x63
709# define A_grave 0x64
710# define A_acute 0x65
711# define A_virguilla 0x66
712# define A_ring 0x67
713# define C_cedilla 0x68
714# define E_acute 0x71
715# define E_circumflex 0x72
716# define E_diaeresis 0x73
717# define E_grave 0x74
718# define I_acute 0x75
719# define I_circumflex 0x76
720# define I_diaeresis 0x77
721# define I_grave 0x78
722# define N_virguilla 0x69
723# define O_circumflex 0xeb
724# define O_diaeresis 0xec
725# define O_grave 0xed
726# define O_acute 0xee
727# define O_virguilla 0xef
728# define O_slash 0x80
729# define U_circumflex 0xfb
730# define U_diaeresis 0xfc
731# define U_grave 0xfd
732# define U_acute 0xfe
733# define Y_acute 0xba
734# define a_grave 0x42
735# define a_acute 0x43
736# define a_circumflex 0x44
737# define a_virguilla 0x45
738# define a_diaeresis 0x46
739# define a_ring 0x47
740# define c_cedilla 0x48
741# define e_grave 0x51
742# define e_acute 0x52
743# define e_circumflex 0x53
744# define e_diaeresis 0x54
745# define i_grave 0x55
746# define i_acute 0x56
747# define i_circumflex 0x57
748# define i_diaeresis 0x58
749# define n_virguilla 0x49
750# define o_grave 0xcb
751# define o_acute 0xcc
752# define o_circumflex 0xcd
753# define o_virguilla 0xce
754# define o_diaeresis 0xcf
755# define o_slash 0x70
756# define u_grave 0xdb
757# define u_acute 0xdc
758# define u_circumflex 0xdd
759# define u_diaeresis 0xde
760# define y_acute 0x8d
761# define y_diaeresis 0xdf
762#else
763# define A_grave 0xc0
764# define A_acute 0xc1
765# define A_circumflex 0xc2
766# define A_virguilla 0xc3
767# define A_diaeresis 0xc4
768# define A_ring 0xc5
769# define C_cedilla 0xc7
770# define E_grave 0xc8
771# define E_acute 0xc9
772# define E_circumflex 0xca
773# define E_diaeresis 0xcb
774# define I_grave 0xcc
775# define I_acute 0xcd
776# define I_circumflex 0xce
777# define I_diaeresis 0xcf
778# define N_virguilla 0xd1
779# define O_grave 0xd2
780# define O_acute 0xd3
781# define O_circumflex 0xd4
782# define O_virguilla 0xd5
783# define O_diaeresis 0xd6
784# define O_slash 0xd8
785# define U_grave 0xd9
786# define U_acute 0xda
787# define U_circumflex 0xdb
788# define U_diaeresis 0xdc
789# define Y_acute 0xdd
790# define a_grave 0xe0
791# define a_acute 0xe1
792# define a_circumflex 0xe2
793# define a_virguilla 0xe3
794# define a_diaeresis 0xe4
795# define a_ring 0xe5
796# define c_cedilla 0xe7
797# define e_grave 0xe8
798# define e_acute 0xe9
799# define e_circumflex 0xea
800# define e_diaeresis 0xeb
801# define i_grave 0xec
802# define i_acute 0xed
803# define i_circumflex 0xee
804# define i_diaeresis 0xef
805# define n_virguilla 0xf1
806# define o_grave 0xf2
807# define o_acute 0xf3
808# define o_circumflex 0xf4
809# define o_virguilla 0xf5
810# define o_diaeresis 0xf6
811# define o_slash 0xf8
812# define u_grave 0xf9
813# define u_acute 0xfa
814# define u_circumflex 0xfb
815# define u_diaeresis 0xfc
816# define y_acute 0xfd
817# define y_diaeresis 0xff
818#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200819 switch (c)
820 {
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200821 case 'A': case A_grave: case A_acute: case A_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200822 case A_virguilla: case A_diaeresis: case A_ring:
823 case 0x100: case 0x102: case 0x104: case 0x1cd:
824 case 0x1de: case 0x1e0: case 0x1fa: case 0x200:
825 case 0x202: case 0x226: case 0x23a: case 0x1e00:
826 case 0x1ea0: case 0x1ea2: case 0x1ea4: case 0x1ea6:
827 case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
828 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
829 EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
830 EMIT2(A_circumflex) EMIT2(A_virguilla)
831 EMIT2(A_diaeresis) EMIT2(A_ring)
832 EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
833 EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
834 EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
835 EMIT2(0x226) EMIT2(0x23a) EMIT2(0x1e00)
836 EMIT2(0x1ea0) EMIT2(0x1ea2) EMIT2(0x1ea4)
837 EMIT2(0x1ea6) EMIT2(0x1ea8) EMIT2(0x1eaa)
838 EMIT2(0x1eac) EMIT2(0x1eae) EMIT2(0x1eb0)
839 EMIT2(0x1eb2) EMIT2(0x1eb6) EMIT2(0x1eb4)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200840 return OK;
841
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200842 case 'B': case 0x181: case 0x243: case 0x1e02:
843 case 0x1e04: case 0x1e06:
844 EMIT2('B')
845 EMIT2(0x181) EMIT2(0x243) EMIT2(0x1e02)
846 EMIT2(0x1e04) EMIT2(0x1e06)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200847 return OK;
848
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200849 case 'C': case C_cedilla: case 0x106: case 0x108:
850 case 0x10a: case 0x10c: case 0x187: case 0x23b:
851 case 0x1e08: case 0xa792:
852 EMIT2('C') EMIT2(C_cedilla)
853 EMIT2(0x106) EMIT2(0x108) EMIT2(0x10a)
854 EMIT2(0x10c) EMIT2(0x187) EMIT2(0x23b)
855 EMIT2(0x1e08) EMIT2(0xa792)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200856 return OK;
857
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200858 case 'D': case 0x10e: case 0x110: case 0x18a:
859 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
860 case 0x1e12:
861 EMIT2('D') EMIT2(0x10e) EMIT2(0x110) EMIT2(0x18a)
862 EMIT2(0x1e0a) EMIT2(0x1e0c) EMIT2(0x1e0e)
863 EMIT2(0x1e10) EMIT2(0x1e12)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200864 return OK;
865
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200866 case 'E': case E_grave: case E_acute: case E_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200867 case E_diaeresis: case 0x112: case 0x114: case 0x116:
868 case 0x118: case 0x11a: case 0x204: case 0x206:
869 case 0x228: case 0x246: case 0x1e14: case 0x1e16:
870 case 0x1e18: case 0x1e1a: case 0x1e1c: case 0x1eb8:
871 case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0:
872 case 0x1ec2: case 0x1ec4: case 0x1ec6:
873 EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
874 EMIT2(E_circumflex) EMIT2(E_diaeresis)
875 EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
876 EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
877 EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
878 EMIT2(0x1e14) EMIT2(0x1e16) EMIT2(0x1e18)
879 EMIT2(0x1e1a) EMIT2(0x1e1c) EMIT2(0x1eb8)
880 EMIT2(0x1eba) EMIT2(0x1ebc) EMIT2(0x1ebe)
881 EMIT2(0x1ec0) EMIT2(0x1ec2) EMIT2(0x1ec4)
882 EMIT2(0x1ec6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200883 return OK;
884
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200885 case 'F': case 0x191: case 0x1e1e: case 0xa798:
886 EMIT2('F') EMIT2(0x191) EMIT2(0x1e1e) EMIT2(0xa798)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200887 return OK;
888
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200889 case 'G': case 0x11c: case 0x11e: case 0x120:
890 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
891 case 0x1f4: case 0x1e20: case 0xa7a0:
892 EMIT2('G') EMIT2(0x11c) EMIT2(0x11e) EMIT2(0x120)
893 EMIT2(0x122) EMIT2(0x193) EMIT2(0x1e4)
894 EMIT2(0x1e6) EMIT2(0x1f4) EMIT2(0x1e20)
895 EMIT2(0xa7a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200896 return OK;
897
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200898 case 'H': case 0x124: case 0x126: case 0x21e:
899 case 0x1e22: case 0x1e24: case 0x1e26: case 0x1e28:
900 case 0x1e2a: case 0x2c67:
901 EMIT2('H') EMIT2(0x124) EMIT2(0x126) EMIT2(0x21e)
902 EMIT2(0x1e22) EMIT2(0x1e24) EMIT2(0x1e26)
903 EMIT2(0x1e28) EMIT2(0x1e2a) EMIT2(0x2c67)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200904 return OK;
905
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200906 case 'I': case I_grave: case I_acute: case I_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200907 case I_diaeresis: case 0x128: case 0x12a: case 0x12c:
908 case 0x12e: case 0x130: case 0x197: case 0x1cf:
909 case 0x208: case 0x20a: case 0x1e2c: case 0x1e2e:
910 case 0x1ec8: case 0x1eca:
911 EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
912 EMIT2(I_circumflex) EMIT2(I_diaeresis)
913 EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
914 EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
915 EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
916 EMIT2(0x1e2c) EMIT2(0x1e2e) EMIT2(0x1ec8)
917 EMIT2(0x1eca)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200918 return OK;
919
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200920 case 'J': case 0x134: case 0x248:
921 EMIT2('J') EMIT2(0x134) EMIT2(0x248)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200922 return OK;
923
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200924 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
925 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
926 EMIT2('K') EMIT2(0x136) EMIT2(0x198) EMIT2(0x1e8)
927 EMIT2(0x1e30) EMIT2(0x1e32) EMIT2(0x1e34)
928 EMIT2(0x2c69) EMIT2(0xa740)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200929 return OK;
930
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200931 case 'L': case 0x139: case 0x13b: case 0x13d:
932 case 0x13f: case 0x141: case 0x23d: case 0x1e36:
933 case 0x1e38: case 0x1e3a: case 0x1e3c: case 0x2c60:
934 EMIT2('L') EMIT2(0x139) EMIT2(0x13b)
935 EMIT2(0x13d) EMIT2(0x13f) EMIT2(0x141)
936 EMIT2(0x23d) EMIT2(0x1e36) EMIT2(0x1e38)
937 EMIT2(0x1e3a) EMIT2(0x1e3c) EMIT2(0x2c60)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200938 return OK;
939
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200940 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
941 EMIT2('M') EMIT2(0x1e3e) EMIT2(0x1e40)
942 EMIT2(0x1e42)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200943 return OK;
944
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200945 case 'N': case N_virguilla:
946 case 0x143: case 0x145: case 0x147: case 0x1f8:
947 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
948 case 0xa7a4:
949 EMIT2('N') EMIT2(N_virguilla)
950 EMIT2(0x143) EMIT2(0x145) EMIT2(0x147)
951 EMIT2(0x1f8) EMIT2(0x1e44) EMIT2(0x1e46)
952 EMIT2(0x1e48) EMIT2(0x1e4a) EMIT2(0xa7a4)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200953 return OK;
954
Bram Moolenaar2a6fa562016-04-04 20:55:59 +0200955 case 'O': case O_grave: case O_acute: case O_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200956 case O_virguilla: case O_diaeresis: case O_slash:
957 case 0x14c: case 0x14e: case 0x150: case 0x19f:
958 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec:
959 case 0x1fe: case 0x20c: case 0x20e: case 0x22a:
960 case 0x22c: case 0x22e: case 0x230: case 0x1e4c:
961 case 0x1e4e: case 0x1e50: case 0x1e52: case 0x1ecc:
962 case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
963 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc:
964 case 0x1ede: case 0x1ee0: case 0x1ee2:
965 EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
966 EMIT2(O_circumflex) EMIT2(O_virguilla)
967 EMIT2(O_diaeresis) EMIT2(O_slash)
968 EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
969 EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
970 EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
971 EMIT2(0x20c) EMIT2(0x20e) EMIT2(0x22a)
972 EMIT2(0x22c) EMIT2(0x22e) EMIT2(0x230)
973 EMIT2(0x1e4c) EMIT2(0x1e4e) EMIT2(0x1e50)
974 EMIT2(0x1e52) EMIT2(0x1ecc) EMIT2(0x1ece)
975 EMIT2(0x1ed0) EMIT2(0x1ed2) EMIT2(0x1ed4)
976 EMIT2(0x1ed6) EMIT2(0x1ed8) EMIT2(0x1eda)
977 EMIT2(0x1edc) EMIT2(0x1ede) EMIT2(0x1ee0)
978 EMIT2(0x1ee2)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200979 return OK;
980
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200981 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
982 EMIT2('P') EMIT2(0x1a4) EMIT2(0x1e54) EMIT2(0x1e56)
983 EMIT2(0x2c63)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200984 return OK;
985
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200986 case 'Q': case 0x24a:
987 EMIT2('Q') EMIT2(0x24a)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200988 return OK;
989
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200990 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
991 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
992 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
993 EMIT2('R') EMIT2(0x154) EMIT2(0x156) EMIT2(0x158)
994 EMIT2(0x210) EMIT2(0x212) EMIT2(0x24c) EMIT2(0x1e58)
995 EMIT2(0x1e5a) EMIT2(0x1e5c) EMIT2(0x1e5e) EMIT2(0x2c64)
996 EMIT2(0xa7a6)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +0200997 return OK;
998
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200999 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
1000 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
1001 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
1002 EMIT2('S') EMIT2(0x15a) EMIT2(0x15c) EMIT2(0x15e)
1003 EMIT2(0x160) EMIT2(0x218) EMIT2(0x1e60) EMIT2(0x1e62)
1004 EMIT2(0x1e64) EMIT2(0x1e66) EMIT2(0x1e68) EMIT2(0x2c7e)
1005 EMIT2(0xa7a8)
1006 return OK;
1007
1008 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
1009 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
1010 case 0x1e6e: case 0x1e70:
1011 EMIT2('T') EMIT2(0x162) EMIT2(0x164) EMIT2(0x166)
1012 EMIT2(0x1ac) EMIT2(0x1ae) EMIT2(0x23e) EMIT2(0x21a)
1013 EMIT2(0x1e6a) EMIT2(0x1e6c) EMIT2(0x1e6e) EMIT2(0x1e70)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001014 return OK;
1015
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001016 case 'U': case U_grave: case U_acute: case U_diaeresis:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001017 case U_circumflex: case 0x168: case 0x16a: case 0x16c:
1018 case 0x16e: case 0x170: case 0x172: case 0x1af:
1019 case 0x1d3: case 0x1d5: case 0x1d7: case 0x1d9:
1020 case 0x1db: case 0x214: case 0x216: case 0x244:
1021 case 0x1e72: case 0x1e74: case 0x1e76: case 0x1e78:
1022 case 0x1e7a: case 0x1ee4: case 0x1ee6: case 0x1ee8:
1023 case 0x1eea: case 0x1eec: case 0x1eee: case 0x1ef0:
1024 EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
1025 EMIT2(U_diaeresis) EMIT2(U_circumflex)
1026 EMIT2(0x168) EMIT2(0x16a)
1027 EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
1028 EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
1029 EMIT2(0x1d5) EMIT2(0x1d7) EMIT2(0x1d9)
1030 EMIT2(0x1db) EMIT2(0x214) EMIT2(0x216)
1031 EMIT2(0x244) EMIT2(0x1e72) EMIT2(0x1e74)
1032 EMIT2(0x1e76) EMIT2(0x1e78) EMIT2(0x1e7a)
1033 EMIT2(0x1ee4) EMIT2(0x1ee6) EMIT2(0x1ee8)
1034 EMIT2(0x1eea) EMIT2(0x1eec) EMIT2(0x1eee)
1035 EMIT2(0x1ef0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001036 return OK;
1037
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001038 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
1039 EMIT2('V') EMIT2(0x1b2) EMIT2(0x1e7c) EMIT2(0x1e7e)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001040 return OK;
1041
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001042 case 'W': case 0x174: case 0x1e80: case 0x1e82: case 0x1e84:
1043 case 0x1e86: case 0x1e88:
1044 EMIT2('W') EMIT2(0x174) EMIT2(0x1e80) EMIT2(0x1e82)
1045 EMIT2(0x1e84) EMIT2(0x1e86) EMIT2(0x1e88)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001046 return OK;
1047
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001048 case 'X': case 0x1e8a: case 0x1e8c:
1049 EMIT2('X') EMIT2(0x1e8a) EMIT2(0x1e8c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001050 return OK;
1051
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001052 case 'Y': case Y_acute: case 0x176: case 0x178:
1053 case 0x1b3: case 0x232: case 0x24e: case 0x1e8e:
1054 case 0x1ef2: case 0x1ef4: case 0x1ef6: case 0x1ef8:
1055 EMIT2('Y') EMIT2(Y_acute)
1056 EMIT2(0x176) EMIT2(0x178) EMIT2(0x1b3)
1057 EMIT2(0x232) EMIT2(0x24e) EMIT2(0x1e8e)
1058 EMIT2(0x1ef2) EMIT2(0x1ef4) EMIT2(0x1ef6)
1059 EMIT2(0x1ef8)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001060 return OK;
1061
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001062 case 'Z': case 0x179: case 0x17b: case 0x17d:
1063 case 0x1b5: case 0x1e90: case 0x1e92: case 0x1e94:
1064 case 0x2c6b:
1065 EMIT2('Z') EMIT2(0x179) EMIT2(0x17b) EMIT2(0x17d)
1066 EMIT2(0x1b5) EMIT2(0x1e90) EMIT2(0x1e92)
1067 EMIT2(0x1e94) EMIT2(0x2c6b)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001068 return OK;
1069
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001070 case 'a': case a_grave: case a_acute: case a_circumflex:
1071 case a_virguilla: case a_diaeresis: case a_ring:
1072 case 0x101: case 0x103: case 0x105: case 0x1ce:
1073 case 0x1df: case 0x1e1: case 0x1fb: case 0x201:
1074 case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
1075 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
1076 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
1077 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
1078 case 0x1eb7: case 0x2c65:
1079 EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
1080 EMIT2(a_circumflex) EMIT2(a_virguilla)
1081 EMIT2(a_diaeresis) EMIT2(a_ring)
1082 EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
1083 EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
1084 EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
1085 EMIT2(0x227) EMIT2(0x1d8f) EMIT2(0x1e01)
1086 EMIT2(0x1e9a) EMIT2(0x1ea1) EMIT2(0x1ea3)
1087 EMIT2(0x1ea5) EMIT2(0x1ea7) EMIT2(0x1ea9)
1088 EMIT2(0x1eab) EMIT2(0x1ead) EMIT2(0x1eaf)
1089 EMIT2(0x1eb1) EMIT2(0x1eb3) EMIT2(0x1eb5)
1090 EMIT2(0x1eb7) EMIT2(0x2c65)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001091 return OK;
1092
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001093 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
1094 case 0x1e03: case 0x1e05: case 0x1e07:
1095 EMIT2('b') EMIT2(0x180) EMIT2(0x253) EMIT2(0x1d6c)
1096 EMIT2(0x1d80) EMIT2(0x1e03) EMIT2(0x1e05) EMIT2(0x1e07)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001097 return OK;
1098
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001099 case 'c': case c_cedilla: case 0x107: case 0x109: case 0x10b:
1100 case 0x10d: case 0x188: case 0x23c: case 0x1e09: case 0xa793:
1101 case 0xa794:
1102 EMIT2('c') EMIT2(c_cedilla)
1103 EMIT2(0x107) EMIT2(0x109) EMIT2(0x10b)
1104 EMIT2(0x10d) EMIT2(0x188) EMIT2(0x23c)
1105 EMIT2(0x1e09) EMIT2(0xa793) EMIT2(0xa794)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001106 return OK;
1107
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001108 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
1109 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d: case 0x1e0f:
1110 case 0x1e11: case 0x1e13:
1111 EMIT2('d') EMIT2(0x10f) EMIT2(0x111)
1112 EMIT2(0x257) EMIT2(0x1d6d) EMIT2(0x1d81)
1113 EMIT2(0x1d91) EMIT2(0x1e0b) EMIT2(0x1e0d)
1114 EMIT2(0x1e0f) EMIT2(0x1e11) EMIT2(0x1e13)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001115 return OK;
1116
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001117 case 'e': case e_grave: case e_acute: case e_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001118 case e_diaeresis: case 0x113: case 0x115: case 0x117:
1119 case 0x119: case 0x11b: case 0x205: case 0x207:
1120 case 0x229: case 0x247: case 0x1d92: case 0x1e15:
1121 case 0x1e17: case 0x1e19: case 0x1e1b: case 0x1e1d:
1122 case 0x1eb9: case 0x1ebb: case 0x1ebd: case 0x1ebf:
1123 case 0x1ec1: case 0x1ec3: case 0x1ec5: case 0x1ec7:
1124 EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
1125 EMIT2(e_circumflex) EMIT2(e_diaeresis)
1126 EMIT2(0x113) EMIT2(0x115)
1127 EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
1128 EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
1129 EMIT2(0x247) EMIT2(0x1d92) EMIT2(0x1e15)
1130 EMIT2(0x1e17) EMIT2(0x1e19) EMIT2(0x1e1b)
1131 EMIT2(0x1e1d) EMIT2(0x1eb9) EMIT2(0x1ebb)
1132 EMIT2(0x1ebd) EMIT2(0x1ebf) EMIT2(0x1ec1)
1133 EMIT2(0x1ec3) EMIT2(0x1ec5) EMIT2(0x1ec7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001134 return OK;
1135
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001136 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
1137 case 0x1e1f: case 0xa799:
1138 EMIT2('f') EMIT2(0x192) EMIT2(0x1d6e) EMIT2(0x1d82)
1139 EMIT2(0x1e1f) EMIT2(0xa799)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001140 return OK;
1141
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001142 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
1143 case 0x1e5: case 0x1e7: case 0x1f5: case 0x260: case 0x1d83:
1144 case 0x1e21: case 0xa7a1:
1145 EMIT2('g') EMIT2(0x11d) EMIT2(0x11f) EMIT2(0x121)
1146 EMIT2(0x123) EMIT2(0x1e5) EMIT2(0x1e7)
1147 EMIT2(0x1f5) EMIT2(0x260) EMIT2(0x1d83)
1148 EMIT2(0x1e21) EMIT2(0xa7a1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001149 return OK;
1150
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001151 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
1152 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
1153 case 0x1e96: case 0x2c68: case 0xa795:
1154 EMIT2('h') EMIT2(0x125) EMIT2(0x127) EMIT2(0x21f)
1155 EMIT2(0x1e23) EMIT2(0x1e25) EMIT2(0x1e27)
1156 EMIT2(0x1e29) EMIT2(0x1e2b) EMIT2(0x1e96)
1157 EMIT2(0x2c68) EMIT2(0xa795)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001158 return OK;
1159
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001160 case 'i': case i_grave: case i_acute: case i_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001161 case i_diaeresis: case 0x129: case 0x12b: case 0x12d:
1162 case 0x12f: case 0x1d0: case 0x209: case 0x20b:
1163 case 0x268: case 0x1d96: case 0x1e2d: case 0x1e2f:
1164 case 0x1ec9: case 0x1ecb:
1165 EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
1166 EMIT2(i_circumflex) EMIT2(i_diaeresis)
1167 EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
1168 EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
1169 EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
1170 EMIT2(0x1e2d) EMIT2(0x1e2f) EMIT2(0x1ec9)
1171 EMIT2(0x1ecb) EMIT2(0x1ecb)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001172 return OK;
1173
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001174 case 'j': case 0x135: case 0x1f0: case 0x249:
1175 EMIT2('j') EMIT2(0x135) EMIT2(0x1f0) EMIT2(0x249)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001176 return OK;
1177
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001178 case 'k': case 0x137: case 0x199: case 0x1e9: case 0x1d84:
1179 case 0x1e31: case 0x1e33: case 0x1e35: case 0x2c6a: case 0xa741:
1180 EMIT2('k') EMIT2(0x137) EMIT2(0x199) EMIT2(0x1e9)
1181 EMIT2(0x1d84) EMIT2(0x1e31) EMIT2(0x1e33)
1182 EMIT2(0x1e35) EMIT2(0x2c6a) EMIT2(0xa741)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001183 return OK;
1184
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001185 case 'l': case 0x13a: case 0x13c: case 0x13e: case 0x140:
1186 case 0x142: case 0x19a: case 0x1e37: case 0x1e39: case 0x1e3b:
1187 case 0x1e3d: case 0x2c61:
1188 EMIT2('l') EMIT2(0x13a) EMIT2(0x13c)
1189 EMIT2(0x13e) EMIT2(0x140) EMIT2(0x142)
1190 EMIT2(0x19a) EMIT2(0x1e37) EMIT2(0x1e39)
1191 EMIT2(0x1e3b) EMIT2(0x1e3d) EMIT2(0x2c61)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001192 return OK;
1193
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001194 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
1195 EMIT2('m') EMIT2(0x1d6f) EMIT2(0x1e3f)
1196 EMIT2(0x1e41) EMIT2(0x1e43)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001197 return OK;
1198
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001199 case 'n': case n_virguilla: case 0x144: case 0x146: case 0x148:
1200 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87: case 0x1e45:
1201 case 0x1e47: case 0x1e49: case 0x1e4b: case 0xa7a5:
1202 EMIT2('n') EMIT2(n_virguilla)
1203 EMIT2(0x144) EMIT2(0x146) EMIT2(0x148)
1204 EMIT2(0x149) EMIT2(0x1f9) EMIT2(0x1d70)
1205 EMIT2(0x1d87) EMIT2(0x1e45) EMIT2(0x1e47)
1206 EMIT2(0x1e49) EMIT2(0x1e4b) EMIT2(0xa7a5)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001207 return OK;
1208
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001209 case 'o': case o_grave: case o_acute: case o_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001210 case o_virguilla: case o_diaeresis: case o_slash:
1211 case 0x14d: case 0x14f: case 0x151: case 0x1a1:
1212 case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
1213 case 0x20d: case 0x20f: case 0x22b: case 0x22d:
1214 case 0x22f: case 0x231: case 0x275: case 0x1e4d:
1215 case 0x1e4f: case 0x1e51: case 0x1e53: case 0x1ecd:
1216 case 0x1ecf: case 0x1ed1: case 0x1ed3: case 0x1ed5:
1217 case 0x1ed7: case 0x1ed9: case 0x1edb: case 0x1edd:
1218 case 0x1edf: case 0x1ee1: case 0x1ee3:
1219 EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
1220 EMIT2(o_circumflex) EMIT2(o_virguilla)
1221 EMIT2(o_diaeresis) EMIT2(o_slash)
1222 EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
1223 EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
1224 EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
1225 EMIT2(0x20f) EMIT2(0x22b) EMIT2(0x22d)
1226 EMIT2(0x22f) EMIT2(0x231) EMIT2(0x275)
1227 EMIT2(0x1e4d) EMIT2(0x1e4f) EMIT2(0x1e51)
1228 EMIT2(0x1e53) EMIT2(0x1ecd) EMIT2(0x1ecf)
1229 EMIT2(0x1ed1) EMIT2(0x1ed3) EMIT2(0x1ed5)
1230 EMIT2(0x1ed7) EMIT2(0x1ed9) EMIT2(0x1edb)
1231 EMIT2(0x1edd) EMIT2(0x1edf) EMIT2(0x1ee1)
1232 EMIT2(0x1ee3)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001233 return OK;
1234
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001235 case 'p': case 0x1a5: case 0x1d71: case 0x1d7d: case 0x1d88:
1236 case 0x1e55: case 0x1e57:
1237 EMIT2('p') EMIT2(0x1a5) EMIT2(0x1d71) EMIT2(0x1d7d)
1238 EMIT2(0x1d88) EMIT2(0x1e55) EMIT2(0x1e57)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001239 return OK;
1240
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001241 case 'q': case 0x24b: case 0x2a0:
1242 EMIT2('q') EMIT2(0x24b) EMIT2(0x2a0)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001243 return OK;
1244
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001245 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
1246 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
1247 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
1248 case 0xa7a7:
1249 EMIT2('r') EMIT2(0x155) EMIT2(0x157) EMIT2(0x159)
1250 EMIT2(0x211) EMIT2(0x213) EMIT2(0x24d) EMIT2(0x27d)
1251 EMIT2(0x1d72) EMIT2(0x1d73) EMIT2(0x1d89) EMIT2(0x1e59)
1252 EMIT2(0x1e5b) EMIT2(0x1e5d) EMIT2(0x1e5f) EMIT2(0xa7a7)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001253 return OK;
1254
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001255 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
1256 case 0x219: case 0x23f: case 0x1d74: case 0x1d8a: case 0x1e61:
1257 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
1258 EMIT2('s') EMIT2(0x15b) EMIT2(0x15d) EMIT2(0x15f)
1259 EMIT2(0x161) EMIT2(0x219) EMIT2(0x23f) EMIT2(0x1d74)
1260 EMIT2(0x1d8a) EMIT2(0x1e61) EMIT2(0x1e63) EMIT2(0x1e65)
1261 EMIT2(0x1e67) EMIT2(0x1e69) EMIT2(0xa7a9)
1262 return OK;
1263
1264 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
1265 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
1266 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
1267 EMIT2('t') EMIT2(0x163) EMIT2(0x165) EMIT2(0x167)
1268 EMIT2(0x1ab) EMIT2(0x1ad) EMIT2(0x21b) EMIT2(0x288)
1269 EMIT2(0x1d75) EMIT2(0x1e6b) EMIT2(0x1e6d) EMIT2(0x1e6f)
1270 EMIT2(0x1e71) EMIT2(0x1e97) EMIT2(0x2c66)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001271 return OK;
1272
Bram Moolenaar2a6fa562016-04-04 20:55:59 +02001273 case 'u': case u_grave: case u_acute: case u_circumflex:
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001274 case u_diaeresis: case 0x169: case 0x16b: case 0x16d:
1275 case 0x16f: case 0x171: case 0x173: case 0x1b0: case 0x1d4:
1276 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc: case 0x215:
1277 case 0x217: case 0x289: case 0x1d7e: case 0x1d99: case 0x1e73:
1278 case 0x1e75: case 0x1e77: case 0x1e79: case 0x1e7b:
1279 case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
1280 case 0x1eed: case 0x1eef: case 0x1ef1:
1281 EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
1282 EMIT2(u_circumflex) EMIT2(u_diaeresis)
1283 EMIT2(0x169) EMIT2(0x16b)
1284 EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
1285 EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
1286 EMIT2(0x215) EMIT2(0x217) EMIT2(0x1b0)
1287 EMIT2(0x1d4) EMIT2(0x1da) EMIT2(0x1dc)
1288 EMIT2(0x289) EMIT2(0x1e73) EMIT2(0x1d7e)
1289 EMIT2(0x1d99) EMIT2(0x1e75) EMIT2(0x1e77)
1290 EMIT2(0x1e79) EMIT2(0x1e7b) EMIT2(0x1ee5)
1291 EMIT2(0x1ee7) EMIT2(0x1ee9) EMIT2(0x1eeb)
1292 EMIT2(0x1eed) EMIT2(0x1eef) EMIT2(0x1ef1)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001293 return OK;
1294
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001295 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
1296 EMIT2('v') EMIT2(0x28b) EMIT2(0x1d8c) EMIT2(0x1e7d)
1297 EMIT2(0x1e7f)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001298 return OK;
1299
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001300 case 'w': case 0x175: case 0x1e81: case 0x1e83: case 0x1e85:
1301 case 0x1e87: case 0x1e89: case 0x1e98:
1302 EMIT2('w') EMIT2(0x175) EMIT2(0x1e81) EMIT2(0x1e83)
1303 EMIT2(0x1e85) EMIT2(0x1e87) EMIT2(0x1e89) EMIT2(0x1e98)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001304 return OK;
1305
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001306 case 'x': case 0x1e8b: case 0x1e8d:
1307 EMIT2('x') EMIT2(0x1e8b) EMIT2(0x1e8d)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001308 return OK;
1309
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001310 case 'y': case y_acute: case y_diaeresis: case 0x177:
1311 case 0x1b4: case 0x233: case 0x24f: case 0x1e8f:
1312 case 0x1e99: case 0x1ef3: case 0x1ef5: case 0x1ef7:
1313 case 0x1ef9:
1314 EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
1315 EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
1316 EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
1317 EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001318 return OK;
1319
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001320 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1321 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1322 case 0x1e95: case 0x2c6c:
1323 EMIT2('z') EMIT2(0x17a) EMIT2(0x17c) EMIT2(0x17e)
1324 EMIT2(0x1b6) EMIT2(0x1d76) EMIT2(0x1d8e) EMIT2(0x1e91)
1325 EMIT2(0x1e93) EMIT2(0x1e95) EMIT2(0x2c6c)
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001326 return OK;
1327
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001328 // default: character itself
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001329 }
1330 }
1331
Bram Moolenaare6a2fa62013-09-19 17:00:20 +02001332 EMIT2(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001333 return OK;
1334#undef EMIT2
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001335#undef EMIT2
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001336}
1337
1338/*
1339 * Code to parse regular expression.
1340 *
1341 * We try to reuse parsing functions in regexp.c to
1342 * minimize surprise and keep the syntax consistent.
1343 */
1344
1345/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001346 * Parse the lowest level.
1347 *
1348 * An atom can be one of a long list of items. Many atoms match one character
1349 * in the text. It is often an ordinary character or a character class.
1350 * Parentheses can be used to make a pattern into an atom. The "\z(\)" construct
1351 * is only for syntax highlighting.
1352 *
1353 * atom ::= ordinary-atom
1354 * or \( pattern \)
1355 * or \%( pattern \)
1356 * or \z( pattern \)
1357 */
1358 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001359nfa_regatom(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001360{
1361 int c;
1362 int charclass;
1363 int equiclass;
1364 int collclass;
1365 int got_coll_char;
1366 char_u *p;
1367 char_u *endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001368 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001369 int extra = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001370 int emit_range;
1371 int negated;
1372 int result;
1373 int startc = -1;
1374 int endc = -1;
1375 int oldstartc = -1;
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001376 int save_prev_at_start = prev_at_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001377
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001378 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001379 switch (c)
1380 {
Bram Moolenaar47196582013-05-25 22:04:23 +02001381 case NUL:
Bram Moolenaar174a8482013-11-28 14:20:17 +01001382 EMSG_RET_FAIL(_(e_nul_found));
Bram Moolenaar47196582013-05-25 22:04:23 +02001383
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001384 case Magic('^'):
1385 EMIT(NFA_BOL);
1386 break;
1387
1388 case Magic('$'):
1389 EMIT(NFA_EOL);
1390#if defined(FEAT_SYN_HL) || defined(PROTO)
1391 had_eol = TRUE;
1392#endif
1393 break;
1394
1395 case Magic('<'):
1396 EMIT(NFA_BOW);
1397 break;
1398
1399 case Magic('>'):
1400 EMIT(NFA_EOW);
1401 break;
1402
1403 case Magic('_'):
1404 c = no_Magic(getchr());
Bram Moolenaar174a8482013-11-28 14:20:17 +01001405 if (c == NUL)
1406 EMSG_RET_FAIL(_(e_nul_found));
1407
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001408 if (c == '^') // "\_^" is start-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001409 {
1410 EMIT(NFA_BOL);
1411 break;
1412 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001413 if (c == '$') // "\_$" is end-of-line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001414 {
1415 EMIT(NFA_EOL);
1416#if defined(FEAT_SYN_HL) || defined(PROTO)
1417 had_eol = TRUE;
1418#endif
1419 break;
1420 }
1421
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001422 extra = NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001423
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001424 // "\_[" is collection plus newline
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001425 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001426 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001427
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001428 // "\_x" is character class plus newline
1429 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001430
1431 /*
1432 * Character classes.
1433 */
1434 case Magic('.'):
1435 case Magic('i'):
1436 case Magic('I'):
1437 case Magic('k'):
1438 case Magic('K'):
1439 case Magic('f'):
1440 case Magic('F'):
1441 case Magic('p'):
1442 case Magic('P'):
1443 case Magic('s'):
1444 case Magic('S'):
1445 case Magic('d'):
1446 case Magic('D'):
1447 case Magic('x'):
1448 case Magic('X'):
1449 case Magic('o'):
1450 case Magic('O'):
1451 case Magic('w'):
1452 case Magic('W'):
1453 case Magic('h'):
1454 case Magic('H'):
1455 case Magic('a'):
1456 case Magic('A'):
1457 case Magic('l'):
1458 case Magic('L'):
1459 case Magic('u'):
1460 case Magic('U'):
1461 p = vim_strchr(classchars, no_Magic(c));
1462 if (p == NULL)
1463 {
Bram Moolenaar174a8482013-11-28 14:20:17 +01001464 if (extra == NFA_ADD_NL)
1465 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001466 semsg(_(e_ill_char_class), c);
Bram Moolenaar174a8482013-11-28 14:20:17 +01001467 rc_did_emsg = TRUE;
1468 return FAIL;
1469 }
Bram Moolenaarb5443cc2019-01-15 20:19:40 +01001470 siemsg("INTERNAL: Unknown character class char: %d", c);
Bram Moolenaar5714b802013-05-28 22:03:20 +02001471 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001472 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01001473
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001474 // When '.' is followed by a composing char ignore the dot, so that
1475 // the composing char is matched here.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001476 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1477 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001478 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001479 c = getchr();
1480 goto nfa_do_multibyte;
1481 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001482 EMIT(nfa_classcodes[p - classchars]);
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001483 if (extra == NFA_ADD_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001484 {
1485 EMIT(NFA_NEWL);
1486 EMIT(NFA_OR);
1487 regflags |= RF_HASNL;
1488 }
1489 break;
1490
1491 case Magic('n'):
1492 if (reg_string)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001493 // In a string "\n" matches a newline character.
Bram Moolenaar417bad22013-06-07 14:08:30 +02001494 EMIT(NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001495 else
1496 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001497 // In buffer text "\n" matches the end of a line.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001498 EMIT(NFA_NEWL);
1499 regflags |= RF_HASNL;
1500 }
1501 break;
1502
1503 case Magic('('):
1504 if (nfa_reg(REG_PAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001505 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001506 break;
1507
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001508 case Magic('|'):
1509 case Magic('&'):
1510 case Magic(')'):
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001511 semsg(_(e_misplaced), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001512 return FAIL;
1513
1514 case Magic('='):
1515 case Magic('?'):
1516 case Magic('+'):
1517 case Magic('@'):
1518 case Magic('*'):
1519 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001520 // these should follow an atom, not form an atom
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001521 semsg(_(e_misplaced), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001522 return FAIL;
1523
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001524 case Magic('~'):
1525 {
1526 char_u *lp;
1527
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001528 // Previous substitute pattern.
1529 // Generated as "\%(pattern\)".
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001530 if (reg_prev_sub == NULL)
1531 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001532 emsg(_(e_nopresub));
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001533 return FAIL;
1534 }
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001535 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp))
Bram Moolenaarf18fb7a2013-06-02 22:08:03 +02001536 {
1537 EMIT(PTR2CHAR(lp));
1538 if (lp != reg_prev_sub)
1539 EMIT(NFA_CONCAT);
1540 }
1541 EMIT(NFA_NOPEN);
1542 break;
1543 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001544
Bram Moolenaar428e9872013-05-30 17:05:39 +02001545 case Magic('1'):
1546 case Magic('2'):
1547 case Magic('3'):
1548 case Magic('4'):
1549 case Magic('5'):
1550 case Magic('6'):
1551 case Magic('7'):
1552 case Magic('8'):
1553 case Magic('9'):
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001554 {
1555 int refnum = no_Magic(c) - '1';
1556
1557 if (!seen_endbrace(refnum + 1))
1558 return FAIL;
1559 EMIT(NFA_BACKREF1 + refnum);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001560 rex.nfa_has_backref = TRUE;
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +02001561 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02001562 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001563
1564 case Magic('z'):
1565 c = no_Magic(getchr());
1566 switch (c)
1567 {
1568 case 's':
1569 EMIT(NFA_ZSTART);
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001570 if (re_mult_next("\\zs") == FAIL)
1571 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001572 break;
1573 case 'e':
1574 EMIT(NFA_ZEND);
Bram Moolenaar0270f382018-07-17 05:43:58 +02001575 rex.nfa_has_zend = TRUE;
Bram Moolenaar2d46e602014-08-29 11:56:32 +02001576 if (re_mult_next("\\ze") == FAIL)
1577 return FAIL;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02001578 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001579#ifdef FEAT_SYN_HL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001580 case '1':
1581 case '2':
1582 case '3':
1583 case '4':
1584 case '5':
1585 case '6':
1586 case '7':
1587 case '8':
1588 case '9':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001589 // \z1...\z9
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001590 if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar5de820b2013-06-02 15:01:57 +02001591 EMSG_RET_FAIL(_(e_z1_not_allowed));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001592 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001593 // No need to set rex.nfa_has_backref, the sub-matches don't
1594 // change when \z1 .. \z9 matches or not.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001595 re_has_z = REX_USE;
1596 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001597 case '(':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001598 // \z(
Bram Moolenaarbcf94422018-06-23 14:21:42 +02001599 if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar5de820b2013-06-02 15:01:57 +02001600 EMSG_RET_FAIL(_(e_z_not_allowed));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001601 if (nfa_reg(REG_ZPAREN) == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001602 return FAIL; // cascaded error
Bram Moolenaarefb23f22013-06-01 23:02:54 +02001603 re_has_z = REX_SET;
1604 break;
1605#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001606 default:
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001607 semsg(_("E867: (NFA) Unknown operator '\\z%c'"),
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001608 no_Magic(c));
1609 return FAIL;
1610 }
1611 break;
1612
1613 case Magic('%'):
1614 c = no_Magic(getchr());
1615 switch (c)
1616 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001617 // () without a back reference
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001618 case '(':
1619 if (nfa_reg(REG_NPAREN) == FAIL)
1620 return FAIL;
1621 EMIT(NFA_NOPEN);
1622 break;
1623
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001624 case 'd': // %d123 decimal
1625 case 'o': // %o123 octal
1626 case 'x': // %xab hex 2
1627 case 'u': // %uabcd hex 4
1628 case 'U': // %U1234abcd hex 8
Bram Moolenaar47196582013-05-25 22:04:23 +02001629 {
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001630 long nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001631
Bram Moolenaar47196582013-05-25 22:04:23 +02001632 switch (c)
1633 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001634 case 'd': nr = getdecchrs(); break;
1635 case 'o': nr = getoctchrs(); break;
1636 case 'x': nr = gethexchrs(2); break;
1637 case 'u': nr = gethexchrs(4); break;
1638 case 'U': nr = gethexchrs(8); break;
1639 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +02001640 }
1641
Bram Moolenaar527a2d82019-02-21 22:28:51 +01001642 if (nr < 0 || nr > INT_MAX)
Bram Moolenaar47196582013-05-25 22:04:23 +02001643 EMSG2_RET_FAIL(
1644 _("E678: Invalid character after %s%%[dxouU]"),
1645 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001646 // A NUL is stored in the text as NL
1647 // TODO: what if a composing character follows?
Bram Moolenaar595cad22013-09-22 13:57:24 +02001648 EMIT(nr == 0 ? 0x0a : nr);
Bram Moolenaar47196582013-05-25 22:04:23 +02001649 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001650 break;
1651
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001652 // Catch \%^ and \%$ regardless of where they appear in the
1653 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001654 case '^':
1655 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001656 break;
1657
1658 case '$':
1659 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001660 break;
1661
1662 case '#':
Bram Moolenaar423532e2013-05-29 21:14:42 +02001663 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001664 break;
1665
1666 case 'V':
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001667 EMIT(NFA_VISUAL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001668 break;
1669
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02001670 case 'C':
1671 EMIT(NFA_ANY_COMPOSING);
1672 break;
1673
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001674 case '[':
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001675 {
1676 int n;
1677
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001678 // \%[abc]
Bram Moolenaard7986252013-06-17 21:33:41 +02001679 for (n = 0; (c = peekchr()) != ']'; ++n)
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001680 {
1681 if (c == NUL)
1682 EMSG2_RET_FAIL(_(e_missing_sb),
1683 reg_magic == MAGIC_ALL);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001684 // recursive call!
Bram Moolenaard7986252013-06-17 21:33:41 +02001685 if (nfa_regatom() == FAIL)
1686 return FAIL;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001687 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001688 getchr(); // get the ]
Bram Moolenaar2976c022013-06-05 21:30:37 +02001689 if (n == 0)
1690 EMSG2_RET_FAIL(_(e_empty_sb),
1691 reg_magic == MAGIC_ALL);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001692 EMIT(NFA_OPT_CHARS);
1693 EMIT(n);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001694
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001695 // Emit as "\%(\%[abc]\)" to be able to handle
1696 // "\%[abc]*" which would cause the empty string to be
1697 // matched an unlimited number of times. NFA_NOPEN is
1698 // added only once at a position, while NFA_SPLIT is
1699 // added multiple times. This is more efficient than
1700 // not allowing NFA_SPLIT multiple times, it is used
1701 // a lot.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02001702 EMIT(NFA_NOPEN);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001703 break;
1704 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02001705
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001706 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +02001707 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001708 long_u n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001709 int cmp = c;
1710
1711 if (c == '<' || c == '>')
1712 c = getchr();
1713 while (VIM_ISDIGIT(c))
1714 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001715 long_u tmp = n * 10 + (c - '0');
1716
1717 if (tmp < n)
1718 {
1719 // overflow.
1720 emsg(_(e_value_too_large));
1721 return FAIL;
1722 }
1723 n = tmp;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001724 c = getchr();
1725 }
1726 if (c == 'l' || c == 'c' || c == 'v')
1727 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001728 long_u limit = INT_MAX;
Bram Moolenaar9403a212019-02-13 18:35:06 +01001729
Bram Moolenaar423532e2013-05-29 21:14:42 +02001730 if (c == 'l')
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001731 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001732 // \%{n}l \%{n}<l \%{n}>l
Bram Moolenaar423532e2013-05-29 21:14:42 +02001733 EMIT(cmp == '<' ? NFA_LNUM_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001734 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
Bram Moolenaar7c29f382016-02-12 19:08:15 +01001735 if (save_prev_at_start)
1736 at_start = TRUE;
1737 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001738 else if (c == 'c')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001739 // \%{n}c \%{n}<c \%{n}>c
Bram Moolenaar423532e2013-05-29 21:14:42 +02001740 EMIT(cmp == '<' ? NFA_COL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001741 cmp == '>' ? NFA_COL_GT : NFA_COL);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001742 else
Bram Moolenaar9403a212019-02-13 18:35:06 +01001743 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001744 // \%{n}v \%{n}<v \%{n}>v
Bram Moolenaar423532e2013-05-29 21:14:42 +02001745 EMIT(cmp == '<' ? NFA_VCOL_LT :
Bram Moolenaar044aa292013-06-04 21:27:38 +02001746 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
Bram Moolenaar9403a212019-02-13 18:35:06 +01001747 limit = INT_MAX / MB_MAXBYTES;
1748 }
1749 if (n >= limit)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001750 {
Bram Moolenaarc5acc0f2020-06-03 18:55:38 +02001751 emsg(_(e_value_too_large));
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001752 return FAIL;
1753 }
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01001754 EMIT((int)n);
Bram Moolenaar423532e2013-05-29 21:14:42 +02001755 break;
1756 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02001757 else if (c == '\'' && n == 0)
1758 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001759 // \%'m \%<'m \%>'m
Bram Moolenaar044aa292013-06-04 21:27:38 +02001760 EMIT(cmp == '<' ? NFA_MARK_LT :
1761 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
Bram Moolenaard75799ab72013-06-05 11:05:17 +02001762 EMIT(getchr());
Bram Moolenaar044aa292013-06-04 21:27:38 +02001763 break;
1764 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02001765 }
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001766 semsg(_("E867: (NFA) Unknown operator '\\%%%c'"),
Bram Moolenaar5714b802013-05-28 22:03:20 +02001767 no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001768 return FAIL;
1769 }
1770 break;
1771
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001772 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +02001773collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001774 /*
Bram Moolenaar417bad22013-06-07 14:08:30 +02001775 * [abc] uses NFA_START_COLL - NFA_END_COLL
1776 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1777 * Each character is produced as a regular state, using
1778 * NFA_CONCAT to bind them together.
1779 * Besides normal characters there can be:
1780 * - character classes NFA_CLASS_*
1781 * - ranges, two characters followed by NFA_RANGE.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001782 */
1783
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001784 p = regparse;
1785 endp = skip_anyof(p);
1786 if (*endp == ']')
1787 {
1788 /*
1789 * Try to reverse engineer character classes. For example,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001790 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001791 * and perform the necessary substitutions in the NFA.
1792 */
1793 result = nfa_recognize_char_class(regparse, endp,
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001794 extra == NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001795 if (result != FAIL)
1796 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001797 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001798 {
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001799 EMIT(result - NFA_ADD_NL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001800 EMIT(NFA_NEWL);
1801 EMIT(NFA_OR);
1802 }
Bram Moolenaar1cfad522013-08-14 12:06:49 +02001803 else
1804 EMIT(result);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001805 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001806 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001807 return OK;
1808 }
1809 /*
1810 * Failed to recognize a character class. Use the simple
1811 * version that turns [abc] into 'a' OR 'b' OR 'c'
1812 */
1813 startc = endc = oldstartc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001814 negated = FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001815 if (*regparse == '^') // negated range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001816 {
1817 negated = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001818 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001819 EMIT(NFA_START_NEG_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001820 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001821 else
1822 EMIT(NFA_START_COLL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001823 if (*regparse == '-')
1824 {
1825 startc = '-';
1826 EMIT(startc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02001827 EMIT(NFA_CONCAT);
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001828 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001829 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001830 // Emit the OR branches for each character in the []
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001831 emit_range = FALSE;
1832 while (regparse < endp)
1833 {
1834 oldstartc = startc;
1835 startc = -1;
1836 got_coll_char = FALSE;
1837 if (*regparse == '[')
1838 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001839 // Check for [: :], [= =], [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001840 equiclass = collclass = 0;
1841 charclass = get_char_class(&regparse);
1842 if (charclass == CLASS_NONE)
1843 {
1844 equiclass = get_equi_class(&regparse);
1845 if (equiclass == 0)
1846 collclass = get_coll_element(&regparse);
1847 }
1848
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001849 // Character class like [:alpha:]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001850 if (charclass != CLASS_NONE)
1851 {
1852 switch (charclass)
1853 {
1854 case CLASS_ALNUM:
1855 EMIT(NFA_CLASS_ALNUM);
1856 break;
1857 case CLASS_ALPHA:
1858 EMIT(NFA_CLASS_ALPHA);
1859 break;
1860 case CLASS_BLANK:
1861 EMIT(NFA_CLASS_BLANK);
1862 break;
1863 case CLASS_CNTRL:
1864 EMIT(NFA_CLASS_CNTRL);
1865 break;
1866 case CLASS_DIGIT:
1867 EMIT(NFA_CLASS_DIGIT);
1868 break;
1869 case CLASS_GRAPH:
1870 EMIT(NFA_CLASS_GRAPH);
1871 break;
1872 case CLASS_LOWER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001873 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001874 EMIT(NFA_CLASS_LOWER);
1875 break;
1876 case CLASS_PRINT:
1877 EMIT(NFA_CLASS_PRINT);
1878 break;
1879 case CLASS_PUNCT:
1880 EMIT(NFA_CLASS_PUNCT);
1881 break;
1882 case CLASS_SPACE:
1883 EMIT(NFA_CLASS_SPACE);
1884 break;
1885 case CLASS_UPPER:
Bram Moolenaar66c50c52021-01-02 17:43:49 +01001886 wants_nfa = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001887 EMIT(NFA_CLASS_UPPER);
1888 break;
1889 case CLASS_XDIGIT:
1890 EMIT(NFA_CLASS_XDIGIT);
1891 break;
1892 case CLASS_TAB:
1893 EMIT(NFA_CLASS_TAB);
1894 break;
1895 case CLASS_RETURN:
1896 EMIT(NFA_CLASS_RETURN);
1897 break;
1898 case CLASS_BACKSPACE:
1899 EMIT(NFA_CLASS_BACKSPACE);
1900 break;
1901 case CLASS_ESCAPE:
1902 EMIT(NFA_CLASS_ESCAPE);
1903 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001904 case CLASS_IDENT:
1905 EMIT(NFA_CLASS_IDENT);
1906 break;
1907 case CLASS_KEYWORD:
1908 EMIT(NFA_CLASS_KEYWORD);
1909 break;
1910 case CLASS_FNAME:
1911 EMIT(NFA_CLASS_FNAME);
1912 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001913 }
Bram Moolenaar417bad22013-06-07 14:08:30 +02001914 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001915 continue;
1916 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001917 // Try equivalence class [=a=] and the like
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001918 if (equiclass != 0)
1919 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02001920 result = nfa_emit_equi_class(equiclass);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001921 if (result == FAIL)
1922 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001923 // should never happen
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001924 EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!"));
1925 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001926 continue;
1927 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001928 // Try collating class like [. .]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001929 if (collclass != 0)
1930 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001931 startc = collclass; // allow [.a.]-x as a range
1932 // Will emit the proper atom at the end of the
1933 // while loop.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001934 }
1935 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001936 // Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1937 // start character.
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001938 if (*regparse == '-' && oldstartc != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001939 {
1940 emit_range = TRUE;
1941 startc = oldstartc;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001942 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001943 continue; // reading the end of the range
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001944 }
1945
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001946 // Now handle simple and escaped characters.
1947 // Only "\]", "\^", "\-" and "\\" are special in Vi. Vim
1948 // accepts "\t", "\e", etc., but only when the 'l' flag in
1949 // 'cpoptions' is not included.
1950 // Posix doesn't recognize backslash at all.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001951 if (*regparse == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001952 && !reg_cpo_bsl
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001953 && regparse + 1 <= endp
1954 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +02001955 || (!reg_cpo_lit
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001956 && vim_strchr(REGEXP_ABBR, regparse[1])
1957 != NULL)
1958 )
1959 )
1960 {
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001961 MB_PTR_ADV(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001962
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001963 if (*regparse == 'n')
Bram Moolenaara5483442019-02-17 20:17:02 +01001964 startc = (reg_string || emit_range
1965 || regparse[1] == '-') ? NL : NFA_NEWL;
Bram Moolenaarabab0b02019-03-30 18:47:01 +01001966 else if (*regparse == 'd'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001967 || *regparse == 'o'
1968 || *regparse == 'x'
1969 || *regparse == 'u'
1970 || *regparse == 'U'
1971 )
1972 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001973 // TODO(RE) This needs more testing
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001974 startc = coll_get_char();
1975 got_coll_char = TRUE;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01001976 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001977 }
1978 else
1979 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001980 // \r,\t,\e,\b
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001981 startc = backslash_trans(*regparse);
1982 }
1983 }
1984
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001985 // Normal printable char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001986 if (startc == -1)
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001987 startc = PTR2CHAR(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001988
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001989 // Previous char was '-', so this char is end of range.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001990 if (emit_range)
1991 {
Bram Moolenaar75d7a062013-06-01 13:24:24 +02001992 endc = startc;
1993 startc = oldstartc;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001994 if (startc > endc)
Bram Moolenaar966e58e2017-06-05 16:54:08 +02001995 EMSG_RET_FAIL(_(e_reverse_range));
Bram Moolenaar417bad22013-06-07 14:08:30 +02001996
1997 if (endc > startc + 2)
1998 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001999 // Emit a range instead of the sequence of
2000 // individual characters.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002001 if (startc == 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002002 // \x00 is translated to \x0a, start at \x01.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002003 EMIT(1);
2004 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002005 --post_ptr; // remove NFA_CONCAT
Bram Moolenaar417bad22013-06-07 14:08:30 +02002006 EMIT(endc);
2007 EMIT(NFA_RANGE);
2008 EMIT(NFA_CONCAT);
2009 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002010 else if (has_mbyte && ((*mb_char2len)(startc) > 1
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002011 || (*mb_char2len)(endc) > 1))
2012 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002013 // Emit the characters in the range.
2014 // "startc" was already emitted, so skip it.
2015 //
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002016 for (c = startc + 1; c <= endc; c++)
2017 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002018 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002019 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002020 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002021 }
2022 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002023 {
2024#ifdef EBCDIC
2025 int alpha_only = FALSE;
2026
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002027 // for alphabetical range skip the gaps
2028 // 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002029 if (isalpha(startc) && isalpha(endc))
2030 alpha_only = TRUE;
2031#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002032 // Emit the range. "startc" was already emitted, so
2033 // skip it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002034 for (c = startc + 1; c <= endc; c++)
2035#ifdef EBCDIC
2036 if (!alpha_only || isalpha(startc))
2037#endif
2038 {
2039 EMIT(c);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002040 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002041 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002042 }
Bram Moolenaar75d7a062013-06-01 13:24:24 +02002043 emit_range = FALSE;
2044 startc = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002045 }
2046 else
2047 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002048 // This char (startc) is not part of a range. Just
2049 // emit it.
2050 // Normally, simply emit startc. But if we get char
2051 // code=0 from a collating char, then replace it with
2052 // 0x0a.
2053 // This is needed to completely mimic the behaviour of
2054 // the backtracking engine.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002055 if (startc == NFA_NEWL)
2056 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002057 // Line break can't be matched as part of the
2058 // collection, add an OR below. But not for negated
2059 // range.
Bram Moolenaar417bad22013-06-07 14:08:30 +02002060 if (!negated)
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002061 extra = NFA_ADD_NL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002062 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002063 else
Bram Moolenaar417bad22013-06-07 14:08:30 +02002064 {
2065 if (got_coll_char == TRUE && startc == 0)
2066 EMIT(0x0a);
2067 else
2068 EMIT(startc);
2069 EMIT(NFA_CONCAT);
2070 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002071 }
2072
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002073 MB_PTR_ADV(regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002074 } // while (p < endp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002075
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002076 MB_PTR_BACK(old_regparse, regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002077 if (*regparse == '-') // if last, '-' is just a char
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002078 {
2079 EMIT('-');
Bram Moolenaar417bad22013-06-07 14:08:30 +02002080 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002081 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002082
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002083 // skip the trailing ]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002084 regparse = endp;
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002085 MB_PTR_ADV(regparse);
Bram Moolenaar417bad22013-06-07 14:08:30 +02002086
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002087 // Mark end of the collection.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002088 if (negated == TRUE)
Bram Moolenaar417bad22013-06-07 14:08:30 +02002089 EMIT(NFA_END_NEG_COLL);
2090 else
2091 EMIT(NFA_END_COLL);
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002092
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002093 // \_[] also matches \n but it's not negated
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002094 if (extra == NFA_ADD_NL)
Bram Moolenaarbad704f2013-05-30 11:51:08 +02002095 {
2096 EMIT(reg_string ? NL : NFA_NEWL);
2097 EMIT(NFA_OR);
2098 }
2099
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002100 return OK;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002101 } // if exists closing ]
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002102
2103 if (reg_strict)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002104 EMSG_RET_FAIL(_(e_missingbracket));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002105 // FALLTHROUGH
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002106
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002107 default:
2108 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002109 int plen;
2110
2111nfa_do_multibyte:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002112 // plen is length of current char with composing chars
Bram Moolenaar47196582013-05-25 22:04:23 +02002113 if (enc_utf8 && ((*mb_char2len)(c)
Bram Moolenaarace95982017-03-29 17:30:27 +02002114 != (plen = utfc_ptr2len(old_regparse))
Bram Moolenaar47196582013-05-25 22:04:23 +02002115 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002116 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02002117 int i = 0;
2118
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002119 // A base character plus composing characters, or just one
2120 // or more composing characters.
2121 // This requires creating a separate atom as if enclosing
2122 // the characters in (), where NFA_COMPOSING is the ( and
2123 // NFA_END_COMPOSING is the ). Note that right now we are
2124 // building the postfix form, not the NFA itself;
2125 // a composing char could be: a, b, c, NFA_COMPOSING
2126 // where 'b' and 'c' are chars with codes > 256.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002127 for (;;)
2128 {
2129 EMIT(c);
2130 if (i > 0)
2131 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002132 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002133 break;
2134 c = utf_ptr2char(old_regparse + i);
2135 }
2136 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002137 regparse = old_regparse + plen;
2138 }
2139 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002140 {
2141 c = no_Magic(c);
2142 EMIT(c);
2143 }
2144 return OK;
2145 }
2146 }
2147
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002148 return OK;
2149}
2150
2151/*
2152 * Parse an atom followed by an optional multi ('*', '+', '=', '?', '@' or '{').
2153 *
2154 * A piece is an atom, possibly followed by a multi, an indication of how many
2155 * times the atom can be matched. Example: "a*" matches any sequence of "a"
2156 * characters: "", "a", "aa", etc.
2157 *
2158 * piece ::= atom
2159 * or atom multi
2160 */
2161 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002162nfa_regpiece(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002163{
2164 int i;
2165 int op;
2166 int ret;
2167 long minval, maxval;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002168 int greedy = TRUE; // Braces are prefixed with '-' ?
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002169 parse_state_T old_state;
2170 parse_state_T new_state;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01002171 long c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02002172 int old_post_pos;
2173 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002174 int quest;
2175
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002176 // Save the current parse state, so that we can use it if <atom>{m,n} is
2177 // next.
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002178 save_parse_state(&old_state);
2179
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002180 // store current pos in the postfix form, for \{m,n} involving 0s
Bram Moolenaar16299b52013-05-30 18:45:23 +02002181 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002182
2183 ret = nfa_regatom();
2184 if (ret == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002185 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002186
2187 op = peekchr();
2188 if (re_multi_type(op) == NOT_MULTI)
2189 return OK;
2190
2191 skipchr();
2192 switch (op)
2193 {
2194 case Magic('*'):
2195 EMIT(NFA_STAR);
2196 break;
2197
2198 case Magic('+'):
2199 /*
2200 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
2201 * first and only submatch would be "aaa". But the backtracking
2202 * engine interprets the plus as "try matching one more time", and
2203 * a* matches a second time at the end of the input, the empty
2204 * string.
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002205 * The submatch will be the empty string.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002206 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002207 * In order to be consistent with the old engine, we replace
2208 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002209 */
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002210 restore_parse_state(&old_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002211 curchr = -1;
2212 if (nfa_regatom() == FAIL)
2213 return FAIL;
2214 EMIT(NFA_STAR);
2215 EMIT(NFA_CONCAT);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002216 skipchr(); // skip the \+
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002217 break;
2218
2219 case Magic('@'):
Bram Moolenaar61602c52013-06-01 19:54:43 +02002220 c2 = getdecchrs();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002221 op = no_Magic(getchr());
Bram Moolenaar61602c52013-06-01 19:54:43 +02002222 i = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002223 switch(op)
2224 {
2225 case '=':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002226 // \@=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002227 i = NFA_PREV_ATOM_NO_WIDTH;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002228 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002229 case '!':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002230 // \@!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002231 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002232 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002233 case '<':
Bram Moolenaar61602c52013-06-01 19:54:43 +02002234 op = no_Magic(getchr());
2235 if (op == '=')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002236 // \@<=
Bram Moolenaar61602c52013-06-01 19:54:43 +02002237 i = NFA_PREV_ATOM_JUST_BEFORE;
2238 else if (op == '!')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002239 // \@<!
Bram Moolenaar61602c52013-06-01 19:54:43 +02002240 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
2241 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002242 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002243 // \@>
Bram Moolenaar87953742013-06-05 18:52:40 +02002244 i = NFA_PREV_ATOM_LIKE_PATTERN;
2245 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002246 }
Bram Moolenaar61602c52013-06-01 19:54:43 +02002247 if (i == 0)
2248 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002249 semsg(_("E869: (NFA) Unknown operator '\\@%c'"), op);
Bram Moolenaar61602c52013-06-01 19:54:43 +02002250 return FAIL;
2251 }
2252 EMIT(i);
2253 if (i == NFA_PREV_ATOM_JUST_BEFORE
2254 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
2255 EMIT(c2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002256 break;
2257
2258 case Magic('?'):
2259 case Magic('='):
2260 EMIT(NFA_QUEST);
2261 break;
2262
2263 case Magic('{'):
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002264 // a{2,5} will expand to 'aaa?a?a?'
2265 // a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
2266 // version of '?'
2267 // \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
2268 // parenthesis have the same id
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002269
2270 greedy = TRUE;
2271 c2 = peekchr();
2272 if (c2 == '-' || c2 == Magic('-'))
2273 {
2274 skipchr();
2275 greedy = FALSE;
2276 }
2277 if (!read_limits(&minval, &maxval))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002278 EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002279
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002280 // <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
2281 // <atom>*
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002282 if (minval == 0 && maxval == MAX_LIMIT)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002283 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002284 if (greedy) // { { (match the braces)
2285 // \{}, \{0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002286 EMIT(NFA_STAR);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002287 else // { { (match the braces)
2288 // \{-}, \{-0,}
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002289 EMIT(NFA_STAR_NONGREEDY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002290 break;
2291 }
2292
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002293 // Special case: x{0} or x{-0}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002294 if (maxval == 0)
2295 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002296 // Ignore result of previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002297 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002298 // NFA_EMPTY is 0-length and works everywhere
Bram Moolenaar699c1202013-09-25 16:41:54 +02002299 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002300 return OK;
2301 }
2302
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002303 // The engine is very inefficient (uses too many states) when the
2304 // maximum is much larger than the minimum and when the maximum is
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002305 // large. However, when maxval is MAX_LIMIT, it is okay, as this
2306 // will emit NFA_STAR.
2307 // Bail out if we can use the other engine, but only, when the
2308 // pattern does not need the NFA engine like (e.g. [[:upper:]]\{2,\}
	    // does not work with characters > 8 bit with the BT engine)
Bram Moolenaara1d2c582015-02-10 18:18:17 +01002310 if ((nfa_re_flags & RE_AUTO)
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002311 && (maxval > 500 || maxval > minval + 200)
2312 && (maxval != MAX_LIMIT && minval < 200)
2313 && !wants_nfa)
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002314 return FAIL;
2315
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002316 // Ignore previous call to nfa_regatom()
Bram Moolenaar16299b52013-05-30 18:45:23 +02002317 post_ptr = post_start + my_post_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002318 // Save parse state after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002319 save_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002320
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002321 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
2322 for (i = 0; i < maxval; i++)
2323 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002324 // Goto beginning of the repeated atom
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002325 restore_parse_state(&old_state);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002326 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002327 if (nfa_regatom() == FAIL)
2328 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002329 // after "minval" times, atoms are optional
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002330 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002331 {
2332 if (maxval == MAX_LIMIT)
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002333 {
2334 if (greedy)
2335 EMIT(NFA_STAR);
2336 else
2337 EMIT(NFA_STAR_NONGREEDY);
2338 }
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002339 else
2340 EMIT(quest);
2341 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02002342 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002343 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02002344 if (i + 1 > minval && maxval == MAX_LIMIT)
2345 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002346 }
2347
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002348 // Go to just after the repeated atom and the \{}
Bram Moolenaar3737fc12013-06-01 14:42:56 +02002349 restore_parse_state(&new_state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002350 curchr = -1;
2351
2352 break;
2353
2354
2355 default:
2356 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002357 } // end switch
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002358
2359 if (re_multi_type(peekchr()) != NOT_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002360 // Can't have a multi follow a multi.
Bram Moolenaar3c867da2018-06-23 14:34:28 +02002361 EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi"));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002362
2363 return OK;
2364}
2365
2366/*
2367 * Parse one or more pieces, concatenated. It matches a match for the
2368 * first piece, followed by a match for the second piece, etc. Example:
2369 * "f[0-9]b", first matches "f", then a digit and then "b".
2370 *
2371 * concat ::= piece
2372 * or piece piece
2373 * or piece piece piece
2374 * etc.
2375 */
2376 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002377nfa_regconcat(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002378{
2379 int cont = TRUE;
2380 int first = TRUE;
2381
2382 while (cont)
2383 {
2384 switch (peekchr())
2385 {
2386 case NUL:
2387 case Magic('|'):
2388 case Magic('&'):
2389 case Magic(')'):
2390 cont = FALSE;
2391 break;
2392
2393 case Magic('Z'):
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002394 regflags |= RF_ICOMBINE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002395 skipchr_keepstart();
2396 break;
2397 case Magic('c'):
2398 regflags |= RF_ICASE;
2399 skipchr_keepstart();
2400 break;
2401 case Magic('C'):
2402 regflags |= RF_NOICASE;
2403 skipchr_keepstart();
2404 break;
2405 case Magic('v'):
2406 reg_magic = MAGIC_ALL;
2407 skipchr_keepstart();
2408 curchr = -1;
2409 break;
2410 case Magic('m'):
2411 reg_magic = MAGIC_ON;
2412 skipchr_keepstart();
2413 curchr = -1;
2414 break;
2415 case Magic('M'):
2416 reg_magic = MAGIC_OFF;
2417 skipchr_keepstart();
2418 curchr = -1;
2419 break;
2420 case Magic('V'):
2421 reg_magic = MAGIC_NONE;
2422 skipchr_keepstart();
2423 curchr = -1;
2424 break;
2425
2426 default:
2427 if (nfa_regpiece() == FAIL)
2428 return FAIL;
2429 if (first == FALSE)
2430 EMIT(NFA_CONCAT);
2431 else
2432 first = FALSE;
2433 break;
2434 }
2435 }
2436
2437 return OK;
2438}
2439
2440/*
2441 * Parse a branch, one or more concats, separated by "\&". It matches the
2442 * last concat, but only if all the preceding concats also match at the same
2443 * position. Examples:
2444 * "foobeep\&..." matches "foo" in "foobeep".
2445 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2446 *
2447 * branch ::= concat
2448 * or concat \& concat
2449 * or concat \& concat \& concat
2450 * etc.
2451 */
2452 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002453nfa_regbranch(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002454{
Bram Moolenaar16299b52013-05-30 18:45:23 +02002455 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002456
Bram Moolenaar16299b52013-05-30 18:45:23 +02002457 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002458
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002459 // First branch, possibly the only one
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002460 if (nfa_regconcat() == FAIL)
2461 return FAIL;
2462
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002463 // Try next concats
Bram Moolenaar890dd052017-12-16 19:59:37 +01002464 while (peekchr() == Magic('&'))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002465 {
2466 skipchr();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002467 // if concat is empty do emit a node
Bram Moolenaar890dd052017-12-16 19:59:37 +01002468 if (old_post_pos == (int)(post_ptr - post_start))
2469 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002470 EMIT(NFA_NOPEN);
2471 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02002472 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002473 if (nfa_regconcat() == FAIL)
2474 return FAIL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002475 // if concat is empty do emit a node
Bram Moolenaar16299b52013-05-30 18:45:23 +02002476 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002477 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002478 EMIT(NFA_CONCAT);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002479 }
2480
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002481 // if a branch is empty, emit one node for it
Bram Moolenaar16299b52013-05-30 18:45:23 +02002482 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaar699c1202013-09-25 16:41:54 +02002483 EMIT(NFA_EMPTY);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002484
2485 return OK;
2486}
2487
2488/*
2489 * Parse a pattern, one or more branches, separated by "\|". It matches
2490 * anything that matches one of the branches. Example: "foo\|beep" matches
2491 * "foo" and matches "beep". If more than one branch matches, the first one
2492 * is used.
2493 *
2494 * pattern ::= branch
2495 * or branch \| branch
2496 * or branch \| branch \| branch
2497 * etc.
2498 */
2499 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002500nfa_reg(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002501 int paren) // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002502{
2503 int parno = 0;
2504
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002505 if (paren == REG_PAREN)
2506 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002507 if (regnpar >= NSUBEXP) // Too many `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002508 EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002509 parno = regnpar++;
2510 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002511#ifdef FEAT_SYN_HL
2512 else if (paren == REG_ZPAREN)
2513 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002514 // Make a ZOPEN node.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002515 if (regnzpar >= NSUBEXP)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002516 EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z("));
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002517 parno = regnzpar++;
2518 }
2519#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002520
2521 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002522 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002523
2524 while (peekchr() == Magic('|'))
2525 {
2526 skipchr();
2527 if (nfa_regbranch() == FAIL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002528 return FAIL; // cascaded error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002529 EMIT(NFA_OR);
2530 }
2531
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002532 // Check for proper termination.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002533 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2534 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002535 if (paren == REG_NPAREN)
2536 EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
2537 else
2538 EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
2539 }
2540 else if (paren == REG_NOPAREN && peekchr() != NUL)
2541 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002542 if (peekchr() == Magic(')'))
2543 EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
2544 else
2545 EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
2546 }
2547 /*
2548 * Here we set the flag allowing back references to this set of
2549 * parentheses.
2550 */
2551 if (paren == REG_PAREN)
2552 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002553 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002554 EMIT(NFA_MOPEN + parno);
2555 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002556#ifdef FEAT_SYN_HL
2557 else if (paren == REG_ZPAREN)
2558 EMIT(NFA_ZOPEN + parno);
2559#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002560
2561 return OK;
2562}
2563
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002564#ifdef DEBUG
2565static char_u code[50];
2566
2567 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002568nfa_set_code(int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002569{
2570 int addnl = FALSE;
2571
2572 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
2573 {
2574 addnl = TRUE;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002575 c -= NFA_ADD_NL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002576 }
2577
2578 STRCPY(code, "");
2579 switch (c)
2580 {
2581 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2582 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2583 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2584 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2585 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2586 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2587
Bram Moolenaar5714b802013-05-28 22:03:20 +02002588 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2589 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2590 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2591 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2592 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2593 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2594 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2595 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2596 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002597#ifdef FEAT_SYN_HL
2598 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2599 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2600 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2601 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2602 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2603 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2604 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2605 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2606 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2607#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02002608 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2609
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002610 case NFA_PREV_ATOM_NO_WIDTH:
2611 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02002612 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2613 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002614 case NFA_PREV_ATOM_JUST_BEFORE:
2615 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2616 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2617 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002618 case NFA_PREV_ATOM_LIKE_PATTERN:
2619 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2620
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002621 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2622 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002623 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002624 case NFA_START_INVISIBLE_FIRST:
2625 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002626 case NFA_START_INVISIBLE_NEG:
2627 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002628 case NFA_START_INVISIBLE_NEG_FIRST:
2629 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
Bram Moolenaar61602c52013-06-01 19:54:43 +02002630 case NFA_START_INVISIBLE_BEFORE:
2631 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002632 case NFA_START_INVISIBLE_BEFORE_FIRST:
2633 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002634 case NFA_START_INVISIBLE_BEFORE_NEG:
2635 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
Bram Moolenaara2947e22013-06-11 22:44:09 +02002636 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2637 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002638 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002639 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
Bram Moolenaardecd9542013-06-07 16:31:50 +02002640 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
Bram Moolenaar87953742013-06-05 18:52:40 +02002641 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002642
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002643 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2644 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02002645 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002646
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002647 case NFA_MOPEN:
2648 case NFA_MOPEN1:
2649 case NFA_MOPEN2:
2650 case NFA_MOPEN3:
2651 case NFA_MOPEN4:
2652 case NFA_MOPEN5:
2653 case NFA_MOPEN6:
2654 case NFA_MOPEN7:
2655 case NFA_MOPEN8:
2656 case NFA_MOPEN9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002657 STRCPY(code, "NFA_MOPEN(x)");
2658 code[10] = c - NFA_MOPEN + '0';
2659 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002660 case NFA_MCLOSE:
2661 case NFA_MCLOSE1:
2662 case NFA_MCLOSE2:
2663 case NFA_MCLOSE3:
2664 case NFA_MCLOSE4:
2665 case NFA_MCLOSE5:
2666 case NFA_MCLOSE6:
2667 case NFA_MCLOSE7:
2668 case NFA_MCLOSE8:
2669 case NFA_MCLOSE9:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002670 STRCPY(code, "NFA_MCLOSE(x)");
2671 code[11] = c - NFA_MCLOSE + '0';
2672 break;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02002673#ifdef FEAT_SYN_HL
2674 case NFA_ZOPEN:
2675 case NFA_ZOPEN1:
2676 case NFA_ZOPEN2:
2677 case NFA_ZOPEN3:
2678 case NFA_ZOPEN4:
2679 case NFA_ZOPEN5:
2680 case NFA_ZOPEN6:
2681 case NFA_ZOPEN7:
2682 case NFA_ZOPEN8:
2683 case NFA_ZOPEN9:
2684 STRCPY(code, "NFA_ZOPEN(x)");
2685 code[10] = c - NFA_ZOPEN + '0';
2686 break;
2687 case NFA_ZCLOSE:
2688 case NFA_ZCLOSE1:
2689 case NFA_ZCLOSE2:
2690 case NFA_ZCLOSE3:
2691 case NFA_ZCLOSE4:
2692 case NFA_ZCLOSE5:
2693 case NFA_ZCLOSE6:
2694 case NFA_ZCLOSE7:
2695 case NFA_ZCLOSE8:
2696 case NFA_ZCLOSE9:
2697 STRCPY(code, "NFA_ZCLOSE(x)");
2698 code[11] = c - NFA_ZCLOSE + '0';
2699 break;
2700#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002701 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2702 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2703 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2704 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02002705 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2706 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002707 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2708 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2709 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2710 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2711 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2712 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2713 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2714 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2715 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2716 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2717 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2718 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2719 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2720 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02002721 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
Bram Moolenaar044aa292013-06-04 21:27:38 +02002722
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002723 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaar36b3a012013-06-01 12:40:20 +02002724 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2725 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2726 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
Bram Moolenaar699c1202013-09-25 16:41:54 +02002727 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002728 case NFA_OR: STRCPY(code, "NFA_OR"); break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002729
2730 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2731 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2732 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2733 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2734 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2735 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2736 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2737
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002738 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2739 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2740 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2741 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2742 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2743 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2744 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2745 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2746 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2747 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2748 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2749 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2750 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2751 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2752 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2753 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01002754 case NFA_CLASS_IDENT: STRCPY(code, "NFA_CLASS_IDENT"); break;
2755 case NFA_CLASS_KEYWORD: STRCPY(code, "NFA_CLASS_KEYWORD"); break;
2756 case NFA_CLASS_FNAME: STRCPY(code, "NFA_CLASS_FNAME"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002757
2758 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2759 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2760 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
2761 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2762 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
2763 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2764 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
2765 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2766 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
2767 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2768 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
2769 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2770 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
2771 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2772 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2773 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2774 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
2775 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2776 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2777 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2778 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2779 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2780 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
2781 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2782 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
2783 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2784 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
Bram Moolenaar1cfad522013-08-14 12:06:49 +02002785 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2786 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2787 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2788 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002789
2790 default:
2791 STRCPY(code, "CHAR(x)");
2792 code[5] = c;
2793 }
2794
2795 if (addnl == TRUE)
2796 STRCAT(code, " + NEWLINE ");
2797
2798}
2799
2800#ifdef ENABLE_LOG
2801static FILE *log_fd;
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002802static char_u e_log_open_failed[] = N_("Could not open temporary log file for writing, displaying on stderr... ");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002803
2804/*
2805 * Print the postfix notation of the current regexp.
2806 */
2807 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002808nfa_postfix_dump(char_u *expr, int retval)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002809{
2810 int *p;
2811 FILE *f;
2812
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002813 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002814 if (f != NULL)
2815 {
2816 fprintf(f, "\n-------------------------\n");
2817 if (retval == FAIL)
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02002818 fprintf(f, ">>> NFA engine failed... \n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002819 else if (retval == OK)
2820 fprintf(f, ">>> NFA engine succeeded !\n");
2821 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002822 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002823 {
2824 nfa_set_code(*p);
2825 fprintf(f, "%s, ", code);
2826 }
2827 fprintf(f, "\"\nPostfix notation (int): ");
Bram Moolenaareec3e1e2013-08-01 18:38:26 +02002828 for (p = post_start; *p && p < post_ptr; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002829 fprintf(f, "%d ", *p);
2830 fprintf(f, "\n\n");
2831 fclose(f);
2832 }
2833}
2834
2835/*
2836 * Print the NFA starting with a root node "state".
2837 */
2838 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002839nfa_print_state(FILE *debugf, nfa_state_T *state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002840{
Bram Moolenaar152e7892013-05-25 12:28:11 +02002841 garray_T indent;
2842
2843 ga_init2(&indent, 1, 64);
2844 ga_append(&indent, '\0');
2845 nfa_print_state2(debugf, state, &indent);
2846 ga_clear(&indent);
2847}
2848
2849 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002850nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
Bram Moolenaar152e7892013-05-25 12:28:11 +02002851{
2852 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002853
2854 if (state == NULL)
2855 return;
2856
2857 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02002858
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002859 // Output indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002860 p = (char_u *)indent->ga_data;
2861 if (indent->ga_len >= 3)
2862 {
2863 int last = indent->ga_len - 3;
2864 char_u save[2];
2865
2866 STRNCPY(save, &p[last], 2);
2867 STRNCPY(&p[last], "+-", 2);
2868 fprintf(debugf, " %s", p);
2869 STRNCPY(&p[last], save, 2);
2870 }
2871 else
2872 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002873
2874 nfa_set_code(state->c);
Bram Moolenaardecd9542013-06-07 16:31:50 +02002875 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
Bram Moolenaar417bad22013-06-07 14:08:30 +02002876 code,
2877 state->c,
2878 abs(state->id),
2879 state->val);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002880 if (state->id < 0)
2881 return;
2882
2883 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02002884
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002885 // grow indent for state->out
Bram Moolenaar152e7892013-05-25 12:28:11 +02002886 indent->ga_len -= 1;
2887 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002888 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002889 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002890 ga_concat(indent, (char_u *)" ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002891 ga_append(indent, '\0');
2892
2893 nfa_print_state2(debugf, state->out, indent);
2894
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002895 // replace last part of indent for state->out1
Bram Moolenaar152e7892013-05-25 12:28:11 +02002896 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02002897 ga_concat(indent, (char_u *)" ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02002898 ga_append(indent, '\0');
2899
2900 nfa_print_state2(debugf, state->out1, indent);
2901
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002902 // shrink indent
Bram Moolenaar152e7892013-05-25 12:28:11 +02002903 indent->ga_len -= 3;
2904 ga_append(indent, '\0');
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002905}
2906
2907/*
2908 * Print the NFA state machine.
2909 */
2910 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002911nfa_dump(nfa_regprog_T *prog)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002912{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002913 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002914
2915 if (debugf != NULL)
2916 {
Bram Moolenaar152e7892013-05-25 12:28:11 +02002917 nfa_print_state(debugf, prog->start);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002918
Bram Moolenaar473de612013-06-08 18:19:48 +02002919 if (prog->reganch)
2920 fprintf(debugf, "reganch: %d\n", prog->reganch);
2921 if (prog->regstart != NUL)
2922 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2923 prog->regstart, prog->regstart);
2924 if (prog->match_text != NULL)
2925 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
Bram Moolenaard89616e2013-06-06 18:46:06 +02002926
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002927 fclose(debugf);
2928 }
2929}
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002930#endif // ENABLE_LOG
2931#endif // DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002932
2933/*
2934 * Parse r.e. @expr and convert it into postfix form.
2935 * Return the postfix string on success, NULL otherwise.
2936 */
2937 static int *
Bram Moolenaar05540972016-01-30 20:31:25 +01002938re2post(void)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002939{
2940 if (nfa_reg(REG_NOPAREN) == FAIL)
2941 return NULL;
2942 EMIT(NFA_MOPEN);
2943 return post_start;
2944}
2945
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002946// NB. Some of the code below is inspired by Russ's.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002947
2948/*
2949 * Represents an NFA state plus zero or one or two arrows exiting.
2950 * if c == MATCH, no arrows out; matching state.
2951 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2952 * If c < 256, labeled arrow with character c to out.
2953 */
2954
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002955static nfa_state_T *state_ptr; // points to nfa_prog->state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002956
2957/*
2958 * Allocate and initialize nfa_state_T.
2959 */
2960 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002961alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002962{
2963 nfa_state_T *s;
2964
2965 if (istate >= nstate)
2966 return NULL;
2967
2968 s = &state_ptr[istate++];
2969
2970 s->c = c;
2971 s->out = out;
2972 s->out1 = out1;
Bram Moolenaar417bad22013-06-07 14:08:30 +02002973 s->val = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002974
2975 s->id = istate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02002976 s->lastlist[0] = 0;
2977 s->lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002978
2979 return s;
2980}
2981
2982/*
2983 * A partially built NFA without the matching state filled in.
2984 * Frag_T.start points at the start state.
2985 * Frag_T.out is a list of places that need to be set to the
2986 * next state for this fragment.
2987 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002988
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002989// Since the out pointers in the list are always
2990// uninitialized, we use the pointers themselves
2991// as storage for the Ptrlists.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002992typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002993union Ptrlist
2994{
2995 Ptrlist *next;
2996 nfa_state_T *s;
2997};
2998
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002999struct Frag
3000{
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003001 nfa_state_T *start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003002 Ptrlist *out;
3003};
3004typedef struct Frag Frag_T;
3005
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003006/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02003007 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003008 */
3009 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003010frag(nfa_state_T *start, Ptrlist *out)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003011{
Bram Moolenaar053bb602013-05-20 13:55:21 +02003012 Frag_T n;
3013
3014 n.start = start;
3015 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003016 return n;
3017}
3018
3019/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003020 * Create singleton list containing just outp.
3021 */
3022 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003023list1(
3024 nfa_state_T **outp)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003025{
3026 Ptrlist *l;
3027
3028 l = (Ptrlist *)outp;
3029 l->next = NULL;
3030 return l;
3031}
3032
3033/*
3034 * Patch the list of states at out to point to start.
3035 */
3036 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003037patch(Ptrlist *l, nfa_state_T *s)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003038{
3039 Ptrlist *next;
3040
3041 for (; l; l = next)
3042 {
3043 next = l->next;
3044 l->s = s;
3045 }
3046}
3047
3048
3049/*
3050 * Join the two lists l1 and l2, returning the combination.
3051 */
3052 static Ptrlist *
Bram Moolenaar05540972016-01-30 20:31:25 +01003053append(Ptrlist *l1, Ptrlist *l2)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003054{
3055 Ptrlist *oldl1;
3056
3057 oldl1 = l1;
3058 while (l1->next)
3059 l1 = l1->next;
3060 l1->next = l2;
3061 return oldl1;
3062}
3063
3064/*
3065 * Stack used for transforming postfix form into NFA.
3066 */
3067static Frag_T empty;
3068
3069 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003070st_error(int *postfix UNUSED, int *end UNUSED, int *p UNUSED)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003071{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003072#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003073 FILE *df;
3074 int *p2;
3075
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003076 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003077 if (df)
3078 {
3079 fprintf(df, "Error popping the stack!\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003080# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003081 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003082# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003083 fprintf(df, "Postfix form is: ");
Bram Moolenaar0270f382018-07-17 05:43:58 +02003084# ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003085 for (p2 = postfix; p2 < end; p2++)
3086 {
3087 nfa_set_code(*p2);
3088 fprintf(df, "%s, ", code);
3089 }
3090 nfa_set_code(*p);
3091 fprintf(df, "\nCurrent position is: ");
3092 for (p2 = postfix; p2 <= p; p2 ++)
3093 {
3094 nfa_set_code(*p2);
3095 fprintf(df, "%s, ", code);
3096 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02003097# else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003098 for (p2 = postfix; p2 < end; p2++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003099 fprintf(df, "%d, ", *p2);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003100 fprintf(df, "\nCurrent position is: ");
3101 for (p2 = postfix; p2 <= p; p2 ++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003102 fprintf(df, "%d, ", *p2);
Bram Moolenaar0270f382018-07-17 05:43:58 +02003103# endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003104 fprintf(df, "\n--------------------------\n");
3105 fclose(df);
3106 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003107#endif
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01003108 emsg(_("E874: (NFA) Could not pop the stack!"));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003109}
3110
3111/*
3112 * Push an item onto the stack.
3113 */
3114 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003115st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003116{
3117 Frag_T *stackp = *p;
3118
3119 if (stackp >= stack_end)
3120 return;
3121 *stackp = s;
3122 *p = *p + 1;
3123}
3124
3125/*
3126 * Pop an item from the stack.
3127 */
3128 static Frag_T
Bram Moolenaar05540972016-01-30 20:31:25 +01003129st_pop(Frag_T **p, Frag_T *stack)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003130{
3131 Frag_T *stackp;
3132
3133 *p = *p - 1;
3134 stackp = *p;
3135 if (stackp < stack)
3136 return empty;
3137 return **p;
3138}
3139
3140/*
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003141 * Estimate the maximum byte length of anything matching "state".
3142 * When unknown or unlimited return -1.
3143 */
3144 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01003145nfa_max_width(nfa_state_T *startstate, int depth)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003146{
3147 int l, r;
3148 nfa_state_T *state = startstate;
3149 int len = 0;
3150
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003151 // detect looping in a NFA_SPLIT
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003152 if (depth > 4)
3153 return -1;
3154
Bram Moolenaarfe70acb2013-06-21 18:31:23 +02003155 while (state != NULL)
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003156 {
3157 switch (state->c)
3158 {
3159 case NFA_END_INVISIBLE:
3160 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003161 // the end, return what we have
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003162 return len;
3163
3164 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003165 // two alternatives, use the maximum
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003166 l = nfa_max_width(state->out, depth + 1);
3167 r = nfa_max_width(state->out1, depth + 1);
3168 if (l < 0 || r < 0)
3169 return -1;
3170 return len + (l > r ? l : r);
3171
3172 case NFA_ANY:
3173 case NFA_START_COLL:
3174 case NFA_START_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003175 // matches some character, including composing chars
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003176 if (enc_utf8)
3177 len += MB_MAXBYTES;
3178 else if (has_mbyte)
3179 len += 2;
3180 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003181 ++len;
3182 if (state->c != NFA_ANY)
3183 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003184 // skip over the characters
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003185 state = state->out1->out;
3186 continue;
3187 }
3188 break;
3189
3190 case NFA_DIGIT:
3191 case NFA_WHITE:
3192 case NFA_HEX:
3193 case NFA_OCTAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003194 // ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003195 ++len;
3196 break;
3197
3198 case NFA_IDENT:
3199 case NFA_SIDENT:
3200 case NFA_KWORD:
3201 case NFA_SKWORD:
3202 case NFA_FNAME:
3203 case NFA_SFNAME:
3204 case NFA_PRINT:
3205 case NFA_SPRINT:
3206 case NFA_NWHITE:
3207 case NFA_NDIGIT:
3208 case NFA_NHEX:
3209 case NFA_NOCTAL:
3210 case NFA_WORD:
3211 case NFA_NWORD:
3212 case NFA_HEAD:
3213 case NFA_NHEAD:
3214 case NFA_ALPHA:
3215 case NFA_NALPHA:
3216 case NFA_LOWER:
3217 case NFA_NLOWER:
3218 case NFA_UPPER:
3219 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02003220 case NFA_LOWER_IC:
3221 case NFA_NLOWER_IC:
3222 case NFA_UPPER_IC:
3223 case NFA_NUPPER_IC:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02003224 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003225 // possibly non-ascii
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003226 if (has_mbyte)
3227 len += 3;
3228 else
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003229 ++len;
3230 break;
3231
3232 case NFA_START_INVISIBLE:
3233 case NFA_START_INVISIBLE_NEG:
3234 case NFA_START_INVISIBLE_BEFORE:
3235 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003236 // zero-width, out1 points to the END state
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003237 state = state->out1->out;
3238 continue;
3239
3240 case NFA_BACKREF1:
3241 case NFA_BACKREF2:
3242 case NFA_BACKREF3:
3243 case NFA_BACKREF4:
3244 case NFA_BACKREF5:
3245 case NFA_BACKREF6:
3246 case NFA_BACKREF7:
3247 case NFA_BACKREF8:
3248 case NFA_BACKREF9:
3249#ifdef FEAT_SYN_HL
3250 case NFA_ZREF1:
3251 case NFA_ZREF2:
3252 case NFA_ZREF3:
3253 case NFA_ZREF4:
3254 case NFA_ZREF5:
3255 case NFA_ZREF6:
3256 case NFA_ZREF7:
3257 case NFA_ZREF8:
3258 case NFA_ZREF9:
3259#endif
3260 case NFA_NEWL:
3261 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003262 // unknown width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003263 return -1;
3264
3265 case NFA_BOL:
3266 case NFA_EOL:
3267 case NFA_BOF:
3268 case NFA_EOF:
3269 case NFA_BOW:
3270 case NFA_EOW:
3271 case NFA_MOPEN:
3272 case NFA_MOPEN1:
3273 case NFA_MOPEN2:
3274 case NFA_MOPEN3:
3275 case NFA_MOPEN4:
3276 case NFA_MOPEN5:
3277 case NFA_MOPEN6:
3278 case NFA_MOPEN7:
3279 case NFA_MOPEN8:
3280 case NFA_MOPEN9:
3281#ifdef FEAT_SYN_HL
3282 case NFA_ZOPEN:
3283 case NFA_ZOPEN1:
3284 case NFA_ZOPEN2:
3285 case NFA_ZOPEN3:
3286 case NFA_ZOPEN4:
3287 case NFA_ZOPEN5:
3288 case NFA_ZOPEN6:
3289 case NFA_ZOPEN7:
3290 case NFA_ZOPEN8:
3291 case NFA_ZOPEN9:
3292 case NFA_ZCLOSE:
3293 case NFA_ZCLOSE1:
3294 case NFA_ZCLOSE2:
3295 case NFA_ZCLOSE3:
3296 case NFA_ZCLOSE4:
3297 case NFA_ZCLOSE5:
3298 case NFA_ZCLOSE6:
3299 case NFA_ZCLOSE7:
3300 case NFA_ZCLOSE8:
3301 case NFA_ZCLOSE9:
3302#endif
3303 case NFA_MCLOSE:
3304 case NFA_MCLOSE1:
3305 case NFA_MCLOSE2:
3306 case NFA_MCLOSE3:
3307 case NFA_MCLOSE4:
3308 case NFA_MCLOSE5:
3309 case NFA_MCLOSE6:
3310 case NFA_MCLOSE7:
3311 case NFA_MCLOSE8:
3312 case NFA_MCLOSE9:
3313 case NFA_NOPEN:
3314 case NFA_NCLOSE:
3315
3316 case NFA_LNUM_GT:
3317 case NFA_LNUM_LT:
3318 case NFA_COL_GT:
3319 case NFA_COL_LT:
3320 case NFA_VCOL_GT:
3321 case NFA_VCOL_LT:
3322 case NFA_MARK_GT:
3323 case NFA_MARK_LT:
3324 case NFA_VISUAL:
3325 case NFA_LNUM:
3326 case NFA_CURSOR:
3327 case NFA_COL:
3328 case NFA_VCOL:
3329 case NFA_MARK:
3330
3331 case NFA_ZSTART:
3332 case NFA_ZEND:
3333 case NFA_OPT_CHARS:
Bram Moolenaar699c1202013-09-25 16:41:54 +02003334 case NFA_EMPTY:
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003335 case NFA_START_PATTERN:
3336 case NFA_END_PATTERN:
3337 case NFA_COMPOSING:
3338 case NFA_END_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003339 // zero-width
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003340 break;
3341
3342 default:
3343 if (state->c < 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003344 // don't know what this is
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003345 return -1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003346 // normal character
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003347 len += MB_CHAR2LEN(state->c);
3348 break;
3349 }
3350
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003351 // normal way to continue
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003352 state = state->out;
3353 }
3354
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003355 // unrecognized, "cannot happen"
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003356 return -1;
3357}
Bram Moolenaar1e02e662013-06-08 23:26:27 +02003358
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003359/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003360 * Convert a postfix form into its equivalent NFA.
3361 * Return the NFA start state on success, NULL otherwise.
3362 */
3363 static nfa_state_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01003364post2nfa(int *postfix, int *end, int nfa_calc_size)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003365{
3366 int *p;
3367 int mopen;
3368 int mclose;
3369 Frag_T *stack = NULL;
3370 Frag_T *stackp = NULL;
3371 Frag_T *stack_end = NULL;
3372 Frag_T e1;
3373 Frag_T e2;
3374 Frag_T e;
3375 nfa_state_T *s;
3376 nfa_state_T *s1;
3377 nfa_state_T *matchstate;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003378 nfa_state_T *ret = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003379
3380 if (postfix == NULL)
3381 return NULL;
3382
Bram Moolenaar053bb602013-05-20 13:55:21 +02003383#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003384#define POP() st_pop(&stackp, stack); \
3385 if (stackp < stack) \
3386 { \
3387 st_error(postfix, end, p); \
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003388 vim_free(stack); \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003389 return NULL; \
3390 }
3391
3392 if (nfa_calc_size == FALSE)
3393 {
Bram Moolenaar32aa1022019-11-02 22:54:41 +01003394 // Allocate space for the stack. Max states on the stack: "nstate".
Bram Moolenaarc799fe22019-05-28 23:08:19 +02003395 stack = ALLOC_MULT(Frag_T, nstate + 1);
Bram Moolenaarc57463c2018-12-26 22:04:41 +01003396 if (stack == NULL)
3397 return NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003398 stackp = stack;
Bram Moolenaare3c7b862013-05-20 21:57:03 +02003399 stack_end = stack + (nstate + 1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003400 }
3401
3402 for (p = postfix; p < end; ++p)
3403 {
3404 switch (*p)
3405 {
3406 case NFA_CONCAT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003407 // Concatenation.
3408 // Pay attention: this operator does not exist in the r.e. itself
3409 // (it is implicit, really). It is added when r.e. is translated
3410 // to postfix form in re2post().
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003411 if (nfa_calc_size == TRUE)
3412 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003413 // nstate += 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003414 break;
3415 }
3416 e2 = POP();
3417 e1 = POP();
3418 patch(e1.out, e2.start);
3419 PUSH(frag(e1.start, e2.out));
3420 break;
3421
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003422 case NFA_OR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003423 // Alternation
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003424 if (nfa_calc_size == TRUE)
3425 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003426 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003427 break;
3428 }
3429 e2 = POP();
3430 e1 = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003431 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003432 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003433 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003434 PUSH(frag(s, append(e1.out, e2.out)));
3435 break;
3436
3437 case NFA_STAR:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003438 // Zero or more, prefer more
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003439 if (nfa_calc_size == TRUE)
3440 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003441 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003442 break;
3443 }
3444 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003445 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003446 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003447 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003448 patch(e.out, s);
3449 PUSH(frag(s, list1(&s->out1)));
3450 break;
3451
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003452 case NFA_STAR_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003453 // Zero or more, prefer zero
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003454 if (nfa_calc_size == TRUE)
3455 {
3456 nstate++;
3457 break;
3458 }
3459 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003460 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaar36b3a012013-06-01 12:40:20 +02003461 if (s == NULL)
3462 goto theend;
3463 patch(e.out, s);
3464 PUSH(frag(s, list1(&s->out)));
3465 break;
3466
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003467 case NFA_QUEST:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003468 // one or zero atoms=> greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003469 if (nfa_calc_size == TRUE)
3470 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003471 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003472 break;
3473 }
3474 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003475 s = alloc_state(NFA_SPLIT, e.start, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003476 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003477 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003478 PUSH(frag(s, append(e.out, list1(&s->out1))));
3479 break;
3480
3481 case NFA_QUEST_NONGREEDY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003482 // zero or one atoms => non-greedy match
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003483 if (nfa_calc_size == TRUE)
3484 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003485 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003486 break;
3487 }
3488 e = POP();
Bram Moolenaar525666f2013-06-02 16:40:55 +02003489 s = alloc_state(NFA_SPLIT, NULL, e.start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003490 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003491 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003492 PUSH(frag(s, append(e.out, list1(&s->out))));
3493 break;
3494
Bram Moolenaar417bad22013-06-07 14:08:30 +02003495 case NFA_END_COLL:
3496 case NFA_END_NEG_COLL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003497 // On the stack is the sequence starting with NFA_START_COLL or
3498 // NFA_START_NEG_COLL and all possible characters. Patch it to
3499 // add the output to the start.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003500 if (nfa_calc_size == TRUE)
3501 {
3502 nstate++;
3503 break;
3504 }
3505 e = POP();
3506 s = alloc_state(NFA_END_COLL, NULL, NULL);
3507 if (s == NULL)
3508 goto theend;
3509 patch(e.out, s);
3510 e.start->out1 = s;
3511 PUSH(frag(e.start, list1(&s->out)));
3512 break;
3513
3514 case NFA_RANGE:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003515 // Before this are two characters, the low and high end of a
3516 // range. Turn them into two states with MIN and MAX.
Bram Moolenaar417bad22013-06-07 14:08:30 +02003517 if (nfa_calc_size == TRUE)
3518 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003519 // nstate += 0;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003520 break;
3521 }
3522 e2 = POP();
3523 e1 = POP();
3524 e2.start->val = e2.start->c;
3525 e2.start->c = NFA_RANGE_MAX;
3526 e1.start->val = e1.start->c;
3527 e1.start->c = NFA_RANGE_MIN;
3528 patch(e1.out, e2.start);
3529 PUSH(frag(e1.start, e2.out));
3530 break;
3531
Bram Moolenaar699c1202013-09-25 16:41:54 +02003532 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003533 // 0-length, used in a repetition with max/min count of 0
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003534 if (nfa_calc_size == TRUE)
3535 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003536 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003537 break;
3538 }
Bram Moolenaar699c1202013-09-25 16:41:54 +02003539 s = alloc_state(NFA_EMPTY, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003540 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003541 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003542 PUSH(frag(s, list1(&s->out)));
3543 break;
3544
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003545 case NFA_OPT_CHARS:
3546 {
3547 int n;
3548
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003549 // \%[abc] implemented as:
3550 // NFA_SPLIT
3551 // +-CHAR(a)
3552 // | +-NFA_SPLIT
3553 // | +-CHAR(b)
3554 // | | +-NFA_SPLIT
3555 // | | +-CHAR(c)
3556 // | | | +-next
3557 // | | +- next
3558 // | +- next
3559 // +- next
3560 n = *++p; // get number of characters
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003561 if (nfa_calc_size == TRUE)
3562 {
3563 nstate += n;
3564 break;
3565 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003566 s = NULL; // avoid compiler warning
3567 e1.out = NULL; // stores list with out1's
3568 s1 = NULL; // previous NFA_SPLIT to connect to
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003569 while (n-- > 0)
3570 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003571 e = POP(); // get character
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003572 s = alloc_state(NFA_SPLIT, e.start, NULL);
3573 if (s == NULL)
3574 goto theend;
3575 if (e1.out == NULL)
3576 e1 = e;
3577 patch(e.out, s1);
3578 append(e1.out, list1(&s->out1));
3579 s1 = s;
3580 }
3581 PUSH(frag(s, e1.out));
3582 break;
3583 }
3584
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003585 case NFA_PREV_ATOM_NO_WIDTH:
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003586 case NFA_PREV_ATOM_NO_WIDTH_NEG:
Bram Moolenaar61602c52013-06-01 19:54:43 +02003587 case NFA_PREV_ATOM_JUST_BEFORE:
3588 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02003589 case NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003590 {
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003591 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3592 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
Bram Moolenaar87953742013-06-05 18:52:40 +02003593 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
Bram Moolenaardecd9542013-06-07 16:31:50 +02003594 int start_state;
3595 int end_state;
Bram Moolenaar87953742013-06-05 18:52:40 +02003596 int n = 0;
3597 nfa_state_T *zend;
3598 nfa_state_T *skip;
3599
Bram Moolenaardecd9542013-06-07 16:31:50 +02003600 switch (*p)
Bram Moolenaar87953742013-06-05 18:52:40 +02003601 {
Bram Moolenaardecd9542013-06-07 16:31:50 +02003602 case NFA_PREV_ATOM_NO_WIDTH:
3603 start_state = NFA_START_INVISIBLE;
3604 end_state = NFA_END_INVISIBLE;
3605 break;
3606 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3607 start_state = NFA_START_INVISIBLE_NEG;
3608 end_state = NFA_END_INVISIBLE_NEG;
3609 break;
3610 case NFA_PREV_ATOM_JUST_BEFORE:
3611 start_state = NFA_START_INVISIBLE_BEFORE;
3612 end_state = NFA_END_INVISIBLE;
3613 break;
3614 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3615 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3616 end_state = NFA_END_INVISIBLE_NEG;
3617 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003618 default: // NFA_PREV_ATOM_LIKE_PATTERN:
Bram Moolenaardecd9542013-06-07 16:31:50 +02003619 start_state = NFA_START_PATTERN;
3620 end_state = NFA_END_PATTERN;
3621 break;
Bram Moolenaar87953742013-06-05 18:52:40 +02003622 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003623
3624 if (before)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003625 n = *++p; // get the count
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003626
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003627 // The \@= operator: match the preceding atom with zero width.
3628 // The \@! operator: no match for the preceding atom.
3629 // The \@<= operator: match for the preceding atom.
3630 // The \@<! operator: no match for the preceding atom.
3631 // Surrounds the preceding atom with START_INVISIBLE and
3632 // END_INVISIBLE, similarly to MOPEN.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003633
3634 if (nfa_calc_size == TRUE)
3635 {
Bram Moolenaar87953742013-06-05 18:52:40 +02003636 nstate += pattern ? 4 : 2;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003637 break;
3638 }
3639 e = POP();
Bram Moolenaar87953742013-06-05 18:52:40 +02003640 s1 = alloc_state(end_state, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003641 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003642 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003643
Bram Moolenaar87953742013-06-05 18:52:40 +02003644 s = alloc_state(start_state, e.start, s1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003645 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003646 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003647 if (pattern)
3648 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003649 // NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02003650 skip = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003651 if (skip == NULL)
3652 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003653 zend = alloc_state(NFA_ZEND, s1, NULL);
Bram Moolenaar983b3a52017-08-01 15:14:26 +02003654 if (zend == NULL)
3655 goto theend;
Bram Moolenaar87953742013-06-05 18:52:40 +02003656 s1->out= skip;
3657 patch(e.out, zend);
3658 PUSH(frag(s, list1(&skip->out)));
Bram Moolenaar61602c52013-06-01 19:54:43 +02003659 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003660 else
3661 {
3662 patch(e.out, s1);
3663 PUSH(frag(s, list1(&s1->out)));
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003664 if (before)
3665 {
3666 if (n <= 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003667 // See if we can guess the maximum width, it avoids a
3668 // lot of pointless tries.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003669 n = nfa_max_width(e.start, 0);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003670 s->val = n; // store the count
Bram Moolenaare7766ee2013-06-08 22:30:03 +02003671 }
Bram Moolenaar87953742013-06-05 18:52:40 +02003672 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003673 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003674 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003675
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003676 case NFA_COMPOSING: // char with composing char
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003677#if 0
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003678 // TODO
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003679 if (regflags & RF_ICOMBINE)
3680 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003681 // use the base character only
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003682 }
3683#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003684 // FALLTHROUGH
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003685
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003686 case NFA_MOPEN: // \( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003687 case NFA_MOPEN1:
3688 case NFA_MOPEN2:
3689 case NFA_MOPEN3:
3690 case NFA_MOPEN4:
3691 case NFA_MOPEN5:
3692 case NFA_MOPEN6:
3693 case NFA_MOPEN7:
3694 case NFA_MOPEN8:
3695 case NFA_MOPEN9:
3696#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003697 case NFA_ZOPEN: // \z( \) Submatch
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003698 case NFA_ZOPEN1:
3699 case NFA_ZOPEN2:
3700 case NFA_ZOPEN3:
3701 case NFA_ZOPEN4:
3702 case NFA_ZOPEN5:
3703 case NFA_ZOPEN6:
3704 case NFA_ZOPEN7:
3705 case NFA_ZOPEN8:
3706 case NFA_ZOPEN9:
3707#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003708 case NFA_NOPEN: // \%( \) "Invisible Submatch"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003709 if (nfa_calc_size == TRUE)
3710 {
3711 nstate += 2;
3712 break;
3713 }
3714
3715 mopen = *p;
3716 switch (*p)
3717 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003718 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3719#ifdef FEAT_SYN_HL
3720 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3721 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3722 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3723 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3724 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3725 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3726 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3727 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3728 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3729 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3730#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003731 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003732 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003733 // NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003734 mclose = *p + NSUBEXP;
3735 break;
3736 }
3737
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003738 // Allow "NFA_MOPEN" as a valid postfix representation for
3739 // the empty regexp "". In this case, the NFA will be
3740 // NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3741 // empty groups of parenthesis, and empty mbyte chars
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003742 if (stackp == stack)
3743 {
Bram Moolenaar525666f2013-06-02 16:40:55 +02003744 s = alloc_state(mopen, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003745 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003746 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003747 s1 = alloc_state(mclose, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003748 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003749 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003750 patch(list1(&s->out), s1);
3751 PUSH(frag(s, list1(&s1->out)));
3752 break;
3753 }
3754
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003755 // At least one node was emitted before NFA_MOPEN, so
3756 // at least one node will be between NFA_MOPEN and NFA_MCLOSE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003757 e = POP();
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003758 s = alloc_state(mopen, e.start, NULL); // `('
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003759 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003760 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003761
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003762 s1 = alloc_state(mclose, NULL, NULL); // `)'
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003763 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003764 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003765 patch(e.out, s1);
3766
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003767 if (mopen == NFA_COMPOSING)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003768 // COMPOSING->out1 = END_COMPOSING
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003769 patch(list1(&s->out1), s1);
3770
3771 PUSH(frag(s, list1(&s1->out)));
3772 break;
3773
Bram Moolenaar5714b802013-05-28 22:03:20 +02003774 case NFA_BACKREF1:
3775 case NFA_BACKREF2:
3776 case NFA_BACKREF3:
3777 case NFA_BACKREF4:
3778 case NFA_BACKREF5:
3779 case NFA_BACKREF6:
3780 case NFA_BACKREF7:
3781 case NFA_BACKREF8:
3782 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003783#ifdef FEAT_SYN_HL
3784 case NFA_ZREF1:
3785 case NFA_ZREF2:
3786 case NFA_ZREF3:
3787 case NFA_ZREF4:
3788 case NFA_ZREF5:
3789 case NFA_ZREF6:
3790 case NFA_ZREF7:
3791 case NFA_ZREF8:
3792 case NFA_ZREF9:
3793#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02003794 if (nfa_calc_size == TRUE)
3795 {
3796 nstate += 2;
3797 break;
3798 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003799 s = alloc_state(*p, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003800 if (s == NULL)
3801 goto theend;
Bram Moolenaar525666f2013-06-02 16:40:55 +02003802 s1 = alloc_state(NFA_SKIP, NULL, NULL);
Bram Moolenaar5714b802013-05-28 22:03:20 +02003803 if (s1 == NULL)
3804 goto theend;
3805 patch(list1(&s->out), s1);
3806 PUSH(frag(s, list1(&s1->out)));
3807 break;
3808
Bram Moolenaar423532e2013-05-29 21:14:42 +02003809 case NFA_LNUM:
3810 case NFA_LNUM_GT:
3811 case NFA_LNUM_LT:
3812 case NFA_VCOL:
3813 case NFA_VCOL_GT:
3814 case NFA_VCOL_LT:
3815 case NFA_COL:
3816 case NFA_COL_GT:
3817 case NFA_COL_LT:
Bram Moolenaar044aa292013-06-04 21:27:38 +02003818 case NFA_MARK:
3819 case NFA_MARK_GT:
3820 case NFA_MARK_LT:
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003821 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003822 int n = *++p; // lnum, col or mark name
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003823
Bram Moolenaar423532e2013-05-29 21:14:42 +02003824 if (nfa_calc_size == TRUE)
3825 {
3826 nstate += 1;
3827 break;
3828 }
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003829 s = alloc_state(p[-1], NULL, NULL);
Bram Moolenaar423532e2013-05-29 21:14:42 +02003830 if (s == NULL)
3831 goto theend;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003832 s->val = n;
Bram Moolenaar423532e2013-05-29 21:14:42 +02003833 PUSH(frag(s, list1(&s->out)));
3834 break;
Bram Moolenaard75799ab72013-06-05 11:05:17 +02003835 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02003836
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003837 case NFA_ZSTART:
3838 case NFA_ZEND:
3839 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003840 // Operands
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003841 if (nfa_calc_size == TRUE)
3842 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003843 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003844 break;
3845 }
Bram Moolenaar525666f2013-06-02 16:40:55 +02003846 s = alloc_state(*p, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003847 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003848 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003849 PUSH(frag(s, list1(&s->out)));
3850 break;
3851
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003852 } // switch(*p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003853
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003854 } // for(p = postfix; *p; ++p)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003855
3856 if (nfa_calc_size == TRUE)
3857 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02003858 nstate++;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003859 goto theend; // Return value when counting size is ignored anyway
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003860 }
3861
3862 e = POP();
3863 if (stackp != stack)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003864 {
3865 vim_free(stack);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003866 EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack"));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003867 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003868
3869 if (istate >= nstate)
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003870 {
3871 vim_free(stack);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003872 EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA "));
Bram Moolenaar50ab9942015-04-13 15:28:12 +02003873 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003874
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003875 matchstate = &state_ptr[istate++]; // the match state
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003876 matchstate->c = NFA_MATCH;
3877 matchstate->out = matchstate->out1 = NULL;
Bram Moolenaar417bad22013-06-07 14:08:30 +02003878 matchstate->id = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003879
3880 patch(e.out, matchstate);
Bram Moolenaarb09d9832013-05-21 16:28:11 +02003881 ret = e.start;
3882
3883theend:
3884 vim_free(stack);
3885 return ret;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003886
3887#undef POP1
3888#undef PUSH1
3889#undef POP2
3890#undef PUSH2
3891#undef POP
3892#undef PUSH
3893}
3894
Bram Moolenaara2947e22013-06-11 22:44:09 +02003895/*
3896 * After building the NFA program, inspect it to add optimization hints.
3897 */
3898 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01003899nfa_postprocess(nfa_regprog_T *prog)
Bram Moolenaara2947e22013-06-11 22:44:09 +02003900{
3901 int i;
3902 int c;
3903
3904 for (i = 0; i < prog->nstate; ++i)
3905 {
3906 c = prog->state[i].c;
3907 if (c == NFA_START_INVISIBLE
3908 || c == NFA_START_INVISIBLE_NEG
3909 || c == NFA_START_INVISIBLE_BEFORE
3910 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3911 {
3912 int directly;
3913
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003914 // Do it directly when what follows is possibly the end of the
3915 // match.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003916 if (match_follows(prog->state[i].out1->out, 0))
3917 directly = TRUE;
3918 else
3919 {
3920 int ch_invisible = failure_chance(prog->state[i].out, 0);
3921 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3922
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003923 // Postpone when the invisible match is expensive or has a
3924 // lower chance of failing.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003925 if (c == NFA_START_INVISIBLE_BEFORE
3926 || c == NFA_START_INVISIBLE_BEFORE_NEG)
3927 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003928 // "before" matches are very expensive when
3929 // unbounded, always prefer what follows then,
3930 // unless what follows will always match.
3931 // Otherwise strongly prefer what follows.
Bram Moolenaara2947e22013-06-11 22:44:09 +02003932 if (prog->state[i].val <= 0 && ch_follows > 0)
3933 directly = FALSE;
3934 else
3935 directly = ch_follows * 10 < ch_invisible;
3936 }
3937 else
3938 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003939 // normal invisible, first do the one with the
3940 // highest failure chance
Bram Moolenaara2947e22013-06-11 22:44:09 +02003941 directly = ch_follows < ch_invisible;
3942 }
3943 }
3944 if (directly)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003945 // switch to the _FIRST state
Bram Moolenaara2947e22013-06-11 22:44:09 +02003946 ++prog->state[i].c;
3947 }
3948 }
3949}
3950
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003951/////////////////////////////////////////////////////////////////
3952// NFA execution code.
3953/////////////////////////////////////////////////////////////////
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003954
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003955typedef struct
3956{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003957 int in_use; // number of subexpr with useful info
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003958
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003959 // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003960 union
3961 {
3962 struct multipos
3963 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01003964 linenr_T start_lnum;
3965 linenr_T end_lnum;
3966 colnr_T start_col;
3967 colnr_T end_col;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003968 } multi[NSUBEXP];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003969 struct linepos
3970 {
3971 char_u *start;
3972 char_u *end;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003973 } line[NSUBEXP];
3974 } list;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003975} regsub_T;
3976
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003977typedef struct
3978{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003979 regsub_T norm; // \( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003980#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003981 regsub_T synt; // \z( .. \) matches
Bram Moolenaarefb23f22013-06-01 23:02:54 +02003982#endif
3983} regsubs_T;
3984
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003985// nfa_pim_T stores a Postponed Invisible Match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02003986typedef struct nfa_pim_S nfa_pim_T;
3987struct nfa_pim_S
3988{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003989 int result; // NFA_PIM_*, see below
3990 nfa_state_T *state; // the invisible match start state
3991 regsubs_T subs; // submatch info, only party used
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02003992 union
3993 {
3994 lpos_T pos;
3995 char_u *ptr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003996 } end; // where the match must end
Bram Moolenaara2d95102013-06-04 14:23:05 +02003997};
3998
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003999// Values for done in nfa_pim_T.
4000#define NFA_PIM_UNUSED 0 // pim not used
4001#define NFA_PIM_TODO 1 // pim not done yet
4002#define NFA_PIM_MATCH 2 // pim executed, matches
4003#define NFA_PIM_NOMATCH 3 // pim executed, no match
Bram Moolenaara2d95102013-06-04 14:23:05 +02004004
4005
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004006// nfa_thread_T contains execution information of a NFA state
Bram Moolenaar4b417062013-05-25 20:19:50 +02004007typedef struct
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004008{
4009 nfa_state_T *state;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004010 int count;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004011 nfa_pim_T pim; // if pim.result != NFA_PIM_UNUSED: postponed
4012 // invisible match
4013 regsubs_T subs; // submatch info, only party used
Bram Moolenaar4b417062013-05-25 20:19:50 +02004014} nfa_thread_T;
4015
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004016// nfa_list_T contains the alternative NFA execution states.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004017typedef struct
4018{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004019 nfa_thread_T *t; // allocated array of states
4020 int n; // nr of states currently in "t"
4021 int len; // max nr of states in "t"
4022 int id; // ID of the list
4023 int has_pim; // TRUE when any state has a PIM
Bram Moolenaar4b417062013-05-25 20:19:50 +02004024} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004025
Bram Moolenaar5714b802013-05-28 22:03:20 +02004026#ifdef ENABLE_LOG
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004027static void log_subexpr(regsub_T *sub);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004028
4029 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004030log_subsexpr(regsubs_T *subs)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004031{
4032 log_subexpr(&subs->norm);
4033# ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004034 if (rex.nfa_has_zsubexpr)
Bram Moolenaar6d3a5d72013-06-06 18:04:51 +02004035 log_subexpr(&subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004036# endif
4037}
4038
Bram Moolenaar5714b802013-05-28 22:03:20 +02004039 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004040log_subexpr(regsub_T *sub)
Bram Moolenaar5714b802013-05-28 22:03:20 +02004041{
4042 int j;
4043
4044 for (j = 0; j < sub->in_use; j++)
4045 if (REG_MULTI)
Bram Moolenaar87953742013-06-05 18:52:40 +02004046 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004047 j,
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004048 sub->list.multi[j].start_col,
4049 (int)sub->list.multi[j].start_lnum,
4050 sub->list.multi[j].end_col,
4051 (int)sub->list.multi[j].end_lnum);
Bram Moolenaar5714b802013-05-28 22:03:20 +02004052 else
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004053 {
4054 char *s = (char *)sub->list.line[j].start;
4055 char *e = (char *)sub->list.line[j].end;
4056
Bram Moolenaar87953742013-06-05 18:52:40 +02004057 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
Bram Moolenaar5714b802013-05-28 22:03:20 +02004058 j,
Bram Moolenaar5b84ddc2013-06-05 16:33:10 +02004059 s == NULL ? "NULL" : s,
4060 e == NULL ? "NULL" : e);
4061 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004062}
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004063
4064 static char *
Bram Moolenaar05540972016-01-30 20:31:25 +01004065pim_info(nfa_pim_T *pim)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004066{
4067 static char buf[30];
4068
4069 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
4070 buf[0] = NUL;
4071 else
4072 {
4073 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
Bram Moolenaar0270f382018-07-17 05:43:58 +02004074 : (int)(pim->end.ptr - rex.input));
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004075 }
4076 return buf;
4077}
4078
Bram Moolenaar5714b802013-05-28 22:03:20 +02004079#endif
4080
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004081// Used during execution: whether a match has been found.
Bram Moolenaar2338c322018-07-08 19:07:19 +02004082static int nfa_match;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004083#ifdef FEAT_RELTIME
4084static proftime_T *nfa_time_limit;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02004085static int *nfa_timed_out;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004086static int nfa_time_count;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01004087#endif
Bram Moolenaar4b417062013-05-25 20:19:50 +02004088
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004089static void copy_sub(regsub_T *to, regsub_T *from);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01004090static int pim_equal(nfa_pim_T *one, nfa_pim_T *two);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004091
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004092/*
4093 * Copy postponed invisible match info from "from" to "to".
4094 */
4095 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004096copy_pim(nfa_pim_T *to, nfa_pim_T *from)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004097{
4098 to->result = from->result;
4099 to->state = from->state;
4100 copy_sub(&to->subs.norm, &from->subs.norm);
4101#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004102 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004103 copy_sub(&to->subs.synt, &from->subs.synt);
4104#endif
4105 to->end = from->end;
4106}
4107
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004108 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004109clear_sub(regsub_T *sub)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004110{
4111 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004112 // Use 0xff to set lnum to -1
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004113 vim_memset(sub->list.multi, 0xff,
Bram Moolenaar0270f382018-07-17 05:43:58 +02004114 sizeof(struct multipos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004115 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02004116 vim_memset(sub->list.line, 0,
4117 sizeof(struct linepos) * rex.nfa_nsubexpr);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004118 sub->in_use = 0;
4119}
4120
4121/*
4122 * Copy the submatches from "from" to "to".
4123 */
4124 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004125copy_sub(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004126{
4127 to->in_use = from->in_use;
4128 if (from->in_use > 0)
4129 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004130 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004131 if (REG_MULTI)
4132 mch_memmove(&to->list.multi[0],
4133 &from->list.multi[0],
4134 sizeof(struct multipos) * from->in_use);
4135 else
4136 mch_memmove(&to->list.line[0],
4137 &from->list.line[0],
4138 sizeof(struct linepos) * from->in_use);
4139 }
4140}
4141
4142/*
4143 * Like copy_sub() but exclude the main match.
4144 */
4145 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004146copy_sub_off(regsub_T *to, regsub_T *from)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004147{
4148 if (to->in_use < from->in_use)
4149 to->in_use = from->in_use;
4150 if (from->in_use > 1)
4151 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004152 // Copy the match start and end positions.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004153 if (REG_MULTI)
4154 mch_memmove(&to->list.multi[1],
4155 &from->list.multi[1],
4156 sizeof(struct multipos) * (from->in_use - 1));
4157 else
4158 mch_memmove(&to->list.line[1],
4159 &from->list.line[1],
4160 sizeof(struct linepos) * (from->in_use - 1));
4161 }
4162}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004163
Bram Moolenaar428e9872013-05-30 17:05:39 +02004164/*
Bram Moolenaarf2118842013-09-25 18:16:38 +02004165 * Like copy_sub() but only do the end of the main match if \ze is present.
4166 */
4167 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01004168copy_ze_off(regsub_T *to, regsub_T *from)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004169{
Bram Moolenaar0270f382018-07-17 05:43:58 +02004170 if (rex.nfa_has_zend)
Bram Moolenaarf2118842013-09-25 18:16:38 +02004171 {
4172 if (REG_MULTI)
4173 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004174 if (from->list.multi[0].end_lnum >= 0)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004175 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004176 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
4177 to->list.multi[0].end_col = from->list.multi[0].end_col;
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01004178 }
Bram Moolenaarf2118842013-09-25 18:16:38 +02004179 }
4180 else
4181 {
4182 if (from->list.line[0].end != NULL)
4183 to->list.line[0].end = from->list.line[0].end;
4184 }
4185 }
4186}
4187
4188/*
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004189 * Return TRUE if "sub1" and "sub2" have the same start positions.
Bram Moolenaaree482532014-05-13 15:56:51 +02004190 * When using back-references also check the end position.
Bram Moolenaar428e9872013-05-30 17:05:39 +02004191 */
4192 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004193sub_equal(regsub_T *sub1, regsub_T *sub2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004194{
4195 int i;
4196 int todo;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004197 linenr_T s1;
4198 linenr_T s2;
4199 char_u *sp1;
4200 char_u *sp2;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004201
4202 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
4203 if (REG_MULTI)
4204 {
4205 for (i = 0; i < todo; ++i)
4206 {
4207 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004208 s1 = sub1->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004209 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004210 s1 = -1;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004211 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004212 s2 = sub2->list.multi[i].start_lnum;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004213 else
Bram Moolenaara0169122013-06-26 18:16:58 +02004214 s2 = -1;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004215 if (s1 != s2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004216 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004217 if (s1 != -1 && sub1->list.multi[i].start_col
4218 != sub2->list.multi[i].start_col)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004219 return FALSE;
Bram Moolenaaree482532014-05-13 15:56:51 +02004220
Bram Moolenaar0270f382018-07-17 05:43:58 +02004221 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004222 {
4223 if (i < sub1->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004224 s1 = sub1->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004225 else
4226 s1 = -1;
4227 if (i < sub2->in_use)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004228 s2 = sub2->list.multi[i].end_lnum;
Bram Moolenaaree482532014-05-13 15:56:51 +02004229 else
4230 s2 = -1;
4231 if (s1 != s2)
4232 return FALSE;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004233 if (s1 != -1 && sub1->list.multi[i].end_col
4234 != sub2->list.multi[i].end_col)
Bram Moolenaaree482532014-05-13 15:56:51 +02004235 return FALSE;
4236 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004237 }
4238 }
4239 else
4240 {
4241 for (i = 0; i < todo; ++i)
4242 {
4243 if (i < sub1->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004244 sp1 = sub1->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004245 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004246 sp1 = NULL;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004247 if (i < sub2->in_use)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004248 sp2 = sub2->list.line[i].start;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004249 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02004250 sp2 = NULL;
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004251 if (sp1 != sp2)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004252 return FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004253 if (rex.nfa_has_backref)
Bram Moolenaaree482532014-05-13 15:56:51 +02004254 {
4255 if (i < sub1->in_use)
4256 sp1 = sub1->list.line[i].end;
4257 else
4258 sp1 = NULL;
4259 if (i < sub2->in_use)
4260 sp2 = sub2->list.line[i].end;
4261 else
4262 sp2 = NULL;
4263 if (sp1 != sp2)
4264 return FALSE;
4265 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004266 }
4267 }
4268
4269 return TRUE;
4270}
4271
#ifdef ENABLE_LOG
/*
 * Write a line to "log_fd" telling that "action" was done with "state" for
 * the list with ID "lid".  The line includes the start column of the main
 * match in "sub" (-1 when "sub" has nothing in use) and the info for "pim".
 */
    static void
report_state(char *action,
             regsub_T *sub,
             nfa_state_T *state,
             int lid,
             nfa_pim_T *pim)
{
    int start_col = -1;

    if (sub->in_use > 0)
    {
        if (REG_MULTI)
            start_col = sub->list.multi[0].start_col;
        else
            start_col = (int)(sub->list.line[0].start - rex.line);
    }

    // Fills "code" with the text for the state, used in the message below.
    nfa_set_code(state->c);
    fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
            action, abs(state->id), lid, state->c, code, start_col,
            pim_info(pim));
}
#endif
4294
Bram Moolenaar43e02982013-06-07 17:31:29 +02004295/*
4296 * Return TRUE if the same state is already in list "l" with the same
4297 * positions as "subs".
4298 */
4299 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004300has_state_with_pos(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004301 nfa_list_T *l, // runtime state list
4302 nfa_state_T *state, // state to update
4303 regsubs_T *subs, // pointers to subexpressions
4304 nfa_pim_T *pim) // postponed match or NULL
Bram Moolenaar43e02982013-06-07 17:31:29 +02004305{
4306 nfa_thread_T *thread;
4307 int i;
4308
4309 for (i = 0; i < l->n; ++i)
4310 {
4311 thread = &l->t[i];
4312 if (thread->state->id == state->id
4313 && sub_equal(&thread->subs.norm, &subs->norm)
4314#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004315 && (!rex.nfa_has_zsubexpr
Bram Moolenaarc5089bb2013-06-14 21:15:25 +02004316 || sub_equal(&thread->subs.synt, &subs->synt))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004317#endif
Bram Moolenaar69b52452013-07-17 21:10:51 +02004318 && pim_equal(&thread->pim, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004319 return TRUE;
4320 }
4321 return FALSE;
4322}
4323
4324/*
Bram Moolenaar69b52452013-07-17 21:10:51 +02004325 * Return TRUE if "one" and "two" are equal. That includes when both are not
4326 * set.
4327 */
4328 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004329pim_equal(nfa_pim_T *one, nfa_pim_T *two)
Bram Moolenaar69b52452013-07-17 21:10:51 +02004330{
4331 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
4332 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
4333
4334 if (one_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004335 // one is unused: equal when two is also unused
Bram Moolenaar69b52452013-07-17 21:10:51 +02004336 return two_unused;
4337 if (two_unused)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004338 // one is used and two is not: not equal
Bram Moolenaar69b52452013-07-17 21:10:51 +02004339 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004340 // compare the state id
Bram Moolenaar3f0df062013-08-14 13:34:25 +02004341 if (one->state->id != two->state->id)
4342 return FALSE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004343 // compare the position
Bram Moolenaar69b52452013-07-17 21:10:51 +02004344 if (REG_MULTI)
4345 return one->end.pos.lnum == two->end.pos.lnum
4346 && one->end.pos.col == two->end.pos.col;
4347 return one->end.ptr == two->end.ptr;
4348}
4349
4350/*
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004351 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
4352 */
4353 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004354match_follows(nfa_state_T *startstate, int depth)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004355{
4356 nfa_state_T *state = startstate;
4357
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004358 // avoid too much recursion
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004359 if (depth > 10)
4360 return FALSE;
4361
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004362 while (state != NULL)
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004363 {
4364 switch (state->c)
4365 {
4366 case NFA_MATCH:
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004367 case NFA_MCLOSE:
4368 case NFA_END_INVISIBLE:
4369 case NFA_END_INVISIBLE_NEG:
4370 case NFA_END_PATTERN:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004371 return TRUE;
4372
4373 case NFA_SPLIT:
4374 return match_follows(state->out, depth + 1)
4375 || match_follows(state->out1, depth + 1);
4376
4377 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004378 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004379 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004380 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004381 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004382 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004383 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02004384 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004385 case NFA_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004386 // skip ahead to next state
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004387 state = state->out1->out;
Bram Moolenaar690ae9c2013-07-13 20:58:11 +02004388 continue;
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004389
4390 case NFA_ANY:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02004391 case NFA_ANY_COMPOSING:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004392 case NFA_IDENT:
4393 case NFA_SIDENT:
4394 case NFA_KWORD:
4395 case NFA_SKWORD:
4396 case NFA_FNAME:
4397 case NFA_SFNAME:
4398 case NFA_PRINT:
4399 case NFA_SPRINT:
4400 case NFA_WHITE:
4401 case NFA_NWHITE:
4402 case NFA_DIGIT:
4403 case NFA_NDIGIT:
4404 case NFA_HEX:
4405 case NFA_NHEX:
4406 case NFA_OCTAL:
4407 case NFA_NOCTAL:
4408 case NFA_WORD:
4409 case NFA_NWORD:
4410 case NFA_HEAD:
4411 case NFA_NHEAD:
4412 case NFA_ALPHA:
4413 case NFA_NALPHA:
4414 case NFA_LOWER:
4415 case NFA_NLOWER:
4416 case NFA_UPPER:
4417 case NFA_NUPPER:
Bram Moolenaar1cfad522013-08-14 12:06:49 +02004418 case NFA_LOWER_IC:
4419 case NFA_NLOWER_IC:
4420 case NFA_UPPER_IC:
4421 case NFA_NUPPER_IC:
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004422 case NFA_START_COLL:
4423 case NFA_START_NEG_COLL:
4424 case NFA_NEWL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004425 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004426 return FALSE;
4427
4428 default:
4429 if (state->c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004430 // state will advance input
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004431 return FALSE;
4432
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004433 // Others: zero-width or possibly zero-width, might still find
4434 // a match at the same position, keep looking.
Bram Moolenaar1e02e662013-06-08 23:26:27 +02004435 break;
4436 }
4437 state = state->out;
4438 }
4439 return FALSE;
4440}
4441
4442
4443/*
Bram Moolenaar43e02982013-06-07 17:31:29 +02004444 * Return TRUE if "state" is already in list "l".
4445 */
4446 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01004447state_in_list(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004448 nfa_list_T *l, // runtime state list
4449 nfa_state_T *state, // state to update
4450 regsubs_T *subs) // pointers to subexpressions
Bram Moolenaar43e02982013-06-07 17:31:29 +02004451{
4452 if (state->lastlist[nfa_ll_index] == l->id)
4453 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004454 if (!rex.nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004455 return TRUE;
4456 }
4457 return FALSE;
4458}
4459
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004460// Offset used for "off" by addstate_here().
Bram Moolenaar16b35782016-09-09 20:29:50 +02004461#define ADDSTATE_HERE_OFFSET 10
4462
Bram Moolenaard05bf562013-06-30 23:24:08 +02004463/*
 * Add "state" and possibly what follows to state list "l".
4465 * Returns "subs_arg", possibly copied into temp_subs.
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004466 * Returns NULL when recursiveness is too deep.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004467 */
Bram Moolenaard05bf562013-06-30 23:24:08 +02004468 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004469addstate(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004470 nfa_list_T *l, // runtime state list
4471 nfa_state_T *state, // state to update
4472 regsubs_T *subs_arg, // pointers to subexpressions
4473 nfa_pim_T *pim, // postponed look-behind match
4474 int off_arg) // byte offset, when -1 go to next line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004475{
Bram Moolenaar963fee22013-05-26 21:47:28 +02004476 int subidx;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004477 int off = off_arg;
4478 int add_here = FALSE;
4479 int listindex = 0;
4480 int k;
4481 int found = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004482 nfa_thread_T *thread;
Bram Moolenaard5638832016-09-09 17:59:50 +02004483 struct multipos save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004484 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004485 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004486 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004487 regsub_T *sub;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004488 regsubs_T *subs = subs_arg;
4489 static regsubs_T temp_subs;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004490#ifdef ENABLE_LOG
4491 int did_print = FALSE;
4492#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004493 static int depth = 0;
4494
4495 // This function is called recursively. When the depth is too much we run
4496 // out of stack and crash, limit recursiveness here.
Bram Moolenaar5382f122019-02-13 01:18:38 +01004497 if (++depth >= 5000 || subs == NULL)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004498 {
4499 --depth;
4500 return NULL;
4501 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004502
Bram Moolenaar16b35782016-09-09 20:29:50 +02004503 if (off_arg <= -ADDSTATE_HERE_OFFSET)
4504 {
4505 add_here = TRUE;
4506 off = 0;
4507 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
4508 }
4509
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004510 switch (state->c)
4511 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004512 case NFA_NCLOSE:
4513 case NFA_MCLOSE:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004514 case NFA_MCLOSE1:
4515 case NFA_MCLOSE2:
4516 case NFA_MCLOSE3:
4517 case NFA_MCLOSE4:
4518 case NFA_MCLOSE5:
4519 case NFA_MCLOSE6:
4520 case NFA_MCLOSE7:
4521 case NFA_MCLOSE8:
4522 case NFA_MCLOSE9:
4523#ifdef FEAT_SYN_HL
4524 case NFA_ZCLOSE:
4525 case NFA_ZCLOSE1:
4526 case NFA_ZCLOSE2:
4527 case NFA_ZCLOSE3:
4528 case NFA_ZCLOSE4:
4529 case NFA_ZCLOSE5:
4530 case NFA_ZCLOSE6:
4531 case NFA_ZCLOSE7:
4532 case NFA_ZCLOSE8:
4533 case NFA_ZCLOSE9:
4534#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004535 case NFA_MOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004536 case NFA_ZEND:
Bram Moolenaar927d4a12013-06-09 17:25:34 +02004537 case NFA_SPLIT:
Bram Moolenaar699c1202013-09-25 16:41:54 +02004538 case NFA_EMPTY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004539 // These nodes are not added themselves but their "out" and/or
4540 // "out1" may be added below.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004541 break;
4542
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004543 case NFA_BOL:
4544 case NFA_BOF:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004545 // "^" won't match past end-of-line, don't bother trying.
4546 // Except when at the end of the line, or when we are going to the
4547 // next line for a look-behind match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004548 if (rex.input > rex.line
4549 && *rex.input != NUL
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004550 && (nfa_endp == NULL
4551 || !REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02004552 || rex.lnum == nfa_endp->se_u.pos.lnum))
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004553 goto skip_add;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004554 // FALLTHROUGH
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004555
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004556 case NFA_MOPEN1:
4557 case NFA_MOPEN2:
4558 case NFA_MOPEN3:
4559 case NFA_MOPEN4:
4560 case NFA_MOPEN5:
4561 case NFA_MOPEN6:
4562 case NFA_MOPEN7:
4563 case NFA_MOPEN8:
4564 case NFA_MOPEN9:
4565#ifdef FEAT_SYN_HL
4566 case NFA_ZOPEN:
4567 case NFA_ZOPEN1:
4568 case NFA_ZOPEN2:
4569 case NFA_ZOPEN3:
4570 case NFA_ZOPEN4:
4571 case NFA_ZOPEN5:
4572 case NFA_ZOPEN6:
4573 case NFA_ZOPEN7:
4574 case NFA_ZOPEN8:
4575 case NFA_ZOPEN9:
4576#endif
Bram Moolenaar398d53d2013-08-01 15:45:52 +02004577 case NFA_NOPEN:
Bram Moolenaar67604ae2013-06-05 16:51:57 +02004578 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004579 // These nodes need to be added so that we can bail out when it
4580 // was added to this list before at the same position to avoid an
4581 // endless loop for "\(\)*"
Bram Moolenaar307aa162013-06-02 16:34:21 +02004582
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004583 default:
Bram Moolenaar272fb582013-11-21 16:03:40 +01004584 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004585 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004586 // This state is already in the list, don't add it again,
4587 // unless it is an MOPEN that is used for a backreference or
4588 // when there is a PIM. For NFA_MATCH check the position,
4589 // lower position is preferred.
Bram Moolenaar0270f382018-07-17 05:43:58 +02004590 if (!rex.nfa_has_backref && pim == NULL && !l->has_pim
Bram Moolenaar9c235062014-05-13 16:44:29 +02004591 && state->c != NFA_MATCH)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004592 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004593 // When called from addstate_here() do insert before
4594 // existing states.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004595 if (add_here)
4596 {
4597 for (k = 0; k < l->n && k < listindex; ++k)
4598 if (l->t[k].state->id == state->id)
4599 {
4600 found = TRUE;
4601 break;
4602 }
4603 }
4604 if (!add_here || found)
4605 {
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004606skip_add:
4607#ifdef ENABLE_LOG
Bram Moolenaar16b35782016-09-09 20:29:50 +02004608 nfa_set_code(state->c);
4609 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4610 abs(state->id), l->id, state->c, code,
4611 pim == NULL ? "NULL" : "yes", l->has_pim, found);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004612#endif
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004613 --depth;
Bram Moolenaar16b35782016-09-09 20:29:50 +02004614 return subs;
4615 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004616 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02004617
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004618 // Do not add the state again when it exists with the same
4619 // positions.
Bram Moolenaar69b52452013-07-17 21:10:51 +02004620 if (has_state_with_pos(l, state, subs, pim))
Bram Moolenaar43e02982013-06-07 17:31:29 +02004621 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004622 }
4623
Bram Moolenaar688b3982019-02-13 21:47:36 +01004624 // When there are backreferences or PIMs the number of states may
4625 // be (a lot) bigger than anticipated.
Bram Moolenaara0169122013-06-26 18:16:58 +02004626 if (l->n == l->len)
Bram Moolenaar428e9872013-05-30 17:05:39 +02004627 {
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004628 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004629 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004630 nfa_thread_T *newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004631
Bram Moolenaar688b3982019-02-13 21:47:36 +01004632 if ((long)(newsize >> 10) >= p_mmp)
4633 {
4634 emsg(_(e_maxmempat));
4635 --depth;
4636 return NULL;
4637 }
Bram Moolenaard05bf562013-06-30 23:24:08 +02004638 if (subs != &temp_subs)
4639 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004640 // "subs" may point into the current array, need to make a
4641 // copy before it becomes invalid.
Bram Moolenaard05bf562013-06-30 23:24:08 +02004642 copy_sub(&temp_subs.norm, &subs->norm);
4643#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004644 if (rex.nfa_has_zsubexpr)
Bram Moolenaard05bf562013-06-30 23:24:08 +02004645 copy_sub(&temp_subs.synt, &subs->synt);
4646#endif
4647 subs = &temp_subs;
4648 }
4649
Bram Moolenaar688b3982019-02-13 21:47:36 +01004650 newt = vim_realloc(l->t, newsize);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004651 if (newt == NULL)
4652 {
4653 // out of memory
4654 --depth;
4655 return NULL;
4656 }
4657 l->t = newt;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004658 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004659 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004660
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004661 // add the state to the list
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02004662 state->lastlist[nfa_ll_index] = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004663 thread = &l->t[l->n++];
4664 thread->state = state;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004665 if (pim == NULL)
4666 thread->pim.result = NFA_PIM_UNUSED;
4667 else
Bram Moolenaar196ed142013-07-21 18:59:24 +02004668 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004669 copy_pim(&thread->pim, pim);
Bram Moolenaar196ed142013-07-21 18:59:24 +02004670 l->has_pim = TRUE;
4671 }
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004672 copy_sub(&thread->subs.norm, &subs->norm);
4673#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02004674 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004675 copy_sub(&thread->subs.synt, &subs->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004676#endif
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004677#ifdef ENABLE_LOG
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004678 report_state("Adding", &thread->subs.norm, state, l->id, pim);
Bram Moolenaarf6de0322013-06-02 21:30:04 +02004679 did_print = TRUE;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004680#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004681 }
4682
4683#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004684 if (!did_print)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02004685 report_state("Processing", &subs->norm, state, l->id, pim);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004686#endif
4687 switch (state->c)
4688 {
4689 case NFA_MATCH:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004690 break;
4691
4692 case NFA_SPLIT:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004693 // order matters here
Bram Moolenaar16b35782016-09-09 20:29:50 +02004694 subs = addstate(l, state->out, subs, pim, off_arg);
4695 subs = addstate(l, state->out1, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004696 break;
4697
Bram Moolenaar699c1202013-09-25 16:41:54 +02004698 case NFA_EMPTY:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004699 case NFA_NOPEN:
4700 case NFA_NCLOSE:
Bram Moolenaar16b35782016-09-09 20:29:50 +02004701 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004702 break;
4703
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004704 case NFA_MOPEN:
4705 case NFA_MOPEN1:
4706 case NFA_MOPEN2:
4707 case NFA_MOPEN3:
4708 case NFA_MOPEN4:
4709 case NFA_MOPEN5:
4710 case NFA_MOPEN6:
4711 case NFA_MOPEN7:
4712 case NFA_MOPEN8:
4713 case NFA_MOPEN9:
4714#ifdef FEAT_SYN_HL
4715 case NFA_ZOPEN:
4716 case NFA_ZOPEN1:
4717 case NFA_ZOPEN2:
4718 case NFA_ZOPEN3:
4719 case NFA_ZOPEN4:
4720 case NFA_ZOPEN5:
4721 case NFA_ZOPEN6:
4722 case NFA_ZOPEN7:
4723 case NFA_ZOPEN8:
4724 case NFA_ZOPEN9:
4725#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004726 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004727 if (state->c == NFA_ZSTART)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004728 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004729 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004730 sub = &subs->norm;
4731 }
4732#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004733 else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004734 {
4735 subidx = state->c - NFA_ZOPEN;
4736 sub = &subs->synt;
4737 }
4738#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004739 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004740 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004741 subidx = state->c - NFA_MOPEN;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004742 sub = &subs->norm;
4743 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004744
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004745 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004746 save_ptr = NULL;
Bram Moolenaara80faa82020-04-12 19:37:17 +02004747 CLEAR_FIELD(save_multipos);
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004748
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004749 // Set the position (with "off" added) in the subexpression. Save
4750 // and restore it when it was in use. Otherwise fill any gap.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004751 if (REG_MULTI)
4752 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004753 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004754 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004755 save_multipos = sub->list.multi[subidx];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004756 save_in_use = -1;
4757 }
4758 else
4759 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004760 save_in_use = sub->in_use;
4761 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004762 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004763 sub->list.multi[i].start_lnum = -1;
4764 sub->list.multi[i].end_lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004765 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004766 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004767 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02004768 if (off == -1)
4769 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004770 sub->list.multi[subidx].start_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004771 sub->list.multi[subidx].start_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004772 }
4773 else
4774 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004775 sub->list.multi[subidx].start_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004776 sub->list.multi[subidx].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004777 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004778 }
Bram Moolenaarc2b717e2015-09-29 15:06:14 +02004779 sub->list.multi[subidx].end_lnum = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004780 }
4781 else
4782 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004783 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004784 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004785 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004786 save_in_use = -1;
4787 }
4788 else
4789 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02004790 save_in_use = sub->in_use;
4791 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004792 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004793 sub->list.line[i].start = NULL;
4794 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004795 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02004796 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004797 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02004798 sub->list.line[subidx].start = rex.input + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004799 }
4800
Bram Moolenaar16b35782016-09-09 20:29:50 +02004801 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004802 if (subs == NULL)
4803 break;
4804 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004805#ifdef FEAT_SYN_HL
4806 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9)
4807 sub = &subs->synt;
4808 else
4809#endif
4810 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004811
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004812 if (save_in_use == -1)
4813 {
4814 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004815 sub->list.multi[subidx] = save_multipos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004816 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004817 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004818 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004819 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02004820 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004821 break;
4822
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004823 case NFA_MCLOSE:
Bram Moolenaar0270f382018-07-17 05:43:58 +02004824 if (rex.nfa_has_zend && (REG_MULTI
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004825 ? subs->norm.list.multi[0].end_lnum >= 0
Bram Moolenaar9be44812013-09-05 21:15:44 +02004826 : subs->norm.list.line[0].end != NULL))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004827 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004828 // Do not overwrite the position set by \ze.
Bram Moolenaar16b35782016-09-09 20:29:50 +02004829 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004830 break;
4831 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004832 // FALLTHROUGH
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004833 case NFA_MCLOSE1:
4834 case NFA_MCLOSE2:
4835 case NFA_MCLOSE3:
4836 case NFA_MCLOSE4:
4837 case NFA_MCLOSE5:
4838 case NFA_MCLOSE6:
4839 case NFA_MCLOSE7:
4840 case NFA_MCLOSE8:
4841 case NFA_MCLOSE9:
4842#ifdef FEAT_SYN_HL
4843 case NFA_ZCLOSE:
4844 case NFA_ZCLOSE1:
4845 case NFA_ZCLOSE2:
4846 case NFA_ZCLOSE3:
4847 case NFA_ZCLOSE4:
4848 case NFA_ZCLOSE5:
4849 case NFA_ZCLOSE6:
4850 case NFA_ZCLOSE7:
4851 case NFA_ZCLOSE8:
4852 case NFA_ZCLOSE9:
4853#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004854 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004855 if (state->c == NFA_ZEND)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004856 {
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004857 subidx = 0;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004858 sub = &subs->norm;
4859 }
4860#ifdef FEAT_SYN_HL
Bram Moolenaarebefd992013-08-14 14:18:40 +02004861 else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004862 {
4863 subidx = state->c - NFA_ZCLOSE;
4864 sub = &subs->synt;
4865 }
4866#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02004867 else
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004868 {
Bram Moolenaar963fee22013-05-26 21:47:28 +02004869 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02004870 sub = &subs->norm;
4871 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004872
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004873 // We don't fill in gaps here, there must have been an MOPEN that
4874 // has done that.
Bram Moolenaar5714b802013-05-28 22:03:20 +02004875 save_in_use = sub->in_use;
4876 if (sub->in_use <= subidx)
4877 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004878 if (REG_MULTI)
4879 {
Bram Moolenaard5638832016-09-09 17:59:50 +02004880 save_multipos = sub->list.multi[subidx];
Bram Moolenaar35b23862013-05-22 23:00:40 +02004881 if (off == -1)
4882 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004883 sub->list.multi[subidx].end_lnum = rex.lnum + 1;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004884 sub->list.multi[subidx].end_col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004885 }
4886 else
4887 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02004888 sub->list.multi[subidx].end_lnum = rex.lnum;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01004889 sub->list.multi[subidx].end_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02004890 (colnr_T)(rex.input - rex.line + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02004891 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004892 // avoid compiler warnings
Bram Moolenaarde9149e2013-07-17 19:22:13 +02004893 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004894 }
4895 else
4896 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004897 save_ptr = sub->list.line[subidx].end;
Bram Moolenaar0270f382018-07-17 05:43:58 +02004898 sub->list.line[subidx].end = rex.input + off;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004899 // avoid compiler warnings
Bram Moolenaara80faa82020-04-12 19:37:17 +02004900 CLEAR_FIELD(save_multipos);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004901 }
4902
Bram Moolenaar16b35782016-09-09 20:29:50 +02004903 subs = addstate(l, state->out, subs, pim, off_arg);
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004904 if (subs == NULL)
4905 break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004906 // "subs" may have changed, need to set "sub" again
Bram Moolenaarebefd992013-08-14 14:18:40 +02004907#ifdef FEAT_SYN_HL
4908 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9)
4909 sub = &subs->synt;
4910 else
4911#endif
4912 sub = &subs->norm;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004913
4914 if (REG_MULTI)
Bram Moolenaard5638832016-09-09 17:59:50 +02004915 sub->list.multi[subidx] = save_multipos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004916 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004917 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004918 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004919 break;
4920 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004921 --depth;
Bram Moolenaard05bf562013-06-30 23:24:08 +02004922 return subs;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004923}
4924
4925/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02004926 * Like addstate(), but the new state(s) are put at position "*ip".
4927 * Used for zero-width matches, next state to use is the added one.
4928 * This makes sure the order of states to be tried does not change, which
4929 * matters for alternatives.
4930 */
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004931 static regsubs_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01004932addstate_here(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004933 nfa_list_T *l, // runtime state list
4934 nfa_state_T *state, // state to update
4935 regsubs_T *subs, // pointers to subexpressions
4936 nfa_pim_T *pim, // postponed look-behind match
Bram Moolenaar05540972016-01-30 20:31:25 +01004937 int *ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004938{
4939 int tlen = l->n;
4940 int count;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004941 int listidx = *ip;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004942 regsubs_T *r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004943
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004944 // First add the state(s) at the end, so that we know how many there are.
4945 // Pass the listidx as offset (avoids adding another argument to
4946 // addstate().
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004947 r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4948 if (r == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004949 return NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02004950
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004951 // when "*ip" was at the end of the list, nothing to do
Bram Moolenaara2d95102013-06-04 14:23:05 +02004952 if (listidx + 1 == tlen)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004953 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02004954
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004955 // re-order to put the new state at the current position
Bram Moolenaar4b417062013-05-25 20:19:50 +02004956 count = l->n - tlen;
Bram Moolenaara50d02d2013-06-16 15:43:50 +02004957 if (count == 0)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004958 return r; // no state got added
Bram Moolenaar428e9872013-05-30 17:05:39 +02004959 if (count == 1)
4960 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01004961 // overwrite the current state
Bram Moolenaara2d95102013-06-04 14:23:05 +02004962 l->t[listidx] = l->t[l->n - 1];
Bram Moolenaar428e9872013-05-30 17:05:39 +02004963 }
4964 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02004965 {
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004966 if (l->n + count - 1 >= l->len)
4967 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004968 // not enough space to move the new states, reallocate the list
4969 // and move the states to the right position
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004970 int newlen = l->len * 3 / 2 + 50;
Bram Moolenaar688b3982019-02-13 21:47:36 +01004971 size_t newsize = newlen * sizeof(nfa_thread_T);
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004972 nfa_thread_T *newl;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004973
Bram Moolenaar688b3982019-02-13 21:47:36 +01004974 if ((long)(newsize >> 10) >= p_mmp)
4975 {
4976 emsg(_(e_maxmempat));
4977 return NULL;
4978 }
Bram Moolenaarc799fe22019-05-28 23:08:19 +02004979 newl = alloc(newsize);
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004980 if (newl == NULL)
Bram Moolenaar15bbd6e2019-02-13 20:31:50 +01004981 return NULL;
4982 l->len = newlen;
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004983 mch_memmove(&(newl[0]),
4984 &(l->t[0]),
4985 sizeof(nfa_thread_T) * listidx);
4986 mch_memmove(&(newl[listidx]),
4987 &(l->t[l->n - count]),
4988 sizeof(nfa_thread_T) * count);
4989 mch_memmove(&(newl[listidx + count]),
4990 &(l->t[listidx + 1]),
4991 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
4992 vim_free(l->t);
4993 l->t = newl;
4994 }
4995 else
4996 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01004997 // make space for new states, then move them from the
4998 // end to the current position
Bram Moolenaar55480dc2013-06-30 13:17:24 +02004999 mch_memmove(&(l->t[listidx + count]),
5000 &(l->t[listidx + 1]),
5001 sizeof(nfa_thread_T) * (l->n - listidx - 1));
5002 mch_memmove(&(l->t[listidx]),
5003 &(l->t[l->n - 1]),
5004 sizeof(nfa_thread_T) * count);
5005 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005006 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02005007 --l->n;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005008 *ip = listidx - 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005009
5010 return r;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005011}
5012
5013/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005014 * Check character class "class" against current character c.
5015 */
5016 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005017check_char_class(int class, int c)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005018{
5019 switch (class)
5020 {
5021 case NFA_CLASS_ALNUM:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005022 if (c >= 1 && c < 128 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005023 return OK;
5024 break;
5025 case NFA_CLASS_ALPHA:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005026 if (c >= 1 && c < 128 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005027 return OK;
5028 break;
5029 case NFA_CLASS_BLANK:
5030 if (c == ' ' || c == '\t')
5031 return OK;
5032 break;
5033 case NFA_CLASS_CNTRL:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005034 if (c >= 1 && c <= 127 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005035 return OK;
5036 break;
5037 case NFA_CLASS_DIGIT:
5038 if (VIM_ISDIGIT(c))
5039 return OK;
5040 break;
5041 case NFA_CLASS_GRAPH:
Bram Moolenaar0c078fc2017-03-29 15:31:20 +02005042 if (c >= 1 && c <= 127 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005043 return OK;
5044 break;
5045 case NFA_CLASS_LOWER:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005046 if (MB_ISLOWER(c) && c != 170 && c != 186)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005047 return OK;
5048 break;
5049 case NFA_CLASS_PRINT:
5050 if (vim_isprintc(c))
5051 return OK;
5052 break;
5053 case NFA_CLASS_PUNCT:
Bram Moolenaare8aee7d2016-04-26 21:39:13 +02005054 if (c >= 1 && c < 128 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005055 return OK;
5056 break;
5057 case NFA_CLASS_SPACE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005058 if ((c >= 9 && c <= 13) || (c == ' '))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005059 return OK;
5060 break;
5061 case NFA_CLASS_UPPER:
5062 if (MB_ISUPPER(c))
5063 return OK;
5064 break;
5065 case NFA_CLASS_XDIGIT:
5066 if (vim_isxdigit(c))
5067 return OK;
5068 break;
5069 case NFA_CLASS_TAB:
5070 if (c == '\t')
5071 return OK;
5072 break;
5073 case NFA_CLASS_RETURN:
5074 if (c == '\r')
5075 return OK;
5076 break;
5077 case NFA_CLASS_BACKSPACE:
5078 if (c == '\b')
5079 return OK;
5080 break;
5081 case NFA_CLASS_ESCAPE:
5082 if (c == '\033')
5083 return OK;
5084 break;
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01005085 case NFA_CLASS_IDENT:
5086 if (vim_isIDc(c))
5087 return OK;
5088 break;
5089 case NFA_CLASS_KEYWORD:
5090 if (reg_iswordc(c))
5091 return OK;
5092 break;
5093 case NFA_CLASS_FNAME:
5094 if (vim_isfilec(c))
5095 return OK;
5096 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005097
5098 default:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005099 // should not be here :P
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005100 siemsg(_(e_ill_char_class), class);
Bram Moolenaar417bad22013-06-07 14:08:30 +02005101 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005102 }
5103 return FAIL;
5104}
5105
Bram Moolenaar5714b802013-05-28 22:03:20 +02005106/*
5107 * Check for a match with subexpression "subidx".
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005108 * Return TRUE if it matches.
Bram Moolenaar5714b802013-05-28 22:03:20 +02005109 */
5110 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005111match_backref(
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005112 regsub_T *sub, // pointers to subexpressions
Bram Moolenaar05540972016-01-30 20:31:25 +01005113 int subidx,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005114 int *bytelen) // out: length of match in bytes
Bram Moolenaar5714b802013-05-28 22:03:20 +02005115{
5116 int len;
5117
5118 if (sub->in_use <= subidx)
5119 {
5120retempty:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005121 // backref was not set, match an empty string
Bram Moolenaar5714b802013-05-28 22:03:20 +02005122 *bytelen = 0;
5123 return TRUE;
5124 }
5125
5126 if (REG_MULTI)
5127 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005128 if (sub->list.multi[subidx].start_lnum < 0
5129 || sub->list.multi[subidx].end_lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005130 goto retempty;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005131 if (sub->list.multi[subidx].start_lnum == rex.lnum
5132 && sub->list.multi[subidx].end_lnum == rex.lnum)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005133 {
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005134 len = sub->list.multi[subidx].end_col
5135 - sub->list.multi[subidx].start_col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005136 if (cstrncmp(rex.line + sub->list.multi[subidx].start_col,
5137 rex.input, &len) == 0)
Bram Moolenaar580abea2013-06-14 20:31:28 +02005138 {
5139 *bytelen = len;
5140 return TRUE;
5141 }
5142 }
5143 else
5144 {
5145 if (match_with_backref(
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005146 sub->list.multi[subidx].start_lnum,
5147 sub->list.multi[subidx].start_col,
5148 sub->list.multi[subidx].end_lnum,
5149 sub->list.multi[subidx].end_col,
Bram Moolenaar580abea2013-06-14 20:31:28 +02005150 bytelen) == RA_MATCH)
5151 return TRUE;
Bram Moolenaar5714b802013-05-28 22:03:20 +02005152 }
5153 }
5154 else
5155 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005156 if (sub->list.line[subidx].start == NULL
5157 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005158 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02005159 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005160 if (cstrncmp(sub->list.line[subidx].start, rex.input, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02005161 {
5162 *bytelen = len;
5163 return TRUE;
5164 }
5165 }
5166 return FALSE;
5167}
5168
#ifdef FEAT_SYN_HL

/*
 * Check whether the text stored for \z subexpression "subidx" in
 * "re_extmatch_in" matches at the current input position (rex.input).
 * When there is no stored text for "subidx" an empty string is matched.
 * Returns TRUE for a match, with the length of the match in bytes stored in
 * "*bytelen".  Returns FALSE otherwise.
 */
    static int
match_zref(
    int         subidx,
    int         *bytelen)   // out: length of match in bytes
{
    int         len;

    cleanup_zsubexpr();

    if (re_extmatch_in != NULL && re_extmatch_in->matches[subidx] != NULL)
    {
        len = (int)STRLEN(re_extmatch_in->matches[subidx]);
        if (cstrncmp(re_extmatch_in->matches[subidx], rex.input, &len) != 0)
            return FALSE;
        // "len" is passed by reference, use the value cstrncmp() left.
        *bytelen = len;
        return TRUE;
    }

    // backref was not set, match an empty string
    *bytelen = 0;
    return TRUE;
}
#endif
5199
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005200/*
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005201 * Save list IDs for all NFA states of "prog" into "list".
5202 * Also reset the IDs to zero.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005203 * Only used for the recursive value lastlist[1].
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005204 */
5205 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005206nfa_save_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005207{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005208 int i;
5209 nfa_state_T *p;
5210
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005211 // Order in the list is reverse, it's a bit faster that way.
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005212 p = &prog->state[0];
5213 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005214 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005215 list[i] = p->lastlist[1];
5216 p->lastlist[1] = 0;
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005217 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005218 }
5219}
5220
5221/*
5222 * Restore list IDs from "list" to all NFA states.
5223 */
5224 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01005225nfa_restore_listids(nfa_regprog_T *prog, int *list)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005226{
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005227 int i;
5228 nfa_state_T *p;
5229
5230 p = &prog->state[0];
5231 for (i = prog->nstate; --i >= 0; )
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005232 {
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005233 p->lastlist[1] = list[i];
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005234 ++p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005235 }
5236}
5237
Bram Moolenaar423532e2013-05-29 21:14:42 +02005238 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005239nfa_re_num_cmp(long_u val, int op, long_u pos)
Bram Moolenaar423532e2013-05-29 21:14:42 +02005240{
5241 if (op == 1) return pos > val;
5242 if (op == 2) return pos < val;
5243 return val == pos;
5244}
5245
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01005246static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *submatch, regsubs_T *m);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02005247
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005248/*
Bram Moolenaarf46da702013-06-02 22:37:42 +02005249 * Recursively call nfa_regmatch()
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005250 * "pim" is NULL or contains info about a Postponed Invisible Match (start
5251 * position).
Bram Moolenaarf46da702013-06-02 22:37:42 +02005252 */
5253 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005254recursive_regmatch(
5255 nfa_state_T *state,
5256 nfa_pim_T *pim,
5257 nfa_regprog_T *prog,
5258 regsubs_T *submatch,
5259 regsubs_T *m,
Bram Moolenaar2338c322018-07-08 19:07:19 +02005260 int **listids,
5261 int *listids_len)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005262{
Bram Moolenaar0270f382018-07-17 05:43:58 +02005263 int save_reginput_col = (int)(rex.input - rex.line);
5264 int save_reglnum = rex.lnum;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005265 int save_nfa_match = nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005266 int save_nfa_listid = rex.nfa_listid;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005267 save_se_T *save_nfa_endp = nfa_endp;
5268 save_se_T endpos;
5269 save_se_T *endposp = NULL;
5270 int result;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005271 int need_restore = FALSE;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005272
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005273 if (pim != NULL)
5274 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005275 // start at the position where the postponed match was
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005276 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005277 rex.input = rex.line + pim->end.pos.col;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005278 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005279 rex.input = pim->end.ptr;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005280 }
5281
Bram Moolenaardecd9542013-06-07 16:31:50 +02005282 if (state->c == NFA_START_INVISIBLE_BEFORE
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01005283 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5284 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
5285 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005286 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005287 // The recursive match must end at the current position. When "pim" is
5288 // not NULL it specifies the current position.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005289 endposp = &endpos;
5290 if (REG_MULTI)
5291 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005292 if (pim == NULL)
5293 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005294 endpos.se_u.pos.col = (int)(rex.input - rex.line);
5295 endpos.se_u.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005296 }
5297 else
5298 endpos.se_u.pos = pim->end.pos;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005299 }
5300 else
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005301 {
5302 if (pim == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005303 endpos.se_u.ptr = rex.input;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005304 else
5305 endpos.se_u.ptr = pim->end.ptr;
5306 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005307
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005308 // Go back the specified number of bytes, or as far as the
5309 // start of the previous line, to try matching "\@<=" or
5310 // not matching "\@<!". This is very inefficient, limit the number of
5311 // bytes if possible.
Bram Moolenaarf46da702013-06-02 22:37:42 +02005312 if (state->val <= 0)
5313 {
5314 if (REG_MULTI)
5315 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005316 rex.line = reg_getline(--rex.lnum);
5317 if (rex.line == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005318 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005319 rex.line = reg_getline(++rex.lnum);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005320 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005321 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005322 }
5323 else
5324 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005325 if (REG_MULTI && (int)(rex.input - rex.line) < state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005326 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005327 // Not enough bytes in this line, go to end of
5328 // previous line.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005329 rex.line = reg_getline(--rex.lnum);
5330 if (rex.line == NULL)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005331 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005332 // can't go before the first line
Bram Moolenaar0270f382018-07-17 05:43:58 +02005333 rex.line = reg_getline(++rex.lnum);
5334 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005335 }
5336 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005337 rex.input = rex.line + STRLEN(rex.line);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005338 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005339 if ((int)(rex.input - rex.line) >= state->val)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005340 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005341 rex.input -= state->val;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005342 if (has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005343 rex.input -= mb_head_off(rex.line, rex.input);
Bram Moolenaarf46da702013-06-02 22:37:42 +02005344 }
5345 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005346 rex.input = rex.line;
Bram Moolenaarf46da702013-06-02 22:37:42 +02005347 }
5348 }
5349
Bram Moolenaarf46da702013-06-02 22:37:42 +02005350#ifdef ENABLE_LOG
5351 if (log_fd != stderr)
5352 fclose(log_fd);
5353 log_fd = NULL;
5354#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005355 // Have to clear the lastlist field of the NFA nodes, so that
5356 // nfa_regmatch() and addstate() can run properly after recursion.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005357 if (nfa_ll_index == 1)
5358 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005359 // Already calling nfa_regmatch() recursively. Save the lastlist[1]
5360 // values and clear them.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005361 if (*listids == NULL || *listids_len < prog->nstate)
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005362 {
Bram Moolenaar2338c322018-07-08 19:07:19 +02005363 vim_free(*listids);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005364 *listids = ALLOC_MULT(int, prog->nstate);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005365 if (*listids == NULL)
5366 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005367 emsg(_("E878: (NFA) Could not allocate memory for branch traversal!"));
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005368 return 0;
5369 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02005370 *listids_len = prog->nstate;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005371 }
5372 nfa_save_listids(prog, *listids);
5373 need_restore = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005374 // any value of rex.nfa_listid will do
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005375 }
5376 else
5377 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005378 // First recursive nfa_regmatch() call, switch to the second lastlist
5379 // entry. Make sure rex.nfa_listid is different from a previous
5380 // recursive call, because some states may still have this ID.
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005381 ++nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005382 if (rex.nfa_listid <= rex.nfa_alt_listid)
5383 rex.nfa_listid = rex.nfa_alt_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005384 }
5385
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005386 // Call nfa_regmatch() to check if the current concat matches at this
5387 // position. The concat ends with the node NFA_END_INVISIBLE
Bram Moolenaarf46da702013-06-02 22:37:42 +02005388 nfa_endp = endposp;
5389 result = nfa_regmatch(prog, state->out, submatch, m);
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005390
5391 if (need_restore)
5392 nfa_restore_listids(prog, *listids);
5393 else
5394 {
5395 --nfa_ll_index;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005396 rex.nfa_alt_listid = rex.nfa_listid;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02005397 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005398
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005399 // restore position in input text
Bram Moolenaar0270f382018-07-17 05:43:58 +02005400 rex.lnum = save_reglnum;
Bram Moolenaar484d2412013-06-13 19:47:07 +02005401 if (REG_MULTI)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005402 rex.line = reg_getline(rex.lnum);
5403 rex.input = rex.line + save_reginput_col;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005404 if (result != NFA_TOO_EXPENSIVE)
5405 {
5406 nfa_match = save_nfa_match;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005407 rex.nfa_listid = save_nfa_listid;
Bram Moolenaar6747fab2016-06-28 22:39:16 +02005408 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005409 nfa_endp = save_nfa_endp;
5410
5411#ifdef ENABLE_LOG
5412 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
5413 if (log_fd != NULL)
5414 {
5415 fprintf(log_fd, "****************************\n");
5416 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
5417 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
5418 fprintf(log_fd, "****************************\n");
5419 }
5420 else
5421 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005422 emsg(_(e_log_open_failed));
Bram Moolenaarf46da702013-06-02 22:37:42 +02005423 log_fd = stderr;
5424 }
5425#endif
5426
5427 return result;
5428}
5429
Bram Moolenaara2d95102013-06-04 14:23:05 +02005430/*
5431 * Estimate the chance of a match with "state" failing.
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005432 * empty match: 0
Bram Moolenaara2d95102013-06-04 14:23:05 +02005433 * NFA_ANY: 1
5434 * specific character: 99
5435 */
5436 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005437failure_chance(nfa_state_T *state, int depth)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005438{
5439 int c = state->c;
5440 int l, r;
5441
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005442 // detect looping
Bram Moolenaara2d95102013-06-04 14:23:05 +02005443 if (depth > 4)
5444 return 1;
5445
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005446 switch (c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02005447 {
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005448 case NFA_SPLIT:
5449 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005450 // avoid recursive stuff
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005451 return 1;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005452 // two alternatives, use the lowest failure chance
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005453 l = failure_chance(state->out, depth + 1);
5454 r = failure_chance(state->out1, depth + 1);
5455 return l < r ? l : r;
5456
5457 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005458 // matches anything, unlikely to fail
Bram Moolenaara2d95102013-06-04 14:23:05 +02005459 return 1;
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005460
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005461 case NFA_MATCH:
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02005462 case NFA_MCLOSE:
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005463 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005464 // empty match works always
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005465 return 0;
5466
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005467 case NFA_START_INVISIBLE:
5468 case NFA_START_INVISIBLE_FIRST:
5469 case NFA_START_INVISIBLE_NEG:
5470 case NFA_START_INVISIBLE_NEG_FIRST:
5471 case NFA_START_INVISIBLE_BEFORE:
5472 case NFA_START_INVISIBLE_BEFORE_FIRST:
5473 case NFA_START_INVISIBLE_BEFORE_NEG:
5474 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5475 case NFA_START_PATTERN:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005476 // recursive regmatch is expensive, use low failure chance
Bram Moolenaar44c71db2013-06-14 22:33:51 +02005477 return 5;
5478
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005479 case NFA_BOL:
5480 case NFA_EOL:
5481 case NFA_BOF:
5482 case NFA_EOF:
5483 case NFA_NEWL:
5484 return 99;
5485
5486 case NFA_BOW:
5487 case NFA_EOW:
5488 return 90;
5489
5490 case NFA_MOPEN:
5491 case NFA_MOPEN1:
5492 case NFA_MOPEN2:
5493 case NFA_MOPEN3:
5494 case NFA_MOPEN4:
5495 case NFA_MOPEN5:
5496 case NFA_MOPEN6:
5497 case NFA_MOPEN7:
5498 case NFA_MOPEN8:
5499 case NFA_MOPEN9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005500#ifdef FEAT_SYN_HL
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005501 case NFA_ZOPEN:
5502 case NFA_ZOPEN1:
5503 case NFA_ZOPEN2:
5504 case NFA_ZOPEN3:
5505 case NFA_ZOPEN4:
5506 case NFA_ZOPEN5:
5507 case NFA_ZOPEN6:
5508 case NFA_ZOPEN7:
5509 case NFA_ZOPEN8:
5510 case NFA_ZOPEN9:
5511 case NFA_ZCLOSE:
5512 case NFA_ZCLOSE1:
5513 case NFA_ZCLOSE2:
5514 case NFA_ZCLOSE3:
5515 case NFA_ZCLOSE4:
5516 case NFA_ZCLOSE5:
5517 case NFA_ZCLOSE6:
5518 case NFA_ZCLOSE7:
5519 case NFA_ZCLOSE8:
5520 case NFA_ZCLOSE9:
Bram Moolenaarb76591e2013-06-04 21:42:22 +02005521#endif
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005522 case NFA_NOPEN:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005523 case NFA_MCLOSE1:
5524 case NFA_MCLOSE2:
5525 case NFA_MCLOSE3:
5526 case NFA_MCLOSE4:
5527 case NFA_MCLOSE5:
5528 case NFA_MCLOSE6:
5529 case NFA_MCLOSE7:
5530 case NFA_MCLOSE8:
5531 case NFA_MCLOSE9:
5532 case NFA_NCLOSE:
5533 return failure_chance(state->out, depth + 1);
5534
5535 case NFA_BACKREF1:
5536 case NFA_BACKREF2:
5537 case NFA_BACKREF3:
5538 case NFA_BACKREF4:
5539 case NFA_BACKREF5:
5540 case NFA_BACKREF6:
5541 case NFA_BACKREF7:
5542 case NFA_BACKREF8:
5543 case NFA_BACKREF9:
5544#ifdef FEAT_SYN_HL
5545 case NFA_ZREF1:
5546 case NFA_ZREF2:
5547 case NFA_ZREF3:
5548 case NFA_ZREF4:
5549 case NFA_ZREF5:
5550 case NFA_ZREF6:
5551 case NFA_ZREF7:
5552 case NFA_ZREF8:
5553 case NFA_ZREF9:
5554#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005555 // backreferences don't match in many places
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005556 return 94;
5557
5558 case NFA_LNUM_GT:
5559 case NFA_LNUM_LT:
5560 case NFA_COL_GT:
5561 case NFA_COL_LT:
5562 case NFA_VCOL_GT:
5563 case NFA_VCOL_LT:
5564 case NFA_MARK_GT:
5565 case NFA_MARK_LT:
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005566 case NFA_VISUAL:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005567 // before/after positions don't match very often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005568 return 85;
5569
5570 case NFA_LNUM:
5571 return 90;
5572
5573 case NFA_CURSOR:
5574 case NFA_COL:
5575 case NFA_VCOL:
5576 case NFA_MARK:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005577 // specific positions rarely match
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005578 return 98;
5579
5580 case NFA_COMPOSING:
5581 return 95;
5582
5583 default:
5584 if (c > 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005585 // character match fails often
Bram Moolenaare2b8cb32013-06-05 11:46:25 +02005586 return 95;
5587 }
5588
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005589 // something else, includes character classes
Bram Moolenaara2d95102013-06-04 14:23:05 +02005590 return 50;
5591}
5592
Bram Moolenaarf46da702013-06-02 22:37:42 +02005593/*
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005594 * Skip until the char "c" we know a match must start with.
5595 */
5596 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005597skip_to_start(int c, colnr_T *colp)
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005598{
5599 char_u *s;
5600
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005601 // Used often, do some work to avoid call overhead.
Bram Moolenaara12a1612019-01-24 16:39:02 +01005602 if (!rex.reg_ic && !has_mbyte)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005603 s = vim_strbyte(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005604 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005605 s = cstrchr(rex.line + *colp, c);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005606 if (s == NULL)
5607 return FAIL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005608 *colp = (int)(s - rex.line);
Bram Moolenaar87f764a2013-06-08 14:38:27 +02005609 return OK;
5610}
5611
5612/*
Bram Moolenaar473de612013-06-08 18:19:48 +02005613 * Check for a match with match_text.
Bram Moolenaare7766ee2013-06-08 22:30:03 +02005614 * Called after skip_to_start() has found regstart.
Bram Moolenaar473de612013-06-08 18:19:48 +02005615 * Returns zero for no match, 1 for a match.
5616 */
5617 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01005618find_match_text(colnr_T startcol, int regstart, char_u *match_text)
Bram Moolenaar473de612013-06-08 18:19:48 +02005619{
5620 colnr_T col = startcol;
5621 int c1, c2;
5622 int len1, len2;
5623 int match;
5624
5625 for (;;)
5626 {
5627 match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005628 len2 = MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005629 for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
5630 {
5631 c1 = PTR2CHAR(match_text + len1);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005632 c2 = PTR2CHAR(rex.line + col + len2);
Bram Moolenaar59de4172020-06-09 19:34:54 +02005633 if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2)))
Bram Moolenaar473de612013-06-08 18:19:48 +02005634 {
5635 match = FALSE;
5636 break;
5637 }
5638 len2 += MB_CHAR2LEN(c2);
5639 }
5640 if (match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005641 // check that no composing char follows
Bram Moolenaar473de612013-06-08 18:19:48 +02005642 && !(enc_utf8
Bram Moolenaara12a1612019-01-24 16:39:02 +01005643 && utf_iscomposing(PTR2CHAR(rex.line + col + len2))))
Bram Moolenaar473de612013-06-08 18:19:48 +02005644 {
5645 cleanup_subexpr();
5646 if (REG_MULTI)
5647 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005648 rex.reg_startpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005649 rex.reg_startpos[0].col = col;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005650 rex.reg_endpos[0].lnum = rex.lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02005651 rex.reg_endpos[0].col = col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005652 }
5653 else
5654 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005655 rex.reg_startp[0] = rex.line + col;
5656 rex.reg_endp[0] = rex.line + col + len2;
Bram Moolenaar473de612013-06-08 18:19:48 +02005657 }
5658 return 1L;
5659 }
5660
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005661 // Try finding regstart after the current match.
5662 col += MB_CHAR2LEN(regstart); // skip regstart
Bram Moolenaar473de612013-06-08 18:19:48 +02005663 if (skip_to_start(regstart, &col) == FAIL)
5664 break;
5665 }
5666 return 0L;
5667}
5668
#ifdef FEAT_RELTIME
/*
 * Check whether the time limit for matching has passed.
 * When "nfa_time_limit" is set and has passed, "*nfa_timed_out" is set to
 * TRUE (if "nfa_timed_out" is not NULL) and TRUE is returned.
 * Otherwise returns FALSE.
 */
    static int
nfa_did_time_out(void)
{
    if (nfa_time_limit != NULL && profile_passed_limit(nfa_time_limit))
    {
	if (nfa_timed_out != NULL)
	    *nfa_timed_out = TRUE;
	return TRUE;
    }
    return FALSE;
}
#endif
5682
Bram Moolenaar473de612013-06-08 18:19:48 +02005683/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005684 * Main matching routine.
5685 *
Bram Moolenaar0270f382018-07-17 05:43:58 +02005686 * Run NFA to determine whether it matches rex.input.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005687 *
Bram Moolenaar307aa162013-06-02 16:34:21 +02005688 * When "nfa_endp" is not NULL it is a required end-of-match position.
Bram Moolenaar61602c52013-06-01 19:54:43 +02005689 *
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005690 * Return TRUE if there is a match, FALSE if there is no match,
5691 * NFA_TOO_EXPENSIVE if we end up with too many states.
Bram Moolenaarf2118842013-09-25 18:16:38 +02005692 * When there is a match "submatch" contains the positions.
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005693 *
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005694 * Note: Caller must ensure that: start != NULL.
5695 */
5696 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01005697nfa_regmatch(
5698 nfa_regprog_T *prog,
5699 nfa_state_T *start,
5700 regsubs_T *submatch,
5701 regsubs_T *m)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005702{
Bram Moolenaarc6b1cc92019-05-03 11:21:05 +02005703 int result = FALSE;
Bram Moolenaaraaf30472015-01-27 14:40:00 +01005704 size_t size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005705 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005706 int go_to_nextline = FALSE;
5707 nfa_thread_T *t;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005708 nfa_list_T list[2];
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005709 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02005710 nfa_list_T *thislist;
5711 nfa_list_T *nextlist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005712 int *listids = NULL;
Bram Moolenaar2338c322018-07-08 19:07:19 +02005713 int listids_len = 0;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005714 nfa_state_T *add_state;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005715 int add_here;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005716 int add_count;
Bram Moolenaar4380d1e2013-06-09 20:51:00 +02005717 int add_off = 0;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005718 int toplevel = start->c == NFA_MOPEN;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005719 regsubs_T *r;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005720#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005721 FILE *debug;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005722#endif
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005723
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005724 // Some patterns may take a long time to match, especially when using
5725 // recursive_regmatch(). Allow interrupting them with CTRL-C.
Bram Moolenaar41f12052013-08-25 17:01:42 +02005726 fast_breakcheck();
5727 if (got_int)
5728 return FALSE;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005729#ifdef FEAT_RELTIME
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005730 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01005731 return FALSE;
5732#endif
Bram Moolenaar41f12052013-08-25 17:01:42 +02005733
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005734#ifdef NFA_REGEXP_DEBUG_LOG
5735 debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5736 if (debug == NULL)
5737 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005738 semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
Bram Moolenaardc633cf2016-04-23 14:33:19 +02005739 return FALSE;
5740 }
5741#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02005742 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005743
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005744 // Allocate memory for the lists of nodes.
Bram Moolenaar0270f382018-07-17 05:43:58 +02005745 size = (prog->nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005746
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005747 list[0].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005748 list[0].len = prog->nstate + 1;
Bram Moolenaarc799fe22019-05-28 23:08:19 +02005749 list[1].t = alloc(size);
Bram Moolenaar0270f382018-07-17 05:43:58 +02005750 list[1].len = prog->nstate + 1;
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005751 if (list[0].t == NULL || list[1].t == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005752 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005753
5754#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02005755 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005756 if (log_fd != NULL)
5757 {
5758 fprintf(log_fd, "**********************************\n");
5759 nfa_set_code(start->c);
5760 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5761 abs(start->id), code);
5762 fprintf(log_fd, "**********************************\n");
5763 }
5764 else
5765 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01005766 emsg(_(e_log_open_failed));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005767 log_fd = stderr;
5768 }
5769#endif
5770
5771 thislist = &list[0];
5772 thislist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005773 thislist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005774 nextlist = &list[1];
5775 nextlist->n = 0;
Bram Moolenaar196ed142013-07-21 18:59:24 +02005776 nextlist->has_pim = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005777#ifdef ENABLE_LOG
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005778 fprintf(log_fd, "(---) STARTSTATE first\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005779#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02005780 thislist->id = rex.nfa_listid + 1;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005781
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005782 // Inline optimized code for addstate(thislist, start, m, 0) if we know
5783 // it's the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005784 if (toplevel)
5785 {
5786 if (REG_MULTI)
5787 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005788 m->norm.list.multi[0].start_lnum = rex.lnum;
5789 m->norm.list.multi[0].start_col = (colnr_T)(rex.input - rex.line);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005790 }
5791 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005792 m->norm.list.line[0].start = rex.input;
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005793 m->norm.in_use = 1;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005794 r = addstate(thislist, start->out, m, NULL, 0);
Bram Moolenaarf96d1092013-06-07 22:39:40 +02005795 }
5796 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01005797 r = addstate(thislist, start, m, NULL, 0);
5798 if (r == NULL)
5799 {
5800 nfa_match = NFA_TOO_EXPENSIVE;
5801 goto theend;
5802 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005803
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005804#define ADD_STATE_IF_MATCH(state) \
5805 if (result) { \
Bram Moolenaara2d95102013-06-04 14:23:05 +02005806 add_state = state->out; \
5807 add_off = clen; \
5808 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005809
5810 /*
5811 * Run for each character.
5812 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02005813 for (;;)
5814 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005815 int curc;
5816 int clen;
5817
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005818 if (has_mbyte)
5819 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005820 curc = (*mb_ptr2char)(rex.input);
5821 clen = (*mb_ptr2len)(rex.input);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005822 }
5823 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005824 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02005825 curc = *rex.input;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005826 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005827 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005828 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02005829 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005830 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005831 go_to_nextline = FALSE;
5832 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005833
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005834 // swap lists
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005835 thislist = &list[flag];
5836 nextlist = &list[flag ^= 1];
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005837 nextlist->n = 0; // clear nextlist
Bram Moolenaar196ed142013-07-21 18:59:24 +02005838 nextlist->has_pim = FALSE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02005839 ++rex.nfa_listid;
Bram Moolenaarbcf94422018-06-23 14:21:42 +02005840 if (prog->re_engine == AUTOMATIC_ENGINE
Bram Moolenaar0270f382018-07-17 05:43:58 +02005841 && (rex.nfa_listid >= NFA_MAX_STATES
Bram Moolenaar5ec74142018-06-23 17:14:41 +02005842# ifdef FEAT_EVAL
5843 || nfa_fail_for_testing
5844# endif
5845 ))
Bram Moolenaarfda37292014-11-05 14:27:36 +01005846 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005847 // too many states, retry with old engine
Bram Moolenaarfda37292014-11-05 14:27:36 +01005848 nfa_match = NFA_TOO_EXPENSIVE;
5849 goto theend;
5850 }
5851
Bram Moolenaar0270f382018-07-17 05:43:58 +02005852 thislist->id = rex.nfa_listid;
5853 nextlist->id = rex.nfa_listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005854
5855#ifdef ENABLE_LOG
5856 fprintf(log_fd, "------------------------------------------\n");
Bram Moolenaar0270f382018-07-17 05:43:58 +02005857 fprintf(log_fd, ">>> Reginput is \"%s\"\n", rex.input);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005858 fprintf(log_fd, ">>> Advanced one character... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005859 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02005860 {
5861 int i;
5862
5863 for (i = 0; i < thislist->n; i++)
5864 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5865 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005866 fprintf(log_fd, "\n");
5867#endif
5868
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005869#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005870 fprintf(debug, "\n-------------------\n");
5871#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005872 /*
5873 * If the state lists are empty we can stop.
5874 */
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005875 if (thislist->n == 0)
Bram Moolenaar66e83d72013-05-21 14:03:00 +02005876 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005877
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005878 // compute nextlist
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005879 for (listidx = 0; listidx < thislist->n; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005880 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005881 // If the list gets very long there probably is something wrong.
5882 // At least allow interrupting with CTRL-C.
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02005883 fast_breakcheck();
5884 if (got_int)
5885 break;
5886#ifdef FEAT_RELTIME
5887 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
5888 {
5889 nfa_time_count = 0;
5890 if (nfa_did_time_out())
5891 break;
5892 }
5893#endif
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005894 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005895
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02005896#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005897 nfa_set_code(t->state->c);
5898 fprintf(debug, "%s, ", code);
5899#endif
5900#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005901 {
5902 int col;
5903
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02005904 if (t->subs.norm.in_use <= 0)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005905 col = -1;
5906 else if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01005907 col = t->subs.norm.list.multi[0].start_col;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005908 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02005909 col = (int)(t->subs.norm.list.line[0].start - rex.line);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005910 nfa_set_code(t->state->c);
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02005911 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02005912 abs(t->state->id), (int)t->state->c, code, col,
5913 pim_info(&t->pim));
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02005914 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005915#endif
5916
5917 /*
5918 * Handle the possible codes of the current state.
5919 * The most important is NFA_MATCH.
5920 */
Bram Moolenaara2d95102013-06-04 14:23:05 +02005921 add_state = NULL;
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02005922 add_here = FALSE;
Bram Moolenaara2d95102013-06-04 14:23:05 +02005923 add_count = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005924 switch (t->state->c)
5925 {
5926 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005927 {
Bram Moolenaaref2dff52020-12-21 14:54:32 +01005928 // If the match is not at the start of the line, ends before a
5929 // composing characters and rex.reg_icombine is not set, that
5930 // is not really a match.
5931 if (enc_utf8 && !rex.reg_icombine
5932 && rex.input != rex.line && utf_iscomposing(curc))
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02005933 break;
Bram Moolenaara12a1612019-01-24 16:39:02 +01005934
Bram Moolenaar963fee22013-05-26 21:47:28 +02005935 nfa_match = TRUE;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005936 copy_sub(&submatch->norm, &t->subs.norm);
5937#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005938 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf6de0322013-06-02 21:30:04 +02005939 copy_sub(&submatch->synt, &t->subs.synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005940#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005941#ifdef ENABLE_LOG
Bram Moolenaarefb23f22013-06-01 23:02:54 +02005942 log_subsexpr(&t->subs);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005943#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005944 // Found the left-most longest match, do not look at any other
5945 // states at this position. When the list of states is going
5946 // to be empty quit without advancing, so that "rex.input" is
5947 // correct.
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02005948 if (nextlist->n == 0)
Bram Moolenaar57a285b2013-05-26 16:57:28 +02005949 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02005950 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02005951 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005952
5953 case NFA_END_INVISIBLE:
Bram Moolenaardecd9542013-06-07 16:31:50 +02005954 case NFA_END_INVISIBLE_NEG:
Bram Moolenaar87953742013-06-05 18:52:40 +02005955 case NFA_END_PATTERN:
Bram Moolenaarf46da702013-06-02 22:37:42 +02005956 /*
5957 * This is only encountered after a NFA_START_INVISIBLE or
Bram Moolenaar61602c52013-06-01 19:54:43 +02005958 * NFA_START_INVISIBLE_BEFORE node.
5959 * They surround a zero-width group, used with "\@=", "\&",
5960 * "\@!", "\@<=" and "\@<!".
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005961 * If we got here, it means that the current "invisible" group
5962 * finished successfully, so return control to the parent
Bram Moolenaarf46da702013-06-02 22:37:42 +02005963 * nfa_regmatch(). For a look-behind match only when it ends
5964 * in the position in "nfa_endp".
5965 * Submatches are stored in *m, and used in the parent call.
5966 */
Bram Moolenaar61602c52013-06-01 19:54:43 +02005967#ifdef ENABLE_LOG
Bram Moolenaarf46da702013-06-02 22:37:42 +02005968 if (nfa_endp != NULL)
5969 {
5970 if (REG_MULTI)
5971 fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005972 (int)rex.lnum,
Bram Moolenaarf46da702013-06-02 22:37:42 +02005973 (int)nfa_endp->se_u.pos.lnum,
Bram Moolenaar0270f382018-07-17 05:43:58 +02005974 (int)(rex.input - rex.line),
Bram Moolenaarf46da702013-06-02 22:37:42 +02005975 nfa_endp->se_u.pos.col);
5976 else
5977 fprintf(log_fd, "Current col: %d, endp col: %d\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02005978 (int)(rex.input - rex.line),
5979 (int)(nfa_endp->se_u.ptr - rex.input));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02005980 }
Bram Moolenaarf46da702013-06-02 22:37:42 +02005981#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005982 // If "nfa_endp" is set it's only a match if it ends at
5983 // "nfa_endp"
Bram Moolenaarf46da702013-06-02 22:37:42 +02005984 if (nfa_endp != NULL && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02005985 ? (rex.lnum != nfa_endp->se_u.pos.lnum
5986 || (int)(rex.input - rex.line)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005987 != nfa_endp->se_u.pos.col)
Bram Moolenaar0270f382018-07-17 05:43:58 +02005988 : rex.input != nfa_endp->se_u.ptr))
Bram Moolenaarf46da702013-06-02 22:37:42 +02005989 break;
5990
Bram Moolenaar63d9e732019-12-05 21:10:38 +01005991 // do not set submatches for \@!
Bram Moolenaardecd9542013-06-07 16:31:50 +02005992 if (t->state->c != NFA_END_INVISIBLE_NEG)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005993 {
5994 copy_sub(&m->norm, &t->subs.norm);
5995#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02005996 if (rex.nfa_has_zsubexpr)
Bram Moolenaarf46da702013-06-02 22:37:42 +02005997 copy_sub(&m->synt, &t->subs.synt);
5998#endif
5999 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006000#ifdef ENABLE_LOG
6001 fprintf(log_fd, "Match found:\n");
6002 log_subsexpr(m);
6003#endif
Bram Moolenaarf46da702013-06-02 22:37:42 +02006004 nfa_match = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006005 // See comment above at "goto nextchar".
Bram Moolenaar78c93e42013-09-05 16:05:36 +02006006 if (nextlist->n == 0)
6007 clen = 0;
6008 goto nextchar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006009
6010 case NFA_START_INVISIBLE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006011 case NFA_START_INVISIBLE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006012 case NFA_START_INVISIBLE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006013 case NFA_START_INVISIBLE_NEG_FIRST:
Bram Moolenaar61602c52013-06-01 19:54:43 +02006014 case NFA_START_INVISIBLE_BEFORE:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006015 case NFA_START_INVISIBLE_BEFORE_FIRST:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006016 case NFA_START_INVISIBLE_BEFORE_NEG:
Bram Moolenaara2947e22013-06-11 22:44:09 +02006017 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006018 {
Bram Moolenaarbcf4d172013-06-10 16:35:18 +02006019#ifdef ENABLE_LOG
6020 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
6021 failure_chance(t->state->out, 0),
6022 failure_chance(t->state->out1->out, 0));
Bram Moolenaarb76591e2013-06-04 21:42:22 +02006023#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006024 // Do it directly if there already is a PIM or when
6025 // nfa_postprocess() detected it will work better.
Bram Moolenaara2947e22013-06-11 22:44:09 +02006026 if (t->pim.result != NFA_PIM_UNUSED
6027 || t->state->c == NFA_START_INVISIBLE_FIRST
6028 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6029 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
6030 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006031 {
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006032 int in_use = m->norm.in_use;
6033
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006034 // Copy submatch info for the recursive call, opposite
6035 // of what happens on success below.
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006036 copy_sub_off(&m->norm, &t->subs.norm);
Bram Moolenaar699c1202013-09-25 16:41:54 +02006037#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006038 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006039 copy_sub_off(&m->synt, &t->subs.synt);
6040#endif
Bram Moolenaarf86c0b02013-06-26 12:42:44 +02006041
Bram Moolenaara2d95102013-06-04 14:23:05 +02006042 /*
6043 * First try matching the invisible match, then what
6044 * follows.
6045 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006046 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006047 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006048 if (result == NFA_TOO_EXPENSIVE)
6049 {
6050 nfa_match = result;
6051 goto theend;
6052 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006053
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006054 // for \@! and \@<! it is a match when the result is
6055 // FALSE
Bram Moolenaardecd9542013-06-07 16:31:50 +02006056 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006057 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
6058 || t->state->c
6059 == NFA_START_INVISIBLE_BEFORE_NEG
6060 || t->state->c
6061 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006062 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006063 // Copy submatch info from the recursive call
Bram Moolenaara2d95102013-06-04 14:23:05 +02006064 copy_sub_off(&t->subs.norm, &m->norm);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006065#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006066 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006067 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006068#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006069 // If the pattern has \ze and it matched in the
6070 // sub pattern, use it.
Bram Moolenaarf2118842013-09-25 18:16:38 +02006071 copy_ze_off(&t->subs.norm, &m->norm);
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02006072
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006073 // t->state->out1 is the corresponding
6074 // END_INVISIBLE node; Add its out to the current
6075 // list (zero-width match).
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006076 add_here = TRUE;
6077 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006078 }
Bram Moolenaar4d9ae212013-06-28 23:04:42 +02006079 m->norm.in_use = in_use;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006080 }
6081 else
6082 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006083 nfa_pim_T pim;
6084
Bram Moolenaara2d95102013-06-04 14:23:05 +02006085 /*
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006086 * First try matching what follows. Only if a match
6087 * is found verify the invisible match matches. Add a
6088 * nfa_pim_T to the following states, it contains info
6089 * about the invisible match.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006090 */
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006091 pim.state = t->state;
6092 pim.result = NFA_PIM_TODO;
6093 pim.subs.norm.in_use = 0;
6094#ifdef FEAT_SYN_HL
6095 pim.subs.synt.in_use = 0;
6096#endif
6097 if (REG_MULTI)
6098 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006099 pim.end.pos.col = (int)(rex.input - rex.line);
6100 pim.end.pos.lnum = rex.lnum;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006101 }
6102 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006103 pim.end.ptr = rex.input;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006104
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006105 // t->state->out1 is the corresponding END_INVISIBLE
6106 // node; Add its out to the current list (zero-width
6107 // match).
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006108 if (addstate_here(thislist, t->state->out1->out,
6109 &t->subs, &pim, &listidx) == NULL)
6110 {
6111 nfa_match = NFA_TOO_EXPENSIVE;
6112 goto theend;
6113 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006114 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006115 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006116 break;
6117
Bram Moolenaar87953742013-06-05 18:52:40 +02006118 case NFA_START_PATTERN:
Bram Moolenaar43e02982013-06-07 17:31:29 +02006119 {
6120 nfa_state_T *skip = NULL;
6121#ifdef ENABLE_LOG
6122 int skip_lid = 0;
6123#endif
6124
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006125 // There is no point in trying to match the pattern if the
6126 // output state is not going to be added to the list.
Bram Moolenaar43e02982013-06-07 17:31:29 +02006127 if (state_in_list(nextlist, t->state->out1->out, &t->subs))
6128 {
6129 skip = t->state->out1->out;
6130#ifdef ENABLE_LOG
6131 skip_lid = nextlist->id;
6132#endif
6133 }
6134 else if (state_in_list(nextlist,
6135 t->state->out1->out->out, &t->subs))
6136 {
6137 skip = t->state->out1->out->out;
6138#ifdef ENABLE_LOG
6139 skip_lid = nextlist->id;
6140#endif
6141 }
Bram Moolenaar44c71db2013-06-14 22:33:51 +02006142 else if (state_in_list(thislist,
Bram Moolenaar43e02982013-06-07 17:31:29 +02006143 t->state->out1->out->out, &t->subs))
6144 {
6145 skip = t->state->out1->out->out;
6146#ifdef ENABLE_LOG
6147 skip_lid = thislist->id;
6148#endif
6149 }
6150 if (skip != NULL)
6151 {
6152#ifdef ENABLE_LOG
6153 nfa_set_code(skip->c);
6154 fprintf(log_fd, "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
6155 abs(skip->id), skip_lid, skip->c, code);
6156#endif
6157 break;
6158 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006159 // Copy submatch info to the recursive call, opposite of what
6160 // happens afterwards.
Bram Moolenaar699c1202013-09-25 16:41:54 +02006161 copy_sub_off(&m->norm, &t->subs.norm);
6162#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006163 if (rex.nfa_has_zsubexpr)
Bram Moolenaar699c1202013-09-25 16:41:54 +02006164 copy_sub_off(&m->synt, &t->subs.synt);
6165#endif
Bram Moolenaar43e02982013-06-07 17:31:29 +02006166
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006167 // First try matching the pattern.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006168 result = recursive_regmatch(t->state, NULL, prog,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006169 submatch, m, &listids, &listids_len);
Bram Moolenaarfda37292014-11-05 14:27:36 +01006170 if (result == NFA_TOO_EXPENSIVE)
6171 {
6172 nfa_match = result;
6173 goto theend;
6174 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006175 if (result)
6176 {
6177 int bytelen;
6178
6179#ifdef ENABLE_LOG
6180 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
6181 log_subsexpr(m);
6182#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006183 // Copy submatch info from the recursive call
Bram Moolenaar87953742013-06-05 18:52:40 +02006184 copy_sub_off(&t->subs.norm, &m->norm);
6185#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006186 if (rex.nfa_has_zsubexpr)
Bram Moolenaar188c57b2013-06-06 16:22:06 +02006187 copy_sub_off(&t->subs.synt, &m->synt);
Bram Moolenaar87953742013-06-05 18:52:40 +02006188#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006189 // Now we need to skip over the matched text and then
6190 // continue with what follows.
Bram Moolenaar87953742013-06-05 18:52:40 +02006191 if (REG_MULTI)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006192 // TODO: multi-line match
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01006193 bytelen = m->norm.list.multi[0].end_col
Bram Moolenaar0270f382018-07-17 05:43:58 +02006194 - (int)(rex.input - rex.line);
Bram Moolenaar87953742013-06-05 18:52:40 +02006195 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02006196 bytelen = (int)(m->norm.list.line[0].end - rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02006197
6198#ifdef ENABLE_LOG
6199 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
6200#endif
6201 if (bytelen == 0)
6202 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006203 // empty match, output of corresponding
6204 // NFA_END_PATTERN/NFA_SKIP to be used at current
6205 // position
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006206 add_here = TRUE;
6207 add_state = t->state->out1->out->out;
Bram Moolenaar87953742013-06-05 18:52:40 +02006208 }
6209 else if (bytelen <= clen)
6210 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006211 // match current character, output of corresponding
6212 // NFA_END_PATTERN to be used at next position.
Bram Moolenaar87953742013-06-05 18:52:40 +02006213 add_state = t->state->out1->out->out;
6214 add_off = clen;
6215 }
6216 else
6217 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006218 // skip over the matched characters, set character
6219 // count in NFA_SKIP
Bram Moolenaar87953742013-06-05 18:52:40 +02006220 add_state = t->state->out1->out;
6221 add_off = bytelen;
6222 add_count = bytelen - clen;
6223 }
6224 }
6225 break;
Bram Moolenaar43e02982013-06-07 17:31:29 +02006226 }
Bram Moolenaar87953742013-06-05 18:52:40 +02006227
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006228 case NFA_BOL:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006229 if (rex.input == rex.line)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006230 {
6231 add_here = TRUE;
6232 add_state = t->state->out;
6233 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006234 break;
6235
6236 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006237 if (curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006238 {
6239 add_here = TRUE;
6240 add_state = t->state->out;
6241 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006242 break;
6243
6244 case NFA_BOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006245 result = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006246
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006247 if (curc == NUL)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006248 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006249 else if (has_mbyte)
6250 {
6251 int this_class;
6252
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006253 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006254 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006255 if (this_class <= 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006256 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006257 else if (reg_prev_class() == this_class)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006258 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006259 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006260 else if (!vim_iswordc_buf(curc, rex.reg_buf)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006261 || (rex.input > rex.line
6262 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006263 result = FALSE;
6264 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006265 {
6266 add_here = TRUE;
6267 add_state = t->state->out;
6268 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006269 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006270
6271 case NFA_EOW:
Bram Moolenaardecd9542013-06-07 16:31:50 +02006272 result = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006273 if (rex.input == rex.line)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006274 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006275 else if (has_mbyte)
6276 {
6277 int this_class, prev_class;
6278
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006279 // Get class of current and previous char (if it exists).
Bram Moolenaar0270f382018-07-17 05:43:58 +02006280 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006281 prev_class = reg_prev_class();
6282 if (this_class == prev_class
6283 || prev_class == 0 || prev_class == 1)
Bram Moolenaardecd9542013-06-07 16:31:50 +02006284 result = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006285 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02006286 else if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
6287 || (rex.input[0] != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006288 && vim_iswordc_buf(curc, rex.reg_buf)))
Bram Moolenaardecd9542013-06-07 16:31:50 +02006289 result = FALSE;
6290 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006291 {
6292 add_here = TRUE;
6293 add_state = t->state->out;
6294 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006295 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006296
Bram Moolenaar4b780632013-05-31 22:14:52 +02006297 case NFA_BOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006298 if (rex.lnum == 0 && rex.input == rex.line
Bram Moolenaar6100d022016-10-02 16:51:57 +02006299 && (!REG_MULTI || rex.reg_firstlnum == 1))
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006300 {
6301 add_here = TRUE;
6302 add_state = t->state->out;
6303 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006304 break;
6305
6306 case NFA_EOF:
Bram Moolenaar0270f382018-07-17 05:43:58 +02006307 if (rex.lnum == rex.reg_maxline && curc == NUL)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006308 {
6309 add_here = TRUE;
6310 add_state = t->state->out;
6311 }
Bram Moolenaar4b780632013-05-31 22:14:52 +02006312 break;
6313
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006314 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006315 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006316 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02006317 int len = 0;
6318 nfa_state_T *end;
6319 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006320 int cchars[MAX_MCO];
6321 int ccount = 0;
6322 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006323
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006324 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006325 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006326 if (utf_iscomposing(sta->c))
6327 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006328 // Only match composing character(s), ignore base
6329 // character. Used for ".{composing}" and "{composing}"
6330 // (no preceding character).
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006331 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02006332 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006333 if (rex.reg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006334 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006335 // If \Z was present, then ignore composing characters.
6336 // When ignoring the base character this always matches.
Bram Moolenaardff72ba2018-02-08 22:45:17 +01006337 if (sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006338 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006339 else
6340 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006341 while (sta->c != NFA_END_COMPOSING)
6342 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006343 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006344
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006345 // Check base character matches first, unless ignored.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006346 else if (len > 0 || mc == sta->c)
6347 {
6348 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006349 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02006350 len += mb_char2len(mc);
6351 sta = sta->out;
6352 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006353
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006354 // We don't care about the order of composing characters.
6355 // Get them into cchars[] first.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006356 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006357 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02006358 mc = mb_ptr2char(rex.input + len);
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006359 cchars[ccount++] = mc;
6360 len += mb_char2len(mc);
6361 if (ccount == MAX_MCO)
6362 break;
6363 }
6364
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006365 // Check that each composing char in the pattern matches a
6366 // composing char in the text. We do not check if all
6367 // composing chars are matched.
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006368 result = OK;
6369 while (sta->c != NFA_END_COMPOSING)
6370 {
6371 for (j = 0; j < ccount; ++j)
6372 if (cchars[j] == sta->c)
6373 break;
6374 if (j == ccount)
6375 {
6376 result = FAIL;
6377 break;
6378 }
6379 sta = sta->out;
6380 }
6381 }
6382 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02006383 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02006384
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006385 end = t->state->out1; // NFA_END_COMPOSING
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006386 ADD_STATE_IF_MATCH(end);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006387 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02006388 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006389
6390 case NFA_NEWL:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006391 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02006392 && rex.lnum <= rex.reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006393 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02006394 go_to_nextline = TRUE;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006395 // Pass -1 for the offset, which means taking the position
6396 // at the start of the next line.
Bram Moolenaara2d95102013-06-04 14:23:05 +02006397 add_state = t->state->out;
6398 add_off = -1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006399 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006400 else if (curc == '\n' && rex.reg_line_lbr)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006401 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006402 // match \n as if it is an ordinary character
Bram Moolenaara2d95102013-06-04 14:23:05 +02006403 add_state = t->state->out;
6404 add_off = 1;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02006405 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006406 break;
6407
Bram Moolenaar417bad22013-06-07 14:08:30 +02006408 case NFA_START_COLL:
6409 case NFA_START_NEG_COLL:
6410 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006411 // What follows is a list of characters, until NFA_END_COLL.
6412 // One of them must match or none of them must match.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006413 nfa_state_T *state;
6414 int result_if_matched;
6415 int c1, c2;
6416
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006417 // Never match EOL. If it's part of the collection it is added
6418 // as a separate state with an OR.
Bram Moolenaar417bad22013-06-07 14:08:30 +02006419 if (curc == NUL)
6420 break;
6421
6422 state = t->state->out;
6423 result_if_matched = (t->state->c == NFA_START_COLL);
6424 for (;;)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006425 {
Bram Moolenaar417bad22013-06-07 14:08:30 +02006426 if (state->c == NFA_END_COLL)
6427 {
6428 result = !result_if_matched;
6429 break;
6430 }
6431 if (state->c == NFA_RANGE_MIN)
6432 {
6433 c1 = state->val;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006434 state = state->out; // advance to NFA_RANGE_MAX
Bram Moolenaar417bad22013-06-07 14:08:30 +02006435 c2 = state->val;
6436#ifdef ENABLE_LOG
6437 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
6438 curc, c1, c2);
6439#endif
6440 if (curc >= c1 && curc <= c2)
6441 {
6442 result = result_if_matched;
6443 break;
6444 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02006445 if (rex.reg_ic)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006446 {
Bram Moolenaar59de4172020-06-09 19:34:54 +02006447 int curc_low = MB_CASEFOLD(curc);
Bram Moolenaar417bad22013-06-07 14:08:30 +02006448 int done = FALSE;
6449
6450 for ( ; c1 <= c2; ++c1)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006451 if (MB_CASEFOLD(c1) == curc_low)
Bram Moolenaar417bad22013-06-07 14:08:30 +02006452 {
6453 result = result_if_matched;
6454 done = TRUE;
6455 break;
6456 }
6457 if (done)
6458 break;
6459 }
6460 }
6461 else if (state->c < 0 ? check_char_class(state->c, curc)
Bram Moolenaar2c7b9062018-02-04 18:22:46 +01006462 : (curc == state->c
Bram Moolenaar59de4172020-06-09 19:34:54 +02006463 || (rex.reg_ic && MB_CASEFOLD(curc)
6464 == MB_CASEFOLD(state->c))))
Bram Moolenaar417bad22013-06-07 14:08:30 +02006465 {
6466 result = result_if_matched;
6467 break;
6468 }
6469 state = state->out;
6470 }
6471 if (result)
6472 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006473 // next state is in out of the NFA_END_COLL, out1 of
6474 // START points to the END state
Bram Moolenaar417bad22013-06-07 14:08:30 +02006475 add_state = t->state->out1->out;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006476 add_off = clen;
6477 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006478 break;
Bram Moolenaar417bad22013-06-07 14:08:30 +02006479 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006480
6481 case NFA_ANY:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006482 // Any char except '\0', (end of input) does not match.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006483 if (curc > 0)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006484 {
Bram Moolenaara2d95102013-06-04 14:23:05 +02006485 add_state = t->state->out;
6486 add_off = clen;
6487 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006488 break;
6489
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006490 case NFA_ANY_COMPOSING:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006491 // On a composing character skip over it. Otherwise do
6492 // nothing. Always matches.
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006493 if (enc_utf8 && utf_iscomposing(curc))
6494 {
6495 add_off = clen;
6496 }
6497 else
Bram Moolenaar8df5acf2014-05-13 19:37:29 +02006498 {
6499 add_here = TRUE;
6500 add_off = 0;
6501 }
6502 add_state = t->state->out;
6503 break;
6504
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006505 /*
6506 * Character classes like \a for alpha, \d for digit etc.
6507 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006508 case NFA_IDENT: // \i
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006509 result = vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006510 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006511 break;
6512
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006513 case NFA_SIDENT: // \I
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006514 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006515 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006516 break;
6517
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006518 case NFA_KWORD: // \k
Bram Moolenaar0270f382018-07-17 05:43:58 +02006519 result = vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006520 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006521 break;
6522
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006523 case NFA_SKWORD: // \K
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006524 result = !VIM_ISDIGIT(curc)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006525 && vim_iswordp_buf(rex.input, rex.reg_buf);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006526 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006527 break;
6528
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006529 case NFA_FNAME: // \f
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006530 result = vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006531 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006532 break;
6533
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006534 case NFA_SFNAME: // \F
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006535 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006536 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006537 break;
6538
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006539 case NFA_PRINT: // \p
Bram Moolenaar0270f382018-07-17 05:43:58 +02006540 result = vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006541 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006542 break;
6543
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006544 case NFA_SPRINT: // \P
Bram Moolenaar0270f382018-07-17 05:43:58 +02006545 result = !VIM_ISDIGIT(curc) && vim_isprintc(PTR2CHAR(rex.input));
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006546 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006547 break;
6548
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006549 case NFA_WHITE: // \s
Bram Moolenaar1c465442017-03-12 20:10:05 +01006550 result = VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006551 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006552 break;
6553
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006554 case NFA_NWHITE: // \S
Bram Moolenaar1c465442017-03-12 20:10:05 +01006555 result = curc != NUL && !VIM_ISWHITE(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006556 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006557 break;
6558
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006559 case NFA_DIGIT: // \d
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006560 result = ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006561 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006562 break;
6563
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006564 case NFA_NDIGIT: // \D
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006565 result = curc != NUL && !ri_digit(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006566 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006567 break;
6568
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006569 case NFA_HEX: // \x
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006570 result = ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006571 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006572 break;
6573
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006574 case NFA_NHEX: // \X
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006575 result = curc != NUL && !ri_hex(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006576 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006577 break;
6578
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006579 case NFA_OCTAL: // \o
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006580 result = ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006581 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006582 break;
6583
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006584 case NFA_NOCTAL: // \O
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006585 result = curc != NUL && !ri_octal(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006586 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006587 break;
6588
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006589 case NFA_WORD: // \w
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006590 result = ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006591 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006592 break;
6593
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006594 case NFA_NWORD: // \W
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006595 result = curc != NUL && !ri_word(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006596 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006597 break;
6598
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006599 case NFA_HEAD: // \h
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006600 result = ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006601 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006602 break;
6603
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006604 case NFA_NHEAD: // \H
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006605 result = curc != NUL && !ri_head(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006606 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006607 break;
6608
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006609 case NFA_ALPHA: // \a
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006610 result = ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006611 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006612 break;
6613
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006614 case NFA_NALPHA: // \A
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006615 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006616 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006617 break;
6618
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006619 case NFA_LOWER: // \l
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006620 result = ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006621 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006622 break;
6623
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006624 case NFA_NLOWER: // \L
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006625 result = curc != NUL && !ri_lower(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006626 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006627 break;
6628
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006629 case NFA_UPPER: // \u
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006630 result = ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006631 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006632 break;
6633
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006634 case NFA_NUPPER: // \U
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02006635 result = curc != NUL && !ri_upper(curc);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006636 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006637 break;
6638
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006639 case NFA_LOWER_IC: // [a-z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006640 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006641 ADD_STATE_IF_MATCH(t->state);
6642 break;
6643
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006644 case NFA_NLOWER_IC: // [^a-z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006645 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006646 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006647 ADD_STATE_IF_MATCH(t->state);
6648 break;
6649
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006650 case NFA_UPPER_IC: // [A-Z]
Bram Moolenaar6100d022016-10-02 16:51:57 +02006651 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006652 ADD_STATE_IF_MATCH(t->state);
6653 break;
6654
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006655 case NFA_NUPPER_IC: // ^[A-Z]
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006656 result = curc != NUL
Bram Moolenaar6100d022016-10-02 16:51:57 +02006657 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
Bram Moolenaar1cfad522013-08-14 12:06:49 +02006658 ADD_STATE_IF_MATCH(t->state);
6659 break;
6660
Bram Moolenaar5714b802013-05-28 22:03:20 +02006661 case NFA_BACKREF1:
6662 case NFA_BACKREF2:
6663 case NFA_BACKREF3:
6664 case NFA_BACKREF4:
6665 case NFA_BACKREF5:
6666 case NFA_BACKREF6:
6667 case NFA_BACKREF7:
6668 case NFA_BACKREF8:
6669 case NFA_BACKREF9:
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006670#ifdef FEAT_SYN_HL
6671 case NFA_ZREF1:
6672 case NFA_ZREF2:
6673 case NFA_ZREF3:
6674 case NFA_ZREF4:
6675 case NFA_ZREF5:
6676 case NFA_ZREF6:
6677 case NFA_ZREF7:
6678 case NFA_ZREF8:
6679 case NFA_ZREF9:
6680#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006681 // \1 .. \9 \z1 .. \z9
Bram Moolenaar5714b802013-05-28 22:03:20 +02006682 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006683 int subidx;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006684 int bytelen;
6685
Bram Moolenaarefb23f22013-06-01 23:02:54 +02006686 if (t->state->c <= NFA_BACKREF9)
6687 {
6688 subidx = t->state->c - NFA_BACKREF1 + 1;
6689 result = match_backref(&t->subs.norm, subidx, &bytelen);
6690 }
6691#ifdef FEAT_SYN_HL
6692 else
6693 {
6694 subidx = t->state->c - NFA_ZREF1 + 1;
6695 result = match_zref(subidx, &bytelen);
6696 }
6697#endif
6698
Bram Moolenaar5714b802013-05-28 22:03:20 +02006699 if (result)
6700 {
6701 if (bytelen == 0)
6702 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006703 // empty match always works, output of NFA_SKIP to be
6704 // used next
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006705 add_here = TRUE;
6706 add_state = t->state->out->out;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006707 }
6708 else if (bytelen <= clen)
6709 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006710 // match current character, jump ahead to out of
6711 // NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006712 add_state = t->state->out->out;
6713 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006714 }
6715 else
6716 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006717 // skip over the matched characters, set character
6718 // count in NFA_SKIP
Bram Moolenaara2d95102013-06-04 14:23:05 +02006719 add_state = t->state->out;
6720 add_off = bytelen;
6721 add_count = bytelen - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006722 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02006723 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02006724 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006725 }
6726 case NFA_SKIP:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006727 // character of previous matching \1 .. \9 or \@>
Bram Moolenaar5714b802013-05-28 22:03:20 +02006728 if (t->count - clen <= 0)
6729 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006730 // end of match, go to what follows
Bram Moolenaara2d95102013-06-04 14:23:05 +02006731 add_state = t->state->out;
6732 add_off = clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006733 }
6734 else
6735 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006736 // add state again with decremented count
Bram Moolenaara2d95102013-06-04 14:23:05 +02006737 add_state = t->state;
6738 add_off = 0;
6739 add_count = t->count - clen;
Bram Moolenaar5714b802013-05-28 22:03:20 +02006740 }
6741 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006742
Bram Moolenaar423532e2013-05-29 21:14:42 +02006743 case NFA_LNUM:
6744 case NFA_LNUM_GT:
6745 case NFA_LNUM_LT:
6746 result = (REG_MULTI &&
6747 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006748 (long_u)(rex.lnum + rex.reg_firstlnum)));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006749 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006750 {
6751 add_here = TRUE;
6752 add_state = t->state->out;
6753 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006754 break;
6755
6756 case NFA_COL:
6757 case NFA_COL_GT:
6758 case NFA_COL_LT:
6759 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006760 (long_u)(rex.input - rex.line) + 1);
Bram Moolenaar423532e2013-05-29 21:14:42 +02006761 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006762 {
6763 add_here = TRUE;
6764 add_state = t->state->out;
6765 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006766 break;
6767
6768 case NFA_VCOL:
6769 case NFA_VCOL_GT:
6770 case NFA_VCOL_LT:
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006771 {
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006772 int op = t->state->c - NFA_VCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02006773 colnr_T col = (colnr_T)(rex.input - rex.line);
Bram Moolenaar6100d022016-10-02 16:51:57 +02006774 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006775
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006776 // Bail out quickly when there can't be a match, avoid the
6777 // overhead of win_linetabsize() on long lines.
Bram Moolenaar4f36dc32015-03-05 17:16:06 +01006778 if (op != 1 && col > t->state->val
Bram Moolenaara12a1612019-01-24 16:39:02 +01006779 * (has_mbyte ? MB_MAXBYTES : 1))
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006780 break;
Bram Moolenaaref795d12015-01-18 16:46:32 +01006781 result = FALSE;
6782 if (op == 1 && col - 1 > t->state->val && col > 100)
6783 {
6784 int ts = wp->w_buffer->b_p_ts;
6785
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006786 // Guess that a character won't use more columns than
6787 // 'tabstop', with a minimum of 4.
Bram Moolenaaref795d12015-01-18 16:46:32 +01006788 if (ts < 4)
6789 ts = 4;
6790 result = col > t->state->val * ts;
6791 }
6792 if (!result)
6793 result = nfa_re_num_cmp(t->state->val, op,
Bram Moolenaar0270f382018-07-17 05:43:58 +02006794 (long_u)win_linetabsize(wp, rex.line, col) + 1);
Bram Moolenaara20bcad2015-01-14 18:40:28 +01006795 if (result)
6796 {
6797 add_here = TRUE;
6798 add_state = t->state->out;
6799 }
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006800 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006801 break;
6802
Bram Moolenaar044aa292013-06-04 21:27:38 +02006803 case NFA_MARK:
6804 case NFA_MARK_GT:
6805 case NFA_MARK_LT:
6806 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02006807 pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, FALSE);
Bram Moolenaar044aa292013-06-04 21:27:38 +02006808
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006809 // Compare the mark position to the match position.
6810 result = (pos != NULL // mark doesn't exist
6811 && pos->lnum > 0 // mark isn't set in reg_buf
Bram Moolenaar0270f382018-07-17 05:43:58 +02006812 && (pos->lnum == rex.lnum + rex.reg_firstlnum
6813 ? (pos->col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006814 ? t->state->c == NFA_MARK
Bram Moolenaar0270f382018-07-17 05:43:58 +02006815 : (pos->col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar044aa292013-06-04 21:27:38 +02006816 ? t->state->c == NFA_MARK_GT
6817 : t->state->c == NFA_MARK_LT))
Bram Moolenaar0270f382018-07-17 05:43:58 +02006818 : (pos->lnum < rex.lnum + rex.reg_firstlnum
Bram Moolenaar044aa292013-06-04 21:27:38 +02006819 ? t->state->c == NFA_MARK_GT
6820 : t->state->c == NFA_MARK_LT)));
6821 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006822 {
6823 add_here = TRUE;
6824 add_state = t->state->out;
6825 }
Bram Moolenaar044aa292013-06-04 21:27:38 +02006826 break;
6827 }
6828
Bram Moolenaar423532e2013-05-29 21:14:42 +02006829 case NFA_CURSOR:
Bram Moolenaar6100d022016-10-02 16:51:57 +02006830 result = (rex.reg_win != NULL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006831 && (rex.lnum + rex.reg_firstlnum
Bram Moolenaar6100d022016-10-02 16:51:57 +02006832 == rex.reg_win->w_cursor.lnum)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006833 && ((colnr_T)(rex.input - rex.line)
Bram Moolenaar6100d022016-10-02 16:51:57 +02006834 == rex.reg_win->w_cursor.col));
Bram Moolenaar423532e2013-05-29 21:14:42 +02006835 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006836 {
6837 add_here = TRUE;
6838 add_state = t->state->out;
6839 }
Bram Moolenaar423532e2013-05-29 21:14:42 +02006840 break;
6841
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006842 case NFA_VISUAL:
6843 result = reg_match_visual();
6844 if (result)
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006845 {
6846 add_here = TRUE;
6847 add_state = t->state->out;
6848 }
Bram Moolenaar973fced2013-06-05 21:10:59 +02006849 break;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02006850
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006851 case NFA_MOPEN1:
6852 case NFA_MOPEN2:
6853 case NFA_MOPEN3:
6854 case NFA_MOPEN4:
6855 case NFA_MOPEN5:
6856 case NFA_MOPEN6:
6857 case NFA_MOPEN7:
6858 case NFA_MOPEN8:
6859 case NFA_MOPEN9:
6860#ifdef FEAT_SYN_HL
6861 case NFA_ZOPEN:
6862 case NFA_ZOPEN1:
6863 case NFA_ZOPEN2:
6864 case NFA_ZOPEN3:
6865 case NFA_ZOPEN4:
6866 case NFA_ZOPEN5:
6867 case NFA_ZOPEN6:
6868 case NFA_ZOPEN7:
6869 case NFA_ZOPEN8:
6870 case NFA_ZOPEN9:
6871#endif
6872 case NFA_NOPEN:
6873 case NFA_ZSTART:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006874 // These states are only added to be able to bail out when
6875 // they are added again, nothing is to be done.
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006876 break;
6877
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006878 default: // regular character
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006879 {
6880 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02006881
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006882#ifdef DEBUG
Bram Moolenaardecd9542013-06-07 16:31:50 +02006883 if (c < 0)
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01006884 siemsg("INTERNAL: Negative state char: %ld", c);
Bram Moolenaar398d53d2013-08-01 15:45:52 +02006885#endif
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006886 result = (c == curc);
6887
Bram Moolenaar6100d022016-10-02 16:51:57 +02006888 if (!result && rex.reg_ic)
Bram Moolenaar59de4172020-06-09 19:34:54 +02006889 result = MB_CASEFOLD(c) == MB_CASEFOLD(curc);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006890 // If rex.reg_icombine is not set only skip over the character
6891 // itself. When it is set skip over composing characters.
Bram Moolenaar6100d022016-10-02 16:51:57 +02006892 if (result && enc_utf8 && !rex.reg_icombine)
Bram Moolenaar0270f382018-07-17 05:43:58 +02006893 clen = utf_ptr2len(rex.input);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02006894 ADD_STATE_IF_MATCH(t->state);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02006895 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02006896 }
Bram Moolenaara2d95102013-06-04 14:23:05 +02006897
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006898 } // switch (t->state->c)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006899
6900 if (add_state != NULL)
6901 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006902 nfa_pim_T *pim;
Bram Moolenaara951e352013-10-06 15:46:11 +02006903 nfa_pim_T pim_copy;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006904
6905 if (t->pim.result == NFA_PIM_UNUSED)
6906 pim = NULL;
6907 else
6908 pim = &t->pim;
6909
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006910 // Handle the postponed invisible match if the match might end
6911 // without advancing and before the end of the line.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006912 if (pim != NULL && (clen == 0 || match_follows(add_state, 0)))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006913 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006914 if (pim->result == NFA_PIM_TODO)
Bram Moolenaara2d95102013-06-04 14:23:05 +02006915 {
6916#ifdef ENABLE_LOG
6917 fprintf(log_fd, "\n");
6918 fprintf(log_fd, "==================================\n");
6919 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6920 fprintf(log_fd, "\n");
6921#endif
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006922 result = recursive_regmatch(pim->state, pim,
Bram Moolenaar2338c322018-07-08 19:07:19 +02006923 prog, submatch, m, &listids, &listids_len);
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006924 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006925 // for \@! and \@<! it is a match when the result is
6926 // FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006927 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006928 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6929 || pim->state->c
6930 == NFA_START_INVISIBLE_BEFORE_NEG
6931 || pim->state->c
6932 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006933 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006934 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006935 copy_sub_off(&pim->subs.norm, &m->norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006936#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006937 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006938 copy_sub_off(&pim->subs.synt, &m->synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006939#endif
6940 }
6941 }
6942 else
6943 {
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006944 result = (pim->result == NFA_PIM_MATCH);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006945#ifdef ENABLE_LOG
6946 fprintf(log_fd, "\n");
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006947 fprintf(log_fd, "Using previous recursive nfa_regmatch() result, result == %d\n", pim->result);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006948 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
6949 fprintf(log_fd, "\n");
6950#endif
6951 }
6952
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006953 // for \@! and \@<! it is a match when result is FALSE
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006954 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
Bram Moolenaara2947e22013-06-11 22:44:09 +02006955 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6956 || pim->state->c
6957 == NFA_START_INVISIBLE_BEFORE_NEG
6958 || pim->state->c
6959 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST))
Bram Moolenaara2d95102013-06-04 14:23:05 +02006960 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006961 // Copy submatch info from the recursive call
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006962 copy_sub_off(&t->subs.norm, &pim->subs.norm);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006963#ifdef FEAT_SYN_HL
Bram Moolenaar0270f382018-07-17 05:43:58 +02006964 if (rex.nfa_has_zsubexpr)
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006965 copy_sub_off(&t->subs.synt, &pim->subs.synt);
Bram Moolenaara2d95102013-06-04 14:23:05 +02006966#endif
6967 }
6968 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006969 // look-behind match failed, don't add the state
Bram Moolenaara2d95102013-06-04 14:23:05 +02006970 continue;
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006971
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006972 // Postponed invisible match was handled, don't add it to
6973 // following states.
Bram Moolenaar2a4e98a2013-06-09 16:24:45 +02006974 pim = NULL;
Bram Moolenaara2d95102013-06-04 14:23:05 +02006975 }
6976
Bram Moolenaar63d9e732019-12-05 21:10:38 +01006977 // If "pim" points into l->t it will become invalid when
6978 // adding the state causes the list to be reallocated. Make a
6979 // local copy to avoid that.
Bram Moolenaara951e352013-10-06 15:46:11 +02006980 if (pim == &t->pim)
6981 {
6982 copy_pim(&pim_copy, pim);
6983 pim = &pim_copy;
6984 }
6985
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006986 if (add_here)
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006987 r = addstate_here(thislist, add_state, &t->subs,
6988 pim, &listidx);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006989 else
6990 {
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006991 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
Bram Moolenaarb1b284f2013-06-08 13:33:37 +02006992 if (add_count > 0)
6993 nextlist->t[nextlist->n - 1].count = add_count;
6994 }
Bram Moolenaar5567ad42019-02-12 23:05:46 +01006995 if (r == NULL)
6996 {
6997 nfa_match = NFA_TOO_EXPENSIVE;
6998 goto theend;
6999 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007000 }
7001
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007002 } // for (thislist = thislist; thislist->state; thislist++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007003
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007004 // Look for the start of a match in the current position by adding the
7005 // start state to the list of states.
7006 // The first found match is the leftmost one, thus the order of states
7007 // matters!
7008 // Do not add the start state in recursive calls of nfa_regmatch(),
7009 // because recursive calls should only start in the first position.
7010 // Unless "nfa_endp" is not NULL, then we match the end position.
7011 // Also don't start a match past the first line.
Bram Moolenaar61602c52013-06-01 19:54:43 +02007012 if (nfa_match == FALSE
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007013 && ((toplevel
Bram Moolenaar0270f382018-07-17 05:43:58 +02007014 && rex.lnum == 0
Bram Moolenaar61602c52013-06-01 19:54:43 +02007015 && clen != 0
Bram Moolenaar6100d022016-10-02 16:51:57 +02007016 && (rex.reg_maxcol == 0
Bram Moolenaar0270f382018-07-17 05:43:58 +02007017 || (colnr_T)(rex.input - rex.line) < rex.reg_maxcol))
Bram Moolenaar307aa162013-06-02 16:34:21 +02007018 || (nfa_endp != NULL
Bram Moolenaar61602c52013-06-01 19:54:43 +02007019 && (REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007020 ? (rex.lnum < nfa_endp->se_u.pos.lnum
7021 || (rex.lnum == nfa_endp->se_u.pos.lnum
7022 && (int)(rex.input - rex.line)
Bram Moolenaar307aa162013-06-02 16:34:21 +02007023 < nfa_endp->se_u.pos.col))
Bram Moolenaar0270f382018-07-17 05:43:58 +02007024 : rex.input < nfa_endp->se_u.ptr))))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007025 {
7026#ifdef ENABLE_LOG
7027 fprintf(log_fd, "(---) STARTSTATE\n");
7028#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007029 // Inline optimized code for addstate() if we know the state is
7030 // the first MOPEN.
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007031 if (toplevel)
7032 {
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007033 int add = TRUE;
7034 int c;
7035
7036 if (prog->regstart != NUL && clen != 0)
7037 {
7038 if (nextlist->n == 0)
7039 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007040 colnr_T col = (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007041
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007042 // Nextlist is empty, we can skip ahead to the
7043 // character that must appear at the start.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007044 if (skip_to_start(prog->regstart, &col) == FAIL)
7045 break;
7046#ifdef ENABLE_LOG
7047 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
Bram Moolenaar0270f382018-07-17 05:43:58 +02007048 col - ((colnr_T)(rex.input - rex.line) + clen));
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007049#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007050 rex.input = rex.line + col - clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007051 }
7052 else
7053 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007054 // Checking if the required start character matches is
7055 // cheaper than adding a state that won't match.
Bram Moolenaar0270f382018-07-17 05:43:58 +02007056 c = PTR2CHAR(rex.input + clen);
Bram Moolenaar6100d022016-10-02 16:51:57 +02007057 if (c != prog->regstart && (!rex.reg_ic
Bram Moolenaar59de4172020-06-09 19:34:54 +02007058 || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart)))
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007059 {
7060#ifdef ENABLE_LOG
7061 fprintf(log_fd, " Skipping start state, regstart does not match\n");
7062#endif
7063 add = FALSE;
7064 }
7065 }
7066 }
7067
7068 if (add)
7069 {
7070 if (REG_MULTI)
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007071 m->norm.list.multi[0].start_col =
Bram Moolenaar0270f382018-07-17 05:43:58 +02007072 (colnr_T)(rex.input - rex.line) + clen;
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007073 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007074 m->norm.list.line[0].start = rex.input + clen;
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007075 if (addstate(nextlist, start->out, m, NULL, clen) == NULL)
7076 {
7077 nfa_match = NFA_TOO_EXPENSIVE;
7078 goto theend;
7079 }
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007080 }
Bram Moolenaarf96d1092013-06-07 22:39:40 +02007081 }
7082 else
Bram Moolenaar5567ad42019-02-12 23:05:46 +01007083 {
7084 if (addstate(nextlist, start, m, NULL, clen) == NULL)
7085 {
7086 nfa_match = NFA_TOO_EXPENSIVE;
7087 goto theend;
7088 }
7089 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007090 }
7091
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007092#ifdef ENABLE_LOG
7093 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007094 {
7095 int i;
7096
7097 for (i = 0; i < thislist->n; i++)
7098 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
7099 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007100 fprintf(log_fd, "\n");
7101#endif
7102
7103nextchar:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007104 // Advance to the next character, or advance to the next line, or
7105 // finish.
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02007106 if (clen != 0)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007107 rex.input += clen;
Bram Moolenaar307aa162013-06-02 16:34:21 +02007108 else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
Bram Moolenaar0270f382018-07-17 05:43:58 +02007109 && rex.lnum < nfa_endp->se_u.pos.lnum))
Bram Moolenaar35b23862013-05-22 23:00:40 +02007110 reg_nextline();
7111 else
7112 break;
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007113
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007114 // Allow interrupting with CTRL-C.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007115 line_breakcheck();
Bram Moolenaara20bcad2015-01-14 18:40:28 +01007116 if (got_int)
7117 break;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007118#ifdef FEAT_RELTIME
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007119 // Check for timeout once in a twenty times to avoid overhead.
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007120 if (nfa_time_limit != NULL && ++nfa_time_count == 20)
7121 {
7122 nfa_time_count = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007123 if (nfa_did_time_out())
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007124 break;
7125 }
7126#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02007127 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007128
7129#ifdef ENABLE_LOG
7130 if (log_fd != stderr)
7131 fclose(log_fd);
7132 log_fd = NULL;
7133#endif
7134
7135theend:
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007136 // Free memory
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007137 vim_free(list[0].t);
7138 vim_free(list[1].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02007139 vim_free(listids);
Bram Moolenaar8aca2e92013-06-07 14:59:18 +02007140#undef ADD_STATE_IF_MATCH
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02007141#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007142 fclose(debug);
7143#endif
7144
Bram Moolenaar963fee22013-05-26 21:47:28 +02007145 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007146}
7147
7148/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02007149 * Try match of "prog" with at rex.line["col"].
Bram Moolenaar8c731502014-11-23 15:57:49 +01007150 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007151 */
7152 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007153nfa_regtry(
7154 nfa_regprog_T *prog,
7155 colnr_T col,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007156 proftime_T *tm UNUSED, // timeout limit or NULL
7157 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007158{
7159 int i;
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007160 regsubs_T subs, m;
7161 nfa_state_T *start = prog->start;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007162 int result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007163#ifdef ENABLE_LOG
7164 FILE *f;
7165#endif
7166
Bram Moolenaar0270f382018-07-17 05:43:58 +02007167 rex.input = rex.line + col;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007168#ifdef FEAT_RELTIME
7169 nfa_time_limit = tm;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007170 nfa_timed_out = timed_out;
Bram Moolenaar70781ee2015-02-03 16:49:24 +01007171 nfa_time_count = 0;
7172#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007173
7174#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007175 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007176 if (f != NULL)
7177 {
Bram Moolenaar87953742013-06-05 18:52:40 +02007178 fprintf(f, "\n\n\t=======================================================\n");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007179#ifdef DEBUG
7180 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
7181#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007182 fprintf(f, "\tInput text is \"%s\" \n", rex.input);
Bram Moolenaar87953742013-06-05 18:52:40 +02007183 fprintf(f, "\t=======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02007184 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007185 fprintf(f, "\n\n");
7186 fclose(f);
7187 }
7188 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01007189 emsg("Could not open temporary log file for writing");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007190#endif
7191
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007192 clear_sub(&subs.norm);
7193 clear_sub(&m.norm);
7194#ifdef FEAT_SYN_HL
7195 clear_sub(&subs.synt);
7196 clear_sub(&m.synt);
7197#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007198
Bram Moolenaarfda37292014-11-05 14:27:36 +01007199 result = nfa_regmatch(prog, start, &subs, &m);
7200 if (result == FALSE)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007201 return 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01007202 else if (result == NFA_TOO_EXPENSIVE)
7203 return result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007204
7205 cleanup_subexpr();
7206 if (REG_MULTI)
7207 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007208 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007209 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007210 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
7211 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007212
Bram Moolenaar6100d022016-10-02 16:51:57 +02007213 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
7214 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007215 }
7216
Bram Moolenaar6100d022016-10-02 16:51:57 +02007217 if (rex.reg_startpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007218 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007219 rex.reg_startpos[0].lnum = 0;
7220 rex.reg_startpos[0].col = col;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007221 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02007222 if (rex.reg_endpos[0].lnum < 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007223 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007224 // pattern has a \ze but it didn't match, use current end
Bram Moolenaar0270f382018-07-17 05:43:58 +02007225 rex.reg_endpos[0].lnum = rex.lnum;
7226 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007227 }
7228 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007229 // Use line number of "\ze".
Bram Moolenaar0270f382018-07-17 05:43:58 +02007230 rex.lnum = rex.reg_endpos[0].lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007231 }
7232 else
7233 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007234 for (i = 0; i < subs.norm.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007235 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007236 rex.reg_startp[i] = subs.norm.list.line[i].start;
7237 rex.reg_endp[i] = subs.norm.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007238 }
7239
Bram Moolenaar6100d022016-10-02 16:51:57 +02007240 if (rex.reg_startp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007241 rex.reg_startp[0] = rex.line + col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007242 if (rex.reg_endp[0] == NULL)
Bram Moolenaar0270f382018-07-17 05:43:58 +02007243 rex.reg_endp[0] = rex.input;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007244 }
7245
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007246#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007247 // Package any found \z(...\) matches for export. Default is none.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007248 unref_extmatch(re_extmatch_out);
7249 re_extmatch_out = NULL;
7250
7251 if (prog->reghasz == REX_SET)
7252 {
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007253 cleanup_zsubexpr();
7254 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01007255 if (re_extmatch_out == NULL)
7256 return 0;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007257 // Loop over \z1, \z2, etc. There is no \z0.
Bram Moolenaar5ad075c2015-11-24 15:18:32 +01007258 for (i = 1; i < subs.synt.in_use; i++)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007259 {
7260 if (REG_MULTI)
7261 {
7262 struct multipos *mpos = &subs.synt.list.multi[i];
7263
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007264 // Only accept single line matches that are valid.
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007265 if (mpos->start_lnum >= 0
7266 && mpos->start_lnum == mpos->end_lnum
7267 && mpos->end_col >= mpos->start_col)
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007268 re_extmatch_out->matches[i] =
Bram Moolenaar0cd040b2015-01-27 14:54:11 +01007269 vim_strnsave(reg_getline(mpos->start_lnum)
7270 + mpos->start_col,
7271 mpos->end_col - mpos->start_col);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007272 }
7273 else
7274 {
7275 struct linepos *lpos = &subs.synt.list.line[i];
7276
7277 if (lpos->start != NULL && lpos->end != NULL)
7278 re_extmatch_out->matches[i] =
Bram Moolenaar71ccd032020-06-12 22:59:11 +02007279 vim_strnsave(lpos->start, lpos->end - lpos->start);
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007280 }
7281 }
7282 }
7283#endif
7284
Bram Moolenaar0270f382018-07-17 05:43:58 +02007285 return 1 + rex.lnum;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007286}
7287
7288/*
7289 * Match a regexp against a string ("line" points to the string) or multiple
7290 * lines ("line" is NULL, use reg_getline()).
7291 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007292 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007293 */
7294 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007295nfa_regexec_both(
7296 char_u *line,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007297 colnr_T startcol, // column to start looking for match
7298 proftime_T *tm, // timeout limit or NULL
7299 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007300{
7301 nfa_regprog_T *prog;
7302 long retval = 0L;
7303 int i;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007304 colnr_T col = startcol;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007305
7306 if (REG_MULTI)
7307 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007308 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007309 line = reg_getline((linenr_T)0); // relative to the cursor
Bram Moolenaar6100d022016-10-02 16:51:57 +02007310 rex.reg_startpos = rex.reg_mmatch->startpos;
7311 rex.reg_endpos = rex.reg_mmatch->endpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007312 }
7313 else
7314 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02007315 prog = (nfa_regprog_T *)rex.reg_match->regprog;
7316 rex.reg_startp = rex.reg_match->startp;
7317 rex.reg_endp = rex.reg_match->endp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007318 }
7319
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007320 // Be paranoid...
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007321 if (prog == NULL || line == NULL)
7322 {
Bram Moolenaare83cca22020-09-07 18:53:21 +02007323 iemsg(_(e_null));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007324 goto theend;
7325 }
7326
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007327 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007328 if (prog->regflags & RF_ICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007329 rex.reg_ic = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007330 else if (prog->regflags & RF_NOICASE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007331 rex.reg_ic = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007332
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007333 // If pattern contains "\Z" overrule value of rex.reg_icombine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007334 if (prog->regflags & RF_ICOMBINE)
Bram Moolenaar6100d022016-10-02 16:51:57 +02007335 rex.reg_icombine = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007336
Bram Moolenaar0270f382018-07-17 05:43:58 +02007337 rex.line = line;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007338 rex.lnum = 0; // relative to line
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007339
Bram Moolenaar0270f382018-07-17 05:43:58 +02007340 rex.nfa_has_zend = prog->has_zend;
7341 rex.nfa_has_backref = prog->has_backref;
7342 rex.nfa_nsubexpr = prog->nsubexp;
7343 rex.nfa_listid = 1;
7344 rex.nfa_alt_listid = 2;
7345#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007346 nfa_regengine.expr = prog->pattern;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007347#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007348
Bram Moolenaard89616e2013-06-06 18:46:06 +02007349 if (prog->reganch && col > 0)
7350 return 0L;
7351
Bram Moolenaar0270f382018-07-17 05:43:58 +02007352 rex.need_clear_subexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007353#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007354 // Clear the external match subpointers if necessary.
Bram Moolenaar473de612013-06-08 18:19:48 +02007355 if (prog->reghasz == REX_SET)
7356 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02007357 rex.nfa_has_zsubexpr = TRUE;
7358 rex.need_clear_zsubexpr = TRUE;
Bram Moolenaar473de612013-06-08 18:19:48 +02007359 }
7360 else
Bram Moolenaar0270f382018-07-17 05:43:58 +02007361 {
7362 rex.nfa_has_zsubexpr = FALSE;
7363 rex.need_clear_zsubexpr = FALSE;
7364 }
Bram Moolenaar473de612013-06-08 18:19:48 +02007365#endif
7366
Bram Moolenaard89616e2013-06-06 18:46:06 +02007367 if (prog->regstart != NUL)
Bram Moolenaar473de612013-06-08 18:19:48 +02007368 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007369 // Skip ahead until a character we know the match must start with.
7370 // When there is none there is no match.
Bram Moolenaar87f764a2013-06-08 14:38:27 +02007371 if (skip_to_start(prog->regstart, &col) == FAIL)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007372 return 0L;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007373
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007374 // If match_text is set it contains the full text that must match.
7375 // Nothing else to try. Doesn't handle combining chars well.
Bram Moolenaara12a1612019-01-24 16:39:02 +01007376 if (prog->match_text != NULL && !rex.reg_icombine)
Bram Moolenaar473de612013-06-08 18:19:48 +02007377 return find_match_text(col, prog->regstart, prog->match_text);
7378 }
7379
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007380 // If the start column is past the maximum column: no need to try.
Bram Moolenaar6100d022016-10-02 16:51:57 +02007381 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
Bram Moolenaard89616e2013-06-06 18:46:06 +02007382 goto theend;
7383
Bram Moolenaar0270f382018-07-17 05:43:58 +02007384 // Set the "nstate" used by nfa_regcomp() to zero to trigger an error when
7385 // it's accidentally used during execution.
7386 nstate = 0;
7387 for (i = 0; i < prog->nstate; ++i)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007388 {
7389 prog->state[i].id = i;
Bram Moolenaardd2ccdf2013-06-03 12:17:04 +02007390 prog->state[i].lastlist[0] = 0;
7391 prog->state[i].lastlist[1] = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007392 }
7393
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007394 retval = nfa_regtry(prog, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007395
Bram Moolenaar0270f382018-07-17 05:43:58 +02007396#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007397 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007398#endif
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007399
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007400theend:
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007401 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007402 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007403 // Make sure the end is never before the start. Can happen when \zs and
7404 // \ze are used.
7405 if (REG_MULTI)
7406 {
7407 lpos_T *start = &rex.reg_mmatch->startpos[0];
7408 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007409
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007410 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007411 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01007412 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
7413 }
7414 else
7415 {
7416 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
7417 rex.reg_match->endp[0] = rex.reg_match->startp[0];
7418 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01007419 }
7420
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007421 return retval;
7422}
7423
7424/*
7425 * Compile a regular expression into internal code for the NFA matcher.
7426 * Returns the program in allocated space. Returns NULL for an error.
7427 */
7428 static regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01007429nfa_regcomp(char_u *expr, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007430{
Bram Moolenaaraae48832013-05-25 21:18:34 +02007431 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02007432 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007433 int *postfix;
7434
7435 if (expr == NULL)
7436 return NULL;
7437
Bram Moolenaar0270f382018-07-17 05:43:58 +02007438#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007439 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007440#endif
Bram Moolenaare0ad3652015-01-27 12:59:55 +01007441 nfa_re_flags = re_flags;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007442
7443 init_class_tab();
7444
7445 if (nfa_regcomp_start(expr, re_flags) == FAIL)
7446 return NULL;
7447
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007448 // Build postfix form of the regexp. Needed to build the NFA
7449 // (and count its size).
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007450 postfix = re2post();
7451 if (postfix == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007452 goto fail; // Cascaded (syntax?) error
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007453
7454 /*
7455 * In order to build the NFA, we parse the input regexp twice:
7456 * 1. first pass to count size (so we can allocate space)
7457 * 2. second to emit code
7458 */
7459#ifdef ENABLE_LOG
7460 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02007461 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007462
7463 if (f != NULL)
7464 {
Bram Moolenaar9b0c5c22018-06-20 20:37:36 +02007465 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\"... hold on !\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007466 fclose(f);
7467 }
7468 }
7469#endif
7470
7471 /*
7472 * PASS 1
7473 * Count number of NFA states in "nstate". Do not build the NFA.
7474 */
7475 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007476
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007477 // allocate the regprog with space for the compiled regexp
Bram Moolenaar16619a22013-06-11 18:42:36 +02007478 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
Bram Moolenaarc799fe22019-05-28 23:08:19 +02007479 prog = alloc(prog_size);
Bram Moolenaaraae48832013-05-25 21:18:34 +02007480 if (prog == NULL)
7481 goto fail;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007482 state_ptr = prog->state;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007483 prog->re_in_use = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007484
7485 /*
7486 * PASS 2
7487 * Build the NFA
7488 */
7489 prog->start = post2nfa(postfix, post_ptr, FALSE);
7490 if (prog->start == NULL)
7491 goto fail;
7492
7493 prog->regflags = regflags;
7494 prog->engine = &nfa_regengine;
7495 prog->nstate = nstate;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007496 prog->has_zend = rex.nfa_has_zend;
7497 prog->has_backref = rex.nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02007498 prog->nsubexp = regnpar;
Bram Moolenaard89616e2013-06-06 18:46:06 +02007499
Bram Moolenaara2947e22013-06-11 22:44:09 +02007500 nfa_postprocess(prog);
7501
Bram Moolenaard89616e2013-06-06 18:46:06 +02007502 prog->reganch = nfa_get_reganch(prog->start, 0);
7503 prog->regstart = nfa_get_regstart(prog->start, 0);
Bram Moolenaar473de612013-06-08 18:19:48 +02007504 prog->match_text = nfa_get_match_text(prog->start);
7505
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007506#ifdef ENABLE_LOG
7507 nfa_postfix_dump(expr, OK);
7508 nfa_dump(prog);
7509#endif
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007510#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007511 // Remember whether this pattern has any \z specials in it.
Bram Moolenaarefb23f22013-06-01 23:02:54 +02007512 prog->reghasz = re_has_z;
7513#endif
Bram Moolenaar473de612013-06-08 18:19:48 +02007514 prog->pattern = vim_strsave(expr);
Bram Moolenaar0270f382018-07-17 05:43:58 +02007515#ifdef DEBUG
Bram Moolenaar69afb7b2013-06-02 15:55:55 +02007516 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007517#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007518
7519out:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007520 VIM_CLEAR(post_start);
7521 post_ptr = post_end = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007522 state_ptr = NULL;
7523 return (regprog_T *)prog;
7524
7525fail:
Bram Moolenaard23a8232018-02-10 18:45:26 +01007526 VIM_CLEAR(prog);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007527#ifdef ENABLE_LOG
7528 nfa_postfix_dump(expr, FAIL);
7529#endif
Bram Moolenaar0270f382018-07-17 05:43:58 +02007530#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007531 nfa_regengine.expr = NULL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02007532#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007533 goto out;
7534}
7535
Bram Moolenaar473de612013-06-08 18:19:48 +02007536/*
7537 * Free a compiled regexp program, returned by nfa_regcomp().
7538 */
7539 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01007540nfa_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02007541{
7542 if (prog != NULL)
7543 {
7544 vim_free(((nfa_regprog_T *)prog)->match_text);
Bram Moolenaar473de612013-06-08 18:19:48 +02007545 vim_free(((nfa_regprog_T *)prog)->pattern);
Bram Moolenaar473de612013-06-08 18:19:48 +02007546 vim_free(prog);
7547 }
7548}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007549
7550/*
7551 * Match a regexp against a string.
7552 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
7553 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaar2af78a12014-04-23 19:06:37 +02007554 * If "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007555 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007556 * Returns <= 0 for failure, number of lines contained in the match otherwise.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007557 */
7558 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01007559nfa_regexec_nl(
7560 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007561 char_u *line, // string to match against
7562 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01007563 int line_lbr)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007564{
Bram Moolenaar6100d022016-10-02 16:51:57 +02007565 rex.reg_match = rmp;
7566 rex.reg_mmatch = NULL;
7567 rex.reg_maxline = 0;
7568 rex.reg_line_lbr = line_lbr;
7569 rex.reg_buf = curbuf;
7570 rex.reg_win = NULL;
7571 rex.reg_ic = rmp->rm_ic;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007572 rex.reg_icombine = FALSE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02007573 rex.reg_maxcol = 0;
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007574 return nfa_regexec_both(line, col, NULL, NULL);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007575}
7576
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007577
7578/*
7579 * Match a regexp against multiple lines.
7580 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7581 * Uses curbuf for line count and 'iskeyword'.
7582 *
Bram Moolenaar8c731502014-11-23 15:57:49 +01007583 * Return <= 0 if there is no match. Return number of lines contained in the
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007584 * match otherwise.
7585 *
7586 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
7587 *
7588 * ! Also NOTE : match may actually be in another line. e.g.:
7589 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
7590 *
7591 * +-------------------------+
7592 * |a |
7593 * |b |
7594 * |c |
7595 * | |
7596 * +-------------------------+
7597 *
7598 * then nfa_regexec_multi() returns 3. while the original
7599 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
7600 *
7601 * FIXME if this behavior is not compatible.
7602 */
7603 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01007604nfa_regexec_multi(
7605 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01007606 win_T *win, // window in which to search or NULL
7607 buf_T *buf, // buffer in which to search
7608 linenr_T lnum, // nr of line to start looking for match
7609 colnr_T col, // column to start looking for match
7610 proftime_T *tm, // timeout limit or NULL
7611 int *timed_out) // flag set on timeout or NULL
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007612{
Bram Moolenaarf4140482020-02-15 23:06:45 +01007613 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02007614 return nfa_regexec_both(NULL, col, tm, timed_out);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02007615}
7616
7617#ifdef DEBUG
7618# undef ENABLE_LOG
7619#endif