blob: 1dee7b410fd9264cd4cdfa3022096777ef652bb5 [file] [log] [blame]
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * NFA regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 */
7
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02008/*
9 * Logging of NFA engine.
10 *
11 * The NFA engine can write four log files:
12 * - Error log: Contains NFA engine's fatal errors.
13 * - Dump log: Contains compiled NFA state machine's information.
14 * - Run log: Contains information of matching procedure.
15 * - Debug log: Contains detailed information of matching procedure. Can be
16 * disabled by undefining NFA_REGEXP_DEBUG_LOG.
17 * The first one can also be used without debug mode.
18 * The last three are enabled when compiled as debug mode and individually
19 * disabled by commenting them out.
20 * The log files can get quite big!
21 * Do disable all of this when compiling Vim for debugging, undefine DEBUG in
22 * regexp.c
23 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020024#ifdef DEBUG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +020025# define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020026# define ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +020027# define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
28# define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
29# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020030#endif
31
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020032enum
33{
34 NFA_SPLIT = -1024,
35 NFA_MATCH,
36 NFA_SKIP_CHAR, /* matches a 0-length char */
37 NFA_END_NEG_RANGE, /* Used when expanding [^ab] */
38
39 NFA_CONCAT,
40 NFA_OR,
41 NFA_STAR,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020042 NFA_QUEST,
43 NFA_QUEST_NONGREEDY, /* Non-greedy version of \? */
44 NFA_NOT, /* used for [^ab] negated char ranges */
45
46 NFA_BOL, /* ^ Begin line */
47 NFA_EOL, /* $ End line */
48 NFA_BOW, /* \< Begin word */
49 NFA_EOW, /* \> End word */
50 NFA_BOF, /* \%^ Begin file */
51 NFA_EOF, /* \%$ End file */
52 NFA_NEWL,
53 NFA_ZSTART, /* Used for \zs */
54 NFA_ZEND, /* Used for \ze */
55 NFA_NOPEN, /* Start of subexpression marked with \%( */
56 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
57 NFA_START_INVISIBLE,
58 NFA_END_INVISIBLE,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020059 NFA_COMPOSING, /* Next nodes in NFA are part of the
60 composing multibyte char */
61 NFA_END_COMPOSING, /* End of a composing char in the NFA */
62
63 /* The following are used only in the postfix form, not in the NFA */
64 NFA_PREV_ATOM_NO_WIDTH, /* Used for \@= */
65 NFA_PREV_ATOM_NO_WIDTH_NEG, /* Used for \@! */
66 NFA_PREV_ATOM_JUST_BEFORE, /* Used for \@<= */
67 NFA_PREV_ATOM_JUST_BEFORE_NEG, /* Used for \@<! */
68 NFA_PREV_ATOM_LIKE_PATTERN, /* Used for \@> */
69
Bram Moolenaar5714b802013-05-28 22:03:20 +020070 NFA_BACKREF1, /* \1 */
71 NFA_BACKREF2, /* \2 */
72 NFA_BACKREF3, /* \3 */
73 NFA_BACKREF4, /* \4 */
74 NFA_BACKREF5, /* \5 */
75 NFA_BACKREF6, /* \6 */
76 NFA_BACKREF7, /* \7 */
77 NFA_BACKREF8, /* \8 */
78 NFA_BACKREF9, /* \9 */
79 NFA_SKIP, /* Skip characters */
80
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020081 NFA_MOPEN,
82 NFA_MCLOSE = NFA_MOPEN + NSUBEXP,
83
84 /* NFA_FIRST_NL */
85 NFA_ANY = NFA_MCLOSE + NSUBEXP, /* Match any one character. */
86 NFA_ANYOF, /* Match any character in this string. */
87 NFA_ANYBUT, /* Match any character not in this string. */
88 NFA_IDENT, /* Match identifier char */
89 NFA_SIDENT, /* Match identifier char but no digit */
90 NFA_KWORD, /* Match keyword char */
91 NFA_SKWORD, /* Match word char but no digit */
92 NFA_FNAME, /* Match file name char */
93 NFA_SFNAME, /* Match file name char but no digit */
94 NFA_PRINT, /* Match printable char */
95 NFA_SPRINT, /* Match printable char but no digit */
96 NFA_WHITE, /* Match whitespace char */
97 NFA_NWHITE, /* Match non-whitespace char */
98 NFA_DIGIT, /* Match digit char */
99 NFA_NDIGIT, /* Match non-digit char */
100 NFA_HEX, /* Match hex char */
101 NFA_NHEX, /* Match non-hex char */
102 NFA_OCTAL, /* Match octal char */
103 NFA_NOCTAL, /* Match non-octal char */
104 NFA_WORD, /* Match word char */
105 NFA_NWORD, /* Match non-word char */
106 NFA_HEAD, /* Match head char */
107 NFA_NHEAD, /* Match non-head char */
108 NFA_ALPHA, /* Match alpha char */
109 NFA_NALPHA, /* Match non-alpha char */
110 NFA_LOWER, /* Match lowercase char */
111 NFA_NLOWER, /* Match non-lowercase char */
112 NFA_UPPER, /* Match uppercase char */
113 NFA_NUPPER, /* Match non-uppercase char */
Bram Moolenaar423532e2013-05-29 21:14:42 +0200114
115 NFA_CURSOR, /* Match cursor pos */
116 NFA_LNUM, /* Match line number */
117 NFA_LNUM_GT, /* Match > line number */
118 NFA_LNUM_LT, /* Match < line number */
119 NFA_COL, /* Match cursor column */
120 NFA_COL_GT, /* Match > cursor column */
121 NFA_COL_LT, /* Match < cursor column */
122 NFA_VCOL, /* Match cursor virtual column */
123 NFA_VCOL_GT, /* Match > cursor virtual column */
124 NFA_VCOL_LT, /* Match < cursor virtual column */
125
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200126 NFA_FIRST_NL = NFA_ANY + ADD_NL,
127 NFA_LAST_NL = NFA_NUPPER + ADD_NL,
128
129 /* Character classes [:alnum:] etc */
130 NFA_CLASS_ALNUM,
131 NFA_CLASS_ALPHA,
132 NFA_CLASS_BLANK,
133 NFA_CLASS_CNTRL,
134 NFA_CLASS_DIGIT,
135 NFA_CLASS_GRAPH,
136 NFA_CLASS_LOWER,
137 NFA_CLASS_PRINT,
138 NFA_CLASS_PUNCT,
139 NFA_CLASS_SPACE,
140 NFA_CLASS_UPPER,
141 NFA_CLASS_XDIGIT,
142 NFA_CLASS_TAB,
143 NFA_CLASS_RETURN,
144 NFA_CLASS_BACKSPACE,
145 NFA_CLASS_ESCAPE
146};
147
148/* Keep in sync with classchars. */
149static int nfa_classcodes[] = {
150 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
151 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
152 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
153 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
154 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
155 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
156 NFA_UPPER, NFA_NUPPER
157};
158
159static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
160
161/*
162 * NFA errors can be of 3 types:
163 * *** NFA runtime errors, when something unknown goes wrong. The NFA fails
164 * silently and revert the to backtracking engine.
165 * syntax_error = FALSE;
166 * *** Regexp syntax errors, when the input regexp is not syntactically correct.
167 * The NFA engine displays an error message, and nothing else happens.
168 * syntax_error = TRUE
169 * *** Unsupported features, when the input regexp uses an operator that is not
170 * implemented in the NFA. The NFA engine fails silently, and reverts to the
171 * old backtracking engine.
172 * syntax_error = FALSE
173 * "The NFA fails" means that "compiling the regexp with the NFA fails":
174 * nfa_regcomp() returns FAIL.
175 */
176static int syntax_error = FALSE;
177
178/* NFA regexp \ze operator encountered. */
Bram Moolenaare0fea9c2013-05-27 20:10:50 +0200179static int nfa_has_zend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200180
Bram Moolenaar428e9872013-05-30 17:05:39 +0200181/* NFA regexp \1 .. \9 encountered. */
182static int nfa_has_backref;
183
Bram Moolenaar963fee22013-05-26 21:47:28 +0200184/* Number of sub expressions actually being used during execution. 1 if only
185 * the whole match (subexpr 0) is used. */
186static int nfa_nsubexpr;
187
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200188static int *post_start; /* holds the postfix form of r.e. */
189static int *post_end;
190static int *post_ptr;
191
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200192static int nstate; /* Number of states in the NFA. Also used when
193 * executing. */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200194static int istate; /* Index in the state vector, used in new_state() */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200195
196
197static int nfa_regcomp_start __ARGS((char_u*expr, int re_flags));
198static int nfa_recognize_char_class __ARGS((char_u *start, char_u *end, int extra_newl));
199static int nfa_emit_equi_class __ARGS((int c, int neg));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200200static int nfa_regatom __ARGS((void));
201static int nfa_regpiece __ARGS((void));
202static int nfa_regconcat __ARGS((void));
203static int nfa_regbranch __ARGS((void));
204static int nfa_reg __ARGS((int paren));
205#ifdef DEBUG
206static void nfa_set_code __ARGS((int c));
207static void nfa_postfix_dump __ARGS((char_u *expr, int retval));
Bram Moolenaar152e7892013-05-25 12:28:11 +0200208static void nfa_print_state __ARGS((FILE *debugf, nfa_state_T *state));
209static void nfa_print_state2 __ARGS((FILE *debugf, nfa_state_T *state, garray_T *indent));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200210static void nfa_dump __ARGS((nfa_regprog_T *prog));
211#endif
212static int *re2post __ARGS((void));
213static nfa_state_T *new_state __ARGS((int c, nfa_state_T *out, nfa_state_T *out1));
214static nfa_state_T *post2nfa __ARGS((int *postfix, int *end, int nfa_calc_size));
215static int check_char_class __ARGS((int class, int c));
216static void st_error __ARGS((int *postfix, int *end, int *p));
Bram Moolenaar423532e2013-05-29 21:14:42 +0200217static void nfa_set_neg_listids __ARGS((nfa_state_T *start));
218static void nfa_set_null_listids __ARGS((nfa_state_T *start));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200219static void nfa_save_listids __ARGS((nfa_state_T *start, int *list));
220static void nfa_restore_listids __ARGS((nfa_state_T *start, int *list));
Bram Moolenaar423532e2013-05-29 21:14:42 +0200221static int nfa_re_num_cmp __ARGS((long_u val, int op, long_u pos));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200222static long nfa_regtry __ARGS((nfa_state_T *start, colnr_T col));
223static long nfa_regexec_both __ARGS((char_u *line, colnr_T col));
224static regprog_T *nfa_regcomp __ARGS((char_u *expr, int re_flags));
225static int nfa_regexec __ARGS((regmatch_T *rmp, char_u *line, colnr_T col));
226static long nfa_regexec_multi __ARGS((regmmatch_T *rmp, win_T *win, buf_T *buf, linenr_T lnum, colnr_T col, proftime_T *tm));
227
228/* helper functions used when doing re2post() ... regatom() parsing */
229#define EMIT(c) do { \
Bram Moolenaar16299b52013-05-30 18:45:23 +0200230 if (post_ptr >= post_end && realloc_post_list() == FAIL) \
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200231 return FAIL; \
232 *post_ptr++ = c; \
233 } while (0)
234
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200235/*
236 * Initialize internal variables before NFA compilation.
237 * Return OK on success, FAIL otherwise.
238 */
239 static int
240nfa_regcomp_start(expr, re_flags)
241 char_u *expr;
242 int re_flags; /* see vim_regcomp() */
243{
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200244 size_t postfix_size;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200245 int nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200246
247 nstate = 0;
248 istate = 0;
Bram Moolenaar61db8b52013-05-26 17:45:49 +0200249 /* A reasonable estimation for maximum size */
Bram Moolenaar54dafde2013-05-31 23:18:00 +0200250 nstate_max = (int)(STRLEN(expr) + 1) * 25;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200251
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200252 /* Some items blow up in size, such as [A-z]. Add more space for that.
Bram Moolenaar16299b52013-05-30 18:45:23 +0200253 * When it is still not enough realloc_post_list() will be used. */
Bram Moolenaarca12d7c2013-05-20 21:26:33 +0200254 nstate_max += 1000;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200255
256 /* Size for postfix representation of expr. */
Bram Moolenaar16299b52013-05-30 18:45:23 +0200257 postfix_size = sizeof(int) * nstate_max;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200258
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200259 post_start = (int *)lalloc(postfix_size, TRUE);
260 if (post_start == NULL)
261 return FAIL;
262 vim_memset(post_start, 0, postfix_size);
263 post_ptr = post_start;
Bram Moolenaarbc0ea8f2013-05-20 13:44:29 +0200264 post_end = post_start + nstate_max;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200265 nfa_has_zend = FALSE;
Bram Moolenaar428e9872013-05-30 17:05:39 +0200266 nfa_has_backref = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200267
268 regcomp_start(expr, re_flags);
269
270 return OK;
271}
272
273/*
Bram Moolenaar16299b52013-05-30 18:45:23 +0200274 * Allocate more space for post_start. Called when
275 * running above the estimated number of states.
276 */
277 static int
278realloc_post_list()
279{
Bram Moolenaar99dc19d2013-05-31 20:49:31 +0200280 int nstate_max = (int)(post_end - post_start);
Bram Moolenaar16299b52013-05-30 18:45:23 +0200281 int new_max = nstate_max + 1000;
282 int *new_start;
283 int *old_start;
284
285 new_start = (int *)lalloc(new_max * sizeof(int), TRUE);
286 if (new_start == NULL)
287 return FAIL;
288 mch_memmove(new_start, post_start, nstate_max * sizeof(int));
289 vim_memset(new_start + nstate_max, 0, 1000 * sizeof(int));
290 old_start = post_start;
291 post_start = new_start;
292 post_ptr = new_start + (post_ptr - old_start);
293 post_end = post_start + new_max;
294 vim_free(old_start);
295 return OK;
296}
297
298/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200299 * Search between "start" and "end" and try to recognize a
300 * character class in expanded form. For example [0-9].
301 * On success, return the id the character class to be emitted.
302 * On failure, return 0 (=FAIL)
303 * Start points to the first char of the range, while end should point
304 * to the closing brace.
305 */
306 static int
307nfa_recognize_char_class(start, end, extra_newl)
308 char_u *start;
309 char_u *end;
310 int extra_newl;
311{
312 int i;
313 /* Each of these variables takes up a char in "config[]",
314 * in the order they are here. */
315 int not = FALSE, af = FALSE, AF = FALSE, az = FALSE, AZ = FALSE,
316 o7 = FALSE, o9 = FALSE, underscore = FALSE, newl = FALSE;
317 char_u *p;
318#define NCONFIGS 16
319 int classid[NCONFIGS] = {
320 NFA_DIGIT, NFA_NDIGIT, NFA_HEX, NFA_NHEX,
321 NFA_OCTAL, NFA_NOCTAL, NFA_WORD, NFA_NWORD,
322 NFA_HEAD, NFA_NHEAD, NFA_ALPHA, NFA_NALPHA,
323 NFA_LOWER, NFA_NLOWER, NFA_UPPER, NFA_NUPPER
324 };
Bram Moolenaarba404472013-05-19 22:31:18 +0200325 char_u myconfig[10];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200326 char_u config[NCONFIGS][9] = {
327 "000000100", /* digit */
328 "100000100", /* non digit */
329 "011000100", /* hex-digit */
330 "111000100", /* non hex-digit */
331 "000001000", /* octal-digit */
332 "100001000", /* [^0-7] */
333 "000110110", /* [0-9A-Za-z_] */
334 "100110110", /* [^0-9A-Za-z_] */
335 "000110010", /* head of word */
336 "100110010", /* not head of word */
337 "000110000", /* alphabetic char a-z */
338 "100110000", /* non alphabetic char */
339 "000100000", /* lowercase letter */
340 "100100000", /* non lowercase */
341 "000010000", /* uppercase */
342 "100010000" /* non uppercase */
343 };
344
345 if (extra_newl == TRUE)
346 newl = TRUE;
347
348 if (*end != ']')
349 return FAIL;
350 p = start;
351 if (*p == '^')
352 {
353 not = TRUE;
354 p ++;
355 }
356
357 while (p < end)
358 {
359 if (p + 2 < end && *(p + 1) == '-')
360 {
361 switch (*p)
362 {
363 case '0':
364 if (*(p + 2) == '9')
365 {
366 o9 = TRUE;
367 break;
368 }
369 else
370 if (*(p + 2) == '7')
371 {
372 o7 = TRUE;
373 break;
374 }
375 case 'a':
376 if (*(p + 2) == 'z')
377 {
378 az = TRUE;
379 break;
380 }
381 else
382 if (*(p + 2) == 'f')
383 {
384 af = TRUE;
385 break;
386 }
387 case 'A':
388 if (*(p + 2) == 'Z')
389 {
390 AZ = TRUE;
391 break;
392 }
393 else
394 if (*(p + 2) == 'F')
395 {
396 AF = TRUE;
397 break;
398 }
399 /* FALLTHROUGH */
400 default:
401 return FAIL;
402 }
403 p += 3;
404 }
405 else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n')
406 {
407 newl = TRUE;
408 p += 2;
409 }
410 else if (*p == '_')
411 {
412 underscore = TRUE;
413 p ++;
414 }
415 else if (*p == '\n')
416 {
417 newl = TRUE;
418 p ++;
419 }
420 else
421 return FAIL;
422 } /* while (p < end) */
423
424 if (p != end)
425 return FAIL;
426
427 /* build the config that represents the ranges we gathered */
428 STRCPY(myconfig, "000000000");
429 if (not == TRUE)
430 myconfig[0] = '1';
431 if (af == TRUE)
432 myconfig[1] = '1';
433 if (AF == TRUE)
434 myconfig[2] = '1';
435 if (az == TRUE)
436 myconfig[3] = '1';
437 if (AZ == TRUE)
438 myconfig[4] = '1';
439 if (o7 == TRUE)
440 myconfig[5] = '1';
441 if (o9 == TRUE)
442 myconfig[6] = '1';
443 if (underscore == TRUE)
444 myconfig[7] = '1';
445 if (newl == TRUE)
446 {
447 myconfig[8] = '1';
448 extra_newl = ADD_NL;
449 }
450 /* try to recognize character classes */
451 for (i = 0; i < NCONFIGS; i++)
Bram Moolenaarba404472013-05-19 22:31:18 +0200452 if (STRNCMP(myconfig, config[i], 8) == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200453 return classid[i] + extra_newl;
454
455 /* fallthrough => no success so far */
456 return FAIL;
457
458#undef NCONFIGS
459}
460
461/*
462 * Produce the bytes for equivalence class "c".
463 * Currently only handles latin1, latin9 and utf-8.
464 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
465 * equivalent to 'a OR b OR c'
466 *
467 * NOTE! When changing this function, also update reg_equi_class()
468 */
469 static int
470nfa_emit_equi_class(c, neg)
471 int c;
472 int neg;
473{
474 int first = TRUE;
475 int glue = neg == TRUE ? NFA_CONCAT : NFA_OR;
476#define EMIT2(c) \
477 EMIT(c); \
478 if (neg == TRUE) { \
479 EMIT(NFA_NOT); \
480 } \
481 if (first == FALSE) \
482 EMIT(glue); \
483 else \
484 first = FALSE; \
485
486#ifdef FEAT_MBYTE
487 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
488 || STRCMP(p_enc, "iso-8859-15") == 0)
489#endif
490 {
491 switch (c)
492 {
493 case 'A': case '\300': case '\301': case '\302':
494 case '\303': case '\304': case '\305':
495 EMIT2('A'); EMIT2('\300'); EMIT2('\301');
496 EMIT2('\302'); EMIT2('\303'); EMIT2('\304');
497 EMIT2('\305');
498 return OK;
499
500 case 'C': case '\307':
501 EMIT2('C'); EMIT2('\307');
502 return OK;
503
504 case 'E': case '\310': case '\311': case '\312': case '\313':
505 EMIT2('E'); EMIT2('\310'); EMIT2('\311');
506 EMIT2('\312'); EMIT2('\313');
507 return OK;
508
509 case 'I': case '\314': case '\315': case '\316': case '\317':
510 EMIT2('I'); EMIT2('\314'); EMIT2('\315');
511 EMIT2('\316'); EMIT2('\317');
512 return OK;
513
514 case 'N': case '\321':
515 EMIT2('N'); EMIT2('\321');
516 return OK;
517
518 case 'O': case '\322': case '\323': case '\324': case '\325':
519 case '\326':
520 EMIT2('O'); EMIT2('\322'); EMIT2('\323');
521 EMIT2('\324'); EMIT2('\325'); EMIT2('\326');
522 return OK;
523
524 case 'U': case '\331': case '\332': case '\333': case '\334':
525 EMIT2('U'); EMIT2('\331'); EMIT2('\332');
526 EMIT2('\333'); EMIT2('\334');
527 return OK;
528
529 case 'Y': case '\335':
530 EMIT2('Y'); EMIT2('\335');
531 return OK;
532
533 case 'a': case '\340': case '\341': case '\342':
534 case '\343': case '\344': case '\345':
535 EMIT2('a'); EMIT2('\340'); EMIT2('\341');
536 EMIT2('\342'); EMIT2('\343'); EMIT2('\344');
537 EMIT2('\345');
538 return OK;
539
540 case 'c': case '\347':
541 EMIT2('c'); EMIT2('\347');
542 return OK;
543
544 case 'e': case '\350': case '\351': case '\352': case '\353':
545 EMIT2('e'); EMIT2('\350'); EMIT2('\351');
546 EMIT2('\352'); EMIT2('\353');
547 return OK;
548
549 case 'i': case '\354': case '\355': case '\356': case '\357':
550 EMIT2('i'); EMIT2('\354'); EMIT2('\355');
551 EMIT2('\356'); EMIT2('\357');
552 return OK;
553
554 case 'n': case '\361':
555 EMIT2('n'); EMIT2('\361');
556 return OK;
557
558 case 'o': case '\362': case '\363': case '\364': case '\365':
559 case '\366':
560 EMIT2('o'); EMIT2('\362'); EMIT2('\363');
561 EMIT2('\364'); EMIT2('\365'); EMIT2('\366');
562 return OK;
563
564 case 'u': case '\371': case '\372': case '\373': case '\374':
565 EMIT2('u'); EMIT2('\371'); EMIT2('\372');
566 EMIT2('\373'); EMIT2('\374');
567 return OK;
568
569 case 'y': case '\375': case '\377':
570 EMIT2('y'); EMIT2('\375'); EMIT2('\377');
571 return OK;
572
573 default:
574 return FAIL;
575 }
576 }
577
578 EMIT(c);
579 return OK;
580#undef EMIT2
581}
582
583/*
584 * Code to parse regular expression.
585 *
586 * We try to reuse parsing functions in regexp.c to
587 * minimize surprise and keep the syntax consistent.
588 */
589
590/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200591 * Parse the lowest level.
592 *
593 * An atom can be one of a long list of items. Many atoms match one character
594 * in the text. It is often an ordinary character or a character class.
595 * Braces can be used to make a pattern into an atom. The "\z(\)" construct
596 * is only for syntax highlighting.
597 *
598 * atom ::= ordinary-atom
599 * or \( pattern \)
600 * or \%( pattern \)
601 * or \z( pattern \)
602 */
603 static int
604nfa_regatom()
605{
606 int c;
607 int charclass;
608 int equiclass;
609 int collclass;
610 int got_coll_char;
611 char_u *p;
612 char_u *endp;
613#ifdef FEAT_MBYTE
614 char_u *old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200615#endif
616 int extra = 0;
617 int first;
618 int emit_range;
619 int negated;
620 int result;
621 int startc = -1;
622 int endc = -1;
623 int oldstartc = -1;
624 int cpo_lit; /* 'cpoptions' contains 'l' flag */
625 int cpo_bsl; /* 'cpoptions' contains '\' flag */
626 int glue; /* ID that will "glue" nodes together */
627
628 cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
629 cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
630
631 c = getchr();
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200632 switch (c)
633 {
Bram Moolenaar47196582013-05-25 22:04:23 +0200634 case NUL:
635 syntax_error = TRUE;
636 EMSG_RET_FAIL(_("E865: (NFA) Regexp end encountered prematurely"));
637
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200638 case Magic('^'):
639 EMIT(NFA_BOL);
640 break;
641
642 case Magic('$'):
643 EMIT(NFA_EOL);
644#if defined(FEAT_SYN_HL) || defined(PROTO)
645 had_eol = TRUE;
646#endif
647 break;
648
649 case Magic('<'):
650 EMIT(NFA_BOW);
651 break;
652
653 case Magic('>'):
654 EMIT(NFA_EOW);
655 break;
656
657 case Magic('_'):
658 c = no_Magic(getchr());
659 if (c == '^') /* "\_^" is start-of-line */
660 {
661 EMIT(NFA_BOL);
662 break;
663 }
664 if (c == '$') /* "\_$" is end-of-line */
665 {
666 EMIT(NFA_EOL);
667#if defined(FEAT_SYN_HL) || defined(PROTO)
668 had_eol = TRUE;
669#endif
670 break;
671 }
672
673 extra = ADD_NL;
674
675 /* "\_[" is collection plus newline */
676 if (c == '[')
Bram Moolenaar307d10a2013-05-23 22:25:15 +0200677 goto collection;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200678
679 /* "\_x" is character class plus newline */
680 /*FALLTHROUGH*/
681
682 /*
683 * Character classes.
684 */
685 case Magic('.'):
686 case Magic('i'):
687 case Magic('I'):
688 case Magic('k'):
689 case Magic('K'):
690 case Magic('f'):
691 case Magic('F'):
692 case Magic('p'):
693 case Magic('P'):
694 case Magic('s'):
695 case Magic('S'):
696 case Magic('d'):
697 case Magic('D'):
698 case Magic('x'):
699 case Magic('X'):
700 case Magic('o'):
701 case Magic('O'):
702 case Magic('w'):
703 case Magic('W'):
704 case Magic('h'):
705 case Magic('H'):
706 case Magic('a'):
707 case Magic('A'):
708 case Magic('l'):
709 case Magic('L'):
710 case Magic('u'):
711 case Magic('U'):
712 p = vim_strchr(classchars, no_Magic(c));
713 if (p == NULL)
714 {
Bram Moolenaar5714b802013-05-28 22:03:20 +0200715 EMSGN("INTERNAL: Unknown character class char: %ld", c);
716 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200717 }
718#ifdef FEAT_MBYTE
719 /* When '.' is followed by a composing char ignore the dot, so that
720 * the composing char is matched here. */
721 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
722 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +0200723 old_regparse = regparse;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200724 c = getchr();
725 goto nfa_do_multibyte;
726 }
727#endif
728 EMIT(nfa_classcodes[p - classchars]);
729 if (extra == ADD_NL)
730 {
731 EMIT(NFA_NEWL);
732 EMIT(NFA_OR);
733 regflags |= RF_HASNL;
734 }
735 break;
736
737 case Magic('n'):
738 if (reg_string)
739 /* In a string "\n" matches a newline character. */
740 EMIT(NL);
741 else
742 {
743 /* In buffer text "\n" matches the end of a line. */
744 EMIT(NFA_NEWL);
745 regflags |= RF_HASNL;
746 }
747 break;
748
749 case Magic('('):
750 if (nfa_reg(REG_PAREN) == FAIL)
751 return FAIL; /* cascaded error */
752 break;
753
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200754 case Magic('|'):
755 case Magic('&'):
756 case Magic(')'):
757 syntax_error = TRUE;
Bram Moolenaarba404472013-05-19 22:31:18 +0200758 EMSGN(_(e_misplaced), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200759 return FAIL;
760
761 case Magic('='):
762 case Magic('?'):
763 case Magic('+'):
764 case Magic('@'):
765 case Magic('*'):
766 case Magic('{'):
767 /* these should follow an atom, not form an atom */
768 syntax_error = TRUE;
Bram Moolenaarba404472013-05-19 22:31:18 +0200769 EMSGN(_(e_misplaced), no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200770 return FAIL;
771
772 case Magic('~'): /* previous substitute pattern */
Bram Moolenaar5714b802013-05-28 22:03:20 +0200773 /* TODO: Not supported yet */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200774 return FAIL;
775
Bram Moolenaar428e9872013-05-30 17:05:39 +0200776 case Magic('1'):
777 case Magic('2'):
778 case Magic('3'):
779 case Magic('4'):
780 case Magic('5'):
781 case Magic('6'):
782 case Magic('7'):
783 case Magic('8'):
784 case Magic('9'):
785 EMIT(NFA_BACKREF1 + (no_Magic(c) - '1'));
786 nfa_has_backref = TRUE;
787 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200788
789 case Magic('z'):
790 c = no_Magic(getchr());
791 switch (c)
792 {
793 case 's':
794 EMIT(NFA_ZSTART);
795 break;
796 case 'e':
797 EMIT(NFA_ZEND);
798 nfa_has_zend = TRUE;
Bram Moolenaare0fea9c2013-05-27 20:10:50 +0200799 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200800 case '1':
801 case '2':
802 case '3':
803 case '4':
804 case '5':
805 case '6':
806 case '7':
807 case '8':
808 case '9':
809 case '(':
Bram Moolenaar5714b802013-05-28 22:03:20 +0200810 /* TODO: \z1...\z9 and \z( not yet supported */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200811 return FAIL;
812 default:
813 syntax_error = TRUE;
Bram Moolenaarba404472013-05-19 22:31:18 +0200814 EMSGN(_("E867: (NFA) Unknown operator '\\z%c'"),
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200815 no_Magic(c));
816 return FAIL;
817 }
818 break;
819
820 case Magic('%'):
821 c = no_Magic(getchr());
822 switch (c)
823 {
824 /* () without a back reference */
825 case '(':
826 if (nfa_reg(REG_NPAREN) == FAIL)
827 return FAIL;
828 EMIT(NFA_NOPEN);
829 break;
830
831 case 'd': /* %d123 decimal */
832 case 'o': /* %o123 octal */
833 case 'x': /* %xab hex 2 */
834 case 'u': /* %uabcd hex 4 */
835 case 'U': /* %U1234abcd hex 8 */
Bram Moolenaar47196582013-05-25 22:04:23 +0200836 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +0200837 int nr;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200838
Bram Moolenaar47196582013-05-25 22:04:23 +0200839 switch (c)
840 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +0200841 case 'd': nr = getdecchrs(); break;
842 case 'o': nr = getoctchrs(); break;
843 case 'x': nr = gethexchrs(2); break;
844 case 'u': nr = gethexchrs(4); break;
845 case 'U': nr = gethexchrs(8); break;
846 default: nr = -1; break;
Bram Moolenaar47196582013-05-25 22:04:23 +0200847 }
848
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +0200849 if (nr < 0)
Bram Moolenaar47196582013-05-25 22:04:23 +0200850 EMSG2_RET_FAIL(
851 _("E678: Invalid character after %s%%[dxouU]"),
852 reg_magic == MAGIC_ALL);
853 /* TODO: what if a composing character follows? */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +0200854 EMIT(nr);
Bram Moolenaar47196582013-05-25 22:04:23 +0200855 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200856 break;
857
858 /* Catch \%^ and \%$ regardless of where they appear in the
859 * pattern -- regardless of whether or not it makes sense. */
860 case '^':
861 EMIT(NFA_BOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200862 break;
863
864 case '$':
865 EMIT(NFA_EOF);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200866 break;
867
868 case '#':
Bram Moolenaar423532e2013-05-29 21:14:42 +0200869 EMIT(NFA_CURSOR);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200870 break;
871
872 case 'V':
Bram Moolenaar5714b802013-05-28 22:03:20 +0200873 /* TODO: not supported yet */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200874 return FAIL;
875 break;
876
877 case '[':
Bram Moolenaar5714b802013-05-28 22:03:20 +0200878 /* TODO: \%[abc] not supported yet */
879 return FAIL;
880
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200881 default:
Bram Moolenaar423532e2013-05-29 21:14:42 +0200882 {
Bram Moolenaar021e1472013-05-30 19:18:31 +0200883 int n = 0;
Bram Moolenaar423532e2013-05-29 21:14:42 +0200884 int cmp = c;
885
886 if (c == '<' || c == '>')
887 c = getchr();
888 while (VIM_ISDIGIT(c))
889 {
890 n = n * 10 + (c - '0');
891 c = getchr();
892 }
893 if (c == 'l' || c == 'c' || c == 'v')
894 {
895 EMIT(n);
896 if (c == 'l')
897 EMIT(cmp == '<' ? NFA_LNUM_LT :
898 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
899 else if (c == 'c')
900 EMIT(cmp == '<' ? NFA_COL_LT :
901 cmp == '>' ? NFA_COL_GT : NFA_COL);
902 else
903 EMIT(cmp == '<' ? NFA_VCOL_LT :
904 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
905 break;
906 }
907 else if (c == '\'')
908 /* TODO: \%'m not supported yet */
909 return FAIL;
910 }
Bram Moolenaar5714b802013-05-28 22:03:20 +0200911 syntax_error = TRUE;
912 EMSGN(_("E867: (NFA) Unknown operator '\\%%%c'"),
913 no_Magic(c));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200914 return FAIL;
915 }
916 break;
917
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200918 case Magic('['):
Bram Moolenaar307d10a2013-05-23 22:25:15 +0200919collection:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200920 /*
921 * Glue is emitted between several atoms from the [].
922 * It is either NFA_OR, or NFA_CONCAT.
923 *
924 * [abc] expands to 'a b NFA_OR c NFA_OR' (in postfix notation)
925 * [^abc] expands to 'a NFA_NOT b NFA_NOT NFA_CONCAT c NFA_NOT
926 * NFA_CONCAT NFA_END_NEG_RANGE NFA_CONCAT' (in postfix
927 * notation)
928 *
929 */
930
931
932/* Emit negation atoms, if needed.
933 * The CONCAT below merges the NOT with the previous node. */
934#define TRY_NEG() \
935 if (negated == TRUE) \
936 { \
937 EMIT(NFA_NOT); \
938 }
939
940/* Emit glue between important nodes : CONCAT or OR. */
941#define EMIT_GLUE() \
942 if (first == FALSE) \
943 EMIT(glue); \
944 else \
945 first = FALSE;
946
947 p = regparse;
948 endp = skip_anyof(p);
949 if (*endp == ']')
950 {
951 /*
952 * Try to reverse engineer character classes. For example,
953 * recognize that [0-9] stands for \d and [A-Za-z_] with \h,
954 * and perform the necessary substitutions in the NFA.
955 */
956 result = nfa_recognize_char_class(regparse, endp,
957 extra == ADD_NL);
958 if (result != FAIL)
959 {
960 if (result >= NFA_DIGIT && result <= NFA_NUPPER)
961 EMIT(result);
962 else /* must be char class + newline */
963 {
964 EMIT(result - ADD_NL);
965 EMIT(NFA_NEWL);
966 EMIT(NFA_OR);
967 }
968 regparse = endp;
Bram Moolenaar51a29832013-05-28 22:30:35 +0200969 mb_ptr_adv(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200970 return OK;
971 }
972 /*
973 * Failed to recognize a character class. Use the simple
974 * version that turns [abc] into 'a' OR 'b' OR 'c'
975 */
976 startc = endc = oldstartc = -1;
977 first = TRUE; /* Emitting first atom in this sequence? */
978 negated = FALSE;
979 glue = NFA_OR;
980 if (*regparse == '^') /* negated range */
981 {
982 negated = TRUE;
983 glue = NFA_CONCAT;
Bram Moolenaar51a29832013-05-28 22:30:35 +0200984 mb_ptr_adv(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200985 }
986 if (*regparse == '-')
987 {
988 startc = '-';
989 EMIT(startc);
990 TRY_NEG();
991 EMIT_GLUE();
Bram Moolenaar51a29832013-05-28 22:30:35 +0200992 mb_ptr_adv(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200993 }
994 /* Emit the OR branches for each character in the [] */
995 emit_range = FALSE;
996 while (regparse < endp)
997 {
998 oldstartc = startc;
999 startc = -1;
1000 got_coll_char = FALSE;
1001 if (*regparse == '[')
1002 {
1003 /* Check for [: :], [= =], [. .] */
1004 equiclass = collclass = 0;
1005 charclass = get_char_class(&regparse);
1006 if (charclass == CLASS_NONE)
1007 {
1008 equiclass = get_equi_class(&regparse);
1009 if (equiclass == 0)
1010 collclass = get_coll_element(&regparse);
1011 }
1012
1013 /* Character class like [:alpha:] */
1014 if (charclass != CLASS_NONE)
1015 {
1016 switch (charclass)
1017 {
1018 case CLASS_ALNUM:
1019 EMIT(NFA_CLASS_ALNUM);
1020 break;
1021 case CLASS_ALPHA:
1022 EMIT(NFA_CLASS_ALPHA);
1023 break;
1024 case CLASS_BLANK:
1025 EMIT(NFA_CLASS_BLANK);
1026 break;
1027 case CLASS_CNTRL:
1028 EMIT(NFA_CLASS_CNTRL);
1029 break;
1030 case CLASS_DIGIT:
1031 EMIT(NFA_CLASS_DIGIT);
1032 break;
1033 case CLASS_GRAPH:
1034 EMIT(NFA_CLASS_GRAPH);
1035 break;
1036 case CLASS_LOWER:
1037 EMIT(NFA_CLASS_LOWER);
1038 break;
1039 case CLASS_PRINT:
1040 EMIT(NFA_CLASS_PRINT);
1041 break;
1042 case CLASS_PUNCT:
1043 EMIT(NFA_CLASS_PUNCT);
1044 break;
1045 case CLASS_SPACE:
1046 EMIT(NFA_CLASS_SPACE);
1047 break;
1048 case CLASS_UPPER:
1049 EMIT(NFA_CLASS_UPPER);
1050 break;
1051 case CLASS_XDIGIT:
1052 EMIT(NFA_CLASS_XDIGIT);
1053 break;
1054 case CLASS_TAB:
1055 EMIT(NFA_CLASS_TAB);
1056 break;
1057 case CLASS_RETURN:
1058 EMIT(NFA_CLASS_RETURN);
1059 break;
1060 case CLASS_BACKSPACE:
1061 EMIT(NFA_CLASS_BACKSPACE);
1062 break;
1063 case CLASS_ESCAPE:
1064 EMIT(NFA_CLASS_ESCAPE);
1065 break;
1066 }
1067 TRY_NEG();
1068 EMIT_GLUE();
1069 continue;
1070 }
1071 /* Try equivalence class [=a=] and the like */
1072 if (equiclass != 0)
1073 {
1074 result = nfa_emit_equi_class(equiclass, negated);
1075 if (result == FAIL)
1076 {
1077 /* should never happen */
1078 EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!"));
1079 }
1080 EMIT_GLUE();
1081 continue;
1082 }
1083 /* Try collating class like [. .] */
1084 if (collclass != 0)
1085 {
1086 startc = collclass; /* allow [.a.]-x as a range */
1087 /* Will emit the proper atom at the end of the
1088 * while loop. */
1089 }
1090 }
1091 /* Try a range like 'a-x' or '\t-z' */
1092 if (*regparse == '-')
1093 {
1094 emit_range = TRUE;
1095 startc = oldstartc;
Bram Moolenaar51a29832013-05-28 22:30:35 +02001096 mb_ptr_adv(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001097 continue; /* reading the end of the range */
1098 }
1099
1100 /* Now handle simple and escaped characters.
1101 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1102 * accepts "\t", "\e", etc., but only when the 'l' flag in
1103 * 'cpoptions' is not included.
1104 * Posix doesn't recognize backslash at all.
1105 */
1106 if (*regparse == '\\'
1107 && !cpo_bsl
1108 && regparse + 1 <= endp
1109 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1110 || (!cpo_lit
1111 && vim_strchr(REGEXP_ABBR, regparse[1])
1112 != NULL)
1113 )
1114 )
1115 {
Bram Moolenaar51a29832013-05-28 22:30:35 +02001116 mb_ptr_adv(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001117
Bram Moolenaar673af4d2013-05-21 22:00:51 +02001118 if (*regparse == 'n')
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001119 startc = reg_string ? NL : NFA_NEWL;
1120 else
1121 if (*regparse == 'd'
1122 || *regparse == 'o'
1123 || *regparse == 'x'
1124 || *regparse == 'u'
1125 || *regparse == 'U'
1126 )
1127 {
1128 /* TODO(RE) This needs more testing */
1129 startc = coll_get_char();
1130 got_coll_char = TRUE;
Bram Moolenaar51a29832013-05-28 22:30:35 +02001131 mb_ptr_back(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001132 }
1133 else
1134 {
1135 /* \r,\t,\e,\b */
1136 startc = backslash_trans(*regparse);
1137 }
1138 }
1139
1140 /* Normal printable char */
1141 if (startc == -1)
1142#ifdef FEAT_MBYTE
1143 startc = (*mb_ptr2char)(regparse);
1144#else
1145 startc = *regparse;
1146#endif
1147
1148 /* Previous char was '-', so this char is end of range. */
1149 if (emit_range)
1150 {
1151 endc = startc; startc = oldstartc;
1152 if (startc > endc)
1153 EMSG_RET_FAIL(_(e_invrange));
1154#ifdef FEAT_MBYTE
1155 if (has_mbyte && ((*mb_char2len)(startc) > 1
1156 || (*mb_char2len)(endc) > 1))
1157 {
1158 if (endc > startc + 256)
1159 EMSG_RET_FAIL(_(e_invrange));
1160 /* Emit the range. "startc" was already emitted, so
1161 * skip it. */
1162 for (c = startc + 1; c <= endc; c++)
1163 {
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001164 EMIT(c);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001165 TRY_NEG();
1166 EMIT_GLUE();
1167 }
1168 emit_range = FALSE;
1169 }
1170 else
1171#endif
1172 {
1173#ifdef EBCDIC
1174 int alpha_only = FALSE;
1175
1176 /* for alphabetical range skip the gaps
1177 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
1178 if (isalpha(startc) && isalpha(endc))
1179 alpha_only = TRUE;
1180#endif
1181 /* Emit the range. "startc" was already emitted, so
1182 * skip it. */
1183 for (c = startc + 1; c <= endc; c++)
1184#ifdef EBCDIC
1185 if (!alpha_only || isalpha(startc))
1186#endif
1187 {
1188 EMIT(c);
1189 TRY_NEG();
1190 EMIT_GLUE();
1191 }
1192 emit_range = FALSE;
1193 }
1194 }
1195 else
1196 {
1197 /*
1198 * This char (startc) is not part of a range. Just
1199 * emit it.
1200 *
1201 * Normally, simply emit startc. But if we get char
1202 * code=0 from a collating char, then replace it with
1203 * 0x0a.
1204 *
1205 * This is needed to completely mimic the behaviour of
1206 * the backtracking engine.
1207 */
1208 if (got_coll_char == TRUE && startc == 0)
1209 EMIT(0x0a);
1210 else
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001211 EMIT(startc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001212 TRY_NEG();
1213 EMIT_GLUE();
1214 }
1215
Bram Moolenaar51a29832013-05-28 22:30:35 +02001216 mb_ptr_adv(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001217 } /* while (p < endp) */
1218
Bram Moolenaar51a29832013-05-28 22:30:35 +02001219 mb_ptr_back(old_regparse, regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001220 if (*regparse == '-') /* if last, '-' is just a char */
1221 {
1222 EMIT('-');
1223 TRY_NEG();
1224 EMIT_GLUE();
1225 }
Bram Moolenaar51a29832013-05-28 22:30:35 +02001226 mb_ptr_adv(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001227
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001228 /* skip the trailing ] */
1229 regparse = endp;
Bram Moolenaar51a29832013-05-28 22:30:35 +02001230 mb_ptr_adv(regparse);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001231 if (negated == TRUE)
1232 {
1233 /* Mark end of negated char range */
1234 EMIT(NFA_END_NEG_RANGE);
1235 EMIT(NFA_CONCAT);
1236 }
Bram Moolenaarbad704f2013-05-30 11:51:08 +02001237
1238 /* \_[] also matches \n but it's not negated */
1239 if (extra == ADD_NL)
1240 {
1241 EMIT(reg_string ? NL : NFA_NEWL);
1242 EMIT(NFA_OR);
1243 }
1244
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001245 return OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02001246 } /* if exists closing ] */
1247
1248 if (reg_strict)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001249 {
1250 syntax_error = TRUE;
1251 EMSG_RET_FAIL(_(e_missingbracket));
1252 }
Bram Moolenaarfad8de02013-05-24 23:10:50 +02001253 /* FALLTHROUGH */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001254
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001255 default:
1256 {
1257#ifdef FEAT_MBYTE
1258 int plen;
1259
1260nfa_do_multibyte:
Bram Moolenaar47196582013-05-25 22:04:23 +02001261 /* plen is length of current char with composing chars */
1262 if (enc_utf8 && ((*mb_char2len)(c)
1263 != (plen = (*mb_ptr2len)(old_regparse))
1264 || utf_iscomposing(c)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001265 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02001266 int i = 0;
1267
Bram Moolenaar56d58d52013-05-25 14:42:03 +02001268 /* A base character plus composing characters, or just one
1269 * or more composing characters.
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001270 * This requires creating a separate atom as if enclosing
1271 * the characters in (), where NFA_COMPOSING is the ( and
1272 * NFA_END_COMPOSING is the ). Note that right now we are
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001273 * building the postfix form, not the NFA itself;
1274 * a composing char could be: a, b, c, NFA_COMPOSING
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001275 * where 'b' and 'c' are chars with codes > 256. */
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001276 for (;;)
1277 {
1278 EMIT(c);
1279 if (i > 0)
1280 EMIT(NFA_CONCAT);
Bram Moolenaarfad8de02013-05-24 23:10:50 +02001281 if ((i += utf_char2len(c)) >= plen)
Bram Moolenaar3c577f22013-05-24 21:59:54 +02001282 break;
1283 c = utf_ptr2char(old_regparse + i);
1284 }
1285 EMIT(NFA_COMPOSING);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001286 regparse = old_regparse + plen;
1287 }
1288 else
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001289#endif
1290 {
1291 c = no_Magic(c);
1292 EMIT(c);
1293 }
1294 return OK;
1295 }
1296 }
1297
1298#undef TRY_NEG
1299#undef EMIT_GLUE
1300
1301 return OK;
1302}
1303
1304/*
1305 * Parse something followed by possible [*+=].
1306 *
1307 * A piece is an atom, possibly followed by a multi, an indication of how many
1308 * times the atom can be matched. Example: "a*" matches any sequence of "a"
1309 * characters: "", "a", "aa", etc.
1310 *
1311 * piece ::= atom
1312 * or atom multi
1313 */
1314 static int
1315nfa_regpiece()
1316{
1317 int i;
1318 int op;
1319 int ret;
1320 long minval, maxval;
1321 int greedy = TRUE; /* Braces are prefixed with '-' ? */
1322 char_u *old_regparse, *new_regparse;
1323 int c2;
Bram Moolenaar16299b52013-05-30 18:45:23 +02001324 int old_post_pos;
1325 int my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001326 int old_regnpar;
1327 int quest;
1328
1329 /* Save the current position in the regexp, so that we can use it if
1330 * <atom>{m,n} is next. */
1331 old_regparse = regparse;
1332 /* Save current number of open parenthesis, so we can use it if
1333 * <atom>{m,n} is next */
1334 old_regnpar = regnpar;
1335 /* store current pos in the postfix form, for \{m,n} involving 0s */
Bram Moolenaar16299b52013-05-30 18:45:23 +02001336 my_post_start = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001337
1338 ret = nfa_regatom();
1339 if (ret == FAIL)
1340 return FAIL; /* cascaded error */
1341
1342 op = peekchr();
1343 if (re_multi_type(op) == NOT_MULTI)
1344 return OK;
1345
1346 skipchr();
1347 switch (op)
1348 {
1349 case Magic('*'):
1350 EMIT(NFA_STAR);
1351 break;
1352
1353 case Magic('+'):
1354 /*
1355 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
1356 * first and only submatch would be "aaa". But the backtracking
1357 * engine interprets the plus as "try matching one more time", and
1358 * a* matches a second time at the end of the input, the empty
1359 * string.
1360 * The submatch will the empty string.
1361 *
Bram Moolenaar54dafde2013-05-31 23:18:00 +02001362 * In order to be consistent with the old engine, we replace
1363 * <atom>+ with <atom><atom>*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001364 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001365 regnpar = old_regnpar;
1366 regparse = old_regparse;
1367 curchr = -1;
1368 if (nfa_regatom() == FAIL)
1369 return FAIL;
1370 EMIT(NFA_STAR);
1371 EMIT(NFA_CONCAT);
1372 skipchr(); /* skip the \+ */
1373 break;
1374
1375 case Magic('@'):
1376 op = no_Magic(getchr());
1377 switch(op)
1378 {
1379 case '=':
1380 EMIT(NFA_PREV_ATOM_NO_WIDTH);
1381 break;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02001382 case '!':
1383 EMIT(NFA_PREV_ATOM_NO_WIDTH_NEG);
1384 break;
Bram Moolenaar75eb1612013-05-29 18:45:11 +02001385 case '0':
1386 case '1':
1387 case '2':
1388 case '3':
1389 case '4':
1390 case '5':
1391 case '6':
1392 case '7':
1393 case '8':
1394 case '9':
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001395 case '<':
1396 case '>':
1397 /* Not supported yet */
1398 return FAIL;
1399 default:
1400 syntax_error = TRUE;
Bram Moolenaarba404472013-05-19 22:31:18 +02001401 EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001402 return FAIL;
1403 }
1404 break;
1405
1406 case Magic('?'):
1407 case Magic('='):
1408 EMIT(NFA_QUEST);
1409 break;
1410
1411 case Magic('{'):
1412 /* a{2,5} will expand to 'aaa?a?a?'
1413 * a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
1414 * version of '?'
1415 * \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
1416 * parenthesis have the same id
1417 */
1418
1419 greedy = TRUE;
1420 c2 = peekchr();
1421 if (c2 == '-' || c2 == Magic('-'))
1422 {
1423 skipchr();
1424 greedy = FALSE;
1425 }
1426 if (!read_limits(&minval, &maxval))
1427 {
1428 syntax_error = TRUE;
1429 EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
1430 }
1431 /* <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
1432 * <atom>* */
1433 if (minval == 0 && maxval == MAX_LIMIT && greedy)
1434 {
1435 EMIT(NFA_STAR);
1436 break;
1437 }
1438
Bram Moolenaar54dafde2013-05-31 23:18:00 +02001439 /* TODO: \{-} doesn't work yet */
1440 if (maxval == MAX_LIMIT && !greedy)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001441 return FAIL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001442
1443 /* Special case: x{0} or x{-0} */
1444 if (maxval == 0)
1445 {
1446 /* Ignore result of previous call to nfa_regatom() */
Bram Moolenaar16299b52013-05-30 18:45:23 +02001447 post_ptr = post_start + my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001448 /* NFA_SKIP_CHAR has 0-length and works everywhere */
1449 EMIT(NFA_SKIP_CHAR);
1450 return OK;
1451 }
1452
1453 /* Ignore previous call to nfa_regatom() */
Bram Moolenaar16299b52013-05-30 18:45:23 +02001454 post_ptr = post_start + my_post_start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001455 /* Save pos after the repeated atom and the \{} */
1456 new_regparse = regparse;
1457
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001458 quest = (greedy == TRUE? NFA_QUEST : NFA_QUEST_NONGREEDY);
1459 for (i = 0; i < maxval; i++)
1460 {
1461 /* Goto beginning of the repeated atom */
1462 regparse = old_regparse;
1463 curchr = -1;
1464 /* Restore count of parenthesis */
1465 regnpar = old_regnpar;
Bram Moolenaar16299b52013-05-30 18:45:23 +02001466 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001467 if (nfa_regatom() == FAIL)
1468 return FAIL;
1469 /* after "minval" times, atoms are optional */
1470 if (i + 1 > minval)
Bram Moolenaar54dafde2013-05-31 23:18:00 +02001471 {
1472 if (maxval == MAX_LIMIT)
1473 EMIT(NFA_STAR);
1474 else
1475 EMIT(quest);
1476 }
Bram Moolenaar16299b52013-05-30 18:45:23 +02001477 if (old_post_pos != my_post_start)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001478 EMIT(NFA_CONCAT);
Bram Moolenaar54dafde2013-05-31 23:18:00 +02001479 if (i + 1 > minval && maxval == MAX_LIMIT)
1480 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001481 }
1482
1483 /* Go to just after the repeated atom and the \{} */
1484 regparse = new_regparse;
1485 curchr = -1;
1486
1487 break;
1488
1489
1490 default:
1491 break;
1492 } /* end switch */
1493
1494 if (re_multi_type(peekchr()) != NOT_MULTI)
1495 {
1496 /* Can't have a multi follow a multi. */
1497 syntax_error = TRUE;
1498 EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi !"));
1499 }
1500
1501 return OK;
1502}
1503
1504/*
1505 * Parse one or more pieces, concatenated. It matches a match for the
1506 * first piece, followed by a match for the second piece, etc. Example:
1507 * "f[0-9]b", first matches "f", then a digit and then "b".
1508 *
1509 * concat ::= piece
1510 * or piece piece
1511 * or piece piece piece
1512 * etc.
1513 */
1514 static int
1515nfa_regconcat()
1516{
1517 int cont = TRUE;
1518 int first = TRUE;
1519
1520 while (cont)
1521 {
1522 switch (peekchr())
1523 {
1524 case NUL:
1525 case Magic('|'):
1526 case Magic('&'):
1527 case Magic(')'):
1528 cont = FALSE;
1529 break;
1530
1531 case Magic('Z'):
1532#ifdef FEAT_MBYTE
1533 regflags |= RF_ICOMBINE;
1534#endif
1535 skipchr_keepstart();
1536 break;
1537 case Magic('c'):
1538 regflags |= RF_ICASE;
1539 skipchr_keepstart();
1540 break;
1541 case Magic('C'):
1542 regflags |= RF_NOICASE;
1543 skipchr_keepstart();
1544 break;
1545 case Magic('v'):
1546 reg_magic = MAGIC_ALL;
1547 skipchr_keepstart();
1548 curchr = -1;
1549 break;
1550 case Magic('m'):
1551 reg_magic = MAGIC_ON;
1552 skipchr_keepstart();
1553 curchr = -1;
1554 break;
1555 case Magic('M'):
1556 reg_magic = MAGIC_OFF;
1557 skipchr_keepstart();
1558 curchr = -1;
1559 break;
1560 case Magic('V'):
1561 reg_magic = MAGIC_NONE;
1562 skipchr_keepstart();
1563 curchr = -1;
1564 break;
1565
1566 default:
1567 if (nfa_regpiece() == FAIL)
1568 return FAIL;
1569 if (first == FALSE)
1570 EMIT(NFA_CONCAT);
1571 else
1572 first = FALSE;
1573 break;
1574 }
1575 }
1576
1577 return OK;
1578}
1579
1580/*
1581 * Parse a branch, one or more concats, separated by "\&". It matches the
1582 * last concat, but only if all the preceding concats also match at the same
1583 * position. Examples:
1584 * "foobeep\&..." matches "foo" in "foobeep".
1585 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
1586 *
1587 * branch ::= concat
1588 * or concat \& concat
1589 * or concat \& concat \& concat
1590 * etc.
1591 */
1592 static int
1593nfa_regbranch()
1594{
1595 int ch;
Bram Moolenaar16299b52013-05-30 18:45:23 +02001596 int old_post_pos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001597
Bram Moolenaar16299b52013-05-30 18:45:23 +02001598 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001599
1600 /* First branch, possibly the only one */
1601 if (nfa_regconcat() == FAIL)
1602 return FAIL;
1603
1604 ch = peekchr();
1605 /* Try next concats */
1606 while (ch == Magic('&'))
1607 {
1608 skipchr();
1609 EMIT(NFA_NOPEN);
1610 EMIT(NFA_PREV_ATOM_NO_WIDTH);
Bram Moolenaar16299b52013-05-30 18:45:23 +02001611 old_post_pos = (int)(post_ptr - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001612 if (nfa_regconcat() == FAIL)
1613 return FAIL;
1614 /* if concat is empty, skip a input char. But do emit a node */
Bram Moolenaar16299b52013-05-30 18:45:23 +02001615 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001616 EMIT(NFA_SKIP_CHAR);
1617 EMIT(NFA_CONCAT);
1618 ch = peekchr();
1619 }
1620
1621 /* Even if a branch is empty, emit one node for it */
Bram Moolenaar16299b52013-05-30 18:45:23 +02001622 if (old_post_pos == (int)(post_ptr - post_start))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001623 EMIT(NFA_SKIP_CHAR);
1624
1625 return OK;
1626}
1627
1628/*
1629 * Parse a pattern, one or more branches, separated by "\|". It matches
1630 * anything that matches one of the branches. Example: "foo\|beep" matches
1631 * "foo" and matches "beep". If more than one branch matches, the first one
1632 * is used.
1633 *
1634 * pattern ::= branch
1635 * or branch \| branch
1636 * or branch \| branch \| branch
1637 * etc.
1638 */
1639 static int
1640nfa_reg(paren)
1641 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
1642{
1643 int parno = 0;
1644
1645#ifdef FEAT_SYN_HL
1646#endif
1647 if (paren == REG_PAREN)
1648 {
1649 if (regnpar >= NSUBEXP) /* Too many `(' */
1650 {
1651 syntax_error = TRUE;
1652 EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
1653 }
1654 parno = regnpar++;
1655 }
1656
1657 if (nfa_regbranch() == FAIL)
1658 return FAIL; /* cascaded error */
1659
1660 while (peekchr() == Magic('|'))
1661 {
1662 skipchr();
1663 if (nfa_regbranch() == FAIL)
1664 return FAIL; /* cascaded error */
1665 EMIT(NFA_OR);
1666 }
1667
1668 /* Check for proper termination. */
1669 if (paren != REG_NOPAREN && getchr() != Magic(')'))
1670 {
1671 syntax_error = TRUE;
1672 if (paren == REG_NPAREN)
1673 EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
1674 else
1675 EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
1676 }
1677 else if (paren == REG_NOPAREN && peekchr() != NUL)
1678 {
1679 syntax_error = TRUE;
1680 if (peekchr() == Magic(')'))
1681 EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
1682 else
1683 EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
1684 }
1685 /*
1686 * Here we set the flag allowing back references to this set of
1687 * parentheses.
1688 */
1689 if (paren == REG_PAREN)
1690 {
1691 had_endbrace[parno] = TRUE; /* have seen the close paren */
1692 EMIT(NFA_MOPEN + parno);
1693 }
1694
1695 return OK;
1696}
1697
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001698#ifdef DEBUG
1699static char_u code[50];
1700
1701 static void
1702nfa_set_code(c)
1703 int c;
1704{
1705 int addnl = FALSE;
1706
1707 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL)
1708 {
1709 addnl = TRUE;
1710 c -= ADD_NL;
1711 }
1712
1713 STRCPY(code, "");
1714 switch (c)
1715 {
1716 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
1717 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
1718 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
1719 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
1720 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
1721 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
1722
Bram Moolenaar5714b802013-05-28 22:03:20 +02001723 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
1724 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
1725 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
1726 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
1727 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
1728 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
1729 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
1730 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
1731 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
1732 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
1733
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001734 case NFA_PREV_ATOM_NO_WIDTH:
1735 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
Bram Moolenaar423532e2013-05-29 21:14:42 +02001736 case NFA_PREV_ATOM_NO_WIDTH_NEG:
1737 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02001738 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
1739 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001740 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
1741 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
1742
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001743 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
1744 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
1745
1746 case NFA_MOPEN + 0:
1747 case NFA_MOPEN + 1:
1748 case NFA_MOPEN + 2:
1749 case NFA_MOPEN + 3:
1750 case NFA_MOPEN + 4:
1751 case NFA_MOPEN + 5:
1752 case NFA_MOPEN + 6:
1753 case NFA_MOPEN + 7:
1754 case NFA_MOPEN + 8:
1755 case NFA_MOPEN + 9:
1756 STRCPY(code, "NFA_MOPEN(x)");
1757 code[10] = c - NFA_MOPEN + '0';
1758 break;
1759 case NFA_MCLOSE + 0:
1760 case NFA_MCLOSE + 1:
1761 case NFA_MCLOSE + 2:
1762 case NFA_MCLOSE + 3:
1763 case NFA_MCLOSE + 4:
1764 case NFA_MCLOSE + 5:
1765 case NFA_MCLOSE + 6:
1766 case NFA_MCLOSE + 7:
1767 case NFA_MCLOSE + 8:
1768 case NFA_MCLOSE + 9:
1769 STRCPY(code, "NFA_MCLOSE(x)");
1770 code[11] = c - NFA_MCLOSE + '0';
1771 break;
1772 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
1773 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
1774 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
1775 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
Bram Moolenaar4b780632013-05-31 22:14:52 +02001776 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
1777 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001778 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001779 case NFA_NOT: STRCPY(code, "NFA_NOT "); break;
1780 case NFA_SKIP_CHAR: STRCPY(code, "NFA_SKIP_CHAR"); break;
1781 case NFA_OR: STRCPY(code, "NFA_OR"); break;
1782 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
1783 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
1784 case NFA_END_NEG_RANGE: STRCPY(code, "NFA_END_NEG_RANGE"); break;
1785 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
1786 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
1787 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
1788 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
1789 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
1790 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
1791 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
1792 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
1793 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
1794 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
1795 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
1796 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
1797 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
1798 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
1799 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
1800 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
1801
1802 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
1803 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
1804 case NFA_SIDENT:STRCPY(code, "NFA_SIDENT"); break;
1805 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
1806 case NFA_SKWORD:STRCPY(code, "NFA_SKWORD"); break;
1807 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
1808 case NFA_SFNAME:STRCPY(code, "NFA_SFNAME"); break;
1809 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
1810 case NFA_SPRINT:STRCPY(code, "NFA_SPRINT"); break;
1811 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
1812 case NFA_NWHITE:STRCPY(code, "NFA_NWHITE"); break;
1813 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
1814 case NFA_NDIGIT:STRCPY(code, "NFA_NDIGIT"); break;
1815 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
1816 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
1817 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
1818 case NFA_NOCTAL:STRCPY(code, "NFA_NOCTAL"); break;
1819 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
1820 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
1821 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
1822 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
1823 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
1824 case NFA_NALPHA:STRCPY(code, "NFA_NALPHA"); break;
1825 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
1826 case NFA_NLOWER:STRCPY(code, "NFA_NLOWER"); break;
1827 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
1828 case NFA_NUPPER:STRCPY(code, "NFA_NUPPER"); break;
1829
1830 default:
1831 STRCPY(code, "CHAR(x)");
1832 code[5] = c;
1833 }
1834
1835 if (addnl == TRUE)
1836 STRCAT(code, " + NEWLINE ");
1837
1838}
1839
1840#ifdef ENABLE_LOG
1841static FILE *log_fd;
1842
1843/*
1844 * Print the postfix notation of the current regexp.
1845 */
1846 static void
1847nfa_postfix_dump(expr, retval)
1848 char_u *expr;
1849 int retval;
1850{
1851 int *p;
1852 FILE *f;
1853
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02001854 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001855 if (f != NULL)
1856 {
1857 fprintf(f, "\n-------------------------\n");
1858 if (retval == FAIL)
1859 fprintf(f, ">>> NFA engine failed ... \n");
1860 else if (retval == OK)
1861 fprintf(f, ">>> NFA engine succeeded !\n");
1862 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
Bram Moolenaar745fc022013-05-20 22:20:02 +02001863 for (p = post_start; *p && p < post_end; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001864 {
1865 nfa_set_code(*p);
1866 fprintf(f, "%s, ", code);
1867 }
1868 fprintf(f, "\"\nPostfix notation (int): ");
Bram Moolenaar745fc022013-05-20 22:20:02 +02001869 for (p = post_start; *p && p < post_end; p++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001870 fprintf(f, "%d ", *p);
1871 fprintf(f, "\n\n");
1872 fclose(f);
1873 }
1874}
1875
1876/*
1877 * Print the NFA starting with a root node "state".
1878 */
1879 static void
Bram Moolenaar152e7892013-05-25 12:28:11 +02001880nfa_print_state(debugf, state)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001881 FILE *debugf;
1882 nfa_state_T *state;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001883{
Bram Moolenaar152e7892013-05-25 12:28:11 +02001884 garray_T indent;
1885
1886 ga_init2(&indent, 1, 64);
1887 ga_append(&indent, '\0');
1888 nfa_print_state2(debugf, state, &indent);
1889 ga_clear(&indent);
1890}
1891
1892 static void
1893nfa_print_state2(debugf, state, indent)
1894 FILE *debugf;
1895 nfa_state_T *state;
1896 garray_T *indent;
1897{
1898 char_u *p;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001899
1900 if (state == NULL)
1901 return;
1902
1903 fprintf(debugf, "(%2d)", abs(state->id));
Bram Moolenaar152e7892013-05-25 12:28:11 +02001904
1905 /* Output indent */
1906 p = (char_u *)indent->ga_data;
1907 if (indent->ga_len >= 3)
1908 {
1909 int last = indent->ga_len - 3;
1910 char_u save[2];
1911
1912 STRNCPY(save, &p[last], 2);
1913 STRNCPY(&p[last], "+-", 2);
1914 fprintf(debugf, " %s", p);
1915 STRNCPY(&p[last], save, 2);
1916 }
1917 else
1918 fprintf(debugf, " %s", p);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001919
1920 nfa_set_code(state->c);
Bram Moolenaar152e7892013-05-25 12:28:11 +02001921 fprintf(debugf, "%s%s (%d) (id=%d)\n",
1922 state->negated ? "NOT " : "", code, state->c, abs(state->id));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001923 if (state->id < 0)
1924 return;
1925
1926 state->id = abs(state->id) * -1;
Bram Moolenaar152e7892013-05-25 12:28:11 +02001927
1928 /* grow indent for state->out */
1929 indent->ga_len -= 1;
1930 if (state->out1)
Bram Moolenaarf47ca632013-05-25 15:31:05 +02001931 ga_concat(indent, (char_u *)"| ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02001932 else
Bram Moolenaarf47ca632013-05-25 15:31:05 +02001933 ga_concat(indent, (char_u *)" ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02001934 ga_append(indent, '\0');
1935
1936 nfa_print_state2(debugf, state->out, indent);
1937
1938 /* replace last part of indent for state->out1 */
1939 indent->ga_len -= 3;
Bram Moolenaarf47ca632013-05-25 15:31:05 +02001940 ga_concat(indent, (char_u *)" ");
Bram Moolenaar152e7892013-05-25 12:28:11 +02001941 ga_append(indent, '\0');
1942
1943 nfa_print_state2(debugf, state->out1, indent);
1944
1945 /* shrink indent */
1946 indent->ga_len -= 3;
1947 ga_append(indent, '\0');
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001948}
1949
1950/*
1951 * Print the NFA state machine.
1952 */
1953 static void
1954nfa_dump(prog)
1955 nfa_regprog_T *prog;
1956{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02001957 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001958
1959 if (debugf != NULL)
1960 {
Bram Moolenaar152e7892013-05-25 12:28:11 +02001961 nfa_print_state(debugf, prog->start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001962 fclose(debugf);
1963 }
1964}
1965#endif /* ENABLE_LOG */
1966#endif /* DEBUG */
1967
1968/*
1969 * Parse r.e. @expr and convert it into postfix form.
1970 * Return the postfix string on success, NULL otherwise.
1971 */
1972 static int *
1973re2post()
1974{
1975 if (nfa_reg(REG_NOPAREN) == FAIL)
1976 return NULL;
1977 EMIT(NFA_MOPEN);
1978 return post_start;
1979}
1980
1981/* NB. Some of the code below is inspired by Russ's. */
1982
1983/*
1984 * Represents an NFA state plus zero or one or two arrows exiting.
1985 * if c == MATCH, no arrows out; matching state.
1986 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
1987 * If c < 256, labeled arrow with character c to out.
1988 */
1989
1990static nfa_state_T *state_ptr; /* points to nfa_prog->state */
1991
1992/*
1993 * Allocate and initialize nfa_state_T.
1994 */
1995 static nfa_state_T *
1996new_state(c, out, out1)
1997 int c;
1998 nfa_state_T *out;
1999 nfa_state_T *out1;
2000{
2001 nfa_state_T *s;
2002
2003 if (istate >= nstate)
2004 return NULL;
2005
2006 s = &state_ptr[istate++];
2007
2008 s->c = c;
2009 s->out = out;
2010 s->out1 = out1;
2011
2012 s->id = istate;
2013 s->lastlist = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002014 s->negated = FALSE;
2015
2016 return s;
2017}
2018
2019/*
2020 * A partially built NFA without the matching state filled in.
2021 * Frag_T.start points at the start state.
2022 * Frag_T.out is a list of places that need to be set to the
2023 * next state for this fragment.
2024 */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002025
2026/* Since the out pointers in the list are always
2027 * uninitialized, we use the pointers themselves
2028 * as storage for the Ptrlists. */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002029typedef union Ptrlist Ptrlist;
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002030union Ptrlist
2031{
2032 Ptrlist *next;
2033 nfa_state_T *s;
2034};
2035
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002036struct Frag
2037{
Bram Moolenaar61db8b52013-05-26 17:45:49 +02002038 nfa_state_T *start;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002039 Ptrlist *out;
2040};
2041typedef struct Frag Frag_T;
2042
2043static Frag_T frag __ARGS((nfa_state_T *start, Ptrlist *out));
2044static Ptrlist *list1 __ARGS((nfa_state_T **outp));
2045static void patch __ARGS((Ptrlist *l, nfa_state_T *s));
2046static Ptrlist *append __ARGS((Ptrlist *l1, Ptrlist *l2));
2047static void st_push __ARGS((Frag_T s, Frag_T **p, Frag_T *stack_end));
2048static Frag_T st_pop __ARGS((Frag_T **p, Frag_T *stack));
2049
2050/*
Bram Moolenaar053bb602013-05-20 13:55:21 +02002051 * Initialize a Frag_T struct and return it.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002052 */
2053 static Frag_T
2054frag(start, out)
2055 nfa_state_T *start;
2056 Ptrlist *out;
2057{
Bram Moolenaar053bb602013-05-20 13:55:21 +02002058 Frag_T n;
2059
2060 n.start = start;
2061 n.out = out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002062 return n;
2063}
2064
2065/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002066 * Create singleton list containing just outp.
2067 */
2068 static Ptrlist *
2069list1(outp)
2070 nfa_state_T **outp;
2071{
2072 Ptrlist *l;
2073
2074 l = (Ptrlist *)outp;
2075 l->next = NULL;
2076 return l;
2077}
2078
2079/*
2080 * Patch the list of states at out to point to start.
2081 */
2082 static void
2083patch(l, s)
2084 Ptrlist *l;
2085 nfa_state_T *s;
2086{
2087 Ptrlist *next;
2088
2089 for (; l; l = next)
2090 {
2091 next = l->next;
2092 l->s = s;
2093 }
2094}
2095
2096
2097/*
2098 * Join the two lists l1 and l2, returning the combination.
2099 */
2100 static Ptrlist *
2101append(l1, l2)
2102 Ptrlist *l1;
2103 Ptrlist *l2;
2104{
2105 Ptrlist *oldl1;
2106
2107 oldl1 = l1;
2108 while (l1->next)
2109 l1 = l1->next;
2110 l1->next = l2;
2111 return oldl1;
2112}
2113
2114/*
2115 * Stack used for transforming postfix form into NFA.
2116 */
2117static Frag_T empty;
2118
2119 static void
2120st_error(postfix, end, p)
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002121 int *postfix UNUSED;
2122 int *end UNUSED;
2123 int *p UNUSED;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002124{
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002125#ifdef NFA_REGEXP_ERROR_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002126 FILE *df;
2127 int *p2;
2128
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002129 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002130 if (df)
2131 {
2132 fprintf(df, "Error popping the stack!\n");
2133#ifdef DEBUG
2134 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
2135#endif
2136 fprintf(df, "Postfix form is: ");
2137#ifdef DEBUG
2138 for (p2 = postfix; p2 < end; p2++)
2139 {
2140 nfa_set_code(*p2);
2141 fprintf(df, "%s, ", code);
2142 }
2143 nfa_set_code(*p);
2144 fprintf(df, "\nCurrent position is: ");
2145 for (p2 = postfix; p2 <= p; p2 ++)
2146 {
2147 nfa_set_code(*p2);
2148 fprintf(df, "%s, ", code);
2149 }
2150#else
2151 for (p2 = postfix; p2 < end; p2++)
2152 {
2153 fprintf(df, "%d, ", *p2);
2154 }
2155 fprintf(df, "\nCurrent position is: ");
2156 for (p2 = postfix; p2 <= p; p2 ++)
2157 {
2158 fprintf(df, "%d, ", *p2);
2159 }
2160#endif
2161 fprintf(df, "\n--------------------------\n");
2162 fclose(df);
2163 }
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002164#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002165 EMSG(_("E874: (NFA) Could not pop the stack !"));
2166}
2167
2168/*
2169 * Push an item onto the stack.
2170 */
2171 static void
2172st_push(s, p, stack_end)
2173 Frag_T s;
2174 Frag_T **p;
2175 Frag_T *stack_end;
2176{
2177 Frag_T *stackp = *p;
2178
2179 if (stackp >= stack_end)
2180 return;
2181 *stackp = s;
2182 *p = *p + 1;
2183}
2184
2185/*
2186 * Pop an item from the stack.
2187 */
2188 static Frag_T
2189st_pop(p, stack)
2190 Frag_T **p;
2191 Frag_T *stack;
2192{
2193 Frag_T *stackp;
2194
2195 *p = *p - 1;
2196 stackp = *p;
2197 if (stackp < stack)
2198 return empty;
2199 return **p;
2200}
2201
2202/*
2203 * Convert a postfix form into its equivalent NFA.
2204 * Return the NFA start state on success, NULL otherwise.
2205 */
2206 static nfa_state_T *
2207post2nfa(postfix, end, nfa_calc_size)
2208 int *postfix;
2209 int *end;
2210 int nfa_calc_size;
2211{
2212 int *p;
2213 int mopen;
2214 int mclose;
2215 Frag_T *stack = NULL;
2216 Frag_T *stackp = NULL;
2217 Frag_T *stack_end = NULL;
2218 Frag_T e1;
2219 Frag_T e2;
2220 Frag_T e;
2221 nfa_state_T *s;
2222 nfa_state_T *s1;
2223 nfa_state_T *matchstate;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002224 nfa_state_T *ret = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002225
2226 if (postfix == NULL)
2227 return NULL;
2228
Bram Moolenaar053bb602013-05-20 13:55:21 +02002229#define PUSH(s) st_push((s), &stackp, stack_end)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002230#define POP() st_pop(&stackp, stack); \
2231 if (stackp < stack) \
2232 { \
2233 st_error(postfix, end, p); \
2234 return NULL; \
2235 }
2236
2237 if (nfa_calc_size == FALSE)
2238 {
2239 /* Allocate space for the stack. Max states on the stack : nstate */
Bram Moolenaare3c7b862013-05-20 21:57:03 +02002240 stack = (Frag_T *) lalloc((nstate + 1) * sizeof(Frag_T), TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002241 stackp = stack;
Bram Moolenaare3c7b862013-05-20 21:57:03 +02002242 stack_end = stack + (nstate + 1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002243 }
2244
2245 for (p = postfix; p < end; ++p)
2246 {
2247 switch (*p)
2248 {
2249 case NFA_CONCAT:
2250 /* Catenation.
2251 * Pay attention: this operator does not exist
2252 * in the r.e. itself (it is implicit, really).
2253 * It is added when r.e. is translated to postfix
2254 * form in re2post().
2255 *
2256 * No new state added here. */
2257 if (nfa_calc_size == TRUE)
2258 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002259 /* nstate += 0; */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002260 break;
2261 }
2262 e2 = POP();
2263 e1 = POP();
2264 patch(e1.out, e2.start);
2265 PUSH(frag(e1.start, e2.out));
2266 break;
2267
2268 case NFA_NOT:
2269 /* Negation of a character */
2270 if (nfa_calc_size == TRUE)
2271 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002272 /* nstate += 0; */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002273 break;
2274 }
2275 e1 = POP();
2276 e1.start->negated = TRUE;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002277#ifdef FEAT_MBYTE
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002278 if (e1.start->c == NFA_COMPOSING)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002279 e1.start->out1->negated = TRUE;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002280#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002281 PUSH(e1);
2282 break;
2283
2284 case NFA_OR:
2285 /* Alternation */
2286 if (nfa_calc_size == TRUE)
2287 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002288 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002289 break;
2290 }
2291 e2 = POP();
2292 e1 = POP();
2293 s = new_state(NFA_SPLIT, e1.start, e2.start);
2294 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002295 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002296 PUSH(frag(s, append(e1.out, e2.out)));
2297 break;
2298
2299 case NFA_STAR:
2300 /* Zero or more */
2301 if (nfa_calc_size == TRUE)
2302 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002303 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002304 break;
2305 }
2306 e = POP();
2307 s = new_state(NFA_SPLIT, e.start, NULL);
2308 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002309 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002310 patch(e.out, s);
2311 PUSH(frag(s, list1(&s->out1)));
2312 break;
2313
2314 case NFA_QUEST:
2315 /* one or zero atoms=> greedy match */
2316 if (nfa_calc_size == TRUE)
2317 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002318 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002319 break;
2320 }
2321 e = POP();
2322 s = new_state(NFA_SPLIT, e.start, NULL);
2323 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002324 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002325 PUSH(frag(s, append(e.out, list1(&s->out1))));
2326 break;
2327
2328 case NFA_QUEST_NONGREEDY:
2329 /* zero or one atoms => non-greedy match */
2330 if (nfa_calc_size == TRUE)
2331 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002332 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002333 break;
2334 }
2335 e = POP();
2336 s = new_state(NFA_SPLIT, NULL, e.start);
2337 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002338 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002339 PUSH(frag(s, append(e.out, list1(&s->out))));
2340 break;
2341
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002342 case NFA_SKIP_CHAR:
2343 /* Symbol of 0-length, Used in a repetition
2344 * with max/min count of 0 */
2345 if (nfa_calc_size == TRUE)
2346 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002347 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002348 break;
2349 }
2350 s = new_state(NFA_SKIP_CHAR, NULL, NULL);
2351 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002352 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002353 PUSH(frag(s, list1(&s->out)));
2354 break;
2355
2356 case NFA_PREV_ATOM_NO_WIDTH:
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002357 case NFA_PREV_ATOM_NO_WIDTH_NEG:
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002358 /* The \@= operator: match the preceding atom with zero width.
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002359 * The \@! operator: no match for the preceding atom.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002360 * Surrounds the preceding atom with START_INVISIBLE and
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002361 * END_INVISIBLE, similarly to MOPEN. */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002362
2363 if (nfa_calc_size == TRUE)
2364 {
2365 nstate += 2;
2366 break;
2367 }
2368 e = POP();
2369 s1 = new_state(NFA_END_INVISIBLE, NULL, NULL);
2370 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002371 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002372 patch(e.out, s1);
2373
2374 s = new_state(NFA_START_INVISIBLE, e.start, s1);
2375 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002376 goto theend;
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02002377 if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG)
2378 {
2379 s->negated = TRUE;
2380 s1->negated = TRUE;
2381 }
2382
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002383 PUSH(frag(s, list1(&s1->out)));
2384 break;
2385
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002386#ifdef FEAT_MBYTE
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002387 case NFA_COMPOSING: /* char with composing char */
2388#if 0
2389 /* TODO */
2390 if (regflags & RF_ICOMBINE)
2391 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02002392 /* use the base character only */
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002393 }
2394#endif
2395 /* FALLTHROUGH */
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002396#endif
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002397
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002398 case NFA_MOPEN + 0: /* Submatch */
2399 case NFA_MOPEN + 1:
2400 case NFA_MOPEN + 2:
2401 case NFA_MOPEN + 3:
2402 case NFA_MOPEN + 4:
2403 case NFA_MOPEN + 5:
2404 case NFA_MOPEN + 6:
2405 case NFA_MOPEN + 7:
2406 case NFA_MOPEN + 8:
2407 case NFA_MOPEN + 9:
2408 case NFA_NOPEN: /* \%( "Invisible Submatch" */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002409 if (nfa_calc_size == TRUE)
2410 {
2411 nstate += 2;
2412 break;
2413 }
2414
2415 mopen = *p;
2416 switch (*p)
2417 {
2418 case NFA_NOPEN:
2419 mclose = NFA_NCLOSE;
2420 break;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002421#ifdef FEAT_MBYTE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002422 case NFA_COMPOSING:
2423 mclose = NFA_END_COMPOSING;
2424 break;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002425#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002426 default:
2427 /* NFA_MOPEN(0) ... NFA_MOPEN(9) */
2428 mclose = *p + NSUBEXP;
2429 break;
2430 }
2431
2432 /* Allow "NFA_MOPEN" as a valid postfix representation for
2433 * the empty regexp "". In this case, the NFA will be
2434 * NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
2435 * empty groups of parenthesis, and empty mbyte chars */
2436 if (stackp == stack)
2437 {
2438 s = new_state(mopen, NULL, NULL);
2439 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002440 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002441 s1 = new_state(mclose, NULL, NULL);
2442 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002443 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002444 patch(list1(&s->out), s1);
2445 PUSH(frag(s, list1(&s1->out)));
2446 break;
2447 }
2448
2449 /* At least one node was emitted before NFA_MOPEN, so
2450 * at least one node will be between NFA_MOPEN and NFA_MCLOSE */
2451 e = POP();
2452 s = new_state(mopen, e.start, NULL); /* `(' */
2453 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002454 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002455
2456 s1 = new_state(mclose, NULL, NULL); /* `)' */
2457 if (s1 == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002458 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002459 patch(e.out, s1);
2460
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002461#ifdef FEAT_MBYTE
Bram Moolenaar3c577f22013-05-24 21:59:54 +02002462 if (mopen == NFA_COMPOSING)
2463 /* COMPOSING->out1 = END_COMPOSING */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002464 patch(list1(&s->out1), s1);
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02002465#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002466
2467 PUSH(frag(s, list1(&s1->out)));
2468 break;
2469
Bram Moolenaar5714b802013-05-28 22:03:20 +02002470 case NFA_BACKREF1:
2471 case NFA_BACKREF2:
2472 case NFA_BACKREF3:
2473 case NFA_BACKREF4:
2474 case NFA_BACKREF5:
2475 case NFA_BACKREF6:
2476 case NFA_BACKREF7:
2477 case NFA_BACKREF8:
2478 case NFA_BACKREF9:
2479 if (nfa_calc_size == TRUE)
2480 {
2481 nstate += 2;
2482 break;
2483 }
2484 s = new_state(*p, NULL, NULL);
2485 if (s == NULL)
2486 goto theend;
2487 s1 = new_state(NFA_SKIP, NULL, NULL);
2488 if (s1 == NULL)
2489 goto theend;
2490 patch(list1(&s->out), s1);
2491 PUSH(frag(s, list1(&s1->out)));
2492 break;
2493
Bram Moolenaar423532e2013-05-29 21:14:42 +02002494 case NFA_LNUM:
2495 case NFA_LNUM_GT:
2496 case NFA_LNUM_LT:
2497 case NFA_VCOL:
2498 case NFA_VCOL_GT:
2499 case NFA_VCOL_LT:
2500 case NFA_COL:
2501 case NFA_COL_GT:
2502 case NFA_COL_LT:
2503 if (nfa_calc_size == TRUE)
2504 {
2505 nstate += 1;
2506 break;
2507 }
2508 e1 = POP();
2509 s = new_state(*p, NULL, NULL);
2510 if (s == NULL)
2511 goto theend;
2512 s->val = e1.start->c;
2513 PUSH(frag(s, list1(&s->out)));
2514 break;
2515
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002516 case NFA_ZSTART:
2517 case NFA_ZEND:
2518 default:
2519 /* Operands */
2520 if (nfa_calc_size == TRUE)
2521 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002522 nstate++;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002523 break;
2524 }
2525 s = new_state(*p, NULL, NULL);
2526 if (s == NULL)
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002527 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002528 PUSH(frag(s, list1(&s->out)));
2529 break;
2530
2531 } /* switch(*p) */
2532
2533 } /* for(p = postfix; *p; ++p) */
2534
2535 if (nfa_calc_size == TRUE)
2536 {
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02002537 nstate++;
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002538 goto theend; /* Return value when counting size is ignored anyway */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002539 }
2540
2541 e = POP();
2542 if (stackp != stack)
2543 EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA), too many states left on stack"));
2544
2545 if (istate >= nstate)
2546 EMSG_RET_NULL(_("E876: (NFA regexp) Not enough space to store the whole NFA "));
2547
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002548 matchstate = &state_ptr[istate++]; /* the match state */
2549 matchstate->c = NFA_MATCH;
2550 matchstate->out = matchstate->out1 = NULL;
2551
2552 patch(e.out, matchstate);
Bram Moolenaarb09d9832013-05-21 16:28:11 +02002553 ret = e.start;
2554
2555theend:
2556 vim_free(stack);
2557 return ret;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002558
2559#undef POP1
2560#undef PUSH1
2561#undef POP2
2562#undef PUSH2
2563#undef POP
2564#undef PUSH
2565}
2566
2567/****************************************************************
2568 * NFA execution code.
2569 ****************************************************************/
2570
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002571typedef struct
2572{
2573 int in_use; /* number of subexpr with useful info */
2574
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002575 /* When REG_MULTI is TRUE list.multi is used, otherwise list.line. */
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002576 union
2577 {
2578 struct multipos
2579 {
2580 lpos_T start;
2581 lpos_T end;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002582 } multi[NSUBEXP];
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002583 struct linepos
2584 {
2585 char_u *start;
2586 char_u *end;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002587 } line[NSUBEXP];
2588 } list;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002589} regsub_T;
2590
Bram Moolenaar963fee22013-05-26 21:47:28 +02002591/* nfa_thread_T contains execution information of a NFA state */
Bram Moolenaar4b417062013-05-25 20:19:50 +02002592typedef struct
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002593{
2594 nfa_state_T *state;
Bram Moolenaar5714b802013-05-28 22:03:20 +02002595 int count;
Bram Moolenaar963fee22013-05-26 21:47:28 +02002596 regsub_T sub; /* submatch info, only party used */
Bram Moolenaar4b417062013-05-25 20:19:50 +02002597} nfa_thread_T;
2598
Bram Moolenaar963fee22013-05-26 21:47:28 +02002599/* nfa_list_T contains the alternative NFA execution states. */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002600typedef struct
2601{
Bram Moolenaar5714b802013-05-28 22:03:20 +02002602 nfa_thread_T *t; /* allocated array of states */
Bram Moolenaar428e9872013-05-30 17:05:39 +02002603 int n; /* nr of states currently in "t" */
2604 int len; /* max nr of states in "t" */
Bram Moolenaar5714b802013-05-28 22:03:20 +02002605 int id; /* ID of the list */
Bram Moolenaar4b417062013-05-25 20:19:50 +02002606} nfa_list_T;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002607
Bram Moolenaar5714b802013-05-28 22:03:20 +02002608#ifdef ENABLE_LOG
2609 static void
2610log_subexpr(sub)
2611 regsub_T *sub;
2612{
2613 int j;
2614
2615 for (j = 0; j < sub->in_use; j++)
2616 if (REG_MULTI)
2617 fprintf(log_fd, "\n *** group %d, start: c=%d, l=%d, end: c=%d, l=%d",
2618 j,
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002619 sub->list.multi[j].start.col,
2620 (int)sub->list.multi[j].start.lnum,
2621 sub->list.multi[j].end.col,
2622 (int)sub->list.multi[j].end.lnum);
Bram Moolenaar5714b802013-05-28 22:03:20 +02002623 else
2624 fprintf(log_fd, "\n *** group %d, start: \"%s\", end: \"%s\"",
2625 j,
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002626 (char *)sub->list.line[j].start,
2627 (char *)sub->list.line[j].end);
Bram Moolenaar5714b802013-05-28 22:03:20 +02002628 fprintf(log_fd, "\n");
2629}
2630#endif
2631
Bram Moolenaar963fee22013-05-26 21:47:28 +02002632/* Used during execution: whether a match has been found. */
2633static int nfa_match;
Bram Moolenaar4b417062013-05-25 20:19:50 +02002634
Bram Moolenaar428e9872013-05-30 17:05:39 +02002635static int sub_equal __ARGS((regsub_T *sub1, regsub_T *sub2));
Bram Moolenaar5714b802013-05-28 22:03:20 +02002636static void addstate __ARGS((nfa_list_T *l, nfa_state_T *state, regsub_T *sub, int off));
2637static void addstate_here __ARGS((nfa_list_T *l, nfa_state_T *state, regsub_T *sub, int *ip));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002638
Bram Moolenaar428e9872013-05-30 17:05:39 +02002639/*
2640 * Return TRUE if "sub1" and "sub2" have the same positions.
2641 */
2642 static int
2643sub_equal(sub1, sub2)
2644 regsub_T *sub1;
2645 regsub_T *sub2;
2646{
2647 int i;
2648 int todo;
2649 linenr_T s1, e1;
2650 linenr_T s2, e2;
2651 char_u *sp1, *ep1;
2652 char_u *sp2, *ep2;
2653
2654 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
2655 if (REG_MULTI)
2656 {
2657 for (i = 0; i < todo; ++i)
2658 {
2659 if (i < sub1->in_use)
2660 {
2661 s1 = sub1->list.multi[i].start.lnum;
2662 e1 = sub1->list.multi[i].end.lnum;
2663 }
2664 else
2665 {
2666 s1 = 0;
2667 e1 = 0;
2668 }
2669 if (i < sub2->in_use)
2670 {
2671 s2 = sub2->list.multi[i].start.lnum;
2672 e2 = sub2->list.multi[i].end.lnum;
2673 }
2674 else
2675 {
2676 s2 = 0;
2677 e2 = 0;
2678 }
2679 if (s1 != s2 || e1 != e2)
2680 return FALSE;
2681 if (s1 != 0 && sub1->list.multi[i].start.col
2682 != sub2->list.multi[i].start.col)
2683 return FALSE;
2684 if (e1 != 0 && sub1->list.multi[i].end.col
2685 != sub2->list.multi[i].end.col)
2686 return FALSE;
2687 }
2688 }
2689 else
2690 {
2691 for (i = 0; i < todo; ++i)
2692 {
2693 if (i < sub1->in_use)
2694 {
2695 sp1 = sub1->list.line[i].start;
2696 ep1 = sub1->list.line[i].end;
2697 }
2698 else
2699 {
2700 sp1 = NULL;
2701 ep1 = NULL;
2702 }
2703 if (i < sub2->in_use)
2704 {
2705 sp2 = sub2->list.line[i].start;
2706 ep2 = sub2->list.line[i].end;
2707 }
2708 else
2709 {
2710 sp2 = NULL;
2711 ep2 = NULL;
2712 }
2713 if (sp1 != sp2 || ep1 != ep2)
2714 return FALSE;
2715 }
2716 }
2717
2718 return TRUE;
2719}
2720
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002721 static void
Bram Moolenaar5714b802013-05-28 22:03:20 +02002722addstate(l, state, sub, off)
Bram Moolenaar4b417062013-05-25 20:19:50 +02002723 nfa_list_T *l; /* runtime state list */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002724 nfa_state_T *state; /* state to update */
Bram Moolenaar5714b802013-05-28 22:03:20 +02002725 regsub_T *sub; /* pointers to subexpressions */
Bram Moolenaar35b23862013-05-22 23:00:40 +02002726 int off; /* byte offset, when -1 go to next line */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002727{
Bram Moolenaar963fee22013-05-26 21:47:28 +02002728 int subidx;
Bram Moolenaar428e9872013-05-30 17:05:39 +02002729 nfa_thread_T *thread;
Bram Moolenaar963fee22013-05-26 21:47:28 +02002730 lpos_T save_lpos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002731 int save_in_use;
Bram Moolenaar963fee22013-05-26 21:47:28 +02002732 char_u *save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002733 int i;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002734#ifdef ENABLE_LOG
2735 int did_print = FALSE;
2736#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002737
2738 if (l == NULL || state == NULL)
2739 return;
2740
2741 switch (state->c)
2742 {
2743 case NFA_SPLIT:
2744 case NFA_NOT:
2745 case NFA_NOPEN:
2746 case NFA_NCLOSE:
2747 case NFA_MCLOSE:
2748 case NFA_MCLOSE + 1:
2749 case NFA_MCLOSE + 2:
2750 case NFA_MCLOSE + 3:
2751 case NFA_MCLOSE + 4:
2752 case NFA_MCLOSE + 5:
2753 case NFA_MCLOSE + 6:
2754 case NFA_MCLOSE + 7:
2755 case NFA_MCLOSE + 8:
2756 case NFA_MCLOSE + 9:
Bram Moolenaar5714b802013-05-28 22:03:20 +02002757 /* These nodes are not added themselves but their "out" and/or
2758 * "out1" may be added below. */
2759 break;
2760
2761 case NFA_MOPEN:
2762 case NFA_MOPEN + 1:
2763 case NFA_MOPEN + 2:
2764 case NFA_MOPEN + 3:
2765 case NFA_MOPEN + 4:
2766 case NFA_MOPEN + 5:
2767 case NFA_MOPEN + 6:
2768 case NFA_MOPEN + 7:
2769 case NFA_MOPEN + 8:
2770 case NFA_MOPEN + 9:
2771 /* These nodes do not need to be added, but we need to bail out
2772 * when it was tried to be added to this list before. */
2773 if (state->lastlist == l->id)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002774 goto skip_add;
Bram Moolenaar5714b802013-05-28 22:03:20 +02002775 state->lastlist = l->id;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002776 break;
2777
2778 default:
Bram Moolenaar5714b802013-05-28 22:03:20 +02002779 if (state->lastlist == l->id)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002780 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02002781 /* This state is already in the list, don't add it again,
2782 * unless it is an MOPEN that is used for a backreference. */
Bram Moolenaar428e9872013-05-30 17:05:39 +02002783 if (!nfa_has_backref)
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002784 {
2785skip_add:
2786#ifdef ENABLE_LOG
2787 nfa_set_code(state->c);
2788 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s\n",
2789 abs(state->id), l->id, state->c, code);
2790#endif
Bram Moolenaar428e9872013-05-30 17:05:39 +02002791 return;
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002792 }
Bram Moolenaar428e9872013-05-30 17:05:39 +02002793
2794 /* See if the same state is already in the list with the same
2795 * positions. */
2796 for (i = 0; i < l->n; ++i)
2797 {
2798 thread = &l->t[i];
2799 if (thread->state->id == state->id
2800 && sub_equal(&thread->sub, sub))
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002801 goto skip_add;
Bram Moolenaar428e9872013-05-30 17:05:39 +02002802 }
2803 }
2804
2805 /* when there are backreferences the number of states may be (a
2806 * lot) bigger */
2807 if (nfa_has_backref && l->n == l->len)
2808 {
2809 int newlen = l->len * 3 / 2 + 50;
2810
2811 l->t = vim_realloc(l->t, newlen * sizeof(nfa_thread_T));
2812 l->len = newlen;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002813 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02002814
2815 /* add the state to the list */
2816 state->lastlist = l->id;
Bram Moolenaar428e9872013-05-30 17:05:39 +02002817 thread = &l->t[l->n++];
2818 thread->state = state;
2819 thread->sub.in_use = sub->in_use;
Bram Moolenaar5714b802013-05-28 22:03:20 +02002820 if (sub->in_use > 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002821 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02002822 /* Copy the match start and end positions. */
2823 if (REG_MULTI)
Bram Moolenaar428e9872013-05-30 17:05:39 +02002824 mch_memmove(&thread->sub.list.multi[0],
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002825 &sub->list.multi[0],
Bram Moolenaar5714b802013-05-28 22:03:20 +02002826 sizeof(struct multipos) * sub->in_use);
2827 else
Bram Moolenaar428e9872013-05-30 17:05:39 +02002828 mch_memmove(&thread->sub.list.line[0],
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002829 &sub->list.line[0],
Bram Moolenaar5714b802013-05-28 22:03:20 +02002830 sizeof(struct linepos) * sub->in_use);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002831 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002832#ifdef ENABLE_LOG
2833 {
2834 int col;
2835
2836 if (thread->sub.in_use <= 0)
2837 col = -1;
2838 else if (REG_MULTI)
2839 col = thread->sub.list.multi[0].start.col;
2840 else
2841 col = (int)(thread->sub.list.line[0].start - regline);
2842 nfa_set_code(state->c);
2843 fprintf(log_fd, "> Adding state %d to list %d. char %d: %s (start col %d)\n",
2844 abs(state->id), l->id, state->c, code, col);
2845 did_print = TRUE;
2846 }
2847#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002848 }
2849
2850#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02002851 if (!did_print)
2852 {
2853 int col;
2854
2855 if (sub->in_use <= 0)
2856 col = -1;
2857 else if (REG_MULTI)
2858 col = sub->list.multi[0].start.col;
2859 else
2860 col = (int)(sub->list.line[0].start - regline);
2861 nfa_set_code(state->c);
2862 fprintf(log_fd, "> Processing state %d for list %d. char %d: %s (start col %d)\n",
2863 abs(state->id), l->id, state->c, code, col);
2864 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002865#endif
2866 switch (state->c)
2867 {
2868 case NFA_MATCH:
Bram Moolenaar963fee22013-05-26 21:47:28 +02002869 nfa_match = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002870 break;
2871
2872 case NFA_SPLIT:
Bram Moolenaar5714b802013-05-28 22:03:20 +02002873 addstate(l, state->out, sub, off);
2874 addstate(l, state->out1, sub, off);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002875 break;
2876
2877#if 0
2878 case NFA_END_NEG_RANGE:
2879 /* Nothing to handle here. nfa_regmatch() will take care of it */
2880 break;
2881
2882 case NFA_NOT:
2883 EMSG(_("E999: (NFA regexp internal error) Should not process NOT node !"));
2884#ifdef ENABLE_LOG
2885 fprintf(f, "\n\n>>> E999: Added state NFA_NOT to a list ... Something went wrong ! Why wasn't it processed already? \n\n");
2886#endif
2887 break;
2888
2889 case NFA_COMPOSING:
2890 /* nfa_regmatch() will match all the bytes of this composing char. */
2891 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002892#endif
2893
Bram Moolenaar5714b802013-05-28 22:03:20 +02002894 case NFA_SKIP_CHAR:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002895 case NFA_NOPEN:
2896 case NFA_NCLOSE:
Bram Moolenaar5714b802013-05-28 22:03:20 +02002897 addstate(l, state->out, sub, off);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002898 break;
2899
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002900 case NFA_MOPEN + 0:
2901 case NFA_MOPEN + 1:
2902 case NFA_MOPEN + 2:
2903 case NFA_MOPEN + 3:
2904 case NFA_MOPEN + 4:
2905 case NFA_MOPEN + 5:
2906 case NFA_MOPEN + 6:
2907 case NFA_MOPEN + 7:
2908 case NFA_MOPEN + 8:
2909 case NFA_MOPEN + 9:
2910 case NFA_ZSTART:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002911 if (state->c == NFA_ZSTART)
2912 subidx = 0;
Bram Moolenaar963fee22013-05-26 21:47:28 +02002913 else
2914 subidx = state->c - NFA_MOPEN;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002915
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002916 /* Set the position (with "off") in the subexpression. Save and
2917 * restore it when it was in use. Otherwise fill any gap. */
Bram Moolenaar4b6ebe62013-05-30 17:49:24 +02002918 save_ptr = NULL;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002919 if (REG_MULTI)
2920 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02002921 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002922 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002923 save_lpos = sub->list.multi[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002924 save_in_use = -1;
2925 }
2926 else
2927 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02002928 save_in_use = sub->in_use;
2929 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002930 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002931 sub->list.multi[i].start.lnum = -1;
2932 sub->list.multi[i].end.lnum = -1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002933 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02002934 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002935 }
Bram Moolenaar35b23862013-05-22 23:00:40 +02002936 if (off == -1)
2937 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002938 sub->list.multi[subidx].start.lnum = reglnum + 1;
2939 sub->list.multi[subidx].start.col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02002940 }
2941 else
2942 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002943 sub->list.multi[subidx].start.lnum = reglnum;
2944 sub->list.multi[subidx].start.col =
Bram Moolenaar35b23862013-05-22 23:00:40 +02002945 (colnr_T)(reginput - regline + off);
2946 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002947 }
2948 else
2949 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02002950 if (subidx < sub->in_use)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002951 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002952 save_ptr = sub->list.line[subidx].start;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002953 save_in_use = -1;
2954 }
2955 else
2956 {
Bram Moolenaar5714b802013-05-28 22:03:20 +02002957 save_in_use = sub->in_use;
2958 for (i = sub->in_use; i < subidx; ++i)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002959 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002960 sub->list.line[i].start = NULL;
2961 sub->list.line[i].end = NULL;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002962 }
Bram Moolenaar5714b802013-05-28 22:03:20 +02002963 sub->in_use = subidx + 1;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002964 }
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002965 sub->list.line[subidx].start = reginput + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002966 }
2967
Bram Moolenaar5714b802013-05-28 22:03:20 +02002968 addstate(l, state->out, sub, off);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002969
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002970 if (save_in_use == -1)
2971 {
2972 if (REG_MULTI)
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002973 sub->list.multi[subidx].start = save_lpos;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002974 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02002975 sub->list.line[subidx].start = save_ptr;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02002976 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002977 else
Bram Moolenaar5714b802013-05-28 22:03:20 +02002978 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002979 break;
2980
2981 case NFA_MCLOSE + 0:
Bram Moolenaar57a285b2013-05-26 16:57:28 +02002982 if (nfa_has_zend)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002983 {
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02002984 /* Do not overwrite the position set by \ze. If no \ze
2985 * encountered end will be set in nfa_regtry(). */
Bram Moolenaar5714b802013-05-28 22:03:20 +02002986 addstate(l, state->out, sub, off);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002987 break;
2988 }
2989 case NFA_MCLOSE + 1:
2990 case NFA_MCLOSE + 2:
2991 case NFA_MCLOSE + 3:
2992 case NFA_MCLOSE + 4:
2993 case NFA_MCLOSE + 5:
2994 case NFA_MCLOSE + 6:
2995 case NFA_MCLOSE + 7:
2996 case NFA_MCLOSE + 8:
2997 case NFA_MCLOSE + 9:
2998 case NFA_ZEND:
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002999 if (state->c == NFA_ZEND)
3000 subidx = 0;
Bram Moolenaar963fee22013-05-26 21:47:28 +02003001 else
3002 subidx = state->c - NFA_MCLOSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003003
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003004 /* We don't fill in gaps here, there must have been an MOPEN that
3005 * has done that. */
Bram Moolenaar5714b802013-05-28 22:03:20 +02003006 save_in_use = sub->in_use;
3007 if (sub->in_use <= subidx)
3008 sub->in_use = subidx + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003009 if (REG_MULTI)
3010 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003011 save_lpos = sub->list.multi[subidx].end;
Bram Moolenaar35b23862013-05-22 23:00:40 +02003012 if (off == -1)
3013 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003014 sub->list.multi[subidx].end.lnum = reglnum + 1;
3015 sub->list.multi[subidx].end.col = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02003016 }
3017 else
3018 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003019 sub->list.multi[subidx].end.lnum = reglnum;
3020 sub->list.multi[subidx].end.col =
Bram Moolenaar963fee22013-05-26 21:47:28 +02003021 (colnr_T)(reginput - regline + off);
Bram Moolenaar35b23862013-05-22 23:00:40 +02003022 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003023 }
3024 else
3025 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003026 save_ptr = sub->list.line[subidx].end;
3027 sub->list.line[subidx].end = reginput + off;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003028 }
3029
Bram Moolenaar5714b802013-05-28 22:03:20 +02003030 addstate(l, state->out, sub, off);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003031
3032 if (REG_MULTI)
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003033 sub->list.multi[subidx].end = save_lpos;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003034 else
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003035 sub->list.line[subidx].end = save_ptr;
Bram Moolenaar5714b802013-05-28 22:03:20 +02003036 sub->in_use = save_in_use;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003037 break;
3038 }
3039}
3040
3041/*
Bram Moolenaar4b417062013-05-25 20:19:50 +02003042 * Like addstate(), but the new state(s) are put at position "*ip".
3043 * Used for zero-width matches, next state to use is the added one.
3044 * This makes sure the order of states to be tried does not change, which
3045 * matters for alternatives.
3046 */
3047 static void
Bram Moolenaar5714b802013-05-28 22:03:20 +02003048addstate_here(l, state, sub, ip)
Bram Moolenaar4b417062013-05-25 20:19:50 +02003049 nfa_list_T *l; /* runtime state list */
3050 nfa_state_T *state; /* state to update */
Bram Moolenaar5714b802013-05-28 22:03:20 +02003051 regsub_T *sub; /* pointers to subexpressions */
Bram Moolenaar4b417062013-05-25 20:19:50 +02003052 int *ip;
3053{
3054 int tlen = l->n;
3055 int count;
3056 int i = *ip;
3057
3058 /* first add the state(s) at the end, so that we know how many there are */
Bram Moolenaar5714b802013-05-28 22:03:20 +02003059 addstate(l, state, sub, 0);
Bram Moolenaar4b417062013-05-25 20:19:50 +02003060
3061 /* when "*ip" was at the end of the list, nothing to do */
3062 if (i + 1 == tlen)
3063 return;
3064
3065 /* re-order to put the new state at the current position */
3066 count = l->n - tlen;
Bram Moolenaar428e9872013-05-30 17:05:39 +02003067 if (count == 1)
3068 {
3069 /* overwrite the current state */
3070 l->t[i] = l->t[l->n - 1];
3071 }
3072 else if (count > 1)
Bram Moolenaar4b417062013-05-25 20:19:50 +02003073 {
3074 /* make space for new states, then move them from the
3075 * end to the current position */
3076 mch_memmove(&(l->t[i + count]),
3077 &(l->t[i + 1]),
3078 sizeof(nfa_thread_T) * (l->n - i - 1));
3079 mch_memmove(&(l->t[i]),
3080 &(l->t[l->n - 1]),
3081 sizeof(nfa_thread_T) * count);
3082 }
Bram Moolenaar4b417062013-05-25 20:19:50 +02003083 --l->n;
3084 *ip = i - 1;
3085}
3086
3087/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003088 * Check character class "class" against current character c.
3089 */
3090 static int
3091check_char_class(class, c)
3092 int class;
3093 int c;
3094{
3095 switch (class)
3096 {
3097 case NFA_CLASS_ALNUM:
Bram Moolenaar745fc022013-05-20 22:20:02 +02003098 if (c >= 1 && c <= 255 && isalnum(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003099 return OK;
3100 break;
3101 case NFA_CLASS_ALPHA:
Bram Moolenaar745fc022013-05-20 22:20:02 +02003102 if (c >= 1 && c <= 255 && isalpha(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003103 return OK;
3104 break;
3105 case NFA_CLASS_BLANK:
3106 if (c == ' ' || c == '\t')
3107 return OK;
3108 break;
3109 case NFA_CLASS_CNTRL:
Bram Moolenaar745fc022013-05-20 22:20:02 +02003110 if (c >= 1 && c <= 255 && iscntrl(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003111 return OK;
3112 break;
3113 case NFA_CLASS_DIGIT:
3114 if (VIM_ISDIGIT(c))
3115 return OK;
3116 break;
3117 case NFA_CLASS_GRAPH:
Bram Moolenaar745fc022013-05-20 22:20:02 +02003118 if (c >= 1 && c <= 255 && isgraph(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003119 return OK;
3120 break;
3121 case NFA_CLASS_LOWER:
3122 if (MB_ISLOWER(c))
3123 return OK;
3124 break;
3125 case NFA_CLASS_PRINT:
3126 if (vim_isprintc(c))
3127 return OK;
3128 break;
3129 case NFA_CLASS_PUNCT:
Bram Moolenaar745fc022013-05-20 22:20:02 +02003130 if (c >= 1 && c <= 255 && ispunct(c))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003131 return OK;
3132 break;
3133 case NFA_CLASS_SPACE:
3134 if ((c >=9 && c <= 13) || (c == ' '))
3135 return OK;
3136 break;
3137 case NFA_CLASS_UPPER:
3138 if (MB_ISUPPER(c))
3139 return OK;
3140 break;
3141 case NFA_CLASS_XDIGIT:
3142 if (vim_isxdigit(c))
3143 return OK;
3144 break;
3145 case NFA_CLASS_TAB:
3146 if (c == '\t')
3147 return OK;
3148 break;
3149 case NFA_CLASS_RETURN:
3150 if (c == '\r')
3151 return OK;
3152 break;
3153 case NFA_CLASS_BACKSPACE:
3154 if (c == '\b')
3155 return OK;
3156 break;
3157 case NFA_CLASS_ESCAPE:
3158 if (c == '\033')
3159 return OK;
3160 break;
3161
3162 default:
3163 /* should not be here :P */
3164 EMSG_RET_FAIL(_("E877: (NFA regexp) Invalid character class "));
3165 }
3166 return FAIL;
3167}
3168
Bram Moolenaar5714b802013-05-28 22:03:20 +02003169static int match_backref __ARGS((regsub_T *sub, int subidx, int *bytelen));
3170
3171/*
3172 * Check for a match with subexpression "subidx".
3173 * return TRUE if it matches.
3174 */
3175 static int
3176match_backref(sub, subidx, bytelen)
3177 regsub_T *sub; /* pointers to subexpressions */
3178 int subidx;
3179 int *bytelen; /* out: length of match in bytes */
3180{
3181 int len;
3182
3183 if (sub->in_use <= subidx)
3184 {
3185retempty:
3186 /* backref was not set, match an empty string */
3187 *bytelen = 0;
3188 return TRUE;
3189 }
3190
3191 if (REG_MULTI)
3192 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003193 if (sub->list.multi[subidx].start.lnum < 0
3194 || sub->list.multi[subidx].end.lnum < 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003195 goto retempty;
3196 /* TODO: line breaks */
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003197 len = sub->list.multi[subidx].end.col
3198 - sub->list.multi[subidx].start.col;
3199 if (cstrncmp(regline + sub->list.multi[subidx].start.col,
Bram Moolenaar5714b802013-05-28 22:03:20 +02003200 reginput, &len) == 0)
3201 {
3202 *bytelen = len;
3203 return TRUE;
3204 }
3205 }
3206 else
3207 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003208 if (sub->list.line[subidx].start == NULL
3209 || sub->list.line[subidx].end == NULL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003210 goto retempty;
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003211 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
3212 if (cstrncmp(sub->list.line[subidx].start, reginput, &len) == 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003213 {
3214 *bytelen = len;
3215 return TRUE;
3216 }
3217 }
3218 return FALSE;
3219}
3220
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003221/*
3222 * Set all NFA nodes' list ID equal to -1.
3223 */
3224 static void
3225nfa_set_neg_listids(start)
3226 nfa_state_T *start;
3227{
Bram Moolenaar5714b802013-05-28 22:03:20 +02003228 if (start != NULL && start->lastlist >= 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003229 {
3230 start->lastlist = -1;
3231 nfa_set_neg_listids(start->out);
3232 nfa_set_neg_listids(start->out1);
3233 }
3234}
3235
3236/*
3237 * Set all NFA nodes' list ID equal to 0.
3238 */
3239 static void
3240nfa_set_null_listids(start)
3241 nfa_state_T *start;
3242{
Bram Moolenaar5714b802013-05-28 22:03:20 +02003243 if (start != NULL && start->lastlist == -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003244 {
3245 start->lastlist = 0;
3246 nfa_set_null_listids(start->out);
3247 nfa_set_null_listids(start->out1);
3248 }
3249}
3250
3251/*
3252 * Save list IDs for all NFA states in "list".
3253 */
3254 static void
3255nfa_save_listids(start, list)
3256 nfa_state_T *start;
3257 int *list;
3258{
Bram Moolenaar5714b802013-05-28 22:03:20 +02003259 if (start != NULL && start->lastlist != -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003260 {
3261 list[abs(start->id)] = start->lastlist;
3262 start->lastlist = -1;
3263 nfa_save_listids(start->out, list);
3264 nfa_save_listids(start->out1, list);
3265 }
3266}
3267
3268/*
3269 * Restore list IDs from "list" to all NFA states.
3270 */
3271 static void
3272nfa_restore_listids(start, list)
3273 nfa_state_T *start;
3274 int *list;
3275{
Bram Moolenaar5714b802013-05-28 22:03:20 +02003276 if (start != NULL && start->lastlist == -1)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003277 {
3278 start->lastlist = list[abs(start->id)];
3279 nfa_restore_listids(start->out, list);
3280 nfa_restore_listids(start->out1, list);
3281 }
3282}
3283
Bram Moolenaar423532e2013-05-29 21:14:42 +02003284 static int
3285nfa_re_num_cmp(val, op, pos)
3286 long_u val;
3287 int op;
3288 long_u pos;
3289{
3290 if (op == 1) return pos > val;
3291 if (op == 2) return pos < val;
3292 return val == pos;
3293}
3294
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003295static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m));
3296
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003297/*
3298 * Main matching routine.
3299 *
3300 * Run NFA to determine whether it matches reginput.
3301 *
3302 * Return TRUE if there is a match, FALSE otherwise.
3303 * Note: Caller must ensure that: start != NULL.
3304 */
3305 static int
3306nfa_regmatch(start, submatch, m)
3307 nfa_state_T *start;
3308 regsub_T *submatch;
3309 regsub_T *m;
3310{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003311 int result;
3312 int size = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003313 int flag = 0;
Bram Moolenaar4b417062013-05-25 20:19:50 +02003314 int go_to_nextline = FALSE;
3315 nfa_thread_T *t;
Bram Moolenaar4b417062013-05-25 20:19:50 +02003316 nfa_list_T list[3];
3317 nfa_list_T *listtbl[2][2];
3318 nfa_list_T *ll;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003319 int listid = 1;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003320 int listidx;
Bram Moolenaar4b417062013-05-25 20:19:50 +02003321 nfa_list_T *thislist;
3322 nfa_list_T *nextlist;
3323 nfa_list_T *neglist;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003324 int *listids = NULL;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02003325#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003326 FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003327
3328 if (debug == NULL)
3329 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003330 EMSG2(_("(NFA) COULD NOT OPEN %s !"), NFA_REGEXP_DEBUG_LOG);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003331 return FALSE;
3332 }
3333#endif
Bram Moolenaar963fee22013-05-26 21:47:28 +02003334 nfa_match = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003335
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003336 /* Allocate memory for the lists of nodes. */
Bram Moolenaar4b417062013-05-25 20:19:50 +02003337 size = (nstate + 1) * sizeof(nfa_thread_T);
Bram Moolenaar428e9872013-05-30 17:05:39 +02003338 list[0].t = (nfa_thread_T *)lalloc_clear(size, TRUE);
3339 list[0].len = nstate + 1;
3340 list[1].t = (nfa_thread_T *)lalloc_clear(size, TRUE);
3341 list[1].len = nstate + 1;
3342 list[2].t = (nfa_thread_T *)lalloc_clear(size, TRUE);
3343 list[2].len = nstate + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003344 if (list[0].t == NULL || list[1].t == NULL || list[2].t == NULL)
3345 goto theend;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003346
3347#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003348 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003349 if (log_fd != NULL)
3350 {
3351 fprintf(log_fd, "**********************************\n");
3352 nfa_set_code(start->c);
3353 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
3354 abs(start->id), code);
3355 fprintf(log_fd, "**********************************\n");
3356 }
3357 else
3358 {
3359 EMSG(_("Could not open temporary log file for writing, displaying on stderr ... "));
3360 log_fd = stderr;
3361 }
3362#endif
3363
3364 thislist = &list[0];
3365 thislist->n = 0;
3366 nextlist = &list[1];
3367 nextlist->n = 0;
3368 neglist = &list[2];
3369 neglist->n = 0;
3370#ifdef ENABLE_LOG
3371 fprintf(log_fd, "(---) STARTSTATE\n");
3372#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02003373 thislist->id = listid;
3374 addstate(thislist, start, m, 0);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003375
3376 /* There are two cases when the NFA advances: 1. input char matches the
3377 * NFA node and 2. input char does not match the NFA node, but the next
3378 * node is NFA_NOT. The following macro calls addstate() according to
3379 * these rules. It is used A LOT, so use the "listtbl" table for speed */
3380 listtbl[0][0] = NULL;
3381 listtbl[0][1] = neglist;
3382 listtbl[1][0] = nextlist;
3383 listtbl[1][1] = NULL;
3384#define ADD_POS_NEG_STATE(node) \
3385 ll = listtbl[result ? 1 : 0][node->negated]; \
3386 if (ll != NULL) \
Bram Moolenaar5714b802013-05-28 22:03:20 +02003387 addstate(ll, node->out , &t->sub, clen);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003388
3389
3390 /*
3391 * Run for each character.
3392 */
Bram Moolenaar35b23862013-05-22 23:00:40 +02003393 for (;;)
3394 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003395 int curc;
3396 int clen;
3397
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003398#ifdef FEAT_MBYTE
3399 if (has_mbyte)
3400 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003401 curc = (*mb_ptr2char)(reginput);
3402 clen = (*mb_ptr2len)(reginput);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003403 }
3404 else
3405#endif
3406 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003407 curc = *reginput;
3408 clen = 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003409 }
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003410 if (curc == NUL)
Bram Moolenaar35b23862013-05-22 23:00:40 +02003411 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003412 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02003413 go_to_nextline = FALSE;
3414 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003415
3416 /* swap lists */
3417 thislist = &list[flag];
3418 nextlist = &list[flag ^= 1];
Bram Moolenaar5714b802013-05-28 22:03:20 +02003419 nextlist->n = 0; /* clear nextlist */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003420 listtbl[1][0] = nextlist;
3421 ++listid;
Bram Moolenaar5714b802013-05-28 22:03:20 +02003422 thislist->id = listid;
3423 nextlist->id = listid + 1;
3424 neglist->id = listid + 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003425
3426#ifdef ENABLE_LOG
3427 fprintf(log_fd, "------------------------------------------\n");
3428 fprintf(log_fd, ">>> Reginput is \"%s\"\n", reginput);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003429 fprintf(log_fd, ">>> Advanced one character ... Current char is %c (code %d) \n", curc, (int)curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003430 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003431 {
3432 int i;
3433
3434 for (i = 0; i < thislist->n; i++)
3435 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
3436 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003437 fprintf(log_fd, "\n");
3438#endif
3439
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02003440#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003441 fprintf(debug, "\n-------------------\n");
3442#endif
Bram Moolenaar66e83d72013-05-21 14:03:00 +02003443 /*
3444 * If the state lists are empty we can stop.
3445 */
3446 if (thislist->n == 0 && neglist->n == 0)
3447 break;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003448
3449 /* compute nextlist */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003450 for (listidx = 0; listidx < thislist->n || neglist->n > 0; ++listidx)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003451 {
3452 if (neglist->n > 0)
3453 {
3454 t = &neglist->t[0];
Bram Moolenaar0fabe3f2013-05-21 12:34:17 +02003455 neglist->n--;
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003456 listidx--;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003457 }
3458 else
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003459 t = &thislist->t[listidx];
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003460
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02003461#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003462 nfa_set_code(t->state->c);
3463 fprintf(debug, "%s, ", code);
3464#endif
3465#ifdef ENABLE_LOG
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02003466 {
3467 int col;
3468
3469 if (t->sub.in_use <= 0)
3470 col = -1;
3471 else if (REG_MULTI)
3472 col = t->sub.list.multi[0].start.col;
3473 else
3474 col = (int)(t->sub.list.line[0].start - regline);
3475 nfa_set_code(t->state->c);
3476 fprintf(log_fd, "(%d) char %d %s (start col %d) ... \n",
3477 abs(t->state->id), (int)t->state->c, code, col);
3478 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003479#endif
3480
3481 /*
3482 * Handle the possible codes of the current state.
3483 * The most important is NFA_MATCH.
3484 */
3485 switch (t->state->c)
3486 {
3487 case NFA_MATCH:
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02003488 {
3489 int j;
3490
Bram Moolenaar963fee22013-05-26 21:47:28 +02003491 nfa_match = TRUE;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003492 submatch->in_use = t->sub.in_use;
3493 if (REG_MULTI)
3494 for (j = 0; j < submatch->in_use; j++)
3495 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003496 submatch->list.multi[j].start =
3497 t->sub.list.multi[j].start;
3498 submatch->list.multi[j].end = t->sub.list.multi[j].end;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003499 }
3500 else
3501 for (j = 0; j < submatch->in_use; j++)
3502 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003503 submatch->list.line[j].start =
3504 t->sub.list.line[j].start;
3505 submatch->list.line[j].end = t->sub.list.line[j].end;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003506 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003507#ifdef ENABLE_LOG
Bram Moolenaar5714b802013-05-28 22:03:20 +02003508 log_subexpr(&t->sub);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003509#endif
Bram Moolenaar35b23862013-05-22 23:00:40 +02003510 /* Found the left-most longest match, do not look at any other
Bram Moolenaar57a285b2013-05-26 16:57:28 +02003511 * states at this position. When the list of states is going
3512 * to be empty quit without advancing, so that "reginput" is
3513 * correct. */
3514 if (nextlist->n == 0 && neglist->n == 0)
3515 clen = 0;
Bram Moolenaar35b23862013-05-22 23:00:40 +02003516 goto nextchar;
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02003517 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003518
3519 case NFA_END_INVISIBLE:
3520 /* This is only encountered after a NFA_START_INVISIBLE node.
3521 * They surround a zero-width group, used with "\@=" and "\&".
3522 * If we got here, it means that the current "invisible" group
3523 * finished successfully, so return control to the parent
3524 * nfa_regmatch(). Submatches are stored in *m, and used in
3525 * the parent call. */
3526 if (start->c == NFA_MOPEN + 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003527 addstate_here(thislist, t->state->out, &t->sub, &listidx);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003528 else
3529 {
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003530 /* do not set submatches for \@! */
3531 if (!t->state->negated)
3532 /* TODO: only copy positions in use. */
3533 *m = t->sub;
Bram Moolenaar963fee22013-05-26 21:47:28 +02003534 nfa_match = TRUE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003535 }
3536 break;
3537
3538 case NFA_START_INVISIBLE:
Bram Moolenaar14f55c62013-05-31 21:45:09 +02003539 {
3540 char_u *save_reginput = reginput;
3541 char_u *save_regline = regline;
3542 int save_reglnum = reglnum;
3543 int save_nfa_match = nfa_match;
3544
3545 /* Call nfa_regmatch() to check if the current concat matches
3546 * at this position. The concat ends with the node
3547 * NFA_END_INVISIBLE */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003548 if (listids == NULL)
3549 {
Bram Moolenaar14f55c62013-05-31 21:45:09 +02003550 listids = (int *)lalloc(sizeof(int) * nstate, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003551 if (listids == NULL)
3552 {
3553 EMSG(_("E878: (NFA) Could not allocate memory for branch traversal!"));
3554 return 0;
3555 }
3556 }
3557#ifdef ENABLE_LOG
3558 if (log_fd != stderr)
3559 fclose(log_fd);
3560 log_fd = NULL;
3561#endif
3562 /* Have to clear the listid field of the NFA nodes, so that
3563 * nfa_regmatch() and addstate() can run properly after
3564 * recursion. */
3565 nfa_save_listids(start, listids);
3566 nfa_set_null_listids(start);
3567 result = nfa_regmatch(t->state->out, submatch, m);
3568 nfa_set_neg_listids(start);
3569 nfa_restore_listids(start, listids);
Bram Moolenaar14f55c62013-05-31 21:45:09 +02003570
3571 /* restore position in input text */
3572 reginput = save_reginput;
3573 regline = save_regline;
3574 reglnum = save_reglnum;
3575 nfa_match = save_nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003576
3577#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003578 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003579 if (log_fd != NULL)
3580 {
3581 fprintf(log_fd, "****************************\n");
3582 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
3583 fprintf(log_fd, "MATCH = %s\n", result == TRUE ? "OK" : "FALSE");
3584 fprintf(log_fd, "****************************\n");
3585 }
3586 else
3587 {
3588 EMSG(_("Could not open temporary log file for writing, displaying on stderr ... "));
3589 log_fd = stderr;
3590 }
3591#endif
Bram Moolenaarb06e20e2013-05-30 22:44:02 +02003592 /* for \@! it is a match when result is FALSE */
3593 if (result != t->state->negated)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003594 {
Bram Moolenaareb3ecae2013-05-27 11:22:04 +02003595 int j;
3596
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003597 /* Copy submatch info from the recursive call */
3598 if (REG_MULTI)
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003599 for (j = 1; j < m->in_use; j++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003600 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003601 t->sub.list.multi[j].start = m->list.multi[j].start;
3602 t->sub.list.multi[j].end = m->list.multi[j].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003603 }
3604 else
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003605 for (j = 1; j < m->in_use; j++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003606 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02003607 t->sub.list.line[j].start = m->list.line[j].start;
3608 t->sub.list.line[j].end = m->list.line[j].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003609 }
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02003610 if (m->in_use > t->sub.in_use)
3611 t->sub.in_use = m->in_use;
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02003612
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02003613 /* t->state->out1 is the corresponding END_INVISIBLE node;
3614 * Add it to the current list (zero-width match). */
Bram Moolenaar4b417062013-05-25 20:19:50 +02003615 addstate_here(thislist, t->state->out1->out, &t->sub,
Bram Moolenaar5714b802013-05-28 22:03:20 +02003616 &listidx);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003617 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003618 break;
Bram Moolenaar14f55c62013-05-31 21:45:09 +02003619 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003620
3621 case NFA_BOL:
3622 if (reginput == regline)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003623 addstate_here(thislist, t->state->out, &t->sub, &listidx);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003624 break;
3625
3626 case NFA_EOL:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003627 if (curc == NUL)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003628 addstate_here(thislist, t->state->out, &t->sub, &listidx);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003629 break;
3630
3631 case NFA_BOW:
3632 {
3633 int bow = TRUE;
3634
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003635 if (curc == NUL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003636 bow = FALSE;
3637#ifdef FEAT_MBYTE
3638 else if (has_mbyte)
3639 {
3640 int this_class;
3641
3642 /* Get class of current and previous char (if it exists). */
Bram Moolenaarf878bf02013-05-21 21:20:20 +02003643 this_class = mb_get_class_buf(reginput, reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003644 if (this_class <= 1)
3645 bow = FALSE;
3646 else if (reg_prev_class() == this_class)
3647 bow = FALSE;
3648 }
3649#endif
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003650 else if (!vim_iswordc_buf(curc, reg_buf)
Bram Moolenaarf878bf02013-05-21 21:20:20 +02003651 || (reginput > regline
3652 && vim_iswordc_buf(reginput[-1], reg_buf)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003653 bow = FALSE;
3654 if (bow)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003655 addstate_here(thislist, t->state->out, &t->sub, &listidx);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003656 break;
3657 }
3658
3659 case NFA_EOW:
3660 {
3661 int eow = TRUE;
3662
3663 if (reginput == regline)
3664 eow = FALSE;
3665#ifdef FEAT_MBYTE
3666 else if (has_mbyte)
3667 {
3668 int this_class, prev_class;
3669
3670 /* Get class of current and previous char (if it exists). */
Bram Moolenaarf878bf02013-05-21 21:20:20 +02003671 this_class = mb_get_class_buf(reginput, reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003672 prev_class = reg_prev_class();
3673 if (this_class == prev_class
3674 || prev_class == 0 || prev_class == 1)
3675 eow = FALSE;
3676 }
3677#endif
Bram Moolenaarf878bf02013-05-21 21:20:20 +02003678 else if (!vim_iswordc_buf(reginput[-1], reg_buf)
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003679 || (reginput[0] != NUL
3680 && vim_iswordc_buf(curc, reg_buf)))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003681 eow = FALSE;
3682 if (eow)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003683 addstate_here(thislist, t->state->out, &t->sub, &listidx);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003684 break;
3685 }
3686
Bram Moolenaar4b780632013-05-31 22:14:52 +02003687 case NFA_BOF:
3688 if (reglnum == 0 && reginput == regline
3689 && (!REG_MULTI || reg_firstlnum == 1))
3690 addstate_here(thislist, t->state->out, &t->sub, &listidx);
3691 break;
3692
3693 case NFA_EOF:
3694 if (reglnum == reg_maxline && curc == NUL)
3695 addstate_here(thislist, t->state->out, &t->sub, &listidx);
3696 break;
3697
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003698#ifdef FEAT_MBYTE
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003699 case NFA_COMPOSING:
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003700 {
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003701 int mc = curc;
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02003702 int len = 0;
3703 nfa_state_T *end;
3704 nfa_state_T *sta;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02003705 int cchars[MAX_MCO];
3706 int ccount = 0;
3707 int j;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003708
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003709 sta = t->state->out;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003710 len = 0;
Bram Moolenaar56d58d52013-05-25 14:42:03 +02003711 if (utf_iscomposing(sta->c))
3712 {
3713 /* Only match composing character(s), ignore base
3714 * character. Used for ".{composing}" and "{composing}"
3715 * (no preceding character). */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003716 len += mb_char2len(mc);
Bram Moolenaar56d58d52013-05-25 14:42:03 +02003717 }
Bram Moolenaar3451d662013-05-26 15:14:55 +02003718 if (ireg_icombine && len == 0)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003719 {
Bram Moolenaar56d58d52013-05-25 14:42:03 +02003720 /* If \Z was present, then ignore composing characters.
3721 * When ignoring the base character this always matches. */
Bram Moolenaarfad8de02013-05-24 23:10:50 +02003722 /* TODO: How about negated? */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003723 if (len == 0 && sta->c != curc)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02003724 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02003725 else
3726 result = OK;
Bram Moolenaarfad8de02013-05-24 23:10:50 +02003727 while (sta->c != NFA_END_COMPOSING)
3728 sta = sta->out;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003729 }
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02003730
3731 /* Check base character matches first, unless ignored. */
3732 else if (len > 0 || mc == sta->c)
3733 {
3734 if (len == 0)
Bram Moolenaarfad8de02013-05-24 23:10:50 +02003735 {
Bram Moolenaarfad8de02013-05-24 23:10:50 +02003736 len += mb_char2len(mc);
3737 sta = sta->out;
3738 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003739
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02003740 /* We don't care about the order of composing characters.
3741 * Get them into cchars[] first. */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003742 while (len < clen)
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02003743 {
3744 mc = mb_ptr2char(reginput + len);
3745 cchars[ccount++] = mc;
3746 len += mb_char2len(mc);
3747 if (ccount == MAX_MCO)
3748 break;
3749 }
3750
3751 /* Check that each composing char in the pattern matches a
3752 * composing char in the text. We do not check if all
3753 * composing chars are matched. */
3754 result = OK;
3755 while (sta->c != NFA_END_COMPOSING)
3756 {
3757 for (j = 0; j < ccount; ++j)
3758 if (cchars[j] == sta->c)
3759 break;
3760 if (j == ccount)
3761 {
3762 result = FAIL;
3763 break;
3764 }
3765 sta = sta->out;
3766 }
3767 }
3768 else
Bram Moolenaar1d814752013-05-24 20:25:33 +02003769 result = FAIL;
Bram Moolenaar3f1682e2013-05-26 14:32:05 +02003770
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003771 end = t->state->out1; /* NFA_END_COMPOSING */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003772 ADD_POS_NEG_STATE(end);
3773 break;
Bram Moolenaar3c577f22013-05-24 21:59:54 +02003774 }
3775#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003776
3777 case NFA_NEWL:
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003778 if (curc == NUL && !reg_line_lbr && REG_MULTI
3779 && reglnum <= reg_maxline)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003780 {
Bram Moolenaar35b23862013-05-22 23:00:40 +02003781 go_to_nextline = TRUE;
3782 /* Pass -1 for the offset, which means taking the position
3783 * at the start of the next line. */
Bram Moolenaar5714b802013-05-28 22:03:20 +02003784 addstate(nextlist, t->state->out, &t->sub, -1);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003785 }
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003786 else if (curc == '\n' && reg_line_lbr)
3787 {
3788 /* match \n as if it is an ordinary character */
Bram Moolenaar5714b802013-05-28 22:03:20 +02003789 addstate(nextlist, t->state->out, &t->sub, 1);
Bram Moolenaar61db8b52013-05-26 17:45:49 +02003790 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003791 break;
3792
3793 case NFA_CLASS_ALNUM:
3794 case NFA_CLASS_ALPHA:
3795 case NFA_CLASS_BLANK:
3796 case NFA_CLASS_CNTRL:
3797 case NFA_CLASS_DIGIT:
3798 case NFA_CLASS_GRAPH:
3799 case NFA_CLASS_LOWER:
3800 case NFA_CLASS_PRINT:
3801 case NFA_CLASS_PUNCT:
3802 case NFA_CLASS_SPACE:
3803 case NFA_CLASS_UPPER:
3804 case NFA_CLASS_XDIGIT:
3805 case NFA_CLASS_TAB:
3806 case NFA_CLASS_RETURN:
3807 case NFA_CLASS_BACKSPACE:
3808 case NFA_CLASS_ESCAPE:
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003809 result = check_char_class(t->state->c, curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003810 ADD_POS_NEG_STATE(t->state);
3811 break;
3812
3813 case NFA_END_NEG_RANGE:
3814 /* This follows a series of negated nodes, like:
3815 * CHAR(x), NFA_NOT, CHAR(y), NFA_NOT etc. */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003816 if (curc > 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003817 addstate(nextlist, t->state->out, &t->sub, clen);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003818 break;
3819
3820 case NFA_ANY:
Bram Moolenaar35b23862013-05-22 23:00:40 +02003821 /* Any char except '\0', (end of input) does not match. */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003822 if (curc > 0)
Bram Moolenaar5714b802013-05-28 22:03:20 +02003823 addstate(nextlist, t->state->out, &t->sub, clen);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003824 break;
3825
3826 /*
3827 * Character classes like \a for alpha, \d for digit etc.
3828 */
3829 case NFA_IDENT: /* \i */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003830 result = vim_isIDc(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003831 ADD_POS_NEG_STATE(t->state);
3832 break;
3833
3834 case NFA_SIDENT: /* \I */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003835 result = !VIM_ISDIGIT(curc) && vim_isIDc(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003836 ADD_POS_NEG_STATE(t->state);
3837 break;
3838
3839 case NFA_KWORD: /* \k */
Bram Moolenaarf878bf02013-05-21 21:20:20 +02003840 result = vim_iswordp_buf(reginput, reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003841 ADD_POS_NEG_STATE(t->state);
3842 break;
3843
3844 case NFA_SKWORD: /* \K */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003845 result = !VIM_ISDIGIT(curc)
3846 && vim_iswordp_buf(reginput, reg_buf);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003847 ADD_POS_NEG_STATE(t->state);
3848 break;
3849
3850 case NFA_FNAME: /* \f */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003851 result = vim_isfilec(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003852 ADD_POS_NEG_STATE(t->state);
3853 break;
3854
3855 case NFA_SFNAME: /* \F */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003856 result = !VIM_ISDIGIT(curc) && vim_isfilec(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003857 ADD_POS_NEG_STATE(t->state);
3858 break;
3859
3860 case NFA_PRINT: /* \p */
Bram Moolenaar08050492013-05-21 12:43:56 +02003861 result = ptr2cells(reginput) == 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003862 ADD_POS_NEG_STATE(t->state);
3863 break;
3864
3865 case NFA_SPRINT: /* \P */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003866 result = !VIM_ISDIGIT(curc) && ptr2cells(reginput) == 1;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003867 ADD_POS_NEG_STATE(t->state);
3868 break;
3869
3870 case NFA_WHITE: /* \s */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003871 result = vim_iswhite(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003872 ADD_POS_NEG_STATE(t->state);
3873 break;
3874
3875 case NFA_NWHITE: /* \S */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003876 result = curc != NUL && !vim_iswhite(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003877 ADD_POS_NEG_STATE(t->state);
3878 break;
3879
3880 case NFA_DIGIT: /* \d */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003881 result = ri_digit(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003882 ADD_POS_NEG_STATE(t->state);
3883 break;
3884
3885 case NFA_NDIGIT: /* \D */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003886 result = curc != NUL && !ri_digit(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003887 ADD_POS_NEG_STATE(t->state);
3888 break;
3889
3890 case NFA_HEX: /* \x */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003891 result = ri_hex(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003892 ADD_POS_NEG_STATE(t->state);
3893 break;
3894
3895 case NFA_NHEX: /* \X */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003896 result = curc != NUL && !ri_hex(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003897 ADD_POS_NEG_STATE(t->state);
3898 break;
3899
3900 case NFA_OCTAL: /* \o */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003901 result = ri_octal(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003902 ADD_POS_NEG_STATE(t->state);
3903 break;
3904
3905 case NFA_NOCTAL: /* \O */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003906 result = curc != NUL && !ri_octal(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003907 ADD_POS_NEG_STATE(t->state);
3908 break;
3909
3910 case NFA_WORD: /* \w */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003911 result = ri_word(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003912 ADD_POS_NEG_STATE(t->state);
3913 break;
3914
3915 case NFA_NWORD: /* \W */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003916 result = curc != NUL && !ri_word(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003917 ADD_POS_NEG_STATE(t->state);
3918 break;
3919
3920 case NFA_HEAD: /* \h */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003921 result = ri_head(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003922 ADD_POS_NEG_STATE(t->state);
3923 break;
3924
3925 case NFA_NHEAD: /* \H */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003926 result = curc != NUL && !ri_head(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003927 ADD_POS_NEG_STATE(t->state);
3928 break;
3929
3930 case NFA_ALPHA: /* \a */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003931 result = ri_alpha(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003932 ADD_POS_NEG_STATE(t->state);
3933 break;
3934
3935 case NFA_NALPHA: /* \A */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003936 result = curc != NUL && !ri_alpha(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003937 ADD_POS_NEG_STATE(t->state);
3938 break;
3939
3940 case NFA_LOWER: /* \l */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003941 result = ri_lower(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003942 ADD_POS_NEG_STATE(t->state);
3943 break;
3944
3945 case NFA_NLOWER: /* \L */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003946 result = curc != NUL && !ri_lower(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003947 ADD_POS_NEG_STATE(t->state);
3948 break;
3949
3950 case NFA_UPPER: /* \u */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003951 result = ri_upper(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003952 ADD_POS_NEG_STATE(t->state);
3953 break;
3954
3955 case NFA_NUPPER: /* \U */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02003956 result = curc != NUL && !ri_upper(curc);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003957 ADD_POS_NEG_STATE(t->state);
3958 break;
3959
Bram Moolenaar5714b802013-05-28 22:03:20 +02003960 case NFA_BACKREF1:
3961 case NFA_BACKREF2:
3962 case NFA_BACKREF3:
3963 case NFA_BACKREF4:
3964 case NFA_BACKREF5:
3965 case NFA_BACKREF6:
3966 case NFA_BACKREF7:
3967 case NFA_BACKREF8:
3968 case NFA_BACKREF9:
3969 /* \1 .. \9 */
3970 {
3971 int subidx = t->state->c - NFA_BACKREF1 + 1;
3972 int bytelen;
3973
3974 result = match_backref(&t->sub, subidx, &bytelen);
3975 if (result)
3976 {
3977 if (bytelen == 0)
3978 {
3979 /* empty match always works, add NFA_SKIP with zero to
3980 * be used next */
3981 addstate_here(thislist, t->state->out, &t->sub,
3982 &listidx);
3983 thislist->t[listidx + 1].count = 0;
3984 }
3985 else if (bytelen <= clen)
3986 {
3987 /* match current character, jump ahead to out of
3988 * NFA_SKIP */
3989 addstate(nextlist, t->state->out->out, &t->sub, clen);
3990#ifdef ENABLE_LOG
3991 log_subexpr(&nextlist->t[nextlist->n - 1].sub);
3992#endif
3993 }
3994 else
3995 {
3996 /* skip ofer the matched characters, set character
3997 * count in NFA_SKIP */
3998 addstate(nextlist, t->state->out, &t->sub, bytelen);
3999 nextlist->t[nextlist->n - 1].count = bytelen - clen;
4000#ifdef ENABLE_LOG
4001 log_subexpr(&nextlist->t[nextlist->n - 1].sub);
4002#endif
4003 }
4004
4005 }
Bram Moolenaar12e40142013-05-21 15:33:41 +02004006 break;
Bram Moolenaar5714b802013-05-28 22:03:20 +02004007 }
4008 case NFA_SKIP:
4009 /* charater of previous matching \1 .. \9 */
4010 if (t->count - clen <= 0)
4011 {
4012 /* end of match, go to what follows */
4013 addstate(nextlist, t->state->out, &t->sub, clen);
4014#ifdef ENABLE_LOG
4015 log_subexpr(&nextlist->t[nextlist->n - 1].sub);
4016#endif
4017 }
4018 else
4019 {
4020 /* add state again with decremented count */
4021 addstate(nextlist, t->state, &t->sub, 0);
4022 nextlist->t[nextlist->n - 1].count = t->count - clen;
4023#ifdef ENABLE_LOG
4024 log_subexpr(&nextlist->t[nextlist->n - 1].sub);
4025#endif
4026 }
4027 break;
Bram Moolenaar12e40142013-05-21 15:33:41 +02004028
4029 case NFA_SKIP_CHAR:
4030 case NFA_ZSTART:
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02004031 case NFA_ZEND:
Bram Moolenaar12e40142013-05-21 15:33:41 +02004032 /* TODO: should not happen? */
4033 break;
4034
Bram Moolenaar423532e2013-05-29 21:14:42 +02004035 case NFA_LNUM:
4036 case NFA_LNUM_GT:
4037 case NFA_LNUM_LT:
4038 result = (REG_MULTI &&
4039 nfa_re_num_cmp(t->state->val, t->state->c - NFA_LNUM,
4040 (long_u)(reglnum + reg_firstlnum)));
4041 if (result)
4042 addstate_here(thislist, t->state->out, &t->sub, &listidx);
4043 break;
4044
4045 case NFA_COL:
4046 case NFA_COL_GT:
4047 case NFA_COL_LT:
4048 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_COL,
4049 (long_u)(reginput - regline) + 1);
4050 if (result)
4051 addstate_here(thislist, t->state->out, &t->sub, &listidx);
4052 break;
4053
4054 case NFA_VCOL:
4055 case NFA_VCOL_GT:
4056 case NFA_VCOL_LT:
4057 result = nfa_re_num_cmp(t->state->val, t->state->c - NFA_VCOL,
4058 (long_u)win_linetabsize(
4059 reg_win == NULL ? curwin : reg_win,
4060 regline, (colnr_T)(reginput - regline)) + 1);
4061 if (result)
4062 addstate_here(thislist, t->state->out, &t->sub, &listidx);
4063 break;
4064
4065 case NFA_CURSOR:
4066 result = (reg_win != NULL
4067 && (reglnum + reg_firstlnum == reg_win->w_cursor.lnum)
4068 && ((colnr_T)(reginput - regline)
4069 == reg_win->w_cursor.col));
4070 if (result)
4071 addstate_here(thislist, t->state->out, &t->sub, &listidx);
4072 break;
4073
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004074 default: /* regular character */
Bram Moolenaarc4912e52013-05-26 19:19:52 +02004075 {
4076 int c = t->state->c;
Bram Moolenaar12e40142013-05-21 15:33:41 +02004077
Bram Moolenaarc4912e52013-05-26 19:19:52 +02004078 /* TODO: put this in #ifdef later */
4079 if (c < -256)
4080 EMSGN("INTERNAL: Negative state char: %ld", c);
4081 if (is_Magic(c))
4082 c = un_Magic(c);
4083 result = (c == curc);
4084
4085 if (!result && ireg_ic)
4086 result = MB_TOLOWER(c) == MB_TOLOWER(curc);
Bram Moolenaar3c577f22013-05-24 21:59:54 +02004087#ifdef FEAT_MBYTE
4088 /* If there is a composing character which is not being
4089 * ignored there can be no match. Match with composing
4090 * character uses NFA_COMPOSING above. */
4091 if (result && enc_utf8 && !ireg_icombine
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02004092 && clen != utf_char2len(curc))
Bram Moolenaar3c577f22013-05-24 21:59:54 +02004093 result = FALSE;
4094#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004095 ADD_POS_NEG_STATE(t->state);
4096 break;
Bram Moolenaarc4912e52013-05-26 19:19:52 +02004097 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004098 }
4099
4100 } /* for (thislist = thislist; thislist->state; thislist++) */
4101
Bram Moolenaare23febd2013-05-26 18:40:14 +02004102 /* Look for the start of a match in the current position by adding the
4103 * start state to the list of states.
4104 * The first found match is the leftmost one, thus the order of states
4105 * matters!
4106 * Do not add the start state in recursive calls of nfa_regmatch(),
4107 * because recursive calls should only start in the first position.
4108 * Also don't start a match past the first line. */
Bram Moolenaar963fee22013-05-26 21:47:28 +02004109 if (nfa_match == FALSE && start->c == NFA_MOPEN + 0
Bram Moolenaar75eb1612013-05-29 18:45:11 +02004110 && reglnum == 0 && clen != 0
4111 && (ireg_maxcol == 0
4112 || (colnr_T)(reginput - regline) < ireg_maxcol))
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004113 {
4114#ifdef ENABLE_LOG
4115 fprintf(log_fd, "(---) STARTSTATE\n");
4116#endif
Bram Moolenaar5714b802013-05-28 22:03:20 +02004117 addstate(nextlist, start, m, clen);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004118 }
4119
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004120#ifdef ENABLE_LOG
4121 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02004122 {
4123 int i;
4124
4125 for (i = 0; i < thislist->n; i++)
4126 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
4127 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004128 fprintf(log_fd, "\n");
4129#endif
4130
4131nextchar:
Bram Moolenaar35b23862013-05-22 23:00:40 +02004132 /* Advance to the next character, or advance to the next line, or
4133 * finish. */
Bram Moolenaar7cd4d9c2013-05-26 14:54:12 +02004134 if (clen != 0)
4135 reginput += clen;
Bram Moolenaar35b23862013-05-22 23:00:40 +02004136 else if (go_to_nextline)
4137 reg_nextline();
4138 else
4139 break;
4140 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004141
4142#ifdef ENABLE_LOG
4143 if (log_fd != stderr)
4144 fclose(log_fd);
4145 log_fd = NULL;
4146#endif
4147
4148theend:
4149 /* Free memory */
4150 vim_free(list[0].t);
4151 vim_free(list[1].t);
4152 vim_free(list[2].t);
Bram Moolenaar963fee22013-05-26 21:47:28 +02004153 vim_free(listids);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004154#undef ADD_POS_NEG_STATE
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02004155#ifdef NFA_REGEXP_DEBUG_LOG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004156 fclose(debug);
4157#endif
4158
Bram Moolenaar963fee22013-05-26 21:47:28 +02004159 return nfa_match;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004160}
4161
4162/*
4163 * Try match of "prog" with at regline["col"].
4164 * Returns 0 for failure, number of lines contained in the match otherwise.
4165 */
4166 static long
4167nfa_regtry(start, col)
4168 nfa_state_T *start;
4169 colnr_T col;
4170{
4171 int i;
4172 regsub_T sub, m;
4173#ifdef ENABLE_LOG
4174 FILE *f;
4175#endif
4176
4177 reginput = regline + col;
4178 need_clear_subexpr = TRUE;
4179
4180#ifdef ENABLE_LOG
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02004181 f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004182 if (f != NULL)
4183 {
4184 fprintf(f, "\n\n\n\n\n\n\t\t=======================================================\n");
4185 fprintf(f, " =======================================================\n");
4186#ifdef DEBUG
4187 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
4188#endif
4189 fprintf(f, "\tInput text is \"%s\" \n", reginput);
Bram Moolenaar2d5e1122013-05-30 21:42:13 +02004190 fprintf(f, " =======================================================\n\n");
Bram Moolenaar152e7892013-05-25 12:28:11 +02004191 nfa_print_state(f, start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004192 fprintf(f, "\n\n");
4193 fclose(f);
4194 }
4195 else
4196 EMSG(_("Could not open temporary log file for writing "));
4197#endif
4198
4199 if (REG_MULTI)
4200 {
4201 /* Use 0xff to set lnum to -1 */
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004202 vim_memset(sub.list.multi, 0xff, sizeof(struct multipos) * nfa_nsubexpr);
4203 vim_memset(m.list.multi, 0xff, sizeof(struct multipos) * nfa_nsubexpr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004204 }
4205 else
4206 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004207 vim_memset(sub.list.line, 0, sizeof(struct linepos) * nfa_nsubexpr);
4208 vim_memset(m.list.line, 0, sizeof(struct linepos) * nfa_nsubexpr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004209 }
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004210 sub.in_use = 0;
4211 m.in_use = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004212
4213 if (nfa_regmatch(start, &sub, &m) == FALSE)
4214 return 0;
4215
4216 cleanup_subexpr();
4217 if (REG_MULTI)
4218 {
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004219 for (i = 0; i < sub.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004220 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004221 reg_startpos[i] = sub.list.multi[i].start;
4222 reg_endpos[i] = sub.list.multi[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004223 }
4224
4225 if (reg_startpos[0].lnum < 0)
4226 {
4227 reg_startpos[0].lnum = 0;
4228 reg_startpos[0].col = col;
4229 }
4230 if (reg_endpos[0].lnum < 0)
4231 {
Bram Moolenaare0fea9c2013-05-27 20:10:50 +02004232 /* pattern has a \ze but it didn't match, use current end */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004233 reg_endpos[0].lnum = reglnum;
4234 reg_endpos[0].col = (int)(reginput - regline);
4235 }
4236 else
4237 /* Use line number of "\ze". */
4238 reglnum = reg_endpos[0].lnum;
4239 }
4240 else
4241 {
Bram Moolenaar26c2f3f2013-05-26 22:56:19 +02004242 for (i = 0; i < sub.in_use; i++)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004243 {
Bram Moolenaarf9e56b22013-05-28 22:52:16 +02004244 reg_startp[i] = sub.list.line[i].start;
4245 reg_endp[i] = sub.list.line[i].end;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004246 }
4247
4248 if (reg_startp[0] == NULL)
4249 reg_startp[0] = regline + col;
4250 if (reg_endp[0] == NULL)
4251 reg_endp[0] = reginput;
4252 }
4253
4254 return 1 + reglnum;
4255}
4256
4257/*
4258 * Match a regexp against a string ("line" points to the string) or multiple
4259 * lines ("line" is NULL, use reg_getline()).
4260 *
4261 * Returns 0 for failure, number of lines contained in the match otherwise.
4262 */
4263 static long
4264nfa_regexec_both(line, col)
4265 char_u *line;
4266 colnr_T col; /* column to start looking for match */
4267{
4268 nfa_regprog_T *prog;
4269 long retval = 0L;
4270 int i;
4271
4272 if (REG_MULTI)
4273 {
4274 prog = (nfa_regprog_T *)reg_mmatch->regprog;
4275 line = reg_getline((linenr_T)0); /* relative to the cursor */
4276 reg_startpos = reg_mmatch->startpos;
4277 reg_endpos = reg_mmatch->endpos;
4278 }
4279 else
4280 {
4281 prog = (nfa_regprog_T *)reg_match->regprog;
4282 reg_startp = reg_match->startp;
4283 reg_endp = reg_match->endp;
4284 }
4285
4286 /* Be paranoid... */
4287 if (prog == NULL || line == NULL)
4288 {
4289 EMSG(_(e_null));
4290 goto theend;
4291 }
4292
4293 /* If the start column is past the maximum column: no need to try. */
4294 if (ireg_maxcol > 0 && col >= ireg_maxcol)
4295 goto theend;
4296
4297 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
4298 if (prog->regflags & RF_ICASE)
4299 ireg_ic = TRUE;
4300 else if (prog->regflags & RF_NOICASE)
4301 ireg_ic = FALSE;
4302
4303#ifdef FEAT_MBYTE
4304 /* If pattern contains "\Z" overrule value of ireg_icombine */
4305 if (prog->regflags & RF_ICOMBINE)
4306 ireg_icombine = TRUE;
4307#endif
4308
4309 regline = line;
4310 reglnum = 0; /* relative to line */
4311
Bram Moolenaar57a285b2013-05-26 16:57:28 +02004312 nfa_has_zend = prog->has_zend;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004313 nfa_has_backref = prog->has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004314 nfa_nsubexpr = prog->nsubexp;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004315
Bram Moolenaar57a285b2013-05-26 16:57:28 +02004316 nstate = prog->nstate;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004317 for (i = 0; i < nstate; ++i)
4318 {
4319 prog->state[i].id = i;
4320 prog->state[i].lastlist = 0;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004321 }
4322
4323 retval = nfa_regtry(prog->start, col);
4324
4325theend:
4326 return retval;
4327}
4328
4329/*
4330 * Compile a regular expression into internal code for the NFA matcher.
4331 * Returns the program in allocated space. Returns NULL for an error.
4332 */
4333 static regprog_T *
4334nfa_regcomp(expr, re_flags)
4335 char_u *expr;
4336 int re_flags;
4337{
Bram Moolenaaraae48832013-05-25 21:18:34 +02004338 nfa_regprog_T *prog = NULL;
Bram Moolenaarca12d7c2013-05-20 21:26:33 +02004339 size_t prog_size;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004340 int *postfix;
4341
4342 if (expr == NULL)
4343 return NULL;
4344
4345#ifdef DEBUG
4346 nfa_regengine.expr = expr;
4347#endif
4348
4349 init_class_tab();
4350
4351 if (nfa_regcomp_start(expr, re_flags) == FAIL)
4352 return NULL;
4353
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004354 /* Build postfix form of the regexp. Needed to build the NFA
Bram Moolenaaraae48832013-05-25 21:18:34 +02004355 * (and count its size). */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004356 postfix = re2post();
4357 if (postfix == NULL)
Bram Moolenaar61db8b52013-05-26 17:45:49 +02004358 {
4359 /* TODO: only give this error for debugging? */
4360 if (post_ptr >= post_end)
4361 EMSGN("Internal error: estimated max number of states insufficient: %ld", post_end - post_start);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004362 goto fail; /* Cascaded (syntax?) error */
Bram Moolenaar61db8b52013-05-26 17:45:49 +02004363 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004364
4365 /*
4366 * In order to build the NFA, we parse the input regexp twice:
4367 * 1. first pass to count size (so we can allocate space)
4368 * 2. second to emit code
4369 */
4370#ifdef ENABLE_LOG
4371 {
Bram Moolenaard6c11cb2013-05-25 12:18:39 +02004372 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004373
4374 if (f != NULL)
4375 {
4376 fprintf(f, "\n*****************************\n\n\n\n\tCompiling regexp \"%s\" ... hold on !\n", expr);
4377 fclose(f);
4378 }
4379 }
4380#endif
4381
4382 /*
4383 * PASS 1
4384 * Count number of NFA states in "nstate". Do not build the NFA.
4385 */
4386 post2nfa(postfix, post_ptr, TRUE);
Bram Moolenaaraae48832013-05-25 21:18:34 +02004387
4388 /* Space for compiled regexp */
4389 prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * nstate;
4390 prog = (nfa_regprog_T *)lalloc(prog_size, TRUE);
4391 if (prog == NULL)
4392 goto fail;
4393 vim_memset(prog, 0, prog_size);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004394 state_ptr = prog->state;
4395
4396 /*
4397 * PASS 2
4398 * Build the NFA
4399 */
4400 prog->start = post2nfa(postfix, post_ptr, FALSE);
4401 if (prog->start == NULL)
4402 goto fail;
4403
4404 prog->regflags = regflags;
4405 prog->engine = &nfa_regengine;
4406 prog->nstate = nstate;
Bram Moolenaar57a285b2013-05-26 16:57:28 +02004407 prog->has_zend = nfa_has_zend;
Bram Moolenaar428e9872013-05-30 17:05:39 +02004408 prog->has_backref = nfa_has_backref;
Bram Moolenaar963fee22013-05-26 21:47:28 +02004409 prog->nsubexp = regnpar;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004410#ifdef ENABLE_LOG
4411 nfa_postfix_dump(expr, OK);
4412 nfa_dump(prog);
4413#endif
4414
4415out:
4416 vim_free(post_start);
4417 post_start = post_ptr = post_end = NULL;
4418 state_ptr = NULL;
4419 return (regprog_T *)prog;
4420
4421fail:
4422 vim_free(prog);
4423 prog = NULL;
4424#ifdef ENABLE_LOG
4425 nfa_postfix_dump(expr, FAIL);
4426#endif
4427#ifdef DEBUG
4428 nfa_regengine.expr = NULL;
4429#endif
4430 goto out;
4431}
4432
4433
4434/*
4435 * Match a regexp against a string.
4436 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
4437 * Uses curbuf for line count and 'iskeyword'.
4438 *
4439 * Return TRUE if there is a match, FALSE if not.
4440 */
4441 static int
4442nfa_regexec(rmp, line, col)
4443 regmatch_T *rmp;
4444 char_u *line; /* string to match against */
4445 colnr_T col; /* column to start looking for match */
4446{
4447 reg_match = rmp;
4448 reg_mmatch = NULL;
4449 reg_maxline = 0;
4450 reg_line_lbr = FALSE;
4451 reg_buf = curbuf;
4452 reg_win = NULL;
4453 ireg_ic = rmp->rm_ic;
4454#ifdef FEAT_MBYTE
4455 ireg_icombine = FALSE;
4456#endif
4457 ireg_maxcol = 0;
4458 return (nfa_regexec_both(line, col) != 0);
4459}
4460
4461#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
4462 || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
4463
4464static int nfa_regexec_nl __ARGS((regmatch_T *rmp, char_u *line, colnr_T col));
4465
4466/*
4467 * Like nfa_regexec(), but consider a "\n" in "line" to be a line break.
4468 */
4469 static int
4470nfa_regexec_nl(rmp, line, col)
4471 regmatch_T *rmp;
4472 char_u *line; /* string to match against */
4473 colnr_T col; /* column to start looking for match */
4474{
4475 reg_match = rmp;
4476 reg_mmatch = NULL;
4477 reg_maxline = 0;
4478 reg_line_lbr = TRUE;
4479 reg_buf = curbuf;
4480 reg_win = NULL;
4481 ireg_ic = rmp->rm_ic;
4482#ifdef FEAT_MBYTE
4483 ireg_icombine = FALSE;
4484#endif
4485 ireg_maxcol = 0;
4486 return (nfa_regexec_both(line, col) != 0);
4487}
4488#endif
4489
4490
4491/*
4492 * Match a regexp against multiple lines.
4493 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
4494 * Uses curbuf for line count and 'iskeyword'.
4495 *
4496 * Return zero if there is no match. Return number of lines contained in the
4497 * match otherwise.
4498 *
4499 * Note: the body is the same as bt_regexec() except for nfa_regexec_both()
4500 *
4501 * ! Also NOTE : match may actually be in another line. e.g.:
4502 * when r.e. is \nc, cursor is at 'a' and the text buffer looks like
4503 *
4504 * +-------------------------+
4505 * |a |
4506 * |b |
4507 * |c |
4508 * | |
4509 * +-------------------------+
4510 *
4511 * then nfa_regexec_multi() returns 3. while the original
4512 * vim_regexec_multi() returns 0 and a second call at line 2 will return 2.
4513 *
4514 * FIXME if this behavior is not compatible.
4515 */
4516 static long
4517nfa_regexec_multi(rmp, win, buf, lnum, col, tm)
4518 regmmatch_T *rmp;
4519 win_T *win; /* window in which to search or NULL */
4520 buf_T *buf; /* buffer in which to search */
4521 linenr_T lnum; /* nr of line to start looking for match */
4522 colnr_T col; /* column to start looking for match */
4523 proftime_T *tm UNUSED; /* timeout limit or NULL */
4524{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004525 reg_match = NULL;
4526 reg_mmatch = rmp;
4527 reg_buf = buf;
4528 reg_win = win;
4529 reg_firstlnum = lnum;
4530 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
4531 reg_line_lbr = FALSE;
4532 ireg_ic = rmp->rmm_ic;
4533#ifdef FEAT_MBYTE
4534 ireg_icombine = FALSE;
4535#endif
4536 ireg_maxcol = rmp->rmm_maxcol;
4537
Bram Moolenaarf878bf02013-05-21 21:20:20 +02004538 return nfa_regexec_both(NULL, col);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02004539}
4540
4541#ifdef DEBUG
4542# undef ENABLE_LOG
4543#endif