blob: 78be2b5f8dfc2074bf0a4244f233ce4fec14d2c2 [file] [log] [blame]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * Backtracking regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 *
7 * NOTICE:
8 *
9 * This is NOT the original regular expression code as written by Henry
10 * Spencer. This code has been modified specifically for use with the VIM
11 * editor, and should not be used separately from Vim. If you want a good
12 * regular expression library, get the original code. The copyright notice
13 * that follows is from the original.
14 *
15 * END NOTICE
16 *
17 * Copyright (c) 1986 by University of Toronto.
18 * Written by Henry Spencer. Not derived from licensed software.
19 *
20 * Permission is granted to anyone to use this software for any
21 * purpose on any computer system, and to redistribute it freely,
22 * subject to the following restrictions:
23 *
24 * 1. The author is not responsible for the consequences of use of
25 * this software, no matter how awful, even if they arise
26 * from defects in it.
27 *
28 * 2. The origin of this software must not be misrepresented, either
29 * by explicit claim or by omission.
30 *
31 * 3. Altered versions must be plainly marked as such, and must not
32 * be misrepresented as being the original software.
33 *
34 * Beware that some of this code is subtly aware of the way operator
35 * precedence is structured in regular expressions. Serious changes in
36 * regular-expression syntax might require a total rethink.
37 *
38 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
39 * Webb, Ciaran McCreesh and Bram Moolenaar.
40 * Named character class support added by Walter Briscoe (1998 Jul 01)
41 */
42
43/*
44 * The "internal use only" fields in regexp.h are present to pass info from
45 * compile to execute that permits the execute phase to run lots faster on
46 * simple cases. They are:
47 *
48 * regstart char that must begin a match; NUL if none obvious; Can be a
49 * multi-byte character.
50 * reganch is the match anchored (at beginning-of-line only)?
51 * regmust string (pointer into program) that match must include, or NULL
52 * regmlen length of regmust string
53 * regflags RF_ values or'ed together
54 *
55 * Regstart and reganch permit very fast decisions on suitable starting points
56 * for a match, cutting down the work a lot. Regmust permits fast rejection
57 * of lines that cannot possibly match. The regmust tests are costly enough
58 * that vim_regcomp() supplies a regmust only if the r.e. contains something
59 * potentially expensive (at present, the only such thing detected is * or +
60 * at the start of the r.e., which can involve a lot of backup). Regmlen is
61 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
62 * computing it anyway.
63 */
64
65/*
66 * Structure for regexp "program". This is essentially a linear encoding
67 * of a nondeterministic finite-state machine (aka syntax charts or
68 * "railroad normal form" in parsing technology). Each node is an opcode
69 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
70 * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
71 * pointer with a BRANCH on both ends of it is connecting two alternatives.
72 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
73 * (as opposed to a collection of them) is never concatenated with anything
74 * because of operator precedence). The "next" pointer of a BRACES_COMPLEX
75 * node points to the node after the stuff to be repeated.
76 * The operand of some types of node is a literal string; for others, it is a
77 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
78 * is the first node of the branch.
79 * (NB this is *not* a tree structure: the tail of the branch connects to the
80 * thing following the set of BRANCHes.)
81 *
82 * pattern is coded like:
83 *
84 * +-----------------+
85 * | V
86 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
87 * | ^ | ^
88 * +------+ +----------+
89 *
90 *
91 * +------------------+
92 * V |
93 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
94 * | | ^ ^
95 * | +---------------+ |
96 * +---------------------------------------------+
97 *
98 *
99 * +----------------------+
100 * V |
101 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
102 * | | ^ ^
103 * | +-----------+ |
104 * +--------------------------------------------------+
105 *
106 *
107 * +-------------------------+
108 * V |
109 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
110 * | | ^
111 * | +----------------+
112 * +-----------------------------------------------+
113 *
114 *
115 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
116 * | | ^ ^
117 * | +----------------+ |
118 * +--------------------------------+
119 *
120 * +---------+
121 * | V
122 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
123 * | | | | ^ ^
124 * | | | +-----+ |
125 * | | +----------------+ |
126 * | +---------------------------+ |
127 * +------------------------------------------------------+
128 *
129 * They all start with a BRANCH for "\|" alternatives, even when there is only
130 * one alternative.
131 */
132
133/*
134 * The opcodes are:
135 */
136
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200137// definition number opnd? meaning
138#define END 0 // End of program or NOMATCH operand.
139#define BOL 1 // Match "" at beginning of line.
140#define EOL 2 // Match "" at end of line.
141#define BRANCH 3 // node Match this alternative, or the
142 // next...
143#define BACK 4 // Match "", "next" ptr points backward.
144#define EXACTLY 5 // str Match this string.
145#define NOTHING 6 // Match empty string.
146#define STAR 7 // node Match this (simple) thing 0 or more
147 // times.
148#define PLUS 8 // node Match this (simple) thing 1 or more
149 // times.
150#define MATCH 9 // node match the operand zero-width
151#define NOMATCH 10 // node check for no match with operand
152#define BEHIND 11 // node look behind for a match with operand
153#define NOBEHIND 12 // node look behind for no match with operand
154#define SUBPAT 13 // node match the operand here
155#define BRACE_SIMPLE 14 // node Match this (simple) thing between m and
156 // n times (\{m,n\}).
157#define BOW 15 // Match "" after [^a-zA-Z0-9_]
158#define EOW 16 // Match "" at [^a-zA-Z0-9_]
159#define BRACE_LIMITS 17 // nr nr define the min & max for BRACE_SIMPLE
160 // and BRACE_COMPLEX.
161#define NEWL 18 // Match line-break
162#define BHPOS 19 // End position for BEHIND or NOBEHIND
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200163
164
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200165// character classes: 20-48 normal, 50-78 include a line-break
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200166#define ADD_NL 30
167#define FIRST_NL ANY + ADD_NL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200168#define ANY 20 // Match any one character.
169#define ANYOF 21 // str Match any character in this string.
170#define ANYBUT 22 // str Match any character not in this
171 // string.
172#define IDENT 23 // Match identifier char
173#define SIDENT 24 // Match identifier char but no digit
174#define KWORD 25 // Match keyword char
175#define SKWORD 26 // Match word char but no digit
176#define FNAME 27 // Match file name char
177#define SFNAME 28 // Match file name char but no digit
178#define PRINT 29 // Match printable char
179#define SPRINT 30 // Match printable char but no digit
180#define WHITE 31 // Match whitespace char
181#define NWHITE 32 // Match non-whitespace char
182#define DIGIT 33 // Match digit char
183#define NDIGIT 34 // Match non-digit char
184#define HEX 35 // Match hex char
185#define NHEX 36 // Match non-hex char
186#define OCTAL 37 // Match octal char
187#define NOCTAL 38 // Match non-octal char
188#define WORD 39 // Match word char
189#define NWORD 40 // Match non-word char
190#define HEAD 41 // Match head char
191#define NHEAD 42 // Match non-head char
192#define ALPHA 43 // Match alpha char
193#define NALPHA 44 // Match non-alpha char
194#define LOWER 45 // Match lowercase char
195#define NLOWER 46 // Match non-lowercase char
196#define UPPER 47 // Match uppercase char
197#define NUPPER 48 // Match non-uppercase char
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200198#define LAST_NL NUPPER + ADD_NL
199#define WITH_NL(op) ((op) >= FIRST_NL && (op) <= LAST_NL)
200
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200201#define MOPEN 80 // -89 Mark this point in input as start of
202 // \( subexpr. MOPEN + 0 marks start of
203 // match.
204#define MCLOSE 90 // -99 Analogous to MOPEN. MCLOSE + 0 marks
205 // end of match.
206#define BACKREF 100 // -109 node Match same string again \1-\9
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200207
208#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200209# define ZOPEN 110 // -119 Mark this point in input as start of
210 // \z( subexpr.
211# define ZCLOSE 120 // -129 Analogous to ZOPEN.
212# define ZREF 130 // -139 node Match external submatch \z1-\z9
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200213#endif
214
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200215#define BRACE_COMPLEX 140 // -149 node Match nodes between m & n times
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200216
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200217#define NOPEN 150 // Mark this point in input as start of
218 // \%( subexpr.
219#define NCLOSE 151 // Analogous to NOPEN.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200220
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200221#define MULTIBYTECODE 200 // mbc Match one multi-byte character
222#define RE_BOF 201 // Match "" at beginning of file.
223#define RE_EOF 202 // Match "" at end of file.
224#define CURSOR 203 // Match location of cursor.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200225
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200226#define RE_LNUM 204 // nr cmp Match line number
227#define RE_COL 205 // nr cmp Match column number
228#define RE_VCOL 206 // nr cmp Match virtual column number
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200229
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200230#define RE_MARK 207 // mark cmp Match mark position
231#define RE_VISUAL 208 // Match Visual area
232#define RE_COMPOSING 209 // any composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200233
234/*
235 * Flags to be passed up and down.
236 */
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200237#define HASWIDTH 0x1 // Known never to match null string.
238#define SIMPLE 0x2 // Simple enough to be STAR/PLUS operand.
239#define SPSTART 0x4 // Starts with * or +.
240#define HASNL 0x8 // Contains some \n.
241#define HASLOOKBH 0x10 // Contains "\@<=" or "\@<!".
242#define WORST 0 // Worst case.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200243
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200244static int num_complex_braces; // Complex \{...} count
245static char_u *regcode; // Code-emit pointer, or JUST_CALC_SIZE
246static long regsize; // Code size.
247static int reg_toolong; // TRUE when offset out of range
248static char_u had_endbrace[NSUBEXP]; // flags, TRUE if end of () found
249static long brace_min[10]; // Minimums for complex brace repeats
250static long brace_max[10]; // Maximums for complex brace repeats
251static int brace_count[10]; // Current counts for complex brace repeats
252static int one_exactly = FALSE; // only do one char for EXACTLY
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200253
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200254// When making changes to classchars also change nfa_classcodes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200255static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
256static int classcodes[] = {
257 ANY, IDENT, SIDENT, KWORD, SKWORD,
258 FNAME, SFNAME, PRINT, SPRINT,
259 WHITE, NWHITE, DIGIT, NDIGIT,
260 HEX, NHEX, OCTAL, NOCTAL,
261 WORD, NWORD, HEAD, NHEAD,
262 ALPHA, NALPHA, LOWER, NLOWER,
263 UPPER, NUPPER
264};
265
266/*
267 * When regcode is set to this value, code is not emitted and size is computed
268 * instead.
269 */
270#define JUST_CALC_SIZE ((char_u *) -1)
271
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200272// Values for rs_state in regitem_T.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200273typedef enum regstate_E
274{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200275 RS_NOPEN = 0 // NOPEN and NCLOSE
276 , RS_MOPEN // MOPEN + [0-9]
277 , RS_MCLOSE // MCLOSE + [0-9]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200278#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200279 , RS_ZOPEN // ZOPEN + [0-9]
280 , RS_ZCLOSE // ZCLOSE + [0-9]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200281#endif
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200282 , RS_BRANCH // BRANCH
283 , RS_BRCPLX_MORE // BRACE_COMPLEX and trying one more match
284 , RS_BRCPLX_LONG // BRACE_COMPLEX and trying longest match
285 , RS_BRCPLX_SHORT // BRACE_COMPLEX and trying shortest match
286 , RS_NOMATCH // NOMATCH
287 , RS_BEHIND1 // BEHIND / NOBEHIND matching rest
288 , RS_BEHIND2 // BEHIND / NOBEHIND matching behind part
289 , RS_STAR_LONG // STAR/PLUS/BRACE_SIMPLE longest match
290 , RS_STAR_SHORT // STAR/PLUS/BRACE_SIMPLE shortest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200291} regstate_T;
292
293/*
294 * Structure used to save the current input state, when it needs to be
295 * restored after trying a match. Used by reg_save() and reg_restore().
296 * Also stores the length of "backpos".
297 */
298typedef struct
299{
300 union
301 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200302 char_u *ptr; // rex.input pointer, for single-line regexp
303 lpos_T pos; // rex.input pos, for multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200304 } rs_u;
305 int rs_len;
306} regsave_T;
307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200308// struct to save start/end pointer/position in for \(\)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200309typedef struct
310{
311 union
312 {
313 char_u *ptr;
314 lpos_T pos;
315 } se_u;
316} save_se_T;
317
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200318// used for BEHIND and NOBEHIND matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200319typedef struct regbehind_S
320{
321 regsave_T save_after;
322 regsave_T save_behind;
323 int save_need_clear_subexpr;
324 save_se_T save_start[NSUBEXP];
325 save_se_T save_end[NSUBEXP];
326} regbehind_T;
327
328/*
329 * When there are alternatives a regstate_T is put on the regstack to remember
330 * what we are doing.
331 * Before it may be another type of item, depending on rs_state, to remember
332 * more things.
333 */
334typedef struct regitem_S
335{
336 regstate_T rs_state; // what we are doing, one of RS_ above
337 short rs_no; // submatch nr or BEHIND/NOBEHIND
338 char_u *rs_scan; // current node in program
339 union
340 {
341 save_se_T sesave;
342 regsave_T regsave;
343 } rs_un; // room for saving rex.input
344} regitem_T;
345
346
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200347// used for STAR, PLUS and BRACE_SIMPLE matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200348typedef struct regstar_S
349{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200350 int nextb; // next byte
351 int nextb_ic; // next byte reverse case
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200352 long count;
353 long minval;
354 long maxval;
355} regstar_T;
356
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200357// used to store input position when a BACK was encountered, so that we now if
358// we made any progress since the last time.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200359typedef struct backpos_S
360{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200361 char_u *bp_scan; // "scan" where BACK was encountered
362 regsave_T bp_pos; // last input position
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363} backpos_T;
364
365/*
366 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
367 * to avoid invoking malloc() and free() often.
368 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
369 * or regbehind_T.
370 * "backpos_T" is a table with backpos_T for BACK
371 */
372static garray_T regstack = {0, 0, 0, 0, NULL};
373static garray_T backpos = {0, 0, 0, 0, NULL};
374
375static regsave_T behind_pos;
376
377/*
378 * Both for regstack and backpos tables we use the following strategy of
379 * allocation (to reduce malloc/free calls):
380 * - Initial size is fairly small.
381 * - When needed, the tables are grown bigger (8 times at first, double after
382 * that).
383 * - After executing the match we free the memory only if the array has grown.
384 * Thus the memory is kept allocated when it's at the initial size.
385 * This makes it fast while not keeping a lot of memory allocated.
386 * A three times speed increase was observed when using many simple patterns.
387 */
388#define REGSTACK_INITIAL 2048
389#define BACKPOS_INITIAL 64
390
391/*
392 * Opcode notes:
393 *
394 * BRANCH The set of branches constituting a single choice are hooked
395 * together with their "next" pointers, since precedence prevents
396 * anything being concatenated to any individual branch. The
397 * "next" pointer of the last BRANCH in a choice points to the
398 * thing following the whole choice. This is also where the
399 * final "next" pointer of each individual branch points; each
400 * branch starts with the operand node of a BRANCH node.
401 *
402 * BACK Normal "next" pointers all implicitly point forward; BACK
403 * exists to make loop structures possible.
404 *
405 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
406 * BRANCH structures using BACK. Simple cases (one character
407 * per match) are implemented with STAR and PLUS for speed
408 * and to minimize recursive plunges.
409 *
410 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
411 * node, and defines the min and max limits to be used for that
412 * node.
413 *
414 * MOPEN,MCLOSE ...are numbered at compile time.
415 * ZOPEN,ZCLOSE ...ditto
416 */
417
418/*
419 * A node is one char of opcode followed by two chars of "next" pointer.
420 * "Next" pointers are stored as two 8-bit bytes, high order first. The
421 * value is a positive offset from the opcode of the node containing it.
422 * An operand, if any, simply follows the node. (Note that much of the
423 * code generation knows about this implicit relationship.)
424 *
425 * Using two bytes for the "next" pointer is vast overkill for most things,
426 * but allows patterns to get big without disasters.
427 */
428#define OP(p) ((int)*(p))
429#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
430#define OPERAND(p) ((p) + 3)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200431// Obtain an operand that was stored as four bytes, MSB first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200432#define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
433 + ((long)(p)[5] << 8) + (long)(p)[6])
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200434// Obtain a second operand stored as four bytes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200435#define OPERAND_MAX(p) OPERAND_MIN((p) + 4)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200436// Obtain a second single-byte operand stored after a four bytes operand.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200437#define OPERAND_CMP(p) (p)[7]
438
439static char_u *reg(int paren, int *flagp);
440
441#ifdef BT_REGEXP_DUMP
442static void regdump(char_u *, bt_regprog_T *);
443#endif
444
445static int re_num_cmp(long_u val, char_u *scan);
446
447#ifdef DEBUG
448static char_u *regprop(char_u *);
449
450static int regnarrate = 0;
451#endif
452
453
454/*
455 * Setup to parse the regexp. Used once to get the length and once to do it.
456 */
457 static void
458regcomp_start(
459 char_u *expr,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200460 int re_flags) // see vim_regcomp()
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200461{
462 initchr(expr);
463 if (re_flags & RE_MAGIC)
464 reg_magic = MAGIC_ON;
465 else
466 reg_magic = MAGIC_OFF;
467 reg_string = (re_flags & RE_STRING);
468 reg_strict = (re_flags & RE_STRICT);
469 get_cpo_flags();
470
471 num_complex_braces = 0;
472 regnpar = 1;
473 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
474#ifdef FEAT_SYN_HL
475 regnzpar = 1;
476 re_has_z = 0;
477#endif
478 regsize = 0L;
479 reg_toolong = FALSE;
480 regflags = 0;
481#if defined(FEAT_SYN_HL) || defined(PROTO)
482 had_eol = FALSE;
483#endif
484}
485
486/*
487 * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for
488 * character "c".
489 */
490 static int
491use_multibytecode(int c)
492{
493 return has_mbyte && (*mb_char2len)(c) > 1
494 && (re_multi_type(peekchr()) != NOT_MULTI
495 || (enc_utf8 && utf_iscomposing(c)));
496}
497
498/*
499 * Emit (if appropriate) a byte of code
500 */
501 static void
502regc(int b)
503{
504 if (regcode == JUST_CALC_SIZE)
505 regsize++;
506 else
507 *regcode++ = b;
508}
509
510/*
511 * Emit (if appropriate) a multi-byte character of code
512 */
513 static void
514regmbc(int c)
515{
516 if (!has_mbyte && c > 0xff)
517 return;
518 if (regcode == JUST_CALC_SIZE)
519 regsize += (*mb_char2len)(c);
520 else
521 regcode += (*mb_char2bytes)(c, regcode);
522}
523
524#define REGMBC(x) regmbc(x);
525#define CASEMBC(x) case x:
526
527/*
528 * Produce the bytes for equivalence class "c".
529 * Currently only handles latin1, latin9 and utf-8.
530 * NOTE: When changing this function, also change nfa_emit_equi_class()
531 */
532 static void
533reg_equi_class(int c)
534{
535 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
536 || STRCMP(p_enc, "iso-8859-15") == 0)
537 {
538#ifdef EBCDIC
539 int i;
540
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200541 // This might be slower than switch/case below.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200542 for (i = 0; i < 16; i++)
543 {
544 if (vim_strchr(EQUIVAL_CLASS_C[i], c) != NULL)
545 {
546 char *p = EQUIVAL_CLASS_C[i];
547
548 while (*p != 0)
549 regmbc(*p++);
550 return;
551 }
552 }
553#else
554 switch (c)
555 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200556 // Do not use '\300' style, it results in a negative number.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200557 case 'A': case 0xc0: case 0xc1: case 0xc2:
558 case 0xc3: case 0xc4: case 0xc5:
559 CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104) CASEMBC(0x1cd)
560 CASEMBC(0x1de) CASEMBC(0x1e0) CASEMBC(0x1ea2)
561 regmbc('A'); regmbc(0xc0); regmbc(0xc1);
562 regmbc(0xc2); regmbc(0xc3); regmbc(0xc4);
563 regmbc(0xc5);
564 REGMBC(0x100) REGMBC(0x102) REGMBC(0x104)
565 REGMBC(0x1cd) REGMBC(0x1de) REGMBC(0x1e0)
566 REGMBC(0x1ea2)
567 return;
568 case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06)
569 regmbc('B'); REGMBC(0x1e02) REGMBC(0x1e06)
570 return;
571 case 'C': case 0xc7:
572 CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a) CASEMBC(0x10c)
573 regmbc('C'); regmbc(0xc7);
574 REGMBC(0x106) REGMBC(0x108) REGMBC(0x10a)
575 REGMBC(0x10c)
576 return;
577 case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a)
578 CASEMBC(0x1e0e) CASEMBC(0x1e10)
579 regmbc('D'); REGMBC(0x10e) REGMBC(0x110)
580 REGMBC(0x1e0a) REGMBC(0x1e0e) REGMBC(0x1e10)
581 return;
582 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
583 CASEMBC(0x112) CASEMBC(0x114) CASEMBC(0x116) CASEMBC(0x118)
584 CASEMBC(0x11a) CASEMBC(0x1eba) CASEMBC(0x1ebc)
585 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
586 regmbc(0xca); regmbc(0xcb);
587 REGMBC(0x112) REGMBC(0x114) REGMBC(0x116)
588 REGMBC(0x118) REGMBC(0x11a) REGMBC(0x1eba)
589 REGMBC(0x1ebc)
590 return;
591 case 'F': CASEMBC(0x1e1e)
592 regmbc('F'); REGMBC(0x1e1e)
593 return;
594 case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120)
595 CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6) CASEMBC(0x1f4)
596 CASEMBC(0x1e20)
597 regmbc('G'); REGMBC(0x11c) REGMBC(0x11e)
598 REGMBC(0x120) REGMBC(0x122) REGMBC(0x1e4)
599 REGMBC(0x1e6) REGMBC(0x1f4) REGMBC(0x1e20)
600 return;
601 case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22)
602 CASEMBC(0x1e26) CASEMBC(0x1e28)
603 regmbc('H'); REGMBC(0x124) REGMBC(0x126)
604 REGMBC(0x1e22) REGMBC(0x1e26) REGMBC(0x1e28)
605 return;
606 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
607 CASEMBC(0x128) CASEMBC(0x12a) CASEMBC(0x12c) CASEMBC(0x12e)
608 CASEMBC(0x130) CASEMBC(0x1cf) CASEMBC(0x1ec8)
609 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
610 regmbc(0xce); regmbc(0xcf);
611 REGMBC(0x128) REGMBC(0x12a) REGMBC(0x12c)
612 REGMBC(0x12e) REGMBC(0x130) REGMBC(0x1cf)
613 REGMBC(0x1ec8)
614 return;
615 case 'J': CASEMBC(0x134)
616 regmbc('J'); REGMBC(0x134)
617 return;
618 case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30)
619 CASEMBC(0x1e34)
620 regmbc('K'); REGMBC(0x136) REGMBC(0x1e8)
621 REGMBC(0x1e30) REGMBC(0x1e34)
622 return;
623 case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d)
624 CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a)
625 regmbc('L'); REGMBC(0x139) REGMBC(0x13b)
626 REGMBC(0x13d) REGMBC(0x13f) REGMBC(0x141)
627 REGMBC(0x1e3a)
628 return;
629 case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40)
630 regmbc('M'); REGMBC(0x1e3e) REGMBC(0x1e40)
631 return;
632 case 'N': case 0xd1:
633 CASEMBC(0x143) CASEMBC(0x145) CASEMBC(0x147) CASEMBC(0x1e44)
634 CASEMBC(0x1e48)
635 regmbc('N'); regmbc(0xd1);
636 REGMBC(0x143) REGMBC(0x145) REGMBC(0x147)
637 REGMBC(0x1e44) REGMBC(0x1e48)
638 return;
639 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5:
640 case 0xd6: case 0xd8:
641 CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150) CASEMBC(0x1a0)
642 CASEMBC(0x1d1) CASEMBC(0x1ea) CASEMBC(0x1ec) CASEMBC(0x1ece)
643 regmbc('O'); regmbc(0xd2); regmbc(0xd3);
644 regmbc(0xd4); regmbc(0xd5); regmbc(0xd6);
645 regmbc(0xd8);
646 REGMBC(0x14c) REGMBC(0x14e) REGMBC(0x150)
647 REGMBC(0x1a0) REGMBC(0x1d1) REGMBC(0x1ea)
648 REGMBC(0x1ec) REGMBC(0x1ece)
649 return;
650 case 'P': case 0x1e54: case 0x1e56:
651 regmbc('P'); REGMBC(0x1e54) REGMBC(0x1e56)
652 return;
653 case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158)
654 CASEMBC(0x1e58) CASEMBC(0x1e5e)
655 regmbc('R'); REGMBC(0x154) REGMBC(0x156) REGMBC(0x158)
656 REGMBC(0x1e58) REGMBC(0x1e5e)
657 return;
658 case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e)
659 CASEMBC(0x160) CASEMBC(0x1e60)
660 regmbc('S'); REGMBC(0x15a) REGMBC(0x15c)
661 REGMBC(0x15e) REGMBC(0x160) REGMBC(0x1e60)
662 return;
663 case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166)
664 CASEMBC(0x1e6a) CASEMBC(0x1e6e)
665 regmbc('T'); REGMBC(0x162) REGMBC(0x164)
666 REGMBC(0x166) REGMBC(0x1e6a) REGMBC(0x1e6e)
667 return;
668 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
669 CASEMBC(0x168) CASEMBC(0x16a) CASEMBC(0x16c) CASEMBC(0x16e)
670 CASEMBC(0x170) CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3)
671 CASEMBC(0x1ee6)
672 regmbc('U'); regmbc(0xd9); regmbc(0xda);
673 regmbc(0xdb); regmbc(0xdc);
674 REGMBC(0x168) REGMBC(0x16a) REGMBC(0x16c)
675 REGMBC(0x16e) REGMBC(0x170) REGMBC(0x172)
676 REGMBC(0x1af) REGMBC(0x1d3) REGMBC(0x1ee6)
677 return;
678 case 'V': CASEMBC(0x1e7c)
679 regmbc('V'); REGMBC(0x1e7c)
680 return;
681 case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82)
682 CASEMBC(0x1e84) CASEMBC(0x1e86)
683 regmbc('W'); REGMBC(0x174) REGMBC(0x1e80)
684 REGMBC(0x1e82) REGMBC(0x1e84) REGMBC(0x1e86)
685 return;
686 case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c)
687 regmbc('X'); REGMBC(0x1e8a) REGMBC(0x1e8c)
688 return;
689 case 'Y': case 0xdd:
690 CASEMBC(0x176) CASEMBC(0x178) CASEMBC(0x1e8e) CASEMBC(0x1ef2)
691 CASEMBC(0x1ef6) CASEMBC(0x1ef8)
692 regmbc('Y'); regmbc(0xdd);
693 REGMBC(0x176) REGMBC(0x178) REGMBC(0x1e8e)
694 REGMBC(0x1ef2) REGMBC(0x1ef6) REGMBC(0x1ef8)
695 return;
696 case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d)
697 CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94)
698 regmbc('Z'); REGMBC(0x179) REGMBC(0x17b)
699 REGMBC(0x17d) REGMBC(0x1b5) REGMBC(0x1e90)
700 REGMBC(0x1e94)
701 return;
702 case 'a': case 0xe0: case 0xe1: case 0xe2:
703 case 0xe3: case 0xe4: case 0xe5:
704 CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105) CASEMBC(0x1ce)
705 CASEMBC(0x1df) CASEMBC(0x1e1) CASEMBC(0x1ea3)
706 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
707 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
708 regmbc(0xe5);
709 REGMBC(0x101) REGMBC(0x103) REGMBC(0x105)
710 REGMBC(0x1ce) REGMBC(0x1df) REGMBC(0x1e1)
711 REGMBC(0x1ea3)
712 return;
713 case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07)
714 regmbc('b'); REGMBC(0x1e03) REGMBC(0x1e07)
715 return;
716 case 'c': case 0xe7:
717 CASEMBC(0x107) CASEMBC(0x109) CASEMBC(0x10b) CASEMBC(0x10d)
718 regmbc('c'); regmbc(0xe7);
719 REGMBC(0x107) REGMBC(0x109) REGMBC(0x10b)
720 REGMBC(0x10d)
721 return;
722 case 'd': CASEMBC(0x10f) CASEMBC(0x111) CASEMBC(0x1e0b)
723 CASEMBC(0x1e0f) CASEMBC(0x1e11)
724 regmbc('d'); REGMBC(0x10f) REGMBC(0x111)
725 REGMBC(0x1e0b) REGMBC(0x1e0f) REGMBC(0x1e11)
726 return;
727 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
728 CASEMBC(0x113) CASEMBC(0x115) CASEMBC(0x117) CASEMBC(0x119)
729 CASEMBC(0x11b) CASEMBC(0x1ebb) CASEMBC(0x1ebd)
730 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
731 regmbc(0xea); regmbc(0xeb);
732 REGMBC(0x113) REGMBC(0x115) REGMBC(0x117)
733 REGMBC(0x119) REGMBC(0x11b) REGMBC(0x1ebb)
734 REGMBC(0x1ebd)
735 return;
736 case 'f': CASEMBC(0x1e1f)
737 regmbc('f'); REGMBC(0x1e1f)
738 return;
739 case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121)
740 CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7) CASEMBC(0x1f5)
741 CASEMBC(0x1e21)
742 regmbc('g'); REGMBC(0x11d) REGMBC(0x11f)
743 REGMBC(0x121) REGMBC(0x123) REGMBC(0x1e5)
744 REGMBC(0x1e7) REGMBC(0x1f5) REGMBC(0x1e21)
745 return;
746 case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23)
747 CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96)
748 regmbc('h'); REGMBC(0x125) REGMBC(0x127)
749 REGMBC(0x1e23) REGMBC(0x1e27) REGMBC(0x1e29)
750 REGMBC(0x1e96)
751 return;
752 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
753 CASEMBC(0x129) CASEMBC(0x12b) CASEMBC(0x12d) CASEMBC(0x12f)
754 CASEMBC(0x1d0) CASEMBC(0x1ec9)
755 regmbc('i'); regmbc(0xec); regmbc(0xed);
756 regmbc(0xee); regmbc(0xef);
757 REGMBC(0x129) REGMBC(0x12b) REGMBC(0x12d)
758 REGMBC(0x12f) REGMBC(0x1d0) REGMBC(0x1ec9)
759 return;
760 case 'j': CASEMBC(0x135) CASEMBC(0x1f0)
761 regmbc('j'); REGMBC(0x135) REGMBC(0x1f0)
762 return;
763 case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31)
764 CASEMBC(0x1e35)
765 regmbc('k'); REGMBC(0x137) REGMBC(0x1e9)
766 REGMBC(0x1e31) REGMBC(0x1e35)
767 return;
768 case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e)
769 CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b)
770 regmbc('l'); REGMBC(0x13a) REGMBC(0x13c)
771 REGMBC(0x13e) REGMBC(0x140) REGMBC(0x142)
772 REGMBC(0x1e3b)
773 return;
774 case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41)
775 regmbc('m'); REGMBC(0x1e3f) REGMBC(0x1e41)
776 return;
777 case 'n': case 0xf1:
778 CASEMBC(0x144) CASEMBC(0x146) CASEMBC(0x148) CASEMBC(0x149)
779 CASEMBC(0x1e45) CASEMBC(0x1e49)
780 regmbc('n'); regmbc(0xf1);
781 REGMBC(0x144) REGMBC(0x146) REGMBC(0x148)
782 REGMBC(0x149) REGMBC(0x1e45) REGMBC(0x1e49)
783 return;
784 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
785 case 0xf6: case 0xf8:
786 CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151) CASEMBC(0x1a1)
787 CASEMBC(0x1d2) CASEMBC(0x1eb) CASEMBC(0x1ed) CASEMBC(0x1ecf)
788 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
789 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
790 regmbc(0xf8);
791 REGMBC(0x14d) REGMBC(0x14f) REGMBC(0x151)
792 REGMBC(0x1a1) REGMBC(0x1d2) REGMBC(0x1eb)
793 REGMBC(0x1ed) REGMBC(0x1ecf)
794 return;
795 case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57)
796 regmbc('p'); REGMBC(0x1e55) REGMBC(0x1e57)
797 return;
798 case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159)
799 CASEMBC(0x1e59) CASEMBC(0x1e5f)
800 regmbc('r'); REGMBC(0x155) REGMBC(0x157) REGMBC(0x159)
801 REGMBC(0x1e59) REGMBC(0x1e5f)
802 return;
803 case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f)
804 CASEMBC(0x161) CASEMBC(0x1e61)
805 regmbc('s'); REGMBC(0x15b) REGMBC(0x15d)
806 REGMBC(0x15f) REGMBC(0x161) REGMBC(0x1e61)
807 return;
808 case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167)
809 CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97)
810 regmbc('t'); REGMBC(0x163) REGMBC(0x165) REGMBC(0x167)
811 REGMBC(0x1e6b) REGMBC(0x1e6f) REGMBC(0x1e97)
812 return;
813 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
814 CASEMBC(0x169) CASEMBC(0x16b) CASEMBC(0x16d) CASEMBC(0x16f)
815 CASEMBC(0x171) CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4)
816 CASEMBC(0x1ee7)
817 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
818 regmbc(0xfb); regmbc(0xfc);
819 REGMBC(0x169) REGMBC(0x16b) REGMBC(0x16d)
820 REGMBC(0x16f) REGMBC(0x171) REGMBC(0x173)
821 REGMBC(0x1b0) REGMBC(0x1d4) REGMBC(0x1ee7)
822 return;
823 case 'v': CASEMBC(0x1e7d)
824 regmbc('v'); REGMBC(0x1e7d)
825 return;
826 case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83)
827 CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98)
828 regmbc('w'); REGMBC(0x175) REGMBC(0x1e81)
829 REGMBC(0x1e83) REGMBC(0x1e85) REGMBC(0x1e87)
830 REGMBC(0x1e98)
831 return;
832 case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d)
833 regmbc('x'); REGMBC(0x1e8b) REGMBC(0x1e8d)
834 return;
835 case 'y': case 0xfd: case 0xff:
836 CASEMBC(0x177) CASEMBC(0x1e8f) CASEMBC(0x1e99)
837 CASEMBC(0x1ef3) CASEMBC(0x1ef7) CASEMBC(0x1ef9)
838 regmbc('y'); regmbc(0xfd); regmbc(0xff);
839 REGMBC(0x177) REGMBC(0x1e8f) REGMBC(0x1e99)
840 REGMBC(0x1ef3) REGMBC(0x1ef7) REGMBC(0x1ef9)
841 return;
842 case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e)
843 CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95)
844 regmbc('z'); REGMBC(0x17a) REGMBC(0x17c)
845 REGMBC(0x17e) REGMBC(0x1b6) REGMBC(0x1e91)
846 REGMBC(0x1e95)
847 return;
848 }
849#endif
850 }
851 regmbc(c);
852}
853
854/*
855 * Emit a node.
856 * Return pointer to generated code.
857 */
858 static char_u *
859regnode(int op)
860{
861 char_u *ret;
862
863 ret = regcode;
864 if (ret == JUST_CALC_SIZE)
865 regsize += 3;
866 else
867 {
868 *regcode++ = op;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200869 *regcode++ = NUL; // Null "next" pointer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200870 *regcode++ = NUL;
871 }
872 return ret;
873}
874
875/*
876 * Write a long as four bytes at "p" and return pointer to the next char.
877 */
878 static char_u *
879re_put_long(char_u *p, long_u val)
880{
881 *p++ = (char_u) ((val >> 24) & 0377);
882 *p++ = (char_u) ((val >> 16) & 0377);
883 *p++ = (char_u) ((val >> 8) & 0377);
884 *p++ = (char_u) (val & 0377);
885 return p;
886}
887
888/*
889 * regnext - dig the "next" pointer out of a node
890 * Returns NULL when calculating size, when there is no next item and when
891 * there is an error.
892 */
893 static char_u *
894regnext(char_u *p)
895{
896 int offset;
897
898 if (p == JUST_CALC_SIZE || reg_toolong)
899 return NULL;
900
901 offset = NEXT(p);
902 if (offset == 0)
903 return NULL;
904
905 if (OP(p) == BACK)
906 return p - offset;
907 else
908 return p + offset;
909}
910
911/*
912 * Set the next-pointer at the end of a node chain.
913 */
914 static void
915regtail(char_u *p, char_u *val)
916{
917 char_u *scan;
918 char_u *temp;
919 int offset;
920
921 if (p == JUST_CALC_SIZE)
922 return;
923
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200924 // Find last node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200925 scan = p;
926 for (;;)
927 {
928 temp = regnext(scan);
929 if (temp == NULL)
930 break;
931 scan = temp;
932 }
933
934 if (OP(scan) == BACK)
935 offset = (int)(scan - val);
936 else
937 offset = (int)(val - scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200938 // When the offset uses more than 16 bits it can no longer fit in the two
939 // bytes available. Use a global flag to avoid having to check return
940 // values in too many places.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200941 if (offset > 0xffff)
942 reg_toolong = TRUE;
943 else
944 {
945 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
946 *(scan + 2) = (char_u) (offset & 0377);
947 }
948}
949
950/*
951 * Like regtail, on item after a BRANCH; nop if none.
952 */
953 static void
954regoptail(char_u *p, char_u *val)
955{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200956 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200957 if (p == NULL || p == JUST_CALC_SIZE
958 || (OP(p) != BRANCH
959 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
960 return;
961 regtail(OPERAND(p), val);
962}
963
964/*
965 * Insert an operator in front of already-emitted operand
966 *
967 * Means relocating the operand.
968 */
969 static void
970reginsert(int op, char_u *opnd)
971{
972 char_u *src;
973 char_u *dst;
974 char_u *place;
975
976 if (regcode == JUST_CALC_SIZE)
977 {
978 regsize += 3;
979 return;
980 }
981 src = regcode;
982 regcode += 3;
983 dst = regcode;
984 while (src > opnd)
985 *--dst = *--src;
986
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200987 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200988 *place++ = op;
989 *place++ = NUL;
990 *place = NUL;
991}
992
993/*
994 * Insert an operator in front of already-emitted operand.
995 * Add a number to the operator.
996 */
997 static void
998reginsert_nr(int op, long val, char_u *opnd)
999{
1000 char_u *src;
1001 char_u *dst;
1002 char_u *place;
1003
1004 if (regcode == JUST_CALC_SIZE)
1005 {
1006 regsize += 7;
1007 return;
1008 }
1009 src = regcode;
1010 regcode += 7;
1011 dst = regcode;
1012 while (src > opnd)
1013 *--dst = *--src;
1014
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001015 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001016 *place++ = op;
1017 *place++ = NUL;
1018 *place++ = NUL;
1019 re_put_long(place, (long_u)val);
1020}
1021
1022/*
1023 * Insert an operator in front of already-emitted operand.
1024 * The operator has the given limit values as operands. Also set next pointer.
1025 *
1026 * Means relocating the operand.
1027 */
1028 static void
1029reginsert_limits(
1030 int op,
1031 long minval,
1032 long maxval,
1033 char_u *opnd)
1034{
1035 char_u *src;
1036 char_u *dst;
1037 char_u *place;
1038
1039 if (regcode == JUST_CALC_SIZE)
1040 {
1041 regsize += 11;
1042 return;
1043 }
1044 src = regcode;
1045 regcode += 11;
1046 dst = regcode;
1047 while (src > opnd)
1048 *--dst = *--src;
1049
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001050 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001051 *place++ = op;
1052 *place++ = NUL;
1053 *place++ = NUL;
1054 place = re_put_long(place, (long_u)minval);
1055 place = re_put_long(place, (long_u)maxval);
1056 regtail(opnd, place);
1057}
1058
1059/*
1060 * Return TRUE if the back reference is legal. We must have seen the close
1061 * brace.
1062 * TODO: Should also check that we don't refer to something that is repeated
1063 * (+*=): what instance of the repetition should we match?
1064 */
1065 static int
1066seen_endbrace(int refnum)
1067{
1068 if (!had_endbrace[refnum])
1069 {
1070 char_u *p;
1071
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001072 // Trick: check if "@<=" or "@<!" follows, in which case
1073 // the \1 can appear before the referenced match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001074 for (p = regparse; *p != NUL; ++p)
1075 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
1076 break;
1077 if (*p == NUL)
1078 {
1079 emsg(_("E65: Illegal back reference"));
1080 rc_did_emsg = TRUE;
1081 return FALSE;
1082 }
1083 }
1084 return TRUE;
1085}
1086
1087/*
1088 * Parse the lowest level.
1089 *
1090 * Optimization: gobbles an entire sequence of ordinary characters so that
1091 * it can turn them into a single node, which is smaller to store and
1092 * faster to run. Don't do this when one_exactly is set.
1093 */
1094 static char_u *
1095regatom(int *flagp)
1096{
1097 char_u *ret;
1098 int flags;
1099 int c;
1100 char_u *p;
1101 int extra = 0;
1102 int save_prev_at_start = prev_at_start;
1103
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001104 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001105
1106 c = getchr();
1107 switch (c)
1108 {
1109 case Magic('^'):
1110 ret = regnode(BOL);
1111 break;
1112
1113 case Magic('$'):
1114 ret = regnode(EOL);
1115#if defined(FEAT_SYN_HL) || defined(PROTO)
1116 had_eol = TRUE;
1117#endif
1118 break;
1119
1120 case Magic('<'):
1121 ret = regnode(BOW);
1122 break;
1123
1124 case Magic('>'):
1125 ret = regnode(EOW);
1126 break;
1127
1128 case Magic('_'):
1129 c = no_Magic(getchr());
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001130 if (c == '^') // "\_^" is start-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001131 {
1132 ret = regnode(BOL);
1133 break;
1134 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001135 if (c == '$') // "\_$" is end-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001136 {
1137 ret = regnode(EOL);
1138#if defined(FEAT_SYN_HL) || defined(PROTO)
1139 had_eol = TRUE;
1140#endif
1141 break;
1142 }
1143
1144 extra = ADD_NL;
1145 *flagp |= HASNL;
1146
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001147 // "\_[" is character range plus newline
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001148 if (c == '[')
1149 goto collection;
1150
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001151 // "\_x" is character class plus newline
1152 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001153
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001154 // Character classes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001155 case Magic('.'):
1156 case Magic('i'):
1157 case Magic('I'):
1158 case Magic('k'):
1159 case Magic('K'):
1160 case Magic('f'):
1161 case Magic('F'):
1162 case Magic('p'):
1163 case Magic('P'):
1164 case Magic('s'):
1165 case Magic('S'):
1166 case Magic('d'):
1167 case Magic('D'):
1168 case Magic('x'):
1169 case Magic('X'):
1170 case Magic('o'):
1171 case Magic('O'):
1172 case Magic('w'):
1173 case Magic('W'):
1174 case Magic('h'):
1175 case Magic('H'):
1176 case Magic('a'):
1177 case Magic('A'):
1178 case Magic('l'):
1179 case Magic('L'):
1180 case Magic('u'):
1181 case Magic('U'):
1182 p = vim_strchr(classchars, no_Magic(c));
1183 if (p == NULL)
1184 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1185
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001186 // When '.' is followed by a composing char ignore the dot, so that
1187 // the composing char is matched here.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001188 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1189 {
1190 c = getchr();
1191 goto do_multibyte;
1192 }
1193 ret = regnode(classcodes[p - classchars] + extra);
1194 *flagp |= HASWIDTH | SIMPLE;
1195 break;
1196
1197 case Magic('n'):
1198 if (reg_string)
1199 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001200 // In a string "\n" matches a newline character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001201 ret = regnode(EXACTLY);
1202 regc(NL);
1203 regc(NUL);
1204 *flagp |= HASWIDTH | SIMPLE;
1205 }
1206 else
1207 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001208 // In buffer text "\n" matches the end of a line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001209 ret = regnode(NEWL);
1210 *flagp |= HASWIDTH | HASNL;
1211 }
1212 break;
1213
1214 case Magic('('):
1215 if (one_exactly)
1216 EMSG_ONE_RET_NULL;
1217 ret = reg(REG_PAREN, &flags);
1218 if (ret == NULL)
1219 return NULL;
1220 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1221 break;
1222
1223 case NUL:
1224 case Magic('|'):
1225 case Magic('&'):
1226 case Magic(')'):
1227 if (one_exactly)
1228 EMSG_ONE_RET_NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001229 IEMSG_RET_NULL(_(e_internal)); // Supposed to be caught earlier.
1230 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001231
1232 case Magic('='):
1233 case Magic('?'):
1234 case Magic('+'):
1235 case Magic('@'):
1236 case Magic('{'):
1237 case Magic('*'):
1238 c = no_Magic(c);
1239 EMSG3_RET_NULL(_("E64: %s%c follows nothing"),
1240 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001241 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001242
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001243 case Magic('~'): // previous substitute pattern
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001244 if (reg_prev_sub != NULL)
1245 {
1246 char_u *lp;
1247
1248 ret = regnode(EXACTLY);
1249 lp = reg_prev_sub;
1250 while (*lp != NUL)
1251 regc(*lp++);
1252 regc(NUL);
1253 if (*reg_prev_sub != NUL)
1254 {
1255 *flagp |= HASWIDTH;
1256 if ((lp - reg_prev_sub) == 1)
1257 *flagp |= SIMPLE;
1258 }
1259 }
1260 else
1261 EMSG_RET_NULL(_(e_nopresub));
1262 break;
1263
1264 case Magic('1'):
1265 case Magic('2'):
1266 case Magic('3'):
1267 case Magic('4'):
1268 case Magic('5'):
1269 case Magic('6'):
1270 case Magic('7'):
1271 case Magic('8'):
1272 case Magic('9'):
1273 {
1274 int refnum;
1275
1276 refnum = c - Magic('0');
1277 if (!seen_endbrace(refnum))
1278 return NULL;
1279 ret = regnode(BACKREF + refnum);
1280 }
1281 break;
1282
1283 case Magic('z'):
1284 {
1285 c = no_Magic(getchr());
1286 switch (c)
1287 {
1288#ifdef FEAT_SYN_HL
1289 case '(': if ((reg_do_extmatch & REX_SET) == 0)
1290 EMSG_RET_NULL(_(e_z_not_allowed));
1291 if (one_exactly)
1292 EMSG_ONE_RET_NULL;
1293 ret = reg(REG_ZPAREN, &flags);
1294 if (ret == NULL)
1295 return NULL;
1296 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1297 re_has_z = REX_SET;
1298 break;
1299
1300 case '1':
1301 case '2':
1302 case '3':
1303 case '4':
1304 case '5':
1305 case '6':
1306 case '7':
1307 case '8':
1308 case '9': if ((reg_do_extmatch & REX_USE) == 0)
1309 EMSG_RET_NULL(_(e_z1_not_allowed));
1310 ret = regnode(ZREF + c - '0');
1311 re_has_z = REX_USE;
1312 break;
1313#endif
1314
1315 case 's': ret = regnode(MOPEN + 0);
1316 if (re_mult_next("\\zs") == FAIL)
1317 return NULL;
1318 break;
1319
1320 case 'e': ret = regnode(MCLOSE + 0);
1321 if (re_mult_next("\\ze") == FAIL)
1322 return NULL;
1323 break;
1324
1325 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1326 }
1327 }
1328 break;
1329
1330 case Magic('%'):
1331 {
1332 c = no_Magic(getchr());
1333 switch (c)
1334 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001335 // () without a back reference
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001336 case '(':
1337 if (one_exactly)
1338 EMSG_ONE_RET_NULL;
1339 ret = reg(REG_NPAREN, &flags);
1340 if (ret == NULL)
1341 return NULL;
1342 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1343 break;
1344
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001345 // Catch \%^ and \%$ regardless of where they appear in the
1346 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001347 case '^':
1348 ret = regnode(RE_BOF);
1349 break;
1350
1351 case '$':
1352 ret = regnode(RE_EOF);
1353 break;
1354
1355 case '#':
1356 ret = regnode(CURSOR);
1357 break;
1358
1359 case 'V':
1360 ret = regnode(RE_VISUAL);
1361 break;
1362
1363 case 'C':
1364 ret = regnode(RE_COMPOSING);
1365 break;
1366
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001367 // \%[abc]: Emit as a list of branches, all ending at the last
1368 // branch which matches nothing.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001369 case '[':
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001370 if (one_exactly) // doesn't nest
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001371 EMSG_ONE_RET_NULL;
1372 {
1373 char_u *lastbranch;
1374 char_u *lastnode = NULL;
1375 char_u *br;
1376
1377 ret = NULL;
1378 while ((c = getchr()) != ']')
1379 {
1380 if (c == NUL)
1381 EMSG2_RET_NULL(_(e_missing_sb),
1382 reg_magic == MAGIC_ALL);
1383 br = regnode(BRANCH);
1384 if (ret == NULL)
1385 ret = br;
1386 else
1387 {
1388 regtail(lastnode, br);
1389 if (reg_toolong)
1390 return NULL;
1391 }
1392
1393 ungetchr();
1394 one_exactly = TRUE;
1395 lastnode = regatom(flagp);
1396 one_exactly = FALSE;
1397 if (lastnode == NULL)
1398 return NULL;
1399 }
1400 if (ret == NULL)
1401 EMSG2_RET_NULL(_(e_empty_sb),
1402 reg_magic == MAGIC_ALL);
1403 lastbranch = regnode(BRANCH);
1404 br = regnode(NOTHING);
1405 if (ret != JUST_CALC_SIZE)
1406 {
1407 regtail(lastnode, br);
1408 regtail(lastbranch, br);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001409 // connect all branches to the NOTHING
1410 // branch at the end
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001411 for (br = ret; br != lastnode; )
1412 {
1413 if (OP(br) == BRANCH)
1414 {
1415 regtail(br, lastbranch);
1416 if (reg_toolong)
1417 return NULL;
1418 br = OPERAND(br);
1419 }
1420 else
1421 br = regnext(br);
1422 }
1423 }
1424 *flagp &= ~(HASWIDTH | SIMPLE);
1425 break;
1426 }
1427
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001428 case 'd': // %d123 decimal
1429 case 'o': // %o123 octal
1430 case 'x': // %xab hex 2
1431 case 'u': // %uabcd hex 4
1432 case 'U': // %U1234abcd hex 8
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001433 {
1434 long i;
1435
1436 switch (c)
1437 {
1438 case 'd': i = getdecchrs(); break;
1439 case 'o': i = getoctchrs(); break;
1440 case 'x': i = gethexchrs(2); break;
1441 case 'u': i = gethexchrs(4); break;
1442 case 'U': i = gethexchrs(8); break;
1443 default: i = -1; break;
1444 }
1445
1446 if (i < 0 || i > INT_MAX)
1447 EMSG2_RET_NULL(
1448 _("E678: Invalid character after %s%%[dxouU]"),
1449 reg_magic == MAGIC_ALL);
1450 if (use_multibytecode(i))
1451 ret = regnode(MULTIBYTECODE);
1452 else
1453 ret = regnode(EXACTLY);
1454 if (i == 0)
1455 regc(0x0a);
1456 else
1457 regmbc(i);
1458 regc(NUL);
1459 *flagp |= HASWIDTH;
1460 break;
1461 }
1462
1463 default:
1464 if (VIM_ISDIGIT(c) || c == '<' || c == '>'
1465 || c == '\'')
1466 {
1467 long_u n = 0;
1468 int cmp;
1469
1470 cmp = c;
1471 if (cmp == '<' || cmp == '>')
1472 c = getchr();
1473 while (VIM_ISDIGIT(c))
1474 {
1475 n = n * 10 + (c - '0');
1476 c = getchr();
1477 }
1478 if (c == '\'' && n == 0)
1479 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001480 // "\%'m", "\%<'m" and "\%>'m": Mark
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001481 c = getchr();
1482 ret = regnode(RE_MARK);
1483 if (ret == JUST_CALC_SIZE)
1484 regsize += 2;
1485 else
1486 {
1487 *regcode++ = c;
1488 *regcode++ = cmp;
1489 }
1490 break;
1491 }
1492 else if (c == 'l' || c == 'c' || c == 'v')
1493 {
1494 if (c == 'l')
1495 {
1496 ret = regnode(RE_LNUM);
1497 if (save_prev_at_start)
1498 at_start = TRUE;
1499 }
1500 else if (c == 'c')
1501 ret = regnode(RE_COL);
1502 else
1503 ret = regnode(RE_VCOL);
1504 if (ret == JUST_CALC_SIZE)
1505 regsize += 5;
1506 else
1507 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001508 // put the number and the optional
1509 // comparator after the opcode
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001510 regcode = re_put_long(regcode, n);
1511 *regcode++ = cmp;
1512 }
1513 break;
1514 }
1515 }
1516
1517 EMSG2_RET_NULL(_("E71: Invalid character after %s%%"),
1518 reg_magic == MAGIC_ALL);
1519 }
1520 }
1521 break;
1522
1523 case Magic('['):
1524collection:
1525 {
1526 char_u *lp;
1527
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001528 // If there is no matching ']', we assume the '[' is a normal
1529 // character. This makes 'incsearch' and ":help [" work.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001530 lp = skip_anyof(regparse);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001531 if (*lp == ']') // there is a matching ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001532 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001533 int startc = -1; // > 0 when next '-' is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001534 int endc;
1535
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001536 // In a character class, different parsing rules apply.
1537 // Not even \ is special anymore, nothing is.
1538 if (*regparse == '^') // Complement of range.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001539 {
1540 ret = regnode(ANYBUT + extra);
1541 regparse++;
1542 }
1543 else
1544 ret = regnode(ANYOF + extra);
1545
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001546 // At the start ']' and '-' mean the literal character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001547 if (*regparse == ']' || *regparse == '-')
1548 {
1549 startc = *regparse;
1550 regc(*regparse++);
1551 }
1552
1553 while (*regparse != NUL && *regparse != ']')
1554 {
1555 if (*regparse == '-')
1556 {
1557 ++regparse;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001558 // The '-' is not used for a range at the end and
1559 // after or before a '\n'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001560 if (*regparse == ']' || *regparse == NUL
1561 || startc == -1
1562 || (regparse[0] == '\\' && regparse[1] == 'n'))
1563 {
1564 regc('-');
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001565 startc = '-'; // [--x] is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001566 }
1567 else
1568 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001569 // Also accept "a-[.z.]"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001570 endc = 0;
1571 if (*regparse == '[')
1572 endc = get_coll_element(&regparse);
1573 if (endc == 0)
1574 {
1575 if (has_mbyte)
1576 endc = mb_ptr2char_adv(&regparse);
1577 else
1578 endc = *regparse++;
1579 }
1580
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001581 // Handle \o40, \x20 and \u20AC style sequences
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001582 if (endc == '\\' && !reg_cpo_lit && !reg_cpo_bsl)
1583 endc = coll_get_char();
1584
1585 if (startc > endc)
1586 EMSG_RET_NULL(_(e_reverse_range));
1587 if (has_mbyte && ((*mb_char2len)(startc) > 1
1588 || (*mb_char2len)(endc) > 1))
1589 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001590 // Limit to a range of 256 chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001591 if (endc > startc + 256)
1592 EMSG_RET_NULL(_(e_large_class));
1593 while (++startc <= endc)
1594 regmbc(startc);
1595 }
1596 else
1597 {
1598#ifdef EBCDIC
1599 int alpha_only = FALSE;
1600
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001601 // for alphabetical range skip the gaps
1602 // 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001603 if (isalpha(startc) && isalpha(endc))
1604 alpha_only = TRUE;
1605#endif
1606 while (++startc <= endc)
1607#ifdef EBCDIC
1608 if (!alpha_only || isalpha(startc))
1609#endif
1610 regc(startc);
1611 }
1612 startc = -1;
1613 }
1614 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001615 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1616 // accepts "\t", "\e", etc., but only when the 'l' flag in
1617 // 'cpoptions' is not included.
1618 // Posix doesn't recognize backslash at all.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001619 else if (*regparse == '\\'
1620 && !reg_cpo_bsl
1621 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1622 || (!reg_cpo_lit
1623 && vim_strchr(REGEXP_ABBR,
1624 regparse[1]) != NULL)))
1625 {
1626 regparse++;
1627 if (*regparse == 'n')
1628 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001629 // '\n' in range: also match NL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001630 if (ret != JUST_CALC_SIZE)
1631 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001632 // Using \n inside [^] does not change what
1633 // matches. "[^\n]" is the same as ".".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001634 if (*ret == ANYOF)
1635 {
1636 *ret = ANYOF + ADD_NL;
1637 *flagp |= HASNL;
1638 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001639 // else: must have had a \n already
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001640 }
1641 regparse++;
1642 startc = -1;
1643 }
1644 else if (*regparse == 'd'
1645 || *regparse == 'o'
1646 || *regparse == 'x'
1647 || *regparse == 'u'
1648 || *regparse == 'U')
1649 {
1650 startc = coll_get_char();
1651 if (startc == 0)
1652 regc(0x0a);
1653 else
1654 regmbc(startc);
1655 }
1656 else
1657 {
1658 startc = backslash_trans(*regparse++);
1659 regc(startc);
1660 }
1661 }
1662 else if (*regparse == '[')
1663 {
1664 int c_class;
1665 int cu;
1666
1667 c_class = get_char_class(&regparse);
1668 startc = -1;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001669 // Characters assumed to be 8 bits!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001670 switch (c_class)
1671 {
1672 case CLASS_NONE:
1673 c_class = get_equi_class(&regparse);
1674 if (c_class != 0)
1675 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001676 // produce equivalence class
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001677 reg_equi_class(c_class);
1678 }
1679 else if ((c_class =
1680 get_coll_element(&regparse)) != 0)
1681 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001682 // produce a collating element
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001683 regmbc(c_class);
1684 }
1685 else
1686 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001687 // literal '[', allow [[-x] as a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001688 startc = *regparse++;
1689 regc(startc);
1690 }
1691 break;
1692 case CLASS_ALNUM:
1693 for (cu = 1; cu < 128; cu++)
1694 if (isalnum(cu))
1695 regmbc(cu);
1696 break;
1697 case CLASS_ALPHA:
1698 for (cu = 1; cu < 128; cu++)
1699 if (isalpha(cu))
1700 regmbc(cu);
1701 break;
1702 case CLASS_BLANK:
1703 regc(' ');
1704 regc('\t');
1705 break;
1706 case CLASS_CNTRL:
1707 for (cu = 1; cu <= 127; cu++)
1708 if (iscntrl(cu))
1709 regmbc(cu);
1710 break;
1711 case CLASS_DIGIT:
1712 for (cu = 1; cu <= 127; cu++)
1713 if (VIM_ISDIGIT(cu))
1714 regmbc(cu);
1715 break;
1716 case CLASS_GRAPH:
1717 for (cu = 1; cu <= 127; cu++)
1718 if (isgraph(cu))
1719 regmbc(cu);
1720 break;
1721 case CLASS_LOWER:
1722 for (cu = 1; cu <= 255; cu++)
1723 if (MB_ISLOWER(cu) && cu != 170
1724 && cu != 186)
1725 regmbc(cu);
1726 break;
1727 case CLASS_PRINT:
1728 for (cu = 1; cu <= 255; cu++)
1729 if (vim_isprintc(cu))
1730 regmbc(cu);
1731 break;
1732 case CLASS_PUNCT:
1733 for (cu = 1; cu < 128; cu++)
1734 if (ispunct(cu))
1735 regmbc(cu);
1736 break;
1737 case CLASS_SPACE:
1738 for (cu = 9; cu <= 13; cu++)
1739 regc(cu);
1740 regc(' ');
1741 break;
1742 case CLASS_UPPER:
1743 for (cu = 1; cu <= 255; cu++)
1744 if (MB_ISUPPER(cu))
1745 regmbc(cu);
1746 break;
1747 case CLASS_XDIGIT:
1748 for (cu = 1; cu <= 255; cu++)
1749 if (vim_isxdigit(cu))
1750 regmbc(cu);
1751 break;
1752 case CLASS_TAB:
1753 regc('\t');
1754 break;
1755 case CLASS_RETURN:
1756 regc('\r');
1757 break;
1758 case CLASS_BACKSPACE:
1759 regc('\b');
1760 break;
1761 case CLASS_ESCAPE:
1762 regc('\033');
1763 break;
1764 case CLASS_IDENT:
1765 for (cu = 1; cu <= 255; cu++)
1766 if (vim_isIDc(cu))
1767 regmbc(cu);
1768 break;
1769 case CLASS_KEYWORD:
1770 for (cu = 1; cu <= 255; cu++)
1771 if (reg_iswordc(cu))
1772 regmbc(cu);
1773 break;
1774 case CLASS_FNAME:
1775 for (cu = 1; cu <= 255; cu++)
1776 if (vim_isfilec(cu))
1777 regmbc(cu);
1778 break;
1779 }
1780 }
1781 else
1782 {
1783 if (has_mbyte)
1784 {
1785 int len;
1786
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001787 // produce a multibyte character, including any
1788 // following composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001789 startc = mb_ptr2char(regparse);
1790 len = (*mb_ptr2len)(regparse);
1791 if (enc_utf8 && utf_char2len(startc) != len)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001792 startc = -1; // composing chars
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001793 while (--len >= 0)
1794 regc(*regparse++);
1795 }
1796 else
1797 {
1798 startc = *regparse++;
1799 regc(startc);
1800 }
1801 }
1802 }
1803 regc(NUL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001804 prevchr_len = 1; // last char was the ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001805 if (*regparse != ']')
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001806 EMSG_RET_NULL(_(e_toomsbra)); // Cannot happen?
1807 skipchr(); // let's be friends with the lexer again
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001808 *flagp |= HASWIDTH | SIMPLE;
1809 break;
1810 }
1811 else if (reg_strict)
1812 EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF);
1813 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001814 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001815
1816 default:
1817 {
1818 int len;
1819
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001820 // A multi-byte character is handled as a separate atom if it's
1821 // before a multi and when it's a composing char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001822 if (use_multibytecode(c))
1823 {
1824do_multibyte:
1825 ret = regnode(MULTIBYTECODE);
1826 regmbc(c);
1827 *flagp |= HASWIDTH | SIMPLE;
1828 break;
1829 }
1830
1831 ret = regnode(EXACTLY);
1832
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001833 // Append characters as long as:
1834 // - there is no following multi, we then need the character in
1835 // front of it as a single character operand
1836 // - not running into a Magic character
1837 // - "one_exactly" is not set
1838 // But always emit at least one character. Might be a Multi,
1839 // e.g., a "[" without matching "]".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001840 for (len = 0; c != NUL && (len == 0
1841 || (re_multi_type(peekchr()) == NOT_MULTI
1842 && !one_exactly
1843 && !is_Magic(c))); ++len)
1844 {
1845 c = no_Magic(c);
1846 if (has_mbyte)
1847 {
1848 regmbc(c);
1849 if (enc_utf8)
1850 {
1851 int l;
1852
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001853 // Need to get composing character too.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001854 for (;;)
1855 {
1856 l = utf_ptr2len(regparse);
1857 if (!UTF_COMPOSINGLIKE(regparse, regparse + l))
1858 break;
1859 regmbc(utf_ptr2char(regparse));
1860 skipchr();
1861 }
1862 }
1863 }
1864 else
1865 regc(c);
1866 c = getchr();
1867 }
1868 ungetchr();
1869
1870 regc(NUL);
1871 *flagp |= HASWIDTH;
1872 if (len == 1)
1873 *flagp |= SIMPLE;
1874 }
1875 break;
1876 }
1877
1878 return ret;
1879}
1880
1881/*
1882 * Parse something followed by possible [*+=].
1883 *
1884 * Note that the branching code sequences used for = and the general cases
1885 * of * and + are somewhat optimized: they use the same NOTHING node as
1886 * both the endmarker for their branch list and the body of the last branch.
1887 * It might seem that this node could be dispensed with entirely, but the
1888 * endmarker role is not redundant.
1889 */
1890 static char_u *
1891regpiece(int *flagp)
1892{
1893 char_u *ret;
1894 int op;
1895 char_u *next;
1896 int flags;
1897 long minval;
1898 long maxval;
1899
1900 ret = regatom(&flags);
1901 if (ret == NULL)
1902 return NULL;
1903
1904 op = peekchr();
1905 if (re_multi_type(op) == NOT_MULTI)
1906 {
1907 *flagp = flags;
1908 return ret;
1909 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001910 // default flags
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001911 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1912
1913 skipchr();
1914 switch (op)
1915 {
1916 case Magic('*'):
1917 if (flags & SIMPLE)
1918 reginsert(STAR, ret);
1919 else
1920 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001921 // Emit x* as (x&|), where & means "self".
1922 reginsert(BRANCH, ret); // Either x
1923 regoptail(ret, regnode(BACK)); // and loop
1924 regoptail(ret, ret); // back
1925 regtail(ret, regnode(BRANCH)); // or
1926 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001927 }
1928 break;
1929
1930 case Magic('+'):
1931 if (flags & SIMPLE)
1932 reginsert(PLUS, ret);
1933 else
1934 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001935 // Emit x+ as x(&|), where & means "self".
1936 next = regnode(BRANCH); // Either
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001937 regtail(ret, next);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001938 regtail(regnode(BACK), ret); // loop back
1939 regtail(next, regnode(BRANCH)); // or
1940 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001941 }
1942 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1943 break;
1944
1945 case Magic('@'):
1946 {
1947 int lop = END;
1948 long nr;
1949
1950 nr = getdecchrs();
1951 switch (no_Magic(getchr()))
1952 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001953 case '=': lop = MATCH; break; // \@=
1954 case '!': lop = NOMATCH; break; // \@!
1955 case '>': lop = SUBPAT; break; // \@>
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001956 case '<': switch (no_Magic(getchr()))
1957 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001958 case '=': lop = BEHIND; break; // \@<=
1959 case '!': lop = NOBEHIND; break; // \@<!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001960 }
1961 }
1962 if (lop == END)
1963 EMSG2_RET_NULL(_("E59: invalid character after %s@"),
1964 reg_magic == MAGIC_ALL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001965 // Look behind must match with behind_pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001966 if (lop == BEHIND || lop == NOBEHIND)
1967 {
1968 regtail(ret, regnode(BHPOS));
1969 *flagp |= HASLOOKBH;
1970 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001971 regtail(ret, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001972 if (lop == BEHIND || lop == NOBEHIND)
1973 {
1974 if (nr < 0)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001975 nr = 0; // no limit is same as zero limit
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001976 reginsert_nr(lop, nr, ret);
1977 }
1978 else
1979 reginsert(lop, ret);
1980 break;
1981 }
1982
1983 case Magic('?'):
1984 case Magic('='):
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001985 // Emit x= as (x|)
1986 reginsert(BRANCH, ret); // Either x
1987 regtail(ret, regnode(BRANCH)); // or
1988 next = regnode(NOTHING); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001989 regtail(ret, next);
1990 regoptail(ret, next);
1991 break;
1992
1993 case Magic('{'):
1994 if (!read_limits(&minval, &maxval))
1995 return NULL;
1996 if (flags & SIMPLE)
1997 {
1998 reginsert(BRACE_SIMPLE, ret);
1999 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2000 }
2001 else
2002 {
2003 if (num_complex_braces >= 10)
2004 EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"),
2005 reg_magic == MAGIC_ALL);
2006 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
2007 regoptail(ret, regnode(BACK));
2008 regoptail(ret, ret);
2009 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2010 ++num_complex_braces;
2011 }
2012 if (minval > 0 && maxval > 0)
2013 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2014 break;
2015 }
2016 if (re_multi_type(peekchr()) != NOT_MULTI)
2017 {
2018 // Can't have a multi follow a multi.
2019 if (peekchr() == Magic('*'))
2020 EMSG2_RET_NULL(_("E61: Nested %s*"), reg_magic >= MAGIC_ON);
2021 EMSG3_RET_NULL(_("E62: Nested %s%c"), reg_magic == MAGIC_ALL,
2022 no_Magic(peekchr()));
2023 }
2024
2025 return ret;
2026}
2027
2028/*
2029 * Parse one alternative of an | or & operator.
2030 * Implements the concatenation operator.
2031 */
2032 static char_u *
2033regconcat(int *flagp)
2034{
2035 char_u *first = NULL;
2036 char_u *chain = NULL;
2037 char_u *latest;
2038 int flags;
2039 int cont = TRUE;
2040
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002041 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002042
2043 while (cont)
2044 {
2045 switch (peekchr())
2046 {
2047 case NUL:
2048 case Magic('|'):
2049 case Magic('&'):
2050 case Magic(')'):
2051 cont = FALSE;
2052 break;
2053 case Magic('Z'):
2054 regflags |= RF_ICOMBINE;
2055 skipchr_keepstart();
2056 break;
2057 case Magic('c'):
2058 regflags |= RF_ICASE;
2059 skipchr_keepstart();
2060 break;
2061 case Magic('C'):
2062 regflags |= RF_NOICASE;
2063 skipchr_keepstart();
2064 break;
2065 case Magic('v'):
2066 reg_magic = MAGIC_ALL;
2067 skipchr_keepstart();
2068 curchr = -1;
2069 break;
2070 case Magic('m'):
2071 reg_magic = MAGIC_ON;
2072 skipchr_keepstart();
2073 curchr = -1;
2074 break;
2075 case Magic('M'):
2076 reg_magic = MAGIC_OFF;
2077 skipchr_keepstart();
2078 curchr = -1;
2079 break;
2080 case Magic('V'):
2081 reg_magic = MAGIC_NONE;
2082 skipchr_keepstart();
2083 curchr = -1;
2084 break;
2085 default:
2086 latest = regpiece(&flags);
2087 if (latest == NULL || reg_toolong)
2088 return NULL;
2089 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002090 if (chain == NULL) // First piece.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002091 *flagp |= flags & SPSTART;
2092 else
2093 regtail(chain, latest);
2094 chain = latest;
2095 if (first == NULL)
2096 first = latest;
2097 break;
2098 }
2099 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002100 if (first == NULL) // Loop ran zero times.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002101 first = regnode(NOTHING);
2102 return first;
2103}
2104
2105/*
2106 * Parse one alternative of an | operator.
2107 * Implements the & operator.
2108 */
2109 static char_u *
2110regbranch(int *flagp)
2111{
2112 char_u *ret;
2113 char_u *chain = NULL;
2114 char_u *latest;
2115 int flags;
2116
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002117 *flagp = WORST | HASNL; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002118
2119 ret = regnode(BRANCH);
2120 for (;;)
2121 {
2122 latest = regconcat(&flags);
2123 if (latest == NULL)
2124 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002125 // If one of the branches has width, the whole thing has. If one of
2126 // the branches anchors at start-of-line, the whole thing does.
2127 // If one of the branches uses look-behind, the whole thing does.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002128 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002129 // If one of the branches doesn't match a line-break, the whole thing
2130 // doesn't.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002131 *flagp &= ~HASNL | (flags & HASNL);
2132 if (chain != NULL)
2133 regtail(chain, latest);
2134 if (peekchr() != Magic('&'))
2135 break;
2136 skipchr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002137 regtail(latest, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002138 if (reg_toolong)
2139 break;
2140 reginsert(MATCH, latest);
2141 chain = latest;
2142 }
2143
2144 return ret;
2145}
2146
2147/*
2148 * Parse regular expression, i.e. main body or parenthesized thing.
2149 *
2150 * Caller must absorb opening parenthesis.
2151 *
2152 * Combining parenthesis handling with the base level of regular expression
2153 * is a trifle forced, but the need to tie the tails of the branches to what
2154 * follows makes it hard to avoid.
2155 */
2156 static char_u *
2157reg(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002158 int paren, // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002159 int *flagp)
2160{
2161 char_u *ret;
2162 char_u *br;
2163 char_u *ender;
2164 int parno = 0;
2165 int flags;
2166
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002167 *flagp = HASWIDTH; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002168
2169#ifdef FEAT_SYN_HL
2170 if (paren == REG_ZPAREN)
2171 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002172 // Make a ZOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002173 if (regnzpar >= NSUBEXP)
2174 EMSG_RET_NULL(_("E50: Too many \\z("));
2175 parno = regnzpar;
2176 regnzpar++;
2177 ret = regnode(ZOPEN + parno);
2178 }
2179 else
2180#endif
2181 if (paren == REG_PAREN)
2182 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002183 // Make a MOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002184 if (regnpar >= NSUBEXP)
2185 EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
2186 parno = regnpar;
2187 ++regnpar;
2188 ret = regnode(MOPEN + parno);
2189 }
2190 else if (paren == REG_NPAREN)
2191 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002192 // Make a NOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002193 ret = regnode(NOPEN);
2194 }
2195 else
2196 ret = NULL;
2197
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002198 // Pick up the branches, linking them together.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002199 br = regbranch(&flags);
2200 if (br == NULL)
2201 return NULL;
2202 if (ret != NULL)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002203 regtail(ret, br); // [MZ]OPEN -> first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002204 else
2205 ret = br;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002206 // If one of the branches can be zero-width, the whole thing can.
2207 // If one of the branches has * at start or matches a line-break, the
2208 // whole thing can.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002209 if (!(flags & HASWIDTH))
2210 *flagp &= ~HASWIDTH;
2211 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2212 while (peekchr() == Magic('|'))
2213 {
2214 skipchr();
2215 br = regbranch(&flags);
2216 if (br == NULL || reg_toolong)
2217 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002218 regtail(ret, br); // BRANCH -> BRANCH.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002219 if (!(flags & HASWIDTH))
2220 *flagp &= ~HASWIDTH;
2221 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2222 }
2223
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002224 // Make a closing node, and hook it on the end.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002225 ender = regnode(
2226#ifdef FEAT_SYN_HL
2227 paren == REG_ZPAREN ? ZCLOSE + parno :
2228#endif
2229 paren == REG_PAREN ? MCLOSE + parno :
2230 paren == REG_NPAREN ? NCLOSE : END);
2231 regtail(ret, ender);
2232
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002233 // Hook the tails of the branches to the closing node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002234 for (br = ret; br != NULL; br = regnext(br))
2235 regoptail(br, ender);
2236
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002237 // Check for proper termination.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002238 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2239 {
2240#ifdef FEAT_SYN_HL
2241 if (paren == REG_ZPAREN)
2242 EMSG_RET_NULL(_("E52: Unmatched \\z("));
2243 else
2244#endif
2245 if (paren == REG_NPAREN)
2246 EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
2247 else
2248 EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
2249 }
2250 else if (paren == REG_NOPAREN && peekchr() != NUL)
2251 {
2252 if (curchr == Magic(')'))
2253 EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
2254 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002255 EMSG_RET_NULL(_(e_trailing)); // "Can't happen".
2256 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002257 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002258 // Here we set the flag allowing back references to this set of
2259 // parentheses.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002260 if (paren == REG_PAREN)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002261 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002262 return ret;
2263}
2264
2265/*
2266 * bt_regcomp() - compile a regular expression into internal code for the
2267 * traditional back track matcher.
2268 * Returns the program in allocated space. Returns NULL for an error.
2269 *
2270 * We can't allocate space until we know how big the compiled form will be,
2271 * but we can't compile it (and thus know how big it is) until we've got a
2272 * place to put the code. So we cheat: we compile it twice, once with code
2273 * generation turned off and size counting turned on, and once "for real".
2274 * This also means that we don't allocate space until we are sure that the
2275 * thing really will compile successfully, and we never have to move the
2276 * code and thus invalidate pointers into it. (Note that it has to be in
2277 * one piece because vim_free() must be able to free it all.)
2278 *
2279 * Whether upper/lower case is to be ignored is decided when executing the
2280 * program, it does not matter here.
2281 *
2282 * Beware that the optimization-preparation code in here knows about some
2283 * of the structure of the compiled regexp.
2284 * "re_flags": RE_MAGIC and/or RE_STRING.
2285 */
2286 static regprog_T *
2287bt_regcomp(char_u *expr, int re_flags)
2288{
2289 bt_regprog_T *r;
2290 char_u *scan;
2291 char_u *longest;
2292 int len;
2293 int flags;
2294
2295 if (expr == NULL)
2296 EMSG_RET_NULL(_(e_null));
2297
2298 init_class_tab();
2299
2300 // First pass: determine size, legality.
2301 regcomp_start(expr, re_flags);
2302 regcode = JUST_CALC_SIZE;
2303 regc(REGMAGIC);
2304 if (reg(REG_NOPAREN, &flags) == NULL)
2305 return NULL;
2306
2307 // Allocate space.
2308 r = alloc(offsetof(bt_regprog_T, program) + regsize);
2309 if (r == NULL)
2310 return NULL;
2311 r->re_in_use = FALSE;
2312
2313 // Second pass: emit code.
2314 regcomp_start(expr, re_flags);
2315 regcode = r->program;
2316 regc(REGMAGIC);
2317 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
2318 {
2319 vim_free(r);
2320 if (reg_toolong)
2321 EMSG_RET_NULL(_("E339: Pattern too long"));
2322 return NULL;
2323 }
2324
2325 // Dig out information for optimizations.
2326 r->regstart = NUL; // Worst-case defaults.
2327 r->reganch = 0;
2328 r->regmust = NULL;
2329 r->regmlen = 0;
2330 r->regflags = regflags;
2331 if (flags & HASNL)
2332 r->regflags |= RF_HASNL;
2333 if (flags & HASLOOKBH)
2334 r->regflags |= RF_LOOKBH;
2335#ifdef FEAT_SYN_HL
2336 // Remember whether this pattern has any \z specials in it.
2337 r->reghasz = re_has_z;
2338#endif
2339 scan = r->program + 1; // First BRANCH.
2340 if (OP(regnext(scan)) == END) // Only one top-level choice.
2341 {
2342 scan = OPERAND(scan);
2343
2344 // Starting-point info.
2345 if (OP(scan) == BOL || OP(scan) == RE_BOF)
2346 {
2347 r->reganch++;
2348 scan = regnext(scan);
2349 }
2350
2351 if (OP(scan) == EXACTLY)
2352 {
2353 if (has_mbyte)
2354 r->regstart = (*mb_ptr2char)(OPERAND(scan));
2355 else
2356 r->regstart = *OPERAND(scan);
2357 }
2358 else if ((OP(scan) == BOW
2359 || OP(scan) == EOW
2360 || OP(scan) == NOTHING
2361 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
2362 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
2363 && OP(regnext(scan)) == EXACTLY)
2364 {
2365 if (has_mbyte)
2366 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
2367 else
2368 r->regstart = *OPERAND(regnext(scan));
2369 }
2370
2371 // If there's something expensive in the r.e., find the longest
2372 // literal string that must appear and make it the regmust. Resolve
2373 // ties in favor of later strings, since the regstart check works
2374 // with the beginning of the r.e. and avoiding duplication
2375 // strengthens checking. Not a strong reason, but sufficient in the
2376 // absence of others.
2377
2378 // When the r.e. starts with BOW, it is faster to look for a regmust
2379 // first. Used a lot for "#" and "*" commands. (Added by mool).
2380 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
2381 && !(flags & HASNL))
2382 {
2383 longest = NULL;
2384 len = 0;
2385 for (; scan != NULL; scan = regnext(scan))
2386 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
2387 {
2388 longest = OPERAND(scan);
2389 len = (int)STRLEN(OPERAND(scan));
2390 }
2391 r->regmust = longest;
2392 r->regmlen = len;
2393 }
2394 }
2395#ifdef BT_REGEXP_DUMP
2396 regdump(expr, r);
2397#endif
2398 r->engine = &bt_regengine;
2399 return (regprog_T *)r;
2400}
2401
2402#if defined(FEAT_SYN_HL) || defined(PROTO)
2403/*
2404 * Check if during the previous call to vim_regcomp the EOL item "$" has been
2405 * found. This is messy, but it works fine.
2406 */
2407 int
2408vim_regcomp_had_eol(void)
2409{
2410 return had_eol;
2411}
2412#endif
2413
2414/*
2415 * Get a number after a backslash that is inside [].
2416 * When nothing is recognized return a backslash.
2417 */
2418 static int
2419coll_get_char(void)
2420{
2421 long nr = -1;
2422
2423 switch (*regparse++)
2424 {
2425 case 'd': nr = getdecchrs(); break;
2426 case 'o': nr = getoctchrs(); break;
2427 case 'x': nr = gethexchrs(2); break;
2428 case 'u': nr = gethexchrs(4); break;
2429 case 'U': nr = gethexchrs(8); break;
2430 }
2431 if (nr < 0 || nr > INT_MAX)
2432 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002433 // If getting the number fails be backwards compatible: the character
2434 // is a backslash.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002435 --regparse;
2436 nr = '\\';
2437 }
2438 return nr;
2439}
2440
2441/*
2442 * Free a compiled regexp program, returned by bt_regcomp().
2443 */
2444 static void
2445bt_regfree(regprog_T *prog)
2446{
2447 vim_free(prog);
2448}
2449
2450#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)
2451
2452/*
2453 * The arguments from BRACE_LIMITS are stored here. They are actually local
2454 * to regmatch(), but they are here to reduce the amount of stack space used
2455 * (it can be called recursively many times).
2456 */
2457static long bl_minval;
2458static long bl_maxval;
2459
2460/*
2461 * Save the input line and position in a regsave_T.
2462 */
2463 static void
2464reg_save(regsave_T *save, garray_T *gap)
2465{
2466 if (REG_MULTI)
2467 {
2468 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
2469 save->rs_u.pos.lnum = rex.lnum;
2470 }
2471 else
2472 save->rs_u.ptr = rex.input;
2473 save->rs_len = gap->ga_len;
2474}
2475
2476/*
2477 * Restore the input line and position from a regsave_T.
2478 */
2479 static void
2480reg_restore(regsave_T *save, garray_T *gap)
2481{
2482 if (REG_MULTI)
2483 {
2484 if (rex.lnum != save->rs_u.pos.lnum)
2485 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002486 // only call reg_getline() when the line number changed to save
2487 // a bit of time
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002488 rex.lnum = save->rs_u.pos.lnum;
2489 rex.line = reg_getline(rex.lnum);
2490 }
2491 rex.input = rex.line + save->rs_u.pos.col;
2492 }
2493 else
2494 rex.input = save->rs_u.ptr;
2495 gap->ga_len = save->rs_len;
2496}
2497
2498/*
2499 * Return TRUE if current position is equal to saved position.
2500 */
2501 static int
2502reg_save_equal(regsave_T *save)
2503{
2504 if (REG_MULTI)
2505 return rex.lnum == save->rs_u.pos.lnum
2506 && rex.input == rex.line + save->rs_u.pos.col;
2507 return rex.input == save->rs_u.ptr;
2508}
2509
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002510// Save the sub-expressions before attempting a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002511#define save_se(savep, posp, pp) \
2512 REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))
2513
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002514// After a failed match restore the sub-expressions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002515#define restore_se(savep, posp, pp) { \
2516 if (REG_MULTI) \
2517 *(posp) = (savep)->se_u.pos; \
2518 else \
2519 *(pp) = (savep)->se_u.ptr; }
2520
2521/*
2522 * Tentatively set the sub-expression start to the current position (after
2523 * calling regmatch() they will have changed). Need to save the existing
2524 * values for when there is no match.
2525 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
2526 * depending on REG_MULTI.
2527 */
2528 static void
2529save_se_multi(save_se_T *savep, lpos_T *posp)
2530{
2531 savep->se_u.pos = *posp;
2532 posp->lnum = rex.lnum;
2533 posp->col = (colnr_T)(rex.input - rex.line);
2534}
2535
2536 static void
2537save_se_one(save_se_T *savep, char_u **pp)
2538{
2539 savep->se_u.ptr = *pp;
2540 *pp = rex.input;
2541}
2542
2543/*
2544 * regrepeat - repeatedly match something simple, return how many.
2545 * Advances rex.input (and rex.lnum) to just after the matched chars.
2546 */
2547 static int
2548regrepeat(
2549 char_u *p,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002550 long maxcount) // maximum number of matches allowed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002551{
2552 long count = 0;
2553 char_u *scan;
2554 char_u *opnd;
2555 int mask;
2556 int testval = 0;
2557
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002558 scan = rex.input; // Make local copy of rex.input for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002559 opnd = OPERAND(p);
2560 switch (OP(p))
2561 {
2562 case ANY:
2563 case ANY + ADD_NL:
2564 while (count < maxcount)
2565 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002566 // Matching anything means we continue until end-of-line (or
2567 // end-of-file for ANY + ADD_NL), only limited by maxcount.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002568 while (*scan != NUL && count < maxcount)
2569 {
2570 ++count;
2571 MB_PTR_ADV(scan);
2572 }
2573 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2574 || rex.reg_line_lbr || count == maxcount)
2575 break;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002576 ++count; // count the line-break
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002577 reg_nextline();
2578 scan = rex.input;
2579 if (got_int)
2580 break;
2581 }
2582 break;
2583
2584 case IDENT:
2585 case IDENT + ADD_NL:
2586 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002587 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002588 case SIDENT:
2589 case SIDENT + ADD_NL:
2590 while (count < maxcount)
2591 {
2592 if (vim_isIDc(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2593 {
2594 MB_PTR_ADV(scan);
2595 }
2596 else if (*scan == NUL)
2597 {
2598 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2599 || rex.reg_line_lbr)
2600 break;
2601 reg_nextline();
2602 scan = rex.input;
2603 if (got_int)
2604 break;
2605 }
2606 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2607 ++scan;
2608 else
2609 break;
2610 ++count;
2611 }
2612 break;
2613
2614 case KWORD:
2615 case KWORD + ADD_NL:
2616 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002617 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002618 case SKWORD:
2619 case SKWORD + ADD_NL:
2620 while (count < maxcount)
2621 {
2622 if (vim_iswordp_buf(scan, rex.reg_buf)
2623 && (testval || !VIM_ISDIGIT(*scan)))
2624 {
2625 MB_PTR_ADV(scan);
2626 }
2627 else if (*scan == NUL)
2628 {
2629 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2630 || rex.reg_line_lbr)
2631 break;
2632 reg_nextline();
2633 scan = rex.input;
2634 if (got_int)
2635 break;
2636 }
2637 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2638 ++scan;
2639 else
2640 break;
2641 ++count;
2642 }
2643 break;
2644
2645 case FNAME:
2646 case FNAME + ADD_NL:
2647 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002648 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002649 case SFNAME:
2650 case SFNAME + ADD_NL:
2651 while (count < maxcount)
2652 {
2653 if (vim_isfilec(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2654 {
2655 MB_PTR_ADV(scan);
2656 }
2657 else if (*scan == NUL)
2658 {
2659 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2660 || rex.reg_line_lbr)
2661 break;
2662 reg_nextline();
2663 scan = rex.input;
2664 if (got_int)
2665 break;
2666 }
2667 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2668 ++scan;
2669 else
2670 break;
2671 ++count;
2672 }
2673 break;
2674
2675 case PRINT:
2676 case PRINT + ADD_NL:
2677 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002678 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002679 case SPRINT:
2680 case SPRINT + ADD_NL:
2681 while (count < maxcount)
2682 {
2683 if (*scan == NUL)
2684 {
2685 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2686 || rex.reg_line_lbr)
2687 break;
2688 reg_nextline();
2689 scan = rex.input;
2690 if (got_int)
2691 break;
2692 }
2693 else if (vim_isprintc(PTR2CHAR(scan)) == 1
2694 && (testval || !VIM_ISDIGIT(*scan)))
2695 {
2696 MB_PTR_ADV(scan);
2697 }
2698 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2699 ++scan;
2700 else
2701 break;
2702 ++count;
2703 }
2704 break;
2705
2706 case WHITE:
2707 case WHITE + ADD_NL:
2708 testval = mask = RI_WHITE;
2709do_class:
2710 while (count < maxcount)
2711 {
2712 int l;
2713
2714 if (*scan == NUL)
2715 {
2716 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2717 || rex.reg_line_lbr)
2718 break;
2719 reg_nextline();
2720 scan = rex.input;
2721 if (got_int)
2722 break;
2723 }
2724 else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1)
2725 {
2726 if (testval != 0)
2727 break;
2728 scan += l;
2729 }
2730 else if ((class_tab[*scan] & mask) == testval)
2731 ++scan;
2732 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2733 ++scan;
2734 else
2735 break;
2736 ++count;
2737 }
2738 break;
2739
2740 case NWHITE:
2741 case NWHITE + ADD_NL:
2742 mask = RI_WHITE;
2743 goto do_class;
2744 case DIGIT:
2745 case DIGIT + ADD_NL:
2746 testval = mask = RI_DIGIT;
2747 goto do_class;
2748 case NDIGIT:
2749 case NDIGIT + ADD_NL:
2750 mask = RI_DIGIT;
2751 goto do_class;
2752 case HEX:
2753 case HEX + ADD_NL:
2754 testval = mask = RI_HEX;
2755 goto do_class;
2756 case NHEX:
2757 case NHEX + ADD_NL:
2758 mask = RI_HEX;
2759 goto do_class;
2760 case OCTAL:
2761 case OCTAL + ADD_NL:
2762 testval = mask = RI_OCTAL;
2763 goto do_class;
2764 case NOCTAL:
2765 case NOCTAL + ADD_NL:
2766 mask = RI_OCTAL;
2767 goto do_class;
2768 case WORD:
2769 case WORD + ADD_NL:
2770 testval = mask = RI_WORD;
2771 goto do_class;
2772 case NWORD:
2773 case NWORD + ADD_NL:
2774 mask = RI_WORD;
2775 goto do_class;
2776 case HEAD:
2777 case HEAD + ADD_NL:
2778 testval = mask = RI_HEAD;
2779 goto do_class;
2780 case NHEAD:
2781 case NHEAD + ADD_NL:
2782 mask = RI_HEAD;
2783 goto do_class;
2784 case ALPHA:
2785 case ALPHA + ADD_NL:
2786 testval = mask = RI_ALPHA;
2787 goto do_class;
2788 case NALPHA:
2789 case NALPHA + ADD_NL:
2790 mask = RI_ALPHA;
2791 goto do_class;
2792 case LOWER:
2793 case LOWER + ADD_NL:
2794 testval = mask = RI_LOWER;
2795 goto do_class;
2796 case NLOWER:
2797 case NLOWER + ADD_NL:
2798 mask = RI_LOWER;
2799 goto do_class;
2800 case UPPER:
2801 case UPPER + ADD_NL:
2802 testval = mask = RI_UPPER;
2803 goto do_class;
2804 case NUPPER:
2805 case NUPPER + ADD_NL:
2806 mask = RI_UPPER;
2807 goto do_class;
2808
2809 case EXACTLY:
2810 {
2811 int cu, cl;
2812
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002813 // This doesn't do a multi-byte character, because a MULTIBYTECODE
2814 // would have been used for it. It does handle single-byte
2815 // characters, such as latin1.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002816 if (rex.reg_ic)
2817 {
2818 cu = MB_TOUPPER(*opnd);
2819 cl = MB_TOLOWER(*opnd);
2820 while (count < maxcount && (*scan == cu || *scan == cl))
2821 {
2822 count++;
2823 scan++;
2824 }
2825 }
2826 else
2827 {
2828 cu = *opnd;
2829 while (count < maxcount && *scan == cu)
2830 {
2831 count++;
2832 scan++;
2833 }
2834 }
2835 break;
2836 }
2837
2838 case MULTIBYTECODE:
2839 {
2840 int i, len, cf = 0;
2841
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002842 // Safety check (just in case 'encoding' was changed since
2843 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002844 if ((len = (*mb_ptr2len)(opnd)) > 1)
2845 {
2846 if (rex.reg_ic && enc_utf8)
2847 cf = utf_fold(utf_ptr2char(opnd));
2848 while (count < maxcount && (*mb_ptr2len)(scan) >= len)
2849 {
2850 for (i = 0; i < len; ++i)
2851 if (opnd[i] != scan[i])
2852 break;
2853 if (i < len && (!rex.reg_ic || !enc_utf8
2854 || utf_fold(utf_ptr2char(scan)) != cf))
2855 break;
2856 scan += len;
2857 ++count;
2858 }
2859 }
2860 }
2861 break;
2862
2863 case ANYOF:
2864 case ANYOF + ADD_NL:
2865 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002866 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002867
2868 case ANYBUT:
2869 case ANYBUT + ADD_NL:
2870 while (count < maxcount)
2871 {
2872 int len;
2873
2874 if (*scan == NUL)
2875 {
2876 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2877 || rex.reg_line_lbr)
2878 break;
2879 reg_nextline();
2880 scan = rex.input;
2881 if (got_int)
2882 break;
2883 }
2884 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2885 ++scan;
2886 else if (has_mbyte && (len = (*mb_ptr2len)(scan)) > 1)
2887 {
2888 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
2889 break;
2890 scan += len;
2891 }
2892 else
2893 {
2894 if ((cstrchr(opnd, *scan) == NULL) == testval)
2895 break;
2896 ++scan;
2897 }
2898 ++count;
2899 }
2900 break;
2901
2902 case NEWL:
2903 while (count < maxcount
2904 && ((*scan == NUL && rex.lnum <= rex.reg_maxline
2905 && !rex.reg_line_lbr && REG_MULTI)
2906 || (*scan == '\n' && rex.reg_line_lbr)))
2907 {
2908 count++;
2909 if (rex.reg_line_lbr)
2910 ADVANCE_REGINPUT();
2911 else
2912 reg_nextline();
2913 scan = rex.input;
2914 if (got_int)
2915 break;
2916 }
2917 break;
2918
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002919 default: // Oh dear. Called inappropriately.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002920 emsg(_(e_re_corr));
2921#ifdef DEBUG
2922 printf("Called regrepeat with op code %d\n", OP(p));
2923#endif
2924 break;
2925 }
2926
2927 rex.input = scan;
2928
2929 return (int)count;
2930}
2931
2932/*
2933 * Push an item onto the regstack.
2934 * Returns pointer to new item. Returns NULL when out of memory.
2935 */
2936 static regitem_T *
2937regstack_push(regstate_T state, char_u *scan)
2938{
2939 regitem_T *rp;
2940
2941 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
2942 {
2943 emsg(_(e_maxmempat));
2944 return NULL;
2945 }
2946 if (ga_grow(&regstack, sizeof(regitem_T)) == FAIL)
2947 return NULL;
2948
2949 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
2950 rp->rs_state = state;
2951 rp->rs_scan = scan;
2952
2953 regstack.ga_len += sizeof(regitem_T);
2954 return rp;
2955}
2956
2957/*
2958 * Pop an item from the regstack.
2959 */
2960 static void
2961regstack_pop(char_u **scan)
2962{
2963 regitem_T *rp;
2964
2965 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
2966 *scan = rp->rs_scan;
2967
2968 regstack.ga_len -= sizeof(regitem_T);
2969}
2970
2971/*
2972 * Save the current subexpr to "bp", so that they can be restored
2973 * later by restore_subexpr().
2974 */
2975 static void
2976save_subexpr(regbehind_T *bp)
2977{
2978 int i;
2979
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002980 // When "rex.need_clear_subexpr" is set we don't need to save the values,
2981 // only remember that this flag needs to be set again when restoring.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002982 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
2983 if (!rex.need_clear_subexpr)
2984 {
2985 for (i = 0; i < NSUBEXP; ++i)
2986 {
2987 if (REG_MULTI)
2988 {
2989 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
2990 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
2991 }
2992 else
2993 {
2994 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
2995 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
2996 }
2997 }
2998 }
2999}
3000
3001/*
3002 * Restore the subexpr from "bp".
3003 */
3004 static void
3005restore_subexpr(regbehind_T *bp)
3006{
3007 int i;
3008
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003009 // Only need to restore saved values when they are not to be cleared.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003010 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
3011 if (!rex.need_clear_subexpr)
3012 {
3013 for (i = 0; i < NSUBEXP; ++i)
3014 {
3015 if (REG_MULTI)
3016 {
3017 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
3018 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
3019 }
3020 else
3021 {
3022 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
3023 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
3024 }
3025 }
3026 }
3027}
3028
3029/*
3030 * regmatch - main matching routine
3031 *
3032 * Conceptually the strategy is simple: Check to see whether the current node
3033 * matches, push an item onto the regstack and loop to see whether the rest
3034 * matches, and then act accordingly. In practice we make some effort to
3035 * avoid using the regstack, in particular by going through "ordinary" nodes
3036 * (that don't need to know whether the rest of the match failed) by a nested
3037 * loop.
3038 *
3039 * Returns TRUE when there is a match. Leaves rex.input and rex.lnum just after
3040 * the last matched character.
3041 * Returns FALSE when there is no match. Leaves rex.input and rex.lnum in an
3042 * undefined state!
3043 */
3044 static int
3045regmatch(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003046 char_u *scan, // Current node.
3047 proftime_T *tm UNUSED, // timeout limit or NULL
3048 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003049{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003050 char_u *next; // Next node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003051 int op;
3052 int c;
3053 regitem_T *rp;
3054 int no;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003055 int status; // one of the RA_ values:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003056#ifdef FEAT_RELTIME
3057 int tm_count = 0;
3058#endif
3059
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003060 // Make "regstack" and "backpos" empty. They are allocated and freed in
3061 // bt_regexec_both() to reduce malloc()/free() calls.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003062 regstack.ga_len = 0;
3063 backpos.ga_len = 0;
3064
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003065 // Repeat until "regstack" is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003066 for (;;)
3067 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003068 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3069 // Allow interrupting them with CTRL-C.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003070 fast_breakcheck();
3071
3072#ifdef DEBUG
3073 if (scan != NULL && regnarrate)
3074 {
3075 mch_errmsg((char *)regprop(scan));
3076 mch_errmsg("(\n");
3077 }
3078#endif
3079
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003080 // Repeat for items that can be matched sequentially, without using the
3081 // regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003082 for (;;)
3083 {
3084 if (got_int || scan == NULL)
3085 {
3086 status = RA_FAIL;
3087 break;
3088 }
3089#ifdef FEAT_RELTIME
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003090 // Check for timeout once in a 100 times to avoid overhead.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003091 if (tm != NULL && ++tm_count == 100)
3092 {
3093 tm_count = 0;
3094 if (profile_passed_limit(tm))
3095 {
3096 if (timed_out != NULL)
3097 *timed_out = TRUE;
3098 status = RA_FAIL;
3099 break;
3100 }
3101 }
3102#endif
3103 status = RA_CONT;
3104
3105#ifdef DEBUG
3106 if (regnarrate)
3107 {
3108 mch_errmsg((char *)regprop(scan));
3109 mch_errmsg("...\n");
3110# ifdef FEAT_SYN_HL
3111 if (re_extmatch_in != NULL)
3112 {
3113 int i;
3114
3115 mch_errmsg(_("External submatches:\n"));
3116 for (i = 0; i < NSUBEXP; i++)
3117 {
3118 mch_errmsg(" \"");
3119 if (re_extmatch_in->matches[i] != NULL)
3120 mch_errmsg((char *)re_extmatch_in->matches[i]);
3121 mch_errmsg("\"\n");
3122 }
3123 }
3124# endif
3125 }
3126#endif
3127 next = regnext(scan);
3128
3129 op = OP(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003130 // Check for character class with NL added.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003131 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
3132 && *rex.input == NUL && rex.lnum <= rex.reg_maxline)
3133 {
3134 reg_nextline();
3135 }
3136 else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n')
3137 {
3138 ADVANCE_REGINPUT();
3139 }
3140 else
3141 {
3142 if (WITH_NL(op))
3143 op -= ADD_NL;
3144 if (has_mbyte)
3145 c = (*mb_ptr2char)(rex.input);
3146 else
3147 c = *rex.input;
3148 switch (op)
3149 {
3150 case BOL:
3151 if (rex.input != rex.line)
3152 status = RA_NOMATCH;
3153 break;
3154
3155 case EOL:
3156 if (c != NUL)
3157 status = RA_NOMATCH;
3158 break;
3159
3160 case RE_BOF:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003161 // We're not at the beginning of the file when below the first
3162 // line where we started, not at the start of the line or we
3163 // didn't start at the first line of the buffer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003164 if (rex.lnum != 0 || rex.input != rex.line
3165 || (REG_MULTI && rex.reg_firstlnum > 1))
3166 status = RA_NOMATCH;
3167 break;
3168
3169 case RE_EOF:
3170 if (rex.lnum != rex.reg_maxline || c != NUL)
3171 status = RA_NOMATCH;
3172 break;
3173
3174 case CURSOR:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003175 // Check if the buffer is in a window and compare the
3176 // rex.reg_win->w_cursor position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003177 if (rex.reg_win == NULL
3178 || (rex.lnum + rex.reg_firstlnum
3179 != rex.reg_win->w_cursor.lnum)
3180 || ((colnr_T)(rex.input - rex.line)
3181 != rex.reg_win->w_cursor.col))
3182 status = RA_NOMATCH;
3183 break;
3184
3185 case RE_MARK:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003186 // Compare the mark position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003187 {
3188 int mark = OPERAND(scan)[0];
3189 int cmp = OPERAND(scan)[1];
3190 pos_T *pos;
3191
3192 pos = getmark_buf(rex.reg_buf, mark, FALSE);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003193 if (pos == NULL // mark doesn't exist
3194 || pos->lnum <= 0 // mark isn't set in reg_buf
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003195 || (pos->lnum == rex.lnum + rex.reg_firstlnum
3196 ? (pos->col == (colnr_T)(rex.input - rex.line)
3197 ? (cmp == '<' || cmp == '>')
3198 : (pos->col < (colnr_T)(rex.input - rex.line)
3199 ? cmp != '>'
3200 : cmp != '<'))
3201 : (pos->lnum < rex.lnum + rex.reg_firstlnum
3202 ? cmp != '>'
3203 : cmp != '<')))
3204 status = RA_NOMATCH;
3205 }
3206 break;
3207
3208 case RE_VISUAL:
3209 if (!reg_match_visual())
3210 status = RA_NOMATCH;
3211 break;
3212
3213 case RE_LNUM:
3214 if (!REG_MULTI || !re_num_cmp((long_u)(rex.lnum + rex.reg_firstlnum),
3215 scan))
3216 status = RA_NOMATCH;
3217 break;
3218
3219 case RE_COL:
3220 if (!re_num_cmp((long_u)(rex.input - rex.line) + 1, scan))
3221 status = RA_NOMATCH;
3222 break;
3223
3224 case RE_VCOL:
3225 if (!re_num_cmp((long_u)win_linetabsize(
3226 rex.reg_win == NULL ? curwin : rex.reg_win,
3227 rex.line, (colnr_T)(rex.input - rex.line)) + 1, scan))
3228 status = RA_NOMATCH;
3229 break;
3230
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003231 case BOW: // \<word; rex.input points to w
3232 if (c == NUL) // Can't match at end of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003233 status = RA_NOMATCH;
3234 else if (has_mbyte)
3235 {
3236 int this_class;
3237
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003238 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003239 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3240 if (this_class <= 1)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003241 status = RA_NOMATCH; // not on a word at all
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003242 else if (reg_prev_class() == this_class)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003243 status = RA_NOMATCH; // previous char is in same word
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003244 }
3245 else
3246 {
3247 if (!vim_iswordc_buf(c, rex.reg_buf) || (rex.input > rex.line
3248 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
3249 status = RA_NOMATCH;
3250 }
3251 break;
3252
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003253 case EOW: // word\>; rex.input points after d
3254 if (rex.input == rex.line) // Can't match at start of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003255 status = RA_NOMATCH;
3256 else if (has_mbyte)
3257 {
3258 int this_class, prev_class;
3259
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003260 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003261 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3262 prev_class = reg_prev_class();
3263 if (this_class == prev_class
3264 || prev_class == 0 || prev_class == 1)
3265 status = RA_NOMATCH;
3266 }
3267 else
3268 {
3269 if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
3270 || (rex.input[0] != NUL
3271 && vim_iswordc_buf(c, rex.reg_buf)))
3272 status = RA_NOMATCH;
3273 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003274 break; // Matched with EOW
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003275
3276 case ANY:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003277 // ANY does not match new lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003278 if (c == NUL)
3279 status = RA_NOMATCH;
3280 else
3281 ADVANCE_REGINPUT();
3282 break;
3283
3284 case IDENT:
3285 if (!vim_isIDc(c))
3286 status = RA_NOMATCH;
3287 else
3288 ADVANCE_REGINPUT();
3289 break;
3290
3291 case SIDENT:
3292 if (VIM_ISDIGIT(*rex.input) || !vim_isIDc(c))
3293 status = RA_NOMATCH;
3294 else
3295 ADVANCE_REGINPUT();
3296 break;
3297
3298 case KWORD:
3299 if (!vim_iswordp_buf(rex.input, rex.reg_buf))
3300 status = RA_NOMATCH;
3301 else
3302 ADVANCE_REGINPUT();
3303 break;
3304
3305 case SKWORD:
3306 if (VIM_ISDIGIT(*rex.input)
3307 || !vim_iswordp_buf(rex.input, rex.reg_buf))
3308 status = RA_NOMATCH;
3309 else
3310 ADVANCE_REGINPUT();
3311 break;
3312
3313 case FNAME:
3314 if (!vim_isfilec(c))
3315 status = RA_NOMATCH;
3316 else
3317 ADVANCE_REGINPUT();
3318 break;
3319
3320 case SFNAME:
3321 if (VIM_ISDIGIT(*rex.input) || !vim_isfilec(c))
3322 status = RA_NOMATCH;
3323 else
3324 ADVANCE_REGINPUT();
3325 break;
3326
3327 case PRINT:
3328 if (!vim_isprintc(PTR2CHAR(rex.input)))
3329 status = RA_NOMATCH;
3330 else
3331 ADVANCE_REGINPUT();
3332 break;
3333
3334 case SPRINT:
3335 if (VIM_ISDIGIT(*rex.input) || !vim_isprintc(PTR2CHAR(rex.input)))
3336 status = RA_NOMATCH;
3337 else
3338 ADVANCE_REGINPUT();
3339 break;
3340
3341 case WHITE:
3342 if (!VIM_ISWHITE(c))
3343 status = RA_NOMATCH;
3344 else
3345 ADVANCE_REGINPUT();
3346 break;
3347
3348 case NWHITE:
3349 if (c == NUL || VIM_ISWHITE(c))
3350 status = RA_NOMATCH;
3351 else
3352 ADVANCE_REGINPUT();
3353 break;
3354
3355 case DIGIT:
3356 if (!ri_digit(c))
3357 status = RA_NOMATCH;
3358 else
3359 ADVANCE_REGINPUT();
3360 break;
3361
3362 case NDIGIT:
3363 if (c == NUL || ri_digit(c))
3364 status = RA_NOMATCH;
3365 else
3366 ADVANCE_REGINPUT();
3367 break;
3368
3369 case HEX:
3370 if (!ri_hex(c))
3371 status = RA_NOMATCH;
3372 else
3373 ADVANCE_REGINPUT();
3374 break;
3375
3376 case NHEX:
3377 if (c == NUL || ri_hex(c))
3378 status = RA_NOMATCH;
3379 else
3380 ADVANCE_REGINPUT();
3381 break;
3382
3383 case OCTAL:
3384 if (!ri_octal(c))
3385 status = RA_NOMATCH;
3386 else
3387 ADVANCE_REGINPUT();
3388 break;
3389
3390 case NOCTAL:
3391 if (c == NUL || ri_octal(c))
3392 status = RA_NOMATCH;
3393 else
3394 ADVANCE_REGINPUT();
3395 break;
3396
3397 case WORD:
3398 if (!ri_word(c))
3399 status = RA_NOMATCH;
3400 else
3401 ADVANCE_REGINPUT();
3402 break;
3403
3404 case NWORD:
3405 if (c == NUL || ri_word(c))
3406 status = RA_NOMATCH;
3407 else
3408 ADVANCE_REGINPUT();
3409 break;
3410
3411 case HEAD:
3412 if (!ri_head(c))
3413 status = RA_NOMATCH;
3414 else
3415 ADVANCE_REGINPUT();
3416 break;
3417
3418 case NHEAD:
3419 if (c == NUL || ri_head(c))
3420 status = RA_NOMATCH;
3421 else
3422 ADVANCE_REGINPUT();
3423 break;
3424
3425 case ALPHA:
3426 if (!ri_alpha(c))
3427 status = RA_NOMATCH;
3428 else
3429 ADVANCE_REGINPUT();
3430 break;
3431
3432 case NALPHA:
3433 if (c == NUL || ri_alpha(c))
3434 status = RA_NOMATCH;
3435 else
3436 ADVANCE_REGINPUT();
3437 break;
3438
3439 case LOWER:
3440 if (!ri_lower(c))
3441 status = RA_NOMATCH;
3442 else
3443 ADVANCE_REGINPUT();
3444 break;
3445
3446 case NLOWER:
3447 if (c == NUL || ri_lower(c))
3448 status = RA_NOMATCH;
3449 else
3450 ADVANCE_REGINPUT();
3451 break;
3452
3453 case UPPER:
3454 if (!ri_upper(c))
3455 status = RA_NOMATCH;
3456 else
3457 ADVANCE_REGINPUT();
3458 break;
3459
3460 case NUPPER:
3461 if (c == NUL || ri_upper(c))
3462 status = RA_NOMATCH;
3463 else
3464 ADVANCE_REGINPUT();
3465 break;
3466
3467 case EXACTLY:
3468 {
3469 int len;
3470 char_u *opnd;
3471
3472 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003473 // Inline the first byte, for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003474 if (*opnd != *rex.input
3475 && (!rex.reg_ic
3476 || (!enc_utf8
3477 && MB_TOLOWER(*opnd) != MB_TOLOWER(*rex.input))))
3478 status = RA_NOMATCH;
3479 else if (*opnd == NUL)
3480 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003481 // match empty string always works; happens when "~" is
3482 // empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003483 }
3484 else
3485 {
3486 if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic))
3487 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003488 len = 1; // matched a single byte above
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003489 }
3490 else
3491 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003492 // Need to match first byte again for multi-byte.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003493 len = (int)STRLEN(opnd);
3494 if (cstrncmp(opnd, rex.input, &len) != 0)
3495 status = RA_NOMATCH;
3496 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003497 // Check for following composing character, unless %C
3498 // follows (skips over all composing chars).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003499 if (status != RA_NOMATCH
3500 && enc_utf8
3501 && UTF_COMPOSINGLIKE(rex.input, rex.input + len)
3502 && !rex.reg_icombine
3503 && OP(next) != RE_COMPOSING)
3504 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003505 // raaron: This code makes a composing character get
3506 // ignored, which is the correct behavior (sometimes)
3507 // for voweled Hebrew texts.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003508 status = RA_NOMATCH;
3509 }
3510 if (status != RA_NOMATCH)
3511 rex.input += len;
3512 }
3513 }
3514 break;
3515
3516 case ANYOF:
3517 case ANYBUT:
3518 if (c == NUL)
3519 status = RA_NOMATCH;
3520 else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3521 status = RA_NOMATCH;
3522 else
3523 ADVANCE_REGINPUT();
3524 break;
3525
3526 case MULTIBYTECODE:
3527 if (has_mbyte)
3528 {
3529 int i, len;
3530 char_u *opnd;
3531 int opndc = 0, inpc;
3532
3533 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003534 // Safety check (just in case 'encoding' was changed since
3535 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003536 if ((len = (*mb_ptr2len)(opnd)) < 2)
3537 {
3538 status = RA_NOMATCH;
3539 break;
3540 }
3541 if (enc_utf8)
3542 opndc = utf_ptr2char(opnd);
3543 if (enc_utf8 && utf_iscomposing(opndc))
3544 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003545 // When only a composing char is given match at any
3546 // position where that composing char appears.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003547 status = RA_NOMATCH;
3548 for (i = 0; rex.input[i] != NUL;
3549 i += utf_ptr2len(rex.input + i))
3550 {
3551 inpc = utf_ptr2char(rex.input + i);
3552 if (!utf_iscomposing(inpc))
3553 {
3554 if (i > 0)
3555 break;
3556 }
3557 else if (opndc == inpc)
3558 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003559 // Include all following composing chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003560 len = i + utfc_ptr2len(rex.input + i);
3561 status = RA_MATCH;
3562 break;
3563 }
3564 }
3565 }
3566 else
3567 for (i = 0; i < len; ++i)
3568 if (opnd[i] != rex.input[i])
3569 {
3570 status = RA_NOMATCH;
3571 break;
3572 }
3573 rex.input += len;
3574 }
3575 else
3576 status = RA_NOMATCH;
3577 break;
3578 case RE_COMPOSING:
3579 if (enc_utf8)
3580 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003581 // Skip composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003582 while (utf_iscomposing(utf_ptr2char(rex.input)))
3583 MB_CPTR_ADV(rex.input);
3584 }
3585 break;
3586
3587 case NOTHING:
3588 break;
3589
3590 case BACK:
3591 {
3592 int i;
3593 backpos_T *bp;
3594
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003595 // When we run into BACK we need to check if we don't keep
3596 // looping without matching any input. The second and later
3597 // times a BACK is encountered it fails if the input is still
3598 // at the same position as the previous time.
3599 // The positions are stored in "backpos" and found by the
3600 // current value of "scan", the position in the RE program.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003601 bp = (backpos_T *)backpos.ga_data;
3602 for (i = 0; i < backpos.ga_len; ++i)
3603 if (bp[i].bp_scan == scan)
3604 break;
3605 if (i == backpos.ga_len)
3606 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003607 // First time at this BACK, make room to store the pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003608 if (ga_grow(&backpos, 1) == FAIL)
3609 status = RA_FAIL;
3610 else
3611 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003612 // get "ga_data" again, it may have changed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003613 bp = (backpos_T *)backpos.ga_data;
3614 bp[i].bp_scan = scan;
3615 ++backpos.ga_len;
3616 }
3617 }
3618 else if (reg_save_equal(&bp[i].bp_pos))
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003619 // Still at same position as last time, fail.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003620 status = RA_NOMATCH;
3621
3622 if (status != RA_FAIL && status != RA_NOMATCH)
3623 reg_save(&bp[i].bp_pos, &backpos);
3624 }
3625 break;
3626
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003627 case MOPEN + 0: // Match start: \zs
3628 case MOPEN + 1: // \(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003629 case MOPEN + 2:
3630 case MOPEN + 3:
3631 case MOPEN + 4:
3632 case MOPEN + 5:
3633 case MOPEN + 6:
3634 case MOPEN + 7:
3635 case MOPEN + 8:
3636 case MOPEN + 9:
3637 {
3638 no = op - MOPEN;
3639 cleanup_subexpr();
3640 rp = regstack_push(RS_MOPEN, scan);
3641 if (rp == NULL)
3642 status = RA_FAIL;
3643 else
3644 {
3645 rp->rs_no = no;
3646 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
3647 &rex.reg_startp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003648 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003649 }
3650 }
3651 break;
3652
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003653 case NOPEN: // \%(
3654 case NCLOSE: // \) after \%(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003655 if (regstack_push(RS_NOPEN, scan) == NULL)
3656 status = RA_FAIL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003657 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003658 break;
3659
3660#ifdef FEAT_SYN_HL
3661 case ZOPEN + 1:
3662 case ZOPEN + 2:
3663 case ZOPEN + 3:
3664 case ZOPEN + 4:
3665 case ZOPEN + 5:
3666 case ZOPEN + 6:
3667 case ZOPEN + 7:
3668 case ZOPEN + 8:
3669 case ZOPEN + 9:
3670 {
3671 no = op - ZOPEN;
3672 cleanup_zsubexpr();
3673 rp = regstack_push(RS_ZOPEN, scan);
3674 if (rp == NULL)
3675 status = RA_FAIL;
3676 else
3677 {
3678 rp->rs_no = no;
3679 save_se(&rp->rs_un.sesave, &reg_startzpos[no],
3680 &reg_startzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003681 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003682 }
3683 }
3684 break;
3685#endif
3686
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003687 case MCLOSE + 0: // Match end: \ze
3688 case MCLOSE + 1: // \)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003689 case MCLOSE + 2:
3690 case MCLOSE + 3:
3691 case MCLOSE + 4:
3692 case MCLOSE + 5:
3693 case MCLOSE + 6:
3694 case MCLOSE + 7:
3695 case MCLOSE + 8:
3696 case MCLOSE + 9:
3697 {
3698 no = op - MCLOSE;
3699 cleanup_subexpr();
3700 rp = regstack_push(RS_MCLOSE, scan);
3701 if (rp == NULL)
3702 status = RA_FAIL;
3703 else
3704 {
3705 rp->rs_no = no;
3706 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no],
3707 &rex.reg_endp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003708 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003709 }
3710 }
3711 break;
3712
3713#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003714 case ZCLOSE + 1: // \) after \z(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003715 case ZCLOSE + 2:
3716 case ZCLOSE + 3:
3717 case ZCLOSE + 4:
3718 case ZCLOSE + 5:
3719 case ZCLOSE + 6:
3720 case ZCLOSE + 7:
3721 case ZCLOSE + 8:
3722 case ZCLOSE + 9:
3723 {
3724 no = op - ZCLOSE;
3725 cleanup_zsubexpr();
3726 rp = regstack_push(RS_ZCLOSE, scan);
3727 if (rp == NULL)
3728 status = RA_FAIL;
3729 else
3730 {
3731 rp->rs_no = no;
3732 save_se(&rp->rs_un.sesave, &reg_endzpos[no],
3733 &reg_endzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003734 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003735 }
3736 }
3737 break;
3738#endif
3739
3740 case BACKREF + 1:
3741 case BACKREF + 2:
3742 case BACKREF + 3:
3743 case BACKREF + 4:
3744 case BACKREF + 5:
3745 case BACKREF + 6:
3746 case BACKREF + 7:
3747 case BACKREF + 8:
3748 case BACKREF + 9:
3749 {
3750 int len;
3751
3752 no = op - BACKREF;
3753 cleanup_subexpr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003754 if (!REG_MULTI) // Single-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003755 {
3756 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL)
3757 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003758 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003759 len = 0;
3760 }
3761 else
3762 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003763 // Compare current input with back-ref in the same
3764 // line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003765 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
3766 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0)
3767 status = RA_NOMATCH;
3768 }
3769 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003770 else // Multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003771 {
3772 if (rex.reg_startpos[no].lnum < 0
3773 || rex.reg_endpos[no].lnum < 0)
3774 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003775 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003776 len = 0;
3777 }
3778 else
3779 {
3780 if (rex.reg_startpos[no].lnum == rex.lnum
3781 && rex.reg_endpos[no].lnum == rex.lnum)
3782 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003783 // Compare back-ref within the current line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003784 len = rex.reg_endpos[no].col
3785 - rex.reg_startpos[no].col;
3786 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
3787 rex.input, &len) != 0)
3788 status = RA_NOMATCH;
3789 }
3790 else
3791 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003792 // Messy situation: Need to compare between two
3793 // lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003794 int r = match_with_backref(
3795 rex.reg_startpos[no].lnum,
3796 rex.reg_startpos[no].col,
3797 rex.reg_endpos[no].lnum,
3798 rex.reg_endpos[no].col,
3799 &len);
3800
3801 if (r != RA_MATCH)
3802 status = r;
3803 }
3804 }
3805 }
3806
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003807 // Matched the backref, skip over it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003808 rex.input += len;
3809 }
3810 break;
3811
3812#ifdef FEAT_SYN_HL
3813 case ZREF + 1:
3814 case ZREF + 2:
3815 case ZREF + 3:
3816 case ZREF + 4:
3817 case ZREF + 5:
3818 case ZREF + 6:
3819 case ZREF + 7:
3820 case ZREF + 8:
3821 case ZREF + 9:
3822 {
3823 int len;
3824
3825 cleanup_zsubexpr();
3826 no = op - ZREF;
3827 if (re_extmatch_in != NULL
3828 && re_extmatch_in->matches[no] != NULL)
3829 {
3830 len = (int)STRLEN(re_extmatch_in->matches[no]);
3831 if (cstrncmp(re_extmatch_in->matches[no],
3832 rex.input, &len) != 0)
3833 status = RA_NOMATCH;
3834 else
3835 rex.input += len;
3836 }
3837 else
3838 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003839 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003840 }
3841 }
3842 break;
3843#endif
3844
3845 case BRANCH:
3846 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003847 if (OP(next) != BRANCH) // No choice.
3848 next = OPERAND(scan); // Avoid recursion.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003849 else
3850 {
3851 rp = regstack_push(RS_BRANCH, scan);
3852 if (rp == NULL)
3853 status = RA_FAIL;
3854 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003855 status = RA_BREAK; // rest is below
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003856 }
3857 }
3858 break;
3859
3860 case BRACE_LIMITS:
3861 {
3862 if (OP(next) == BRACE_SIMPLE)
3863 {
3864 bl_minval = OPERAND_MIN(scan);
3865 bl_maxval = OPERAND_MAX(scan);
3866 }
3867 else if (OP(next) >= BRACE_COMPLEX
3868 && OP(next) < BRACE_COMPLEX + 10)
3869 {
3870 no = OP(next) - BRACE_COMPLEX;
3871 brace_min[no] = OPERAND_MIN(scan);
3872 brace_max[no] = OPERAND_MAX(scan);
3873 brace_count[no] = 0;
3874 }
3875 else
3876 {
3877 internal_error("BRACE_LIMITS");
3878 status = RA_FAIL;
3879 }
3880 }
3881 break;
3882
3883 case BRACE_COMPLEX + 0:
3884 case BRACE_COMPLEX + 1:
3885 case BRACE_COMPLEX + 2:
3886 case BRACE_COMPLEX + 3:
3887 case BRACE_COMPLEX + 4:
3888 case BRACE_COMPLEX + 5:
3889 case BRACE_COMPLEX + 6:
3890 case BRACE_COMPLEX + 7:
3891 case BRACE_COMPLEX + 8:
3892 case BRACE_COMPLEX + 9:
3893 {
3894 no = op - BRACE_COMPLEX;
3895 ++brace_count[no];
3896
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003897 // If not matched enough times yet, try one more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003898 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
3899 ? brace_min[no] : brace_max[no]))
3900 {
3901 rp = regstack_push(RS_BRCPLX_MORE, scan);
3902 if (rp == NULL)
3903 status = RA_FAIL;
3904 else
3905 {
3906 rp->rs_no = no;
3907 reg_save(&rp->rs_un.regsave, &backpos);
3908 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003909 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003910 }
3911 break;
3912 }
3913
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003914 // If matched enough times, may try matching some more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003915 if (brace_min[no] <= brace_max[no])
3916 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003917 // Range is the normal way around, use longest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003918 if (brace_count[no] <= brace_max[no])
3919 {
3920 rp = regstack_push(RS_BRCPLX_LONG, scan);
3921 if (rp == NULL)
3922 status = RA_FAIL;
3923 else
3924 {
3925 rp->rs_no = no;
3926 reg_save(&rp->rs_un.regsave, &backpos);
3927 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003928 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003929 }
3930 }
3931 }
3932 else
3933 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003934 // Range is backwards, use shortest match first
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003935 if (brace_count[no] <= brace_min[no])
3936 {
3937 rp = regstack_push(RS_BRCPLX_SHORT, scan);
3938 if (rp == NULL)
3939 status = RA_FAIL;
3940 else
3941 {
3942 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003943 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003944 }
3945 }
3946 }
3947 }
3948 break;
3949
3950 case BRACE_SIMPLE:
3951 case STAR:
3952 case PLUS:
3953 {
3954 regstar_T rst;
3955
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003956 // Lookahead to avoid useless match attempts when we know
3957 // what character comes next.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003958 if (OP(next) == EXACTLY)
3959 {
3960 rst.nextb = *OPERAND(next);
3961 if (rex.reg_ic)
3962 {
3963 if (MB_ISUPPER(rst.nextb))
3964 rst.nextb_ic = MB_TOLOWER(rst.nextb);
3965 else
3966 rst.nextb_ic = MB_TOUPPER(rst.nextb);
3967 }
3968 else
3969 rst.nextb_ic = rst.nextb;
3970 }
3971 else
3972 {
3973 rst.nextb = NUL;
3974 rst.nextb_ic = NUL;
3975 }
3976 if (op != BRACE_SIMPLE)
3977 {
3978 rst.minval = (op == STAR) ? 0 : 1;
3979 rst.maxval = MAX_LIMIT;
3980 }
3981 else
3982 {
3983 rst.minval = bl_minval;
3984 rst.maxval = bl_maxval;
3985 }
3986
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003987 // When maxval > minval, try matching as much as possible, up
3988 // to maxval. When maxval < minval, try matching at least the
3989 // minimal number (since the range is backwards, that's also
3990 // maxval!).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003991 rst.count = regrepeat(OPERAND(scan), rst.maxval);
3992 if (got_int)
3993 {
3994 status = RA_FAIL;
3995 break;
3996 }
3997 if (rst.minval <= rst.maxval
3998 ? rst.count >= rst.minval : rst.count >= rst.maxval)
3999 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004000 // It could match. Prepare for trying to match what
4001 // follows. The code is below. Parameters are stored in
4002 // a regstar_T on the regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004003 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4004 {
4005 emsg(_(e_maxmempat));
4006 status = RA_FAIL;
4007 }
4008 else if (ga_grow(&regstack, sizeof(regstar_T)) == FAIL)
4009 status = RA_FAIL;
4010 else
4011 {
4012 regstack.ga_len += sizeof(regstar_T);
4013 rp = regstack_push(rst.minval <= rst.maxval
4014 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4015 if (rp == NULL)
4016 status = RA_FAIL;
4017 else
4018 {
4019 *(((regstar_T *)rp) - 1) = rst;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004020 status = RA_BREAK; // skip the restore bits
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004021 }
4022 }
4023 }
4024 else
4025 status = RA_NOMATCH;
4026
4027 }
4028 break;
4029
4030 case NOMATCH:
4031 case MATCH:
4032 case SUBPAT:
4033 rp = regstack_push(RS_NOMATCH, scan);
4034 if (rp == NULL)
4035 status = RA_FAIL;
4036 else
4037 {
4038 rp->rs_no = op;
4039 reg_save(&rp->rs_un.regsave, &backpos);
4040 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004041 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004042 }
4043 break;
4044
4045 case BEHIND:
4046 case NOBEHIND:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004047 // Need a bit of room to store extra positions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004048 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4049 {
4050 emsg(_(e_maxmempat));
4051 status = RA_FAIL;
4052 }
4053 else if (ga_grow(&regstack, sizeof(regbehind_T)) == FAIL)
4054 status = RA_FAIL;
4055 else
4056 {
4057 regstack.ga_len += sizeof(regbehind_T);
4058 rp = regstack_push(RS_BEHIND1, scan);
4059 if (rp == NULL)
4060 status = RA_FAIL;
4061 else
4062 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004063 // Need to save the subexpr to be able to restore them
4064 // when there is a match but we don't use it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004065 save_subexpr(((regbehind_T *)rp) - 1);
4066
4067 rp->rs_no = op;
4068 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004069 // First try if what follows matches. If it does then we
4070 // check the behind match by looping.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004071 }
4072 }
4073 break;
4074
4075 case BHPOS:
4076 if (REG_MULTI)
4077 {
4078 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4079 || behind_pos.rs_u.pos.lnum != rex.lnum)
4080 status = RA_NOMATCH;
4081 }
4082 else if (behind_pos.rs_u.ptr != rex.input)
4083 status = RA_NOMATCH;
4084 break;
4085
4086 case NEWL:
4087 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4088 || rex.reg_line_lbr)
4089 && (c != '\n' || !rex.reg_line_lbr))
4090 status = RA_NOMATCH;
4091 else if (rex.reg_line_lbr)
4092 ADVANCE_REGINPUT();
4093 else
4094 reg_nextline();
4095 break;
4096
4097 case END:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004098 status = RA_MATCH; // Success!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004099 break;
4100
4101 default:
4102 emsg(_(e_re_corr));
4103#ifdef DEBUG
4104 printf("Illegal op code %d\n", op);
4105#endif
4106 status = RA_FAIL;
4107 break;
4108 }
4109 }
4110
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004111 // If we can't continue sequentially, break the inner loop.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004112 if (status != RA_CONT)
4113 break;
4114
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004115 // Continue in inner loop, advance to next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004116 scan = next;
4117
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004118 } // end of inner loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004119
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004120 // If there is something on the regstack execute the code for the state.
4121 // If the state is popped then loop and use the older state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004122 while (regstack.ga_len > 0 && status != RA_FAIL)
4123 {
4124 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4125 switch (rp->rs_state)
4126 {
4127 case RS_NOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004128 // Result is passed on as-is, simply pop the state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004129 regstack_pop(&scan);
4130 break;
4131
4132 case RS_MOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004133 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004134 if (status == RA_NOMATCH)
4135 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4136 &rex.reg_startp[rp->rs_no]);
4137 regstack_pop(&scan);
4138 break;
4139
4140#ifdef FEAT_SYN_HL
4141 case RS_ZOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004142 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004143 if (status == RA_NOMATCH)
4144 restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4145 &reg_startzp[rp->rs_no]);
4146 regstack_pop(&scan);
4147 break;
4148#endif
4149
4150 case RS_MCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004151 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004152 if (status == RA_NOMATCH)
4153 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4154 &rex.reg_endp[rp->rs_no]);
4155 regstack_pop(&scan);
4156 break;
4157
4158#ifdef FEAT_SYN_HL
4159 case RS_ZCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004160 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004161 if (status == RA_NOMATCH)
4162 restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4163 &reg_endzp[rp->rs_no]);
4164 regstack_pop(&scan);
4165 break;
4166#endif
4167
4168 case RS_BRANCH:
4169 if (status == RA_MATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004170 // this branch matched, use it
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004171 regstack_pop(&scan);
4172 else
4173 {
4174 if (status != RA_BREAK)
4175 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004176 // After a non-matching branch: try next one.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004177 reg_restore(&rp->rs_un.regsave, &backpos);
4178 scan = rp->rs_scan;
4179 }
4180 if (scan == NULL || OP(scan) != BRANCH)
4181 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004182 // no more branches, didn't find a match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004183 status = RA_NOMATCH;
4184 regstack_pop(&scan);
4185 }
4186 else
4187 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004188 // Prepare to try a branch.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004189 rp->rs_scan = regnext(scan);
4190 reg_save(&rp->rs_un.regsave, &backpos);
4191 scan = OPERAND(scan);
4192 }
4193 }
4194 break;
4195
4196 case RS_BRCPLX_MORE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004197 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004198 if (status == RA_NOMATCH)
4199 {
4200 reg_restore(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004201 --brace_count[rp->rs_no]; // decrement match count
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004202 }
4203 regstack_pop(&scan);
4204 break;
4205
4206 case RS_BRCPLX_LONG:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004207 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004208 if (status == RA_NOMATCH)
4209 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004210 // There was no match, but we did find enough matches.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004211 reg_restore(&rp->rs_un.regsave, &backpos);
4212 --brace_count[rp->rs_no];
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004213 // continue with the items after "\{}"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004214 status = RA_CONT;
4215 }
4216 regstack_pop(&scan);
4217 if (status == RA_CONT)
4218 scan = regnext(scan);
4219 break;
4220
4221 case RS_BRCPLX_SHORT:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004222 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004223 if (status == RA_NOMATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004224 // There was no match, try to match one more item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004225 reg_restore(&rp->rs_un.regsave, &backpos);
4226 regstack_pop(&scan);
4227 if (status == RA_NOMATCH)
4228 {
4229 scan = OPERAND(scan);
4230 status = RA_CONT;
4231 }
4232 break;
4233
4234 case RS_NOMATCH:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004235 // Pop the state. If the operand matches for NOMATCH or
4236 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4237 // except for SUBPAT, and continue with the next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004238 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4239 status = RA_NOMATCH;
4240 else
4241 {
4242 status = RA_CONT;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004243 if (rp->rs_no != SUBPAT) // zero-width
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004244 reg_restore(&rp->rs_un.regsave, &backpos);
4245 }
4246 regstack_pop(&scan);
4247 if (status == RA_CONT)
4248 scan = regnext(scan);
4249 break;
4250
4251 case RS_BEHIND1:
4252 if (status == RA_NOMATCH)
4253 {
4254 regstack_pop(&scan);
4255 regstack.ga_len -= sizeof(regbehind_T);
4256 }
4257 else
4258 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004259 // The stuff after BEHIND/NOBEHIND matches. Now try if
4260 // the behind part does (not) match before the current
4261 // position in the input. This must be done at every
4262 // position in the input and checking if the match ends at
4263 // the current position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004264
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004265 // save the position after the found match for next
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004266 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4267
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004268 // Start looking for a match with operand at the current
4269 // position. Go back one character until we find the
4270 // result, hitting the start of the line or the previous
4271 // line (for multi-line matching).
4272 // Set behind_pos to where the match should end, BHPOS
4273 // will match it. Save the current value.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004274 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4275 behind_pos = rp->rs_un.regsave;
4276
4277 rp->rs_state = RS_BEHIND2;
4278
4279 reg_restore(&rp->rs_un.regsave, &backpos);
4280 scan = OPERAND(rp->rs_scan) + 4;
4281 }
4282 break;
4283
4284 case RS_BEHIND2:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004285 // Looping for BEHIND / NOBEHIND match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004286 if (status == RA_MATCH && reg_save_equal(&behind_pos))
4287 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004288 // found a match that ends where "next" started
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004289 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4290 if (rp->rs_no == BEHIND)
4291 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4292 &backpos);
4293 else
4294 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004295 // But we didn't want a match. Need to restore the
4296 // subexpr, because what follows matched, so they have
4297 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004298 status = RA_NOMATCH;
4299 restore_subexpr(((regbehind_T *)rp) - 1);
4300 }
4301 regstack_pop(&scan);
4302 regstack.ga_len -= sizeof(regbehind_T);
4303 }
4304 else
4305 {
4306 long limit;
4307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004308 // No match or a match that doesn't end where we want it: Go
4309 // back one character. May go to previous line once.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004310 no = OK;
4311 limit = OPERAND_MIN(rp->rs_scan);
4312 if (REG_MULTI)
4313 {
4314 if (limit > 0
4315 && ((rp->rs_un.regsave.rs_u.pos.lnum
4316 < behind_pos.rs_u.pos.lnum
4317 ? (colnr_T)STRLEN(rex.line)
4318 : behind_pos.rs_u.pos.col)
4319 - rp->rs_un.regsave.rs_u.pos.col >= limit))
4320 no = FAIL;
4321 else if (rp->rs_un.regsave.rs_u.pos.col == 0)
4322 {
4323 if (rp->rs_un.regsave.rs_u.pos.lnum
4324 < behind_pos.rs_u.pos.lnum
4325 || reg_getline(
4326 --rp->rs_un.regsave.rs_u.pos.lnum)
4327 == NULL)
4328 no = FAIL;
4329 else
4330 {
4331 reg_restore(&rp->rs_un.regsave, &backpos);
4332 rp->rs_un.regsave.rs_u.pos.col =
4333 (colnr_T)STRLEN(rex.line);
4334 }
4335 }
4336 else
4337 {
4338 if (has_mbyte)
4339 {
4340 char_u *line =
4341 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
4342
4343 rp->rs_un.regsave.rs_u.pos.col -=
4344 (*mb_head_off)(line, line
4345 + rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
4346 }
4347 else
4348 --rp->rs_un.regsave.rs_u.pos.col;
4349 }
4350 }
4351 else
4352 {
4353 if (rp->rs_un.regsave.rs_u.ptr == rex.line)
4354 no = FAIL;
4355 else
4356 {
4357 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
4358 if (limit > 0 && (long)(behind_pos.rs_u.ptr
4359 - rp->rs_un.regsave.rs_u.ptr) > limit)
4360 no = FAIL;
4361 }
4362 }
4363 if (no == OK)
4364 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004365 // Advanced, prepare for finding match again.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004366 reg_restore(&rp->rs_un.regsave, &backpos);
4367 scan = OPERAND(rp->rs_scan) + 4;
4368 if (status == RA_MATCH)
4369 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004370 // We did match, so subexpr may have been changed,
4371 // need to restore them for the next try.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004372 status = RA_NOMATCH;
4373 restore_subexpr(((regbehind_T *)rp) - 1);
4374 }
4375 }
4376 else
4377 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004378 // Can't advance. For NOBEHIND that's a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004379 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4380 if (rp->rs_no == NOBEHIND)
4381 {
4382 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4383 &backpos);
4384 status = RA_MATCH;
4385 }
4386 else
4387 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004388 // We do want a proper match. Need to restore the
4389 // subexpr if we had a match, because they may have
4390 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004391 if (status == RA_MATCH)
4392 {
4393 status = RA_NOMATCH;
4394 restore_subexpr(((regbehind_T *)rp) - 1);
4395 }
4396 }
4397 regstack_pop(&scan);
4398 regstack.ga_len -= sizeof(regbehind_T);
4399 }
4400 }
4401 break;
4402
4403 case RS_STAR_LONG:
4404 case RS_STAR_SHORT:
4405 {
4406 regstar_T *rst = ((regstar_T *)rp) - 1;
4407
4408 if (status == RA_MATCH)
4409 {
4410 regstack_pop(&scan);
4411 regstack.ga_len -= sizeof(regstar_T);
4412 break;
4413 }
4414
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004415 // Tried once already, restore input pointers.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004416 if (status != RA_BREAK)
4417 reg_restore(&rp->rs_un.regsave, &backpos);
4418
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004419 // Repeat until we found a position where it could match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004420 for (;;)
4421 {
4422 if (status != RA_BREAK)
4423 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004424 // Tried first position already, advance.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004425 if (rp->rs_state == RS_STAR_LONG)
4426 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004427 // Trying for longest match, but couldn't or
4428 // didn't match -- back up one char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004429 if (--rst->count < rst->minval)
4430 break;
4431 if (rex.input == rex.line)
4432 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004433 // backup to last char of previous line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004434 --rex.lnum;
4435 rex.line = reg_getline(rex.lnum);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004436 // Just in case regrepeat() didn't count
4437 // right.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004438 if (rex.line == NULL)
4439 break;
4440 rex.input = rex.line + STRLEN(rex.line);
4441 fast_breakcheck();
4442 }
4443 else
4444 MB_PTR_BACK(rex.line, rex.input);
4445 }
4446 else
4447 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004448 // Range is backwards, use shortest match first.
4449 // Careful: maxval and minval are exchanged!
4450 // Couldn't or didn't match: try advancing one
4451 // char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004452 if (rst->count == rst->minval
4453 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
4454 break;
4455 ++rst->count;
4456 }
4457 if (got_int)
4458 break;
4459 }
4460 else
4461 status = RA_NOMATCH;
4462
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004463 // If it could match, try it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004464 if (rst->nextb == NUL || *rex.input == rst->nextb
4465 || *rex.input == rst->nextb_ic)
4466 {
4467 reg_save(&rp->rs_un.regsave, &backpos);
4468 scan = regnext(rp->rs_scan);
4469 status = RA_CONT;
4470 break;
4471 }
4472 }
4473 if (status != RA_CONT)
4474 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004475 // Failed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004476 regstack_pop(&scan);
4477 regstack.ga_len -= sizeof(regstar_T);
4478 status = RA_NOMATCH;
4479 }
4480 }
4481 break;
4482 }
4483
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004484 // If we want to continue the inner loop or didn't pop a state
4485 // continue matching loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004486 if (status == RA_CONT || rp == (regitem_T *)
4487 ((char *)regstack.ga_data + regstack.ga_len) - 1)
4488 break;
4489 }
4490
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004491 // May need to continue with the inner loop, starting at "scan".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004492 if (status == RA_CONT)
4493 continue;
4494
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004495 // If the regstack is empty or something failed we are done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004496 if (regstack.ga_len == 0 || status == RA_FAIL)
4497 {
4498 if (scan == NULL)
4499 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004500 // We get here only if there's trouble -- normally "case END" is
4501 // the terminating point.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004502 emsg(_(e_re_corr));
4503#ifdef DEBUG
4504 printf("Premature EOL\n");
4505#endif
4506 }
4507 return (status == RA_MATCH);
4508 }
4509
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004510 } // End of loop until the regstack is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004511
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004512 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004513}
4514
4515/*
4516 * regtry - try match of "prog" with at rex.line["col"].
4517 * Returns 0 for failure, number of lines contained in the match otherwise.
4518 */
4519 static long
4520regtry(
4521 bt_regprog_T *prog,
4522 colnr_T col,
4523 proftime_T *tm, // timeout limit or NULL
4524 int *timed_out) // flag set on timeout or NULL
4525{
4526 rex.input = rex.line + col;
4527 rex.need_clear_subexpr = TRUE;
4528#ifdef FEAT_SYN_HL
4529 // Clear the external match subpointers if necessary.
4530 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
4531#endif
4532
4533 if (regmatch(prog->program + 1, tm, timed_out) == 0)
4534 return 0;
4535
4536 cleanup_subexpr();
4537 if (REG_MULTI)
4538 {
4539 if (rex.reg_startpos[0].lnum < 0)
4540 {
4541 rex.reg_startpos[0].lnum = 0;
4542 rex.reg_startpos[0].col = col;
4543 }
4544 if (rex.reg_endpos[0].lnum < 0)
4545 {
4546 rex.reg_endpos[0].lnum = rex.lnum;
4547 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
4548 }
4549 else
4550 // Use line number of "\ze".
4551 rex.lnum = rex.reg_endpos[0].lnum;
4552 }
4553 else
4554 {
4555 if (rex.reg_startp[0] == NULL)
4556 rex.reg_startp[0] = rex.line + col;
4557 if (rex.reg_endp[0] == NULL)
4558 rex.reg_endp[0] = rex.input;
4559 }
4560#ifdef FEAT_SYN_HL
4561 // Package any found \z(...\) matches for export. Default is none.
4562 unref_extmatch(re_extmatch_out);
4563 re_extmatch_out = NULL;
4564
4565 if (prog->reghasz == REX_SET)
4566 {
4567 int i;
4568
4569 cleanup_zsubexpr();
4570 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01004571 if (re_extmatch_out == NULL)
4572 return 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004573 for (i = 0; i < NSUBEXP; i++)
4574 {
4575 if (REG_MULTI)
4576 {
4577 // Only accept single line matches.
4578 if (reg_startzpos[i].lnum >= 0
4579 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
4580 && reg_endzpos[i].col >= reg_startzpos[i].col)
4581 re_extmatch_out->matches[i] =
4582 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
4583 + reg_startzpos[i].col,
4584 reg_endzpos[i].col - reg_startzpos[i].col);
4585 }
4586 else
4587 {
4588 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
4589 re_extmatch_out->matches[i] =
4590 vim_strnsave(reg_startzp[i],
4591 (int)(reg_endzp[i] - reg_startzp[i]));
4592 }
4593 }
4594 }
4595#endif
4596 return 1 + rex.lnum;
4597}
4598
4599/*
4600 * Match a regexp against a string ("line" points to the string) or multiple
4601 * lines ("line" is NULL, use reg_getline()).
4602 * Returns 0 for failure, number of lines contained in the match otherwise.
4603 */
4604 static long
4605bt_regexec_both(
4606 char_u *line,
4607 colnr_T col, // column to start looking for match
4608 proftime_T *tm, // timeout limit or NULL
4609 int *timed_out) // flag set on timeout or NULL
4610{
4611 bt_regprog_T *prog;
4612 char_u *s;
4613 long retval = 0L;
4614
4615 // Create "regstack" and "backpos" if they are not allocated yet.
4616 // We allocate *_INITIAL amount of bytes first and then set the grow size
4617 // to much bigger value to avoid many malloc calls in case of deep regular
4618 // expressions.
4619 if (regstack.ga_data == NULL)
4620 {
4621 // Use an item size of 1 byte, since we push different things
4622 // onto the regstack.
4623 ga_init2(&regstack, 1, REGSTACK_INITIAL);
4624 (void)ga_grow(&regstack, REGSTACK_INITIAL);
4625 regstack.ga_growsize = REGSTACK_INITIAL * 8;
4626 }
4627
4628 if (backpos.ga_data == NULL)
4629 {
4630 ga_init2(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
4631 (void)ga_grow(&backpos, BACKPOS_INITIAL);
4632 backpos.ga_growsize = BACKPOS_INITIAL * 8;
4633 }
4634
4635 if (REG_MULTI)
4636 {
4637 prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
4638 line = reg_getline((linenr_T)0);
4639 rex.reg_startpos = rex.reg_mmatch->startpos;
4640 rex.reg_endpos = rex.reg_mmatch->endpos;
4641 }
4642 else
4643 {
4644 prog = (bt_regprog_T *)rex.reg_match->regprog;
4645 rex.reg_startp = rex.reg_match->startp;
4646 rex.reg_endp = rex.reg_match->endp;
4647 }
4648
4649 // Be paranoid...
4650 if (prog == NULL || line == NULL)
4651 {
4652 emsg(_(e_null));
4653 goto theend;
4654 }
4655
4656 // Check validity of program.
4657 if (prog_magic_wrong())
4658 goto theend;
4659
4660 // If the start column is past the maximum column: no need to try.
4661 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4662 goto theend;
4663
4664 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
4665 if (prog->regflags & RF_ICASE)
4666 rex.reg_ic = TRUE;
4667 else if (prog->regflags & RF_NOICASE)
4668 rex.reg_ic = FALSE;
4669
4670 // If pattern contains "\Z" overrule value of rex.reg_icombine
4671 if (prog->regflags & RF_ICOMBINE)
4672 rex.reg_icombine = TRUE;
4673
4674 // If there is a "must appear" string, look for it.
4675 if (prog->regmust != NULL)
4676 {
4677 int c;
4678
4679 if (has_mbyte)
4680 c = (*mb_ptr2char)(prog->regmust);
4681 else
4682 c = *prog->regmust;
4683 s = line + col;
4684
4685 // This is used very often, esp. for ":global". Use three versions of
4686 // the loop to avoid overhead of conditions.
4687 if (!rex.reg_ic && !has_mbyte)
4688 while ((s = vim_strbyte(s, c)) != NULL)
4689 {
4690 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4691 break; // Found it.
4692 ++s;
4693 }
4694 else if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
4695 while ((s = vim_strchr(s, c)) != NULL)
4696 {
4697 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4698 break; // Found it.
4699 MB_PTR_ADV(s);
4700 }
4701 else
4702 while ((s = cstrchr(s, c)) != NULL)
4703 {
4704 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4705 break; // Found it.
4706 MB_PTR_ADV(s);
4707 }
4708 if (s == NULL) // Not present.
4709 goto theend;
4710 }
4711
4712 rex.line = line;
4713 rex.lnum = 0;
4714 reg_toolong = FALSE;
4715
4716 // Simplest case: Anchored match need be tried only once.
4717 if (prog->reganch)
4718 {
4719 int c;
4720
4721 if (has_mbyte)
4722 c = (*mb_ptr2char)(rex.line + col);
4723 else
4724 c = rex.line[col];
4725 if (prog->regstart == NUL
4726 || prog->regstart == c
4727 || (rex.reg_ic
4728 && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
4729 || (c < 255 && prog->regstart < 255 &&
4730 MB_TOLOWER(prog->regstart) == MB_TOLOWER(c)))))
4731 retval = regtry(prog, col, tm, timed_out);
4732 else
4733 retval = 0;
4734 }
4735 else
4736 {
4737#ifdef FEAT_RELTIME
4738 int tm_count = 0;
4739#endif
4740 // Messy cases: unanchored match.
4741 while (!got_int)
4742 {
4743 if (prog->regstart != NUL)
4744 {
4745 // Skip until the char we know it must start with.
4746 // Used often, do some work to avoid call overhead.
4747 if (!rex.reg_ic && !has_mbyte)
4748 s = vim_strbyte(rex.line + col, prog->regstart);
4749 else
4750 s = cstrchr(rex.line + col, prog->regstart);
4751 if (s == NULL)
4752 {
4753 retval = 0;
4754 break;
4755 }
4756 col = (int)(s - rex.line);
4757 }
4758
4759 // Check for maximum column to try.
4760 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4761 {
4762 retval = 0;
4763 break;
4764 }
4765
4766 retval = regtry(prog, col, tm, timed_out);
4767 if (retval > 0)
4768 break;
4769
4770 // if not currently on the first line, get it again
4771 if (rex.lnum != 0)
4772 {
4773 rex.lnum = 0;
4774 rex.line = reg_getline((linenr_T)0);
4775 }
4776 if (rex.line[col] == NUL)
4777 break;
4778 if (has_mbyte)
4779 col += (*mb_ptr2len)(rex.line + col);
4780 else
4781 ++col;
4782#ifdef FEAT_RELTIME
4783 // Check for timeout once in a twenty times to avoid overhead.
4784 if (tm != NULL && ++tm_count == 20)
4785 {
4786 tm_count = 0;
4787 if (profile_passed_limit(tm))
4788 {
4789 if (timed_out != NULL)
4790 *timed_out = TRUE;
4791 break;
4792 }
4793 }
4794#endif
4795 }
4796 }
4797
4798theend:
4799 // Free "reg_tofree" when it's a bit big.
4800 // Free regstack and backpos if they are bigger than their initial size.
4801 if (reg_tofreelen > 400)
4802 VIM_CLEAR(reg_tofree);
4803 if (regstack.ga_maxlen > REGSTACK_INITIAL)
4804 ga_clear(&regstack);
4805 if (backpos.ga_maxlen > BACKPOS_INITIAL)
4806 ga_clear(&backpos);
4807
4808 return retval;
4809}
4810
4811/*
4812 * Match a regexp against a string.
4813 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
4814 * Uses curbuf for line count and 'iskeyword'.
4815 * if "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
4816 *
4817 * Returns 0 for failure, number of lines contained in the match otherwise.
4818 */
4819 static int
4820bt_regexec_nl(
4821 regmatch_T *rmp,
4822 char_u *line, // string to match against
4823 colnr_T col, // column to start looking for match
4824 int line_lbr)
4825{
4826 rex.reg_match = rmp;
4827 rex.reg_mmatch = NULL;
4828 rex.reg_maxline = 0;
4829 rex.reg_line_lbr = line_lbr;
4830 rex.reg_buf = curbuf;
4831 rex.reg_win = NULL;
4832 rex.reg_ic = rmp->rm_ic;
4833 rex.reg_icombine = FALSE;
4834 rex.reg_maxcol = 0;
4835
4836 return bt_regexec_both(line, col, NULL, NULL);
4837}
4838
4839/*
4840 * Match a regexp against multiple lines.
4841 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
4842 * Uses curbuf for line count and 'iskeyword'.
4843 *
4844 * Return zero if there is no match. Return number of lines contained in the
4845 * match otherwise.
4846 */
4847 static long
4848bt_regexec_multi(
4849 regmmatch_T *rmp,
4850 win_T *win, // window in which to search or NULL
4851 buf_T *buf, // buffer in which to search
4852 linenr_T lnum, // nr of line to start looking for match
4853 colnr_T col, // column to start looking for match
4854 proftime_T *tm, // timeout limit or NULL
4855 int *timed_out) // flag set on timeout or NULL
4856{
4857 rex.reg_match = NULL;
4858 rex.reg_mmatch = rmp;
4859 rex.reg_buf = buf;
4860 rex.reg_win = win;
4861 rex.reg_firstlnum = lnum;
4862 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
4863 rex.reg_line_lbr = FALSE;
4864 rex.reg_ic = rmp->rmm_ic;
4865 rex.reg_icombine = FALSE;
4866 rex.reg_maxcol = rmp->rmm_maxcol;
4867
4868 return bt_regexec_both(NULL, col, tm, timed_out);
4869}
4870
4871/*
4872 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
4873 */
4874 static int
4875re_num_cmp(long_u val, char_u *scan)
4876{
4877 long_u n = OPERAND_MIN(scan);
4878
4879 if (OPERAND_CMP(scan) == '>')
4880 return val > n;
4881 if (OPERAND_CMP(scan) == '<')
4882 return val < n;
4883 return val == n;
4884}
4885
4886#ifdef BT_REGEXP_DUMP
4887
4888/*
4889 * regdump - dump a regexp onto stdout in vaguely comprehensible form
4890 */
4891 static void
4892regdump(char_u *pattern, bt_regprog_T *r)
4893{
4894 char_u *s;
4895 int op = EXACTLY; // Arbitrary non-END op.
4896 char_u *next;
4897 char_u *end = NULL;
4898 FILE *f;
4899
4900#ifdef BT_REGEXP_LOG
4901 f = fopen("bt_regexp_log.log", "a");
4902#else
4903 f = stdout;
4904#endif
4905 if (f == NULL)
4906 return;
4907 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);
4908
4909 s = r->program + 1;
4910 // Loop until we find the END that isn't before a referred next (an END
4911 // can also appear in a NOMATCH operand).
4912 while (op != END || s <= end)
4913 {
4914 op = OP(s);
4915 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); // Where, what.
4916 next = regnext(s);
4917 if (next == NULL) // Next ptr.
4918 fprintf(f, "(0)");
4919 else
4920 fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
4921 if (end < next)
4922 end = next;
4923 if (op == BRACE_LIMITS)
4924 {
4925 // Two ints
4926 fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
4927 s += 8;
4928 }
4929 else if (op == BEHIND || op == NOBEHIND)
4930 {
4931 // one int
4932 fprintf(f, " count %ld", OPERAND_MIN(s));
4933 s += 4;
4934 }
4935 else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL)
4936 {
4937 // one int plus comparator
4938 fprintf(f, " count %ld", OPERAND_MIN(s));
4939 s += 5;
4940 }
4941 s += 3;
4942 if (op == ANYOF || op == ANYOF + ADD_NL
4943 || op == ANYBUT || op == ANYBUT + ADD_NL
4944 || op == EXACTLY)
4945 {
4946 // Literal string, where present.
4947 fprintf(f, "\nxxxxxxxxx\n");
4948 while (*s != NUL)
4949 fprintf(f, "%c", *s++);
4950 fprintf(f, "\nxxxxxxxxx\n");
4951 s++;
4952 }
4953 fprintf(f, "\r\n");
4954 }
4955
4956 // Header fields of interest.
4957 if (r->regstart != NUL)
4958 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
4959 ? (char *)transchar(r->regstart)
4960 : "multibyte", r->regstart);
4961 if (r->reganch)
4962 fprintf(f, "anchored; ");
4963 if (r->regmust != NULL)
4964 fprintf(f, "must have \"%s\"", r->regmust);
4965 fprintf(f, "\r\n");
4966
4967#ifdef BT_REGEXP_LOG
4968 fclose(f);
4969#endif
4970}
4971#endif // BT_REGEXP_DUMP
4972
4973#ifdef DEBUG
4974/*
4975 * regprop - printable representation of opcode
4976 */
4977 static char_u *
4978regprop(char_u *op)
4979{
4980 char *p;
4981 static char buf[50];
4982
4983 STRCPY(buf, ":");
4984
4985 switch ((int) OP(op))
4986 {
4987 case BOL:
4988 p = "BOL";
4989 break;
4990 case EOL:
4991 p = "EOL";
4992 break;
4993 case RE_BOF:
4994 p = "BOF";
4995 break;
4996 case RE_EOF:
4997 p = "EOF";
4998 break;
4999 case CURSOR:
5000 p = "CURSOR";
5001 break;
5002 case RE_VISUAL:
5003 p = "RE_VISUAL";
5004 break;
5005 case RE_LNUM:
5006 p = "RE_LNUM";
5007 break;
5008 case RE_MARK:
5009 p = "RE_MARK";
5010 break;
5011 case RE_COL:
5012 p = "RE_COL";
5013 break;
5014 case RE_VCOL:
5015 p = "RE_VCOL";
5016 break;
5017 case BOW:
5018 p = "BOW";
5019 break;
5020 case EOW:
5021 p = "EOW";
5022 break;
5023 case ANY:
5024 p = "ANY";
5025 break;
5026 case ANY + ADD_NL:
5027 p = "ANY+NL";
5028 break;
5029 case ANYOF:
5030 p = "ANYOF";
5031 break;
5032 case ANYOF + ADD_NL:
5033 p = "ANYOF+NL";
5034 break;
5035 case ANYBUT:
5036 p = "ANYBUT";
5037 break;
5038 case ANYBUT + ADD_NL:
5039 p = "ANYBUT+NL";
5040 break;
5041 case IDENT:
5042 p = "IDENT";
5043 break;
5044 case IDENT + ADD_NL:
5045 p = "IDENT+NL";
5046 break;
5047 case SIDENT:
5048 p = "SIDENT";
5049 break;
5050 case SIDENT + ADD_NL:
5051 p = "SIDENT+NL";
5052 break;
5053 case KWORD:
5054 p = "KWORD";
5055 break;
5056 case KWORD + ADD_NL:
5057 p = "KWORD+NL";
5058 break;
5059 case SKWORD:
5060 p = "SKWORD";
5061 break;
5062 case SKWORD + ADD_NL:
5063 p = "SKWORD+NL";
5064 break;
5065 case FNAME:
5066 p = "FNAME";
5067 break;
5068 case FNAME + ADD_NL:
5069 p = "FNAME+NL";
5070 break;
5071 case SFNAME:
5072 p = "SFNAME";
5073 break;
5074 case SFNAME + ADD_NL:
5075 p = "SFNAME+NL";
5076 break;
5077 case PRINT:
5078 p = "PRINT";
5079 break;
5080 case PRINT + ADD_NL:
5081 p = "PRINT+NL";
5082 break;
5083 case SPRINT:
5084 p = "SPRINT";
5085 break;
5086 case SPRINT + ADD_NL:
5087 p = "SPRINT+NL";
5088 break;
5089 case WHITE:
5090 p = "WHITE";
5091 break;
5092 case WHITE + ADD_NL:
5093 p = "WHITE+NL";
5094 break;
5095 case NWHITE:
5096 p = "NWHITE";
5097 break;
5098 case NWHITE + ADD_NL:
5099 p = "NWHITE+NL";
5100 break;
5101 case DIGIT:
5102 p = "DIGIT";
5103 break;
5104 case DIGIT + ADD_NL:
5105 p = "DIGIT+NL";
5106 break;
5107 case NDIGIT:
5108 p = "NDIGIT";
5109 break;
5110 case NDIGIT + ADD_NL:
5111 p = "NDIGIT+NL";
5112 break;
5113 case HEX:
5114 p = "HEX";
5115 break;
5116 case HEX + ADD_NL:
5117 p = "HEX+NL";
5118 break;
5119 case NHEX:
5120 p = "NHEX";
5121 break;
5122 case NHEX + ADD_NL:
5123 p = "NHEX+NL";
5124 break;
5125 case OCTAL:
5126 p = "OCTAL";
5127 break;
5128 case OCTAL + ADD_NL:
5129 p = "OCTAL+NL";
5130 break;
5131 case NOCTAL:
5132 p = "NOCTAL";
5133 break;
5134 case NOCTAL + ADD_NL:
5135 p = "NOCTAL+NL";
5136 break;
5137 case WORD:
5138 p = "WORD";
5139 break;
5140 case WORD + ADD_NL:
5141 p = "WORD+NL";
5142 break;
5143 case NWORD:
5144 p = "NWORD";
5145 break;
5146 case NWORD + ADD_NL:
5147 p = "NWORD+NL";
5148 break;
5149 case HEAD:
5150 p = "HEAD";
5151 break;
5152 case HEAD + ADD_NL:
5153 p = "HEAD+NL";
5154 break;
5155 case NHEAD:
5156 p = "NHEAD";
5157 break;
5158 case NHEAD + ADD_NL:
5159 p = "NHEAD+NL";
5160 break;
5161 case ALPHA:
5162 p = "ALPHA";
5163 break;
5164 case ALPHA + ADD_NL:
5165 p = "ALPHA+NL";
5166 break;
5167 case NALPHA:
5168 p = "NALPHA";
5169 break;
5170 case NALPHA + ADD_NL:
5171 p = "NALPHA+NL";
5172 break;
5173 case LOWER:
5174 p = "LOWER";
5175 break;
5176 case LOWER + ADD_NL:
5177 p = "LOWER+NL";
5178 break;
5179 case NLOWER:
5180 p = "NLOWER";
5181 break;
5182 case NLOWER + ADD_NL:
5183 p = "NLOWER+NL";
5184 break;
5185 case UPPER:
5186 p = "UPPER";
5187 break;
5188 case UPPER + ADD_NL:
5189 p = "UPPER+NL";
5190 break;
5191 case NUPPER:
5192 p = "NUPPER";
5193 break;
5194 case NUPPER + ADD_NL:
5195 p = "NUPPER+NL";
5196 break;
5197 case BRANCH:
5198 p = "BRANCH";
5199 break;
5200 case EXACTLY:
5201 p = "EXACTLY";
5202 break;
5203 case NOTHING:
5204 p = "NOTHING";
5205 break;
5206 case BACK:
5207 p = "BACK";
5208 break;
5209 case END:
5210 p = "END";
5211 break;
5212 case MOPEN + 0:
5213 p = "MATCH START";
5214 break;
5215 case MOPEN + 1:
5216 case MOPEN + 2:
5217 case MOPEN + 3:
5218 case MOPEN + 4:
5219 case MOPEN + 5:
5220 case MOPEN + 6:
5221 case MOPEN + 7:
5222 case MOPEN + 8:
5223 case MOPEN + 9:
5224 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5225 p = NULL;
5226 break;
5227 case MCLOSE + 0:
5228 p = "MATCH END";
5229 break;
5230 case MCLOSE + 1:
5231 case MCLOSE + 2:
5232 case MCLOSE + 3:
5233 case MCLOSE + 4:
5234 case MCLOSE + 5:
5235 case MCLOSE + 6:
5236 case MCLOSE + 7:
5237 case MCLOSE + 8:
5238 case MCLOSE + 9:
5239 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5240 p = NULL;
5241 break;
5242 case BACKREF + 1:
5243 case BACKREF + 2:
5244 case BACKREF + 3:
5245 case BACKREF + 4:
5246 case BACKREF + 5:
5247 case BACKREF + 6:
5248 case BACKREF + 7:
5249 case BACKREF + 8:
5250 case BACKREF + 9:
5251 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5252 p = NULL;
5253 break;
5254 case NOPEN:
5255 p = "NOPEN";
5256 break;
5257 case NCLOSE:
5258 p = "NCLOSE";
5259 break;
5260#ifdef FEAT_SYN_HL
5261 case ZOPEN + 1:
5262 case ZOPEN + 2:
5263 case ZOPEN + 3:
5264 case ZOPEN + 4:
5265 case ZOPEN + 5:
5266 case ZOPEN + 6:
5267 case ZOPEN + 7:
5268 case ZOPEN + 8:
5269 case ZOPEN + 9:
5270 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5271 p = NULL;
5272 break;
5273 case ZCLOSE + 1:
5274 case ZCLOSE + 2:
5275 case ZCLOSE + 3:
5276 case ZCLOSE + 4:
5277 case ZCLOSE + 5:
5278 case ZCLOSE + 6:
5279 case ZCLOSE + 7:
5280 case ZCLOSE + 8:
5281 case ZCLOSE + 9:
5282 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5283 p = NULL;
5284 break;
5285 case ZREF + 1:
5286 case ZREF + 2:
5287 case ZREF + 3:
5288 case ZREF + 4:
5289 case ZREF + 5:
5290 case ZREF + 6:
5291 case ZREF + 7:
5292 case ZREF + 8:
5293 case ZREF + 9:
5294 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5295 p = NULL;
5296 break;
5297#endif
5298 case STAR:
5299 p = "STAR";
5300 break;
5301 case PLUS:
5302 p = "PLUS";
5303 break;
5304 case NOMATCH:
5305 p = "NOMATCH";
5306 break;
5307 case MATCH:
5308 p = "MATCH";
5309 break;
5310 case BEHIND:
5311 p = "BEHIND";
5312 break;
5313 case NOBEHIND:
5314 p = "NOBEHIND";
5315 break;
5316 case SUBPAT:
5317 p = "SUBPAT";
5318 break;
5319 case BRACE_LIMITS:
5320 p = "BRACE_LIMITS";
5321 break;
5322 case BRACE_SIMPLE:
5323 p = "BRACE_SIMPLE";
5324 break;
5325 case BRACE_COMPLEX + 0:
5326 case BRACE_COMPLEX + 1:
5327 case BRACE_COMPLEX + 2:
5328 case BRACE_COMPLEX + 3:
5329 case BRACE_COMPLEX + 4:
5330 case BRACE_COMPLEX + 5:
5331 case BRACE_COMPLEX + 6:
5332 case BRACE_COMPLEX + 7:
5333 case BRACE_COMPLEX + 8:
5334 case BRACE_COMPLEX + 9:
5335 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5336 p = NULL;
5337 break;
5338 case MULTIBYTECODE:
5339 p = "MULTIBYTECODE";
5340 break;
5341 case NEWL:
5342 p = "NEWL";
5343 break;
5344 default:
5345 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5346 p = NULL;
5347 break;
5348 }
5349 if (p != NULL)
5350 STRCAT(buf, p);
5351 return (char_u *)buf;
5352}
5353#endif // DEBUG