blob: 70ddfe986f0fc60f62c392ed8ce8eb9e40c7ca34 [file] [log] [blame]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * Backtracking regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 *
7 * NOTICE:
8 *
9 * This is NOT the original regular expression code as written by Henry
10 * Spencer. This code has been modified specifically for use with the VIM
11 * editor, and should not be used separately from Vim. If you want a good
12 * regular expression library, get the original code. The copyright notice
13 * that follows is from the original.
14 *
15 * END NOTICE
16 *
17 * Copyright (c) 1986 by University of Toronto.
18 * Written by Henry Spencer. Not derived from licensed software.
19 *
20 * Permission is granted to anyone to use this software for any
21 * purpose on any computer system, and to redistribute it freely,
22 * subject to the following restrictions:
23 *
24 * 1. The author is not responsible for the consequences of use of
25 * this software, no matter how awful, even if they arise
26 * from defects in it.
27 *
28 * 2. The origin of this software must not be misrepresented, either
29 * by explicit claim or by omission.
30 *
31 * 3. Altered versions must be plainly marked as such, and must not
32 * be misrepresented as being the original software.
33 *
34 * Beware that some of this code is subtly aware of the way operator
35 * precedence is structured in regular expressions. Serious changes in
36 * regular-expression syntax might require a total rethink.
37 *
38 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
39 * Webb, Ciaran McCreesh and Bram Moolenaar.
40 * Named character class support added by Walter Briscoe (1998 Jul 01)
41 */
42
43/*
44 * The "internal use only" fields in regexp.h are present to pass info from
45 * compile to execute that permits the execute phase to run lots faster on
46 * simple cases. They are:
47 *
48 * regstart char that must begin a match; NUL if none obvious; Can be a
49 * multi-byte character.
50 * reganch is the match anchored (at beginning-of-line only)?
51 * regmust string (pointer into program) that match must include, or NULL
52 * regmlen length of regmust string
53 * regflags RF_ values or'ed together
54 *
55 * Regstart and reganch permit very fast decisions on suitable starting points
56 * for a match, cutting down the work a lot. Regmust permits fast rejection
57 * of lines that cannot possibly match. The regmust tests are costly enough
58 * that vim_regcomp() supplies a regmust only if the r.e. contains something
59 * potentially expensive (at present, the only such thing detected is * or +
60 * at the start of the r.e., which can involve a lot of backup). Regmlen is
61 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
62 * computing it anyway.
63 */
64
65/*
66 * Structure for regexp "program". This is essentially a linear encoding
67 * of a nondeterministic finite-state machine (aka syntax charts or
68 * "railroad normal form" in parsing technology). Each node is an opcode
69 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
70 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
71 * pointer with a BRANCH on both ends of it is connecting two alternatives.
72 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
73 * (as opposed to a collection of them) is never concatenated with anything
74 * because of operator precedence). The "next" pointer of a BRACE_COMPLEX
75 * node points to the node after the stuff to be repeated.
76 * The operand of some types of node is a literal string; for others, it is a
77 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
78 * is the first node of the branch.
79 * (NB this is *not* a tree structure: the tail of the branch connects to the
80 * thing following the set of BRANCHes.)
81 *
82 * pattern is coded like:
83 *
84 *                        +-----------------+
85 *                        |                 V
86 * <aa>\|<bb>   BRANCH <aa> BRANCH <bb> --> END
87 *                   |      ^    |          ^
88 *                   +------+    +----------+
89 *
90 *
91 *                     +------------------+
92 *                     V                  |
93 * <aa>*        BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
94 *                   |      |               ^                      ^
95 *                   |      +---------------+                      |
96 *                   +---------------------------------------------+
97 *
98 *
99 *                      +----------------------+
100 *                     V                      |
101 * <aa>\+       BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
102 *                   |               |          ^                      ^
103 *                   |               +----------+                      |
104 *                   +-------------------------------------------------+
105 *
106 *
107 *                                      +-------------------------+
108 *                                      V                         |
109 * <aa>\{}      BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK  END
110 *                   |                              |                ^
111 *                   |                              +----------------+
112 *                   +-----------------------------------------------+
113 *
114 *
115 * <aa>\@!<bb>  BRANCH NOMATCH <aa> --> END <bb> --> END
116 *                   |       |              ^        ^
117 *                   |       +--------------+        |
118 *                   +-------------------------------+
119 *
120 *                                                   +----------+
121 *                                                   |          V
122 * \z[abc]      BRANCH BRANCH  a  BRANCH  b  BRANCH  c  BRANCH  NOTHING --> END
123 *                   |      |          |          |     ^                   ^
124 *                   |      |          |          +-----+                   |
125 *                   |      |          +----------------+                   |
126 *                   |      +---------------------------+                   |
127 *                   +------------------------------------------------------+
128 *
129 * They all start with a BRANCH for "\|" alternatives, even when there is only
130 * one alternative.
131 */
132
133/*
134 * The opcodes are:
135 */
136
// Legend for the "opnd?" column: "node" means the operand is the first node
// of a sub-program, "str" a literal string, "nr" a number; an empty column
// means the node has no operand.
// Every opcode is stored in the single opcode byte of a node (see OP()), so
// the values must stay in the range of one byte.
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200137// definition number opnd? meaning
138#define END 0 // End of program or NOMATCH operand.
139#define BOL 1 // Match "" at beginning of line.
140#define EOL 2 // Match "" at end of line.
141#define BRANCH 3 // node Match this alternative, or the
142 // next...
143#define BACK 4 // Match "", "next" ptr points backward.
144#define EXACTLY 5 // str Match this string.
145#define NOTHING 6 // Match empty string.
146#define STAR 7 // node Match this (simple) thing 0 or more
147 // times.
148#define PLUS 8 // node Match this (simple) thing 1 or more
149 // times.
150#define MATCH 9 // node match the operand zero-width
151#define NOMATCH 10 // node check for no match with operand
152#define BEHIND 11 // node look behind for a match with operand
153#define NOBEHIND 12 // node look behind for no match with operand
154#define SUBPAT 13 // node match the operand here
155#define BRACE_SIMPLE 14 // node Match this (simple) thing between m and
156 // n times (\{m,n\}).
157#define BOW 15 // Match "" after [^a-zA-Z0-9_]
158#define EOW 16 // Match "" at [^a-zA-Z0-9_]
159#define BRACE_LIMITS 17 // nr nr define the min & max for BRACE_SIMPLE
160 // and BRACE_COMPLEX.
161#define NEWL 18 // Match line-break
162#define BHPOS 19 // End position for BEHIND or NOBEHIND
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200163
164
// Character classes: 20-48 are the normal classes; adding ADD_NL to one of
// them (giving 50-78) yields the variant that also includes a line-break.
#define ADD_NL      30
// First opcode of the "with line-break" range.  The replacement list is
// parenthesized so the macro stays correct inside any larger expression.
#define FIRST_NL    (ANY + ADD_NL)
#define ANY         20  // Match any one character.
#define ANYOF       21  // str Match any character in this string.
#define ANYBUT      22  // str Match any character not in this
                        // string.
#define IDENT       23  // Match identifier char
#define SIDENT      24  // Match identifier char but no digit
#define KWORD       25  // Match keyword char
#define SKWORD      26  // Match word char but no digit
#define FNAME       27  // Match file name char
#define SFNAME      28  // Match file name char but no digit
#define PRINT       29  // Match printable char
#define SPRINT      30  // Match printable char but no digit
#define WHITE       31  // Match whitespace char
#define NWHITE      32  // Match non-whitespace char
#define DIGIT       33  // Match digit char
#define NDIGIT      34  // Match non-digit char
#define HEX         35  // Match hex char
#define NHEX        36  // Match non-hex char
#define OCTAL       37  // Match octal char
#define NOCTAL      38  // Match non-octal char
#define WORD        39  // Match word char
#define NWORD       40  // Match non-word char
#define HEAD        41  // Match head char
#define NHEAD       42  // Match non-head char
#define ALPHA       43  // Match alpha char
#define NALPHA      44  // Match non-alpha char
#define LOWER       45  // Match lowercase char
#define NLOWER      46  // Match non-lowercase char
#define UPPER       47  // Match uppercase char
#define NUPPER      48  // Match non-uppercase char
// Last opcode of the "with line-break" range (parenthesized, see FIRST_NL).
#define LAST_NL     (NUPPER + ADD_NL)
// Non-zero when "op" lies in the FIRST_NL..LAST_NL range.  Note that "op" is
// evaluated twice, so it must not have side effects.
#define WITH_NL(op) ((op) >= FIRST_NL && (op) <= LAST_NL)
200
// Opcodes whose comment starts with a range ("-89", "-99", ...) occupy ten
// consecutive values: the base value defined here plus a number 0-9.
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200201#define MOPEN 80 // -89 Mark this point in input as start of
202 // \( subexpr. MOPEN + 0 marks start of
203 // match.
204#define MCLOSE 90 // -99 Analogous to MOPEN. MCLOSE + 0 marks
205 // end of match.
206#define BACKREF 100 // -109 node Match same string again \1-\9
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200207
// The \z opcodes only exist when syntax highlighting is compiled in.
208#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200209# define ZOPEN 110 // -119 Mark this point in input as start of
210 // \z( subexpr.
211# define ZCLOSE 120 // -129 Analogous to ZOPEN.
212# define ZREF 130 // -139 node Match external submatch \z1-\z9
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200213#endif
214
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200215#define BRACE_COMPLEX 140 // -149 node Match nodes between m & n times
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200216
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200217#define NOPEN 150 // Mark this point in input as start of
218 // \%( subexpr.
219#define NCLOSE 151 // Analogous to NOPEN.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200220
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200221#define MULTIBYTECODE 200 // mbc Match one multi-byte character
222#define RE_BOF 201 // Match "" at beginning of file.
223#define RE_EOF 202 // Match "" at end of file.
224#define CURSOR 203 // Match location of cursor.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200225
// "nr cmp": a number operand followed by a comparison operand.
// NOTE(review): presumably the four-byte value read by OPERAND_MIN() and the
// single byte read by OPERAND_CMP() - confirm where these nodes are emitted.
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200226#define RE_LNUM 204 // nr cmp Match line number
227#define RE_COL 205 // nr cmp Match column number
228#define RE_VCOL 206 // nr cmp Match virtual column number
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200229
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200230#define RE_MARK 207 // mark cmp Match mark position
231#define RE_VISUAL 208 // Match Visual area
232#define RE_COMPOSING 209 // any composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200233
234/*
235 * Flags to be passed up and down.
236 */
// Each flag is a separate bit, so several can be or'ed together into one int;
// WORST (0) is the value with none of them set.
// NOTE(review): presumably these travel through the "flagp" argument of reg()
// and its helpers - confirm against the parser functions.
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200237#define HASWIDTH 0x1 // Known never to match null string.
238#define SIMPLE 0x2 // Simple enough to be STAR/PLUS operand.
239#define SPSTART 0x4 // Starts with * or +.
240#define HASNL 0x8 // Contains some \n.
241#define HASLOOKBH 0x10 // Contains "\@<=" or "\@<!".
242#define WORST 0 // Worst case.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200243
// State of the compiler.  regcomp_start() resets num_complex_braces, regsize,
// reg_toolong and had_endbrace before each parse of the pattern.
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200244static int num_complex_braces; // Complex \{...} count
245static char_u *regcode; // Code-emit pointer, or JUST_CALC_SIZE
// Grows in regc() and regmbc() while regcode == JUST_CALC_SIZE.
246static long regsize; // Code size.
247static int reg_toolong; // TRUE when offset out of range
248static char_u had_endbrace[NSUBEXP]; // flags, TRUE if end of () found
// The three brace_* tables have 10 entries, the same number as there are
// BRACE_COMPLEX opcodes (140-149).
// NOTE(review): presumably indexed by "opcode - BRACE_COMPLEX" - confirm in
// the matching code.
249static long brace_min[10]; // Minimums for complex brace repeats
250static long brace_max[10]; // Maximums for complex brace repeats
251static int brace_count[10]; // Current counts for complex brace repeats
252static int one_exactly = FALSE; // only do one char for EXACTLY
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200253
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200254// When making changes to classchars also change nfa_classcodes.
// classchars and classcodes are parallel tables with 27 entries each and must
// be kept in the same order.
// NOTE(review): presumably the index of a class character in classchars
// selects the opcode at the same index in classcodes - confirm in the parser.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200255static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
256static int classcodes[] = {
257 ANY, IDENT, SIDENT, KWORD, SKWORD,
258 FNAME, SFNAME, PRINT, SPRINT,
259 WHITE, NWHITE, DIGIT, NDIGIT,
260 HEX, NHEX, OCTAL, NOCTAL,
261 WORD, NWORD, HEAD, NHEAD,
262 ALPHA, NALPHA, LOWER, NLOWER,
263 UPPER, NUPPER
264};
265
266/*
267 * When regcode is set to this value, code is not emitted and size is computed
268 * instead.
269 */
// Sentinel pointer value: regc() and regmbc() compare regcode against it
// before writing through the pointer, and only update regsize when it matches.
270#define JUST_CALC_SIZE ((char_u *) -1)
271
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200272// Values for rs_state in regitem_T.
// The leading-comma layout allows the FEAT_SYN_HL entries to be compiled out
// without leaving a dangling comma.  Because of that #ifdef the numeric value
// of every entry after RS_MCLOSE depends on the build configuration.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200273typedef enum regstate_E
274{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200275 RS_NOPEN = 0 // NOPEN and NCLOSE
276 , RS_MOPEN // MOPEN + [0-9]
277 , RS_MCLOSE // MCLOSE + [0-9]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200278#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200279 , RS_ZOPEN // ZOPEN + [0-9]
280 , RS_ZCLOSE // ZCLOSE + [0-9]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200281#endif
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200282 , RS_BRANCH // BRANCH
283 , RS_BRCPLX_MORE // BRACE_COMPLEX and trying one more match
284 , RS_BRCPLX_LONG // BRACE_COMPLEX and trying longest match
285 , RS_BRCPLX_SHORT // BRACE_COMPLEX and trying shortest match
286 , RS_NOMATCH // NOMATCH
287 , RS_BEHIND1 // BEHIND / NOBEHIND matching rest
288 , RS_BEHIND2 // BEHIND / NOBEHIND matching behind part
289 , RS_STAR_LONG // STAR/PLUS/BRACE_SIMPLE longest match
290 , RS_STAR_SHORT // STAR/PLUS/BRACE_SIMPLE shortest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200291} regstate_T;
292
293/*
294 * Structure used to save the current input state, when it needs to be
295 * restored after trying a match. Used by reg_save() and reg_restore().
296 * Also stores the length of "backpos".
297 */
298typedef struct
299{
// Only one member of the union is meaningful at a time: "ptr" for a
// single-line regexp, "pos" for a multi-line regexp.
300 union
301 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200302 char_u *ptr; // rex.input pointer, for single-line regexp
303 lpos_T pos; // rex.input pos, for multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200304 } rs_u;
305 int rs_len; // the stored length of "backpos"
306} regsave_T;
307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200308// struct to save start/end pointer/position in for \(\)
// One value holds either a start or an end, not both (regbehind_T keeps
// separate save_start[] and save_end[] arrays of this type).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200309typedef struct
310{
311 union
312 {
// NOTE(review): presumably "ptr" is used for a single-line regexp and "pos"
// for a multi-line regexp, like the union in regsave_T - confirm.
313 char_u *ptr;
314 lpos_T pos;
315 } se_u;
316} save_se_T;
317
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200318// used for BEHIND and NOBEHIND matching
// According to the comment on "regstack" an item of this type is sometimes
// stored on that stack in front of a regitem_T.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200319typedef struct regbehind_S
320{
// NOTE(review): presumably the input state after the look-behind item and the
// state while matching the behind part (compare RS_BEHIND1 / RS_BEHIND2) -
// confirm in regmatch().
321 regsave_T save_after;
322 regsave_T save_behind;
323 int save_need_clear_subexpr;
// One saved start and one saved end per sub-expression slot (NSUBEXP each).
324 save_se_T save_start[NSUBEXP];
325 save_se_T save_end[NSUBEXP];
326} regbehind_T;
327
328/*
329 * When there are alternatives a regitem_T is put on the regstack to remember
330 * what we are doing.
331 * Before it may be another type of item (a regstar_T or a regbehind_T),
332 * depending on rs_state, to remember more things.
333 */
334typedef struct regitem_S
335{
336 regstate_T rs_state; // what we are doing, one of RS_ above
337 short rs_no; // submatch nr or BEHIND/NOBEHIND
338 char_u *rs_scan; // current node in program
// The union is as large as its biggest member; which member is in use is not
// recorded here.  NOTE(review): presumably implied by rs_state - confirm.
339 union
340 {
341 save_se_T sesave;
342 regsave_T regsave;
343 } rs_un; // room for saving rex.input
344} regitem_T;
345
346
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200347// used for STAR, PLUS and BRACE_SIMPLE matching
// According to the comment on "regstack" an item of this type is sometimes
// stored on that stack in front of a regitem_T.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200348typedef struct regstar_S
349{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200350 int nextb; // next byte
351 int nextb_ic; // next byte reverse case
// NOTE(review): presumably the number of repeats matched so far and the
// allowed minimum / maximum number of repeats - confirm in regmatch().
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200352 long count;
353 long minval;
354 long maxval;
355} regstar_T;
356
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200357// used to store input position when a BACK was encountered, so that we know if
358// we made any progress since the last time.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200359typedef struct backpos_S
360{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200361 char_u *bp_scan; // "scan" where BACK was encountered
362 regsave_T bp_pos; // last input position
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363} backpos_T;
364
365/*
366 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
367 * to avoid invoking malloc() and free() often.
368 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
369 * or regbehind_T.
370 * "backpos" is a table with backpos_T items for BACK
371 */
372static garray_T regstack = {0, 0, 0, 0, NULL};
373static garray_T backpos = {0, 0, 0, 0, NULL};
374
// NOTE(review): presumably the end position used while matching BEHIND /
// NOBEHIND (compare the BHPOS opcode) - confirm in regmatch().
375static regsave_T behind_pos;
376
377/*
378 * Both for regstack and backpos tables we use the following strategy of
379 * allocation (to reduce malloc/free calls):
380 * - Initial size is fairly small.
381 * - When needed, the tables are grown bigger (8 times at first, double after
382 * that).
383 * - After executing the match we free the memory only if the array has grown.
384 * Thus the memory is kept allocated when it's at the initial size.
385 * This makes it fast while not keeping a lot of memory allocated.
386 * A three times speed increase was observed when using many simple patterns.
387 */
// Initial sizes of "regstack" and "backpos".
// NOTE(review): the unit (bytes or items) cannot be told from the definitions
// alone - check where the two growarrays are set up.
388#define REGSTACK_INITIAL 2048
389#define BACKPOS_INITIAL 64
390
391/*
392 * Opcode notes:
393 *
394 * BRANCH The set of branches constituting a single choice are hooked
395 * together with their "next" pointers, since precedence prevents
396 * anything being concatenated to any individual branch. The
397 * "next" pointer of the last BRANCH in a choice points to the
398 * thing following the whole choice. This is also where the
399 * final "next" pointer of each individual branch points; each
400 * branch starts with the operand node of a BRANCH node.
401 *
402 * BACK Normal "next" pointers all implicitly point forward; BACK
403 * exists to make loop structures possible.
404 *
405 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
406 * BRANCH structures using BACK. Simple cases (one character
407 * per match) are implemented with STAR and PLUS for speed
408 * and to minimize recursive plunges.
409 *
410 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
411 * node, and defines the min and max limits to be used for that
412 * node.
413 *
414 * MOPEN,MCLOSE ...are numbered at compile time.
415 * ZOPEN,ZCLOSE ...ditto
416 */
417
/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 8-bit bytes, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 *
 * All macros below take a pointer to the opcode byte of a node.  Most of them
 * evaluate "p" more than once, so "p" must not have side effects.
 */
// Opcode of the node.
#define OP(p)           ((int)*(p))
// Value of the two-byte "next" field (bytes 1 and 2, high byte first).
#define NEXT(p)         (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
// Pointer to the operand, which starts right after the three-byte node head.
#define OPERAND(p)      ((p) + 3)
// Obtain an operand that was stored as four bytes, MSB first.
// The bytes are combined as unsigned long and converted to long at the end:
// shifting a byte >= 0x80 left by 24 bits in a signed 32-bit long would be
// undefined behavior.
#define OPERAND_MIN(p)  ((long)(((unsigned long)(p)[3] << 24) \
                            + ((unsigned long)(p)[4] << 16) \
                            + ((unsigned long)(p)[5] << 8) \
                            + (unsigned long)(p)[6]))
// Obtain a second operand stored as four bytes.
#define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
// Obtain a second single-byte operand stored after a four bytes operand.
// Still an lvalue; parenthesized so it is safe inside any expression.
#define OPERAND_CMP(p)  ((p)[7])
438
// Forward declarations (the definitions are expected further down in this
// file).
439static char_u *reg(int paren, int *flagp);
440
// Only declared when BT_REGEXP_DUMP is defined.
441#ifdef BT_REGEXP_DUMP
442static void regdump(char_u *, bt_regprog_T *);
443#endif
444
445static int re_num_cmp(long_u val, char_u *scan);
446
// regprop() and the "regnarrate" flag only exist in DEBUG builds.
447#ifdef DEBUG
448static char_u *regprop(char_u *);
449
450static int regnarrate = 0;
451#endif
452
453
454/*
455 * Setup to parse the regexp. Used once to get the length and once to do it.
456 */
457 static void
458regcomp_start(
459 char_u *expr,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200460 int re_flags) // see vim_regcomp()
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200461{
462 initchr(expr);
463 if (re_flags & RE_MAGIC)
464 reg_magic = MAGIC_ON;
465 else
466 reg_magic = MAGIC_OFF;
467 reg_string = (re_flags & RE_STRING);
468 reg_strict = (re_flags & RE_STRICT);
469 get_cpo_flags();
470
471 num_complex_braces = 0;
472 regnpar = 1;
Bram Moolenaara80faa82020-04-12 19:37:17 +0200473 CLEAR_FIELD(had_endbrace);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200474#ifdef FEAT_SYN_HL
475 regnzpar = 1;
476 re_has_z = 0;
477#endif
478 regsize = 0L;
479 reg_toolong = FALSE;
480 regflags = 0;
481#if defined(FEAT_SYN_HL) || defined(PROTO)
482 had_eol = FALSE;
483#endif
484}
485
486/*
487 * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for
488 * character "c".
489 */
490 static int
491use_multibytecode(int c)
492{
493 return has_mbyte && (*mb_char2len)(c) > 1
494 && (re_multi_type(peekchr()) != NOT_MULTI
495 || (enc_utf8 && utf_iscomposing(c)));
496}
497
498/*
499 * Emit (if appropriate) a byte of code
500 */
501 static void
502regc(int b)
503{
504 if (regcode == JUST_CALC_SIZE)
505 regsize++;
506 else
507 *regcode++ = b;
508}
509
510/*
511 * Emit (if appropriate) a multi-byte character of code
512 */
513 static void
514regmbc(int c)
515{
516 if (!has_mbyte && c > 0xff)
517 return;
518 if (regcode == JUST_CALC_SIZE)
519 regsize += (*mb_char2len)(c);
520 else
521 regcode += (*mb_char2bytes)(c, regcode);
522}
523
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200524
525/*
526 * Produce the bytes for equivalence class "c".
527 * Currently only handles latin1, latin9 and utf-8.
528 * NOTE: When changing this function, also change nfa_emit_equi_class()
529 */
530 static void
531reg_equi_class(int c)
532{
533 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
534 || STRCMP(p_enc, "iso-8859-15") == 0)
535 {
536#ifdef EBCDIC
537 int i;
538
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200539 // This might be slower than switch/case below.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200540 for (i = 0; i < 16; i++)
541 {
542 if (vim_strchr(EQUIVAL_CLASS_C[i], c) != NULL)
543 {
544 char *p = EQUIVAL_CLASS_C[i];
545
546 while (*p != 0)
547 regmbc(*p++);
548 return;
549 }
550 }
551#else
552 switch (c)
553 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200554 // Do not use '\300' style, it results in a negative number.
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200555 case 'A': case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc4:
556 case 0xc5: case 0x100: case 0x102: case 0x104: case 0x1cd:
557 case 0x1de: case 0x1e0: case 0x1fa: case 0x202: case 0x226:
558 case 0x23a: case 0x1e00: case 0x1ea0: case 0x1ea2: case 0x1ea4:
559 case 0x1ea6: case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
560 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
561 regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2);
562 regmbc(0xc3); regmbc(0xc4); regmbc(0xc5);
563 regmbc(0x100); regmbc(0x102); regmbc(0x104);
564 regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0);
565 regmbc(0x1fa); regmbc(0x202); regmbc(0x226);
566 regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0);
567 regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6);
568 regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac);
569 regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2);
570 regmbc(0x1eb4); regmbc(0x1eb6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200571 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200572 case 'B': case 0x181: case 0x243: case 0x1e02:
573 case 0x1e04: case 0x1e06:
574 regmbc('B');
575 regmbc(0x181); regmbc(0x243); regmbc(0x1e02);
576 regmbc(0x1e04); regmbc(0x1e06);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200577 return;
578 case 'C': case 0xc7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200579 case 0x106: case 0x108: case 0x10a: case 0x10c: case 0x187:
580 case 0x23b: case 0x1e08: case 0xa792:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200581 regmbc('C'); regmbc(0xc7);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200582 regmbc(0x106); regmbc(0x108); regmbc(0x10a);
583 regmbc(0x10c); regmbc(0x187); regmbc(0x23b);
584 regmbc(0x1e08); regmbc(0xa792);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200585 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200586 case 'D': case 0x10e: case 0x110: case 0x18a:
587 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
588 case 0x1e12:
589 regmbc('D'); regmbc(0x10e); regmbc(0x110);
590 regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c);
591 regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200592 return;
593 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200594 case 0x112: case 0x114: case 0x116: case 0x118: case 0x11a:
595 case 0x204: case 0x206: case 0x228: case 0x246: case 0x1e14:
596 case 0x1e16: case 0x1e18: case 0x1e1a: case 0x1e1c:
597 case 0x1eb8: case 0x1eba: case 0x1ebc: case 0x1ebe:
598 case 0x1ec0: case 0x1ec2: case 0x1ec4: case 0x1ec6:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200599 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200600 regmbc(0xca); regmbc(0xcb); regmbc(0x112);
601 regmbc(0x114); regmbc(0x116); regmbc(0x118);
602 regmbc(0x11a); regmbc(0x204); regmbc(0x206);
603 regmbc(0x228); regmbc(0x246); regmbc(0x1e14);
604 regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a);
605 regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba);
606 regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0);
607 regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200608 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200609 case 'F': case 0x191: case 0x1e1e: case 0xa798:
610 regmbc('F'); regmbc(0x191); regmbc(0x1e1e);
611 regmbc(0xa798);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200612 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200613 case 'G': case 0x11c: case 0x11e: case 0x120:
614 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
615 case 0x1f4: case 0x1e20: case 0xa7a0:
616 regmbc('G'); regmbc(0x11c); regmbc(0x11e);
617 regmbc(0x120); regmbc(0x122); regmbc(0x193);
618 regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4);
619 regmbc(0x1e20); regmbc(0xa7a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200620 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200621 case 'H': case 0x124: case 0x126: case 0x21e:
622 case 0x1e22: case 0x1e24: case 0x1e26:
623 case 0x1e28: case 0x1e2a: case 0x2c67:
624 regmbc('H'); regmbc(0x124); regmbc(0x126);
625 regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24);
626 regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a);
627 regmbc(0x2c67);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200628 return;
629 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200630 case 0x128: case 0x12a: case 0x12c: case 0x12e:
631 case 0x130: case 0x197: case 0x1cf: case 0x208:
632 case 0x20a: case 0x1e2c: case 0x1e2e: case 0x1ec8:
633 case 0x1eca:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200634 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200635 regmbc(0xce); regmbc(0xcf); regmbc(0x128);
636 regmbc(0x12a); regmbc(0x12c); regmbc(0x12e);
637 regmbc(0x130); regmbc(0x197); regmbc(0x1cf);
638 regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c);
639 regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200640 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200641 case 'J': case 0x134: case 0x248:
642 regmbc('J'); regmbc(0x134); regmbc(0x248);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200643 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200644 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
645 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
646 regmbc('K'); regmbc(0x136); regmbc(0x198);
647 regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32);
648 regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200649 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200650 case 'L': case 0x139: case 0x13b: case 0x13d: case 0x13f:
651 case 0x141: case 0x23d: case 0x1e36: case 0x1e38:
652 case 0x1e3a: case 0x1e3c: case 0x2c60:
653 regmbc('L'); regmbc(0x139); regmbc(0x13b);
654 regmbc(0x13d); regmbc(0x13f); regmbc(0x141);
655 regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38);
656 regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200657 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200658 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
659 regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40);
660 regmbc(0x1e42);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200661 return;
662 case 'N': case 0xd1:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200663 case 0x143: case 0x145: case 0x147: case 0x1f8:
664 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
665 case 0xa7a4:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200666 regmbc('N'); regmbc(0xd1);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200667 regmbc(0x143); regmbc(0x145); regmbc(0x147);
668 regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46);
669 regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200670 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200671 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd6:
672 case 0xd8: case 0x14c: case 0x14e: case 0x150: case 0x19f:
673 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec: case 0x1fe:
674 case 0x20c: case 0x20e: case 0x22a: case 0x22c: case 0x22e:
675 case 0x230: case 0x1e4c: case 0x1e4e: case 0x1e50: case 0x1e52:
676 case 0x1ecc: case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
677 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc: case 0x1ede:
678 case 0x1ee0: case 0x1ee2:
679 regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4);
680 regmbc(0xd5); regmbc(0xd6); regmbc(0xd8);
681 regmbc(0x14c); regmbc(0x14e); regmbc(0x150);
682 regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1);
683 regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe);
684 regmbc(0x20c); regmbc(0x20e); regmbc(0x22a);
685 regmbc(0x22c); regmbc(0x22e); regmbc(0x230);
686 regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50);
687 regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece);
688 regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4);
689 regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda);
690 regmbc(0x1edc); regmbc(0x1ede); regmbc(0x1ee0);
691 regmbc(0x1ee2);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200692 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200693 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
694 regmbc('P'); regmbc(0x1a4); regmbc(0x1e54);
695 regmbc(0x1e56); regmbc(0x2c63);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200696 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200697 case 'Q': case 0x24a:
698 regmbc('Q'); regmbc(0x24a);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200699 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200700 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
701 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
702 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
703 regmbc('R'); regmbc(0x154); regmbc(0x156);
704 regmbc(0x210); regmbc(0x212); regmbc(0x158);
705 regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a);
706 regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64);
707 regmbc(0xa7a6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200708 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200709 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
710 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
711 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
712 regmbc('S'); regmbc(0x15a); regmbc(0x15c);
713 regmbc(0x15e); regmbc(0x160); regmbc(0x218);
714 regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64);
715 regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e);
716 regmbc(0xa7a8);
717 return;
718 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
719 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
720 case 0x1e6e: case 0x1e70:
721 regmbc('T'); regmbc(0x162); regmbc(0x164);
722 regmbc(0x166); regmbc(0x1ac); regmbc(0x23e);
723 regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a);
724 regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200725 return;
726 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200727 case 0x168: case 0x16a: case 0x16c: case 0x16e:
728 case 0x170: case 0x172: case 0x1af: case 0x1d3:
729 case 0x1d5: case 0x1d7: case 0x1d9: case 0x1db:
730 case 0x214: case 0x216: case 0x244: case 0x1e72:
731 case 0x1e74: case 0x1e76: case 0x1e78: case 0x1e7a:
732 case 0x1ee4: case 0x1ee6: case 0x1ee8: case 0x1eea:
733 case 0x1eec: case 0x1eee: case 0x1ef0:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200734 regmbc('U'); regmbc(0xd9); regmbc(0xda);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200735 regmbc(0xdb); regmbc(0xdc); regmbc(0x168);
736 regmbc(0x16a); regmbc(0x16c); regmbc(0x16e);
737 regmbc(0x170); regmbc(0x172); regmbc(0x1af);
738 regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7);
739 regmbc(0x1d9); regmbc(0x1db); regmbc(0x214);
740 regmbc(0x216); regmbc(0x244); regmbc(0x1e72);
741 regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78);
742 regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6);
743 regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec);
744 regmbc(0x1eee); regmbc(0x1ef0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200745 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200746 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
747 regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c);
748 regmbc(0x1e7e);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200749 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200750 case 'W': case 0x174: case 0x1e80: case 0x1e82:
751 case 0x1e84: case 0x1e86: case 0x1e88:
752 regmbc('W'); regmbc(0x174); regmbc(0x1e80);
753 regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86);
754 regmbc(0x1e88);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200755 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200756 case 'X': case 0x1e8a: case 0x1e8c:
757 regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200758 return;
759 case 'Y': case 0xdd:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200760 case 0x176: case 0x178: case 0x1b3: case 0x232: case 0x24e:
761 case 0x1e8e: case 0x1ef2: case 0x1ef6: case 0x1ef4: case 0x1ef8:
762 regmbc('Y'); regmbc(0xdd); regmbc(0x176);
763 regmbc(0x178); regmbc(0x1b3); regmbc(0x232);
764 regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2);
765 regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200766 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200767 case 'Z': case 0x179: case 0x17b: case 0x17d: case 0x1b5:
768 case 0x1e90: case 0x1e92: case 0x1e94: case 0x2c6b:
769 regmbc('Z'); regmbc(0x179); regmbc(0x17b);
770 regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90);
771 regmbc(0x1e92); regmbc(0x1e94); regmbc(0x2c6b);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200772 return;
773 case 'a': case 0xe0: case 0xe1: case 0xe2:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200774 case 0xe3: case 0xe4: case 0xe5: case 0x101: case 0x103:
775 case 0x105: case 0x1ce: case 0x1df: case 0x1e1: case 0x1fb:
776 case 0x201: case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
777 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
778 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
779 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
780 case 0x1eb7: case 0x2c65:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200781 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
782 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200783 regmbc(0xe5); regmbc(0x101); regmbc(0x103);
784 regmbc(0x105); regmbc(0x1ce); regmbc(0x1df);
785 regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201);
786 regmbc(0x203); regmbc(0x227); regmbc(0x1d8f);
787 regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1);
788 regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7);
789 regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead);
790 regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3);
791 regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200792 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200793 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
794 case 0x1e03: case 0x1e05: case 0x1e07:
795 regmbc('b');
796 regmbc(0x180); regmbc(0x253); regmbc(0x1d6c);
797 regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05);
798 regmbc(0x1e07);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200799 return;
800 case 'c': case 0xe7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200801 case 0x107: case 0x109: case 0x10b: case 0x10d: case 0x188:
802 case 0x23c: case 0x1e09: case 0xa793: case 0xa794:
803 regmbc('c'); regmbc(0xe7); regmbc(0x107);
804 regmbc(0x109); regmbc(0x10b); regmbc(0x10d);
805 regmbc(0x188); regmbc(0x23c); regmbc(0x1e09);
806 regmbc(0xa793); regmbc(0xa794);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200807 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200808 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
809 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d:
810 case 0x1e0f: case 0x1e11: case 0x1e13:
811 regmbc('d'); regmbc(0x10f); regmbc(0x111);
812 regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81);
813 regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d);
814 regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200815 return;
816 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200817 case 0x113: case 0x115: case 0x117: case 0x119:
818 case 0x11b: case 0x205: case 0x207: case 0x229:
819 case 0x247: case 0x1d92: case 0x1e15: case 0x1e17:
820 case 0x1e19: case 0x1e1b: case 0x1eb9: case 0x1ebb:
821 case 0x1e1d: case 0x1ebd: case 0x1ebf: case 0x1ec1:
822 case 0x1ec3: case 0x1ec5: case 0x1ec7:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200823 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200824 regmbc(0xea); regmbc(0xeb); regmbc(0x113);
825 regmbc(0x115); regmbc(0x117); regmbc(0x119);
826 regmbc(0x11b); regmbc(0x205); regmbc(0x207);
827 regmbc(0x229); regmbc(0x247); regmbc(0x1d92);
828 regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19);
829 regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9);
830 regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf);
831 regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5);
832 regmbc(0x1ec7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200833 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200834 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
835 case 0x1e1f: case 0xa799:
836 regmbc('f'); regmbc(0x192); regmbc(0x1d6e);
837 regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799);
838 return;
839 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
840 case 0x1e5: case 0x1e7: case 0x260: case 0x1f5: case 0x1d83:
841 case 0x1e21: case 0xa7a1:
842 regmbc('g'); regmbc(0x11d); regmbc(0x11f);
843 regmbc(0x121); regmbc(0x123); regmbc(0x1e5);
844 regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260);
845 regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200846 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200847 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
848 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
849 case 0x1e96: case 0x2c68: case 0xa795:
850 regmbc('h'); regmbc(0x125); regmbc(0x127);
851 regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25);
852 regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b);
853 regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200854 return;
855 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200856 case 0x129: case 0x12b: case 0x12d: case 0x12f:
857 case 0x1d0: case 0x209: case 0x20b: case 0x268:
858 case 0x1d96: case 0x1e2d: case 0x1e2f: case 0x1ec9:
859 case 0x1ecb:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200860 regmbc('i'); regmbc(0xec); regmbc(0xed);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200861 regmbc(0xee); regmbc(0xef); regmbc(0x129);
862 regmbc(0x12b); regmbc(0x12d); regmbc(0x12f);
863 regmbc(0x1d0); regmbc(0x209); regmbc(0x20b);
864 regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d);
865 regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200866 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200867 case 'j': case 0x135: case 0x1f0: case 0x249:
868 regmbc('j'); regmbc(0x135); regmbc(0x1f0);
869 regmbc(0x249);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200870 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200871 case 'k': case 0x137: case 0x199: case 0x1e9:
872 case 0x1d84: case 0x1e31: case 0x1e33: case 0x1e35:
873 case 0x2c6a: case 0xa741:
874 regmbc('k'); regmbc(0x137); regmbc(0x199);
875 regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31);
876 regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a);
877 regmbc(0xa741);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200878 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200879 case 'l': case 0x13a: case 0x13c: case 0x13e:
880 case 0x140: case 0x142: case 0x19a: case 0x1e37:
881 case 0x1e39: case 0x1e3b: case 0x1e3d: case 0x2c61:
882 regmbc('l'); regmbc(0x13a); regmbc(0x13c);
883 regmbc(0x13e); regmbc(0x140); regmbc(0x142);
884 regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39);
885 regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200886 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200887 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
888 regmbc('m'); regmbc(0x1d6f); regmbc(0x1e3f);
889 regmbc(0x1e41); regmbc(0x1e43);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200890 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200891 case 'n': case 0xf1: case 0x144: case 0x146: case 0x148:
892 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87:
893 case 0x1e45: case 0x1e47: case 0x1e49: case 0x1e4b:
894 case 0xa7a5:
895 regmbc('n'); regmbc(0xf1); regmbc(0x144);
896 regmbc(0x146); regmbc(0x148); regmbc(0x149);
897 regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87);
898 regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49);
899 regmbc(0x1e4b); regmbc(0xa7a5);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200900 return;
901 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200902 case 0xf6: case 0xf8: case 0x14d: case 0x14f: case 0x151:
903 case 0x1a1: case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
904 case 0x20d: case 0x20f: case 0x22b: case 0x22d: case 0x22f:
905 case 0x231: case 0x275: case 0x1e4d: case 0x1e4f:
906 case 0x1e51: case 0x1e53: case 0x1ecd: case 0x1ecf:
907 case 0x1ed1: case 0x1ed3: case 0x1ed5: case 0x1ed7:
908 case 0x1ed9: case 0x1edb: case 0x1edd: case 0x1edf:
909 case 0x1ee1: case 0x1ee3:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200910 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
911 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200912 regmbc(0xf8); regmbc(0x14d); regmbc(0x14f);
913 regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2);
914 regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff);
915 regmbc(0x20d); regmbc(0x20f); regmbc(0x22b);
916 regmbc(0x22d); regmbc(0x22f); regmbc(0x231);
917 regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f);
918 regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd);
919 regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3);
920 regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9);
921 regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf);
922 regmbc(0x1ee1); regmbc(0x1ee3);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200923 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200924 case 'p': case 0x1a5: case 0x1d71: case 0x1d88: case 0x1d7d:
925 case 0x1e55: case 0x1e57:
926 regmbc('p'); regmbc(0x1a5); regmbc(0x1d71);
927 regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55);
928 regmbc(0x1e57);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200929 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200930 case 'q': case 0x24b: case 0x2a0:
931 regmbc('q'); regmbc(0x24b); regmbc(0x2a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200932 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200933 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
934 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
935 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
936 case 0xa7a7:
937 regmbc('r'); regmbc(0x155); regmbc(0x157);
938 regmbc(0x159); regmbc(0x211); regmbc(0x213);
939 regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73);
940 regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d);
941 regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f);
942 regmbc(0xa7a7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200943 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200944 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
945 case 0x1e61: case 0x219: case 0x23f: case 0x1d74: case 0x1d8a:
946 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
947 regmbc('s'); regmbc(0x15b); regmbc(0x15d);
948 regmbc(0x15f); regmbc(0x161); regmbc(0x23f);
949 regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a);
950 regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65);
951 regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9);
952 return;
953 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
954 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
955 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
956 regmbc('t'); regmbc(0x163); regmbc(0x165);
957 regmbc(0x167); regmbc(0x1ab); regmbc(0x21b);
958 regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75);
959 regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f);
960 regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200961 return;
962 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200963 case 0x169: case 0x16b: case 0x16d: case 0x16f:
964 case 0x171: case 0x173: case 0x1b0: case 0x1d4:
965 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc:
966 case 0x215: case 0x217: case 0x289: case 0x1e73:
967 case 0x1d7e: case 0x1d99: case 0x1e75: case 0x1e77:
968 case 0x1e79: case 0x1e7b: case 0x1ee5: case 0x1ee7:
969 case 0x1ee9: case 0x1eeb: case 0x1eed: case 0x1eef:
970 case 0x1ef1:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200971 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200972 regmbc(0xfb); regmbc(0xfc); regmbc(0x169);
973 regmbc(0x16b); regmbc(0x16d); regmbc(0x16f);
974 regmbc(0x171); regmbc(0x173); regmbc(0x1d6);
975 regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc);
976 regmbc(0x215); regmbc(0x217); regmbc(0x1b0);
977 regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e);
978 regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75);
979 regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b);
980 regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9);
981 regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef);
982 regmbc(0x1ef1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200983 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200984 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
985 regmbc('v'); regmbc(0x28b); regmbc(0x1d8c);
986 regmbc(0x1e7d); regmbc(0x1e7f);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200987 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200988 case 'w': case 0x175: case 0x1e81: case 0x1e83:
989 case 0x1e85: case 0x1e87: case 0x1e89: case 0x1e98:
990 regmbc('w'); regmbc(0x175); regmbc(0x1e81);
991 regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87);
992 regmbc(0x1e89); regmbc(0x1e98);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200993 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200994 case 'x': case 0x1e8b: case 0x1e8d:
995 regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200996 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200997 case 'y': case 0xfd: case 0xff: case 0x177: case 0x1b4:
998 case 0x233: case 0x24f: case 0x1e8f: case 0x1e99: case 0x1ef3:
999 case 0x1ef5: case 0x1ef7: case 0x1ef9:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001000 regmbc('y'); regmbc(0xfd); regmbc(0xff);
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001001 regmbc(0x177); regmbc(0x1b4); regmbc(0x233);
1002 regmbc(0x24f); regmbc(0x1e8f); regmbc(0x1e99);
1003 regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7);
1004 regmbc(0x1ef9);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001005 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001006 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1007 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1008 case 0x1e95: case 0x2c6c:
1009 regmbc('z'); regmbc(0x17a); regmbc(0x17c);
1010 regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76);
1011 regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93);
1012 regmbc(0x1e95); regmbc(0x2c6c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001013 return;
1014 }
1015#endif
1016 }
1017 regmbc(c);
1018}
1019
1020/*
1021 * Emit a node.
1022 * Return pointer to generated code.
1023 */
1024 static char_u *
1025regnode(int op)
1026{
1027 char_u *ret;
1028
1029 ret = regcode;
1030 if (ret == JUST_CALC_SIZE)
1031 regsize += 3;
1032 else
1033 {
1034 *regcode++ = op;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001035 *regcode++ = NUL; // Null "next" pointer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001036 *regcode++ = NUL;
1037 }
1038 return ret;
1039}
1040
1041/*
1042 * Write a long as four bytes at "p" and return pointer to the next char.
1043 */
1044 static char_u *
1045re_put_long(char_u *p, long_u val)
1046{
1047 *p++ = (char_u) ((val >> 24) & 0377);
1048 *p++ = (char_u) ((val >> 16) & 0377);
1049 *p++ = (char_u) ((val >> 8) & 0377);
1050 *p++ = (char_u) (val & 0377);
1051 return p;
1052}
1053
1054/*
1055 * regnext - dig the "next" pointer out of a node
1056 * Returns NULL when calculating size, when there is no next item and when
1057 * there is an error.
1058 */
1059 static char_u *
1060regnext(char_u *p)
1061{
1062 int offset;
1063
1064 if (p == JUST_CALC_SIZE || reg_toolong)
1065 return NULL;
1066
1067 offset = NEXT(p);
1068 if (offset == 0)
1069 return NULL;
1070
1071 if (OP(p) == BACK)
1072 return p - offset;
1073 else
1074 return p + offset;
1075}
1076
1077/*
1078 * Set the next-pointer at the end of a node chain.
1079 */
1080 static void
1081regtail(char_u *p, char_u *val)
1082{
1083 char_u *scan;
1084 char_u *temp;
1085 int offset;
1086
1087 if (p == JUST_CALC_SIZE)
1088 return;
1089
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001090 // Find last node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001091 scan = p;
1092 for (;;)
1093 {
1094 temp = regnext(scan);
1095 if (temp == NULL)
1096 break;
1097 scan = temp;
1098 }
1099
1100 if (OP(scan) == BACK)
1101 offset = (int)(scan - val);
1102 else
1103 offset = (int)(val - scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001104 // When the offset uses more than 16 bits it can no longer fit in the two
1105 // bytes available. Use a global flag to avoid having to check return
1106 // values in too many places.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001107 if (offset > 0xffff)
1108 reg_toolong = TRUE;
1109 else
1110 {
1111 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
1112 *(scan + 2) = (char_u) (offset & 0377);
1113 }
1114}
1115
1116/*
1117 * Like regtail, on item after a BRANCH; nop if none.
1118 */
1119 static void
1120regoptail(char_u *p, char_u *val)
1121{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001122 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001123 if (p == NULL || p == JUST_CALC_SIZE
1124 || (OP(p) != BRANCH
1125 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
1126 return;
1127 regtail(OPERAND(p), val);
1128}
1129
1130/*
1131 * Insert an operator in front of already-emitted operand
1132 *
1133 * Means relocating the operand.
1134 */
1135 static void
1136reginsert(int op, char_u *opnd)
1137{
1138 char_u *src;
1139 char_u *dst;
1140 char_u *place;
1141
1142 if (regcode == JUST_CALC_SIZE)
1143 {
1144 regsize += 3;
1145 return;
1146 }
1147 src = regcode;
1148 regcode += 3;
1149 dst = regcode;
1150 while (src > opnd)
1151 *--dst = *--src;
1152
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001153 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001154 *place++ = op;
1155 *place++ = NUL;
1156 *place = NUL;
1157}
1158
1159/*
1160 * Insert an operator in front of already-emitted operand.
1161 * Add a number to the operator.
1162 */
1163 static void
1164reginsert_nr(int op, long val, char_u *opnd)
1165{
1166 char_u *src;
1167 char_u *dst;
1168 char_u *place;
1169
1170 if (regcode == JUST_CALC_SIZE)
1171 {
1172 regsize += 7;
1173 return;
1174 }
1175 src = regcode;
1176 regcode += 7;
1177 dst = regcode;
1178 while (src > opnd)
1179 *--dst = *--src;
1180
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001181 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001182 *place++ = op;
1183 *place++ = NUL;
1184 *place++ = NUL;
1185 re_put_long(place, (long_u)val);
1186}
1187
1188/*
1189 * Insert an operator in front of already-emitted operand.
1190 * The operator has the given limit values as operands. Also set next pointer.
1191 *
1192 * Means relocating the operand.
1193 */
1194 static void
1195reginsert_limits(
1196 int op,
1197 long minval,
1198 long maxval,
1199 char_u *opnd)
1200{
1201 char_u *src;
1202 char_u *dst;
1203 char_u *place;
1204
1205 if (regcode == JUST_CALC_SIZE)
1206 {
1207 regsize += 11;
1208 return;
1209 }
1210 src = regcode;
1211 regcode += 11;
1212 dst = regcode;
1213 while (src > opnd)
1214 *--dst = *--src;
1215
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001216 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001217 *place++ = op;
1218 *place++ = NUL;
1219 *place++ = NUL;
1220 place = re_put_long(place, (long_u)minval);
1221 place = re_put_long(place, (long_u)maxval);
1222 regtail(opnd, place);
1223}
1224
1225/*
1226 * Return TRUE if the back reference is legal. We must have seen the close
1227 * brace.
1228 * TODO: Should also check that we don't refer to something that is repeated
1229 * (+*=): what instance of the repetition should we match?
1230 */
1231 static int
1232seen_endbrace(int refnum)
1233{
1234 if (!had_endbrace[refnum])
1235 {
1236 char_u *p;
1237
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001238 // Trick: check if "@<=" or "@<!" follows, in which case
1239 // the \1 can appear before the referenced match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001240 for (p = regparse; *p != NUL; ++p)
1241 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
1242 break;
1243 if (*p == NUL)
1244 {
1245 emsg(_("E65: Illegal back reference"));
1246 rc_did_emsg = TRUE;
1247 return FALSE;
1248 }
1249 }
1250 return TRUE;
1251}
1252
/*
 * Parse the lowest level.
 *
 * Optimization: gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Don't do this when one_exactly is set.
 */
1259 */
1260 static char_u *
1261regatom(int *flagp)
1262{
1263 char_u *ret;
1264 int flags;
1265 int c;
1266 char_u *p;
1267 int extra = 0;
1268 int save_prev_at_start = prev_at_start;
1269
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001270 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001271
1272 c = getchr();
1273 switch (c)
1274 {
1275 case Magic('^'):
1276 ret = regnode(BOL);
1277 break;
1278
1279 case Magic('$'):
1280 ret = regnode(EOL);
1281#if defined(FEAT_SYN_HL) || defined(PROTO)
1282 had_eol = TRUE;
1283#endif
1284 break;
1285
1286 case Magic('<'):
1287 ret = regnode(BOW);
1288 break;
1289
1290 case Magic('>'):
1291 ret = regnode(EOW);
1292 break;
1293
1294 case Magic('_'):
1295 c = no_Magic(getchr());
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001296 if (c == '^') // "\_^" is start-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001297 {
1298 ret = regnode(BOL);
1299 break;
1300 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001301 if (c == '$') // "\_$" is end-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001302 {
1303 ret = regnode(EOL);
1304#if defined(FEAT_SYN_HL) || defined(PROTO)
1305 had_eol = TRUE;
1306#endif
1307 break;
1308 }
1309
1310 extra = ADD_NL;
1311 *flagp |= HASNL;
1312
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001313 // "\_[" is character range plus newline
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001314 if (c == '[')
1315 goto collection;
1316
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001317 // "\_x" is character class plus newline
1318 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001319
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001320 // Character classes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001321 case Magic('.'):
1322 case Magic('i'):
1323 case Magic('I'):
1324 case Magic('k'):
1325 case Magic('K'):
1326 case Magic('f'):
1327 case Magic('F'):
1328 case Magic('p'):
1329 case Magic('P'):
1330 case Magic('s'):
1331 case Magic('S'):
1332 case Magic('d'):
1333 case Magic('D'):
1334 case Magic('x'):
1335 case Magic('X'):
1336 case Magic('o'):
1337 case Magic('O'):
1338 case Magic('w'):
1339 case Magic('W'):
1340 case Magic('h'):
1341 case Magic('H'):
1342 case Magic('a'):
1343 case Magic('A'):
1344 case Magic('l'):
1345 case Magic('L'):
1346 case Magic('u'):
1347 case Magic('U'):
1348 p = vim_strchr(classchars, no_Magic(c));
1349 if (p == NULL)
1350 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1351
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001352 // When '.' is followed by a composing char ignore the dot, so that
1353 // the composing char is matched here.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001354 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1355 {
1356 c = getchr();
1357 goto do_multibyte;
1358 }
1359 ret = regnode(classcodes[p - classchars] + extra);
1360 *flagp |= HASWIDTH | SIMPLE;
1361 break;
1362
1363 case Magic('n'):
1364 if (reg_string)
1365 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001366 // In a string "\n" matches a newline character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001367 ret = regnode(EXACTLY);
1368 regc(NL);
1369 regc(NUL);
1370 *flagp |= HASWIDTH | SIMPLE;
1371 }
1372 else
1373 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001374 // In buffer text "\n" matches the end of a line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001375 ret = regnode(NEWL);
1376 *flagp |= HASWIDTH | HASNL;
1377 }
1378 break;
1379
1380 case Magic('('):
1381 if (one_exactly)
1382 EMSG_ONE_RET_NULL;
1383 ret = reg(REG_PAREN, &flags);
1384 if (ret == NULL)
1385 return NULL;
1386 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1387 break;
1388
1389 case NUL:
1390 case Magic('|'):
1391 case Magic('&'):
1392 case Magic(')'):
1393 if (one_exactly)
1394 EMSG_ONE_RET_NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001395 IEMSG_RET_NULL(_(e_internal)); // Supposed to be caught earlier.
1396 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001397
1398 case Magic('='):
1399 case Magic('?'):
1400 case Magic('+'):
1401 case Magic('@'):
1402 case Magic('{'):
1403 case Magic('*'):
1404 c = no_Magic(c);
1405 EMSG3_RET_NULL(_("E64: %s%c follows nothing"),
1406 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001407 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001408
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001409 case Magic('~'): // previous substitute pattern
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001410 if (reg_prev_sub != NULL)
1411 {
1412 char_u *lp;
1413
1414 ret = regnode(EXACTLY);
1415 lp = reg_prev_sub;
1416 while (*lp != NUL)
1417 regc(*lp++);
1418 regc(NUL);
1419 if (*reg_prev_sub != NUL)
1420 {
1421 *flagp |= HASWIDTH;
1422 if ((lp - reg_prev_sub) == 1)
1423 *flagp |= SIMPLE;
1424 }
1425 }
1426 else
1427 EMSG_RET_NULL(_(e_nopresub));
1428 break;
1429
1430 case Magic('1'):
1431 case Magic('2'):
1432 case Magic('3'):
1433 case Magic('4'):
1434 case Magic('5'):
1435 case Magic('6'):
1436 case Magic('7'):
1437 case Magic('8'):
1438 case Magic('9'):
1439 {
1440 int refnum;
1441
1442 refnum = c - Magic('0');
1443 if (!seen_endbrace(refnum))
1444 return NULL;
1445 ret = regnode(BACKREF + refnum);
1446 }
1447 break;
1448
1449 case Magic('z'):
1450 {
1451 c = no_Magic(getchr());
1452 switch (c)
1453 {
1454#ifdef FEAT_SYN_HL
1455 case '(': if ((reg_do_extmatch & REX_SET) == 0)
1456 EMSG_RET_NULL(_(e_z_not_allowed));
1457 if (one_exactly)
1458 EMSG_ONE_RET_NULL;
1459 ret = reg(REG_ZPAREN, &flags);
1460 if (ret == NULL)
1461 return NULL;
1462 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1463 re_has_z = REX_SET;
1464 break;
1465
1466 case '1':
1467 case '2':
1468 case '3':
1469 case '4':
1470 case '5':
1471 case '6':
1472 case '7':
1473 case '8':
1474 case '9': if ((reg_do_extmatch & REX_USE) == 0)
1475 EMSG_RET_NULL(_(e_z1_not_allowed));
1476 ret = regnode(ZREF + c - '0');
1477 re_has_z = REX_USE;
1478 break;
1479#endif
1480
1481 case 's': ret = regnode(MOPEN + 0);
1482 if (re_mult_next("\\zs") == FAIL)
1483 return NULL;
1484 break;
1485
1486 case 'e': ret = regnode(MCLOSE + 0);
1487 if (re_mult_next("\\ze") == FAIL)
1488 return NULL;
1489 break;
1490
1491 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1492 }
1493 }
1494 break;
1495
1496 case Magic('%'):
1497 {
1498 c = no_Magic(getchr());
1499 switch (c)
1500 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001501 // () without a back reference
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001502 case '(':
1503 if (one_exactly)
1504 EMSG_ONE_RET_NULL;
1505 ret = reg(REG_NPAREN, &flags);
1506 if (ret == NULL)
1507 return NULL;
1508 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1509 break;
1510
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001511 // Catch \%^ and \%$ regardless of where they appear in the
1512 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001513 case '^':
1514 ret = regnode(RE_BOF);
1515 break;
1516
1517 case '$':
1518 ret = regnode(RE_EOF);
1519 break;
1520
1521 case '#':
1522 ret = regnode(CURSOR);
1523 break;
1524
1525 case 'V':
1526 ret = regnode(RE_VISUAL);
1527 break;
1528
1529 case 'C':
1530 ret = regnode(RE_COMPOSING);
1531 break;
1532
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001533 // \%[abc]: Emit as a list of branches, all ending at the last
1534 // branch which matches nothing.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001535 case '[':
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001536 if (one_exactly) // doesn't nest
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001537 EMSG_ONE_RET_NULL;
1538 {
1539 char_u *lastbranch;
1540 char_u *lastnode = NULL;
1541 char_u *br;
1542
1543 ret = NULL;
1544 while ((c = getchr()) != ']')
1545 {
1546 if (c == NUL)
1547 EMSG2_RET_NULL(_(e_missing_sb),
1548 reg_magic == MAGIC_ALL);
1549 br = regnode(BRANCH);
1550 if (ret == NULL)
1551 ret = br;
1552 else
1553 {
1554 regtail(lastnode, br);
1555 if (reg_toolong)
1556 return NULL;
1557 }
1558
1559 ungetchr();
1560 one_exactly = TRUE;
1561 lastnode = regatom(flagp);
1562 one_exactly = FALSE;
1563 if (lastnode == NULL)
1564 return NULL;
1565 }
1566 if (ret == NULL)
1567 EMSG2_RET_NULL(_(e_empty_sb),
1568 reg_magic == MAGIC_ALL);
1569 lastbranch = regnode(BRANCH);
1570 br = regnode(NOTHING);
1571 if (ret != JUST_CALC_SIZE)
1572 {
1573 regtail(lastnode, br);
1574 regtail(lastbranch, br);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001575 // connect all branches to the NOTHING
1576 // branch at the end
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001577 for (br = ret; br != lastnode; )
1578 {
1579 if (OP(br) == BRANCH)
1580 {
1581 regtail(br, lastbranch);
1582 if (reg_toolong)
1583 return NULL;
1584 br = OPERAND(br);
1585 }
1586 else
1587 br = regnext(br);
1588 }
1589 }
1590 *flagp &= ~(HASWIDTH | SIMPLE);
1591 break;
1592 }
1593
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001594 case 'd': // %d123 decimal
1595 case 'o': // %o123 octal
1596 case 'x': // %xab hex 2
1597 case 'u': // %uabcd hex 4
1598 case 'U': // %U1234abcd hex 8
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001599 {
1600 long i;
1601
1602 switch (c)
1603 {
1604 case 'd': i = getdecchrs(); break;
1605 case 'o': i = getoctchrs(); break;
1606 case 'x': i = gethexchrs(2); break;
1607 case 'u': i = gethexchrs(4); break;
1608 case 'U': i = gethexchrs(8); break;
1609 default: i = -1; break;
1610 }
1611
1612 if (i < 0 || i > INT_MAX)
1613 EMSG2_RET_NULL(
1614 _("E678: Invalid character after %s%%[dxouU]"),
1615 reg_magic == MAGIC_ALL);
1616 if (use_multibytecode(i))
1617 ret = regnode(MULTIBYTECODE);
1618 else
1619 ret = regnode(EXACTLY);
1620 if (i == 0)
1621 regc(0x0a);
1622 else
1623 regmbc(i);
1624 regc(NUL);
1625 *flagp |= HASWIDTH;
1626 break;
1627 }
1628
1629 default:
1630 if (VIM_ISDIGIT(c) || c == '<' || c == '>'
1631 || c == '\'')
1632 {
1633 long_u n = 0;
1634 int cmp;
1635
1636 cmp = c;
1637 if (cmp == '<' || cmp == '>')
1638 c = getchr();
1639 while (VIM_ISDIGIT(c))
1640 {
1641 n = n * 10 + (c - '0');
1642 c = getchr();
1643 }
1644 if (c == '\'' && n == 0)
1645 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001646 // "\%'m", "\%<'m" and "\%>'m": Mark
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001647 c = getchr();
1648 ret = regnode(RE_MARK);
1649 if (ret == JUST_CALC_SIZE)
1650 regsize += 2;
1651 else
1652 {
1653 *regcode++ = c;
1654 *regcode++ = cmp;
1655 }
1656 break;
1657 }
1658 else if (c == 'l' || c == 'c' || c == 'v')
1659 {
1660 if (c == 'l')
1661 {
1662 ret = regnode(RE_LNUM);
1663 if (save_prev_at_start)
1664 at_start = TRUE;
1665 }
1666 else if (c == 'c')
1667 ret = regnode(RE_COL);
1668 else
1669 ret = regnode(RE_VCOL);
1670 if (ret == JUST_CALC_SIZE)
1671 regsize += 5;
1672 else
1673 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001674 // put the number and the optional
1675 // comparator after the opcode
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001676 regcode = re_put_long(regcode, n);
1677 *regcode++ = cmp;
1678 }
1679 break;
1680 }
1681 }
1682
1683 EMSG2_RET_NULL(_("E71: Invalid character after %s%%"),
1684 reg_magic == MAGIC_ALL);
1685 }
1686 }
1687 break;
1688
1689 case Magic('['):
1690collection:
1691 {
1692 char_u *lp;
1693
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001694 // If there is no matching ']', we assume the '[' is a normal
1695 // character. This makes 'incsearch' and ":help [" work.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001696 lp = skip_anyof(regparse);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001697 if (*lp == ']') // there is a matching ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001698 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001699 int startc = -1; // > 0 when next '-' is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001700 int endc;
1701
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001702 // In a character class, different parsing rules apply.
1703 // Not even \ is special anymore, nothing is.
1704 if (*regparse == '^') // Complement of range.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001705 {
1706 ret = regnode(ANYBUT + extra);
1707 regparse++;
1708 }
1709 else
1710 ret = regnode(ANYOF + extra);
1711
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001712 // At the start ']' and '-' mean the literal character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001713 if (*regparse == ']' || *regparse == '-')
1714 {
1715 startc = *regparse;
1716 regc(*regparse++);
1717 }
1718
1719 while (*regparse != NUL && *regparse != ']')
1720 {
1721 if (*regparse == '-')
1722 {
1723 ++regparse;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001724 // The '-' is not used for a range at the end and
1725 // after or before a '\n'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001726 if (*regparse == ']' || *regparse == NUL
1727 || startc == -1
1728 || (regparse[0] == '\\' && regparse[1] == 'n'))
1729 {
1730 regc('-');
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001731 startc = '-'; // [--x] is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001732 }
1733 else
1734 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001735 // Also accept "a-[.z.]"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001736 endc = 0;
1737 if (*regparse == '[')
1738 endc = get_coll_element(&regparse);
1739 if (endc == 0)
1740 {
1741 if (has_mbyte)
1742 endc = mb_ptr2char_adv(&regparse);
1743 else
1744 endc = *regparse++;
1745 }
1746
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001747 // Handle \o40, \x20 and \u20AC style sequences
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001748 if (endc == '\\' && !reg_cpo_lit && !reg_cpo_bsl)
1749 endc = coll_get_char();
1750
1751 if (startc > endc)
1752 EMSG_RET_NULL(_(e_reverse_range));
1753 if (has_mbyte && ((*mb_char2len)(startc) > 1
1754 || (*mb_char2len)(endc) > 1))
1755 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001756 // Limit to a range of 256 chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001757 if (endc > startc + 256)
1758 EMSG_RET_NULL(_(e_large_class));
1759 while (++startc <= endc)
1760 regmbc(startc);
1761 }
1762 else
1763 {
1764#ifdef EBCDIC
1765 int alpha_only = FALSE;
1766
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001767 // for alphabetical range skip the gaps
1768 // 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001769 if (isalpha(startc) && isalpha(endc))
1770 alpha_only = TRUE;
1771#endif
1772 while (++startc <= endc)
1773#ifdef EBCDIC
1774 if (!alpha_only || isalpha(startc))
1775#endif
1776 regc(startc);
1777 }
1778 startc = -1;
1779 }
1780 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001781 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1782 // accepts "\t", "\e", etc., but only when the 'l' flag in
1783 // 'cpoptions' is not included.
1784 // Posix doesn't recognize backslash at all.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001785 else if (*regparse == '\\'
1786 && !reg_cpo_bsl
1787 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1788 || (!reg_cpo_lit
1789 && vim_strchr(REGEXP_ABBR,
1790 regparse[1]) != NULL)))
1791 {
1792 regparse++;
1793 if (*regparse == 'n')
1794 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001795 // '\n' in range: also match NL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001796 if (ret != JUST_CALC_SIZE)
1797 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001798 // Using \n inside [^] does not change what
1799 // matches. "[^\n]" is the same as ".".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001800 if (*ret == ANYOF)
1801 {
1802 *ret = ANYOF + ADD_NL;
1803 *flagp |= HASNL;
1804 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001805 // else: must have had a \n already
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001806 }
1807 regparse++;
1808 startc = -1;
1809 }
1810 else if (*regparse == 'd'
1811 || *regparse == 'o'
1812 || *regparse == 'x'
1813 || *regparse == 'u'
1814 || *regparse == 'U')
1815 {
1816 startc = coll_get_char();
1817 if (startc == 0)
1818 regc(0x0a);
1819 else
1820 regmbc(startc);
1821 }
1822 else
1823 {
1824 startc = backslash_trans(*regparse++);
1825 regc(startc);
1826 }
1827 }
1828 else if (*regparse == '[')
1829 {
1830 int c_class;
1831 int cu;
1832
1833 c_class = get_char_class(&regparse);
1834 startc = -1;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001835 // Characters assumed to be 8 bits!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001836 switch (c_class)
1837 {
1838 case CLASS_NONE:
1839 c_class = get_equi_class(&regparse);
1840 if (c_class != 0)
1841 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001842 // produce equivalence class
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001843 reg_equi_class(c_class);
1844 }
1845 else if ((c_class =
1846 get_coll_element(&regparse)) != 0)
1847 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001848 // produce a collating element
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001849 regmbc(c_class);
1850 }
1851 else
1852 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001853 // literal '[', allow [[-x] as a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001854 startc = *regparse++;
1855 regc(startc);
1856 }
1857 break;
1858 case CLASS_ALNUM:
1859 for (cu = 1; cu < 128; cu++)
1860 if (isalnum(cu))
1861 regmbc(cu);
1862 break;
1863 case CLASS_ALPHA:
1864 for (cu = 1; cu < 128; cu++)
1865 if (isalpha(cu))
1866 regmbc(cu);
1867 break;
1868 case CLASS_BLANK:
1869 regc(' ');
1870 regc('\t');
1871 break;
1872 case CLASS_CNTRL:
1873 for (cu = 1; cu <= 127; cu++)
1874 if (iscntrl(cu))
1875 regmbc(cu);
1876 break;
1877 case CLASS_DIGIT:
1878 for (cu = 1; cu <= 127; cu++)
1879 if (VIM_ISDIGIT(cu))
1880 regmbc(cu);
1881 break;
1882 case CLASS_GRAPH:
1883 for (cu = 1; cu <= 127; cu++)
1884 if (isgraph(cu))
1885 regmbc(cu);
1886 break;
1887 case CLASS_LOWER:
1888 for (cu = 1; cu <= 255; cu++)
1889 if (MB_ISLOWER(cu) && cu != 170
1890 && cu != 186)
1891 regmbc(cu);
1892 break;
1893 case CLASS_PRINT:
1894 for (cu = 1; cu <= 255; cu++)
1895 if (vim_isprintc(cu))
1896 regmbc(cu);
1897 break;
1898 case CLASS_PUNCT:
1899 for (cu = 1; cu < 128; cu++)
1900 if (ispunct(cu))
1901 regmbc(cu);
1902 break;
1903 case CLASS_SPACE:
1904 for (cu = 9; cu <= 13; cu++)
1905 regc(cu);
1906 regc(' ');
1907 break;
1908 case CLASS_UPPER:
1909 for (cu = 1; cu <= 255; cu++)
1910 if (MB_ISUPPER(cu))
1911 regmbc(cu);
1912 break;
1913 case CLASS_XDIGIT:
1914 for (cu = 1; cu <= 255; cu++)
1915 if (vim_isxdigit(cu))
1916 regmbc(cu);
1917 break;
1918 case CLASS_TAB:
1919 regc('\t');
1920 break;
1921 case CLASS_RETURN:
1922 regc('\r');
1923 break;
1924 case CLASS_BACKSPACE:
1925 regc('\b');
1926 break;
1927 case CLASS_ESCAPE:
1928 regc('\033');
1929 break;
1930 case CLASS_IDENT:
1931 for (cu = 1; cu <= 255; cu++)
1932 if (vim_isIDc(cu))
1933 regmbc(cu);
1934 break;
1935 case CLASS_KEYWORD:
1936 for (cu = 1; cu <= 255; cu++)
1937 if (reg_iswordc(cu))
1938 regmbc(cu);
1939 break;
1940 case CLASS_FNAME:
1941 for (cu = 1; cu <= 255; cu++)
1942 if (vim_isfilec(cu))
1943 regmbc(cu);
1944 break;
1945 }
1946 }
1947 else
1948 {
1949 if (has_mbyte)
1950 {
1951 int len;
1952
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001953 // produce a multibyte character, including any
1954 // following composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001955 startc = mb_ptr2char(regparse);
1956 len = (*mb_ptr2len)(regparse);
1957 if (enc_utf8 && utf_char2len(startc) != len)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001958 startc = -1; // composing chars
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001959 while (--len >= 0)
1960 regc(*regparse++);
1961 }
1962 else
1963 {
1964 startc = *regparse++;
1965 regc(startc);
1966 }
1967 }
1968 }
1969 regc(NUL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001970 prevchr_len = 1; // last char was the ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001971 if (*regparse != ']')
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001972 EMSG_RET_NULL(_(e_toomsbra)); // Cannot happen?
1973 skipchr(); // let's be friends with the lexer again
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001974 *flagp |= HASWIDTH | SIMPLE;
1975 break;
1976 }
1977 else if (reg_strict)
1978 EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF);
1979 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001980 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001981
1982 default:
1983 {
1984 int len;
1985
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001986 // A multi-byte character is handled as a separate atom if it's
1987 // before a multi and when it's a composing char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001988 if (use_multibytecode(c))
1989 {
1990do_multibyte:
1991 ret = regnode(MULTIBYTECODE);
1992 regmbc(c);
1993 *flagp |= HASWIDTH | SIMPLE;
1994 break;
1995 }
1996
1997 ret = regnode(EXACTLY);
1998
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001999 // Append characters as long as:
2000 // - there is no following multi, we then need the character in
2001 // front of it as a single character operand
2002 // - not running into a Magic character
2003 // - "one_exactly" is not set
2004 // But always emit at least one character. Might be a Multi,
2005 // e.g., a "[" without matching "]".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002006 for (len = 0; c != NUL && (len == 0
2007 || (re_multi_type(peekchr()) == NOT_MULTI
2008 && !one_exactly
2009 && !is_Magic(c))); ++len)
2010 {
2011 c = no_Magic(c);
2012 if (has_mbyte)
2013 {
2014 regmbc(c);
2015 if (enc_utf8)
2016 {
2017 int l;
2018
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002019 // Need to get composing character too.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002020 for (;;)
2021 {
2022 l = utf_ptr2len(regparse);
2023 if (!UTF_COMPOSINGLIKE(regparse, regparse + l))
2024 break;
2025 regmbc(utf_ptr2char(regparse));
2026 skipchr();
2027 }
2028 }
2029 }
2030 else
2031 regc(c);
2032 c = getchr();
2033 }
2034 ungetchr();
2035
2036 regc(NUL);
2037 *flagp |= HASWIDTH;
2038 if (len == 1)
2039 *flagp |= SIMPLE;
2040 }
2041 break;
2042 }
2043
2044 return ret;
2045}
2046
2047/*
2048 * Parse something followed by possible [*+=].
2049 *
2050 * Note that the branching code sequences used for = and the general cases
2051 * of * and + are somewhat optimized: they use the same NOTHING node as
2052 * both the endmarker for their branch list and the body of the last branch.
2053 * It might seem that this node could be dispensed with entirely, but the
2054 * endmarker role is not redundant.
2055 */
2056 static char_u *
2057regpiece(int *flagp)
2058{
2059 char_u *ret;
2060 int op;
2061 char_u *next;
2062 int flags;
2063 long minval;
2064 long maxval;
2065
2066 ret = regatom(&flags);
2067 if (ret == NULL)
2068 return NULL;
2069
2070 op = peekchr();
2071 if (re_multi_type(op) == NOT_MULTI)
2072 {
2073 *flagp = flags;
2074 return ret;
2075 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002076 // default flags
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002077 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
2078
2079 skipchr();
2080 switch (op)
2081 {
2082 case Magic('*'):
2083 if (flags & SIMPLE)
2084 reginsert(STAR, ret);
2085 else
2086 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002087 // Emit x* as (x&|), where & means "self".
2088 reginsert(BRANCH, ret); // Either x
2089 regoptail(ret, regnode(BACK)); // and loop
2090 regoptail(ret, ret); // back
2091 regtail(ret, regnode(BRANCH)); // or
2092 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002093 }
2094 break;
2095
2096 case Magic('+'):
2097 if (flags & SIMPLE)
2098 reginsert(PLUS, ret);
2099 else
2100 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002101 // Emit x+ as x(&|), where & means "self".
2102 next = regnode(BRANCH); // Either
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002103 regtail(ret, next);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002104 regtail(regnode(BACK), ret); // loop back
2105 regtail(next, regnode(BRANCH)); // or
2106 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002107 }
2108 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2109 break;
2110
2111 case Magic('@'):
2112 {
2113 int lop = END;
2114 long nr;
2115
2116 nr = getdecchrs();
2117 switch (no_Magic(getchr()))
2118 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002119 case '=': lop = MATCH; break; // \@=
2120 case '!': lop = NOMATCH; break; // \@!
2121 case '>': lop = SUBPAT; break; // \@>
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002122 case '<': switch (no_Magic(getchr()))
2123 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002124 case '=': lop = BEHIND; break; // \@<=
2125 case '!': lop = NOBEHIND; break; // \@<!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002126 }
2127 }
2128 if (lop == END)
2129 EMSG2_RET_NULL(_("E59: invalid character after %s@"),
2130 reg_magic == MAGIC_ALL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002131 // Look behind must match with behind_pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002132 if (lop == BEHIND || lop == NOBEHIND)
2133 {
2134 regtail(ret, regnode(BHPOS));
2135 *flagp |= HASLOOKBH;
2136 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002137 regtail(ret, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002138 if (lop == BEHIND || lop == NOBEHIND)
2139 {
2140 if (nr < 0)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002141 nr = 0; // no limit is same as zero limit
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002142 reginsert_nr(lop, nr, ret);
2143 }
2144 else
2145 reginsert(lop, ret);
2146 break;
2147 }
2148
2149 case Magic('?'):
2150 case Magic('='):
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002151 // Emit x= as (x|)
2152 reginsert(BRANCH, ret); // Either x
2153 regtail(ret, regnode(BRANCH)); // or
2154 next = regnode(NOTHING); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002155 regtail(ret, next);
2156 regoptail(ret, next);
2157 break;
2158
2159 case Magic('{'):
2160 if (!read_limits(&minval, &maxval))
2161 return NULL;
2162 if (flags & SIMPLE)
2163 {
2164 reginsert(BRACE_SIMPLE, ret);
2165 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2166 }
2167 else
2168 {
2169 if (num_complex_braces >= 10)
2170 EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"),
2171 reg_magic == MAGIC_ALL);
2172 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
2173 regoptail(ret, regnode(BACK));
2174 regoptail(ret, ret);
2175 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2176 ++num_complex_braces;
2177 }
2178 if (minval > 0 && maxval > 0)
2179 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2180 break;
2181 }
2182 if (re_multi_type(peekchr()) != NOT_MULTI)
2183 {
2184 // Can't have a multi follow a multi.
2185 if (peekchr() == Magic('*'))
2186 EMSG2_RET_NULL(_("E61: Nested %s*"), reg_magic >= MAGIC_ON);
2187 EMSG3_RET_NULL(_("E62: Nested %s%c"), reg_magic == MAGIC_ALL,
2188 no_Magic(peekchr()));
2189 }
2190
2191 return ret;
2192}
2193
2194/*
2195 * Parse one alternative of an | or & operator.
2196 * Implements the concatenation operator.
2197 */
2198 static char_u *
2199regconcat(int *flagp)
2200{
2201 char_u *first = NULL;
2202 char_u *chain = NULL;
2203 char_u *latest;
2204 int flags;
2205 int cont = TRUE;
2206
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002207 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002208
2209 while (cont)
2210 {
2211 switch (peekchr())
2212 {
2213 case NUL:
2214 case Magic('|'):
2215 case Magic('&'):
2216 case Magic(')'):
2217 cont = FALSE;
2218 break;
2219 case Magic('Z'):
2220 regflags |= RF_ICOMBINE;
2221 skipchr_keepstart();
2222 break;
2223 case Magic('c'):
2224 regflags |= RF_ICASE;
2225 skipchr_keepstart();
2226 break;
2227 case Magic('C'):
2228 regflags |= RF_NOICASE;
2229 skipchr_keepstart();
2230 break;
2231 case Magic('v'):
2232 reg_magic = MAGIC_ALL;
2233 skipchr_keepstart();
2234 curchr = -1;
2235 break;
2236 case Magic('m'):
2237 reg_magic = MAGIC_ON;
2238 skipchr_keepstart();
2239 curchr = -1;
2240 break;
2241 case Magic('M'):
2242 reg_magic = MAGIC_OFF;
2243 skipchr_keepstart();
2244 curchr = -1;
2245 break;
2246 case Magic('V'):
2247 reg_magic = MAGIC_NONE;
2248 skipchr_keepstart();
2249 curchr = -1;
2250 break;
2251 default:
2252 latest = regpiece(&flags);
2253 if (latest == NULL || reg_toolong)
2254 return NULL;
2255 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002256 if (chain == NULL) // First piece.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002257 *flagp |= flags & SPSTART;
2258 else
2259 regtail(chain, latest);
2260 chain = latest;
2261 if (first == NULL)
2262 first = latest;
2263 break;
2264 }
2265 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002266 if (first == NULL) // Loop ran zero times.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002267 first = regnode(NOTHING);
2268 return first;
2269}
2270
2271/*
2272 * Parse one alternative of an | operator.
2273 * Implements the & operator.
2274 */
2275 static char_u *
2276regbranch(int *flagp)
2277{
2278 char_u *ret;
2279 char_u *chain = NULL;
2280 char_u *latest;
2281 int flags;
2282
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002283 *flagp = WORST | HASNL; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002284
2285 ret = regnode(BRANCH);
2286 for (;;)
2287 {
2288 latest = regconcat(&flags);
2289 if (latest == NULL)
2290 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002291 // If one of the branches has width, the whole thing has. If one of
2292 // the branches anchors at start-of-line, the whole thing does.
2293 // If one of the branches uses look-behind, the whole thing does.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002294 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002295 // If one of the branches doesn't match a line-break, the whole thing
2296 // doesn't.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002297 *flagp &= ~HASNL | (flags & HASNL);
2298 if (chain != NULL)
2299 regtail(chain, latest);
2300 if (peekchr() != Magic('&'))
2301 break;
2302 skipchr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002303 regtail(latest, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002304 if (reg_toolong)
2305 break;
2306 reginsert(MATCH, latest);
2307 chain = latest;
2308 }
2309
2310 return ret;
2311}
2312
2313/*
2314 * Parse regular expression, i.e. main body or parenthesized thing.
2315 *
2316 * Caller must absorb opening parenthesis.
2317 *
2318 * Combining parenthesis handling with the base level of regular expression
2319 * is a trifle forced, but the need to tie the tails of the branches to what
2320 * follows makes it hard to avoid.
2321 */
2322 static char_u *
2323reg(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002324 int paren, // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002325 int *flagp)
2326{
2327 char_u *ret;
2328 char_u *br;
2329 char_u *ender;
2330 int parno = 0;
2331 int flags;
2332
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002333 *flagp = HASWIDTH; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002334
2335#ifdef FEAT_SYN_HL
2336 if (paren == REG_ZPAREN)
2337 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002338 // Make a ZOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002339 if (regnzpar >= NSUBEXP)
2340 EMSG_RET_NULL(_("E50: Too many \\z("));
2341 parno = regnzpar;
2342 regnzpar++;
2343 ret = regnode(ZOPEN + parno);
2344 }
2345 else
2346#endif
2347 if (paren == REG_PAREN)
2348 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002349 // Make a MOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002350 if (regnpar >= NSUBEXP)
2351 EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
2352 parno = regnpar;
2353 ++regnpar;
2354 ret = regnode(MOPEN + parno);
2355 }
2356 else if (paren == REG_NPAREN)
2357 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002358 // Make a NOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002359 ret = regnode(NOPEN);
2360 }
2361 else
2362 ret = NULL;
2363
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002364 // Pick up the branches, linking them together.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002365 br = regbranch(&flags);
2366 if (br == NULL)
2367 return NULL;
2368 if (ret != NULL)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002369 regtail(ret, br); // [MZ]OPEN -> first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002370 else
2371 ret = br;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002372 // If one of the branches can be zero-width, the whole thing can.
2373 // If one of the branches has * at start or matches a line-break, the
2374 // whole thing can.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002375 if (!(flags & HASWIDTH))
2376 *flagp &= ~HASWIDTH;
2377 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2378 while (peekchr() == Magic('|'))
2379 {
2380 skipchr();
2381 br = regbranch(&flags);
2382 if (br == NULL || reg_toolong)
2383 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002384 regtail(ret, br); // BRANCH -> BRANCH.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002385 if (!(flags & HASWIDTH))
2386 *flagp &= ~HASWIDTH;
2387 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2388 }
2389
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002390 // Make a closing node, and hook it on the end.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002391 ender = regnode(
2392#ifdef FEAT_SYN_HL
2393 paren == REG_ZPAREN ? ZCLOSE + parno :
2394#endif
2395 paren == REG_PAREN ? MCLOSE + parno :
2396 paren == REG_NPAREN ? NCLOSE : END);
2397 regtail(ret, ender);
2398
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002399 // Hook the tails of the branches to the closing node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002400 for (br = ret; br != NULL; br = regnext(br))
2401 regoptail(br, ender);
2402
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002403 // Check for proper termination.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002404 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2405 {
2406#ifdef FEAT_SYN_HL
2407 if (paren == REG_ZPAREN)
2408 EMSG_RET_NULL(_("E52: Unmatched \\z("));
2409 else
2410#endif
2411 if (paren == REG_NPAREN)
2412 EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
2413 else
2414 EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
2415 }
2416 else if (paren == REG_NOPAREN && peekchr() != NUL)
2417 {
2418 if (curchr == Magic(')'))
2419 EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
2420 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002421 EMSG_RET_NULL(_(e_trailing)); // "Can't happen".
2422 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002423 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002424 // Here we set the flag allowing back references to this set of
2425 // parentheses.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002426 if (paren == REG_PAREN)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002427 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002428 return ret;
2429}
2430
2431/*
2432 * bt_regcomp() - compile a regular expression into internal code for the
2433 * traditional back track matcher.
2434 * Returns the program in allocated space. Returns NULL for an error.
2435 *
2436 * We can't allocate space until we know how big the compiled form will be,
2437 * but we can't compile it (and thus know how big it is) until we've got a
2438 * place to put the code. So we cheat: we compile it twice, once with code
2439 * generation turned off and size counting turned on, and once "for real".
2440 * This also means that we don't allocate space until we are sure that the
2441 * thing really will compile successfully, and we never have to move the
2442 * code and thus invalidate pointers into it. (Note that it has to be in
2443 * one piece because vim_free() must be able to free it all.)
2444 *
2445 * Whether upper/lower case is to be ignored is decided when executing the
2446 * program, it does not matter here.
2447 *
2448 * Beware that the optimization-preparation code in here knows about some
2449 * of the structure of the compiled regexp.
2450 * "re_flags": RE_MAGIC and/or RE_STRING.
2451 */
2452 static regprog_T *
2453bt_regcomp(char_u *expr, int re_flags)
2454{
2455 bt_regprog_T *r;
2456 char_u *scan;
2457 char_u *longest;
2458 int len;
2459 int flags;
2460
2461 if (expr == NULL)
Bram Moolenaare83cca22020-09-07 18:53:21 +02002462 IEMSG_RET_NULL(_(e_null));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002463
2464 init_class_tab();
2465
2466 // First pass: determine size, legality.
2467 regcomp_start(expr, re_flags);
2468 regcode = JUST_CALC_SIZE;
2469 regc(REGMAGIC);
2470 if (reg(REG_NOPAREN, &flags) == NULL)
2471 return NULL;
2472
2473 // Allocate space.
2474 r = alloc(offsetof(bt_regprog_T, program) + regsize);
2475 if (r == NULL)
2476 return NULL;
2477 r->re_in_use = FALSE;
2478
2479 // Second pass: emit code.
2480 regcomp_start(expr, re_flags);
2481 regcode = r->program;
2482 regc(REGMAGIC);
2483 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
2484 {
2485 vim_free(r);
2486 if (reg_toolong)
2487 EMSG_RET_NULL(_("E339: Pattern too long"));
2488 return NULL;
2489 }
2490
2491 // Dig out information for optimizations.
2492 r->regstart = NUL; // Worst-case defaults.
2493 r->reganch = 0;
2494 r->regmust = NULL;
2495 r->regmlen = 0;
2496 r->regflags = regflags;
2497 if (flags & HASNL)
2498 r->regflags |= RF_HASNL;
2499 if (flags & HASLOOKBH)
2500 r->regflags |= RF_LOOKBH;
2501#ifdef FEAT_SYN_HL
2502 // Remember whether this pattern has any \z specials in it.
2503 r->reghasz = re_has_z;
2504#endif
2505 scan = r->program + 1; // First BRANCH.
2506 if (OP(regnext(scan)) == END) // Only one top-level choice.
2507 {
2508 scan = OPERAND(scan);
2509
2510 // Starting-point info.
2511 if (OP(scan) == BOL || OP(scan) == RE_BOF)
2512 {
2513 r->reganch++;
2514 scan = regnext(scan);
2515 }
2516
2517 if (OP(scan) == EXACTLY)
2518 {
2519 if (has_mbyte)
2520 r->regstart = (*mb_ptr2char)(OPERAND(scan));
2521 else
2522 r->regstart = *OPERAND(scan);
2523 }
2524 else if ((OP(scan) == BOW
2525 || OP(scan) == EOW
2526 || OP(scan) == NOTHING
2527 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
2528 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
2529 && OP(regnext(scan)) == EXACTLY)
2530 {
2531 if (has_mbyte)
2532 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
2533 else
2534 r->regstart = *OPERAND(regnext(scan));
2535 }
2536
2537 // If there's something expensive in the r.e., find the longest
2538 // literal string that must appear and make it the regmust. Resolve
2539 // ties in favor of later strings, since the regstart check works
2540 // with the beginning of the r.e. and avoiding duplication
2541 // strengthens checking. Not a strong reason, but sufficient in the
2542 // absence of others.
2543
2544 // When the r.e. starts with BOW, it is faster to look for a regmust
2545 // first. Used a lot for "#" and "*" commands. (Added by mool).
2546 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
2547 && !(flags & HASNL))
2548 {
2549 longest = NULL;
2550 len = 0;
2551 for (; scan != NULL; scan = regnext(scan))
2552 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
2553 {
2554 longest = OPERAND(scan);
2555 len = (int)STRLEN(OPERAND(scan));
2556 }
2557 r->regmust = longest;
2558 r->regmlen = len;
2559 }
2560 }
2561#ifdef BT_REGEXP_DUMP
2562 regdump(expr, r);
2563#endif
2564 r->engine = &bt_regengine;
2565 return (regprog_T *)r;
2566}
2567
2568#if defined(FEAT_SYN_HL) || defined(PROTO)
2569/*
2570 * Check if during the previous call to vim_regcomp the EOL item "$" has been
2571 * found. This is messy, but it works fine.
2572 */
2573 int
2574vim_regcomp_had_eol(void)
2575{
2576 return had_eol;
2577}
2578#endif
2579
2580/*
2581 * Get a number after a backslash that is inside [].
2582 * When nothing is recognized return a backslash.
2583 */
2584 static int
2585coll_get_char(void)
2586{
2587 long nr = -1;
2588
2589 switch (*regparse++)
2590 {
2591 case 'd': nr = getdecchrs(); break;
2592 case 'o': nr = getoctchrs(); break;
2593 case 'x': nr = gethexchrs(2); break;
2594 case 'u': nr = gethexchrs(4); break;
2595 case 'U': nr = gethexchrs(8); break;
2596 }
2597 if (nr < 0 || nr > INT_MAX)
2598 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002599 // If getting the number fails be backwards compatible: the character
2600 // is a backslash.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002601 --regparse;
2602 nr = '\\';
2603 }
2604 return nr;
2605}
2606
2607/*
2608 * Free a compiled regexp program, returned by bt_regcomp().
2609 */
2610 static void
2611bt_regfree(regprog_T *prog)
2612{
2613 vim_free(prog);
2614}
2615
// Advance rex.input with MB_PTR_ADV() (presumably by one character, which
// may be multi-byte - see the definition of MB_PTR_ADV).
#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)

/*
 * Storage for the arguments of BRACE_LIMITS.
 * These logically belong inside regmatch().  They are kept at file level to
 * reduce the stack space regmatch() needs, since it can be called
 * recursively many times.
 */
static long     bl_minval;
static long     bl_maxval;
2625
2626/*
2627 * Save the input line and position in a regsave_T.
2628 */
2629 static void
2630reg_save(regsave_T *save, garray_T *gap)
2631{
2632 if (REG_MULTI)
2633 {
2634 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
2635 save->rs_u.pos.lnum = rex.lnum;
2636 }
2637 else
2638 save->rs_u.ptr = rex.input;
2639 save->rs_len = gap->ga_len;
2640}
2641
2642/*
2643 * Restore the input line and position from a regsave_T.
2644 */
2645 static void
2646reg_restore(regsave_T *save, garray_T *gap)
2647{
2648 if (REG_MULTI)
2649 {
2650 if (rex.lnum != save->rs_u.pos.lnum)
2651 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002652 // only call reg_getline() when the line number changed to save
2653 // a bit of time
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002654 rex.lnum = save->rs_u.pos.lnum;
2655 rex.line = reg_getline(rex.lnum);
2656 }
2657 rex.input = rex.line + save->rs_u.pos.col;
2658 }
2659 else
2660 rex.input = save->rs_u.ptr;
2661 gap->ga_len = save->rs_len;
2662}
2663
2664/*
2665 * Return TRUE if current position is equal to saved position.
2666 */
2667 static int
2668reg_save_equal(regsave_T *save)
2669{
2670 if (REG_MULTI)
2671 return rex.lnum == save->rs_u.pos.lnum
2672 && rex.input == rex.line + save->rs_u.pos.col;
2673 return rex.input == save->rs_u.ptr;
2674}
2675
// Save a sub-expression boundary before attempting a match: uses the
// position "posp" when REG_MULTI is set, the pointer "pp" otherwise.
#define save_se(savep, posp, pp) \
    (REG_MULTI \
	? save_se_multi((savep), (posp)) \
	: save_se_one((savep), (pp)))
2679
// After a failed match restore the sub-expressions: copies the saved
// position back to "*posp" when REG_MULTI is set, otherwise the saved
// pointer back to "*pp".
// Wrapped in do { } while (0) so that the macro behaves like a single
// statement, also directly in front of an "else".  It must be followed by a
// semicolon.
#define restore_se(savep, posp, pp) \
    do { \
	if (REG_MULTI) \
	    *(posp) = (savep)->se_u.pos; \
	else \
	    *(pp) = (savep)->se_u.ptr; \
    } while (0)
2686
2687/*
2688 * Tentatively set the sub-expression start to the current position (after
2689 * calling regmatch() they will have changed). Need to save the existing
2690 * values for when there is no match.
2691 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
2692 * depending on REG_MULTI.
2693 */
2694 static void
2695save_se_multi(save_se_T *savep, lpos_T *posp)
2696{
2697 savep->se_u.pos = *posp;
2698 posp->lnum = rex.lnum;
2699 posp->col = (colnr_T)(rex.input - rex.line);
2700}
2701
2702 static void
2703save_se_one(save_se_T *savep, char_u **pp)
2704{
2705 savep->se_u.ptr = *pp;
2706 *pp = rex.input;
2707}
2708
2709/*
2710 * regrepeat - repeatedly match something simple, return how many.
2711 * Advances rex.input (and rex.lnum) to just after the matched chars.
2712 */
2713 static int
2714regrepeat(
2715 char_u *p,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002716 long maxcount) // maximum number of matches allowed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002717{
2718 long count = 0;
2719 char_u *scan;
2720 char_u *opnd;
2721 int mask;
2722 int testval = 0;
2723
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002724 scan = rex.input; // Make local copy of rex.input for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002725 opnd = OPERAND(p);
2726 switch (OP(p))
2727 {
2728 case ANY:
2729 case ANY + ADD_NL:
2730 while (count < maxcount)
2731 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002732 // Matching anything means we continue until end-of-line (or
2733 // end-of-file for ANY + ADD_NL), only limited by maxcount.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002734 while (*scan != NUL && count < maxcount)
2735 {
2736 ++count;
2737 MB_PTR_ADV(scan);
2738 }
2739 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2740 || rex.reg_line_lbr || count == maxcount)
2741 break;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002742 ++count; // count the line-break
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002743 reg_nextline();
2744 scan = rex.input;
2745 if (got_int)
2746 break;
2747 }
2748 break;
2749
2750 case IDENT:
2751 case IDENT + ADD_NL:
2752 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002753 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002754 case SIDENT:
2755 case SIDENT + ADD_NL:
2756 while (count < maxcount)
2757 {
2758 if (vim_isIDc(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2759 {
2760 MB_PTR_ADV(scan);
2761 }
2762 else if (*scan == NUL)
2763 {
2764 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2765 || rex.reg_line_lbr)
2766 break;
2767 reg_nextline();
2768 scan = rex.input;
2769 if (got_int)
2770 break;
2771 }
2772 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2773 ++scan;
2774 else
2775 break;
2776 ++count;
2777 }
2778 break;
2779
2780 case KWORD:
2781 case KWORD + ADD_NL:
2782 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002783 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002784 case SKWORD:
2785 case SKWORD + ADD_NL:
2786 while (count < maxcount)
2787 {
2788 if (vim_iswordp_buf(scan, rex.reg_buf)
2789 && (testval || !VIM_ISDIGIT(*scan)))
2790 {
2791 MB_PTR_ADV(scan);
2792 }
2793 else if (*scan == NUL)
2794 {
2795 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2796 || rex.reg_line_lbr)
2797 break;
2798 reg_nextline();
2799 scan = rex.input;
2800 if (got_int)
2801 break;
2802 }
2803 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2804 ++scan;
2805 else
2806 break;
2807 ++count;
2808 }
2809 break;
2810
2811 case FNAME:
2812 case FNAME + ADD_NL:
2813 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002814 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002815 case SFNAME:
2816 case SFNAME + ADD_NL:
2817 while (count < maxcount)
2818 {
2819 if (vim_isfilec(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2820 {
2821 MB_PTR_ADV(scan);
2822 }
2823 else if (*scan == NUL)
2824 {
2825 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2826 || rex.reg_line_lbr)
2827 break;
2828 reg_nextline();
2829 scan = rex.input;
2830 if (got_int)
2831 break;
2832 }
2833 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2834 ++scan;
2835 else
2836 break;
2837 ++count;
2838 }
2839 break;
2840
2841 case PRINT:
2842 case PRINT + ADD_NL:
2843 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002844 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002845 case SPRINT:
2846 case SPRINT + ADD_NL:
2847 while (count < maxcount)
2848 {
2849 if (*scan == NUL)
2850 {
2851 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2852 || rex.reg_line_lbr)
2853 break;
2854 reg_nextline();
2855 scan = rex.input;
2856 if (got_int)
2857 break;
2858 }
2859 else if (vim_isprintc(PTR2CHAR(scan)) == 1
2860 && (testval || !VIM_ISDIGIT(*scan)))
2861 {
2862 MB_PTR_ADV(scan);
2863 }
2864 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2865 ++scan;
2866 else
2867 break;
2868 ++count;
2869 }
2870 break;
2871
2872 case WHITE:
2873 case WHITE + ADD_NL:
2874 testval = mask = RI_WHITE;
2875do_class:
2876 while (count < maxcount)
2877 {
2878 int l;
2879
2880 if (*scan == NUL)
2881 {
2882 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2883 || rex.reg_line_lbr)
2884 break;
2885 reg_nextline();
2886 scan = rex.input;
2887 if (got_int)
2888 break;
2889 }
2890 else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1)
2891 {
2892 if (testval != 0)
2893 break;
2894 scan += l;
2895 }
2896 else if ((class_tab[*scan] & mask) == testval)
2897 ++scan;
2898 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2899 ++scan;
2900 else
2901 break;
2902 ++count;
2903 }
2904 break;
2905
2906 case NWHITE:
2907 case NWHITE + ADD_NL:
2908 mask = RI_WHITE;
2909 goto do_class;
2910 case DIGIT:
2911 case DIGIT + ADD_NL:
2912 testval = mask = RI_DIGIT;
2913 goto do_class;
2914 case NDIGIT:
2915 case NDIGIT + ADD_NL:
2916 mask = RI_DIGIT;
2917 goto do_class;
2918 case HEX:
2919 case HEX + ADD_NL:
2920 testval = mask = RI_HEX;
2921 goto do_class;
2922 case NHEX:
2923 case NHEX + ADD_NL:
2924 mask = RI_HEX;
2925 goto do_class;
2926 case OCTAL:
2927 case OCTAL + ADD_NL:
2928 testval = mask = RI_OCTAL;
2929 goto do_class;
2930 case NOCTAL:
2931 case NOCTAL + ADD_NL:
2932 mask = RI_OCTAL;
2933 goto do_class;
2934 case WORD:
2935 case WORD + ADD_NL:
2936 testval = mask = RI_WORD;
2937 goto do_class;
2938 case NWORD:
2939 case NWORD + ADD_NL:
2940 mask = RI_WORD;
2941 goto do_class;
2942 case HEAD:
2943 case HEAD + ADD_NL:
2944 testval = mask = RI_HEAD;
2945 goto do_class;
2946 case NHEAD:
2947 case NHEAD + ADD_NL:
2948 mask = RI_HEAD;
2949 goto do_class;
2950 case ALPHA:
2951 case ALPHA + ADD_NL:
2952 testval = mask = RI_ALPHA;
2953 goto do_class;
2954 case NALPHA:
2955 case NALPHA + ADD_NL:
2956 mask = RI_ALPHA;
2957 goto do_class;
2958 case LOWER:
2959 case LOWER + ADD_NL:
2960 testval = mask = RI_LOWER;
2961 goto do_class;
2962 case NLOWER:
2963 case NLOWER + ADD_NL:
2964 mask = RI_LOWER;
2965 goto do_class;
2966 case UPPER:
2967 case UPPER + ADD_NL:
2968 testval = mask = RI_UPPER;
2969 goto do_class;
2970 case NUPPER:
2971 case NUPPER + ADD_NL:
2972 mask = RI_UPPER;
2973 goto do_class;
2974
2975 case EXACTLY:
2976 {
2977 int cu, cl;
2978
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002979 // This doesn't do a multi-byte character, because a MULTIBYTECODE
2980 // would have been used for it. It does handle single-byte
2981 // characters, such as latin1.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002982 if (rex.reg_ic)
2983 {
2984 cu = MB_TOUPPER(*opnd);
2985 cl = MB_TOLOWER(*opnd);
2986 while (count < maxcount && (*scan == cu || *scan == cl))
2987 {
2988 count++;
2989 scan++;
2990 }
2991 }
2992 else
2993 {
2994 cu = *opnd;
2995 while (count < maxcount && *scan == cu)
2996 {
2997 count++;
2998 scan++;
2999 }
3000 }
3001 break;
3002 }
3003
3004 case MULTIBYTECODE:
3005 {
3006 int i, len, cf = 0;
3007
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003008 // Safety check (just in case 'encoding' was changed since
3009 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003010 if ((len = (*mb_ptr2len)(opnd)) > 1)
3011 {
3012 if (rex.reg_ic && enc_utf8)
3013 cf = utf_fold(utf_ptr2char(opnd));
3014 while (count < maxcount && (*mb_ptr2len)(scan) >= len)
3015 {
3016 for (i = 0; i < len; ++i)
3017 if (opnd[i] != scan[i])
3018 break;
3019 if (i < len && (!rex.reg_ic || !enc_utf8
3020 || utf_fold(utf_ptr2char(scan)) != cf))
3021 break;
3022 scan += len;
3023 ++count;
3024 }
3025 }
3026 }
3027 break;
3028
3029 case ANYOF:
3030 case ANYOF + ADD_NL:
3031 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003032 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003033
3034 case ANYBUT:
3035 case ANYBUT + ADD_NL:
3036 while (count < maxcount)
3037 {
3038 int len;
3039
3040 if (*scan == NUL)
3041 {
3042 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
3043 || rex.reg_line_lbr)
3044 break;
3045 reg_nextline();
3046 scan = rex.input;
3047 if (got_int)
3048 break;
3049 }
3050 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
3051 ++scan;
3052 else if (has_mbyte && (len = (*mb_ptr2len)(scan)) > 1)
3053 {
3054 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
3055 break;
3056 scan += len;
3057 }
3058 else
3059 {
3060 if ((cstrchr(opnd, *scan) == NULL) == testval)
3061 break;
3062 ++scan;
3063 }
3064 ++count;
3065 }
3066 break;
3067
3068 case NEWL:
3069 while (count < maxcount
3070 && ((*scan == NUL && rex.lnum <= rex.reg_maxline
3071 && !rex.reg_line_lbr && REG_MULTI)
3072 || (*scan == '\n' && rex.reg_line_lbr)))
3073 {
3074 count++;
3075 if (rex.reg_line_lbr)
3076 ADVANCE_REGINPUT();
3077 else
3078 reg_nextline();
3079 scan = rex.input;
3080 if (got_int)
3081 break;
3082 }
3083 break;
3084
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003085 default: // Oh dear. Called inappropriately.
Bram Moolenaare83cca22020-09-07 18:53:21 +02003086 iemsg(_(e_re_corr));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003087#ifdef DEBUG
3088 printf("Called regrepeat with op code %d\n", OP(p));
3089#endif
3090 break;
3091 }
3092
3093 rex.input = scan;
3094
3095 return (int)count;
3096}
3097
3098/*
3099 * Push an item onto the regstack.
3100 * Returns pointer to new item. Returns NULL when out of memory.
3101 */
3102 static regitem_T *
3103regstack_push(regstate_T state, char_u *scan)
3104{
3105 regitem_T *rp;
3106
3107 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
3108 {
3109 emsg(_(e_maxmempat));
3110 return NULL;
3111 }
3112 if (ga_grow(&regstack, sizeof(regitem_T)) == FAIL)
3113 return NULL;
3114
3115 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
3116 rp->rs_state = state;
3117 rp->rs_scan = scan;
3118
3119 regstack.ga_len += sizeof(regitem_T);
3120 return rp;
3121}
3122
3123/*
3124 * Pop an item from the regstack.
3125 */
3126 static void
3127regstack_pop(char_u **scan)
3128{
3129 regitem_T *rp;
3130
3131 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
3132 *scan = rp->rs_scan;
3133
3134 regstack.ga_len -= sizeof(regitem_T);
3135}
3136
3137/*
3138 * Save the current subexpr to "bp", so that they can be restored
3139 * later by restore_subexpr().
3140 */
3141 static void
3142save_subexpr(regbehind_T *bp)
3143{
3144 int i;
3145
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003146 // When "rex.need_clear_subexpr" is set we don't need to save the values,
3147 // only remember that this flag needs to be set again when restoring.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003148 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
3149 if (!rex.need_clear_subexpr)
3150 {
3151 for (i = 0; i < NSUBEXP; ++i)
3152 {
3153 if (REG_MULTI)
3154 {
3155 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
3156 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
3157 }
3158 else
3159 {
3160 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
3161 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
3162 }
3163 }
3164 }
3165}
3166
3167/*
3168 * Restore the subexpr from "bp".
3169 */
3170 static void
3171restore_subexpr(regbehind_T *bp)
3172{
3173 int i;
3174
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003175 // Only need to restore saved values when they are not to be cleared.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003176 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
3177 if (!rex.need_clear_subexpr)
3178 {
3179 for (i = 0; i < NSUBEXP; ++i)
3180 {
3181 if (REG_MULTI)
3182 {
3183 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
3184 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
3185 }
3186 else
3187 {
3188 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
3189 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
3190 }
3191 }
3192 }
3193}
3194
3195/*
3196 * regmatch - main matching routine
3197 *
3198 * Conceptually the strategy is simple: Check to see whether the current node
3199 * matches, push an item onto the regstack and loop to see whether the rest
3200 * matches, and then act accordingly. In practice we make some effort to
3201 * avoid using the regstack, in particular by going through "ordinary" nodes
3202 * (that don't need to know whether the rest of the match failed) by a nested
3203 * loop.
3204 *
3205 * Returns TRUE when there is a match. Leaves rex.input and rex.lnum just after
3206 * the last matched character.
3207 * Returns FALSE when there is no match. Leaves rex.input and rex.lnum in an
3208 * undefined state!
3209 */
3210 static int
3211regmatch(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003212 char_u *scan, // Current node.
3213 proftime_T *tm UNUSED, // timeout limit or NULL
3214 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003215{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003216 char_u *next; // Next node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003217 int op;
3218 int c;
3219 regitem_T *rp;
3220 int no;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003221 int status; // one of the RA_ values:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003222#ifdef FEAT_RELTIME
3223 int tm_count = 0;
3224#endif
3225
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003226 // Make "regstack" and "backpos" empty. They are allocated and freed in
3227 // bt_regexec_both() to reduce malloc()/free() calls.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003228 regstack.ga_len = 0;
3229 backpos.ga_len = 0;
3230
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003231 // Repeat until "regstack" is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003232 for (;;)
3233 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003234 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3235 // Allow interrupting them with CTRL-C.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003236 fast_breakcheck();
3237
3238#ifdef DEBUG
3239 if (scan != NULL && regnarrate)
3240 {
3241 mch_errmsg((char *)regprop(scan));
3242 mch_errmsg("(\n");
3243 }
3244#endif
3245
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003246 // Repeat for items that can be matched sequentially, without using the
3247 // regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003248 for (;;)
3249 {
3250 if (got_int || scan == NULL)
3251 {
3252 status = RA_FAIL;
3253 break;
3254 }
3255#ifdef FEAT_RELTIME
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003256 // Check for timeout once in a 100 times to avoid overhead.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003257 if (tm != NULL && ++tm_count == 100)
3258 {
3259 tm_count = 0;
3260 if (profile_passed_limit(tm))
3261 {
3262 if (timed_out != NULL)
3263 *timed_out = TRUE;
3264 status = RA_FAIL;
3265 break;
3266 }
3267 }
3268#endif
3269 status = RA_CONT;
3270
3271#ifdef DEBUG
3272 if (regnarrate)
3273 {
3274 mch_errmsg((char *)regprop(scan));
3275 mch_errmsg("...\n");
3276# ifdef FEAT_SYN_HL
3277 if (re_extmatch_in != NULL)
3278 {
3279 int i;
3280
3281 mch_errmsg(_("External submatches:\n"));
3282 for (i = 0; i < NSUBEXP; i++)
3283 {
3284 mch_errmsg(" \"");
3285 if (re_extmatch_in->matches[i] != NULL)
3286 mch_errmsg((char *)re_extmatch_in->matches[i]);
3287 mch_errmsg("\"\n");
3288 }
3289 }
3290# endif
3291 }
3292#endif
3293 next = regnext(scan);
3294
3295 op = OP(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003296 // Check for character class with NL added.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003297 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
3298 && *rex.input == NUL && rex.lnum <= rex.reg_maxline)
3299 {
3300 reg_nextline();
3301 }
3302 else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n')
3303 {
3304 ADVANCE_REGINPUT();
3305 }
3306 else
3307 {
3308 if (WITH_NL(op))
3309 op -= ADD_NL;
3310 if (has_mbyte)
3311 c = (*mb_ptr2char)(rex.input);
3312 else
3313 c = *rex.input;
3314 switch (op)
3315 {
3316 case BOL:
3317 if (rex.input != rex.line)
3318 status = RA_NOMATCH;
3319 break;
3320
3321 case EOL:
3322 if (c != NUL)
3323 status = RA_NOMATCH;
3324 break;
3325
3326 case RE_BOF:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003327 // We're not at the beginning of the file when below the first
3328 // line where we started, not at the start of the line or we
3329 // didn't start at the first line of the buffer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003330 if (rex.lnum != 0 || rex.input != rex.line
3331 || (REG_MULTI && rex.reg_firstlnum > 1))
3332 status = RA_NOMATCH;
3333 break;
3334
3335 case RE_EOF:
3336 if (rex.lnum != rex.reg_maxline || c != NUL)
3337 status = RA_NOMATCH;
3338 break;
3339
3340 case CURSOR:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003341 // Check if the buffer is in a window and compare the
3342 // rex.reg_win->w_cursor position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003343 if (rex.reg_win == NULL
3344 || (rex.lnum + rex.reg_firstlnum
3345 != rex.reg_win->w_cursor.lnum)
3346 || ((colnr_T)(rex.input - rex.line)
3347 != rex.reg_win->w_cursor.col))
3348 status = RA_NOMATCH;
3349 break;
3350
3351 case RE_MARK:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003352 // Compare the mark position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003353 {
3354 int mark = OPERAND(scan)[0];
3355 int cmp = OPERAND(scan)[1];
3356 pos_T *pos;
3357
3358 pos = getmark_buf(rex.reg_buf, mark, FALSE);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003359 if (pos == NULL // mark doesn't exist
Bram Moolenaar872bee52021-05-24 22:56:15 +02003360 || pos->lnum <= 0) // mark isn't set in reg_buf
3361 {
3362 status = RA_NOMATCH;
3363 }
3364 else
3365 {
3366 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3367 && pos->col == MAXCOL
3368 ? (colnr_T)STRLEN(reg_getline(
3369 pos->lnum - rex.reg_firstlnum))
3370 : pos->col;
3371
3372 if ((pos->lnum == rex.lnum + rex.reg_firstlnum
3373 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003374 ? (cmp == '<' || cmp == '>')
Bram Moolenaar872bee52021-05-24 22:56:15 +02003375 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003376 ? cmp != '>'
3377 : cmp != '<'))
3378 : (pos->lnum < rex.lnum + rex.reg_firstlnum
3379 ? cmp != '>'
3380 : cmp != '<')))
3381 status = RA_NOMATCH;
Bram Moolenaar872bee52021-05-24 22:56:15 +02003382 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003383 }
3384 break;
3385
3386 case RE_VISUAL:
3387 if (!reg_match_visual())
3388 status = RA_NOMATCH;
3389 break;
3390
3391 case RE_LNUM:
3392 if (!REG_MULTI || !re_num_cmp((long_u)(rex.lnum + rex.reg_firstlnum),
3393 scan))
3394 status = RA_NOMATCH;
3395 break;
3396
3397 case RE_COL:
3398 if (!re_num_cmp((long_u)(rex.input - rex.line) + 1, scan))
3399 status = RA_NOMATCH;
3400 break;
3401
3402 case RE_VCOL:
3403 if (!re_num_cmp((long_u)win_linetabsize(
3404 rex.reg_win == NULL ? curwin : rex.reg_win,
3405 rex.line, (colnr_T)(rex.input - rex.line)) + 1, scan))
3406 status = RA_NOMATCH;
3407 break;
3408
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003409 case BOW: // \<word; rex.input points to w
3410 if (c == NUL) // Can't match at end of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003411 status = RA_NOMATCH;
3412 else if (has_mbyte)
3413 {
3414 int this_class;
3415
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003416 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003417 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3418 if (this_class <= 1)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003419 status = RA_NOMATCH; // not on a word at all
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003420 else if (reg_prev_class() == this_class)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003421 status = RA_NOMATCH; // previous char is in same word
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003422 }
3423 else
3424 {
3425 if (!vim_iswordc_buf(c, rex.reg_buf) || (rex.input > rex.line
3426 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
3427 status = RA_NOMATCH;
3428 }
3429 break;
3430
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003431 case EOW: // word\>; rex.input points after d
3432 if (rex.input == rex.line) // Can't match at start of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003433 status = RA_NOMATCH;
3434 else if (has_mbyte)
3435 {
3436 int this_class, prev_class;
3437
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003438 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003439 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3440 prev_class = reg_prev_class();
3441 if (this_class == prev_class
3442 || prev_class == 0 || prev_class == 1)
3443 status = RA_NOMATCH;
3444 }
3445 else
3446 {
3447 if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
3448 || (rex.input[0] != NUL
3449 && vim_iswordc_buf(c, rex.reg_buf)))
3450 status = RA_NOMATCH;
3451 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003452 break; // Matched with EOW
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003453
3454 case ANY:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003455 // ANY does not match new lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003456 if (c == NUL)
3457 status = RA_NOMATCH;
3458 else
3459 ADVANCE_REGINPUT();
3460 break;
3461
3462 case IDENT:
3463 if (!vim_isIDc(c))
3464 status = RA_NOMATCH;
3465 else
3466 ADVANCE_REGINPUT();
3467 break;
3468
3469 case SIDENT:
3470 if (VIM_ISDIGIT(*rex.input) || !vim_isIDc(c))
3471 status = RA_NOMATCH;
3472 else
3473 ADVANCE_REGINPUT();
3474 break;
3475
3476 case KWORD:
3477 if (!vim_iswordp_buf(rex.input, rex.reg_buf))
3478 status = RA_NOMATCH;
3479 else
3480 ADVANCE_REGINPUT();
3481 break;
3482
3483 case SKWORD:
3484 if (VIM_ISDIGIT(*rex.input)
3485 || !vim_iswordp_buf(rex.input, rex.reg_buf))
3486 status = RA_NOMATCH;
3487 else
3488 ADVANCE_REGINPUT();
3489 break;
3490
3491 case FNAME:
3492 if (!vim_isfilec(c))
3493 status = RA_NOMATCH;
3494 else
3495 ADVANCE_REGINPUT();
3496 break;
3497
3498 case SFNAME:
3499 if (VIM_ISDIGIT(*rex.input) || !vim_isfilec(c))
3500 status = RA_NOMATCH;
3501 else
3502 ADVANCE_REGINPUT();
3503 break;
3504
3505 case PRINT:
3506 if (!vim_isprintc(PTR2CHAR(rex.input)))
3507 status = RA_NOMATCH;
3508 else
3509 ADVANCE_REGINPUT();
3510 break;
3511
3512 case SPRINT:
3513 if (VIM_ISDIGIT(*rex.input) || !vim_isprintc(PTR2CHAR(rex.input)))
3514 status = RA_NOMATCH;
3515 else
3516 ADVANCE_REGINPUT();
3517 break;
3518
3519 case WHITE:
3520 if (!VIM_ISWHITE(c))
3521 status = RA_NOMATCH;
3522 else
3523 ADVANCE_REGINPUT();
3524 break;
3525
3526 case NWHITE:
3527 if (c == NUL || VIM_ISWHITE(c))
3528 status = RA_NOMATCH;
3529 else
3530 ADVANCE_REGINPUT();
3531 break;
3532
3533 case DIGIT:
3534 if (!ri_digit(c))
3535 status = RA_NOMATCH;
3536 else
3537 ADVANCE_REGINPUT();
3538 break;
3539
3540 case NDIGIT:
3541 if (c == NUL || ri_digit(c))
3542 status = RA_NOMATCH;
3543 else
3544 ADVANCE_REGINPUT();
3545 break;
3546
3547 case HEX:
3548 if (!ri_hex(c))
3549 status = RA_NOMATCH;
3550 else
3551 ADVANCE_REGINPUT();
3552 break;
3553
3554 case NHEX:
3555 if (c == NUL || ri_hex(c))
3556 status = RA_NOMATCH;
3557 else
3558 ADVANCE_REGINPUT();
3559 break;
3560
3561 case OCTAL:
3562 if (!ri_octal(c))
3563 status = RA_NOMATCH;
3564 else
3565 ADVANCE_REGINPUT();
3566 break;
3567
3568 case NOCTAL:
3569 if (c == NUL || ri_octal(c))
3570 status = RA_NOMATCH;
3571 else
3572 ADVANCE_REGINPUT();
3573 break;
3574
3575 case WORD:
3576 if (!ri_word(c))
3577 status = RA_NOMATCH;
3578 else
3579 ADVANCE_REGINPUT();
3580 break;
3581
3582 case NWORD:
3583 if (c == NUL || ri_word(c))
3584 status = RA_NOMATCH;
3585 else
3586 ADVANCE_REGINPUT();
3587 break;
3588
3589 case HEAD:
3590 if (!ri_head(c))
3591 status = RA_NOMATCH;
3592 else
3593 ADVANCE_REGINPUT();
3594 break;
3595
3596 case NHEAD:
3597 if (c == NUL || ri_head(c))
3598 status = RA_NOMATCH;
3599 else
3600 ADVANCE_REGINPUT();
3601 break;
3602
3603 case ALPHA:
3604 if (!ri_alpha(c))
3605 status = RA_NOMATCH;
3606 else
3607 ADVANCE_REGINPUT();
3608 break;
3609
3610 case NALPHA:
3611 if (c == NUL || ri_alpha(c))
3612 status = RA_NOMATCH;
3613 else
3614 ADVANCE_REGINPUT();
3615 break;
3616
3617 case LOWER:
3618 if (!ri_lower(c))
3619 status = RA_NOMATCH;
3620 else
3621 ADVANCE_REGINPUT();
3622 break;
3623
3624 case NLOWER:
3625 if (c == NUL || ri_lower(c))
3626 status = RA_NOMATCH;
3627 else
3628 ADVANCE_REGINPUT();
3629 break;
3630
3631 case UPPER:
3632 if (!ri_upper(c))
3633 status = RA_NOMATCH;
3634 else
3635 ADVANCE_REGINPUT();
3636 break;
3637
3638 case NUPPER:
3639 if (c == NUL || ri_upper(c))
3640 status = RA_NOMATCH;
3641 else
3642 ADVANCE_REGINPUT();
3643 break;
3644
3645 case EXACTLY:
3646 {
3647 int len;
3648 char_u *opnd;
3649
3650 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003651 // Inline the first byte, for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003652 if (*opnd != *rex.input
3653 && (!rex.reg_ic
3654 || (!enc_utf8
3655 && MB_TOLOWER(*opnd) != MB_TOLOWER(*rex.input))))
3656 status = RA_NOMATCH;
3657 else if (*opnd == NUL)
3658 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003659 // match empty string always works; happens when "~" is
3660 // empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003661 }
3662 else
3663 {
3664 if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic))
3665 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003666 len = 1; // matched a single byte above
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003667 }
3668 else
3669 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003670 // Need to match first byte again for multi-byte.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003671 len = (int)STRLEN(opnd);
3672 if (cstrncmp(opnd, rex.input, &len) != 0)
3673 status = RA_NOMATCH;
3674 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003675 // Check for following composing character, unless %C
3676 // follows (skips over all composing chars).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003677 if (status != RA_NOMATCH
3678 && enc_utf8
3679 && UTF_COMPOSINGLIKE(rex.input, rex.input + len)
3680 && !rex.reg_icombine
3681 && OP(next) != RE_COMPOSING)
3682 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003683 // raaron: This code makes a composing character get
3684 // ignored, which is the correct behavior (sometimes)
3685 // for voweled Hebrew texts.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003686 status = RA_NOMATCH;
3687 }
3688 if (status != RA_NOMATCH)
3689 rex.input += len;
3690 }
3691 }
3692 break;
3693
3694 case ANYOF:
3695 case ANYBUT:
3696 if (c == NUL)
3697 status = RA_NOMATCH;
3698 else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3699 status = RA_NOMATCH;
3700 else
3701 ADVANCE_REGINPUT();
3702 break;
3703
3704 case MULTIBYTECODE:
3705 if (has_mbyte)
3706 {
3707 int i, len;
3708 char_u *opnd;
3709 int opndc = 0, inpc;
3710
3711 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003712 // Safety check (just in case 'encoding' was changed since
3713 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003714 if ((len = (*mb_ptr2len)(opnd)) < 2)
3715 {
3716 status = RA_NOMATCH;
3717 break;
3718 }
3719 if (enc_utf8)
3720 opndc = utf_ptr2char(opnd);
3721 if (enc_utf8 && utf_iscomposing(opndc))
3722 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003723 // When only a composing char is given match at any
3724 // position where that composing char appears.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003725 status = RA_NOMATCH;
3726 for (i = 0; rex.input[i] != NUL;
3727 i += utf_ptr2len(rex.input + i))
3728 {
3729 inpc = utf_ptr2char(rex.input + i);
3730 if (!utf_iscomposing(inpc))
3731 {
3732 if (i > 0)
3733 break;
3734 }
3735 else if (opndc == inpc)
3736 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003737 // Include all following composing chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003738 len = i + utfc_ptr2len(rex.input + i);
3739 status = RA_MATCH;
3740 break;
3741 }
3742 }
3743 }
3744 else
3745 for (i = 0; i < len; ++i)
3746 if (opnd[i] != rex.input[i])
3747 {
3748 status = RA_NOMATCH;
3749 break;
3750 }
3751 rex.input += len;
3752 }
3753 else
3754 status = RA_NOMATCH;
3755 break;
3756 case RE_COMPOSING:
3757 if (enc_utf8)
3758 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003759 // Skip composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003760 while (utf_iscomposing(utf_ptr2char(rex.input)))
3761 MB_CPTR_ADV(rex.input);
3762 }
3763 break;
3764
3765 case NOTHING:
3766 break;
3767
3768 case BACK:
3769 {
3770 int i;
3771 backpos_T *bp;
3772
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003773 // When we run into BACK we need to check if we don't keep
3774 // looping without matching any input. The second and later
3775 // times a BACK is encountered it fails if the input is still
3776 // at the same position as the previous time.
3777 // The positions are stored in "backpos" and found by the
3778 // current value of "scan", the position in the RE program.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003779 bp = (backpos_T *)backpos.ga_data;
3780 for (i = 0; i < backpos.ga_len; ++i)
3781 if (bp[i].bp_scan == scan)
3782 break;
3783 if (i == backpos.ga_len)
3784 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003785 // First time at this BACK, make room to store the pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003786 if (ga_grow(&backpos, 1) == FAIL)
3787 status = RA_FAIL;
3788 else
3789 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003790 // get "ga_data" again, it may have changed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003791 bp = (backpos_T *)backpos.ga_data;
3792 bp[i].bp_scan = scan;
3793 ++backpos.ga_len;
3794 }
3795 }
3796 else if (reg_save_equal(&bp[i].bp_pos))
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003797 // Still at same position as last time, fail.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003798 status = RA_NOMATCH;
3799
3800 if (status != RA_FAIL && status != RA_NOMATCH)
3801 reg_save(&bp[i].bp_pos, &backpos);
3802 }
3803 break;
3804
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003805 case MOPEN + 0: // Match start: \zs
3806 case MOPEN + 1: // \(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003807 case MOPEN + 2:
3808 case MOPEN + 3:
3809 case MOPEN + 4:
3810 case MOPEN + 5:
3811 case MOPEN + 6:
3812 case MOPEN + 7:
3813 case MOPEN + 8:
3814 case MOPEN + 9:
3815 {
3816 no = op - MOPEN;
3817 cleanup_subexpr();
3818 rp = regstack_push(RS_MOPEN, scan);
3819 if (rp == NULL)
3820 status = RA_FAIL;
3821 else
3822 {
3823 rp->rs_no = no;
3824 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
3825 &rex.reg_startp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003826 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003827 }
3828 }
3829 break;
3830
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003831 case NOPEN: // \%(
3832 case NCLOSE: // \) after \%(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003833 if (regstack_push(RS_NOPEN, scan) == NULL)
3834 status = RA_FAIL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003835 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003836 break;
3837
3838#ifdef FEAT_SYN_HL
3839 case ZOPEN + 1:
3840 case ZOPEN + 2:
3841 case ZOPEN + 3:
3842 case ZOPEN + 4:
3843 case ZOPEN + 5:
3844 case ZOPEN + 6:
3845 case ZOPEN + 7:
3846 case ZOPEN + 8:
3847 case ZOPEN + 9:
3848 {
3849 no = op - ZOPEN;
3850 cleanup_zsubexpr();
3851 rp = regstack_push(RS_ZOPEN, scan);
3852 if (rp == NULL)
3853 status = RA_FAIL;
3854 else
3855 {
3856 rp->rs_no = no;
3857 save_se(&rp->rs_un.sesave, &reg_startzpos[no],
3858 &reg_startzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003859 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003860 }
3861 }
3862 break;
3863#endif
3864
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003865 case MCLOSE + 0: // Match end: \ze
3866 case MCLOSE + 1: // \)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003867 case MCLOSE + 2:
3868 case MCLOSE + 3:
3869 case MCLOSE + 4:
3870 case MCLOSE + 5:
3871 case MCLOSE + 6:
3872 case MCLOSE + 7:
3873 case MCLOSE + 8:
3874 case MCLOSE + 9:
3875 {
3876 no = op - MCLOSE;
3877 cleanup_subexpr();
3878 rp = regstack_push(RS_MCLOSE, scan);
3879 if (rp == NULL)
3880 status = RA_FAIL;
3881 else
3882 {
3883 rp->rs_no = no;
3884 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no],
3885 &rex.reg_endp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003886 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003887 }
3888 }
3889 break;
3890
3891#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003892 case ZCLOSE + 1: // \) after \z(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003893 case ZCLOSE + 2:
3894 case ZCLOSE + 3:
3895 case ZCLOSE + 4:
3896 case ZCLOSE + 5:
3897 case ZCLOSE + 6:
3898 case ZCLOSE + 7:
3899 case ZCLOSE + 8:
3900 case ZCLOSE + 9:
3901 {
3902 no = op - ZCLOSE;
3903 cleanup_zsubexpr();
3904 rp = regstack_push(RS_ZCLOSE, scan);
3905 if (rp == NULL)
3906 status = RA_FAIL;
3907 else
3908 {
3909 rp->rs_no = no;
3910 save_se(&rp->rs_un.sesave, &reg_endzpos[no],
3911 &reg_endzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003912 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003913 }
3914 }
3915 break;
3916#endif
3917
3918 case BACKREF + 1:
3919 case BACKREF + 2:
3920 case BACKREF + 3:
3921 case BACKREF + 4:
3922 case BACKREF + 5:
3923 case BACKREF + 6:
3924 case BACKREF + 7:
3925 case BACKREF + 8:
3926 case BACKREF + 9:
3927 {
3928 int len;
3929
3930 no = op - BACKREF;
3931 cleanup_subexpr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003932 if (!REG_MULTI) // Single-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003933 {
3934 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL)
3935 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003936 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003937 len = 0;
3938 }
3939 else
3940 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003941 // Compare current input with back-ref in the same
3942 // line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003943 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
3944 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0)
3945 status = RA_NOMATCH;
3946 }
3947 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003948 else // Multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003949 {
3950 if (rex.reg_startpos[no].lnum < 0
3951 || rex.reg_endpos[no].lnum < 0)
3952 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003953 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003954 len = 0;
3955 }
3956 else
3957 {
3958 if (rex.reg_startpos[no].lnum == rex.lnum
3959 && rex.reg_endpos[no].lnum == rex.lnum)
3960 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003961 // Compare back-ref within the current line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003962 len = rex.reg_endpos[no].col
3963 - rex.reg_startpos[no].col;
3964 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
3965 rex.input, &len) != 0)
3966 status = RA_NOMATCH;
3967 }
3968 else
3969 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003970 // Messy situation: Need to compare between two
3971 // lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003972 int r = match_with_backref(
3973 rex.reg_startpos[no].lnum,
3974 rex.reg_startpos[no].col,
3975 rex.reg_endpos[no].lnum,
3976 rex.reg_endpos[no].col,
3977 &len);
3978
3979 if (r != RA_MATCH)
3980 status = r;
3981 }
3982 }
3983 }
3984
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003985 // Matched the backref, skip over it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003986 rex.input += len;
3987 }
3988 break;
3989
3990#ifdef FEAT_SYN_HL
3991 case ZREF + 1:
3992 case ZREF + 2:
3993 case ZREF + 3:
3994 case ZREF + 4:
3995 case ZREF + 5:
3996 case ZREF + 6:
3997 case ZREF + 7:
3998 case ZREF + 8:
3999 case ZREF + 9:
4000 {
4001 int len;
4002
4003 cleanup_zsubexpr();
4004 no = op - ZREF;
4005 if (re_extmatch_in != NULL
4006 && re_extmatch_in->matches[no] != NULL)
4007 {
4008 len = (int)STRLEN(re_extmatch_in->matches[no]);
4009 if (cstrncmp(re_extmatch_in->matches[no],
4010 rex.input, &len) != 0)
4011 status = RA_NOMATCH;
4012 else
4013 rex.input += len;
4014 }
4015 else
4016 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004017 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004018 }
4019 }
4020 break;
4021#endif
4022
4023 case BRANCH:
4024 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004025 if (OP(next) != BRANCH) // No choice.
4026 next = OPERAND(scan); // Avoid recursion.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004027 else
4028 {
4029 rp = regstack_push(RS_BRANCH, scan);
4030 if (rp == NULL)
4031 status = RA_FAIL;
4032 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004033 status = RA_BREAK; // rest is below
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004034 }
4035 }
4036 break;
4037
4038 case BRACE_LIMITS:
4039 {
4040 if (OP(next) == BRACE_SIMPLE)
4041 {
4042 bl_minval = OPERAND_MIN(scan);
4043 bl_maxval = OPERAND_MAX(scan);
4044 }
4045 else if (OP(next) >= BRACE_COMPLEX
4046 && OP(next) < BRACE_COMPLEX + 10)
4047 {
4048 no = OP(next) - BRACE_COMPLEX;
4049 brace_min[no] = OPERAND_MIN(scan);
4050 brace_max[no] = OPERAND_MAX(scan);
4051 brace_count[no] = 0;
4052 }
4053 else
4054 {
4055 internal_error("BRACE_LIMITS");
4056 status = RA_FAIL;
4057 }
4058 }
4059 break;
4060
4061 case BRACE_COMPLEX + 0:
4062 case BRACE_COMPLEX + 1:
4063 case BRACE_COMPLEX + 2:
4064 case BRACE_COMPLEX + 3:
4065 case BRACE_COMPLEX + 4:
4066 case BRACE_COMPLEX + 5:
4067 case BRACE_COMPLEX + 6:
4068 case BRACE_COMPLEX + 7:
4069 case BRACE_COMPLEX + 8:
4070 case BRACE_COMPLEX + 9:
4071 {
4072 no = op - BRACE_COMPLEX;
4073 ++brace_count[no];
4074
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004075 // If not matched enough times yet, try one more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004076 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4077 ? brace_min[no] : brace_max[no]))
4078 {
4079 rp = regstack_push(RS_BRCPLX_MORE, scan);
4080 if (rp == NULL)
4081 status = RA_FAIL;
4082 else
4083 {
4084 rp->rs_no = no;
4085 reg_save(&rp->rs_un.regsave, &backpos);
4086 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004087 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004088 }
4089 break;
4090 }
4091
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004092 // If matched enough times, may try matching some more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004093 if (brace_min[no] <= brace_max[no])
4094 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004095 // Range is the normal way around, use longest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004096 if (brace_count[no] <= brace_max[no])
4097 {
4098 rp = regstack_push(RS_BRCPLX_LONG, scan);
4099 if (rp == NULL)
4100 status = RA_FAIL;
4101 else
4102 {
4103 rp->rs_no = no;
4104 reg_save(&rp->rs_un.regsave, &backpos);
4105 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004106 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004107 }
4108 }
4109 }
4110 else
4111 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004112 // Range is backwards, use shortest match first
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004113 if (brace_count[no] <= brace_min[no])
4114 {
4115 rp = regstack_push(RS_BRCPLX_SHORT, scan);
4116 if (rp == NULL)
4117 status = RA_FAIL;
4118 else
4119 {
4120 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004121 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004122 }
4123 }
4124 }
4125 }
4126 break;
4127
4128 case BRACE_SIMPLE:
4129 case STAR:
4130 case PLUS:
4131 {
4132 regstar_T rst;
4133
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004134 // Lookahead to avoid useless match attempts when we know
4135 // what character comes next.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004136 if (OP(next) == EXACTLY)
4137 {
4138 rst.nextb = *OPERAND(next);
4139 if (rex.reg_ic)
4140 {
4141 if (MB_ISUPPER(rst.nextb))
4142 rst.nextb_ic = MB_TOLOWER(rst.nextb);
4143 else
4144 rst.nextb_ic = MB_TOUPPER(rst.nextb);
4145 }
4146 else
4147 rst.nextb_ic = rst.nextb;
4148 }
4149 else
4150 {
4151 rst.nextb = NUL;
4152 rst.nextb_ic = NUL;
4153 }
4154 if (op != BRACE_SIMPLE)
4155 {
4156 rst.minval = (op == STAR) ? 0 : 1;
4157 rst.maxval = MAX_LIMIT;
4158 }
4159 else
4160 {
4161 rst.minval = bl_minval;
4162 rst.maxval = bl_maxval;
4163 }
4164
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004165 // When maxval > minval, try matching as much as possible, up
4166 // to maxval. When maxval < minval, try matching at least the
4167 // minimal number (since the range is backwards, that's also
4168 // maxval!).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004169 rst.count = regrepeat(OPERAND(scan), rst.maxval);
4170 if (got_int)
4171 {
4172 status = RA_FAIL;
4173 break;
4174 }
4175 if (rst.minval <= rst.maxval
4176 ? rst.count >= rst.minval : rst.count >= rst.maxval)
4177 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004178 // It could match. Prepare for trying to match what
4179 // follows. The code is below. Parameters are stored in
4180 // a regstar_T on the regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004181 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4182 {
4183 emsg(_(e_maxmempat));
4184 status = RA_FAIL;
4185 }
4186 else if (ga_grow(&regstack, sizeof(regstar_T)) == FAIL)
4187 status = RA_FAIL;
4188 else
4189 {
4190 regstack.ga_len += sizeof(regstar_T);
4191 rp = regstack_push(rst.minval <= rst.maxval
4192 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4193 if (rp == NULL)
4194 status = RA_FAIL;
4195 else
4196 {
4197 *(((regstar_T *)rp) - 1) = rst;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004198 status = RA_BREAK; // skip the restore bits
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004199 }
4200 }
4201 }
4202 else
4203 status = RA_NOMATCH;
4204
4205 }
4206 break;
4207
4208 case NOMATCH:
4209 case MATCH:
4210 case SUBPAT:
4211 rp = regstack_push(RS_NOMATCH, scan);
4212 if (rp == NULL)
4213 status = RA_FAIL;
4214 else
4215 {
4216 rp->rs_no = op;
4217 reg_save(&rp->rs_un.regsave, &backpos);
4218 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004219 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004220 }
4221 break;
4222
4223 case BEHIND:
4224 case NOBEHIND:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004225 // Need a bit of room to store extra positions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004226 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4227 {
4228 emsg(_(e_maxmempat));
4229 status = RA_FAIL;
4230 }
4231 else if (ga_grow(&regstack, sizeof(regbehind_T)) == FAIL)
4232 status = RA_FAIL;
4233 else
4234 {
4235 regstack.ga_len += sizeof(regbehind_T);
4236 rp = regstack_push(RS_BEHIND1, scan);
4237 if (rp == NULL)
4238 status = RA_FAIL;
4239 else
4240 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004241 // Need to save the subexpr to be able to restore them
4242 // when there is a match but we don't use it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004243 save_subexpr(((regbehind_T *)rp) - 1);
4244
4245 rp->rs_no = op;
4246 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004247 // First try if what follows matches. If it does then we
4248 // check the behind match by looping.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004249 }
4250 }
4251 break;
4252
4253 case BHPOS:
4254 if (REG_MULTI)
4255 {
4256 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4257 || behind_pos.rs_u.pos.lnum != rex.lnum)
4258 status = RA_NOMATCH;
4259 }
4260 else if (behind_pos.rs_u.ptr != rex.input)
4261 status = RA_NOMATCH;
4262 break;
4263
4264 case NEWL:
4265 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4266 || rex.reg_line_lbr)
4267 && (c != '\n' || !rex.reg_line_lbr))
4268 status = RA_NOMATCH;
4269 else if (rex.reg_line_lbr)
4270 ADVANCE_REGINPUT();
4271 else
4272 reg_nextline();
4273 break;
4274
4275 case END:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004276 status = RA_MATCH; // Success!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004277 break;
4278
4279 default:
Bram Moolenaare83cca22020-09-07 18:53:21 +02004280 iemsg(_(e_re_corr));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004281#ifdef DEBUG
4282 printf("Illegal op code %d\n", op);
4283#endif
4284 status = RA_FAIL;
4285 break;
4286 }
4287 }
4288
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004289 // If we can't continue sequentially, break the inner loop.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004290 if (status != RA_CONT)
4291 break;
4292
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004293 // Continue in inner loop, advance to next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004294 scan = next;
4295
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004296 } // end of inner loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004297
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004298 // If there is something on the regstack execute the code for the state.
4299 // If the state is popped then loop and use the older state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004300 while (regstack.ga_len > 0 && status != RA_FAIL)
4301 {
4302 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4303 switch (rp->rs_state)
4304 {
4305 case RS_NOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004306 // Result is passed on as-is, simply pop the state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004307 regstack_pop(&scan);
4308 break;
4309
4310 case RS_MOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004311 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004312 if (status == RA_NOMATCH)
4313 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4314 &rex.reg_startp[rp->rs_no]);
4315 regstack_pop(&scan);
4316 break;
4317
4318#ifdef FEAT_SYN_HL
4319 case RS_ZOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004320 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004321 if (status == RA_NOMATCH)
4322 restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4323 &reg_startzp[rp->rs_no]);
4324 regstack_pop(&scan);
4325 break;
4326#endif
4327
4328 case RS_MCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004329 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004330 if (status == RA_NOMATCH)
4331 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4332 &rex.reg_endp[rp->rs_no]);
4333 regstack_pop(&scan);
4334 break;
4335
4336#ifdef FEAT_SYN_HL
4337 case RS_ZCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004338 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004339 if (status == RA_NOMATCH)
4340 restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4341 &reg_endzp[rp->rs_no]);
4342 regstack_pop(&scan);
4343 break;
4344#endif
4345
4346 case RS_BRANCH:
4347 if (status == RA_MATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004348 // this branch matched, use it
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004349 regstack_pop(&scan);
4350 else
4351 {
4352 if (status != RA_BREAK)
4353 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004354 // After a non-matching branch: try next one.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004355 reg_restore(&rp->rs_un.regsave, &backpos);
4356 scan = rp->rs_scan;
4357 }
4358 if (scan == NULL || OP(scan) != BRANCH)
4359 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004360 // no more branches, didn't find a match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004361 status = RA_NOMATCH;
4362 regstack_pop(&scan);
4363 }
4364 else
4365 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004366 // Prepare to try a branch.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004367 rp->rs_scan = regnext(scan);
4368 reg_save(&rp->rs_un.regsave, &backpos);
4369 scan = OPERAND(scan);
4370 }
4371 }
4372 break;
4373
4374 case RS_BRCPLX_MORE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004375 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004376 if (status == RA_NOMATCH)
4377 {
4378 reg_restore(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004379 --brace_count[rp->rs_no]; // decrement match count
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004380 }
4381 regstack_pop(&scan);
4382 break;
4383
4384 case RS_BRCPLX_LONG:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004385 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004386 if (status == RA_NOMATCH)
4387 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004388 // There was no match, but we did find enough matches.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004389 reg_restore(&rp->rs_un.regsave, &backpos);
4390 --brace_count[rp->rs_no];
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004391 // continue with the items after "\{}"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004392 status = RA_CONT;
4393 }
4394 regstack_pop(&scan);
4395 if (status == RA_CONT)
4396 scan = regnext(scan);
4397 break;
4398
4399 case RS_BRCPLX_SHORT:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004400 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004401 if (status == RA_NOMATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004402 // There was no match, try to match one more item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004403 reg_restore(&rp->rs_un.regsave, &backpos);
4404 regstack_pop(&scan);
4405 if (status == RA_NOMATCH)
4406 {
4407 scan = OPERAND(scan);
4408 status = RA_CONT;
4409 }
4410 break;
4411
4412 case RS_NOMATCH:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004413 // Pop the state. If the operand matches for NOMATCH or
4414 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4415 // except for SUBPAT, and continue with the next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004416 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4417 status = RA_NOMATCH;
4418 else
4419 {
4420 status = RA_CONT;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004421 if (rp->rs_no != SUBPAT) // zero-width
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004422 reg_restore(&rp->rs_un.regsave, &backpos);
4423 }
4424 regstack_pop(&scan);
4425 if (status == RA_CONT)
4426 scan = regnext(scan);
4427 break;
4428
4429 case RS_BEHIND1:
4430 if (status == RA_NOMATCH)
4431 {
4432 regstack_pop(&scan);
4433 regstack.ga_len -= sizeof(regbehind_T);
4434 }
4435 else
4436 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004437 // The stuff after BEHIND/NOBEHIND matches. Now try if
4438 // the behind part does (not) match before the current
4439 // position in the input. This must be done at every
4440 // position in the input and checking if the match ends at
4441 // the current position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004442
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004443 // save the position after the found match for next
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004444 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4445
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004446 // Start looking for a match with operand at the current
4447 // position. Go back one character until we find the
4448 // result, hitting the start of the line or the previous
4449 // line (for multi-line matching).
4450 // Set behind_pos to where the match should end, BHPOS
4451 // will match it. Save the current value.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004452 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4453 behind_pos = rp->rs_un.regsave;
4454
4455 rp->rs_state = RS_BEHIND2;
4456
4457 reg_restore(&rp->rs_un.regsave, &backpos);
4458 scan = OPERAND(rp->rs_scan) + 4;
4459 }
4460 break;
4461
4462 case RS_BEHIND2:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004463 // Looping for BEHIND / NOBEHIND match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004464 if (status == RA_MATCH && reg_save_equal(&behind_pos))
4465 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004466 // found a match that ends where "next" started
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004467 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4468 if (rp->rs_no == BEHIND)
4469 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4470 &backpos);
4471 else
4472 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004473 // But we didn't want a match. Need to restore the
4474 // subexpr, because what follows matched, so they have
4475 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004476 status = RA_NOMATCH;
4477 restore_subexpr(((regbehind_T *)rp) - 1);
4478 }
4479 regstack_pop(&scan);
4480 regstack.ga_len -= sizeof(regbehind_T);
4481 }
4482 else
4483 {
4484 long limit;
4485
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004486 // No match or a match that doesn't end where we want it: Go
4487 // back one character. May go to previous line once.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004488 no = OK;
4489 limit = OPERAND_MIN(rp->rs_scan);
4490 if (REG_MULTI)
4491 {
4492 if (limit > 0
4493 && ((rp->rs_un.regsave.rs_u.pos.lnum
4494 < behind_pos.rs_u.pos.lnum
4495 ? (colnr_T)STRLEN(rex.line)
4496 : behind_pos.rs_u.pos.col)
4497 - rp->rs_un.regsave.rs_u.pos.col >= limit))
4498 no = FAIL;
4499 else if (rp->rs_un.regsave.rs_u.pos.col == 0)
4500 {
4501 if (rp->rs_un.regsave.rs_u.pos.lnum
4502 < behind_pos.rs_u.pos.lnum
4503 || reg_getline(
4504 --rp->rs_un.regsave.rs_u.pos.lnum)
4505 == NULL)
4506 no = FAIL;
4507 else
4508 {
4509 reg_restore(&rp->rs_un.regsave, &backpos);
4510 rp->rs_un.regsave.rs_u.pos.col =
4511 (colnr_T)STRLEN(rex.line);
4512 }
4513 }
4514 else
4515 {
4516 if (has_mbyte)
4517 {
4518 char_u *line =
4519 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
4520
4521 rp->rs_un.regsave.rs_u.pos.col -=
4522 (*mb_head_off)(line, line
4523 + rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
4524 }
4525 else
4526 --rp->rs_un.regsave.rs_u.pos.col;
4527 }
4528 }
4529 else
4530 {
4531 if (rp->rs_un.regsave.rs_u.ptr == rex.line)
4532 no = FAIL;
4533 else
4534 {
4535 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
4536 if (limit > 0 && (long)(behind_pos.rs_u.ptr
4537 - rp->rs_un.regsave.rs_u.ptr) > limit)
4538 no = FAIL;
4539 }
4540 }
4541 if (no == OK)
4542 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004543 // Advanced, prepare for finding match again.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004544 reg_restore(&rp->rs_un.regsave, &backpos);
4545 scan = OPERAND(rp->rs_scan) + 4;
4546 if (status == RA_MATCH)
4547 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004548 // We did match, so subexpr may have been changed,
4549 // need to restore them for the next try.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004550 status = RA_NOMATCH;
4551 restore_subexpr(((regbehind_T *)rp) - 1);
4552 }
4553 }
4554 else
4555 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004556 // Can't advance. For NOBEHIND that's a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004557 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4558 if (rp->rs_no == NOBEHIND)
4559 {
4560 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4561 &backpos);
4562 status = RA_MATCH;
4563 }
4564 else
4565 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004566 // We do want a proper match. Need to restore the
4567 // subexpr if we had a match, because they may have
4568 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004569 if (status == RA_MATCH)
4570 {
4571 status = RA_NOMATCH;
4572 restore_subexpr(((regbehind_T *)rp) - 1);
4573 }
4574 }
4575 regstack_pop(&scan);
4576 regstack.ga_len -= sizeof(regbehind_T);
4577 }
4578 }
4579 break;
4580
4581 case RS_STAR_LONG:
4582 case RS_STAR_SHORT:
4583 {
4584 regstar_T *rst = ((regstar_T *)rp) - 1;
4585
4586 if (status == RA_MATCH)
4587 {
4588 regstack_pop(&scan);
4589 regstack.ga_len -= sizeof(regstar_T);
4590 break;
4591 }
4592
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004593 // Tried once already, restore input pointers.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004594 if (status != RA_BREAK)
4595 reg_restore(&rp->rs_un.regsave, &backpos);
4596
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004597 // Repeat until we found a position where it could match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004598 for (;;)
4599 {
4600 if (status != RA_BREAK)
4601 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004602 // Tried first position already, advance.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004603 if (rp->rs_state == RS_STAR_LONG)
4604 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004605 // Trying for longest match, but couldn't or
4606 // didn't match -- back up one char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004607 if (--rst->count < rst->minval)
4608 break;
4609 if (rex.input == rex.line)
4610 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004611 // backup to last char of previous line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004612 --rex.lnum;
4613 rex.line = reg_getline(rex.lnum);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004614 // Just in case regrepeat() didn't count
4615 // right.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004616 if (rex.line == NULL)
4617 break;
4618 rex.input = rex.line + STRLEN(rex.line);
4619 fast_breakcheck();
4620 }
4621 else
4622 MB_PTR_BACK(rex.line, rex.input);
4623 }
4624 else
4625 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004626 // Range is backwards, use shortest match first.
4627 // Careful: maxval and minval are exchanged!
4628 // Couldn't or didn't match: try advancing one
4629 // char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004630 if (rst->count == rst->minval
4631 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
4632 break;
4633 ++rst->count;
4634 }
4635 if (got_int)
4636 break;
4637 }
4638 else
4639 status = RA_NOMATCH;
4640
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004641 // If it could match, try it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004642 if (rst->nextb == NUL || *rex.input == rst->nextb
4643 || *rex.input == rst->nextb_ic)
4644 {
4645 reg_save(&rp->rs_un.regsave, &backpos);
4646 scan = regnext(rp->rs_scan);
4647 status = RA_CONT;
4648 break;
4649 }
4650 }
4651 if (status != RA_CONT)
4652 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004653 // Failed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004654 regstack_pop(&scan);
4655 regstack.ga_len -= sizeof(regstar_T);
4656 status = RA_NOMATCH;
4657 }
4658 }
4659 break;
4660 }
4661
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004662 // If we want to continue the inner loop or didn't pop a state
4663 // continue matching loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004664 if (status == RA_CONT || rp == (regitem_T *)
4665 ((char *)regstack.ga_data + regstack.ga_len) - 1)
4666 break;
4667 }
4668
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004669 // May need to continue with the inner loop, starting at "scan".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004670 if (status == RA_CONT)
4671 continue;
4672
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004673 // If the regstack is empty or something failed we are done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004674 if (regstack.ga_len == 0 || status == RA_FAIL)
4675 {
4676 if (scan == NULL)
4677 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004678 // We get here only if there's trouble -- normally "case END" is
4679 // the terminating point.
Bram Moolenaare83cca22020-09-07 18:53:21 +02004680 iemsg(_(e_re_corr));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004681#ifdef DEBUG
4682 printf("Premature EOL\n");
4683#endif
4684 }
4685 return (status == RA_MATCH);
4686 }
4687
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004688 } // End of loop until the regstack is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004689
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004690 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004691}
4692
4693/*
4694 * regtry - try match of "prog" with at rex.line["col"].
4695 * Returns 0 for failure, number of lines contained in the match otherwise.
4696 */
4697 static long
4698regtry(
4699 bt_regprog_T *prog,
4700 colnr_T col,
4701 proftime_T *tm, // timeout limit or NULL
4702 int *timed_out) // flag set on timeout or NULL
4703{
4704 rex.input = rex.line + col;
4705 rex.need_clear_subexpr = TRUE;
4706#ifdef FEAT_SYN_HL
4707 // Clear the external match subpointers if necessary.
4708 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
4709#endif
4710
4711 if (regmatch(prog->program + 1, tm, timed_out) == 0)
4712 return 0;
4713
4714 cleanup_subexpr();
4715 if (REG_MULTI)
4716 {
4717 if (rex.reg_startpos[0].lnum < 0)
4718 {
4719 rex.reg_startpos[0].lnum = 0;
4720 rex.reg_startpos[0].col = col;
4721 }
4722 if (rex.reg_endpos[0].lnum < 0)
4723 {
4724 rex.reg_endpos[0].lnum = rex.lnum;
4725 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
4726 }
4727 else
4728 // Use line number of "\ze".
4729 rex.lnum = rex.reg_endpos[0].lnum;
4730 }
4731 else
4732 {
4733 if (rex.reg_startp[0] == NULL)
4734 rex.reg_startp[0] = rex.line + col;
4735 if (rex.reg_endp[0] == NULL)
4736 rex.reg_endp[0] = rex.input;
4737 }
4738#ifdef FEAT_SYN_HL
4739 // Package any found \z(...\) matches for export. Default is none.
4740 unref_extmatch(re_extmatch_out);
4741 re_extmatch_out = NULL;
4742
4743 if (prog->reghasz == REX_SET)
4744 {
4745 int i;
4746
4747 cleanup_zsubexpr();
4748 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01004749 if (re_extmatch_out == NULL)
4750 return 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004751 for (i = 0; i < NSUBEXP; i++)
4752 {
4753 if (REG_MULTI)
4754 {
4755 // Only accept single line matches.
4756 if (reg_startzpos[i].lnum >= 0
4757 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
4758 && reg_endzpos[i].col >= reg_startzpos[i].col)
4759 re_extmatch_out->matches[i] =
4760 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
4761 + reg_startzpos[i].col,
4762 reg_endzpos[i].col - reg_startzpos[i].col);
4763 }
4764 else
4765 {
4766 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
4767 re_extmatch_out->matches[i] =
4768 vim_strnsave(reg_startzp[i],
Bram Moolenaar71ccd032020-06-12 22:59:11 +02004769 reg_endzp[i] - reg_startzp[i]);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004770 }
4771 }
4772 }
4773#endif
4774 return 1 + rex.lnum;
4775}
4776
4777/*
4778 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02004779 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004780 * Returns 0 for failure, number of lines contained in the match otherwise.
4781 */
4782 static long
4783bt_regexec_both(
4784 char_u *line,
4785 colnr_T col, // column to start looking for match
4786 proftime_T *tm, // timeout limit or NULL
4787 int *timed_out) // flag set on timeout or NULL
4788{
4789 bt_regprog_T *prog;
4790 char_u *s;
4791 long retval = 0L;
4792
4793 // Create "regstack" and "backpos" if they are not allocated yet.
4794 // We allocate *_INITIAL amount of bytes first and then set the grow size
4795 // to much bigger value to avoid many malloc calls in case of deep regular
4796 // expressions.
4797 if (regstack.ga_data == NULL)
4798 {
4799 // Use an item size of 1 byte, since we push different things
4800 // onto the regstack.
4801 ga_init2(&regstack, 1, REGSTACK_INITIAL);
4802 (void)ga_grow(&regstack, REGSTACK_INITIAL);
4803 regstack.ga_growsize = REGSTACK_INITIAL * 8;
4804 }
4805
4806 if (backpos.ga_data == NULL)
4807 {
4808 ga_init2(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
4809 (void)ga_grow(&backpos, BACKPOS_INITIAL);
4810 backpos.ga_growsize = BACKPOS_INITIAL * 8;
4811 }
4812
4813 if (REG_MULTI)
4814 {
4815 prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
4816 line = reg_getline((linenr_T)0);
4817 rex.reg_startpos = rex.reg_mmatch->startpos;
4818 rex.reg_endpos = rex.reg_mmatch->endpos;
4819 }
4820 else
4821 {
4822 prog = (bt_regprog_T *)rex.reg_match->regprog;
4823 rex.reg_startp = rex.reg_match->startp;
4824 rex.reg_endp = rex.reg_match->endp;
4825 }
4826
4827 // Be paranoid...
4828 if (prog == NULL || line == NULL)
4829 {
Bram Moolenaare83cca22020-09-07 18:53:21 +02004830 iemsg(_(e_null));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004831 goto theend;
4832 }
4833
4834 // Check validity of program.
4835 if (prog_magic_wrong())
4836 goto theend;
4837
4838 // If the start column is past the maximum column: no need to try.
4839 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4840 goto theend;
4841
4842 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
4843 if (prog->regflags & RF_ICASE)
4844 rex.reg_ic = TRUE;
4845 else if (prog->regflags & RF_NOICASE)
4846 rex.reg_ic = FALSE;
4847
4848 // If pattern contains "\Z" overrule value of rex.reg_icombine
4849 if (prog->regflags & RF_ICOMBINE)
4850 rex.reg_icombine = TRUE;
4851
4852 // If there is a "must appear" string, look for it.
4853 if (prog->regmust != NULL)
4854 {
4855 int c;
4856
4857 if (has_mbyte)
4858 c = (*mb_ptr2char)(prog->regmust);
4859 else
4860 c = *prog->regmust;
4861 s = line + col;
4862
4863 // This is used very often, esp. for ":global". Use three versions of
4864 // the loop to avoid overhead of conditions.
4865 if (!rex.reg_ic && !has_mbyte)
4866 while ((s = vim_strbyte(s, c)) != NULL)
4867 {
4868 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4869 break; // Found it.
4870 ++s;
4871 }
4872 else if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
4873 while ((s = vim_strchr(s, c)) != NULL)
4874 {
4875 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4876 break; // Found it.
4877 MB_PTR_ADV(s);
4878 }
4879 else
4880 while ((s = cstrchr(s, c)) != NULL)
4881 {
4882 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4883 break; // Found it.
4884 MB_PTR_ADV(s);
4885 }
4886 if (s == NULL) // Not present.
4887 goto theend;
4888 }
4889
4890 rex.line = line;
4891 rex.lnum = 0;
4892 reg_toolong = FALSE;
4893
4894 // Simplest case: Anchored match need be tried only once.
4895 if (prog->reganch)
4896 {
4897 int c;
4898
4899 if (has_mbyte)
4900 c = (*mb_ptr2char)(rex.line + col);
4901 else
4902 c = rex.line[col];
4903 if (prog->regstart == NUL
4904 || prog->regstart == c
4905 || (rex.reg_ic
4906 && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
4907 || (c < 255 && prog->regstart < 255 &&
4908 MB_TOLOWER(prog->regstart) == MB_TOLOWER(c)))))
4909 retval = regtry(prog, col, tm, timed_out);
4910 else
4911 retval = 0;
4912 }
4913 else
4914 {
4915#ifdef FEAT_RELTIME
4916 int tm_count = 0;
4917#endif
4918 // Messy cases: unanchored match.
4919 while (!got_int)
4920 {
4921 if (prog->regstart != NUL)
4922 {
4923 // Skip until the char we know it must start with.
4924 // Used often, do some work to avoid call overhead.
4925 if (!rex.reg_ic && !has_mbyte)
4926 s = vim_strbyte(rex.line + col, prog->regstart);
4927 else
4928 s = cstrchr(rex.line + col, prog->regstart);
4929 if (s == NULL)
4930 {
4931 retval = 0;
4932 break;
4933 }
4934 col = (int)(s - rex.line);
4935 }
4936
4937 // Check for maximum column to try.
4938 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4939 {
4940 retval = 0;
4941 break;
4942 }
4943
4944 retval = regtry(prog, col, tm, timed_out);
4945 if (retval > 0)
4946 break;
4947
4948 // if not currently on the first line, get it again
4949 if (rex.lnum != 0)
4950 {
4951 rex.lnum = 0;
4952 rex.line = reg_getline((linenr_T)0);
4953 }
4954 if (rex.line[col] == NUL)
4955 break;
4956 if (has_mbyte)
4957 col += (*mb_ptr2len)(rex.line + col);
4958 else
4959 ++col;
4960#ifdef FEAT_RELTIME
4961 // Check for timeout once in a twenty times to avoid overhead.
4962 if (tm != NULL && ++tm_count == 20)
4963 {
4964 tm_count = 0;
4965 if (profile_passed_limit(tm))
4966 {
4967 if (timed_out != NULL)
4968 *timed_out = TRUE;
4969 break;
4970 }
4971 }
4972#endif
4973 }
4974 }
4975
4976theend:
4977 // Free "reg_tofree" when it's a bit big.
4978 // Free regstack and backpos if they are bigger than their initial size.
4979 if (reg_tofreelen > 400)
4980 VIM_CLEAR(reg_tofree);
4981 if (regstack.ga_maxlen > REGSTACK_INITIAL)
4982 ga_clear(&regstack);
4983 if (backpos.ga_maxlen > BACKPOS_INITIAL)
4984 ga_clear(&backpos);
4985
Bram Moolenaara3d10a52020-12-21 18:24:00 +01004986 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01004987 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01004988 // Make sure the end is never before the start. Can happen when \zs
4989 // and \ze are used.
4990 if (REG_MULTI)
4991 {
4992 lpos_T *start = &rex.reg_mmatch->startpos[0];
4993 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01004994
Bram Moolenaara3d10a52020-12-21 18:24:00 +01004995 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01004996 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01004997 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
4998 }
4999 else
5000 {
5001 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
5002 rex.reg_match->endp[0] = rex.reg_match->startp[0];
5003 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005004 }
5005
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005006 return retval;
5007}
5008
5009/*
5010 * Match a regexp against a string.
5011 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5012 * Uses curbuf for line count and 'iskeyword'.
5013 * if "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
5014 *
5015 * Returns 0 for failure, number of lines contained in the match otherwise.
5016 */
5017 static int
5018bt_regexec_nl(
5019 regmatch_T *rmp,
5020 char_u *line, // string to match against
5021 colnr_T col, // column to start looking for match
5022 int line_lbr)
5023{
5024 rex.reg_match = rmp;
5025 rex.reg_mmatch = NULL;
5026 rex.reg_maxline = 0;
5027 rex.reg_line_lbr = line_lbr;
5028 rex.reg_buf = curbuf;
5029 rex.reg_win = NULL;
5030 rex.reg_ic = rmp->rm_ic;
5031 rex.reg_icombine = FALSE;
5032 rex.reg_maxcol = 0;
5033
5034 return bt_regexec_both(line, col, NULL, NULL);
5035}
5036
5037/*
5038 * Match a regexp against multiple lines.
5039 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5040 * Uses curbuf for line count and 'iskeyword'.
5041 *
5042 * Return zero if there is no match. Return number of lines contained in the
5043 * match otherwise.
5044 */
5045 static long
5046bt_regexec_multi(
5047 regmmatch_T *rmp,
5048 win_T *win, // window in which to search or NULL
5049 buf_T *buf, // buffer in which to search
5050 linenr_T lnum, // nr of line to start looking for match
5051 colnr_T col, // column to start looking for match
5052 proftime_T *tm, // timeout limit or NULL
5053 int *timed_out) // flag set on timeout or NULL
5054{
Bram Moolenaarf4140482020-02-15 23:06:45 +01005055 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005056 return bt_regexec_both(NULL, col, tm, timed_out);
5057}
5058
5059/*
5060 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5061 */
5062 static int
5063re_num_cmp(long_u val, char_u *scan)
5064{
5065 long_u n = OPERAND_MIN(scan);
5066
5067 if (OPERAND_CMP(scan) == '>')
5068 return val > n;
5069 if (OPERAND_CMP(scan) == '<')
5070 return val < n;
5071 return val == n;
5072}
5073
#ifdef BT_REGEXP_DUMP

/*
 * regdump - dump a regexp in vaguely comprehensible form
 *
 * Lists every node of the compiled program "r" for "pattern", followed by
 * the header fields of interest.  The output is appended to
 * "bt_regexp_log.log" when BT_REGEXP_LOG is defined, otherwise it is written
 * to stdout.
 */
    static void
regdump(char_u *pattern, bt_regprog_T *r)
{
    char_u      *scan;
    char_u      *nxt;
    char_u      *last = NULL;       // furthest "next" position seen so far
    int         op = EXACTLY;       // Arbitrary non-END op.
    FILE        *out;

#ifdef BT_REGEXP_LOG
    out = fopen("bt_regexp_log.log", "a");
#else
    out = stdout;
#endif
    if (out == NULL)
        return;
    fprintf(out, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);

    // Loop until we find the END that isn't before a referred next (an END
    // can also appear in a NOMATCH operand).
    for (scan = r->program + 1; op != END || scan <= last; )
    {
        int     operand_len = 0;    // bytes of numeric operand in this node

        op = OP(scan);
        fprintf(out, "%2d%s", (int)(scan - r->program), regprop(scan)); // Where, what.

        nxt = regnext(scan);        // Next ptr.
        if (nxt != NULL)
            fprintf(out, "(%d)", (int)((scan - r->program) + (nxt - scan)));
        else
            fprintf(out, "(0)");
        if (last < nxt)
            last = nxt;

        switch (op)
        {
            case BRACE_LIMITS:
                // Two ints
                fprintf(out, " minval %ld, maxval %ld", OPERAND_MIN(scan), OPERAND_MAX(scan));
                operand_len = 8;
                break;

            case BEHIND:
            case NOBEHIND:
                // one int
                fprintf(out, " count %ld", OPERAND_MIN(scan));
                operand_len = 4;
                break;

            case RE_LNUM:
            case RE_COL:
            case RE_VCOL:
                // one int plus comparator
                fprintf(out, " count %ld", OPERAND_MIN(scan));
                operand_len = 5;
                break;

            default:
                break;
        }

        // Step over the numeric operand, if any, and the three bytes that
        // every node starts with.
        scan += operand_len + 3;

        if (op == EXACTLY
                || op == ANYOF || op == ANYOF + ADD_NL
                || op == ANYBUT || op == ANYBUT + ADD_NL)
        {
            // Literal string, where present.
            fprintf(out, "\nxxxxxxxxx\n");
            for ( ; *scan != NUL; ++scan)
                fprintf(out, "%c", *scan);
            fprintf(out, "\nxxxxxxxxx\n");
            ++scan;                 // skip the terminating NUL
        }
        fprintf(out, "\r\n");
    }

    // Header fields of interest.
    if (r->regstart != NUL)
        fprintf(out, "start `%s' 0x%x; ", r->regstart < 256
                ? (char *)transchar(r->regstart)
                : "multibyte", r->regstart);
    if (r->reganch)
        fprintf(out, "anchored; ");
    if (r->regmust != NULL)
        fprintf(out, "must have \"%s\"", r->regmust);
    fprintf(out, "\r\n");

#ifdef BT_REGEXP_LOG
    fclose(out);
#endif
}
#endif // BT_REGEXP_DUMP
5160
#ifdef DEBUG
/*
 * regprop - printable representation of opcode
 *
 * Returns a pointer to a static buffer holding ":" followed by the name of
 * the opcode of node "op", or by "corrupt <number>" for an unknown opcode.
 * The buffer is overwritten by the next call.
 */
    static char_u *
regprop(char_u *op)
{
    // Opcodes that have one fixed name.
    static struct
    {
        int     code;
        char    *name;
    } fixed_names[] =
    {
        {BOL, "BOL"},                   {EOL, "EOL"},
        {RE_BOF, "BOF"},                {RE_EOF, "EOF"},
        {CURSOR, "CURSOR"},             {RE_VISUAL, "RE_VISUAL"},
        {RE_LNUM, "RE_LNUM"},           {RE_MARK, "RE_MARK"},
        {RE_COL, "RE_COL"},             {RE_VCOL, "RE_VCOL"},
        {BOW, "BOW"},                   {EOW, "EOW"},
        {ANY, "ANY"},                   {ANY + ADD_NL, "ANY+NL"},
        {ANYOF, "ANYOF"},               {ANYOF + ADD_NL, "ANYOF+NL"},
        {ANYBUT, "ANYBUT"},             {ANYBUT + ADD_NL, "ANYBUT+NL"},
        {IDENT, "IDENT"},               {IDENT + ADD_NL, "IDENT+NL"},
        {SIDENT, "SIDENT"},             {SIDENT + ADD_NL, "SIDENT+NL"},
        {KWORD, "KWORD"},               {KWORD + ADD_NL, "KWORD+NL"},
        {SKWORD, "SKWORD"},             {SKWORD + ADD_NL, "SKWORD+NL"},
        {FNAME, "FNAME"},               {FNAME + ADD_NL, "FNAME+NL"},
        {SFNAME, "SFNAME"},             {SFNAME + ADD_NL, "SFNAME+NL"},
        {PRINT, "PRINT"},               {PRINT + ADD_NL, "PRINT+NL"},
        {SPRINT, "SPRINT"},             {SPRINT + ADD_NL, "SPRINT+NL"},
        {WHITE, "WHITE"},               {WHITE + ADD_NL, "WHITE+NL"},
        {NWHITE, "NWHITE"},             {NWHITE + ADD_NL, "NWHITE+NL"},
        {DIGIT, "DIGIT"},               {DIGIT + ADD_NL, "DIGIT+NL"},
        {NDIGIT, "NDIGIT"},             {NDIGIT + ADD_NL, "NDIGIT+NL"},
        {HEX, "HEX"},                   {HEX + ADD_NL, "HEX+NL"},
        {NHEX, "NHEX"},                 {NHEX + ADD_NL, "NHEX+NL"},
        {OCTAL, "OCTAL"},               {OCTAL + ADD_NL, "OCTAL+NL"},
        {NOCTAL, "NOCTAL"},             {NOCTAL + ADD_NL, "NOCTAL+NL"},
        {WORD, "WORD"},                 {WORD + ADD_NL, "WORD+NL"},
        {NWORD, "NWORD"},               {NWORD + ADD_NL, "NWORD+NL"},
        {HEAD, "HEAD"},                 {HEAD + ADD_NL, "HEAD+NL"},
        {NHEAD, "NHEAD"},               {NHEAD + ADD_NL, "NHEAD+NL"},
        {ALPHA, "ALPHA"},               {ALPHA + ADD_NL, "ALPHA+NL"},
        {NALPHA, "NALPHA"},             {NALPHA + ADD_NL, "NALPHA+NL"},
        {LOWER, "LOWER"},               {LOWER + ADD_NL, "LOWER+NL"},
        {NLOWER, "NLOWER"},             {NLOWER + ADD_NL, "NLOWER+NL"},
        {UPPER, "UPPER"},               {UPPER + ADD_NL, "UPPER+NL"},
        {NUPPER, "NUPPER"},             {NUPPER + ADD_NL, "NUPPER+NL"},
        {BRANCH, "BRANCH"},             {EXACTLY, "EXACTLY"},
        {NOTHING, "NOTHING"},           {BACK, "BACK"},
        {END, "END"},
        {MOPEN + 0, "MATCH START"},     {MCLOSE + 0, "MATCH END"},
        {NOPEN, "NOPEN"},               {NCLOSE, "NCLOSE"},
        {STAR, "STAR"},                 {PLUS, "PLUS"},
        {NOMATCH, "NOMATCH"},           {MATCH, "MATCH"},
        {BEHIND, "BEHIND"},             {NOBEHIND, "NOBEHIND"},
        {SUBPAT, "SUBPAT"},
        {BRACE_LIMITS, "BRACE_LIMITS"}, {BRACE_SIMPLE, "BRACE_SIMPLE"},
        {MULTIBYTECODE, "MULTIBYTECODE"},
        {NEWL, "NEWL"},
    };
    static char buf[50];
    int         count = (int)(sizeof(fixed_names) / sizeof(fixed_names[0]));
    int         code = (int)OP(op);
    int         idx;

    STRCPY(buf, ":");

    // First try the opcodes with a fixed name.
    for (idx = 0; idx < count; ++idx)
        if (fixed_names[idx].code == code)
        {
            STRCAT(buf, fixed_names[idx].name);
            return (char_u *)buf;
        }

    // Then the numbered opcodes: the name is followed by the offset from
    // the base opcode.  Anything else is reported as corrupt.
    if (code >= MOPEN + 1 && code <= MOPEN + 9)
        sprintf(buf + STRLEN(buf), "MOPEN%d", code - MOPEN);
    else if (code >= MCLOSE + 1 && code <= MCLOSE + 9)
        sprintf(buf + STRLEN(buf), "MCLOSE%d", code - MCLOSE);
    else if (code >= BACKREF + 1 && code <= BACKREF + 9)
        sprintf(buf + STRLEN(buf), "BACKREF%d", code - BACKREF);
#ifdef FEAT_SYN_HL
    else if (code >= ZOPEN + 1 && code <= ZOPEN + 9)
        sprintf(buf + STRLEN(buf), "ZOPEN%d", code - ZOPEN);
    else if (code >= ZCLOSE + 1 && code <= ZCLOSE + 9)
        sprintf(buf + STRLEN(buf), "ZCLOSE%d", code - ZCLOSE);
    else if (code >= ZREF + 1 && code <= ZREF + 9)
        sprintf(buf + STRLEN(buf), "ZREF%d", code - ZREF);
#endif
    else if (code >= BRACE_COMPLEX + 0 && code <= BRACE_COMPLEX + 9)
        sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", code - BRACE_COMPLEX);
    else
        sprintf(buf + STRLEN(buf), "corrupt %d", code);

    return (char_u *)buf;
}
#endif // DEBUG