blob: 793faaf6bc0fd9fcdb7e6cc57555e481bb610570 [file] [log] [blame]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * Backtracking regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 *
7 * NOTICE:
8 *
9 * This is NOT the original regular expression code as written by Henry
10 * Spencer. This code has been modified specifically for use with the VIM
11 * editor, and should not be used separately from Vim. If you want a good
12 * regular expression library, get the original code. The copyright notice
13 * that follows is from the original.
14 *
15 * END NOTICE
16 *
17 * Copyright (c) 1986 by University of Toronto.
18 * Written by Henry Spencer. Not derived from licensed software.
19 *
20 * Permission is granted to anyone to use this software for any
21 * purpose on any computer system, and to redistribute it freely,
22 * subject to the following restrictions:
23 *
24 * 1. The author is not responsible for the consequences of use of
25 * this software, no matter how awful, even if they arise
26 * from defects in it.
27 *
28 * 2. The origin of this software must not be misrepresented, either
29 * by explicit claim or by omission.
30 *
31 * 3. Altered versions must be plainly marked as such, and must not
32 * be misrepresented as being the original software.
33 *
34 * Beware that some of this code is subtly aware of the way operator
35 * precedence is structured in regular expressions. Serious changes in
36 * regular-expression syntax might require a total rethink.
37 *
38 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
39 * Webb, Ciaran McCreesh and Bram Moolenaar.
40 * Named character class support added by Walter Briscoe (1998 Jul 01)
41 */
42
43/*
44 * The "internal use only" fields in regexp.h are present to pass info from
45 * compile to execute that permits the execute phase to run lots faster on
46 * simple cases. They are:
47 *
48 * regstart char that must begin a match; NUL if none obvious; Can be a
49 * multi-byte character.
50 * reganch is the match anchored (at beginning-of-line only)?
51 * regmust string (pointer into program) that match must include, or NULL
52 * regmlen length of regmust string
53 * regflags RF_ values or'ed together
54 *
55 * Regstart and reganch permit very fast decisions on suitable starting points
56 * for a match, cutting down the work a lot. Regmust permits fast rejection
57 * of lines that cannot possibly match. The regmust tests are costly enough
58 * that vim_regcomp() supplies a regmust only if the r.e. contains something
59 * potentially expensive (at present, the only such thing detected is * or +
60 * at the start of the r.e., which can involve a lot of backup). Regmlen is
61 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
62 * computing it anyway.
63 */
64
65/*
66 * Structure for regexp "program". This is essentially a linear encoding
67 * of a nondeterministic finite-state machine (aka syntax charts or
68 * "railroad normal form" in parsing technology). Each node is an opcode
69 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
 * pointer with a BRANCH on both ends of it is connecting two alternatives.
 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
 * node points to the node after the stuff to be repeated.
76 * The operand of some types of node is a literal string; for others, it is a
77 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
78 * is the first node of the branch.
79 * (NB this is *not* a tree structure: the tail of the branch connects to the
80 * thing following the set of BRANCHes.)
81 *
82 * pattern is coded like:
83 *
84 * +-----------------+
85 * | V
86 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
87 * | ^ | ^
88 * +------+ +----------+
89 *
90 *
91 * +------------------+
92 * V |
93 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
94 * | | ^ ^
95 * | +---------------+ |
96 * +---------------------------------------------+
97 *
98 *
99 * +----------------------+
100 * V |
101 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
102 * | | ^ ^
103 * | +-----------+ |
104 * +--------------------------------------------------+
105 *
106 *
107 * +-------------------------+
108 * V |
109 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
110 * | | ^
111 * | +----------------+
112 * +-----------------------------------------------+
113 *
114 *
115 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
116 * | | ^ ^
117 * | +----------------+ |
118 * +--------------------------------+
119 *
120 * +---------+
121 * | V
122 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
123 * | | | | ^ ^
124 * | | | +-----+ |
125 * | | +----------------+ |
126 * | +---------------------------+ |
127 * +------------------------------------------------------+
128 *
129 * They all start with a BRANCH for "\|" alternatives, even when there is only
130 * one alternative.
131 */
132
133/*
134 * The opcodes are:
135 */
136
// Columns: name, opcode value, operand kind (if any), meaning.
#define END             0   //       End of program, or end of a NOMATCH operand.
#define BOL             1   //       Matches "" at the start of a line.
#define EOL             2   //       Matches "" at the end of a line.
#define BRANCH          3   // node  Try this alternative, else the next one.
#define BACK            4   //       Matches ""; its "next" pointer points backward.
#define EXACTLY         5   // str   Matches this literal string.
#define NOTHING         6   //       Matches the empty string.
#define STAR            7   // node  Simple operand repeated 0 or more times.
#define PLUS            8   // node  Simple operand repeated 1 or more times.
#define MATCH           9   // node  Zero-width match of the operand.
#define NOMATCH         10  // node  Succeeds when the operand does not match.
#define BEHIND          11  // node  Look-behind: operand must match.
#define NOBEHIND        12  // node  Look-behind: operand must not match.
#define SUBPAT          13  // node  Match the operand here.
#define BRACE_SIMPLE    14  // node  Simple operand repeated between m and n
                            //       times (\{m,n\}).
#define BOW             15  //       Matches "" after [^a-zA-Z0-9_].
#define EOW             16  //       Matches "" at [^a-zA-Z0-9_].
#define BRACE_LIMITS    17  // nr nr Min and max for the BRACE_SIMPLE or
                            //       BRACE_COMPLEX that follows.
#define NEWL            18  //       Matches a line break.
#define BHPOS           19  //       End position for BEHIND or NOBEHIND.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200163
164
// Character classes.  Opcodes 20-48 are the plain classes; adding ADD_NL
// gives 50-78, the same classes that additionally accept a line break.
#define ADD_NL          30
// FIRST_NL and LAST_NL are parenthesized so that they remain a single value
// when used inside a larger expression (e.g. "op - FIRST_NL").
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20  //     Match any one character.
#define ANYOF           21  // str Match any character in this string.
#define ANYBUT          22  // str Match any character not in this
                            //     string.
#define IDENT           23  //     Match identifier char
#define SIDENT          24  //     Match identifier char but no digit
#define KWORD           25  //     Match keyword char
#define SKWORD          26  //     Match word char but no digit
#define FNAME           27  //     Match file name char
#define SFNAME          28  //     Match file name char but no digit
#define PRINT           29  //     Match printable char
#define SPRINT          30  //     Match printable char but no digit
#define WHITE           31  //     Match whitespace char
#define NWHITE          32  //     Match non-whitespace char
#define DIGIT           33  //     Match digit char
#define NDIGIT          34  //     Match non-digit char
#define HEX             35  //     Match hex char
#define NHEX            36  //     Match non-hex char
#define OCTAL           37  //     Match octal char
#define NOCTAL          38  //     Match non-octal char
#define WORD            39  //     Match word char
#define NWORD           40  //     Match non-word char
#define HEAD            41  //     Match head char
#define NHEAD           42  //     Match non-head char
#define ALPHA           43  //     Match alpha char
#define NALPHA          44  //     Match non-alpha char
#define LOWER           45  //     Match lowercase char
#define NLOWER          46  //     Match non-lowercase char
#define UPPER           47  //     Match uppercase char
#define NUPPER          48  //     Match non-uppercase char
#define LAST_NL         (NUPPER + ADD_NL)
// TRUE when "op" is one of the classes that include a line break.
// Note: "op" is evaluated twice, do not pass an expression with side effects.
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
200
// Opcodes that come in groups of ten cover "value + 0" .. "value + 9".
#define MOPEN           80  // -89       Start of \( subexpr in the input;
                            //           MOPEN + 0 marks the start of the match.
#define MCLOSE          90  // -99       Counterpart of MOPEN; MCLOSE + 0 marks
                            //           the end of the match.
#define BACKREF         100 // -109 node Match the same string again, \1-\9.

#ifdef FEAT_SYN_HL
# define ZOPEN          110 // -119      Start of \z( subexpr in the input.
# define ZCLOSE         120 // -129      Counterpart of ZOPEN.
# define ZREF           130 // -139 node Match external submatch \z1-\z9.
#endif

#define BRACE_COMPLEX   140 // -149 node Match nodes between m & n times.

#define NOPEN           150 //           Start of \%( subexpr in the input.
#define NCLOSE          151 //           Counterpart of NOPEN.

#define MULTIBYTECODE   200 // mbc       Match one multi-byte character.
#define RE_BOF          201 //           Match "" at beginning of file.
#define RE_EOF          202 //           Match "" at end of file.
#define CURSOR          203 //           Match location of cursor.

#define RE_LNUM         204 // nr cmp    Match line number.
#define RE_COL          205 // nr cmp    Match column number.
#define RE_VCOL         206 // nr cmp    Match virtual column number.

#define RE_MARK         207 // mark cmp  Match mark position.
#define RE_VISUAL       208 //           Match Visual area.
#define RE_COMPOSING    209 //           Any composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200233
/*
 * Bit flags handed up and down between the parse functions.
 * WORST is the value with no flag set.
 */
#define HASWIDTH        0x1     // Known never to match the null string.
#define SIMPLE          0x2     // Simple enough to be a STAR/PLUS operand.
#define SPSTART         0x4     // Starts with * or +.
#define HASNL           0x8     // Contains some \n.
#define HASLOOKBH       0x10    // Contains "\@<=" or "\@<!".
#define WORST           0       // Worst case.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200243
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200244static int num_complex_braces; // Complex \{...} count
245static char_u *regcode; // Code-emit pointer, or JUST_CALC_SIZE
246static long regsize; // Code size.
247static int reg_toolong; // TRUE when offset out of range
248static char_u had_endbrace[NSUBEXP]; // flags, TRUE if end of () found
249static long brace_min[10]; // Minimums for complex brace repeats
250static long brace_max[10]; // Maximums for complex brace repeats
251static int brace_count[10]; // Current counts for complex brace repeats
252static int one_exactly = FALSE; // only do one char for EXACTLY
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200253
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200254// When making changes to classchars also change nfa_classcodes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200255static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
256static int classcodes[] = {
257 ANY, IDENT, SIDENT, KWORD, SKWORD,
258 FNAME, SFNAME, PRINT, SPRINT,
259 WHITE, NWHITE, DIGIT, NDIGIT,
260 HEX, NHEX, OCTAL, NOCTAL,
261 WORD, NWORD, HEAD, NHEAD,
262 ALPHA, NALPHA, LOWER, NLOWER,
263 UPPER, NUPPER
264};
265
266/*
267 * When regcode is set to this value, code is not emitted and size is computed
268 * instead.
269 */
270#define JUST_CALC_SIZE ((char_u *) -1)
271
// Possible values of rs_state in regitem_T: which kind of node the saved
// item belongs to.
typedef enum regstate_E
{
    RS_NOPEN = 0,       // NOPEN and NCLOSE
    RS_MOPEN,           // MOPEN + [0-9]
    RS_MCLOSE,          // MCLOSE + [0-9]
#ifdef FEAT_SYN_HL
    RS_ZOPEN,           // ZOPEN + [0-9]
    RS_ZCLOSE,          // ZCLOSE + [0-9]
#endif
    RS_BRANCH,          // BRANCH
    RS_BRCPLX_MORE,     // BRACE_COMPLEX and trying one more match
    RS_BRCPLX_LONG,     // BRACE_COMPLEX and trying longest match
    RS_BRCPLX_SHORT,    // BRACE_COMPLEX and trying shortest match
    RS_NOMATCH,         // NOMATCH
    RS_BEHIND1,         // BEHIND / NOBEHIND matching rest
    RS_BEHIND2,         // BEHIND / NOBEHIND matching behind part
    RS_STAR_LONG,       // STAR/PLUS/BRACE_SIMPLE longest match
    RS_STAR_SHORT       // STAR/PLUS/BRACE_SIMPLE shortest match
} regstate_T;
292
293/*
294 * Structure used to save the current input state, when it needs to be
295 * restored after trying a match. Used by reg_save() and reg_restore().
296 * Also stores the length of "backpos".
297 */
298typedef struct
299{
300 union
301 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200302 char_u *ptr; // rex.input pointer, for single-line regexp
303 lpos_T pos; // rex.input pos, for multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200304 } rs_u;
305 int rs_len;
306} regsave_T;
307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200308// struct to save start/end pointer/position in for \(\)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200309typedef struct
310{
311 union
312 {
313 char_u *ptr;
314 lpos_T pos;
315 } se_u;
316} save_se_T;
317
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200318// used for BEHIND and NOBEHIND matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200319typedef struct regbehind_S
320{
321 regsave_T save_after;
322 regsave_T save_behind;
323 int save_need_clear_subexpr;
324 save_se_T save_start[NSUBEXP];
325 save_se_T save_end[NSUBEXP];
326} regbehind_T;
327
328/*
329 * When there are alternatives a regstate_T is put on the regstack to remember
330 * what we are doing.
331 * Before it may be another type of item, depending on rs_state, to remember
332 * more things.
333 */
334typedef struct regitem_S
335{
336 regstate_T rs_state; // what we are doing, one of RS_ above
337 short rs_no; // submatch nr or BEHIND/NOBEHIND
338 char_u *rs_scan; // current node in program
339 union
340 {
341 save_se_T sesave;
342 regsave_T regsave;
343 } rs_un; // room for saving rex.input
344} regitem_T;
345
346
// Bookkeeping for matching STAR, PLUS and BRACE_SIMPLE.
typedef struct regstar_S
{
    int     nextb;      // next byte
    int     nextb_ic;   // next byte reverse case
    long    count;      // presumably the current repeat count
    long    minval;     // presumably the minimum number of repeats
    long    maxval;     // presumably the maximum number of repeats
} regstar_T;
356
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200357// used to store input position when a BACK was encountered, so that we now if
358// we made any progress since the last time.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200359typedef struct backpos_S
360{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200361 char_u *bp_scan; // "scan" where BACK was encountered
362 regsave_T bp_pos; // last input position
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363} backpos_T;
364
365/*
366 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
367 * to avoid invoking malloc() and free() often.
368 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
369 * or regbehind_T.
370 * "backpos_T" is a table with backpos_T for BACK
371 */
372static garray_T regstack = {0, 0, 0, 0, NULL};
373static garray_T backpos = {0, 0, 0, 0, NULL};
374
375static regsave_T behind_pos;
376
/*
 * Allocation strategy shared by the regstack and backpos tables, meant to
 * reduce the number of malloc/free calls:
 * - The initial size is fairly small.
 * - When needed a table is grown (8 times at first, doubled after that).
 * - After executing the match the memory is freed only if the array has
 *   grown, so a table at its initial size stays allocated.
 * This is fast without keeping a lot of memory allocated.  A three times
 * speed increase was observed when using many simple patterns.
 */
#define REGSTACK_INITIAL    2048
#define BACKPOS_INITIAL     64
390
391/*
392 * Opcode notes:
393 *
394 * BRANCH The set of branches constituting a single choice are hooked
395 * together with their "next" pointers, since precedence prevents
396 * anything being concatenated to any individual branch. The
397 * "next" pointer of the last BRANCH in a choice points to the
398 * thing following the whole choice. This is also where the
399 * final "next" pointer of each individual branch points; each
400 * branch starts with the operand node of a BRANCH node.
401 *
402 * BACK Normal "next" pointers all implicitly point forward; BACK
403 * exists to make loop structures possible.
404 *
405 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
406 * BRANCH structures using BACK. Simple cases (one character
407 * per match) are implemented with STAR and PLUS for speed
408 * and to minimize recursive plunges.
409 *
410 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
411 * node, and defines the min and max limits to be used for that
412 * node.
413 *
414 * MOPEN,MCLOSE ...are numbered at compile time.
415 * ZOPEN,ZCLOSE ...ditto
416 */
417
418/*
419 * A node is one char of opcode followed by two chars of "next" pointer.
420 * "Next" pointers are stored as two 8-bit bytes, high order first. The
421 * value is a positive offset from the opcode of the node containing it.
422 * An operand, if any, simply follows the node. (Note that much of the
423 * code generation knows about this implicit relationship.)
424 *
425 * Using two bytes for the "next" pointer is vast overkill for most things,
426 * but allows patterns to get big without disasters.
427 */
// Opcode of node "p".
#define OP(p)           ((int)(p)[0])
// "Next" offset of node "p": two bytes, high order byte first.
#define NEXT(p)         ((((p)[1] & 0xff) << 8) | ((p)[2] & 0xff))
// Start of the operand, right after the three-byte node header.
#define OPERAND(p)      ((p) + 3)
// First operand stored as four bytes, most significant byte first.
#define OPERAND_MIN(p)  (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
                        + ((long)(p)[5] << 8) + (long)(p)[6])
// Second four-byte operand, stored directly after the first one.
#define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
// Single-byte operand stored after one four-byte operand.
#define OPERAND_CMP(p)  (p)[7]
438
439static char_u *reg(int paren, int *flagp);
440
441#ifdef BT_REGEXP_DUMP
442static void regdump(char_u *, bt_regprog_T *);
443#endif
444
445static int re_num_cmp(long_u val, char_u *scan);
446
447#ifdef DEBUG
448static char_u *regprop(char_u *);
449
450static int regnarrate = 0;
451#endif
452
453
454/*
455 * Setup to parse the regexp. Used once to get the length and once to do it.
456 */
457 static void
458regcomp_start(
459 char_u *expr,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200460 int re_flags) // see vim_regcomp()
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200461{
462 initchr(expr);
463 if (re_flags & RE_MAGIC)
464 reg_magic = MAGIC_ON;
465 else
466 reg_magic = MAGIC_OFF;
467 reg_string = (re_flags & RE_STRING);
468 reg_strict = (re_flags & RE_STRICT);
469 get_cpo_flags();
470
471 num_complex_braces = 0;
472 regnpar = 1;
Bram Moolenaara80faa82020-04-12 19:37:17 +0200473 CLEAR_FIELD(had_endbrace);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200474#ifdef FEAT_SYN_HL
475 regnzpar = 1;
476 re_has_z = 0;
477#endif
478 regsize = 0L;
479 reg_toolong = FALSE;
480 regflags = 0;
481#if defined(FEAT_SYN_HL) || defined(PROTO)
482 had_eol = FALSE;
483#endif
484}
485
486/*
487 * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for
488 * character "c".
489 */
490 static int
491use_multibytecode(int c)
492{
493 return has_mbyte && (*mb_char2len)(c) > 1
494 && (re_multi_type(peekchr()) != NOT_MULTI
495 || (enc_utf8 && utf_iscomposing(c)));
496}
497
498/*
499 * Emit (if appropriate) a byte of code
500 */
501 static void
502regc(int b)
503{
504 if (regcode == JUST_CALC_SIZE)
505 regsize++;
506 else
507 *regcode++ = b;
508}
509
510/*
511 * Emit (if appropriate) a multi-byte character of code
512 */
513 static void
514regmbc(int c)
515{
516 if (!has_mbyte && c > 0xff)
517 return;
518 if (regcode == JUST_CALC_SIZE)
519 regsize += (*mb_char2len)(c);
520 else
521 regcode += (*mb_char2bytes)(c, regcode);
522}
523
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200524
525/*
526 * Produce the bytes for equivalence class "c".
527 * Currently only handles latin1, latin9 and utf-8.
528 * NOTE: When changing this function, also change nfa_emit_equi_class()
529 */
530 static void
531reg_equi_class(int c)
532{
533 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
534 || STRCMP(p_enc, "iso-8859-15") == 0)
535 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200536 switch (c)
537 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200538 // Do not use '\300' style, it results in a negative number.
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200539 case 'A': case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc4:
540 case 0xc5: case 0x100: case 0x102: case 0x104: case 0x1cd:
541 case 0x1de: case 0x1e0: case 0x1fa: case 0x202: case 0x226:
542 case 0x23a: case 0x1e00: case 0x1ea0: case 0x1ea2: case 0x1ea4:
543 case 0x1ea6: case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
544 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
545 regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2);
546 regmbc(0xc3); regmbc(0xc4); regmbc(0xc5);
547 regmbc(0x100); regmbc(0x102); regmbc(0x104);
548 regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0);
549 regmbc(0x1fa); regmbc(0x202); regmbc(0x226);
550 regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0);
551 regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6);
552 regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac);
553 regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2);
554 regmbc(0x1eb4); regmbc(0x1eb6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200555 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200556 case 'B': case 0x181: case 0x243: case 0x1e02:
557 case 0x1e04: case 0x1e06:
558 regmbc('B');
559 regmbc(0x181); regmbc(0x243); regmbc(0x1e02);
560 regmbc(0x1e04); regmbc(0x1e06);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200561 return;
562 case 'C': case 0xc7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200563 case 0x106: case 0x108: case 0x10a: case 0x10c: case 0x187:
564 case 0x23b: case 0x1e08: case 0xa792:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200565 regmbc('C'); regmbc(0xc7);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200566 regmbc(0x106); regmbc(0x108); regmbc(0x10a);
567 regmbc(0x10c); regmbc(0x187); regmbc(0x23b);
568 regmbc(0x1e08); regmbc(0xa792);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200569 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200570 case 'D': case 0x10e: case 0x110: case 0x18a:
571 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
572 case 0x1e12:
573 regmbc('D'); regmbc(0x10e); regmbc(0x110);
574 regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c);
575 regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200576 return;
577 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200578 case 0x112: case 0x114: case 0x116: case 0x118: case 0x11a:
579 case 0x204: case 0x206: case 0x228: case 0x246: case 0x1e14:
580 case 0x1e16: case 0x1e18: case 0x1e1a: case 0x1e1c:
581 case 0x1eb8: case 0x1eba: case 0x1ebc: case 0x1ebe:
582 case 0x1ec0: case 0x1ec2: case 0x1ec4: case 0x1ec6:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200583 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200584 regmbc(0xca); regmbc(0xcb); regmbc(0x112);
585 regmbc(0x114); regmbc(0x116); regmbc(0x118);
586 regmbc(0x11a); regmbc(0x204); regmbc(0x206);
587 regmbc(0x228); regmbc(0x246); regmbc(0x1e14);
588 regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a);
589 regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba);
590 regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0);
591 regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200592 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200593 case 'F': case 0x191: case 0x1e1e: case 0xa798:
594 regmbc('F'); regmbc(0x191); regmbc(0x1e1e);
595 regmbc(0xa798);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200596 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200597 case 'G': case 0x11c: case 0x11e: case 0x120:
598 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
599 case 0x1f4: case 0x1e20: case 0xa7a0:
600 regmbc('G'); regmbc(0x11c); regmbc(0x11e);
601 regmbc(0x120); regmbc(0x122); regmbc(0x193);
602 regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4);
603 regmbc(0x1e20); regmbc(0xa7a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200604 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200605 case 'H': case 0x124: case 0x126: case 0x21e:
606 case 0x1e22: case 0x1e24: case 0x1e26:
607 case 0x1e28: case 0x1e2a: case 0x2c67:
608 regmbc('H'); regmbc(0x124); regmbc(0x126);
609 regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24);
610 regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a);
611 regmbc(0x2c67);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200612 return;
613 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200614 case 0x128: case 0x12a: case 0x12c: case 0x12e:
615 case 0x130: case 0x197: case 0x1cf: case 0x208:
616 case 0x20a: case 0x1e2c: case 0x1e2e: case 0x1ec8:
617 case 0x1eca:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200618 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200619 regmbc(0xce); regmbc(0xcf); regmbc(0x128);
620 regmbc(0x12a); regmbc(0x12c); regmbc(0x12e);
621 regmbc(0x130); regmbc(0x197); regmbc(0x1cf);
622 regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c);
623 regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200624 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200625 case 'J': case 0x134: case 0x248:
626 regmbc('J'); regmbc(0x134); regmbc(0x248);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200627 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200628 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
629 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
630 regmbc('K'); regmbc(0x136); regmbc(0x198);
631 regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32);
632 regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200633 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200634 case 'L': case 0x139: case 0x13b: case 0x13d: case 0x13f:
635 case 0x141: case 0x23d: case 0x1e36: case 0x1e38:
636 case 0x1e3a: case 0x1e3c: case 0x2c60:
637 regmbc('L'); regmbc(0x139); regmbc(0x13b);
638 regmbc(0x13d); regmbc(0x13f); regmbc(0x141);
639 regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38);
640 regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200641 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200642 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
643 regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40);
644 regmbc(0x1e42);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200645 return;
646 case 'N': case 0xd1:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200647 case 0x143: case 0x145: case 0x147: case 0x1f8:
648 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
649 case 0xa7a4:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200650 regmbc('N'); regmbc(0xd1);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200651 regmbc(0x143); regmbc(0x145); regmbc(0x147);
652 regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46);
653 regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200654 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200655 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd6:
656 case 0xd8: case 0x14c: case 0x14e: case 0x150: case 0x19f:
657 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec: case 0x1fe:
658 case 0x20c: case 0x20e: case 0x22a: case 0x22c: case 0x22e:
659 case 0x230: case 0x1e4c: case 0x1e4e: case 0x1e50: case 0x1e52:
660 case 0x1ecc: case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
661 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc: case 0x1ede:
662 case 0x1ee0: case 0x1ee2:
663 regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4);
664 regmbc(0xd5); regmbc(0xd6); regmbc(0xd8);
665 regmbc(0x14c); regmbc(0x14e); regmbc(0x150);
666 regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1);
667 regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe);
668 regmbc(0x20c); regmbc(0x20e); regmbc(0x22a);
669 regmbc(0x22c); regmbc(0x22e); regmbc(0x230);
670 regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50);
671 regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece);
672 regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4);
673 regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda);
674 regmbc(0x1edc); regmbc(0x1ede); regmbc(0x1ee0);
675 regmbc(0x1ee2);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200676 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200677 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
678 regmbc('P'); regmbc(0x1a4); regmbc(0x1e54);
679 regmbc(0x1e56); regmbc(0x2c63);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200680 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200681 case 'Q': case 0x24a:
682 regmbc('Q'); regmbc(0x24a);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200683 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200684 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
685 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
686 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
687 regmbc('R'); regmbc(0x154); regmbc(0x156);
688 regmbc(0x210); regmbc(0x212); regmbc(0x158);
689 regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a);
690 regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64);
691 regmbc(0xa7a6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200692 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200693 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
694 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
695 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
696 regmbc('S'); regmbc(0x15a); regmbc(0x15c);
697 regmbc(0x15e); regmbc(0x160); regmbc(0x218);
698 regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64);
699 regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e);
700 regmbc(0xa7a8);
701 return;
702 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
703 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
704 case 0x1e6e: case 0x1e70:
705 regmbc('T'); regmbc(0x162); regmbc(0x164);
706 regmbc(0x166); regmbc(0x1ac); regmbc(0x23e);
707 regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a);
708 regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200709 return;
710 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200711 case 0x168: case 0x16a: case 0x16c: case 0x16e:
712 case 0x170: case 0x172: case 0x1af: case 0x1d3:
713 case 0x1d5: case 0x1d7: case 0x1d9: case 0x1db:
714 case 0x214: case 0x216: case 0x244: case 0x1e72:
715 case 0x1e74: case 0x1e76: case 0x1e78: case 0x1e7a:
716 case 0x1ee4: case 0x1ee6: case 0x1ee8: case 0x1eea:
717 case 0x1eec: case 0x1eee: case 0x1ef0:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200718 regmbc('U'); regmbc(0xd9); regmbc(0xda);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200719 regmbc(0xdb); regmbc(0xdc); regmbc(0x168);
720 regmbc(0x16a); regmbc(0x16c); regmbc(0x16e);
721 regmbc(0x170); regmbc(0x172); regmbc(0x1af);
722 regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7);
723 regmbc(0x1d9); regmbc(0x1db); regmbc(0x214);
724 regmbc(0x216); regmbc(0x244); regmbc(0x1e72);
725 regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78);
726 regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6);
727 regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec);
728 regmbc(0x1eee); regmbc(0x1ef0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200729 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200730 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
731 regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c);
732 regmbc(0x1e7e);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200733 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200734 case 'W': case 0x174: case 0x1e80: case 0x1e82:
735 case 0x1e84: case 0x1e86: case 0x1e88:
736 regmbc('W'); regmbc(0x174); regmbc(0x1e80);
737 regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86);
738 regmbc(0x1e88);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200739 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200740 case 'X': case 0x1e8a: case 0x1e8c:
741 regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200742 return;
743 case 'Y': case 0xdd:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200744 case 0x176: case 0x178: case 0x1b3: case 0x232: case 0x24e:
745 case 0x1e8e: case 0x1ef2: case 0x1ef6: case 0x1ef4: case 0x1ef8:
746 regmbc('Y'); regmbc(0xdd); regmbc(0x176);
747 regmbc(0x178); regmbc(0x1b3); regmbc(0x232);
748 regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2);
749 regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200750 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200751 case 'Z': case 0x179: case 0x17b: case 0x17d: case 0x1b5:
752 case 0x1e90: case 0x1e92: case 0x1e94: case 0x2c6b:
753 regmbc('Z'); regmbc(0x179); regmbc(0x17b);
754 regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90);
755 regmbc(0x1e92); regmbc(0x1e94); regmbc(0x2c6b);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200756 return;
757 case 'a': case 0xe0: case 0xe1: case 0xe2:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200758 case 0xe3: case 0xe4: case 0xe5: case 0x101: case 0x103:
759 case 0x105: case 0x1ce: case 0x1df: case 0x1e1: case 0x1fb:
760 case 0x201: case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
761 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
762 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
763 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
764 case 0x1eb7: case 0x2c65:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200765 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
766 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200767 regmbc(0xe5); regmbc(0x101); regmbc(0x103);
768 regmbc(0x105); regmbc(0x1ce); regmbc(0x1df);
769 regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201);
770 regmbc(0x203); regmbc(0x227); regmbc(0x1d8f);
771 regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1);
772 regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7);
773 regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead);
774 regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3);
775 regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200776 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200777 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
778 case 0x1e03: case 0x1e05: case 0x1e07:
779 regmbc('b');
780 regmbc(0x180); regmbc(0x253); regmbc(0x1d6c);
781 regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05);
782 regmbc(0x1e07);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200783 return;
784 case 'c': case 0xe7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200785 case 0x107: case 0x109: case 0x10b: case 0x10d: case 0x188:
786 case 0x23c: case 0x1e09: case 0xa793: case 0xa794:
787 regmbc('c'); regmbc(0xe7); regmbc(0x107);
788 regmbc(0x109); regmbc(0x10b); regmbc(0x10d);
789 regmbc(0x188); regmbc(0x23c); regmbc(0x1e09);
790 regmbc(0xa793); regmbc(0xa794);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200791 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200792 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
793 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d:
794 case 0x1e0f: case 0x1e11: case 0x1e13:
795 regmbc('d'); regmbc(0x10f); regmbc(0x111);
796 regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81);
797 regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d);
798 regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200799 return;
800 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200801 case 0x113: case 0x115: case 0x117: case 0x119:
802 case 0x11b: case 0x205: case 0x207: case 0x229:
803 case 0x247: case 0x1d92: case 0x1e15: case 0x1e17:
804 case 0x1e19: case 0x1e1b: case 0x1eb9: case 0x1ebb:
805 case 0x1e1d: case 0x1ebd: case 0x1ebf: case 0x1ec1:
806 case 0x1ec3: case 0x1ec5: case 0x1ec7:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200807 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200808 regmbc(0xea); regmbc(0xeb); regmbc(0x113);
809 regmbc(0x115); regmbc(0x117); regmbc(0x119);
810 regmbc(0x11b); regmbc(0x205); regmbc(0x207);
811 regmbc(0x229); regmbc(0x247); regmbc(0x1d92);
812 regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19);
813 regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9);
814 regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf);
815 regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5);
816 regmbc(0x1ec7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200817 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200818 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
819 case 0x1e1f: case 0xa799:
820 regmbc('f'); regmbc(0x192); regmbc(0x1d6e);
821 regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799);
822 return;
823 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
824 case 0x1e5: case 0x1e7: case 0x260: case 0x1f5: case 0x1d83:
825 case 0x1e21: case 0xa7a1:
826 regmbc('g'); regmbc(0x11d); regmbc(0x11f);
827 regmbc(0x121); regmbc(0x123); regmbc(0x1e5);
828 regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260);
829 regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200830 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200831 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
832 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
833 case 0x1e96: case 0x2c68: case 0xa795:
834 regmbc('h'); regmbc(0x125); regmbc(0x127);
835 regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25);
836 regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b);
837 regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200838 return;
839 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200840 case 0x129: case 0x12b: case 0x12d: case 0x12f:
841 case 0x1d0: case 0x209: case 0x20b: case 0x268:
842 case 0x1d96: case 0x1e2d: case 0x1e2f: case 0x1ec9:
843 case 0x1ecb:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200844 regmbc('i'); regmbc(0xec); regmbc(0xed);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200845 regmbc(0xee); regmbc(0xef); regmbc(0x129);
846 regmbc(0x12b); regmbc(0x12d); regmbc(0x12f);
847 regmbc(0x1d0); regmbc(0x209); regmbc(0x20b);
848 regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d);
849 regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200850 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200851 case 'j': case 0x135: case 0x1f0: case 0x249:
852 regmbc('j'); regmbc(0x135); regmbc(0x1f0);
853 regmbc(0x249);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200854 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200855 case 'k': case 0x137: case 0x199: case 0x1e9:
856 case 0x1d84: case 0x1e31: case 0x1e33: case 0x1e35:
857 case 0x2c6a: case 0xa741:
858 regmbc('k'); regmbc(0x137); regmbc(0x199);
859 regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31);
860 regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a);
861 regmbc(0xa741);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200862 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200863 case 'l': case 0x13a: case 0x13c: case 0x13e:
864 case 0x140: case 0x142: case 0x19a: case 0x1e37:
865 case 0x1e39: case 0x1e3b: case 0x1e3d: case 0x2c61:
866 regmbc('l'); regmbc(0x13a); regmbc(0x13c);
867 regmbc(0x13e); regmbc(0x140); regmbc(0x142);
868 regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39);
869 regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200870 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200871 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
872 regmbc('m'); regmbc(0x1d6f); regmbc(0x1e3f);
873 regmbc(0x1e41); regmbc(0x1e43);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200874 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200875 case 'n': case 0xf1: case 0x144: case 0x146: case 0x148:
876 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87:
877 case 0x1e45: case 0x1e47: case 0x1e49: case 0x1e4b:
878 case 0xa7a5:
879 regmbc('n'); regmbc(0xf1); regmbc(0x144);
880 regmbc(0x146); regmbc(0x148); regmbc(0x149);
881 regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87);
882 regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49);
883 regmbc(0x1e4b); regmbc(0xa7a5);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200884 return;
885 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200886 case 0xf6: case 0xf8: case 0x14d: case 0x14f: case 0x151:
887 case 0x1a1: case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
888 case 0x20d: case 0x20f: case 0x22b: case 0x22d: case 0x22f:
889 case 0x231: case 0x275: case 0x1e4d: case 0x1e4f:
890 case 0x1e51: case 0x1e53: case 0x1ecd: case 0x1ecf:
891 case 0x1ed1: case 0x1ed3: case 0x1ed5: case 0x1ed7:
892 case 0x1ed9: case 0x1edb: case 0x1edd: case 0x1edf:
893 case 0x1ee1: case 0x1ee3:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200894 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
895 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200896 regmbc(0xf8); regmbc(0x14d); regmbc(0x14f);
897 regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2);
898 regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff);
899 regmbc(0x20d); regmbc(0x20f); regmbc(0x22b);
900 regmbc(0x22d); regmbc(0x22f); regmbc(0x231);
901 regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f);
902 regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd);
903 regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3);
904 regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9);
905 regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf);
906 regmbc(0x1ee1); regmbc(0x1ee3);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200907 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200908 case 'p': case 0x1a5: case 0x1d71: case 0x1d88: case 0x1d7d:
909 case 0x1e55: case 0x1e57:
910 regmbc('p'); regmbc(0x1a5); regmbc(0x1d71);
911 regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55);
912 regmbc(0x1e57);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200913 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200914 case 'q': case 0x24b: case 0x2a0:
915 regmbc('q'); regmbc(0x24b); regmbc(0x2a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200916 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200917 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
918 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
919 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
920 case 0xa7a7:
921 regmbc('r'); regmbc(0x155); regmbc(0x157);
922 regmbc(0x159); regmbc(0x211); regmbc(0x213);
923 regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73);
924 regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d);
925 regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f);
926 regmbc(0xa7a7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200927 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200928 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
929 case 0x1e61: case 0x219: case 0x23f: case 0x1d74: case 0x1d8a:
930 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
931 regmbc('s'); regmbc(0x15b); regmbc(0x15d);
932 regmbc(0x15f); regmbc(0x161); regmbc(0x23f);
933 regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a);
934 regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65);
935 regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9);
936 return;
937 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
938 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
939 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
940 regmbc('t'); regmbc(0x163); regmbc(0x165);
941 regmbc(0x167); regmbc(0x1ab); regmbc(0x21b);
942 regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75);
943 regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f);
944 regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200945 return;
946 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200947 case 0x169: case 0x16b: case 0x16d: case 0x16f:
948 case 0x171: case 0x173: case 0x1b0: case 0x1d4:
949 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc:
950 case 0x215: case 0x217: case 0x289: case 0x1e73:
951 case 0x1d7e: case 0x1d99: case 0x1e75: case 0x1e77:
952 case 0x1e79: case 0x1e7b: case 0x1ee5: case 0x1ee7:
953 case 0x1ee9: case 0x1eeb: case 0x1eed: case 0x1eef:
954 case 0x1ef1:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200955 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200956 regmbc(0xfb); regmbc(0xfc); regmbc(0x169);
957 regmbc(0x16b); regmbc(0x16d); regmbc(0x16f);
958 regmbc(0x171); regmbc(0x173); regmbc(0x1d6);
959 regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc);
960 regmbc(0x215); regmbc(0x217); regmbc(0x1b0);
961 regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e);
962 regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75);
963 regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b);
964 regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9);
965 regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef);
966 regmbc(0x1ef1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200967 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200968 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
969 regmbc('v'); regmbc(0x28b); regmbc(0x1d8c);
970 regmbc(0x1e7d); regmbc(0x1e7f);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200971 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200972 case 'w': case 0x175: case 0x1e81: case 0x1e83:
973 case 0x1e85: case 0x1e87: case 0x1e89: case 0x1e98:
974 regmbc('w'); regmbc(0x175); regmbc(0x1e81);
975 regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87);
976 regmbc(0x1e89); regmbc(0x1e98);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200977 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200978 case 'x': case 0x1e8b: case 0x1e8d:
979 regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200980 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200981 case 'y': case 0xfd: case 0xff: case 0x177: case 0x1b4:
982 case 0x233: case 0x24f: case 0x1e8f: case 0x1e99: case 0x1ef3:
983 case 0x1ef5: case 0x1ef7: case 0x1ef9:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200984 regmbc('y'); regmbc(0xfd); regmbc(0xff);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 regmbc(0x177); regmbc(0x1b4); regmbc(0x233);
986 regmbc(0x24f); regmbc(0x1e8f); regmbc(0x1e99);
987 regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7);
988 regmbc(0x1ef9);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200989 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200990 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
991 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
992 case 0x1e95: case 0x2c6c:
993 regmbc('z'); regmbc(0x17a); regmbc(0x17c);
994 regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76);
995 regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93);
996 regmbc(0x1e95); regmbc(0x2c6c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200997 return;
998 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200999 }
1000 regmbc(c);
1001}
1002
1003/*
1004 * Emit a node.
1005 * Return pointer to generated code.
1006 */
1007 static char_u *
1008regnode(int op)
1009{
1010 char_u *ret;
1011
1012 ret = regcode;
1013 if (ret == JUST_CALC_SIZE)
1014 regsize += 3;
1015 else
1016 {
1017 *regcode++ = op;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001018 *regcode++ = NUL; // Null "next" pointer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001019 *regcode++ = NUL;
1020 }
1021 return ret;
1022}
1023
1024/*
1025 * Write a long as four bytes at "p" and return pointer to the next char.
1026 */
1027 static char_u *
1028re_put_long(char_u *p, long_u val)
1029{
1030 *p++ = (char_u) ((val >> 24) & 0377);
1031 *p++ = (char_u) ((val >> 16) & 0377);
1032 *p++ = (char_u) ((val >> 8) & 0377);
1033 *p++ = (char_u) (val & 0377);
1034 return p;
1035}
1036
1037/*
1038 * regnext - dig the "next" pointer out of a node
1039 * Returns NULL when calculating size, when there is no next item and when
1040 * there is an error.
1041 */
1042 static char_u *
1043regnext(char_u *p)
1044{
1045 int offset;
1046
1047 if (p == JUST_CALC_SIZE || reg_toolong)
1048 return NULL;
1049
1050 offset = NEXT(p);
1051 if (offset == 0)
1052 return NULL;
1053
1054 if (OP(p) == BACK)
1055 return p - offset;
1056 else
1057 return p + offset;
1058}
1059
1060/*
1061 * Set the next-pointer at the end of a node chain.
1062 */
1063 static void
1064regtail(char_u *p, char_u *val)
1065{
1066 char_u *scan;
1067 char_u *temp;
1068 int offset;
1069
1070 if (p == JUST_CALC_SIZE)
1071 return;
1072
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001073 // Find last node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001074 scan = p;
1075 for (;;)
1076 {
1077 temp = regnext(scan);
1078 if (temp == NULL)
1079 break;
1080 scan = temp;
1081 }
1082
1083 if (OP(scan) == BACK)
1084 offset = (int)(scan - val);
1085 else
1086 offset = (int)(val - scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001087 // When the offset uses more than 16 bits it can no longer fit in the two
1088 // bytes available. Use a global flag to avoid having to check return
1089 // values in too many places.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001090 if (offset > 0xffff)
1091 reg_toolong = TRUE;
1092 else
1093 {
1094 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
1095 *(scan + 2) = (char_u) (offset & 0377);
1096 }
1097}
1098
1099/*
1100 * Like regtail, on item after a BRANCH; nop if none.
1101 */
1102 static void
1103regoptail(char_u *p, char_u *val)
1104{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001105 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001106 if (p == NULL || p == JUST_CALC_SIZE
1107 || (OP(p) != BRANCH
1108 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
1109 return;
1110 regtail(OPERAND(p), val);
1111}
1112
1113/*
1114 * Insert an operator in front of already-emitted operand
1115 *
1116 * Means relocating the operand.
1117 */
1118 static void
1119reginsert(int op, char_u *opnd)
1120{
1121 char_u *src;
1122 char_u *dst;
1123 char_u *place;
1124
1125 if (regcode == JUST_CALC_SIZE)
1126 {
1127 regsize += 3;
1128 return;
1129 }
1130 src = regcode;
1131 regcode += 3;
1132 dst = regcode;
1133 while (src > opnd)
1134 *--dst = *--src;
1135
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001136 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001137 *place++ = op;
1138 *place++ = NUL;
1139 *place = NUL;
1140}
1141
1142/*
1143 * Insert an operator in front of already-emitted operand.
1144 * Add a number to the operator.
1145 */
1146 static void
1147reginsert_nr(int op, long val, char_u *opnd)
1148{
1149 char_u *src;
1150 char_u *dst;
1151 char_u *place;
1152
1153 if (regcode == JUST_CALC_SIZE)
1154 {
1155 regsize += 7;
1156 return;
1157 }
1158 src = regcode;
1159 regcode += 7;
1160 dst = regcode;
1161 while (src > opnd)
1162 *--dst = *--src;
1163
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001164 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001165 *place++ = op;
1166 *place++ = NUL;
1167 *place++ = NUL;
1168 re_put_long(place, (long_u)val);
1169}
1170
1171/*
1172 * Insert an operator in front of already-emitted operand.
1173 * The operator has the given limit values as operands. Also set next pointer.
1174 *
1175 * Means relocating the operand.
1176 */
1177 static void
1178reginsert_limits(
1179 int op,
1180 long minval,
1181 long maxval,
1182 char_u *opnd)
1183{
1184 char_u *src;
1185 char_u *dst;
1186 char_u *place;
1187
1188 if (regcode == JUST_CALC_SIZE)
1189 {
1190 regsize += 11;
1191 return;
1192 }
1193 src = regcode;
1194 regcode += 11;
1195 dst = regcode;
1196 while (src > opnd)
1197 *--dst = *--src;
1198
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001199 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001200 *place++ = op;
1201 *place++ = NUL;
1202 *place++ = NUL;
1203 place = re_put_long(place, (long_u)minval);
1204 place = re_put_long(place, (long_u)maxval);
1205 regtail(opnd, place);
1206}
1207
1208/*
1209 * Return TRUE if the back reference is legal. We must have seen the close
1210 * brace.
1211 * TODO: Should also check that we don't refer to something that is repeated
1212 * (+*=): what instance of the repetition should we match?
1213 */
1214 static int
1215seen_endbrace(int refnum)
1216{
1217 if (!had_endbrace[refnum])
1218 {
1219 char_u *p;
1220
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001221 // Trick: check if "@<=" or "@<!" follows, in which case
1222 // the \1 can appear before the referenced match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001223 for (p = regparse; *p != NUL; ++p)
1224 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
1225 break;
1226 if (*p == NUL)
1227 {
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001228 emsg(_(e_illegal_back_reference));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001229 rc_did_emsg = TRUE;
1230 return FALSE;
1231 }
1232 }
1233 return TRUE;
1234}
1235
1236/*
1237 * Parse the lowest level.
1238 *
1239 * Optimization: gobbles an entire sequence of ordinary characters so that
1240 * it can turn them into a single node, which is smaller to store and
1241 * faster to run. Don't do this when one_exactly is set.
1242 */
1243 static char_u *
1244regatom(int *flagp)
1245{
1246 char_u *ret;
1247 int flags;
1248 int c;
1249 char_u *p;
1250 int extra = 0;
1251 int save_prev_at_start = prev_at_start;
1252
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001253 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001254
1255 c = getchr();
1256 switch (c)
1257 {
1258 case Magic('^'):
1259 ret = regnode(BOL);
1260 break;
1261
1262 case Magic('$'):
1263 ret = regnode(EOL);
1264#if defined(FEAT_SYN_HL) || defined(PROTO)
1265 had_eol = TRUE;
1266#endif
1267 break;
1268
1269 case Magic('<'):
1270 ret = regnode(BOW);
1271 break;
1272
1273 case Magic('>'):
1274 ret = regnode(EOW);
1275 break;
1276
1277 case Magic('_'):
1278 c = no_Magic(getchr());
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001279 if (c == '^') // "\_^" is start-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001280 {
1281 ret = regnode(BOL);
1282 break;
1283 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001284 if (c == '$') // "\_$" is end-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001285 {
1286 ret = regnode(EOL);
1287#if defined(FEAT_SYN_HL) || defined(PROTO)
1288 had_eol = TRUE;
1289#endif
1290 break;
1291 }
1292
1293 extra = ADD_NL;
1294 *flagp |= HASNL;
1295
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001296 // "\_[" is character range plus newline
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001297 if (c == '[')
1298 goto collection;
1299
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001300 // "\_x" is character class plus newline
1301 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001302
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001303 // Character classes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001304 case Magic('.'):
1305 case Magic('i'):
1306 case Magic('I'):
1307 case Magic('k'):
1308 case Magic('K'):
1309 case Magic('f'):
1310 case Magic('F'):
1311 case Magic('p'):
1312 case Magic('P'):
1313 case Magic('s'):
1314 case Magic('S'):
1315 case Magic('d'):
1316 case Magic('D'):
1317 case Magic('x'):
1318 case Magic('X'):
1319 case Magic('o'):
1320 case Magic('O'):
1321 case Magic('w'):
1322 case Magic('W'):
1323 case Magic('h'):
1324 case Magic('H'):
1325 case Magic('a'):
1326 case Magic('A'):
1327 case Magic('l'):
1328 case Magic('L'):
1329 case Magic('u'):
1330 case Magic('U'):
1331 p = vim_strchr(classchars, no_Magic(c));
1332 if (p == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001333 EMSG_RET_NULL(_(e_invalid_use_of_underscore));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001334
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001335 // When '.' is followed by a composing char ignore the dot, so that
1336 // the composing char is matched here.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001337 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1338 {
1339 c = getchr();
1340 goto do_multibyte;
1341 }
1342 ret = regnode(classcodes[p - classchars] + extra);
1343 *flagp |= HASWIDTH | SIMPLE;
1344 break;
1345
1346 case Magic('n'):
1347 if (reg_string)
1348 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001349 // In a string "\n" matches a newline character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001350 ret = regnode(EXACTLY);
1351 regc(NL);
1352 regc(NUL);
1353 *flagp |= HASWIDTH | SIMPLE;
1354 }
1355 else
1356 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001357 // In buffer text "\n" matches the end of a line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001358 ret = regnode(NEWL);
1359 *flagp |= HASWIDTH | HASNL;
1360 }
1361 break;
1362
1363 case Magic('('):
1364 if (one_exactly)
1365 EMSG_ONE_RET_NULL;
1366 ret = reg(REG_PAREN, &flags);
1367 if (ret == NULL)
1368 return NULL;
1369 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1370 break;
1371
1372 case NUL:
1373 case Magic('|'):
1374 case Magic('&'):
1375 case Magic(')'):
1376 if (one_exactly)
1377 EMSG_ONE_RET_NULL;
Bram Moolenaard0819d12021-12-31 23:15:53 +00001378 // Supposed to be caught earlier.
1379 IEMSG_RET_NULL(_(e_internal_error_in_regexp));
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001380 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001381
1382 case Magic('='):
1383 case Magic('?'):
1384 case Magic('+'):
1385 case Magic('@'):
1386 case Magic('{'):
1387 case Magic('*'):
1388 c = no_Magic(c);
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001389 EMSG3_RET_NULL(_(e_str_chr_follows_nothing),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001390 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001391 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001392
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001393 case Magic('~'): // previous substitute pattern
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001394 if (reg_prev_sub != NULL)
1395 {
1396 char_u *lp;
1397
1398 ret = regnode(EXACTLY);
1399 lp = reg_prev_sub;
1400 while (*lp != NUL)
1401 regc(*lp++);
1402 regc(NUL);
1403 if (*reg_prev_sub != NUL)
1404 {
1405 *flagp |= HASWIDTH;
1406 if ((lp - reg_prev_sub) == 1)
1407 *flagp |= SIMPLE;
1408 }
1409 }
1410 else
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001411 EMSG_RET_NULL(_(e_no_previous_substitute_regular_expression));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001412 break;
1413
1414 case Magic('1'):
1415 case Magic('2'):
1416 case Magic('3'):
1417 case Magic('4'):
1418 case Magic('5'):
1419 case Magic('6'):
1420 case Magic('7'):
1421 case Magic('8'):
1422 case Magic('9'):
1423 {
1424 int refnum;
1425
1426 refnum = c - Magic('0');
1427 if (!seen_endbrace(refnum))
1428 return NULL;
1429 ret = regnode(BACKREF + refnum);
1430 }
1431 break;
1432
1433 case Magic('z'):
1434 {
1435 c = no_Magic(getchr());
1436 switch (c)
1437 {
1438#ifdef FEAT_SYN_HL
1439 case '(': if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001440 EMSG_RET_NULL(_(e_z_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001441 if (one_exactly)
1442 EMSG_ONE_RET_NULL;
1443 ret = reg(REG_ZPAREN, &flags);
1444 if (ret == NULL)
1445 return NULL;
1446 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1447 re_has_z = REX_SET;
1448 break;
1449
1450 case '1':
1451 case '2':
1452 case '3':
1453 case '4':
1454 case '5':
1455 case '6':
1456 case '7':
1457 case '8':
1458 case '9': if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001459 EMSG_RET_NULL(_(e_z1_z9_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001460 ret = regnode(ZREF + c - '0');
1461 re_has_z = REX_USE;
1462 break;
1463#endif
1464
1465 case 's': ret = regnode(MOPEN + 0);
1466 if (re_mult_next("\\zs") == FAIL)
1467 return NULL;
1468 break;
1469
1470 case 'e': ret = regnode(MCLOSE + 0);
1471 if (re_mult_next("\\ze") == FAIL)
1472 return NULL;
1473 break;
1474
Bram Moolenaarb2810f12022-01-08 21:38:52 +00001475 default: EMSG_RET_NULL(_(e_invalid_character_after_bsl_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001476 }
1477 }
1478 break;
1479
1480 case Magic('%'):
1481 {
1482 c = no_Magic(getchr());
1483 switch (c)
1484 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001485 // () without a back reference
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001486 case '(':
1487 if (one_exactly)
1488 EMSG_ONE_RET_NULL;
1489 ret = reg(REG_NPAREN, &flags);
1490 if (ret == NULL)
1491 return NULL;
1492 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1493 break;
1494
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001495 // Catch \%^ and \%$ regardless of where they appear in the
1496 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001497 case '^':
1498 ret = regnode(RE_BOF);
1499 break;
1500
1501 case '$':
1502 ret = regnode(RE_EOF);
1503 break;
1504
1505 case '#':
1506 ret = regnode(CURSOR);
1507 break;
1508
1509 case 'V':
1510 ret = regnode(RE_VISUAL);
1511 break;
1512
1513 case 'C':
1514 ret = regnode(RE_COMPOSING);
1515 break;
1516
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001517 // \%[abc]: Emit as a list of branches, all ending at the last
1518 // branch which matches nothing.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001519 case '[':
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001520 if (one_exactly) // doesn't nest
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001521 EMSG_ONE_RET_NULL;
1522 {
1523 char_u *lastbranch;
1524 char_u *lastnode = NULL;
1525 char_u *br;
1526
1527 ret = NULL;
1528 while ((c = getchr()) != ']')
1529 {
1530 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001531 EMSG2_RET_NULL(_(e_missing_sb_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001532 reg_magic == MAGIC_ALL);
1533 br = regnode(BRANCH);
1534 if (ret == NULL)
1535 ret = br;
1536 else
1537 {
1538 regtail(lastnode, br);
1539 if (reg_toolong)
1540 return NULL;
1541 }
1542
1543 ungetchr();
1544 one_exactly = TRUE;
1545 lastnode = regatom(flagp);
1546 one_exactly = FALSE;
1547 if (lastnode == NULL)
1548 return NULL;
1549 }
1550 if (ret == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001551 EMSG2_RET_NULL(_(e_empty_str_brackets),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001552 reg_magic == MAGIC_ALL);
1553 lastbranch = regnode(BRANCH);
1554 br = regnode(NOTHING);
1555 if (ret != JUST_CALC_SIZE)
1556 {
1557 regtail(lastnode, br);
1558 regtail(lastbranch, br);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001559 // connect all branches to the NOTHING
1560 // branch at the end
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001561 for (br = ret; br != lastnode; )
1562 {
1563 if (OP(br) == BRANCH)
1564 {
1565 regtail(br, lastbranch);
1566 if (reg_toolong)
1567 return NULL;
1568 br = OPERAND(br);
1569 }
1570 else
1571 br = regnext(br);
1572 }
1573 }
1574 *flagp &= ~(HASWIDTH | SIMPLE);
1575 break;
1576 }
1577
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001578 case 'd': // %d123 decimal
1579 case 'o': // %o123 octal
1580 case 'x': // %xab hex 2
1581 case 'u': // %uabcd hex 4
1582 case 'U': // %U1234abcd hex 8
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001583 {
1584 long i;
1585
1586 switch (c)
1587 {
1588 case 'd': i = getdecchrs(); break;
1589 case 'o': i = getoctchrs(); break;
1590 case 'x': i = gethexchrs(2); break;
1591 case 'u': i = gethexchrs(4); break;
1592 case 'U': i = gethexchrs(8); break;
1593 default: i = -1; break;
1594 }
1595
1596 if (i < 0 || i > INT_MAX)
1597 EMSG2_RET_NULL(
Bram Moolenaara6f79292022-01-04 21:30:47 +00001598 _(e_invalid_character_after_str_2),
1599 reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001600 if (use_multibytecode(i))
1601 ret = regnode(MULTIBYTECODE);
1602 else
1603 ret = regnode(EXACTLY);
1604 if (i == 0)
1605 regc(0x0a);
1606 else
1607 regmbc(i);
1608 regc(NUL);
1609 *flagp |= HASWIDTH;
1610 break;
1611 }
1612
1613 default:
1614 if (VIM_ISDIGIT(c) || c == '<' || c == '>'
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001615 || c == '\'' || c == '.')
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001616 {
1617 long_u n = 0;
1618 int cmp;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001619 int cur = FALSE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001620
1621 cmp = c;
1622 if (cmp == '<' || cmp == '>')
1623 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001624 if (no_Magic(c) == '.')
1625 {
1626 cur = TRUE;
1627 c = getchr();
1628 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001629 while (VIM_ISDIGIT(c))
1630 {
1631 n = n * 10 + (c - '0');
1632 c = getchr();
1633 }
1634 if (c == '\'' && n == 0)
1635 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001636 // "\%'m", "\%<'m" and "\%>'m": Mark
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001637 c = getchr();
1638 ret = regnode(RE_MARK);
1639 if (ret == JUST_CALC_SIZE)
1640 regsize += 2;
1641 else
1642 {
1643 *regcode++ = c;
1644 *regcode++ = cmp;
1645 }
1646 break;
1647 }
1648 else if (c == 'l' || c == 'c' || c == 'v')
1649 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001650 if (cur && n)
1651 {
1652 semsg(_(e_regexp_number_after_dot_pos_search), no_Magic(c));
1653 rc_did_emsg = TRUE;
1654 return NULL;
1655 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001656 if (c == 'l')
1657 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001658 if (cur)
1659 n = curwin->w_cursor.lnum;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001660 ret = regnode(RE_LNUM);
1661 if (save_prev_at_start)
1662 at_start = TRUE;
1663 }
1664 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001665 {
1666 if (cur)
1667 {
1668 n = curwin->w_cursor.col;
1669 n++;
1670 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001671 ret = regnode(RE_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001672 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001673 else
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001674 {
1675 if (cur)
1676 {
1677 colnr_T vcol = 0;
1678
1679 getvvcol(curwin, &curwin->w_cursor,
1680 NULL, NULL, &vcol);
1681 ++vcol;
1682 n = vcol;
1683 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001684 ret = regnode(RE_VCOL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001685 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001686 if (ret == JUST_CALC_SIZE)
1687 regsize += 5;
1688 else
1689 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001690 // put the number and the optional
1691 // comparator after the opcode
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001692 regcode = re_put_long(regcode, n);
1693 *regcode++ = cmp;
1694 }
1695 break;
1696 }
1697 }
1698
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001699 EMSG2_RET_NULL(_(e_invalid_character_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001700 reg_magic == MAGIC_ALL);
1701 }
1702 }
1703 break;
1704
1705 case Magic('['):
1706collection:
1707 {
1708 char_u *lp;
1709
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001710 // If there is no matching ']', we assume the '[' is a normal
1711 // character. This makes 'incsearch' and ":help [" work.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001712 lp = skip_anyof(regparse);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001713 if (*lp == ']') // there is a matching ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001714 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001715 int startc = -1; // > 0 when next '-' is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001716 int endc;
1717
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001718 // In a character class, different parsing rules apply.
1719 // Not even \ is special anymore, nothing is.
1720 if (*regparse == '^') // Complement of range.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001721 {
1722 ret = regnode(ANYBUT + extra);
1723 regparse++;
1724 }
1725 else
1726 ret = regnode(ANYOF + extra);
1727
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001728 // At the start ']' and '-' mean the literal character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001729 if (*regparse == ']' || *regparse == '-')
1730 {
1731 startc = *regparse;
1732 regc(*regparse++);
1733 }
1734
1735 while (*regparse != NUL && *regparse != ']')
1736 {
1737 if (*regparse == '-')
1738 {
1739 ++regparse;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001740 // The '-' is not used for a range at the end and
1741 // after or before a '\n'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001742 if (*regparse == ']' || *regparse == NUL
1743 || startc == -1
1744 || (regparse[0] == '\\' && regparse[1] == 'n'))
1745 {
1746 regc('-');
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001747 startc = '-'; // [--x] is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001748 }
1749 else
1750 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001751 // Also accept "a-[.z.]"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001752 endc = 0;
1753 if (*regparse == '[')
1754 endc = get_coll_element(&regparse);
1755 if (endc == 0)
1756 {
1757 if (has_mbyte)
1758 endc = mb_ptr2char_adv(&regparse);
1759 else
1760 endc = *regparse++;
1761 }
1762
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001763 // Handle \o40, \x20 and \u20AC style sequences
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001764 if (endc == '\\' && !reg_cpo_lit && !reg_cpo_bsl)
1765 endc = coll_get_char();
1766
1767 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001768 EMSG_RET_NULL(_(e_reverse_range_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001769 if (has_mbyte && ((*mb_char2len)(startc) > 1
1770 || (*mb_char2len)(endc) > 1))
1771 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001772 // Limit to a range of 256 chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001773 if (endc > startc + 256)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001774 EMSG_RET_NULL(_(e_range_too_large_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001775 while (++startc <= endc)
1776 regmbc(startc);
1777 }
1778 else
1779 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001780 while (++startc <= endc)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00001781 regc(startc);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001782 }
1783 startc = -1;
1784 }
1785 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001786 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1787 // accepts "\t", "\e", etc., but only when the 'l' flag in
1788 // 'cpoptions' is not included.
1789 // Posix doesn't recognize backslash at all.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001790 else if (*regparse == '\\'
1791 && !reg_cpo_bsl
1792 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1793 || (!reg_cpo_lit
1794 && vim_strchr(REGEXP_ABBR,
1795 regparse[1]) != NULL)))
1796 {
1797 regparse++;
1798 if (*regparse == 'n')
1799 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001800 // '\n' in range: also match NL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001801 if (ret != JUST_CALC_SIZE)
1802 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001803 // Using \n inside [^] does not change what
1804 // matches. "[^\n]" is the same as ".".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001805 if (*ret == ANYOF)
1806 {
1807 *ret = ANYOF + ADD_NL;
1808 *flagp |= HASNL;
1809 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001810 // else: must have had a \n already
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001811 }
1812 regparse++;
1813 startc = -1;
1814 }
1815 else if (*regparse == 'd'
1816 || *regparse == 'o'
1817 || *regparse == 'x'
1818 || *regparse == 'u'
1819 || *regparse == 'U')
1820 {
1821 startc = coll_get_char();
1822 if (startc == 0)
1823 regc(0x0a);
1824 else
1825 regmbc(startc);
1826 }
1827 else
1828 {
1829 startc = backslash_trans(*regparse++);
1830 regc(startc);
1831 }
1832 }
1833 else if (*regparse == '[')
1834 {
1835 int c_class;
1836 int cu;
1837
1838 c_class = get_char_class(&regparse);
1839 startc = -1;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001840 // Characters assumed to be 8 bits!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001841 switch (c_class)
1842 {
1843 case CLASS_NONE:
1844 c_class = get_equi_class(&regparse);
1845 if (c_class != 0)
1846 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001847 // produce equivalence class
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001848 reg_equi_class(c_class);
1849 }
1850 else if ((c_class =
1851 get_coll_element(&regparse)) != 0)
1852 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001853 // produce a collating element
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001854 regmbc(c_class);
1855 }
1856 else
1857 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001858 // literal '[', allow [[-x] as a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001859 startc = *regparse++;
1860 regc(startc);
1861 }
1862 break;
1863 case CLASS_ALNUM:
1864 for (cu = 1; cu < 128; cu++)
1865 if (isalnum(cu))
1866 regmbc(cu);
1867 break;
1868 case CLASS_ALPHA:
1869 for (cu = 1; cu < 128; cu++)
1870 if (isalpha(cu))
1871 regmbc(cu);
1872 break;
1873 case CLASS_BLANK:
1874 regc(' ');
1875 regc('\t');
1876 break;
1877 case CLASS_CNTRL:
1878 for (cu = 1; cu <= 127; cu++)
1879 if (iscntrl(cu))
1880 regmbc(cu);
1881 break;
1882 case CLASS_DIGIT:
1883 for (cu = 1; cu <= 127; cu++)
1884 if (VIM_ISDIGIT(cu))
1885 regmbc(cu);
1886 break;
1887 case CLASS_GRAPH:
1888 for (cu = 1; cu <= 127; cu++)
1889 if (isgraph(cu))
1890 regmbc(cu);
1891 break;
1892 case CLASS_LOWER:
1893 for (cu = 1; cu <= 255; cu++)
1894 if (MB_ISLOWER(cu) && cu != 170
1895 && cu != 186)
1896 regmbc(cu);
1897 break;
1898 case CLASS_PRINT:
1899 for (cu = 1; cu <= 255; cu++)
1900 if (vim_isprintc(cu))
1901 regmbc(cu);
1902 break;
1903 case CLASS_PUNCT:
1904 for (cu = 1; cu < 128; cu++)
1905 if (ispunct(cu))
1906 regmbc(cu);
1907 break;
1908 case CLASS_SPACE:
1909 for (cu = 9; cu <= 13; cu++)
1910 regc(cu);
1911 regc(' ');
1912 break;
1913 case CLASS_UPPER:
1914 for (cu = 1; cu <= 255; cu++)
1915 if (MB_ISUPPER(cu))
1916 regmbc(cu);
1917 break;
1918 case CLASS_XDIGIT:
1919 for (cu = 1; cu <= 255; cu++)
1920 if (vim_isxdigit(cu))
1921 regmbc(cu);
1922 break;
1923 case CLASS_TAB:
1924 regc('\t');
1925 break;
1926 case CLASS_RETURN:
1927 regc('\r');
1928 break;
1929 case CLASS_BACKSPACE:
1930 regc('\b');
1931 break;
1932 case CLASS_ESCAPE:
1933 regc('\033');
1934 break;
1935 case CLASS_IDENT:
1936 for (cu = 1; cu <= 255; cu++)
1937 if (vim_isIDc(cu))
1938 regmbc(cu);
1939 break;
1940 case CLASS_KEYWORD:
1941 for (cu = 1; cu <= 255; cu++)
1942 if (reg_iswordc(cu))
1943 regmbc(cu);
1944 break;
1945 case CLASS_FNAME:
1946 for (cu = 1; cu <= 255; cu++)
1947 if (vim_isfilec(cu))
1948 regmbc(cu);
1949 break;
1950 }
1951 }
1952 else
1953 {
1954 if (has_mbyte)
1955 {
1956 int len;
1957
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001958 // produce a multibyte character, including any
1959 // following composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001960 startc = mb_ptr2char(regparse);
1961 len = (*mb_ptr2len)(regparse);
1962 if (enc_utf8 && utf_char2len(startc) != len)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001963 startc = -1; // composing chars
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001964 while (--len >= 0)
1965 regc(*regparse++);
1966 }
1967 else
1968 {
1969 startc = *regparse++;
1970 regc(startc);
1971 }
1972 }
1973 }
1974 regc(NUL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001975 prevchr_len = 1; // last char was the ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001976 if (*regparse != ']')
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001977 EMSG_RET_NULL(_(e_too_many_brackets)); // Cannot happen?
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001978 skipchr(); // let's be friends with the lexer again
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001979 *flagp |= HASWIDTH | SIMPLE;
1980 break;
1981 }
1982 else if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001983 EMSG2_RET_NULL(_(e_missing_rsb_after_str_lsb),
1984 reg_magic > MAGIC_OFF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001985 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001986 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001987
1988 default:
1989 {
1990 int len;
1991
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001992 // A multi-byte character is handled as a separate atom if it's
1993 // before a multi and when it's a composing char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001994 if (use_multibytecode(c))
1995 {
1996do_multibyte:
1997 ret = regnode(MULTIBYTECODE);
1998 regmbc(c);
1999 *flagp |= HASWIDTH | SIMPLE;
2000 break;
2001 }
2002
2003 ret = regnode(EXACTLY);
2004
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002005 // Append characters as long as:
2006 // - there is no following multi, we then need the character in
2007 // front of it as a single character operand
2008 // - not running into a Magic character
2009 // - "one_exactly" is not set
2010 // But always emit at least one character. Might be a Multi,
2011 // e.g., a "[" without matching "]".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002012 for (len = 0; c != NUL && (len == 0
2013 || (re_multi_type(peekchr()) == NOT_MULTI
2014 && !one_exactly
2015 && !is_Magic(c))); ++len)
2016 {
2017 c = no_Magic(c);
2018 if (has_mbyte)
2019 {
2020 regmbc(c);
2021 if (enc_utf8)
2022 {
2023 int l;
2024
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002025 // Need to get composing character too.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002026 for (;;)
2027 {
2028 l = utf_ptr2len(regparse);
2029 if (!UTF_COMPOSINGLIKE(regparse, regparse + l))
2030 break;
2031 regmbc(utf_ptr2char(regparse));
2032 skipchr();
2033 }
2034 }
2035 }
2036 else
2037 regc(c);
2038 c = getchr();
2039 }
2040 ungetchr();
2041
2042 regc(NUL);
2043 *flagp |= HASWIDTH;
2044 if (len == 1)
2045 *flagp |= SIMPLE;
2046 }
2047 break;
2048 }
2049
2050 return ret;
2051}
2052
2053/*
2054 * Parse something followed by possible [*+=].
2055 *
2056 * Note that the branching code sequences used for = and the general cases
2057 * of * and + are somewhat optimized: they use the same NOTHING node as
2058 * both the endmarker for their branch list and the body of the last branch.
2059 * It might seem that this node could be dispensed with entirely, but the
2060 * endmarker role is not redundant.
2061 */
2062 static char_u *
2063regpiece(int *flagp)
2064{
2065 char_u *ret;
2066 int op;
2067 char_u *next;
2068 int flags;
2069 long minval;
2070 long maxval;
2071
2072 ret = regatom(&flags);
2073 if (ret == NULL)
2074 return NULL;
2075
2076 op = peekchr();
2077 if (re_multi_type(op) == NOT_MULTI)
2078 {
2079 *flagp = flags;
2080 return ret;
2081 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002082 // default flags
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002083 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
2084
2085 skipchr();
2086 switch (op)
2087 {
2088 case Magic('*'):
2089 if (flags & SIMPLE)
2090 reginsert(STAR, ret);
2091 else
2092 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002093 // Emit x* as (x&|), where & means "self".
2094 reginsert(BRANCH, ret); // Either x
2095 regoptail(ret, regnode(BACK)); // and loop
2096 regoptail(ret, ret); // back
2097 regtail(ret, regnode(BRANCH)); // or
2098 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002099 }
2100 break;
2101
2102 case Magic('+'):
2103 if (flags & SIMPLE)
2104 reginsert(PLUS, ret);
2105 else
2106 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002107 // Emit x+ as x(&|), where & means "self".
2108 next = regnode(BRANCH); // Either
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002109 regtail(ret, next);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002110 regtail(regnode(BACK), ret); // loop back
2111 regtail(next, regnode(BRANCH)); // or
2112 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002113 }
2114 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2115 break;
2116
2117 case Magic('@'):
2118 {
2119 int lop = END;
2120 long nr;
2121
2122 nr = getdecchrs();
2123 switch (no_Magic(getchr()))
2124 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002125 case '=': lop = MATCH; break; // \@=
2126 case '!': lop = NOMATCH; break; // \@!
2127 case '>': lop = SUBPAT; break; // \@>
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002128 case '<': switch (no_Magic(getchr()))
2129 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002130 case '=': lop = BEHIND; break; // \@<=
2131 case '!': lop = NOBEHIND; break; // \@<!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002132 }
2133 }
2134 if (lop == END)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002135 EMSG2_RET_NULL(_(e_invalid_character_after_str_at),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002136 reg_magic == MAGIC_ALL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002137 // Look behind must match with behind_pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002138 if (lop == BEHIND || lop == NOBEHIND)
2139 {
2140 regtail(ret, regnode(BHPOS));
2141 *flagp |= HASLOOKBH;
2142 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002143 regtail(ret, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002144 if (lop == BEHIND || lop == NOBEHIND)
2145 {
2146 if (nr < 0)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002147 nr = 0; // no limit is same as zero limit
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002148 reginsert_nr(lop, nr, ret);
2149 }
2150 else
2151 reginsert(lop, ret);
2152 break;
2153 }
2154
2155 case Magic('?'):
2156 case Magic('='):
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002157 // Emit x= as (x|)
2158 reginsert(BRANCH, ret); // Either x
2159 regtail(ret, regnode(BRANCH)); // or
2160 next = regnode(NOTHING); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002161 regtail(ret, next);
2162 regoptail(ret, next);
2163 break;
2164
2165 case Magic('{'):
2166 if (!read_limits(&minval, &maxval))
2167 return NULL;
2168 if (flags & SIMPLE)
2169 {
2170 reginsert(BRACE_SIMPLE, ret);
2171 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2172 }
2173 else
2174 {
2175 if (num_complex_braces >= 10)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002176 EMSG2_RET_NULL(_(e_too_many_complex_str_curly),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002177 reg_magic == MAGIC_ALL);
2178 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
2179 regoptail(ret, regnode(BACK));
2180 regoptail(ret, ret);
2181 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2182 ++num_complex_braces;
2183 }
2184 if (minval > 0 && maxval > 0)
2185 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2186 break;
2187 }
2188 if (re_multi_type(peekchr()) != NOT_MULTI)
2189 {
2190 // Can't have a multi follow a multi.
2191 if (peekchr() == Magic('*'))
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00002192 EMSG2_RET_NULL(_(e_nested_str), reg_magic >= MAGIC_ON);
2193 EMSG3_RET_NULL(_(e_nested_str_chr), reg_magic == MAGIC_ALL,
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002194 no_Magic(peekchr()));
2195 }
2196
2197 return ret;
2198}
2199
2200/*
2201 * Parse one alternative of an | or & operator.
2202 * Implements the concatenation operator.
2203 */
2204 static char_u *
2205regconcat(int *flagp)
2206{
2207 char_u *first = NULL;
2208 char_u *chain = NULL;
2209 char_u *latest;
2210 int flags;
2211 int cont = TRUE;
2212
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002213 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002214
2215 while (cont)
2216 {
2217 switch (peekchr())
2218 {
2219 case NUL:
2220 case Magic('|'):
2221 case Magic('&'):
2222 case Magic(')'):
2223 cont = FALSE;
2224 break;
2225 case Magic('Z'):
2226 regflags |= RF_ICOMBINE;
2227 skipchr_keepstart();
2228 break;
2229 case Magic('c'):
2230 regflags |= RF_ICASE;
2231 skipchr_keepstart();
2232 break;
2233 case Magic('C'):
2234 regflags |= RF_NOICASE;
2235 skipchr_keepstart();
2236 break;
2237 case Magic('v'):
2238 reg_magic = MAGIC_ALL;
2239 skipchr_keepstart();
2240 curchr = -1;
2241 break;
2242 case Magic('m'):
2243 reg_magic = MAGIC_ON;
2244 skipchr_keepstart();
2245 curchr = -1;
2246 break;
2247 case Magic('M'):
2248 reg_magic = MAGIC_OFF;
2249 skipchr_keepstart();
2250 curchr = -1;
2251 break;
2252 case Magic('V'):
2253 reg_magic = MAGIC_NONE;
2254 skipchr_keepstart();
2255 curchr = -1;
2256 break;
2257 default:
2258 latest = regpiece(&flags);
2259 if (latest == NULL || reg_toolong)
2260 return NULL;
2261 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002262 if (chain == NULL) // First piece.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002263 *flagp |= flags & SPSTART;
2264 else
2265 regtail(chain, latest);
2266 chain = latest;
2267 if (first == NULL)
2268 first = latest;
2269 break;
2270 }
2271 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002272 if (first == NULL) // Loop ran zero times.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002273 first = regnode(NOTHING);
2274 return first;
2275}
2276
2277/*
2278 * Parse one alternative of an | operator.
2279 * Implements the & operator.
2280 */
2281 static char_u *
2282regbranch(int *flagp)
2283{
2284 char_u *ret;
2285 char_u *chain = NULL;
2286 char_u *latest;
2287 int flags;
2288
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002289 *flagp = WORST | HASNL; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002290
2291 ret = regnode(BRANCH);
2292 for (;;)
2293 {
2294 latest = regconcat(&flags);
2295 if (latest == NULL)
2296 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002297 // If one of the branches has width, the whole thing has. If one of
2298 // the branches anchors at start-of-line, the whole thing does.
2299 // If one of the branches uses look-behind, the whole thing does.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002300 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002301 // If one of the branches doesn't match a line-break, the whole thing
2302 // doesn't.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002303 *flagp &= ~HASNL | (flags & HASNL);
2304 if (chain != NULL)
2305 regtail(chain, latest);
2306 if (peekchr() != Magic('&'))
2307 break;
2308 skipchr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002309 regtail(latest, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002310 if (reg_toolong)
2311 break;
2312 reginsert(MATCH, latest);
2313 chain = latest;
2314 }
2315
2316 return ret;
2317}
2318
2319/*
2320 * Parse regular expression, i.e. main body or parenthesized thing.
2321 *
2322 * Caller must absorb opening parenthesis.
2323 *
2324 * Combining parenthesis handling with the base level of regular expression
2325 * is a trifle forced, but the need to tie the tails of the branches to what
2326 * follows makes it hard to avoid.
2327 */
2328 static char_u *
2329reg(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002330 int paren, // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002331 int *flagp)
2332{
2333 char_u *ret;
2334 char_u *br;
2335 char_u *ender;
2336 int parno = 0;
2337 int flags;
2338
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002339 *flagp = HASWIDTH; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002340
2341#ifdef FEAT_SYN_HL
2342 if (paren == REG_ZPAREN)
2343 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002344 // Make a ZOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002345 if (regnzpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002346 EMSG_RET_NULL(_(e_too_many_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002347 parno = regnzpar;
2348 regnzpar++;
2349 ret = regnode(ZOPEN + parno);
2350 }
2351 else
2352#endif
2353 if (paren == REG_PAREN)
2354 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002355 // Make a MOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002356 if (regnpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002357 EMSG2_RET_NULL(_(e_too_many_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002358 parno = regnpar;
2359 ++regnpar;
2360 ret = regnode(MOPEN + parno);
2361 }
2362 else if (paren == REG_NPAREN)
2363 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002364 // Make a NOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002365 ret = regnode(NOPEN);
2366 }
2367 else
2368 ret = NULL;
2369
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002370 // Pick up the branches, linking them together.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002371 br = regbranch(&flags);
2372 if (br == NULL)
2373 return NULL;
2374 if (ret != NULL)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002375 regtail(ret, br); // [MZ]OPEN -> first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002376 else
2377 ret = br;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002378 // If one of the branches can be zero-width, the whole thing can.
2379 // If one of the branches has * at start or matches a line-break, the
2380 // whole thing can.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002381 if (!(flags & HASWIDTH))
2382 *flagp &= ~HASWIDTH;
2383 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2384 while (peekchr() == Magic('|'))
2385 {
2386 skipchr();
2387 br = regbranch(&flags);
2388 if (br == NULL || reg_toolong)
2389 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002390 regtail(ret, br); // BRANCH -> BRANCH.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002391 if (!(flags & HASWIDTH))
2392 *flagp &= ~HASWIDTH;
2393 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2394 }
2395
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002396 // Make a closing node, and hook it on the end.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002397 ender = regnode(
2398#ifdef FEAT_SYN_HL
2399 paren == REG_ZPAREN ? ZCLOSE + parno :
2400#endif
2401 paren == REG_PAREN ? MCLOSE + parno :
2402 paren == REG_NPAREN ? NCLOSE : END);
2403 regtail(ret, ender);
2404
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002405 // Hook the tails of the branches to the closing node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002406 for (br = ret; br != NULL; br = regnext(br))
2407 regoptail(br, ender);
2408
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002409 // Check for proper termination.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002410 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2411 {
2412#ifdef FEAT_SYN_HL
2413 if (paren == REG_ZPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002414 EMSG_RET_NULL(_(e_unmatched_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002415 else
2416#endif
2417 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002418 EMSG2_RET_NULL(_(e_unmatched_str_percent_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002419 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002420 EMSG2_RET_NULL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002421 }
2422 else if (paren == REG_NOPAREN && peekchr() != NUL)
2423 {
2424 if (curchr == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002425 EMSG2_RET_NULL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002426 else
Bram Moolenaar74409f62022-01-01 15:58:22 +00002427 EMSG_RET_NULL(_(e_trailing_characters)); // "Can't happen".
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002428 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002429 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002430 // Here we set the flag allowing back references to this set of
2431 // parentheses.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002432 if (paren == REG_PAREN)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002433 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002434 return ret;
2435}
2436
2437/*
2438 * bt_regcomp() - compile a regular expression into internal code for the
2439 * traditional back track matcher.
2440 * Returns the program in allocated space. Returns NULL for an error.
2441 *
2442 * We can't allocate space until we know how big the compiled form will be,
2443 * but we can't compile it (and thus know how big it is) until we've got a
2444 * place to put the code. So we cheat: we compile it twice, once with code
2445 * generation turned off and size counting turned on, and once "for real".
2446 * This also means that we don't allocate space until we are sure that the
2447 * thing really will compile successfully, and we never have to move the
2448 * code and thus invalidate pointers into it. (Note that it has to be in
2449 * one piece because vim_free() must be able to free it all.)
2450 *
2451 * Whether upper/lower case is to be ignored is decided when executing the
2452 * program, it does not matter here.
2453 *
2454 * Beware that the optimization-preparation code in here knows about some
2455 * of the structure of the compiled regexp.
2456 * "re_flags": RE_MAGIC and/or RE_STRING.
2457 */
2458 static regprog_T *
2459bt_regcomp(char_u *expr, int re_flags)
2460{
2461 bt_regprog_T *r;
2462 char_u *scan;
2463 char_u *longest;
2464 int len;
2465 int flags;
2466
2467 if (expr == NULL)
Bram Moolenaare29a27f2021-07-20 21:07:36 +02002468 IEMSG_RET_NULL(_(e_null_argument));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002469
2470 init_class_tab();
2471
2472 // First pass: determine size, legality.
2473 regcomp_start(expr, re_flags);
2474 regcode = JUST_CALC_SIZE;
2475 regc(REGMAGIC);
2476 if (reg(REG_NOPAREN, &flags) == NULL)
2477 return NULL;
2478
2479 // Allocate space.
2480 r = alloc(offsetof(bt_regprog_T, program) + regsize);
2481 if (r == NULL)
2482 return NULL;
2483 r->re_in_use = FALSE;
2484
2485 // Second pass: emit code.
2486 regcomp_start(expr, re_flags);
2487 regcode = r->program;
2488 regc(REGMAGIC);
2489 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
2490 {
2491 vim_free(r);
2492 if (reg_toolong)
Bram Moolenaareaaac012022-01-02 17:00:40 +00002493 EMSG_RET_NULL(_(e_pattern_too_long));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002494 return NULL;
2495 }
2496
2497 // Dig out information for optimizations.
2498 r->regstart = NUL; // Worst-case defaults.
2499 r->reganch = 0;
2500 r->regmust = NULL;
2501 r->regmlen = 0;
2502 r->regflags = regflags;
2503 if (flags & HASNL)
2504 r->regflags |= RF_HASNL;
2505 if (flags & HASLOOKBH)
2506 r->regflags |= RF_LOOKBH;
2507#ifdef FEAT_SYN_HL
2508 // Remember whether this pattern has any \z specials in it.
2509 r->reghasz = re_has_z;
2510#endif
2511 scan = r->program + 1; // First BRANCH.
2512 if (OP(regnext(scan)) == END) // Only one top-level choice.
2513 {
2514 scan = OPERAND(scan);
2515
2516 // Starting-point info.
2517 if (OP(scan) == BOL || OP(scan) == RE_BOF)
2518 {
2519 r->reganch++;
2520 scan = regnext(scan);
2521 }
2522
2523 if (OP(scan) == EXACTLY)
2524 {
2525 if (has_mbyte)
2526 r->regstart = (*mb_ptr2char)(OPERAND(scan));
2527 else
2528 r->regstart = *OPERAND(scan);
2529 }
2530 else if ((OP(scan) == BOW
2531 || OP(scan) == EOW
2532 || OP(scan) == NOTHING
2533 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
2534 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
2535 && OP(regnext(scan)) == EXACTLY)
2536 {
2537 if (has_mbyte)
2538 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
2539 else
2540 r->regstart = *OPERAND(regnext(scan));
2541 }
2542
2543 // If there's something expensive in the r.e., find the longest
2544 // literal string that must appear and make it the regmust. Resolve
2545 // ties in favor of later strings, since the regstart check works
2546 // with the beginning of the r.e. and avoiding duplication
2547 // strengthens checking. Not a strong reason, but sufficient in the
2548 // absence of others.
2549
2550 // When the r.e. starts with BOW, it is faster to look for a regmust
2551 // first. Used a lot for "#" and "*" commands. (Added by mool).
2552 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
2553 && !(flags & HASNL))
2554 {
2555 longest = NULL;
2556 len = 0;
2557 for (; scan != NULL; scan = regnext(scan))
2558 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
2559 {
2560 longest = OPERAND(scan);
2561 len = (int)STRLEN(OPERAND(scan));
2562 }
2563 r->regmust = longest;
2564 r->regmlen = len;
2565 }
2566 }
2567#ifdef BT_REGEXP_DUMP
2568 regdump(expr, r);
2569#endif
2570 r->engine = &bt_regengine;
2571 return (regprog_T *)r;
2572}
2573
2574#if defined(FEAT_SYN_HL) || defined(PROTO)
2575/*
2576 * Check if during the previous call to vim_regcomp the EOL item "$" has been
2577 * found. This is messy, but it works fine.
2578 */
2579 int
2580vim_regcomp_had_eol(void)
2581{
2582 return had_eol;
2583}
2584#endif
2585
2586/*
2587 * Get a number after a backslash that is inside [].
2588 * When nothing is recognized return a backslash.
2589 */
2590 static int
2591coll_get_char(void)
2592{
2593 long nr = -1;
2594
2595 switch (*regparse++)
2596 {
2597 case 'd': nr = getdecchrs(); break;
2598 case 'o': nr = getoctchrs(); break;
2599 case 'x': nr = gethexchrs(2); break;
2600 case 'u': nr = gethexchrs(4); break;
2601 case 'U': nr = gethexchrs(8); break;
2602 }
2603 if (nr < 0 || nr > INT_MAX)
2604 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002605 // If getting the number fails be backwards compatible: the character
2606 // is a backslash.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002607 --regparse;
2608 nr = '\\';
2609 }
2610 return nr;
2611}
2612
2613/*
2614 * Free a compiled regexp program, returned by bt_regcomp().
2615 */
2616 static void
2617bt_regfree(regprog_T *prog)
2618{
2619 vim_free(prog);
2620}
2621
// Move rex.input forward with MB_PTR_ADV() (presumably by one, possibly
// multi-byte, character).
#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)
2623
/*
 * The arguments from BRACE_LIMITS are stored here.  They are actually local
 * to regmatch(), but they are kept at file level to reduce the amount of
 * stack space used (it can be called recursively many times).
 */
static long	bl_minval;
static long	bl_maxval;
2631
2632/*
2633 * Save the input line and position in a regsave_T.
2634 */
2635 static void
2636reg_save(regsave_T *save, garray_T *gap)
2637{
2638 if (REG_MULTI)
2639 {
2640 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
2641 save->rs_u.pos.lnum = rex.lnum;
2642 }
2643 else
2644 save->rs_u.ptr = rex.input;
2645 save->rs_len = gap->ga_len;
2646}
2647
2648/*
2649 * Restore the input line and position from a regsave_T.
2650 */
2651 static void
2652reg_restore(regsave_T *save, garray_T *gap)
2653{
2654 if (REG_MULTI)
2655 {
2656 if (rex.lnum != save->rs_u.pos.lnum)
2657 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002658 // only call reg_getline() when the line number changed to save
2659 // a bit of time
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002660 rex.lnum = save->rs_u.pos.lnum;
2661 rex.line = reg_getline(rex.lnum);
2662 }
2663 rex.input = rex.line + save->rs_u.pos.col;
2664 }
2665 else
2666 rex.input = save->rs_u.ptr;
2667 gap->ga_len = save->rs_len;
2668}
2669
2670/*
2671 * Return TRUE if current position is equal to saved position.
2672 */
2673 static int
2674reg_save_equal(regsave_T *save)
2675{
2676 if (REG_MULTI)
2677 return rex.lnum == save->rs_u.pos.lnum
2678 && rex.input == rex.line + save->rs_u.pos.col;
2679 return rex.input == save->rs_u.ptr;
2680}
2681
// Save the sub-expressions before attempting a match.
// Uses save_se_multi() with the position "posp" when REG_MULTI is set and
// save_se_one() with the pointer "pp" otherwise.
// The expansion is parenthesized so that the conditional expression cannot
// combine with operators next to the place where the macro is used.
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))
2685
// After a failed match restore the sub-expressions.
// Puts the value kept in "savep" back into "*pp" (pointer) or, when REG_MULTI
// is set, into "*posp" (position).
// NOTE: this expands to a { } block, not to a single statement, thus it can't
// be used as the body of an "if" that is followed by "else".
#define restore_se(savep, posp, pp) { \
    if (!REG_MULTI) \
	*(pp) = (savep)->se_u.ptr; \
    else \
	*(posp) = (savep)->se_u.pos; }
2692
2693/*
2694 * Tentatively set the sub-expression start to the current position (after
2695 * calling regmatch() they will have changed). Need to save the existing
2696 * values for when there is no match.
2697 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
2698 * depending on REG_MULTI.
2699 */
2700 static void
2701save_se_multi(save_se_T *savep, lpos_T *posp)
2702{
2703 savep->se_u.pos = *posp;
2704 posp->lnum = rex.lnum;
2705 posp->col = (colnr_T)(rex.input - rex.line);
2706}
2707
2708 static void
2709save_se_one(save_se_T *savep, char_u **pp)
2710{
2711 savep->se_u.ptr = *pp;
2712 *pp = rex.input;
2713}
2714
2715/*
2716 * regrepeat - repeatedly match something simple, return how many.
2717 * Advances rex.input (and rex.lnum) to just after the matched chars.
2718 */
2719 static int
2720regrepeat(
2721 char_u *p,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002722 long maxcount) // maximum number of matches allowed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002723{
2724 long count = 0;
2725 char_u *scan;
2726 char_u *opnd;
2727 int mask;
2728 int testval = 0;
2729
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002730 scan = rex.input; // Make local copy of rex.input for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002731 opnd = OPERAND(p);
2732 switch (OP(p))
2733 {
2734 case ANY:
2735 case ANY + ADD_NL:
2736 while (count < maxcount)
2737 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002738 // Matching anything means we continue until end-of-line (or
2739 // end-of-file for ANY + ADD_NL), only limited by maxcount.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002740 while (*scan != NUL && count < maxcount)
2741 {
2742 ++count;
2743 MB_PTR_ADV(scan);
2744 }
2745 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2746 || rex.reg_line_lbr || count == maxcount)
2747 break;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002748 ++count; // count the line-break
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002749 reg_nextline();
2750 scan = rex.input;
2751 if (got_int)
2752 break;
2753 }
2754 break;
2755
2756 case IDENT:
2757 case IDENT + ADD_NL:
2758 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002759 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002760 case SIDENT:
2761 case SIDENT + ADD_NL:
2762 while (count < maxcount)
2763 {
2764 if (vim_isIDc(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2765 {
2766 MB_PTR_ADV(scan);
2767 }
2768 else if (*scan == NUL)
2769 {
2770 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2771 || rex.reg_line_lbr)
2772 break;
2773 reg_nextline();
2774 scan = rex.input;
2775 if (got_int)
2776 break;
2777 }
2778 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2779 ++scan;
2780 else
2781 break;
2782 ++count;
2783 }
2784 break;
2785
2786 case KWORD:
2787 case KWORD + ADD_NL:
2788 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002789 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002790 case SKWORD:
2791 case SKWORD + ADD_NL:
2792 while (count < maxcount)
2793 {
2794 if (vim_iswordp_buf(scan, rex.reg_buf)
2795 && (testval || !VIM_ISDIGIT(*scan)))
2796 {
2797 MB_PTR_ADV(scan);
2798 }
2799 else if (*scan == NUL)
2800 {
2801 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2802 || rex.reg_line_lbr)
2803 break;
2804 reg_nextline();
2805 scan = rex.input;
2806 if (got_int)
2807 break;
2808 }
2809 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2810 ++scan;
2811 else
2812 break;
2813 ++count;
2814 }
2815 break;
2816
2817 case FNAME:
2818 case FNAME + ADD_NL:
2819 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002820 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002821 case SFNAME:
2822 case SFNAME + ADD_NL:
2823 while (count < maxcount)
2824 {
2825 if (vim_isfilec(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2826 {
2827 MB_PTR_ADV(scan);
2828 }
2829 else if (*scan == NUL)
2830 {
2831 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2832 || rex.reg_line_lbr)
2833 break;
2834 reg_nextline();
2835 scan = rex.input;
2836 if (got_int)
2837 break;
2838 }
2839 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2840 ++scan;
2841 else
2842 break;
2843 ++count;
2844 }
2845 break;
2846
2847 case PRINT:
2848 case PRINT + ADD_NL:
2849 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002850 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002851 case SPRINT:
2852 case SPRINT + ADD_NL:
2853 while (count < maxcount)
2854 {
2855 if (*scan == NUL)
2856 {
2857 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2858 || rex.reg_line_lbr)
2859 break;
2860 reg_nextline();
2861 scan = rex.input;
2862 if (got_int)
2863 break;
2864 }
2865 else if (vim_isprintc(PTR2CHAR(scan)) == 1
2866 && (testval || !VIM_ISDIGIT(*scan)))
2867 {
2868 MB_PTR_ADV(scan);
2869 }
2870 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2871 ++scan;
2872 else
2873 break;
2874 ++count;
2875 }
2876 break;
2877
2878 case WHITE:
2879 case WHITE + ADD_NL:
2880 testval = mask = RI_WHITE;
2881do_class:
2882 while (count < maxcount)
2883 {
2884 int l;
2885
2886 if (*scan == NUL)
2887 {
2888 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2889 || rex.reg_line_lbr)
2890 break;
2891 reg_nextline();
2892 scan = rex.input;
2893 if (got_int)
2894 break;
2895 }
2896 else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1)
2897 {
2898 if (testval != 0)
2899 break;
2900 scan += l;
2901 }
2902 else if ((class_tab[*scan] & mask) == testval)
2903 ++scan;
2904 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2905 ++scan;
2906 else
2907 break;
2908 ++count;
2909 }
2910 break;
2911
2912 case NWHITE:
2913 case NWHITE + ADD_NL:
2914 mask = RI_WHITE;
2915 goto do_class;
2916 case DIGIT:
2917 case DIGIT + ADD_NL:
2918 testval = mask = RI_DIGIT;
2919 goto do_class;
2920 case NDIGIT:
2921 case NDIGIT + ADD_NL:
2922 mask = RI_DIGIT;
2923 goto do_class;
2924 case HEX:
2925 case HEX + ADD_NL:
2926 testval = mask = RI_HEX;
2927 goto do_class;
2928 case NHEX:
2929 case NHEX + ADD_NL:
2930 mask = RI_HEX;
2931 goto do_class;
2932 case OCTAL:
2933 case OCTAL + ADD_NL:
2934 testval = mask = RI_OCTAL;
2935 goto do_class;
2936 case NOCTAL:
2937 case NOCTAL + ADD_NL:
2938 mask = RI_OCTAL;
2939 goto do_class;
2940 case WORD:
2941 case WORD + ADD_NL:
2942 testval = mask = RI_WORD;
2943 goto do_class;
2944 case NWORD:
2945 case NWORD + ADD_NL:
2946 mask = RI_WORD;
2947 goto do_class;
2948 case HEAD:
2949 case HEAD + ADD_NL:
2950 testval = mask = RI_HEAD;
2951 goto do_class;
2952 case NHEAD:
2953 case NHEAD + ADD_NL:
2954 mask = RI_HEAD;
2955 goto do_class;
2956 case ALPHA:
2957 case ALPHA + ADD_NL:
2958 testval = mask = RI_ALPHA;
2959 goto do_class;
2960 case NALPHA:
2961 case NALPHA + ADD_NL:
2962 mask = RI_ALPHA;
2963 goto do_class;
2964 case LOWER:
2965 case LOWER + ADD_NL:
2966 testval = mask = RI_LOWER;
2967 goto do_class;
2968 case NLOWER:
2969 case NLOWER + ADD_NL:
2970 mask = RI_LOWER;
2971 goto do_class;
2972 case UPPER:
2973 case UPPER + ADD_NL:
2974 testval = mask = RI_UPPER;
2975 goto do_class;
2976 case NUPPER:
2977 case NUPPER + ADD_NL:
2978 mask = RI_UPPER;
2979 goto do_class;
2980
2981 case EXACTLY:
2982 {
2983 int cu, cl;
2984
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002985 // This doesn't do a multi-byte character, because a MULTIBYTECODE
2986 // would have been used for it. It does handle single-byte
2987 // characters, such as latin1.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002988 if (rex.reg_ic)
2989 {
2990 cu = MB_TOUPPER(*opnd);
2991 cl = MB_TOLOWER(*opnd);
2992 while (count < maxcount && (*scan == cu || *scan == cl))
2993 {
2994 count++;
2995 scan++;
2996 }
2997 }
2998 else
2999 {
3000 cu = *opnd;
3001 while (count < maxcount && *scan == cu)
3002 {
3003 count++;
3004 scan++;
3005 }
3006 }
3007 break;
3008 }
3009
3010 case MULTIBYTECODE:
3011 {
3012 int i, len, cf = 0;
3013
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003014 // Safety check (just in case 'encoding' was changed since
3015 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003016 if ((len = (*mb_ptr2len)(opnd)) > 1)
3017 {
3018 if (rex.reg_ic && enc_utf8)
3019 cf = utf_fold(utf_ptr2char(opnd));
3020 while (count < maxcount && (*mb_ptr2len)(scan) >= len)
3021 {
3022 for (i = 0; i < len; ++i)
3023 if (opnd[i] != scan[i])
3024 break;
3025 if (i < len && (!rex.reg_ic || !enc_utf8
3026 || utf_fold(utf_ptr2char(scan)) != cf))
3027 break;
3028 scan += len;
3029 ++count;
3030 }
3031 }
3032 }
3033 break;
3034
3035 case ANYOF:
3036 case ANYOF + ADD_NL:
3037 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003038 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003039
3040 case ANYBUT:
3041 case ANYBUT + ADD_NL:
3042 while (count < maxcount)
3043 {
3044 int len;
3045
3046 if (*scan == NUL)
3047 {
3048 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
3049 || rex.reg_line_lbr)
3050 break;
3051 reg_nextline();
3052 scan = rex.input;
3053 if (got_int)
3054 break;
3055 }
3056 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
3057 ++scan;
3058 else if (has_mbyte && (len = (*mb_ptr2len)(scan)) > 1)
3059 {
3060 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
3061 break;
3062 scan += len;
3063 }
3064 else
3065 {
3066 if ((cstrchr(opnd, *scan) == NULL) == testval)
3067 break;
3068 ++scan;
3069 }
3070 ++count;
3071 }
3072 break;
3073
3074 case NEWL:
3075 while (count < maxcount
3076 && ((*scan == NUL && rex.lnum <= rex.reg_maxline
3077 && !rex.reg_line_lbr && REG_MULTI)
3078 || (*scan == '\n' && rex.reg_line_lbr)))
3079 {
3080 count++;
3081 if (rex.reg_line_lbr)
3082 ADVANCE_REGINPUT();
3083 else
3084 reg_nextline();
3085 scan = rex.input;
3086 if (got_int)
3087 break;
3088 }
3089 break;
3090
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003091 default: // Oh dear. Called inappropriately.
Bram Moolenaare29a27f2021-07-20 21:07:36 +02003092 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003093#ifdef DEBUG
3094 printf("Called regrepeat with op code %d\n", OP(p));
3095#endif
3096 break;
3097 }
3098
3099 rex.input = scan;
3100
3101 return (int)count;
3102}
3103
3104/*
3105 * Push an item onto the regstack.
3106 * Returns pointer to new item. Returns NULL when out of memory.
3107 */
3108 static regitem_T *
3109regstack_push(regstate_T state, char_u *scan)
3110{
3111 regitem_T *rp;
3112
3113 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
3114 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00003115 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003116 return NULL;
3117 }
3118 if (ga_grow(&regstack, sizeof(regitem_T)) == FAIL)
3119 return NULL;
3120
3121 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
3122 rp->rs_state = state;
3123 rp->rs_scan = scan;
3124
3125 regstack.ga_len += sizeof(regitem_T);
3126 return rp;
3127}
3128
3129/*
3130 * Pop an item from the regstack.
3131 */
3132 static void
3133regstack_pop(char_u **scan)
3134{
3135 regitem_T *rp;
3136
3137 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
3138 *scan = rp->rs_scan;
3139
3140 regstack.ga_len -= sizeof(regitem_T);
3141}
3142
3143/*
3144 * Save the current subexpr to "bp", so that they can be restored
3145 * later by restore_subexpr().
3146 */
3147 static void
3148save_subexpr(regbehind_T *bp)
3149{
3150 int i;
3151
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003152 // When "rex.need_clear_subexpr" is set we don't need to save the values,
3153 // only remember that this flag needs to be set again when restoring.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003154 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
3155 if (!rex.need_clear_subexpr)
3156 {
3157 for (i = 0; i < NSUBEXP; ++i)
3158 {
3159 if (REG_MULTI)
3160 {
3161 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
3162 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
3163 }
3164 else
3165 {
3166 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
3167 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
3168 }
3169 }
3170 }
3171}
3172
3173/*
3174 * Restore the subexpr from "bp".
3175 */
3176 static void
3177restore_subexpr(regbehind_T *bp)
3178{
3179 int i;
3180
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003181 // Only need to restore saved values when they are not to be cleared.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003182 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
3183 if (!rex.need_clear_subexpr)
3184 {
3185 for (i = 0; i < NSUBEXP; ++i)
3186 {
3187 if (REG_MULTI)
3188 {
3189 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
3190 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
3191 }
3192 else
3193 {
3194 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
3195 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
3196 }
3197 }
3198 }
3199}
3200
3201/*
3202 * regmatch - main matching routine
3203 *
3204 * Conceptually the strategy is simple: Check to see whether the current node
3205 * matches, push an item onto the regstack and loop to see whether the rest
3206 * matches, and then act accordingly. In practice we make some effort to
3207 * avoid using the regstack, in particular by going through "ordinary" nodes
3208 * (that don't need to know whether the rest of the match failed) by a nested
3209 * loop.
3210 *
3211 * Returns TRUE when there is a match. Leaves rex.input and rex.lnum just after
3212 * the last matched character.
3213 * Returns FALSE when there is no match. Leaves rex.input and rex.lnum in an
3214 * undefined state!
3215 */
3216 static int
3217regmatch(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003218 char_u *scan, // Current node.
3219 proftime_T *tm UNUSED, // timeout limit or NULL
3220 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003221{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003222 char_u *next; // Next node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003223 int op;
3224 int c;
3225 regitem_T *rp;
3226 int no;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003227 int status; // one of the RA_ values:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003228#ifdef FEAT_RELTIME
3229 int tm_count = 0;
3230#endif
3231
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003232 // Make "regstack" and "backpos" empty. They are allocated and freed in
3233 // bt_regexec_both() to reduce malloc()/free() calls.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003234 regstack.ga_len = 0;
3235 backpos.ga_len = 0;
3236
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003237 // Repeat until "regstack" is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003238 for (;;)
3239 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003240 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3241 // Allow interrupting them with CTRL-C.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003242 fast_breakcheck();
3243
3244#ifdef DEBUG
3245 if (scan != NULL && regnarrate)
3246 {
3247 mch_errmsg((char *)regprop(scan));
3248 mch_errmsg("(\n");
3249 }
3250#endif
3251
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003252 // Repeat for items that can be matched sequentially, without using the
3253 // regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003254 for (;;)
3255 {
3256 if (got_int || scan == NULL)
3257 {
3258 status = RA_FAIL;
3259 break;
3260 }
3261#ifdef FEAT_RELTIME
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003262 // Check for timeout once in a 100 times to avoid overhead.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003263 if (tm != NULL && ++tm_count == 100)
3264 {
3265 tm_count = 0;
3266 if (profile_passed_limit(tm))
3267 {
3268 if (timed_out != NULL)
3269 *timed_out = TRUE;
3270 status = RA_FAIL;
3271 break;
3272 }
3273 }
3274#endif
3275 status = RA_CONT;
3276
3277#ifdef DEBUG
3278 if (regnarrate)
3279 {
3280 mch_errmsg((char *)regprop(scan));
3281 mch_errmsg("...\n");
3282# ifdef FEAT_SYN_HL
3283 if (re_extmatch_in != NULL)
3284 {
3285 int i;
3286
3287 mch_errmsg(_("External submatches:\n"));
3288 for (i = 0; i < NSUBEXP; i++)
3289 {
3290 mch_errmsg(" \"");
3291 if (re_extmatch_in->matches[i] != NULL)
3292 mch_errmsg((char *)re_extmatch_in->matches[i]);
3293 mch_errmsg("\"\n");
3294 }
3295 }
3296# endif
3297 }
3298#endif
3299 next = regnext(scan);
3300
3301 op = OP(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003302 // Check for character class with NL added.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003303 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
3304 && *rex.input == NUL && rex.lnum <= rex.reg_maxline)
3305 {
3306 reg_nextline();
3307 }
3308 else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n')
3309 {
3310 ADVANCE_REGINPUT();
3311 }
3312 else
3313 {
3314 if (WITH_NL(op))
3315 op -= ADD_NL;
3316 if (has_mbyte)
3317 c = (*mb_ptr2char)(rex.input);
3318 else
3319 c = *rex.input;
3320 switch (op)
3321 {
3322 case BOL:
3323 if (rex.input != rex.line)
3324 status = RA_NOMATCH;
3325 break;
3326
3327 case EOL:
3328 if (c != NUL)
3329 status = RA_NOMATCH;
3330 break;
3331
3332 case RE_BOF:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003333 // We're not at the beginning of the file when below the first
3334 // line where we started, not at the start of the line or we
3335 // didn't start at the first line of the buffer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003336 if (rex.lnum != 0 || rex.input != rex.line
3337 || (REG_MULTI && rex.reg_firstlnum > 1))
3338 status = RA_NOMATCH;
3339 break;
3340
3341 case RE_EOF:
3342 if (rex.lnum != rex.reg_maxline || c != NUL)
3343 status = RA_NOMATCH;
3344 break;
3345
3346 case CURSOR:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003347 // Check if the buffer is in a window and compare the
3348 // rex.reg_win->w_cursor position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003349 if (rex.reg_win == NULL
3350 || (rex.lnum + rex.reg_firstlnum
3351 != rex.reg_win->w_cursor.lnum)
3352 || ((colnr_T)(rex.input - rex.line)
3353 != rex.reg_win->w_cursor.col))
3354 status = RA_NOMATCH;
3355 break;
3356
3357 case RE_MARK:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003358 // Compare the mark position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003359 {
3360 int mark = OPERAND(scan)[0];
3361 int cmp = OPERAND(scan)[1];
3362 pos_T *pos;
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003363 size_t col = REG_MULTI ? rex.input - rex.line : 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003364
3365 pos = getmark_buf(rex.reg_buf, mark, FALSE);
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003366
3367 // Line may have been freed, get it again.
3368 if (REG_MULTI)
3369 {
3370 rex.line = reg_getline(rex.lnum);
3371 rex.input = rex.line + col;
3372 }
3373
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003374 if (pos == NULL // mark doesn't exist
Bram Moolenaar872bee52021-05-24 22:56:15 +02003375 || pos->lnum <= 0) // mark isn't set in reg_buf
3376 {
3377 status = RA_NOMATCH;
3378 }
3379 else
3380 {
3381 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3382 && pos->col == MAXCOL
3383 ? (colnr_T)STRLEN(reg_getline(
3384 pos->lnum - rex.reg_firstlnum))
3385 : pos->col;
3386
3387 if ((pos->lnum == rex.lnum + rex.reg_firstlnum
3388 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003389 ? (cmp == '<' || cmp == '>')
Bram Moolenaar872bee52021-05-24 22:56:15 +02003390 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003391 ? cmp != '>'
3392 : cmp != '<'))
3393 : (pos->lnum < rex.lnum + rex.reg_firstlnum
3394 ? cmp != '>'
3395 : cmp != '<')))
3396 status = RA_NOMATCH;
Bram Moolenaar872bee52021-05-24 22:56:15 +02003397 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003398 }
3399 break;
3400
3401 case RE_VISUAL:
3402 if (!reg_match_visual())
3403 status = RA_NOMATCH;
3404 break;
3405
3406 case RE_LNUM:
3407 if (!REG_MULTI || !re_num_cmp((long_u)(rex.lnum + rex.reg_firstlnum),
3408 scan))
3409 status = RA_NOMATCH;
3410 break;
3411
3412 case RE_COL:
3413 if (!re_num_cmp((long_u)(rex.input - rex.line) + 1, scan))
3414 status = RA_NOMATCH;
3415 break;
3416
3417 case RE_VCOL:
3418 if (!re_num_cmp((long_u)win_linetabsize(
3419 rex.reg_win == NULL ? curwin : rex.reg_win,
3420 rex.line, (colnr_T)(rex.input - rex.line)) + 1, scan))
3421 status = RA_NOMATCH;
3422 break;
3423
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003424 case BOW: // \<word; rex.input points to w
3425 if (c == NUL) // Can't match at end of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003426 status = RA_NOMATCH;
3427 else if (has_mbyte)
3428 {
3429 int this_class;
3430
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003431 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003432 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3433 if (this_class <= 1)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003434 status = RA_NOMATCH; // not on a word at all
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003435 else if (reg_prev_class() == this_class)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003436 status = RA_NOMATCH; // previous char is in same word
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003437 }
3438 else
3439 {
3440 if (!vim_iswordc_buf(c, rex.reg_buf) || (rex.input > rex.line
3441 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
3442 status = RA_NOMATCH;
3443 }
3444 break;
3445
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003446 case EOW: // word\>; rex.input points after d
3447 if (rex.input == rex.line) // Can't match at start of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003448 status = RA_NOMATCH;
3449 else if (has_mbyte)
3450 {
3451 int this_class, prev_class;
3452
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003453 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003454 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3455 prev_class = reg_prev_class();
3456 if (this_class == prev_class
3457 || prev_class == 0 || prev_class == 1)
3458 status = RA_NOMATCH;
3459 }
3460 else
3461 {
3462 if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
3463 || (rex.input[0] != NUL
3464 && vim_iswordc_buf(c, rex.reg_buf)))
3465 status = RA_NOMATCH;
3466 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003467 break; // Matched with EOW
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003468
3469 case ANY:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003470 // ANY does not match new lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003471 if (c == NUL)
3472 status = RA_NOMATCH;
3473 else
3474 ADVANCE_REGINPUT();
3475 break;
3476
3477 case IDENT:
3478 if (!vim_isIDc(c))
3479 status = RA_NOMATCH;
3480 else
3481 ADVANCE_REGINPUT();
3482 break;
3483
3484 case SIDENT:
3485 if (VIM_ISDIGIT(*rex.input) || !vim_isIDc(c))
3486 status = RA_NOMATCH;
3487 else
3488 ADVANCE_REGINPUT();
3489 break;
3490
3491 case KWORD:
3492 if (!vim_iswordp_buf(rex.input, rex.reg_buf))
3493 status = RA_NOMATCH;
3494 else
3495 ADVANCE_REGINPUT();
3496 break;
3497
3498 case SKWORD:
3499 if (VIM_ISDIGIT(*rex.input)
3500 || !vim_iswordp_buf(rex.input, rex.reg_buf))
3501 status = RA_NOMATCH;
3502 else
3503 ADVANCE_REGINPUT();
3504 break;
3505
3506 case FNAME:
3507 if (!vim_isfilec(c))
3508 status = RA_NOMATCH;
3509 else
3510 ADVANCE_REGINPUT();
3511 break;
3512
3513 case SFNAME:
3514 if (VIM_ISDIGIT(*rex.input) || !vim_isfilec(c))
3515 status = RA_NOMATCH;
3516 else
3517 ADVANCE_REGINPUT();
3518 break;
3519
3520 case PRINT:
3521 if (!vim_isprintc(PTR2CHAR(rex.input)))
3522 status = RA_NOMATCH;
3523 else
3524 ADVANCE_REGINPUT();
3525 break;
3526
3527 case SPRINT:
3528 if (VIM_ISDIGIT(*rex.input) || !vim_isprintc(PTR2CHAR(rex.input)))
3529 status = RA_NOMATCH;
3530 else
3531 ADVANCE_REGINPUT();
3532 break;
3533
3534 case WHITE:
3535 if (!VIM_ISWHITE(c))
3536 status = RA_NOMATCH;
3537 else
3538 ADVANCE_REGINPUT();
3539 break;
3540
3541 case NWHITE:
3542 if (c == NUL || VIM_ISWHITE(c))
3543 status = RA_NOMATCH;
3544 else
3545 ADVANCE_REGINPUT();
3546 break;
3547
3548 case DIGIT:
3549 if (!ri_digit(c))
3550 status = RA_NOMATCH;
3551 else
3552 ADVANCE_REGINPUT();
3553 break;
3554
3555 case NDIGIT:
3556 if (c == NUL || ri_digit(c))
3557 status = RA_NOMATCH;
3558 else
3559 ADVANCE_REGINPUT();
3560 break;
3561
3562 case HEX:
3563 if (!ri_hex(c))
3564 status = RA_NOMATCH;
3565 else
3566 ADVANCE_REGINPUT();
3567 break;
3568
3569 case NHEX:
3570 if (c == NUL || ri_hex(c))
3571 status = RA_NOMATCH;
3572 else
3573 ADVANCE_REGINPUT();
3574 break;
3575
3576 case OCTAL:
3577 if (!ri_octal(c))
3578 status = RA_NOMATCH;
3579 else
3580 ADVANCE_REGINPUT();
3581 break;
3582
3583 case NOCTAL:
3584 if (c == NUL || ri_octal(c))
3585 status = RA_NOMATCH;
3586 else
3587 ADVANCE_REGINPUT();
3588 break;
3589
3590 case WORD:
3591 if (!ri_word(c))
3592 status = RA_NOMATCH;
3593 else
3594 ADVANCE_REGINPUT();
3595 break;
3596
3597 case NWORD:
3598 if (c == NUL || ri_word(c))
3599 status = RA_NOMATCH;
3600 else
3601 ADVANCE_REGINPUT();
3602 break;
3603
3604 case HEAD:
3605 if (!ri_head(c))
3606 status = RA_NOMATCH;
3607 else
3608 ADVANCE_REGINPUT();
3609 break;
3610
3611 case NHEAD:
3612 if (c == NUL || ri_head(c))
3613 status = RA_NOMATCH;
3614 else
3615 ADVANCE_REGINPUT();
3616 break;
3617
3618 case ALPHA:
3619 if (!ri_alpha(c))
3620 status = RA_NOMATCH;
3621 else
3622 ADVANCE_REGINPUT();
3623 break;
3624
3625 case NALPHA:
3626 if (c == NUL || ri_alpha(c))
3627 status = RA_NOMATCH;
3628 else
3629 ADVANCE_REGINPUT();
3630 break;
3631
3632 case LOWER:
3633 if (!ri_lower(c))
3634 status = RA_NOMATCH;
3635 else
3636 ADVANCE_REGINPUT();
3637 break;
3638
3639 case NLOWER:
3640 if (c == NUL || ri_lower(c))
3641 status = RA_NOMATCH;
3642 else
3643 ADVANCE_REGINPUT();
3644 break;
3645
3646 case UPPER:
3647 if (!ri_upper(c))
3648 status = RA_NOMATCH;
3649 else
3650 ADVANCE_REGINPUT();
3651 break;
3652
3653 case NUPPER:
3654 if (c == NUL || ri_upper(c))
3655 status = RA_NOMATCH;
3656 else
3657 ADVANCE_REGINPUT();
3658 break;
3659
3660 case EXACTLY:
3661 {
3662 int len;
3663 char_u *opnd;
3664
3665 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003666 // Inline the first byte, for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003667 if (*opnd != *rex.input
3668 && (!rex.reg_ic
3669 || (!enc_utf8
3670 && MB_TOLOWER(*opnd) != MB_TOLOWER(*rex.input))))
3671 status = RA_NOMATCH;
3672 else if (*opnd == NUL)
3673 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003674 // match empty string always works; happens when "~" is
3675 // empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003676 }
3677 else
3678 {
3679 if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic))
3680 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003681 len = 1; // matched a single byte above
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003682 }
3683 else
3684 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003685 // Need to match first byte again for multi-byte.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003686 len = (int)STRLEN(opnd);
3687 if (cstrncmp(opnd, rex.input, &len) != 0)
3688 status = RA_NOMATCH;
3689 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003690 // Check for following composing character, unless %C
3691 // follows (skips over all composing chars).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003692 if (status != RA_NOMATCH
3693 && enc_utf8
3694 && UTF_COMPOSINGLIKE(rex.input, rex.input + len)
3695 && !rex.reg_icombine
3696 && OP(next) != RE_COMPOSING)
3697 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003698 // raaron: This code makes a composing character get
3699 // ignored, which is the correct behavior (sometimes)
3700 // for voweled Hebrew texts.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003701 status = RA_NOMATCH;
3702 }
3703 if (status != RA_NOMATCH)
3704 rex.input += len;
3705 }
3706 }
3707 break;
3708
3709 case ANYOF:
3710 case ANYBUT:
3711 if (c == NUL)
3712 status = RA_NOMATCH;
3713 else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3714 status = RA_NOMATCH;
3715 else
3716 ADVANCE_REGINPUT();
3717 break;
3718
3719 case MULTIBYTECODE:
3720 if (has_mbyte)
3721 {
3722 int i, len;
3723 char_u *opnd;
3724 int opndc = 0, inpc;
3725
3726 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003727 // Safety check (just in case 'encoding' was changed since
3728 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003729 if ((len = (*mb_ptr2len)(opnd)) < 2)
3730 {
3731 status = RA_NOMATCH;
3732 break;
3733 }
3734 if (enc_utf8)
3735 opndc = utf_ptr2char(opnd);
3736 if (enc_utf8 && utf_iscomposing(opndc))
3737 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003738 // When only a composing char is given match at any
3739 // position where that composing char appears.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003740 status = RA_NOMATCH;
3741 for (i = 0; rex.input[i] != NUL;
3742 i += utf_ptr2len(rex.input + i))
3743 {
3744 inpc = utf_ptr2char(rex.input + i);
3745 if (!utf_iscomposing(inpc))
3746 {
3747 if (i > 0)
3748 break;
3749 }
3750 else if (opndc == inpc)
3751 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003752 // Include all following composing chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003753 len = i + utfc_ptr2len(rex.input + i);
3754 status = RA_MATCH;
3755 break;
3756 }
3757 }
3758 }
3759 else
3760 for (i = 0; i < len; ++i)
3761 if (opnd[i] != rex.input[i])
3762 {
3763 status = RA_NOMATCH;
3764 break;
3765 }
3766 rex.input += len;
3767 }
3768 else
3769 status = RA_NOMATCH;
3770 break;
3771 case RE_COMPOSING:
3772 if (enc_utf8)
3773 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003774 // Skip composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003775 while (utf_iscomposing(utf_ptr2char(rex.input)))
3776 MB_CPTR_ADV(rex.input);
3777 }
3778 break;
3779
3780 case NOTHING:
3781 break;
3782
3783 case BACK:
3784 {
3785 int i;
3786 backpos_T *bp;
3787
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003788 // When we run into BACK we need to check if we don't keep
3789 // looping without matching any input. The second and later
3790 // times a BACK is encountered it fails if the input is still
3791 // at the same position as the previous time.
3792 // The positions are stored in "backpos" and found by the
3793 // current value of "scan", the position in the RE program.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003794 bp = (backpos_T *)backpos.ga_data;
3795 for (i = 0; i < backpos.ga_len; ++i)
3796 if (bp[i].bp_scan == scan)
3797 break;
3798 if (i == backpos.ga_len)
3799 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003800 // First time at this BACK, make room to store the pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003801 if (ga_grow(&backpos, 1) == FAIL)
3802 status = RA_FAIL;
3803 else
3804 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003805 // get "ga_data" again, it may have changed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003806 bp = (backpos_T *)backpos.ga_data;
3807 bp[i].bp_scan = scan;
3808 ++backpos.ga_len;
3809 }
3810 }
3811 else if (reg_save_equal(&bp[i].bp_pos))
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003812 // Still at same position as last time, fail.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003813 status = RA_NOMATCH;
3814
3815 if (status != RA_FAIL && status != RA_NOMATCH)
3816 reg_save(&bp[i].bp_pos, &backpos);
3817 }
3818 break;
3819
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003820 case MOPEN + 0: // Match start: \zs
3821 case MOPEN + 1: // \(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003822 case MOPEN + 2:
3823 case MOPEN + 3:
3824 case MOPEN + 4:
3825 case MOPEN + 5:
3826 case MOPEN + 6:
3827 case MOPEN + 7:
3828 case MOPEN + 8:
3829 case MOPEN + 9:
3830 {
3831 no = op - MOPEN;
3832 cleanup_subexpr();
3833 rp = regstack_push(RS_MOPEN, scan);
3834 if (rp == NULL)
3835 status = RA_FAIL;
3836 else
3837 {
3838 rp->rs_no = no;
3839 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
3840 &rex.reg_startp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003841 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003842 }
3843 }
3844 break;
3845
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003846 case NOPEN: // \%(
3847 case NCLOSE: // \) after \%(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003848 if (regstack_push(RS_NOPEN, scan) == NULL)
3849 status = RA_FAIL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003850 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003851 break;
3852
3853#ifdef FEAT_SYN_HL
3854 case ZOPEN + 1:
3855 case ZOPEN + 2:
3856 case ZOPEN + 3:
3857 case ZOPEN + 4:
3858 case ZOPEN + 5:
3859 case ZOPEN + 6:
3860 case ZOPEN + 7:
3861 case ZOPEN + 8:
3862 case ZOPEN + 9:
3863 {
3864 no = op - ZOPEN;
3865 cleanup_zsubexpr();
3866 rp = regstack_push(RS_ZOPEN, scan);
3867 if (rp == NULL)
3868 status = RA_FAIL;
3869 else
3870 {
3871 rp->rs_no = no;
3872 save_se(&rp->rs_un.sesave, &reg_startzpos[no],
3873 &reg_startzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003874 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003875 }
3876 }
3877 break;
3878#endif
3879
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003880 case MCLOSE + 0: // Match end: \ze
3881 case MCLOSE + 1: // \)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003882 case MCLOSE + 2:
3883 case MCLOSE + 3:
3884 case MCLOSE + 4:
3885 case MCLOSE + 5:
3886 case MCLOSE + 6:
3887 case MCLOSE + 7:
3888 case MCLOSE + 8:
3889 case MCLOSE + 9:
3890 {
3891 no = op - MCLOSE;
3892 cleanup_subexpr();
3893 rp = regstack_push(RS_MCLOSE, scan);
3894 if (rp == NULL)
3895 status = RA_FAIL;
3896 else
3897 {
3898 rp->rs_no = no;
3899 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no],
3900 &rex.reg_endp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003901 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003902 }
3903 }
3904 break;
3905
3906#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003907 case ZCLOSE + 1: // \) after \z(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003908 case ZCLOSE + 2:
3909 case ZCLOSE + 3:
3910 case ZCLOSE + 4:
3911 case ZCLOSE + 5:
3912 case ZCLOSE + 6:
3913 case ZCLOSE + 7:
3914 case ZCLOSE + 8:
3915 case ZCLOSE + 9:
3916 {
3917 no = op - ZCLOSE;
3918 cleanup_zsubexpr();
3919 rp = regstack_push(RS_ZCLOSE, scan);
3920 if (rp == NULL)
3921 status = RA_FAIL;
3922 else
3923 {
3924 rp->rs_no = no;
3925 save_se(&rp->rs_un.sesave, &reg_endzpos[no],
3926 &reg_endzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003927 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003928 }
3929 }
3930 break;
3931#endif
3932
3933 case BACKREF + 1:
3934 case BACKREF + 2:
3935 case BACKREF + 3:
3936 case BACKREF + 4:
3937 case BACKREF + 5:
3938 case BACKREF + 6:
3939 case BACKREF + 7:
3940 case BACKREF + 8:
3941 case BACKREF + 9:
3942 {
3943 int len;
3944
3945 no = op - BACKREF;
3946 cleanup_subexpr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003947 if (!REG_MULTI) // Single-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003948 {
3949 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL)
3950 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003951 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003952 len = 0;
3953 }
3954 else
3955 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003956 // Compare current input with back-ref in the same
3957 // line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003958 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
3959 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0)
3960 status = RA_NOMATCH;
3961 }
3962 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003963 else // Multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003964 {
3965 if (rex.reg_startpos[no].lnum < 0
3966 || rex.reg_endpos[no].lnum < 0)
3967 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003968 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003969 len = 0;
3970 }
3971 else
3972 {
3973 if (rex.reg_startpos[no].lnum == rex.lnum
3974 && rex.reg_endpos[no].lnum == rex.lnum)
3975 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003976 // Compare back-ref within the current line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003977 len = rex.reg_endpos[no].col
3978 - rex.reg_startpos[no].col;
3979 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
3980 rex.input, &len) != 0)
3981 status = RA_NOMATCH;
3982 }
3983 else
3984 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003985 // Messy situation: Need to compare between two
3986 // lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003987 int r = match_with_backref(
3988 rex.reg_startpos[no].lnum,
3989 rex.reg_startpos[no].col,
3990 rex.reg_endpos[no].lnum,
3991 rex.reg_endpos[no].col,
3992 &len);
3993
3994 if (r != RA_MATCH)
3995 status = r;
3996 }
3997 }
3998 }
3999
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004000 // Matched the backref, skip over it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004001 rex.input += len;
4002 }
4003 break;
4004
4005#ifdef FEAT_SYN_HL
4006 case ZREF + 1:
4007 case ZREF + 2:
4008 case ZREF + 3:
4009 case ZREF + 4:
4010 case ZREF + 5:
4011 case ZREF + 6:
4012 case ZREF + 7:
4013 case ZREF + 8:
4014 case ZREF + 9:
4015 {
4016 int len;
4017
4018 cleanup_zsubexpr();
4019 no = op - ZREF;
4020 if (re_extmatch_in != NULL
4021 && re_extmatch_in->matches[no] != NULL)
4022 {
4023 len = (int)STRLEN(re_extmatch_in->matches[no]);
4024 if (cstrncmp(re_extmatch_in->matches[no],
4025 rex.input, &len) != 0)
4026 status = RA_NOMATCH;
4027 else
4028 rex.input += len;
4029 }
4030 else
4031 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004032 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004033 }
4034 }
4035 break;
4036#endif
4037
4038 case BRANCH:
4039 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004040 if (OP(next) != BRANCH) // No choice.
4041 next = OPERAND(scan); // Avoid recursion.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004042 else
4043 {
4044 rp = regstack_push(RS_BRANCH, scan);
4045 if (rp == NULL)
4046 status = RA_FAIL;
4047 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004048 status = RA_BREAK; // rest is below
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004049 }
4050 }
4051 break;
4052
4053 case BRACE_LIMITS:
4054 {
4055 if (OP(next) == BRACE_SIMPLE)
4056 {
4057 bl_minval = OPERAND_MIN(scan);
4058 bl_maxval = OPERAND_MAX(scan);
4059 }
4060 else if (OP(next) >= BRACE_COMPLEX
4061 && OP(next) < BRACE_COMPLEX + 10)
4062 {
4063 no = OP(next) - BRACE_COMPLEX;
4064 brace_min[no] = OPERAND_MIN(scan);
4065 brace_max[no] = OPERAND_MAX(scan);
4066 brace_count[no] = 0;
4067 }
4068 else
4069 {
4070 internal_error("BRACE_LIMITS");
4071 status = RA_FAIL;
4072 }
4073 }
4074 break;
4075
4076 case BRACE_COMPLEX + 0:
4077 case BRACE_COMPLEX + 1:
4078 case BRACE_COMPLEX + 2:
4079 case BRACE_COMPLEX + 3:
4080 case BRACE_COMPLEX + 4:
4081 case BRACE_COMPLEX + 5:
4082 case BRACE_COMPLEX + 6:
4083 case BRACE_COMPLEX + 7:
4084 case BRACE_COMPLEX + 8:
4085 case BRACE_COMPLEX + 9:
4086 {
4087 no = op - BRACE_COMPLEX;
4088 ++brace_count[no];
4089
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004090 // If not matched enough times yet, try one more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004091 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4092 ? brace_min[no] : brace_max[no]))
4093 {
4094 rp = regstack_push(RS_BRCPLX_MORE, scan);
4095 if (rp == NULL)
4096 status = RA_FAIL;
4097 else
4098 {
4099 rp->rs_no = no;
4100 reg_save(&rp->rs_un.regsave, &backpos);
4101 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004102 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004103 }
4104 break;
4105 }
4106
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004107 // If matched enough times, may try matching some more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004108 if (brace_min[no] <= brace_max[no])
4109 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004110 // Range is the normal way around, use longest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004111 if (brace_count[no] <= brace_max[no])
4112 {
4113 rp = regstack_push(RS_BRCPLX_LONG, scan);
4114 if (rp == NULL)
4115 status = RA_FAIL;
4116 else
4117 {
4118 rp->rs_no = no;
4119 reg_save(&rp->rs_un.regsave, &backpos);
4120 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004121 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004122 }
4123 }
4124 }
4125 else
4126 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004127 // Range is backwards, use shortest match first
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004128 if (brace_count[no] <= brace_min[no])
4129 {
4130 rp = regstack_push(RS_BRCPLX_SHORT, scan);
4131 if (rp == NULL)
4132 status = RA_FAIL;
4133 else
4134 {
4135 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004136 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004137 }
4138 }
4139 }
4140 }
4141 break;
4142
4143 case BRACE_SIMPLE:
4144 case STAR:
4145 case PLUS:
4146 {
4147 regstar_T rst;
4148
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004149 // Lookahead to avoid useless match attempts when we know
4150 // what character comes next.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004151 if (OP(next) == EXACTLY)
4152 {
4153 rst.nextb = *OPERAND(next);
4154 if (rex.reg_ic)
4155 {
4156 if (MB_ISUPPER(rst.nextb))
4157 rst.nextb_ic = MB_TOLOWER(rst.nextb);
4158 else
4159 rst.nextb_ic = MB_TOUPPER(rst.nextb);
4160 }
4161 else
4162 rst.nextb_ic = rst.nextb;
4163 }
4164 else
4165 {
4166 rst.nextb = NUL;
4167 rst.nextb_ic = NUL;
4168 }
4169 if (op != BRACE_SIMPLE)
4170 {
4171 rst.minval = (op == STAR) ? 0 : 1;
4172 rst.maxval = MAX_LIMIT;
4173 }
4174 else
4175 {
4176 rst.minval = bl_minval;
4177 rst.maxval = bl_maxval;
4178 }
4179
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004180 // When maxval > minval, try matching as much as possible, up
4181 // to maxval. When maxval < minval, try matching at least the
4182 // minimal number (since the range is backwards, that's also
4183 // maxval!).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004184 rst.count = regrepeat(OPERAND(scan), rst.maxval);
4185 if (got_int)
4186 {
4187 status = RA_FAIL;
4188 break;
4189 }
4190 if (rst.minval <= rst.maxval
4191 ? rst.count >= rst.minval : rst.count >= rst.maxval)
4192 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004193 // It could match. Prepare for trying to match what
4194 // follows. The code is below. Parameters are stored in
4195 // a regstar_T on the regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004196 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4197 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004198 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004199 status = RA_FAIL;
4200 }
4201 else if (ga_grow(&regstack, sizeof(regstar_T)) == FAIL)
4202 status = RA_FAIL;
4203 else
4204 {
4205 regstack.ga_len += sizeof(regstar_T);
4206 rp = regstack_push(rst.minval <= rst.maxval
4207 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4208 if (rp == NULL)
4209 status = RA_FAIL;
4210 else
4211 {
4212 *(((regstar_T *)rp) - 1) = rst;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004213 status = RA_BREAK; // skip the restore bits
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004214 }
4215 }
4216 }
4217 else
4218 status = RA_NOMATCH;
4219
4220 }
4221 break;
4222
4223 case NOMATCH:
4224 case MATCH:
4225 case SUBPAT:
4226 rp = regstack_push(RS_NOMATCH, scan);
4227 if (rp == NULL)
4228 status = RA_FAIL;
4229 else
4230 {
4231 rp->rs_no = op;
4232 reg_save(&rp->rs_un.regsave, &backpos);
4233 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004234 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004235 }
4236 break;
4237
4238 case BEHIND:
4239 case NOBEHIND:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004240 // Need a bit of room to store extra positions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004241 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4242 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004243 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004244 status = RA_FAIL;
4245 }
4246 else if (ga_grow(&regstack, sizeof(regbehind_T)) == FAIL)
4247 status = RA_FAIL;
4248 else
4249 {
4250 regstack.ga_len += sizeof(regbehind_T);
4251 rp = regstack_push(RS_BEHIND1, scan);
4252 if (rp == NULL)
4253 status = RA_FAIL;
4254 else
4255 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004256 // Need to save the subexpr to be able to restore them
4257 // when there is a match but we don't use it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004258 save_subexpr(((regbehind_T *)rp) - 1);
4259
4260 rp->rs_no = op;
4261 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004262 // First try if what follows matches. If it does then we
4263 // check the behind match by looping.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004264 }
4265 }
4266 break;
4267
4268 case BHPOS:
4269 if (REG_MULTI)
4270 {
4271 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4272 || behind_pos.rs_u.pos.lnum != rex.lnum)
4273 status = RA_NOMATCH;
4274 }
4275 else if (behind_pos.rs_u.ptr != rex.input)
4276 status = RA_NOMATCH;
4277 break;
4278
4279 case NEWL:
4280 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4281 || rex.reg_line_lbr)
4282 && (c != '\n' || !rex.reg_line_lbr))
4283 status = RA_NOMATCH;
4284 else if (rex.reg_line_lbr)
4285 ADVANCE_REGINPUT();
4286 else
4287 reg_nextline();
4288 break;
4289
4290 case END:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004291 status = RA_MATCH; // Success!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004292 break;
4293
4294 default:
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004295 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004296#ifdef DEBUG
4297 printf("Illegal op code %d\n", op);
4298#endif
4299 status = RA_FAIL;
4300 break;
4301 }
4302 }
4303
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004304 // If we can't continue sequentially, break the inner loop.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004305 if (status != RA_CONT)
4306 break;
4307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004308 // Continue in inner loop, advance to next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004309 scan = next;
4310
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004311 } // end of inner loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004312
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004313 // If there is something on the regstack execute the code for the state.
4314 // If the state is popped then loop and use the older state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004315 while (regstack.ga_len > 0 && status != RA_FAIL)
4316 {
4317 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4318 switch (rp->rs_state)
4319 {
4320 case RS_NOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004321 // Result is passed on as-is, simply pop the state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004322 regstack_pop(&scan);
4323 break;
4324
4325 case RS_MOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004326 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004327 if (status == RA_NOMATCH)
4328 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4329 &rex.reg_startp[rp->rs_no]);
4330 regstack_pop(&scan);
4331 break;
4332
4333#ifdef FEAT_SYN_HL
4334 case RS_ZOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004335 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004336 if (status == RA_NOMATCH)
4337 restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4338 &reg_startzp[rp->rs_no]);
4339 regstack_pop(&scan);
4340 break;
4341#endif
4342
4343 case RS_MCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004344 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004345 if (status == RA_NOMATCH)
4346 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4347 &rex.reg_endp[rp->rs_no]);
4348 regstack_pop(&scan);
4349 break;
4350
4351#ifdef FEAT_SYN_HL
4352 case RS_ZCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004353 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004354 if (status == RA_NOMATCH)
4355 restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4356 &reg_endzp[rp->rs_no]);
4357 regstack_pop(&scan);
4358 break;
4359#endif
4360
4361 case RS_BRANCH:
4362 if (status == RA_MATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004363 // this branch matched, use it
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004364 regstack_pop(&scan);
4365 else
4366 {
4367 if (status != RA_BREAK)
4368 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004369 // After a non-matching branch: try next one.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004370 reg_restore(&rp->rs_un.regsave, &backpos);
4371 scan = rp->rs_scan;
4372 }
4373 if (scan == NULL || OP(scan) != BRANCH)
4374 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004375 // no more branches, didn't find a match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004376 status = RA_NOMATCH;
4377 regstack_pop(&scan);
4378 }
4379 else
4380 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004381 // Prepare to try a branch.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004382 rp->rs_scan = regnext(scan);
4383 reg_save(&rp->rs_un.regsave, &backpos);
4384 scan = OPERAND(scan);
4385 }
4386 }
4387 break;
4388
4389 case RS_BRCPLX_MORE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004390 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004391 if (status == RA_NOMATCH)
4392 {
4393 reg_restore(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004394 --brace_count[rp->rs_no]; // decrement match count
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004395 }
4396 regstack_pop(&scan);
4397 break;
4398
4399 case RS_BRCPLX_LONG:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004400 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004401 if (status == RA_NOMATCH)
4402 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004403 // There was no match, but we did find enough matches.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004404 reg_restore(&rp->rs_un.regsave, &backpos);
4405 --brace_count[rp->rs_no];
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004406 // continue with the items after "\{}"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004407 status = RA_CONT;
4408 }
4409 regstack_pop(&scan);
4410 if (status == RA_CONT)
4411 scan = regnext(scan);
4412 break;
4413
4414 case RS_BRCPLX_SHORT:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004415 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004416 if (status == RA_NOMATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004417 // There was no match, try to match one more item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004418 reg_restore(&rp->rs_un.regsave, &backpos);
4419 regstack_pop(&scan);
4420 if (status == RA_NOMATCH)
4421 {
4422 scan = OPERAND(scan);
4423 status = RA_CONT;
4424 }
4425 break;
4426
4427 case RS_NOMATCH:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004428 // Pop the state. If the operand matches for NOMATCH or
4429 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4430 // except for SUBPAT, and continue with the next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004431 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4432 status = RA_NOMATCH;
4433 else
4434 {
4435 status = RA_CONT;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004436 if (rp->rs_no != SUBPAT) // zero-width
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004437 reg_restore(&rp->rs_un.regsave, &backpos);
4438 }
4439 regstack_pop(&scan);
4440 if (status == RA_CONT)
4441 scan = regnext(scan);
4442 break;
4443
4444 case RS_BEHIND1:
4445 if (status == RA_NOMATCH)
4446 {
4447 regstack_pop(&scan);
4448 regstack.ga_len -= sizeof(regbehind_T);
4449 }
4450 else
4451 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004452 // The stuff after BEHIND/NOBEHIND matches. Now try if
4453 // the behind part does (not) match before the current
4454 // position in the input. This must be done at every
4455 // position in the input and checking if the match ends at
4456 // the current position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004457
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004458 // save the position after the found match for next
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004459 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4460
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004461 // Start looking for a match with operand at the current
4462 // position. Go back one character until we find the
4463 // result, hitting the start of the line or the previous
4464 // line (for multi-line matching).
4465 // Set behind_pos to where the match should end, BHPOS
4466 // will match it. Save the current value.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004467 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4468 behind_pos = rp->rs_un.regsave;
4469
4470 rp->rs_state = RS_BEHIND2;
4471
4472 reg_restore(&rp->rs_un.regsave, &backpos);
4473 scan = OPERAND(rp->rs_scan) + 4;
4474 }
4475 break;
4476
4477 case RS_BEHIND2:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004478 // Looping for BEHIND / NOBEHIND match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004479 if (status == RA_MATCH && reg_save_equal(&behind_pos))
4480 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004481 // found a match that ends where "next" started
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004482 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4483 if (rp->rs_no == BEHIND)
4484 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4485 &backpos);
4486 else
4487 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004488 // But we didn't want a match. Need to restore the
4489 // subexpr, because what follows matched, so they have
4490 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004491 status = RA_NOMATCH;
4492 restore_subexpr(((regbehind_T *)rp) - 1);
4493 }
4494 regstack_pop(&scan);
4495 regstack.ga_len -= sizeof(regbehind_T);
4496 }
4497 else
4498 {
4499 long limit;
4500
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004501 // No match or a match that doesn't end where we want it: Go
4502 // back one character. May go to previous line once.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004503 no = OK;
4504 limit = OPERAND_MIN(rp->rs_scan);
4505 if (REG_MULTI)
4506 {
4507 if (limit > 0
4508 && ((rp->rs_un.regsave.rs_u.pos.lnum
4509 < behind_pos.rs_u.pos.lnum
4510 ? (colnr_T)STRLEN(rex.line)
4511 : behind_pos.rs_u.pos.col)
4512 - rp->rs_un.regsave.rs_u.pos.col >= limit))
4513 no = FAIL;
4514 else if (rp->rs_un.regsave.rs_u.pos.col == 0)
4515 {
4516 if (rp->rs_un.regsave.rs_u.pos.lnum
4517 < behind_pos.rs_u.pos.lnum
4518 || reg_getline(
4519 --rp->rs_un.regsave.rs_u.pos.lnum)
4520 == NULL)
4521 no = FAIL;
4522 else
4523 {
4524 reg_restore(&rp->rs_un.regsave, &backpos);
4525 rp->rs_un.regsave.rs_u.pos.col =
4526 (colnr_T)STRLEN(rex.line);
4527 }
4528 }
4529 else
4530 {
4531 if (has_mbyte)
4532 {
4533 char_u *line =
4534 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
4535
4536 rp->rs_un.regsave.rs_u.pos.col -=
4537 (*mb_head_off)(line, line
4538 + rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
4539 }
4540 else
4541 --rp->rs_un.regsave.rs_u.pos.col;
4542 }
4543 }
4544 else
4545 {
4546 if (rp->rs_un.regsave.rs_u.ptr == rex.line)
4547 no = FAIL;
4548 else
4549 {
4550 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
4551 if (limit > 0 && (long)(behind_pos.rs_u.ptr
4552 - rp->rs_un.regsave.rs_u.ptr) > limit)
4553 no = FAIL;
4554 }
4555 }
4556 if (no == OK)
4557 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004558 // Advanced, prepare for finding match again.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004559 reg_restore(&rp->rs_un.regsave, &backpos);
4560 scan = OPERAND(rp->rs_scan) + 4;
4561 if (status == RA_MATCH)
4562 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004563 // We did match, so subexpr may have been changed,
4564 // need to restore them for the next try.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004565 status = RA_NOMATCH;
4566 restore_subexpr(((regbehind_T *)rp) - 1);
4567 }
4568 }
4569 else
4570 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004571 // Can't advance. For NOBEHIND that's a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004572 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4573 if (rp->rs_no == NOBEHIND)
4574 {
4575 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4576 &backpos);
4577 status = RA_MATCH;
4578 }
4579 else
4580 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004581 // We do want a proper match. Need to restore the
4582 // subexpr if we had a match, because they may have
4583 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004584 if (status == RA_MATCH)
4585 {
4586 status = RA_NOMATCH;
4587 restore_subexpr(((regbehind_T *)rp) - 1);
4588 }
4589 }
4590 regstack_pop(&scan);
4591 regstack.ga_len -= sizeof(regbehind_T);
4592 }
4593 }
4594 break;
4595
4596 case RS_STAR_LONG:
4597 case RS_STAR_SHORT:
4598 {
4599 regstar_T *rst = ((regstar_T *)rp) - 1;
4600
4601 if (status == RA_MATCH)
4602 {
4603 regstack_pop(&scan);
4604 regstack.ga_len -= sizeof(regstar_T);
4605 break;
4606 }
4607
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004608 // Tried once already, restore input pointers.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004609 if (status != RA_BREAK)
4610 reg_restore(&rp->rs_un.regsave, &backpos);
4611
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004612 // Repeat until we found a position where it could match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004613 for (;;)
4614 {
4615 if (status != RA_BREAK)
4616 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004617 // Tried first position already, advance.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004618 if (rp->rs_state == RS_STAR_LONG)
4619 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004620 // Trying for longest match, but couldn't or
4621 // didn't match -- back up one char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004622 if (--rst->count < rst->minval)
4623 break;
4624 if (rex.input == rex.line)
4625 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004626 // backup to last char of previous line
Bram Moolenaar6456fae2022-02-22 13:37:31 +00004627 if (rex.lnum == 0)
4628 {
4629 status = RA_NOMATCH;
4630 break;
4631 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004632 --rex.lnum;
4633 rex.line = reg_getline(rex.lnum);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004634 // Just in case regrepeat() didn't count
4635 // right.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004636 if (rex.line == NULL)
4637 break;
4638 rex.input = rex.line + STRLEN(rex.line);
4639 fast_breakcheck();
4640 }
4641 else
4642 MB_PTR_BACK(rex.line, rex.input);
4643 }
4644 else
4645 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004646 // Range is backwards, use shortest match first.
4647 // Careful: maxval and minval are exchanged!
4648 // Couldn't or didn't match: try advancing one
4649 // char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004650 if (rst->count == rst->minval
4651 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
4652 break;
4653 ++rst->count;
4654 }
4655 if (got_int)
4656 break;
4657 }
4658 else
4659 status = RA_NOMATCH;
4660
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004661 // If it could match, try it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004662 if (rst->nextb == NUL || *rex.input == rst->nextb
4663 || *rex.input == rst->nextb_ic)
4664 {
4665 reg_save(&rp->rs_un.regsave, &backpos);
4666 scan = regnext(rp->rs_scan);
4667 status = RA_CONT;
4668 break;
4669 }
4670 }
4671 if (status != RA_CONT)
4672 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004673 // Failed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004674 regstack_pop(&scan);
4675 regstack.ga_len -= sizeof(regstar_T);
4676 status = RA_NOMATCH;
4677 }
4678 }
4679 break;
4680 }
4681
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004682 // If we want to continue the inner loop or didn't pop a state
4683 // continue matching loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004684 if (status == RA_CONT || rp == (regitem_T *)
4685 ((char *)regstack.ga_data + regstack.ga_len) - 1)
4686 break;
4687 }
4688
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004689 // May need to continue with the inner loop, starting at "scan".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004690 if (status == RA_CONT)
4691 continue;
4692
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004693 // If the regstack is empty or something failed we are done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004694 if (regstack.ga_len == 0 || status == RA_FAIL)
4695 {
4696 if (scan == NULL)
4697 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004698 // We get here only if there's trouble -- normally "case END" is
4699 // the terminating point.
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004700 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004701#ifdef DEBUG
4702 printf("Premature EOL\n");
4703#endif
4704 }
4705 return (status == RA_MATCH);
4706 }
4707
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004708 } // End of loop until the regstack is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004709
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004710 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004711}
4712
4713/*
4714 * regtry - try match of "prog" with at rex.line["col"].
4715 * Returns 0 for failure, number of lines contained in the match otherwise.
4716 */
4717 static long
4718regtry(
4719 bt_regprog_T *prog,
4720 colnr_T col,
4721 proftime_T *tm, // timeout limit or NULL
4722 int *timed_out) // flag set on timeout or NULL
4723{
4724 rex.input = rex.line + col;
4725 rex.need_clear_subexpr = TRUE;
4726#ifdef FEAT_SYN_HL
4727 // Clear the external match subpointers if necessary.
4728 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
4729#endif
4730
4731 if (regmatch(prog->program + 1, tm, timed_out) == 0)
4732 return 0;
4733
4734 cleanup_subexpr();
4735 if (REG_MULTI)
4736 {
4737 if (rex.reg_startpos[0].lnum < 0)
4738 {
4739 rex.reg_startpos[0].lnum = 0;
4740 rex.reg_startpos[0].col = col;
4741 }
4742 if (rex.reg_endpos[0].lnum < 0)
4743 {
4744 rex.reg_endpos[0].lnum = rex.lnum;
4745 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
4746 }
4747 else
4748 // Use line number of "\ze".
4749 rex.lnum = rex.reg_endpos[0].lnum;
4750 }
4751 else
4752 {
4753 if (rex.reg_startp[0] == NULL)
4754 rex.reg_startp[0] = rex.line + col;
4755 if (rex.reg_endp[0] == NULL)
4756 rex.reg_endp[0] = rex.input;
4757 }
4758#ifdef FEAT_SYN_HL
4759 // Package any found \z(...\) matches for export. Default is none.
4760 unref_extmatch(re_extmatch_out);
4761 re_extmatch_out = NULL;
4762
4763 if (prog->reghasz == REX_SET)
4764 {
4765 int i;
4766
4767 cleanup_zsubexpr();
4768 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01004769 if (re_extmatch_out == NULL)
4770 return 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004771 for (i = 0; i < NSUBEXP; i++)
4772 {
4773 if (REG_MULTI)
4774 {
4775 // Only accept single line matches.
4776 if (reg_startzpos[i].lnum >= 0
4777 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
4778 && reg_endzpos[i].col >= reg_startzpos[i].col)
4779 re_extmatch_out->matches[i] =
4780 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
4781 + reg_startzpos[i].col,
4782 reg_endzpos[i].col - reg_startzpos[i].col);
4783 }
4784 else
4785 {
4786 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
4787 re_extmatch_out->matches[i] =
4788 vim_strnsave(reg_startzp[i],
Bram Moolenaar71ccd032020-06-12 22:59:11 +02004789 reg_endzp[i] - reg_startzp[i]);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004790 }
4791 }
4792 }
4793#endif
4794 return 1 + rex.lnum;
4795}
4796
4797/*
4798 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02004799 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004800 * Returns 0 for failure, number of lines contained in the match otherwise.
4801 */
4802 static long
4803bt_regexec_both(
4804 char_u *line,
4805 colnr_T col, // column to start looking for match
4806 proftime_T *tm, // timeout limit or NULL
4807 int *timed_out) // flag set on timeout or NULL
4808{
4809 bt_regprog_T *prog;
4810 char_u *s;
4811 long retval = 0L;
4812
4813 // Create "regstack" and "backpos" if they are not allocated yet.
4814 // We allocate *_INITIAL amount of bytes first and then set the grow size
4815 // to much bigger value to avoid many malloc calls in case of deep regular
4816 // expressions.
4817 if (regstack.ga_data == NULL)
4818 {
4819 // Use an item size of 1 byte, since we push different things
4820 // onto the regstack.
4821 ga_init2(&regstack, 1, REGSTACK_INITIAL);
4822 (void)ga_grow(&regstack, REGSTACK_INITIAL);
4823 regstack.ga_growsize = REGSTACK_INITIAL * 8;
4824 }
4825
4826 if (backpos.ga_data == NULL)
4827 {
4828 ga_init2(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
4829 (void)ga_grow(&backpos, BACKPOS_INITIAL);
4830 backpos.ga_growsize = BACKPOS_INITIAL * 8;
4831 }
4832
4833 if (REG_MULTI)
4834 {
4835 prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
4836 line = reg_getline((linenr_T)0);
4837 rex.reg_startpos = rex.reg_mmatch->startpos;
4838 rex.reg_endpos = rex.reg_mmatch->endpos;
4839 }
4840 else
4841 {
4842 prog = (bt_regprog_T *)rex.reg_match->regprog;
4843 rex.reg_startp = rex.reg_match->startp;
4844 rex.reg_endp = rex.reg_match->endp;
4845 }
4846
4847 // Be paranoid...
4848 if (prog == NULL || line == NULL)
4849 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004850 iemsg(_(e_null_argument));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004851 goto theend;
4852 }
4853
4854 // Check validity of program.
4855 if (prog_magic_wrong())
4856 goto theend;
4857
4858 // If the start column is past the maximum column: no need to try.
4859 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4860 goto theend;
4861
4862 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
4863 if (prog->regflags & RF_ICASE)
4864 rex.reg_ic = TRUE;
4865 else if (prog->regflags & RF_NOICASE)
4866 rex.reg_ic = FALSE;
4867
4868 // If pattern contains "\Z" overrule value of rex.reg_icombine
4869 if (prog->regflags & RF_ICOMBINE)
4870 rex.reg_icombine = TRUE;
4871
4872 // If there is a "must appear" string, look for it.
4873 if (prog->regmust != NULL)
4874 {
4875 int c;
4876
4877 if (has_mbyte)
4878 c = (*mb_ptr2char)(prog->regmust);
4879 else
4880 c = *prog->regmust;
4881 s = line + col;
4882
4883 // This is used very often, esp. for ":global". Use three versions of
4884 // the loop to avoid overhead of conditions.
4885 if (!rex.reg_ic && !has_mbyte)
4886 while ((s = vim_strbyte(s, c)) != NULL)
4887 {
4888 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4889 break; // Found it.
4890 ++s;
4891 }
4892 else if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
4893 while ((s = vim_strchr(s, c)) != NULL)
4894 {
4895 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4896 break; // Found it.
4897 MB_PTR_ADV(s);
4898 }
4899 else
4900 while ((s = cstrchr(s, c)) != NULL)
4901 {
4902 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4903 break; // Found it.
4904 MB_PTR_ADV(s);
4905 }
4906 if (s == NULL) // Not present.
4907 goto theend;
4908 }
4909
4910 rex.line = line;
4911 rex.lnum = 0;
4912 reg_toolong = FALSE;
4913
4914 // Simplest case: Anchored match need be tried only once.
4915 if (prog->reganch)
4916 {
4917 int c;
4918
4919 if (has_mbyte)
4920 c = (*mb_ptr2char)(rex.line + col);
4921 else
4922 c = rex.line[col];
4923 if (prog->regstart == NUL
4924 || prog->regstart == c
4925 || (rex.reg_ic
4926 && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
4927 || (c < 255 && prog->regstart < 255 &&
4928 MB_TOLOWER(prog->regstart) == MB_TOLOWER(c)))))
4929 retval = regtry(prog, col, tm, timed_out);
4930 else
4931 retval = 0;
4932 }
4933 else
4934 {
4935#ifdef FEAT_RELTIME
4936 int tm_count = 0;
4937#endif
4938 // Messy cases: unanchored match.
4939 while (!got_int)
4940 {
4941 if (prog->regstart != NUL)
4942 {
4943 // Skip until the char we know it must start with.
4944 // Used often, do some work to avoid call overhead.
4945 if (!rex.reg_ic && !has_mbyte)
4946 s = vim_strbyte(rex.line + col, prog->regstart);
4947 else
4948 s = cstrchr(rex.line + col, prog->regstart);
4949 if (s == NULL)
4950 {
4951 retval = 0;
4952 break;
4953 }
4954 col = (int)(s - rex.line);
4955 }
4956
4957 // Check for maximum column to try.
4958 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4959 {
4960 retval = 0;
4961 break;
4962 }
4963
4964 retval = regtry(prog, col, tm, timed_out);
4965 if (retval > 0)
4966 break;
4967
4968 // if not currently on the first line, get it again
4969 if (rex.lnum != 0)
4970 {
4971 rex.lnum = 0;
4972 rex.line = reg_getline((linenr_T)0);
4973 }
4974 if (rex.line[col] == NUL)
4975 break;
4976 if (has_mbyte)
4977 col += (*mb_ptr2len)(rex.line + col);
4978 else
4979 ++col;
4980#ifdef FEAT_RELTIME
4981 // Check for timeout once in a twenty times to avoid overhead.
4982 if (tm != NULL && ++tm_count == 20)
4983 {
4984 tm_count = 0;
4985 if (profile_passed_limit(tm))
4986 {
4987 if (timed_out != NULL)
4988 *timed_out = TRUE;
4989 break;
4990 }
4991 }
4992#endif
4993 }
4994 }
4995
4996theend:
4997 // Free "reg_tofree" when it's a bit big.
4998 // Free regstack and backpos if they are bigger than their initial size.
4999 if (reg_tofreelen > 400)
5000 VIM_CLEAR(reg_tofree);
5001 if (regstack.ga_maxlen > REGSTACK_INITIAL)
5002 ga_clear(&regstack);
5003 if (backpos.ga_maxlen > BACKPOS_INITIAL)
5004 ga_clear(&backpos);
5005
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005006 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005007 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005008 // Make sure the end is never before the start. Can happen when \zs
5009 // and \ze are used.
5010 if (REG_MULTI)
5011 {
5012 lpos_T *start = &rex.reg_mmatch->startpos[0];
5013 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005014
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005015 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005016 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005017 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
5018 }
5019 else
5020 {
5021 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
5022 rex.reg_match->endp[0] = rex.reg_match->startp[0];
5023 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005024 }
5025
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005026 return retval;
5027}
5028
5029/*
5030 * Match a regexp against a string.
5031 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5032 * Uses curbuf for line count and 'iskeyword'.
5033 * if "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
5034 *
5035 * Returns 0 for failure, number of lines contained in the match otherwise.
5036 */
5037 static int
5038bt_regexec_nl(
5039 regmatch_T *rmp,
5040 char_u *line, // string to match against
5041 colnr_T col, // column to start looking for match
5042 int line_lbr)
5043{
5044 rex.reg_match = rmp;
5045 rex.reg_mmatch = NULL;
5046 rex.reg_maxline = 0;
5047 rex.reg_line_lbr = line_lbr;
5048 rex.reg_buf = curbuf;
5049 rex.reg_win = NULL;
5050 rex.reg_ic = rmp->rm_ic;
5051 rex.reg_icombine = FALSE;
5052 rex.reg_maxcol = 0;
5053
5054 return bt_regexec_both(line, col, NULL, NULL);
5055}
5056
5057/*
5058 * Match a regexp against multiple lines.
5059 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5060 * Uses curbuf for line count and 'iskeyword'.
5061 *
5062 * Return zero if there is no match. Return number of lines contained in the
5063 * match otherwise.
5064 */
5065 static long
5066bt_regexec_multi(
5067 regmmatch_T *rmp,
5068 win_T *win, // window in which to search or NULL
5069 buf_T *buf, // buffer in which to search
5070 linenr_T lnum, // nr of line to start looking for match
5071 colnr_T col, // column to start looking for match
5072 proftime_T *tm, // timeout limit or NULL
5073 int *timed_out) // flag set on timeout or NULL
5074{
Bram Moolenaarf4140482020-02-15 23:06:45 +01005075 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005076 return bt_regexec_both(NULL, col, tm, timed_out);
5077}
5078
5079/*
5080 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5081 */
5082 static int
5083re_num_cmp(long_u val, char_u *scan)
5084{
5085 long_u n = OPERAND_MIN(scan);
5086
5087 if (OPERAND_CMP(scan) == '>')
5088 return val > n;
5089 if (OPERAND_CMP(scan) == '<')
5090 return val < n;
5091 return val == n;
5092}
5093
5094#ifdef BT_REGEXP_DUMP
5095
5096/*
5097 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5098 */
5099 static void
5100regdump(char_u *pattern, bt_regprog_T *r)
5101{
5102 char_u *s;
5103 int op = EXACTLY; // Arbitrary non-END op.
5104 char_u *next;
5105 char_u *end = NULL;
5106 FILE *f;
5107
5108#ifdef BT_REGEXP_LOG
5109 f = fopen("bt_regexp_log.log", "a");
5110#else
5111 f = stdout;
5112#endif
5113 if (f == NULL)
5114 return;
5115 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);
5116
5117 s = r->program + 1;
5118 // Loop until we find the END that isn't before a referred next (an END
5119 // can also appear in a NOMATCH operand).
5120 while (op != END || s <= end)
5121 {
5122 op = OP(s);
5123 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); // Where, what.
5124 next = regnext(s);
5125 if (next == NULL) // Next ptr.
5126 fprintf(f, "(0)");
5127 else
5128 fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
5129 if (end < next)
5130 end = next;
5131 if (op == BRACE_LIMITS)
5132 {
5133 // Two ints
5134 fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5135 s += 8;
5136 }
5137 else if (op == BEHIND || op == NOBEHIND)
5138 {
5139 // one int
5140 fprintf(f, " count %ld", OPERAND_MIN(s));
5141 s += 4;
5142 }
5143 else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL)
5144 {
5145 // one int plus comparator
5146 fprintf(f, " count %ld", OPERAND_MIN(s));
5147 s += 5;
5148 }
5149 s += 3;
5150 if (op == ANYOF || op == ANYOF + ADD_NL
5151 || op == ANYBUT || op == ANYBUT + ADD_NL
5152 || op == EXACTLY)
5153 {
5154 // Literal string, where present.
5155 fprintf(f, "\nxxxxxxxxx\n");
5156 while (*s != NUL)
5157 fprintf(f, "%c", *s++);
5158 fprintf(f, "\nxxxxxxxxx\n");
5159 s++;
5160 }
5161 fprintf(f, "\r\n");
5162 }
5163
5164 // Header fields of interest.
5165 if (r->regstart != NUL)
5166 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
5167 ? (char *)transchar(r->regstart)
5168 : "multibyte", r->regstart);
5169 if (r->reganch)
5170 fprintf(f, "anchored; ");
5171 if (r->regmust != NULL)
5172 fprintf(f, "must have \"%s\"", r->regmust);
5173 fprintf(f, "\r\n");
5174
5175#ifdef BT_REGEXP_LOG
5176 fclose(f);
5177#endif
5178}
5179#endif // BT_REGEXP_DUMP
5180
5181#ifdef DEBUG
5182/*
5183 * regprop - printable representation of opcode
5184 */
5185 static char_u *
5186regprop(char_u *op)
5187{
5188 char *p;
5189 static char buf[50];
5190
5191 STRCPY(buf, ":");
5192
5193 switch ((int) OP(op))
5194 {
5195 case BOL:
5196 p = "BOL";
5197 break;
5198 case EOL:
5199 p = "EOL";
5200 break;
5201 case RE_BOF:
5202 p = "BOF";
5203 break;
5204 case RE_EOF:
5205 p = "EOF";
5206 break;
5207 case CURSOR:
5208 p = "CURSOR";
5209 break;
5210 case RE_VISUAL:
5211 p = "RE_VISUAL";
5212 break;
5213 case RE_LNUM:
5214 p = "RE_LNUM";
5215 break;
5216 case RE_MARK:
5217 p = "RE_MARK";
5218 break;
5219 case RE_COL:
5220 p = "RE_COL";
5221 break;
5222 case RE_VCOL:
5223 p = "RE_VCOL";
5224 break;
5225 case BOW:
5226 p = "BOW";
5227 break;
5228 case EOW:
5229 p = "EOW";
5230 break;
5231 case ANY:
5232 p = "ANY";
5233 break;
5234 case ANY + ADD_NL:
5235 p = "ANY+NL";
5236 break;
5237 case ANYOF:
5238 p = "ANYOF";
5239 break;
5240 case ANYOF + ADD_NL:
5241 p = "ANYOF+NL";
5242 break;
5243 case ANYBUT:
5244 p = "ANYBUT";
5245 break;
5246 case ANYBUT + ADD_NL:
5247 p = "ANYBUT+NL";
5248 break;
5249 case IDENT:
5250 p = "IDENT";
5251 break;
5252 case IDENT + ADD_NL:
5253 p = "IDENT+NL";
5254 break;
5255 case SIDENT:
5256 p = "SIDENT";
5257 break;
5258 case SIDENT + ADD_NL:
5259 p = "SIDENT+NL";
5260 break;
5261 case KWORD:
5262 p = "KWORD";
5263 break;
5264 case KWORD + ADD_NL:
5265 p = "KWORD+NL";
5266 break;
5267 case SKWORD:
5268 p = "SKWORD";
5269 break;
5270 case SKWORD + ADD_NL:
5271 p = "SKWORD+NL";
5272 break;
5273 case FNAME:
5274 p = "FNAME";
5275 break;
5276 case FNAME + ADD_NL:
5277 p = "FNAME+NL";
5278 break;
5279 case SFNAME:
5280 p = "SFNAME";
5281 break;
5282 case SFNAME + ADD_NL:
5283 p = "SFNAME+NL";
5284 break;
5285 case PRINT:
5286 p = "PRINT";
5287 break;
5288 case PRINT + ADD_NL:
5289 p = "PRINT+NL";
5290 break;
5291 case SPRINT:
5292 p = "SPRINT";
5293 break;
5294 case SPRINT + ADD_NL:
5295 p = "SPRINT+NL";
5296 break;
5297 case WHITE:
5298 p = "WHITE";
5299 break;
5300 case WHITE + ADD_NL:
5301 p = "WHITE+NL";
5302 break;
5303 case NWHITE:
5304 p = "NWHITE";
5305 break;
5306 case NWHITE + ADD_NL:
5307 p = "NWHITE+NL";
5308 break;
5309 case DIGIT:
5310 p = "DIGIT";
5311 break;
5312 case DIGIT + ADD_NL:
5313 p = "DIGIT+NL";
5314 break;
5315 case NDIGIT:
5316 p = "NDIGIT";
5317 break;
5318 case NDIGIT + ADD_NL:
5319 p = "NDIGIT+NL";
5320 break;
5321 case HEX:
5322 p = "HEX";
5323 break;
5324 case HEX + ADD_NL:
5325 p = "HEX+NL";
5326 break;
5327 case NHEX:
5328 p = "NHEX";
5329 break;
5330 case NHEX + ADD_NL:
5331 p = "NHEX+NL";
5332 break;
5333 case OCTAL:
5334 p = "OCTAL";
5335 break;
5336 case OCTAL + ADD_NL:
5337 p = "OCTAL+NL";
5338 break;
5339 case NOCTAL:
5340 p = "NOCTAL";
5341 break;
5342 case NOCTAL + ADD_NL:
5343 p = "NOCTAL+NL";
5344 break;
5345 case WORD:
5346 p = "WORD";
5347 break;
5348 case WORD + ADD_NL:
5349 p = "WORD+NL";
5350 break;
5351 case NWORD:
5352 p = "NWORD";
5353 break;
5354 case NWORD + ADD_NL:
5355 p = "NWORD+NL";
5356 break;
5357 case HEAD:
5358 p = "HEAD";
5359 break;
5360 case HEAD + ADD_NL:
5361 p = "HEAD+NL";
5362 break;
5363 case NHEAD:
5364 p = "NHEAD";
5365 break;
5366 case NHEAD + ADD_NL:
5367 p = "NHEAD+NL";
5368 break;
5369 case ALPHA:
5370 p = "ALPHA";
5371 break;
5372 case ALPHA + ADD_NL:
5373 p = "ALPHA+NL";
5374 break;
5375 case NALPHA:
5376 p = "NALPHA";
5377 break;
5378 case NALPHA + ADD_NL:
5379 p = "NALPHA+NL";
5380 break;
5381 case LOWER:
5382 p = "LOWER";
5383 break;
5384 case LOWER + ADD_NL:
5385 p = "LOWER+NL";
5386 break;
5387 case NLOWER:
5388 p = "NLOWER";
5389 break;
5390 case NLOWER + ADD_NL:
5391 p = "NLOWER+NL";
5392 break;
5393 case UPPER:
5394 p = "UPPER";
5395 break;
5396 case UPPER + ADD_NL:
5397 p = "UPPER+NL";
5398 break;
5399 case NUPPER:
5400 p = "NUPPER";
5401 break;
5402 case NUPPER + ADD_NL:
5403 p = "NUPPER+NL";
5404 break;
5405 case BRANCH:
5406 p = "BRANCH";
5407 break;
5408 case EXACTLY:
5409 p = "EXACTLY";
5410 break;
5411 case NOTHING:
5412 p = "NOTHING";
5413 break;
5414 case BACK:
5415 p = "BACK";
5416 break;
5417 case END:
5418 p = "END";
5419 break;
5420 case MOPEN + 0:
5421 p = "MATCH START";
5422 break;
5423 case MOPEN + 1:
5424 case MOPEN + 2:
5425 case MOPEN + 3:
5426 case MOPEN + 4:
5427 case MOPEN + 5:
5428 case MOPEN + 6:
5429 case MOPEN + 7:
5430 case MOPEN + 8:
5431 case MOPEN + 9:
5432 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5433 p = NULL;
5434 break;
5435 case MCLOSE + 0:
5436 p = "MATCH END";
5437 break;
5438 case MCLOSE + 1:
5439 case MCLOSE + 2:
5440 case MCLOSE + 3:
5441 case MCLOSE + 4:
5442 case MCLOSE + 5:
5443 case MCLOSE + 6:
5444 case MCLOSE + 7:
5445 case MCLOSE + 8:
5446 case MCLOSE + 9:
5447 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5448 p = NULL;
5449 break;
5450 case BACKREF + 1:
5451 case BACKREF + 2:
5452 case BACKREF + 3:
5453 case BACKREF + 4:
5454 case BACKREF + 5:
5455 case BACKREF + 6:
5456 case BACKREF + 7:
5457 case BACKREF + 8:
5458 case BACKREF + 9:
5459 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5460 p = NULL;
5461 break;
5462 case NOPEN:
5463 p = "NOPEN";
5464 break;
5465 case NCLOSE:
5466 p = "NCLOSE";
5467 break;
5468#ifdef FEAT_SYN_HL
5469 case ZOPEN + 1:
5470 case ZOPEN + 2:
5471 case ZOPEN + 3:
5472 case ZOPEN + 4:
5473 case ZOPEN + 5:
5474 case ZOPEN + 6:
5475 case ZOPEN + 7:
5476 case ZOPEN + 8:
5477 case ZOPEN + 9:
5478 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5479 p = NULL;
5480 break;
5481 case ZCLOSE + 1:
5482 case ZCLOSE + 2:
5483 case ZCLOSE + 3:
5484 case ZCLOSE + 4:
5485 case ZCLOSE + 5:
5486 case ZCLOSE + 6:
5487 case ZCLOSE + 7:
5488 case ZCLOSE + 8:
5489 case ZCLOSE + 9:
5490 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5491 p = NULL;
5492 break;
5493 case ZREF + 1:
5494 case ZREF + 2:
5495 case ZREF + 3:
5496 case ZREF + 4:
5497 case ZREF + 5:
5498 case ZREF + 6:
5499 case ZREF + 7:
5500 case ZREF + 8:
5501 case ZREF + 9:
5502 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5503 p = NULL;
5504 break;
5505#endif
5506 case STAR:
5507 p = "STAR";
5508 break;
5509 case PLUS:
5510 p = "PLUS";
5511 break;
5512 case NOMATCH:
5513 p = "NOMATCH";
5514 break;
5515 case MATCH:
5516 p = "MATCH";
5517 break;
5518 case BEHIND:
5519 p = "BEHIND";
5520 break;
5521 case NOBEHIND:
5522 p = "NOBEHIND";
5523 break;
5524 case SUBPAT:
5525 p = "SUBPAT";
5526 break;
5527 case BRACE_LIMITS:
5528 p = "BRACE_LIMITS";
5529 break;
5530 case BRACE_SIMPLE:
5531 p = "BRACE_SIMPLE";
5532 break;
5533 case BRACE_COMPLEX + 0:
5534 case BRACE_COMPLEX + 1:
5535 case BRACE_COMPLEX + 2:
5536 case BRACE_COMPLEX + 3:
5537 case BRACE_COMPLEX + 4:
5538 case BRACE_COMPLEX + 5:
5539 case BRACE_COMPLEX + 6:
5540 case BRACE_COMPLEX + 7:
5541 case BRACE_COMPLEX + 8:
5542 case BRACE_COMPLEX + 9:
5543 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5544 p = NULL;
5545 break;
5546 case MULTIBYTECODE:
5547 p = "MULTIBYTECODE";
5548 break;
5549 case NEWL:
5550 p = "NEWL";
5551 break;
5552 default:
5553 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5554 p = NULL;
5555 break;
5556 }
5557 if (p != NULL)
5558 STRCAT(buf, p);
5559 return (char_u *)buf;
5560}
5561#endif // DEBUG