blob: 9286d911aeab0aa5232ca43db888e4a0cbe47b9d [file] [log] [blame]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * Backtracking regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 *
7 * NOTICE:
8 *
9 * This is NOT the original regular expression code as written by Henry
10 * Spencer. This code has been modified specifically for use with the VIM
11 * editor, and should not be used separately from Vim. If you want a good
12 * regular expression library, get the original code. The copyright notice
13 * that follows is from the original.
14 *
15 * END NOTICE
16 *
17 * Copyright (c) 1986 by University of Toronto.
18 * Written by Henry Spencer. Not derived from licensed software.
19 *
20 * Permission is granted to anyone to use this software for any
21 * purpose on any computer system, and to redistribute it freely,
22 * subject to the following restrictions:
23 *
24 * 1. The author is not responsible for the consequences of use of
25 * this software, no matter how awful, even if they arise
26 * from defects in it.
27 *
28 * 2. The origin of this software must not be misrepresented, either
29 * by explicit claim or by omission.
30 *
31 * 3. Altered versions must be plainly marked as such, and must not
32 * be misrepresented as being the original software.
33 *
34 * Beware that some of this code is subtly aware of the way operator
35 * precedence is structured in regular expressions. Serious changes in
36 * regular-expression syntax might require a total rethink.
37 *
38 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
39 * Webb, Ciaran McCreesh and Bram Moolenaar.
40 * Named character class support added by Walter Briscoe (1998 Jul 01)
41 */
42
43/*
44 * The "internal use only" fields in regexp.h are present to pass info from
45 * compile to execute that permits the execute phase to run lots faster on
46 * simple cases. They are:
47 *
48 * regstart char that must begin a match; NUL if none obvious; Can be a
49 * multi-byte character.
50 * reganch is the match anchored (at beginning-of-line only)?
51 * regmust string (pointer into program) that match must include, or NULL
52 * regmlen length of regmust string
53 * regflags RF_ values or'ed together
54 *
55 * Regstart and reganch permit very fast decisions on suitable starting points
56 * for a match, cutting down the work a lot. Regmust permits fast rejection
57 * of lines that cannot possibly match. The regmust tests are costly enough
58 * that vim_regcomp() supplies a regmust only if the r.e. contains something
59 * potentially expensive (at present, the only such thing detected is * or +
60 * at the start of the r.e., which can involve a lot of backup). Regmlen is
61 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
62 * computing it anyway.
63 */
64
/*
 * Structure for regexp "program".  This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology).  Each node is an opcode
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
 * pointer with a BRANCH on both ends of it is connecting two alternatives.
 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
 * node points to the node after the stuff to be repeated.
 * The operand of some types of node is a literal string; for others, it is a
 * node leading into a sub-FSM.  In particular, the operand of a BRANCH node
 * is the first node of the branch.
 * (NB this is *not* a tree structure: the tail of the branch connects to the
 * thing following the set of BRANCHes.)
81 *
82 * pattern is coded like:
83 *
84 * +-----------------+
85 * | V
86 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
87 * | ^ | ^
88 * +------+ +----------+
89 *
90 *
91 * +------------------+
92 * V |
93 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
94 * | | ^ ^
95 * | +---------------+ |
96 * +---------------------------------------------+
97 *
98 *
99 * +----------------------+
100 * V |
101 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
102 * | | ^ ^
103 * | +-----------+ |
104 * +--------------------------------------------------+
105 *
106 *
107 * +-------------------------+
108 * V |
109 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
110 * | | ^
111 * | +----------------+
112 * +-----------------------------------------------+
113 *
114 *
115 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
116 * | | ^ ^
117 * | +----------------+ |
118 * +--------------------------------+
119 *
120 * +---------+
121 * | V
122 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
123 * | | | | ^ ^
124 * | | | +-----+ |
125 * | | +----------------+ |
126 * | +---------------------------+ |
127 * +------------------------------------------------------+
128 *
129 * They all start with a BRANCH for "\|" alternatives, even when there is only
130 * one alternative.
131 */
132
133/*
134 * The opcodes are:
135 */
136
// Basic opcodes of the regexp program.  The "opnd?" column tells what kind of
// operand, if any, follows the node.
//
// definition           number     opnd?    meaning
#define END             0       //          End of program or NOMATCH operand.
#define BOL             1       //          Match "" at beginning of line.
#define EOL             2       //          Match "" at end of line.
#define BRANCH          3       // node     Match this alternative, or the
                                //          next...
#define BACK            4       //          Match "", "next" ptr points backward.
#define EXACTLY         5       // str      Match this string.
#define NOTHING         6       //          Match empty string.
#define STAR            7       // node     Match this (simple) thing 0 or more
                                //          times.
#define PLUS            8       // node     Match this (simple) thing 1 or more
                                //          times.
#define MATCH           9       // node     match the operand zero-width
#define NOMATCH         10      // node     check for no match with operand
#define BEHIND          11      // node     look behind for a match with operand
#define NOBEHIND        12      // node     look behind for no match with operand
#define SUBPAT          13      // node     match the operand here
#define BRACE_SIMPLE    14      // node     Match this (simple) thing between m and
                                //          n times (\{m,n\}).
#define BOW             15      //          Match "" after [^a-zA-Z0-9_]
#define EOW             16      //          Match "" at [^a-zA-Z0-9_]
#define BRACE_LIMITS    17      // nr nr    define the min & max for BRACE_SIMPLE
                                //          and BRACE_COMPLEX.
#define NEWL            18      //          Match line-break
#define BHPOS           19      //          End position for BEHIND or NOBEHIND
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200163
164
// Character classes: 20-48 normal, 50-78 include a line-break.
// Adding ADD_NL to a normal class opcode gives the variant that also matches
// a line-break.
#define ADD_NL          30
#define ANY             20      //          Match any one character.
#define ANYOF           21      // str      Match any character in this string.
#define ANYBUT          22      // str      Match any character not in this
                                //          string.
#define IDENT           23      //          Match identifier char
#define SIDENT          24      //          Match identifier char but no digit
#define KWORD           25      //          Match keyword char
#define SKWORD          26      //          Match word char but no digit
#define FNAME           27      //          Match file name char
#define SFNAME          28      //          Match file name char but no digit
#define PRINT           29      //          Match printable char
#define SPRINT          30      //          Match printable char but no digit
#define WHITE           31      //          Match whitespace char
#define NWHITE          32      //          Match non-whitespace char
#define DIGIT           33      //          Match digit char
#define NDIGIT          34      //          Match non-digit char
#define HEX             35      //          Match hex char
#define NHEX            36      //          Match non-hex char
#define OCTAL           37      //          Match octal char
#define NOCTAL          38      //          Match non-octal char
#define WORD            39      //          Match word char
#define NWORD           40      //          Match non-word char
#define HEAD            41      //          Match head char
#define NHEAD           42      //          Match non-head char
#define ALPHA           43      //          Match alpha char
#define NALPHA          44      //          Match non-alpha char
#define LOWER           45      //          Match lowercase char
#define NLOWER          46      //          Match non-lowercase char
#define UPPER           47      //          Match uppercase char
#define NUPPER          48      //          Match non-uppercase char

// First and last opcode of the "with line-break" range.  The expansions are
// parenthesized so they stay a single value inside any larger expression.
#define FIRST_NL        (ANY + ADD_NL)
#define LAST_NL         (NUPPER + ADD_NL)
// TRUE when "op" is a character class opcode that also matches a line-break.
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
200
// Opcodes that come in groups of ten: the sub-expression number is added to
// the base value.
#define MOPEN           80      // -89      Mark this point in input as start of
                                //          \( subexpr.  MOPEN + 0 marks start of
                                //          match.
#define MCLOSE          90      // -99      Analogous to MOPEN.  MCLOSE + 0 marks
                                //          end of match.
#define BACKREF         100     // -109 node Match same string again \1-\9

#ifdef FEAT_SYN_HL
# define ZOPEN          110     // -119     Mark this point in input as start of
                                //          \z( subexpr.
# define ZCLOSE         120     // -129     Analogous to ZOPEN.
# define ZREF           130     // -139 node Match external submatch \z1-\z9
#endif

#define BRACE_COMPLEX   140     // -149 node Match nodes between m & n times

#define NOPEN           150     //          Mark this point in input as start of
                                //          \%( subexpr.
#define NCLOSE          151     //          Analogous to NOPEN.

#define MULTIBYTECODE   200     // mbc      Match one multi-byte character
#define RE_BOF          201     //          Match "" at beginning of file.
#define RE_EOF          202     //          Match "" at end of file.
#define CURSOR          203     //          Match location of cursor.

#define RE_LNUM         204     // nr cmp   Match line number
#define RE_COL          205     // nr cmp   Match column number
#define RE_VCOL         206     // nr cmp   Match virtual column number

#define RE_MARK         207     // mark cmp Match mark position
#define RE_VISUAL       208     //          Match Visual area
#define RE_COMPOSING    209     //          any composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200233
/*
 * Flags to be passed up and down.
 * Each flag is a separate bit, so they can be or'ed together; WORST is the
 * value with no flag set.
 */
#define HASWIDTH        0x1     // Known never to match null string.
#define SIMPLE          0x2     // Simple enough to be STAR/PLUS operand.
#define SPSTART         0x4     // Starts with * or +.
#define HASNL           0x8     // Contains some \n.
#define HASLOOKBH       0x10    // Contains "\@<=" or "\@<!".
#define WORST           0       // Worst case.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200243
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200244static int num_complex_braces; // Complex \{...} count
245static char_u *regcode; // Code-emit pointer, or JUST_CALC_SIZE
246static long regsize; // Code size.
247static int reg_toolong; // TRUE when offset out of range
248static char_u had_endbrace[NSUBEXP]; // flags, TRUE if end of () found
249static long brace_min[10]; // Minimums for complex brace repeats
250static long brace_max[10]; // Maximums for complex brace repeats
251static int brace_count[10]; // Current counts for complex brace repeats
252static int one_exactly = FALSE; // only do one char for EXACTLY
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200253
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200254// When making changes to classchars also change nfa_classcodes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200255static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
256static int classcodes[] = {
257 ANY, IDENT, SIDENT, KWORD, SKWORD,
258 FNAME, SFNAME, PRINT, SPRINT,
259 WHITE, NWHITE, DIGIT, NDIGIT,
260 HEX, NHEX, OCTAL, NOCTAL,
261 WORD, NWORD, HEAD, NHEAD,
262 ALPHA, NALPHA, LOWER, NLOWER,
263 UPPER, NUPPER
264};
265
/*
 * When regcode is set to this value, code is not emitted and size is computed
 * instead.
 */
#define JUST_CALC_SIZE  ((char_u *) -1)
271
// Values for rs_state in regitem_T.
// Numbering starts at zero; the two RS_Z* values only exist when FEAT_SYN_HL
// is defined, which shifts the values of the enumerators after them.
typedef enum regstate_E
{
    RS_NOPEN = 0,       // NOPEN and NCLOSE
    RS_MOPEN,           // MOPEN + [0-9]
    RS_MCLOSE,          // MCLOSE + [0-9]
#ifdef FEAT_SYN_HL
    RS_ZOPEN,           // ZOPEN + [0-9]
    RS_ZCLOSE,          // ZCLOSE + [0-9]
#endif
    RS_BRANCH,          // BRANCH
    RS_BRCPLX_MORE,     // BRACE_COMPLEX and trying one more match
    RS_BRCPLX_LONG,     // BRACE_COMPLEX and trying longest match
    RS_BRCPLX_SHORT,    // BRACE_COMPLEX and trying shortest match
    RS_NOMATCH,         // NOMATCH
    RS_BEHIND1,         // BEHIND / NOBEHIND matching rest
    RS_BEHIND2,         // BEHIND / NOBEHIND matching behind part
    RS_STAR_LONG,       // STAR/PLUS/BRACE_SIMPLE longest match
    RS_STAR_SHORT       // STAR/PLUS/BRACE_SIMPLE shortest match
} regstate_T;
292
293/*
294 * Structure used to save the current input state, when it needs to be
295 * restored after trying a match. Used by reg_save() and reg_restore().
296 * Also stores the length of "backpos".
297 */
298typedef struct
299{
300 union
301 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200302 char_u *ptr; // rex.input pointer, for single-line regexp
303 lpos_T pos; // rex.input pos, for multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200304 } rs_u;
305 int rs_len;
306} regsave_T;
307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200308// struct to save start/end pointer/position in for \(\)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200309typedef struct
310{
311 union
312 {
313 char_u *ptr;
314 lpos_T pos;
315 } se_u;
316} save_se_T;
317
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200318// used for BEHIND and NOBEHIND matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200319typedef struct regbehind_S
320{
321 regsave_T save_after;
322 regsave_T save_behind;
323 int save_need_clear_subexpr;
324 save_se_T save_start[NSUBEXP];
325 save_se_T save_end[NSUBEXP];
326} regbehind_T;
327
328/*
329 * When there are alternatives a regstate_T is put on the regstack to remember
330 * what we are doing.
331 * Before it may be another type of item, depending on rs_state, to remember
332 * more things.
333 */
334typedef struct regitem_S
335{
336 regstate_T rs_state; // what we are doing, one of RS_ above
337 short rs_no; // submatch nr or BEHIND/NOBEHIND
338 char_u *rs_scan; // current node in program
339 union
340 {
341 save_se_T sesave;
342 regsave_T regsave;
343 } rs_un; // room for saving rex.input
344} regitem_T;
345
346
// used for STAR, PLUS and BRACE_SIMPLE matching
typedef struct regstar_S
{
    int		nextb;		// next byte
    int		nextb_ic;	// next byte reverse case
    long	count;
    long	minval;
    long	maxval;
} regstar_T;
356
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200357// used to store input position when a BACK was encountered, so that we now if
358// we made any progress since the last time.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200359typedef struct backpos_S
360{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200361 char_u *bp_scan; // "scan" where BACK was encountered
362 regsave_T bp_pos; // last input position
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363} backpos_T;
364
365/*
366 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
367 * to avoid invoking malloc() and free() often.
368 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
369 * or regbehind_T.
370 * "backpos_T" is a table with backpos_T for BACK
371 */
372static garray_T regstack = {0, 0, 0, 0, NULL};
373static garray_T backpos = {0, 0, 0, 0, NULL};
374
375static regsave_T behind_pos;
376
/*
 * Both for regstack and backpos tables we use the following strategy of
 * allocation (to reduce malloc/free calls):
 * - Initial size is fairly small.
 * - When needed, the tables are grown bigger (8 times at first, double after
 *   that).
 * - After executing the match we free the memory only if the array has grown.
 *   Thus the memory is kept allocated when it's at the initial size.
 * This makes it fast while not keeping a lot of memory allocated.
 * A three times speed increase was observed when using many simple patterns.
 */
#define REGSTACK_INITIAL	2048
#define BACKPOS_INITIAL		64
390
391/*
392 * Opcode notes:
393 *
394 * BRANCH The set of branches constituting a single choice are hooked
395 * together with their "next" pointers, since precedence prevents
396 * anything being concatenated to any individual branch. The
397 * "next" pointer of the last BRANCH in a choice points to the
398 * thing following the whole choice. This is also where the
399 * final "next" pointer of each individual branch points; each
400 * branch starts with the operand node of a BRANCH node.
401 *
402 * BACK Normal "next" pointers all implicitly point forward; BACK
403 * exists to make loop structures possible.
404 *
405 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
406 * BRANCH structures using BACK. Simple cases (one character
407 * per match) are implemented with STAR and PLUS for speed
408 * and to minimize recursive plunges.
409 *
410 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
411 * node, and defines the min and max limits to be used for that
412 * node.
413 *
414 * MOPEN,MCLOSE ...are numbered at compile time.
415 * ZOPEN,ZCLOSE ...ditto
416 */
417
418/*
419 * A node is one char of opcode followed by two chars of "next" pointer.
420 * "Next" pointers are stored as two 8-bit bytes, high order first. The
421 * value is a positive offset from the opcode of the node containing it.
422 * An operand, if any, simply follows the node. (Note that much of the
423 * code generation knows about this implicit relationship.)
424 *
425 * Using two bytes for the "next" pointer is vast overkill for most things,
426 * but allows patterns to get big without disasters.
427 */
// Accessors for a node at "p": byte 0 is the opcode, bytes 1-2 hold the "next"
// offset (high order byte first) and the operand starts at byte 3.
#define OP(p)		((int)*(p))
#define NEXT(p)		(((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
#define OPERAND(p)	((p) + 3)
// Obtain an operand that was stored as four bytes, MSB first.
// The bytes are combined as unsigned long, so that shifting a byte >= 0x80
// into the top position is well defined when long is 32 bits; the result is
// then converted back to long.  "p" is expected to point to unsigned bytes.
#define OPERAND_MIN(p)	((long)(((unsigned long)(p)[3] << 24) \
			    + ((unsigned long)(p)[4] << 16) \
			    + ((unsigned long)(p)[5] << 8) \
			    + (unsigned long)(p)[6]))
// Obtain a second operand stored as four bytes.
#define OPERAND_MAX(p)	OPERAND_MIN((p) + 4)
// Obtain a second single-byte operand stored after a four bytes operand.
#define OPERAND_CMP(p)	((p)[7])
438
439static char_u *reg(int paren, int *flagp);
440
441#ifdef BT_REGEXP_DUMP
442static void regdump(char_u *, bt_regprog_T *);
443#endif
444
445static int re_num_cmp(long_u val, char_u *scan);
446
447#ifdef DEBUG
448static char_u *regprop(char_u *);
449
450static int regnarrate = 0;
451#endif
452
453
454/*
455 * Setup to parse the regexp. Used once to get the length and once to do it.
456 */
457 static void
458regcomp_start(
459 char_u *expr,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200460 int re_flags) // see vim_regcomp()
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200461{
462 initchr(expr);
463 if (re_flags & RE_MAGIC)
464 reg_magic = MAGIC_ON;
465 else
466 reg_magic = MAGIC_OFF;
467 reg_string = (re_flags & RE_STRING);
468 reg_strict = (re_flags & RE_STRICT);
469 get_cpo_flags();
470
471 num_complex_braces = 0;
472 regnpar = 1;
Bram Moolenaara80faa82020-04-12 19:37:17 +0200473 CLEAR_FIELD(had_endbrace);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200474#ifdef FEAT_SYN_HL
475 regnzpar = 1;
476 re_has_z = 0;
477#endif
478 regsize = 0L;
479 reg_toolong = FALSE;
480 regflags = 0;
481#if defined(FEAT_SYN_HL) || defined(PROTO)
482 had_eol = FALSE;
483#endif
484}
485
486/*
487 * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for
488 * character "c".
489 */
490 static int
491use_multibytecode(int c)
492{
493 return has_mbyte && (*mb_char2len)(c) > 1
494 && (re_multi_type(peekchr()) != NOT_MULTI
495 || (enc_utf8 && utf_iscomposing(c)));
496}
497
498/*
499 * Emit (if appropriate) a byte of code
500 */
501 static void
502regc(int b)
503{
504 if (regcode == JUST_CALC_SIZE)
505 regsize++;
506 else
507 *regcode++ = b;
508}
509
510/*
511 * Emit (if appropriate) a multi-byte character of code
512 */
513 static void
514regmbc(int c)
515{
516 if (!has_mbyte && c > 0xff)
517 return;
518 if (regcode == JUST_CALC_SIZE)
519 regsize += (*mb_char2len)(c);
520 else
521 regcode += (*mb_char2bytes)(c, regcode);
522}
523
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200524
525/*
526 * Produce the bytes for equivalence class "c".
527 * Currently only handles latin1, latin9 and utf-8.
528 * NOTE: When changing this function, also change nfa_emit_equi_class()
529 */
530 static void
531reg_equi_class(int c)
532{
533 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
534 || STRCMP(p_enc, "iso-8859-15") == 0)
535 {
536#ifdef EBCDIC
537 int i;
538
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200539 // This might be slower than switch/case below.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200540 for (i = 0; i < 16; i++)
541 {
542 if (vim_strchr(EQUIVAL_CLASS_C[i], c) != NULL)
543 {
544 char *p = EQUIVAL_CLASS_C[i];
545
546 while (*p != 0)
547 regmbc(*p++);
548 return;
549 }
550 }
551#else
552 switch (c)
553 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200554 // Do not use '\300' style, it results in a negative number.
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200555 case 'A': case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc4:
556 case 0xc5: case 0x100: case 0x102: case 0x104: case 0x1cd:
557 case 0x1de: case 0x1e0: case 0x1fa: case 0x202: case 0x226:
558 case 0x23a: case 0x1e00: case 0x1ea0: case 0x1ea2: case 0x1ea4:
559 case 0x1ea6: case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
560 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
561 regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2);
562 regmbc(0xc3); regmbc(0xc4); regmbc(0xc5);
563 regmbc(0x100); regmbc(0x102); regmbc(0x104);
564 regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0);
565 regmbc(0x1fa); regmbc(0x202); regmbc(0x226);
566 regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0);
567 regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6);
568 regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac);
569 regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2);
570 regmbc(0x1eb4); regmbc(0x1eb6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200571 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200572 case 'B': case 0x181: case 0x243: case 0x1e02:
573 case 0x1e04: case 0x1e06:
574 regmbc('B');
575 regmbc(0x181); regmbc(0x243); regmbc(0x1e02);
576 regmbc(0x1e04); regmbc(0x1e06);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200577 return;
578 case 'C': case 0xc7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200579 case 0x106: case 0x108: case 0x10a: case 0x10c: case 0x187:
580 case 0x23b: case 0x1e08: case 0xa792:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200581 regmbc('C'); regmbc(0xc7);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200582 regmbc(0x106); regmbc(0x108); regmbc(0x10a);
583 regmbc(0x10c); regmbc(0x187); regmbc(0x23b);
584 regmbc(0x1e08); regmbc(0xa792);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200585 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200586 case 'D': case 0x10e: case 0x110: case 0x18a:
587 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
588 case 0x1e12:
589 regmbc('D'); regmbc(0x10e); regmbc(0x110);
590 regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c);
591 regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200592 return;
593 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200594 case 0x112: case 0x114: case 0x116: case 0x118: case 0x11a:
595 case 0x204: case 0x206: case 0x228: case 0x246: case 0x1e14:
596 case 0x1e16: case 0x1e18: case 0x1e1a: case 0x1e1c:
597 case 0x1eb8: case 0x1eba: case 0x1ebc: case 0x1ebe:
598 case 0x1ec0: case 0x1ec2: case 0x1ec4: case 0x1ec6:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200599 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200600 regmbc(0xca); regmbc(0xcb); regmbc(0x112);
601 regmbc(0x114); regmbc(0x116); regmbc(0x118);
602 regmbc(0x11a); regmbc(0x204); regmbc(0x206);
603 regmbc(0x228); regmbc(0x246); regmbc(0x1e14);
604 regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a);
605 regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba);
606 regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0);
607 regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200608 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200609 case 'F': case 0x191: case 0x1e1e: case 0xa798:
610 regmbc('F'); regmbc(0x191); regmbc(0x1e1e);
611 regmbc(0xa798);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200612 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200613 case 'G': case 0x11c: case 0x11e: case 0x120:
614 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
615 case 0x1f4: case 0x1e20: case 0xa7a0:
616 regmbc('G'); regmbc(0x11c); regmbc(0x11e);
617 regmbc(0x120); regmbc(0x122); regmbc(0x193);
618 regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4);
619 regmbc(0x1e20); regmbc(0xa7a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200620 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200621 case 'H': case 0x124: case 0x126: case 0x21e:
622 case 0x1e22: case 0x1e24: case 0x1e26:
623 case 0x1e28: case 0x1e2a: case 0x2c67:
624 regmbc('H'); regmbc(0x124); regmbc(0x126);
625 regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24);
626 regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a);
627 regmbc(0x2c67);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200628 return;
629 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200630 case 0x128: case 0x12a: case 0x12c: case 0x12e:
631 case 0x130: case 0x197: case 0x1cf: case 0x208:
632 case 0x20a: case 0x1e2c: case 0x1e2e: case 0x1ec8:
633 case 0x1eca:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200634 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200635 regmbc(0xce); regmbc(0xcf); regmbc(0x128);
636 regmbc(0x12a); regmbc(0x12c); regmbc(0x12e);
637 regmbc(0x130); regmbc(0x197); regmbc(0x1cf);
638 regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c);
639 regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200640 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200641 case 'J': case 0x134: case 0x248:
642 regmbc('J'); regmbc(0x134); regmbc(0x248);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200643 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200644 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
645 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
646 regmbc('K'); regmbc(0x136); regmbc(0x198);
647 regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32);
648 regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200649 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200650 case 'L': case 0x139: case 0x13b: case 0x13d: case 0x13f:
651 case 0x141: case 0x23d: case 0x1e36: case 0x1e38:
652 case 0x1e3a: case 0x1e3c: case 0x2c60:
653 regmbc('L'); regmbc(0x139); regmbc(0x13b);
654 regmbc(0x13d); regmbc(0x13f); regmbc(0x141);
655 regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38);
656 regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200657 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200658 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
659 regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40);
660 regmbc(0x1e42);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200661 return;
662 case 'N': case 0xd1:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200663 case 0x143: case 0x145: case 0x147: case 0x1f8:
664 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
665 case 0xa7a4:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200666 regmbc('N'); regmbc(0xd1);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200667 regmbc(0x143); regmbc(0x145); regmbc(0x147);
668 regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46);
669 regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200670 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200671 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd6:
672 case 0xd8: case 0x14c: case 0x14e: case 0x150: case 0x19f:
673 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec: case 0x1fe:
674 case 0x20c: case 0x20e: case 0x22a: case 0x22c: case 0x22e:
675 case 0x230: case 0x1e4c: case 0x1e4e: case 0x1e50: case 0x1e52:
676 case 0x1ecc: case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
677 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc: case 0x1ede:
678 case 0x1ee0: case 0x1ee2:
679 regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4);
680 regmbc(0xd5); regmbc(0xd6); regmbc(0xd8);
681 regmbc(0x14c); regmbc(0x14e); regmbc(0x150);
682 regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1);
683 regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe);
684 regmbc(0x20c); regmbc(0x20e); regmbc(0x22a);
685 regmbc(0x22c); regmbc(0x22e); regmbc(0x230);
686 regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50);
687 regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece);
688 regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4);
689 regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda);
690 regmbc(0x1edc); regmbc(0x1ede); regmbc(0x1ee0);
691 regmbc(0x1ee2);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200692 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200693 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
694 regmbc('P'); regmbc(0x1a4); regmbc(0x1e54);
695 regmbc(0x1e56); regmbc(0x2c63);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200696 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200697 case 'Q': case 0x24a:
698 regmbc('Q'); regmbc(0x24a);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200699 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200700 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
701 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
702 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
703 regmbc('R'); regmbc(0x154); regmbc(0x156);
704 regmbc(0x210); regmbc(0x212); regmbc(0x158);
705 regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a);
706 regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64);
707 regmbc(0xa7a6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200708 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200709 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
710 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
711 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
712 regmbc('S'); regmbc(0x15a); regmbc(0x15c);
713 regmbc(0x15e); regmbc(0x160); regmbc(0x218);
714 regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64);
715 regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e);
716 regmbc(0xa7a8);
717 return;
718 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
719 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
720 case 0x1e6e: case 0x1e70:
721 regmbc('T'); regmbc(0x162); regmbc(0x164);
722 regmbc(0x166); regmbc(0x1ac); regmbc(0x23e);
723 regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a);
724 regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200725 return;
726 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200727 case 0x168: case 0x16a: case 0x16c: case 0x16e:
728 case 0x170: case 0x172: case 0x1af: case 0x1d3:
729 case 0x1d5: case 0x1d7: case 0x1d9: case 0x1db:
730 case 0x214: case 0x216: case 0x244: case 0x1e72:
731 case 0x1e74: case 0x1e76: case 0x1e78: case 0x1e7a:
732 case 0x1ee4: case 0x1ee6: case 0x1ee8: case 0x1eea:
733 case 0x1eec: case 0x1eee: case 0x1ef0:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200734 regmbc('U'); regmbc(0xd9); regmbc(0xda);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200735 regmbc(0xdb); regmbc(0xdc); regmbc(0x168);
736 regmbc(0x16a); regmbc(0x16c); regmbc(0x16e);
737 regmbc(0x170); regmbc(0x172); regmbc(0x1af);
738 regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7);
739 regmbc(0x1d9); regmbc(0x1db); regmbc(0x214);
740 regmbc(0x216); regmbc(0x244); regmbc(0x1e72);
741 regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78);
742 regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6);
743 regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec);
744 regmbc(0x1eee); regmbc(0x1ef0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200745 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200746 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
747 regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c);
748 regmbc(0x1e7e);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200749 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200750 case 'W': case 0x174: case 0x1e80: case 0x1e82:
751 case 0x1e84: case 0x1e86: case 0x1e88:
752 regmbc('W'); regmbc(0x174); regmbc(0x1e80);
753 regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86);
754 regmbc(0x1e88);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200755 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200756 case 'X': case 0x1e8a: case 0x1e8c:
757 regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200758 return;
759 case 'Y': case 0xdd:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200760 case 0x176: case 0x178: case 0x1b3: case 0x232: case 0x24e:
761 case 0x1e8e: case 0x1ef2: case 0x1ef6: case 0x1ef4: case 0x1ef8:
762 regmbc('Y'); regmbc(0xdd); regmbc(0x176);
763 regmbc(0x178); regmbc(0x1b3); regmbc(0x232);
764 regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2);
765 regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200766 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200767 case 'Z': case 0x179: case 0x17b: case 0x17d: case 0x1b5:
768 case 0x1e90: case 0x1e92: case 0x1e94: case 0x2c6b:
769 regmbc('Z'); regmbc(0x179); regmbc(0x17b);
770 regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90);
771 regmbc(0x1e92); regmbc(0x1e94); regmbc(0x2c6b);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200772 return;
773 case 'a': case 0xe0: case 0xe1: case 0xe2:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200774 case 0xe3: case 0xe4: case 0xe5: case 0x101: case 0x103:
775 case 0x105: case 0x1ce: case 0x1df: case 0x1e1: case 0x1fb:
776 case 0x201: case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
777 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
778 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
779 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
780 case 0x1eb7: case 0x2c65:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200781 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
782 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200783 regmbc(0xe5); regmbc(0x101); regmbc(0x103);
784 regmbc(0x105); regmbc(0x1ce); regmbc(0x1df);
785 regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201);
786 regmbc(0x203); regmbc(0x227); regmbc(0x1d8f);
787 regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1);
788 regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7);
789 regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead);
790 regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3);
791 regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200792 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200793 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
794 case 0x1e03: case 0x1e05: case 0x1e07:
795 regmbc('b');
796 regmbc(0x180); regmbc(0x253); regmbc(0x1d6c);
797 regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05);
798 regmbc(0x1e07);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200799 return;
800 case 'c': case 0xe7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200801 case 0x107: case 0x109: case 0x10b: case 0x10d: case 0x188:
802 case 0x23c: case 0x1e09: case 0xa793: case 0xa794:
803 regmbc('c'); regmbc(0xe7); regmbc(0x107);
804 regmbc(0x109); regmbc(0x10b); regmbc(0x10d);
805 regmbc(0x188); regmbc(0x23c); regmbc(0x1e09);
806 regmbc(0xa793); regmbc(0xa794);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200807 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200808 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
809 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d:
810 case 0x1e0f: case 0x1e11: case 0x1e13:
811 regmbc('d'); regmbc(0x10f); regmbc(0x111);
812 regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81);
813 regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d);
814 regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200815 return;
816 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200817 case 0x113: case 0x115: case 0x117: case 0x119:
818 case 0x11b: case 0x205: case 0x207: case 0x229:
819 case 0x247: case 0x1d92: case 0x1e15: case 0x1e17:
820 case 0x1e19: case 0x1e1b: case 0x1eb9: case 0x1ebb:
821 case 0x1e1d: case 0x1ebd: case 0x1ebf: case 0x1ec1:
822 case 0x1ec3: case 0x1ec5: case 0x1ec7:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200823 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200824 regmbc(0xea); regmbc(0xeb); regmbc(0x113);
825 regmbc(0x115); regmbc(0x117); regmbc(0x119);
826 regmbc(0x11b); regmbc(0x205); regmbc(0x207);
827 regmbc(0x229); regmbc(0x247); regmbc(0x1d92);
828 regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19);
829 regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9);
830 regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf);
831 regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5);
832 regmbc(0x1ec7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200833 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200834 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
835 case 0x1e1f: case 0xa799:
836 regmbc('f'); regmbc(0x192); regmbc(0x1d6e);
837 regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799);
838 return;
839 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
840 case 0x1e5: case 0x1e7: case 0x260: case 0x1f5: case 0x1d83:
841 case 0x1e21: case 0xa7a1:
842 regmbc('g'); regmbc(0x11d); regmbc(0x11f);
843 regmbc(0x121); regmbc(0x123); regmbc(0x1e5);
844 regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260);
845 regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200846 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200847 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
848 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
849 case 0x1e96: case 0x2c68: case 0xa795:
850 regmbc('h'); regmbc(0x125); regmbc(0x127);
851 regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25);
852 regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b);
853 regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200854 return;
855 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200856 case 0x129: case 0x12b: case 0x12d: case 0x12f:
857 case 0x1d0: case 0x209: case 0x20b: case 0x268:
858 case 0x1d96: case 0x1e2d: case 0x1e2f: case 0x1ec9:
859 case 0x1ecb:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200860 regmbc('i'); regmbc(0xec); regmbc(0xed);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200861 regmbc(0xee); regmbc(0xef); regmbc(0x129);
862 regmbc(0x12b); regmbc(0x12d); regmbc(0x12f);
863 regmbc(0x1d0); regmbc(0x209); regmbc(0x20b);
864 regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d);
865 regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200866 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200867 case 'j': case 0x135: case 0x1f0: case 0x249:
868 regmbc('j'); regmbc(0x135); regmbc(0x1f0);
869 regmbc(0x249);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200870 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200871 case 'k': case 0x137: case 0x199: case 0x1e9:
872 case 0x1d84: case 0x1e31: case 0x1e33: case 0x1e35:
873 case 0x2c6a: case 0xa741:
874 regmbc('k'); regmbc(0x137); regmbc(0x199);
875 regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31);
876 regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a);
877 regmbc(0xa741);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200878 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200879 case 'l': case 0x13a: case 0x13c: case 0x13e:
880 case 0x140: case 0x142: case 0x19a: case 0x1e37:
881 case 0x1e39: case 0x1e3b: case 0x1e3d: case 0x2c61:
882 regmbc('l'); regmbc(0x13a); regmbc(0x13c);
883 regmbc(0x13e); regmbc(0x140); regmbc(0x142);
884 regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39);
885 regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200886 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200887 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
888 regmbc('m'); regmbc(0x1d6f); regmbc(0x1e3f);
889 regmbc(0x1e41); regmbc(0x1e43);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200890 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200891 case 'n': case 0xf1: case 0x144: case 0x146: case 0x148:
892 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87:
893 case 0x1e45: case 0x1e47: case 0x1e49: case 0x1e4b:
894 case 0xa7a5:
895 regmbc('n'); regmbc(0xf1); regmbc(0x144);
896 regmbc(0x146); regmbc(0x148); regmbc(0x149);
897 regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87);
898 regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49);
899 regmbc(0x1e4b); regmbc(0xa7a5);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200900 return;
901 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200902 case 0xf6: case 0xf8: case 0x14d: case 0x14f: case 0x151:
903 case 0x1a1: case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
904 case 0x20d: case 0x20f: case 0x22b: case 0x22d: case 0x22f:
905 case 0x231: case 0x275: case 0x1e4d: case 0x1e4f:
906 case 0x1e51: case 0x1e53: case 0x1ecd: case 0x1ecf:
907 case 0x1ed1: case 0x1ed3: case 0x1ed5: case 0x1ed7:
908 case 0x1ed9: case 0x1edb: case 0x1edd: case 0x1edf:
909 case 0x1ee1: case 0x1ee3:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200910 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
911 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200912 regmbc(0xf8); regmbc(0x14d); regmbc(0x14f);
913 regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2);
914 regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff);
915 regmbc(0x20d); regmbc(0x20f); regmbc(0x22b);
916 regmbc(0x22d); regmbc(0x22f); regmbc(0x231);
917 regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f);
918 regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd);
919 regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3);
920 regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9);
921 regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf);
922 regmbc(0x1ee1); regmbc(0x1ee3);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200923 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200924 case 'p': case 0x1a5: case 0x1d71: case 0x1d88: case 0x1d7d:
925 case 0x1e55: case 0x1e57:
926 regmbc('p'); regmbc(0x1a5); regmbc(0x1d71);
927 regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55);
928 regmbc(0x1e57);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200929 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200930 case 'q': case 0x24b: case 0x2a0:
931 regmbc('q'); regmbc(0x24b); regmbc(0x2a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200932 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200933 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
934 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
935 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
936 case 0xa7a7:
937 regmbc('r'); regmbc(0x155); regmbc(0x157);
938 regmbc(0x159); regmbc(0x211); regmbc(0x213);
939 regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73);
940 regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d);
941 regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f);
942 regmbc(0xa7a7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200943 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200944 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
945 case 0x1e61: case 0x219: case 0x23f: case 0x1d74: case 0x1d8a:
946 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
947 regmbc('s'); regmbc(0x15b); regmbc(0x15d);
948 regmbc(0x15f); regmbc(0x161); regmbc(0x23f);
949 regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a);
950 regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65);
951 regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9);
952 return;
953 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
954 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
955 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
956 regmbc('t'); regmbc(0x163); regmbc(0x165);
957 regmbc(0x167); regmbc(0x1ab); regmbc(0x21b);
958 regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75);
959 regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f);
960 regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200961 return;
962 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200963 case 0x169: case 0x16b: case 0x16d: case 0x16f:
964 case 0x171: case 0x173: case 0x1b0: case 0x1d4:
965 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc:
966 case 0x215: case 0x217: case 0x289: case 0x1e73:
967 case 0x1d7e: case 0x1d99: case 0x1e75: case 0x1e77:
968 case 0x1e79: case 0x1e7b: case 0x1ee5: case 0x1ee7:
969 case 0x1ee9: case 0x1eeb: case 0x1eed: case 0x1eef:
970 case 0x1ef1:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200971 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200972 regmbc(0xfb); regmbc(0xfc); regmbc(0x169);
973 regmbc(0x16b); regmbc(0x16d); regmbc(0x16f);
974 regmbc(0x171); regmbc(0x173); regmbc(0x1d6);
975 regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc);
976 regmbc(0x215); regmbc(0x217); regmbc(0x1b0);
977 regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e);
978 regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75);
979 regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b);
980 regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9);
981 regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef);
982 regmbc(0x1ef1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200983 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200984 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
985 regmbc('v'); regmbc(0x28b); regmbc(0x1d8c);
986 regmbc(0x1e7d); regmbc(0x1e7f);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200987 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200988 case 'w': case 0x175: case 0x1e81: case 0x1e83:
989 case 0x1e85: case 0x1e87: case 0x1e89: case 0x1e98:
990 regmbc('w'); regmbc(0x175); regmbc(0x1e81);
991 regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87);
992 regmbc(0x1e89); regmbc(0x1e98);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200993 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200994 case 'x': case 0x1e8b: case 0x1e8d:
995 regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200996 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200997 case 'y': case 0xfd: case 0xff: case 0x177: case 0x1b4:
998 case 0x233: case 0x24f: case 0x1e8f: case 0x1e99: case 0x1ef3:
999 case 0x1ef5: case 0x1ef7: case 0x1ef9:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001000 regmbc('y'); regmbc(0xfd); regmbc(0xff);
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001001 regmbc(0x177); regmbc(0x1b4); regmbc(0x233);
1002 regmbc(0x24f); regmbc(0x1e8f); regmbc(0x1e99);
1003 regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7);
1004 regmbc(0x1ef9);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001005 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +02001006 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
1007 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
1008 case 0x1e95: case 0x2c6c:
1009 regmbc('z'); regmbc(0x17a); regmbc(0x17c);
1010 regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76);
1011 regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93);
1012 regmbc(0x1e95); regmbc(0x2c6c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001013 return;
1014 }
1015#endif
1016 }
1017 regmbc(c);
1018}
1019
1020/*
1021 * Emit a node.
1022 * Return pointer to generated code.
1023 */
1024 static char_u *
1025regnode(int op)
1026{
1027 char_u *ret;
1028
1029 ret = regcode;
1030 if (ret == JUST_CALC_SIZE)
1031 regsize += 3;
1032 else
1033 {
1034 *regcode++ = op;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001035 *regcode++ = NUL; // Null "next" pointer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001036 *regcode++ = NUL;
1037 }
1038 return ret;
1039}
1040
1041/*
1042 * Write a long as four bytes at "p" and return pointer to the next char.
1043 */
1044 static char_u *
1045re_put_long(char_u *p, long_u val)
1046{
1047 *p++ = (char_u) ((val >> 24) & 0377);
1048 *p++ = (char_u) ((val >> 16) & 0377);
1049 *p++ = (char_u) ((val >> 8) & 0377);
1050 *p++ = (char_u) (val & 0377);
1051 return p;
1052}
1053
1054/*
1055 * regnext - dig the "next" pointer out of a node
1056 * Returns NULL when calculating size, when there is no next item and when
1057 * there is an error.
1058 */
1059 static char_u *
1060regnext(char_u *p)
1061{
1062 int offset;
1063
1064 if (p == JUST_CALC_SIZE || reg_toolong)
1065 return NULL;
1066
1067 offset = NEXT(p);
1068 if (offset == 0)
1069 return NULL;
1070
1071 if (OP(p) == BACK)
1072 return p - offset;
1073 else
1074 return p + offset;
1075}
1076
1077/*
1078 * Set the next-pointer at the end of a node chain.
1079 */
1080 static void
1081regtail(char_u *p, char_u *val)
1082{
1083 char_u *scan;
1084 char_u *temp;
1085 int offset;
1086
1087 if (p == JUST_CALC_SIZE)
1088 return;
1089
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001090 // Find last node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001091 scan = p;
1092 for (;;)
1093 {
1094 temp = regnext(scan);
1095 if (temp == NULL)
1096 break;
1097 scan = temp;
1098 }
1099
1100 if (OP(scan) == BACK)
1101 offset = (int)(scan - val);
1102 else
1103 offset = (int)(val - scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001104 // When the offset uses more than 16 bits it can no longer fit in the two
1105 // bytes available. Use a global flag to avoid having to check return
1106 // values in too many places.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001107 if (offset > 0xffff)
1108 reg_toolong = TRUE;
1109 else
1110 {
1111 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
1112 *(scan + 2) = (char_u) (offset & 0377);
1113 }
1114}
1115
1116/*
1117 * Like regtail, on item after a BRANCH; nop if none.
1118 */
1119 static void
1120regoptail(char_u *p, char_u *val)
1121{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001122 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001123 if (p == NULL || p == JUST_CALC_SIZE
1124 || (OP(p) != BRANCH
1125 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
1126 return;
1127 regtail(OPERAND(p), val);
1128}
1129
1130/*
1131 * Insert an operator in front of already-emitted operand
1132 *
1133 * Means relocating the operand.
1134 */
1135 static void
1136reginsert(int op, char_u *opnd)
1137{
1138 char_u *src;
1139 char_u *dst;
1140 char_u *place;
1141
1142 if (regcode == JUST_CALC_SIZE)
1143 {
1144 regsize += 3;
1145 return;
1146 }
1147 src = regcode;
1148 regcode += 3;
1149 dst = regcode;
1150 while (src > opnd)
1151 *--dst = *--src;
1152
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001153 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001154 *place++ = op;
1155 *place++ = NUL;
1156 *place = NUL;
1157}
1158
1159/*
1160 * Insert an operator in front of already-emitted operand.
1161 * Add a number to the operator.
1162 */
1163 static void
1164reginsert_nr(int op, long val, char_u *opnd)
1165{
1166 char_u *src;
1167 char_u *dst;
1168 char_u *place;
1169
1170 if (regcode == JUST_CALC_SIZE)
1171 {
1172 regsize += 7;
1173 return;
1174 }
1175 src = regcode;
1176 regcode += 7;
1177 dst = regcode;
1178 while (src > opnd)
1179 *--dst = *--src;
1180
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001181 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001182 *place++ = op;
1183 *place++ = NUL;
1184 *place++ = NUL;
1185 re_put_long(place, (long_u)val);
1186}
1187
1188/*
1189 * Insert an operator in front of already-emitted operand.
1190 * The operator has the given limit values as operands. Also set next pointer.
1191 *
1192 * Means relocating the operand.
1193 */
1194 static void
1195reginsert_limits(
1196 int op,
1197 long minval,
1198 long maxval,
1199 char_u *opnd)
1200{
1201 char_u *src;
1202 char_u *dst;
1203 char_u *place;
1204
1205 if (regcode == JUST_CALC_SIZE)
1206 {
1207 regsize += 11;
1208 return;
1209 }
1210 src = regcode;
1211 regcode += 11;
1212 dst = regcode;
1213 while (src > opnd)
1214 *--dst = *--src;
1215
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001216 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001217 *place++ = op;
1218 *place++ = NUL;
1219 *place++ = NUL;
1220 place = re_put_long(place, (long_u)minval);
1221 place = re_put_long(place, (long_u)maxval);
1222 regtail(opnd, place);
1223}
1224
1225/*
1226 * Return TRUE if the back reference is legal. We must have seen the close
1227 * brace.
1228 * TODO: Should also check that we don't refer to something that is repeated
1229 * (+*=): what instance of the repetition should we match?
1230 */
1231 static int
1232seen_endbrace(int refnum)
1233{
1234 if (!had_endbrace[refnum])
1235 {
1236 char_u *p;
1237
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001238 // Trick: check if "@<=" or "@<!" follows, in which case
1239 // the \1 can appear before the referenced match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001240 for (p = regparse; *p != NUL; ++p)
1241 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
1242 break;
1243 if (*p == NUL)
1244 {
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001245 emsg(_(e_illegal_back_reference));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001246 rc_did_emsg = TRUE;
1247 return FALSE;
1248 }
1249 }
1250 return TRUE;
1251}
1252
1253/*
1254 * Parse the lowest level.
1255 *
1256 * Optimization: gobbles an entire sequence of ordinary characters so that
1257 * it can turn them into a single node, which is smaller to store and
1258 * faster to run. Don't do this when one_exactly is set.
1259 */
1260 static char_u *
1261regatom(int *flagp)
1262{
1263 char_u *ret;
1264 int flags;
1265 int c;
1266 char_u *p;
1267 int extra = 0;
1268 int save_prev_at_start = prev_at_start;
1269
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001270 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001271
1272 c = getchr();
1273 switch (c)
1274 {
1275 case Magic('^'):
1276 ret = regnode(BOL);
1277 break;
1278
1279 case Magic('$'):
1280 ret = regnode(EOL);
1281#if defined(FEAT_SYN_HL) || defined(PROTO)
1282 had_eol = TRUE;
1283#endif
1284 break;
1285
1286 case Magic('<'):
1287 ret = regnode(BOW);
1288 break;
1289
1290 case Magic('>'):
1291 ret = regnode(EOW);
1292 break;
1293
1294 case Magic('_'):
1295 c = no_Magic(getchr());
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001296 if (c == '^') // "\_^" is start-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001297 {
1298 ret = regnode(BOL);
1299 break;
1300 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001301 if (c == '$') // "\_$" is end-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001302 {
1303 ret = regnode(EOL);
1304#if defined(FEAT_SYN_HL) || defined(PROTO)
1305 had_eol = TRUE;
1306#endif
1307 break;
1308 }
1309
1310 extra = ADD_NL;
1311 *flagp |= HASNL;
1312
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001313 // "\_[" is character range plus newline
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001314 if (c == '[')
1315 goto collection;
1316
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001317 // "\_x" is character class plus newline
1318 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001319
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001320 // Character classes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001321 case Magic('.'):
1322 case Magic('i'):
1323 case Magic('I'):
1324 case Magic('k'):
1325 case Magic('K'):
1326 case Magic('f'):
1327 case Magic('F'):
1328 case Magic('p'):
1329 case Magic('P'):
1330 case Magic('s'):
1331 case Magic('S'):
1332 case Magic('d'):
1333 case Magic('D'):
1334 case Magic('x'):
1335 case Magic('X'):
1336 case Magic('o'):
1337 case Magic('O'):
1338 case Magic('w'):
1339 case Magic('W'):
1340 case Magic('h'):
1341 case Magic('H'):
1342 case Magic('a'):
1343 case Magic('A'):
1344 case Magic('l'):
1345 case Magic('L'):
1346 case Magic('u'):
1347 case Magic('U'):
1348 p = vim_strchr(classchars, no_Magic(c));
1349 if (p == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001350 EMSG_RET_NULL(_(e_invalid_use_of_underscore));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001351
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001352 // When '.' is followed by a composing char ignore the dot, so that
1353 // the composing char is matched here.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001354 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1355 {
1356 c = getchr();
1357 goto do_multibyte;
1358 }
1359 ret = regnode(classcodes[p - classchars] + extra);
1360 *flagp |= HASWIDTH | SIMPLE;
1361 break;
1362
1363 case Magic('n'):
1364 if (reg_string)
1365 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001366 // In a string "\n" matches a newline character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001367 ret = regnode(EXACTLY);
1368 regc(NL);
1369 regc(NUL);
1370 *flagp |= HASWIDTH | SIMPLE;
1371 }
1372 else
1373 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001374 // In buffer text "\n" matches the end of a line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001375 ret = regnode(NEWL);
1376 *flagp |= HASWIDTH | HASNL;
1377 }
1378 break;
1379
1380 case Magic('('):
1381 if (one_exactly)
1382 EMSG_ONE_RET_NULL;
1383 ret = reg(REG_PAREN, &flags);
1384 if (ret == NULL)
1385 return NULL;
1386 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1387 break;
1388
1389 case NUL:
1390 case Magic('|'):
1391 case Magic('&'):
1392 case Magic(')'):
1393 if (one_exactly)
1394 EMSG_ONE_RET_NULL;
Bram Moolenaard0819d12021-12-31 23:15:53 +00001395 // Supposed to be caught earlier.
1396 IEMSG_RET_NULL(_(e_internal_error_in_regexp));
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001397 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001398
1399 case Magic('='):
1400 case Magic('?'):
1401 case Magic('+'):
1402 case Magic('@'):
1403 case Magic('{'):
1404 case Magic('*'):
1405 c = no_Magic(c);
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001406 EMSG3_RET_NULL(_(e_str_chr_follows_nothing),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001407 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001408 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001409
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001410 case Magic('~'): // previous substitute pattern
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001411 if (reg_prev_sub != NULL)
1412 {
1413 char_u *lp;
1414
1415 ret = regnode(EXACTLY);
1416 lp = reg_prev_sub;
1417 while (*lp != NUL)
1418 regc(*lp++);
1419 regc(NUL);
1420 if (*reg_prev_sub != NUL)
1421 {
1422 *flagp |= HASWIDTH;
1423 if ((lp - reg_prev_sub) == 1)
1424 *flagp |= SIMPLE;
1425 }
1426 }
1427 else
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001428 EMSG_RET_NULL(_(e_no_previous_substitute_regular_expression));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001429 break;
1430
1431 case Magic('1'):
1432 case Magic('2'):
1433 case Magic('3'):
1434 case Magic('4'):
1435 case Magic('5'):
1436 case Magic('6'):
1437 case Magic('7'):
1438 case Magic('8'):
1439 case Magic('9'):
1440 {
1441 int refnum;
1442
1443 refnum = c - Magic('0');
1444 if (!seen_endbrace(refnum))
1445 return NULL;
1446 ret = regnode(BACKREF + refnum);
1447 }
1448 break;
1449
1450 case Magic('z'):
1451 {
1452 c = no_Magic(getchr());
1453 switch (c)
1454 {
1455#ifdef FEAT_SYN_HL
1456 case '(': if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001457 EMSG_RET_NULL(_(e_z_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001458 if (one_exactly)
1459 EMSG_ONE_RET_NULL;
1460 ret = reg(REG_ZPAREN, &flags);
1461 if (ret == NULL)
1462 return NULL;
1463 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1464 re_has_z = REX_SET;
1465 break;
1466
1467 case '1':
1468 case '2':
1469 case '3':
1470 case '4':
1471 case '5':
1472 case '6':
1473 case '7':
1474 case '8':
1475 case '9': if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001476 EMSG_RET_NULL(_(e_z1_z9_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001477 ret = regnode(ZREF + c - '0');
1478 re_has_z = REX_USE;
1479 break;
1480#endif
1481
1482 case 's': ret = regnode(MOPEN + 0);
1483 if (re_mult_next("\\zs") == FAIL)
1484 return NULL;
1485 break;
1486
1487 case 'e': ret = regnode(MCLOSE + 0);
1488 if (re_mult_next("\\ze") == FAIL)
1489 return NULL;
1490 break;
1491
1492 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1493 }
1494 }
1495 break;
1496
1497 case Magic('%'):
1498 {
1499 c = no_Magic(getchr());
1500 switch (c)
1501 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001502 // () without a back reference
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001503 case '(':
1504 if (one_exactly)
1505 EMSG_ONE_RET_NULL;
1506 ret = reg(REG_NPAREN, &flags);
1507 if (ret == NULL)
1508 return NULL;
1509 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1510 break;
1511
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001512 // Catch \%^ and \%$ regardless of where they appear in the
1513 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001514 case '^':
1515 ret = regnode(RE_BOF);
1516 break;
1517
1518 case '$':
1519 ret = regnode(RE_EOF);
1520 break;
1521
1522 case '#':
1523 ret = regnode(CURSOR);
1524 break;
1525
1526 case 'V':
1527 ret = regnode(RE_VISUAL);
1528 break;
1529
1530 case 'C':
1531 ret = regnode(RE_COMPOSING);
1532 break;
1533
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001534 // \%[abc]: Emit as a list of branches, all ending at the last
1535 // branch which matches nothing.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001536 case '[':
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001537 if (one_exactly) // doesn't nest
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001538 EMSG_ONE_RET_NULL;
1539 {
1540 char_u *lastbranch;
1541 char_u *lastnode = NULL;
1542 char_u *br;
1543
1544 ret = NULL;
1545 while ((c = getchr()) != ']')
1546 {
1547 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001548 EMSG2_RET_NULL(_(e_missing_sb_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001549 reg_magic == MAGIC_ALL);
1550 br = regnode(BRANCH);
1551 if (ret == NULL)
1552 ret = br;
1553 else
1554 {
1555 regtail(lastnode, br);
1556 if (reg_toolong)
1557 return NULL;
1558 }
1559
1560 ungetchr();
1561 one_exactly = TRUE;
1562 lastnode = regatom(flagp);
1563 one_exactly = FALSE;
1564 if (lastnode == NULL)
1565 return NULL;
1566 }
1567 if (ret == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001568 EMSG2_RET_NULL(_(e_empty_str_brackets),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001569 reg_magic == MAGIC_ALL);
1570 lastbranch = regnode(BRANCH);
1571 br = regnode(NOTHING);
1572 if (ret != JUST_CALC_SIZE)
1573 {
1574 regtail(lastnode, br);
1575 regtail(lastbranch, br);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001576 // connect all branches to the NOTHING
1577 // branch at the end
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001578 for (br = ret; br != lastnode; )
1579 {
1580 if (OP(br) == BRANCH)
1581 {
1582 regtail(br, lastbranch);
1583 if (reg_toolong)
1584 return NULL;
1585 br = OPERAND(br);
1586 }
1587 else
1588 br = regnext(br);
1589 }
1590 }
1591 *flagp &= ~(HASWIDTH | SIMPLE);
1592 break;
1593 }
1594
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001595 case 'd': // %d123 decimal
1596 case 'o': // %o123 octal
1597 case 'x': // %xab hex 2
1598 case 'u': // %uabcd hex 4
1599 case 'U': // %U1234abcd hex 8
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001600 {
1601 long i;
1602
1603 switch (c)
1604 {
1605 case 'd': i = getdecchrs(); break;
1606 case 'o': i = getoctchrs(); break;
1607 case 'x': i = gethexchrs(2); break;
1608 case 'u': i = gethexchrs(4); break;
1609 case 'U': i = gethexchrs(8); break;
1610 default: i = -1; break;
1611 }
1612
1613 if (i < 0 || i > INT_MAX)
1614 EMSG2_RET_NULL(
Bram Moolenaara6f79292022-01-04 21:30:47 +00001615 _(e_invalid_character_after_str_2),
1616 reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001617 if (use_multibytecode(i))
1618 ret = regnode(MULTIBYTECODE);
1619 else
1620 ret = regnode(EXACTLY);
1621 if (i == 0)
1622 regc(0x0a);
1623 else
1624 regmbc(i);
1625 regc(NUL);
1626 *flagp |= HASWIDTH;
1627 break;
1628 }
1629
1630 default:
1631 if (VIM_ISDIGIT(c) || c == '<' || c == '>'
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001632 || c == '\'' || c == '.')
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001633 {
1634 long_u n = 0;
1635 int cmp;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001636 int cur = FALSE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001637
1638 cmp = c;
1639 if (cmp == '<' || cmp == '>')
1640 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001641 if (no_Magic(c) == '.')
1642 {
1643 cur = TRUE;
1644 c = getchr();
1645 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001646 while (VIM_ISDIGIT(c))
1647 {
1648 n = n * 10 + (c - '0');
1649 c = getchr();
1650 }
1651 if (c == '\'' && n == 0)
1652 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001653 // "\%'m", "\%<'m" and "\%>'m": Mark
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001654 c = getchr();
1655 ret = regnode(RE_MARK);
1656 if (ret == JUST_CALC_SIZE)
1657 regsize += 2;
1658 else
1659 {
1660 *regcode++ = c;
1661 *regcode++ = cmp;
1662 }
1663 break;
1664 }
1665 else if (c == 'l' || c == 'c' || c == 'v')
1666 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001667 if (cur && n)
1668 {
1669 semsg(_(e_regexp_number_after_dot_pos_search), no_Magic(c));
1670 rc_did_emsg = TRUE;
1671 return NULL;
1672 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001673 if (c == 'l')
1674 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001675 if (cur)
1676 n = curwin->w_cursor.lnum;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001677 ret = regnode(RE_LNUM);
1678 if (save_prev_at_start)
1679 at_start = TRUE;
1680 }
1681 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001682 {
1683 if (cur)
1684 {
1685 n = curwin->w_cursor.col;
1686 n++;
1687 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001688 ret = regnode(RE_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001689 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001690 else
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001691 {
1692 if (cur)
1693 {
1694 colnr_T vcol = 0;
1695
1696 getvvcol(curwin, &curwin->w_cursor,
1697 NULL, NULL, &vcol);
1698 ++vcol;
1699 n = vcol;
1700 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001701 ret = regnode(RE_VCOL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001702 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001703 if (ret == JUST_CALC_SIZE)
1704 regsize += 5;
1705 else
1706 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001707 // put the number and the optional
1708 // comparator after the opcode
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001709 regcode = re_put_long(regcode, n);
1710 *regcode++ = cmp;
1711 }
1712 break;
1713 }
1714 }
1715
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001716 EMSG2_RET_NULL(_(e_invalid_character_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001717 reg_magic == MAGIC_ALL);
1718 }
1719 }
1720 break;
1721
1722 case Magic('['):
1723collection:
1724 {
1725 char_u *lp;
1726
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001727 // If there is no matching ']', we assume the '[' is a normal
1728 // character. This makes 'incsearch' and ":help [" work.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001729 lp = skip_anyof(regparse);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001730 if (*lp == ']') // there is a matching ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001731 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001732 int startc = -1; // > 0 when next '-' is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001733 int endc;
1734
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001735 // In a character class, different parsing rules apply.
1736 // Not even \ is special anymore, nothing is.
1737 if (*regparse == '^') // Complement of range.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001738 {
1739 ret = regnode(ANYBUT + extra);
1740 regparse++;
1741 }
1742 else
1743 ret = regnode(ANYOF + extra);
1744
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001745 // At the start ']' and '-' mean the literal character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001746 if (*regparse == ']' || *regparse == '-')
1747 {
1748 startc = *regparse;
1749 regc(*regparse++);
1750 }
1751
1752 while (*regparse != NUL && *regparse != ']')
1753 {
1754 if (*regparse == '-')
1755 {
1756 ++regparse;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001757 // The '-' is not used for a range at the end and
1758 // after or before a '\n'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001759 if (*regparse == ']' || *regparse == NUL
1760 || startc == -1
1761 || (regparse[0] == '\\' && regparse[1] == 'n'))
1762 {
1763 regc('-');
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001764 startc = '-'; // [--x] is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001765 }
1766 else
1767 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001768 // Also accept "a-[.z.]"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001769 endc = 0;
1770 if (*regparse == '[')
1771 endc = get_coll_element(&regparse);
1772 if (endc == 0)
1773 {
1774 if (has_mbyte)
1775 endc = mb_ptr2char_adv(&regparse);
1776 else
1777 endc = *regparse++;
1778 }
1779
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001780 // Handle \o40, \x20 and \u20AC style sequences
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001781 if (endc == '\\' && !reg_cpo_lit && !reg_cpo_bsl)
1782 endc = coll_get_char();
1783
1784 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001785 EMSG_RET_NULL(_(e_reverse_range_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001786 if (has_mbyte && ((*mb_char2len)(startc) > 1
1787 || (*mb_char2len)(endc) > 1))
1788 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001789 // Limit to a range of 256 chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001790 if (endc > startc + 256)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001791 EMSG_RET_NULL(_(e_range_too_large_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001792 while (++startc <= endc)
1793 regmbc(startc);
1794 }
1795 else
1796 {
1797#ifdef EBCDIC
1798 int alpha_only = FALSE;
1799
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001800 // for alphabetical range skip the gaps
1801 // 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001802 if (isalpha(startc) && isalpha(endc))
1803 alpha_only = TRUE;
1804#endif
1805 while (++startc <= endc)
1806#ifdef EBCDIC
1807 if (!alpha_only || isalpha(startc))
1808#endif
1809 regc(startc);
1810 }
1811 startc = -1;
1812 }
1813 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001814 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1815 // accepts "\t", "\e", etc., but only when the 'l' flag in
1816 // 'cpoptions' is not included.
1817 // Posix doesn't recognize backslash at all.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001818 else if (*regparse == '\\'
1819 && !reg_cpo_bsl
1820 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1821 || (!reg_cpo_lit
1822 && vim_strchr(REGEXP_ABBR,
1823 regparse[1]) != NULL)))
1824 {
1825 regparse++;
1826 if (*regparse == 'n')
1827 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001828 // '\n' in range: also match NL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001829 if (ret != JUST_CALC_SIZE)
1830 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001831 // Using \n inside [^] does not change what
1832 // matches. "[^\n]" is the same as ".".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001833 if (*ret == ANYOF)
1834 {
1835 *ret = ANYOF + ADD_NL;
1836 *flagp |= HASNL;
1837 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001838 // else: must have had a \n already
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001839 }
1840 regparse++;
1841 startc = -1;
1842 }
1843 else if (*regparse == 'd'
1844 || *regparse == 'o'
1845 || *regparse == 'x'
1846 || *regparse == 'u'
1847 || *regparse == 'U')
1848 {
1849 startc = coll_get_char();
1850 if (startc == 0)
1851 regc(0x0a);
1852 else
1853 regmbc(startc);
1854 }
1855 else
1856 {
1857 startc = backslash_trans(*regparse++);
1858 regc(startc);
1859 }
1860 }
1861 else if (*regparse == '[')
1862 {
1863 int c_class;
1864 int cu;
1865
1866 c_class = get_char_class(&regparse);
1867 startc = -1;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001868 // Characters assumed to be 8 bits!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001869 switch (c_class)
1870 {
1871 case CLASS_NONE:
1872 c_class = get_equi_class(&regparse);
1873 if (c_class != 0)
1874 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001875 // produce equivalence class
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001876 reg_equi_class(c_class);
1877 }
1878 else if ((c_class =
1879 get_coll_element(&regparse)) != 0)
1880 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001881 // produce a collating element
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001882 regmbc(c_class);
1883 }
1884 else
1885 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001886 // literal '[', allow [[-x] as a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001887 startc = *regparse++;
1888 regc(startc);
1889 }
1890 break;
1891 case CLASS_ALNUM:
1892 for (cu = 1; cu < 128; cu++)
1893 if (isalnum(cu))
1894 regmbc(cu);
1895 break;
1896 case CLASS_ALPHA:
1897 for (cu = 1; cu < 128; cu++)
1898 if (isalpha(cu))
1899 regmbc(cu);
1900 break;
1901 case CLASS_BLANK:
1902 regc(' ');
1903 regc('\t');
1904 break;
1905 case CLASS_CNTRL:
1906 for (cu = 1; cu <= 127; cu++)
1907 if (iscntrl(cu))
1908 regmbc(cu);
1909 break;
1910 case CLASS_DIGIT:
1911 for (cu = 1; cu <= 127; cu++)
1912 if (VIM_ISDIGIT(cu))
1913 regmbc(cu);
1914 break;
1915 case CLASS_GRAPH:
1916 for (cu = 1; cu <= 127; cu++)
1917 if (isgraph(cu))
1918 regmbc(cu);
1919 break;
1920 case CLASS_LOWER:
1921 for (cu = 1; cu <= 255; cu++)
1922 if (MB_ISLOWER(cu) && cu != 170
1923 && cu != 186)
1924 regmbc(cu);
1925 break;
1926 case CLASS_PRINT:
1927 for (cu = 1; cu <= 255; cu++)
1928 if (vim_isprintc(cu))
1929 regmbc(cu);
1930 break;
1931 case CLASS_PUNCT:
1932 for (cu = 1; cu < 128; cu++)
1933 if (ispunct(cu))
1934 regmbc(cu);
1935 break;
1936 case CLASS_SPACE:
1937 for (cu = 9; cu <= 13; cu++)
1938 regc(cu);
1939 regc(' ');
1940 break;
1941 case CLASS_UPPER:
1942 for (cu = 1; cu <= 255; cu++)
1943 if (MB_ISUPPER(cu))
1944 regmbc(cu);
1945 break;
1946 case CLASS_XDIGIT:
1947 for (cu = 1; cu <= 255; cu++)
1948 if (vim_isxdigit(cu))
1949 regmbc(cu);
1950 break;
1951 case CLASS_TAB:
1952 regc('\t');
1953 break;
1954 case CLASS_RETURN:
1955 regc('\r');
1956 break;
1957 case CLASS_BACKSPACE:
1958 regc('\b');
1959 break;
1960 case CLASS_ESCAPE:
1961 regc('\033');
1962 break;
1963 case CLASS_IDENT:
1964 for (cu = 1; cu <= 255; cu++)
1965 if (vim_isIDc(cu))
1966 regmbc(cu);
1967 break;
1968 case CLASS_KEYWORD:
1969 for (cu = 1; cu <= 255; cu++)
1970 if (reg_iswordc(cu))
1971 regmbc(cu);
1972 break;
1973 case CLASS_FNAME:
1974 for (cu = 1; cu <= 255; cu++)
1975 if (vim_isfilec(cu))
1976 regmbc(cu);
1977 break;
1978 }
1979 }
1980 else
1981 {
1982 if (has_mbyte)
1983 {
1984 int len;
1985
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001986 // produce a multibyte character, including any
1987 // following composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001988 startc = mb_ptr2char(regparse);
1989 len = (*mb_ptr2len)(regparse);
1990 if (enc_utf8 && utf_char2len(startc) != len)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001991 startc = -1; // composing chars
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001992 while (--len >= 0)
1993 regc(*regparse++);
1994 }
1995 else
1996 {
1997 startc = *regparse++;
1998 regc(startc);
1999 }
2000 }
2001 }
2002 regc(NUL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002003 prevchr_len = 1; // last char was the ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002004 if (*regparse != ']')
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00002005 EMSG_RET_NULL(_(e_too_many_brackets)); // Cannot happen?
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002006 skipchr(); // let's be friends with the lexer again
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002007 *flagp |= HASWIDTH | SIMPLE;
2008 break;
2009 }
2010 else if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00002011 EMSG2_RET_NULL(_(e_missing_rsb_after_str_lsb),
2012 reg_magic > MAGIC_OFF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002013 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002014 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002015
2016 default:
2017 {
2018 int len;
2019
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002020 // A multi-byte character is handled as a separate atom if it's
2021 // before a multi and when it's a composing char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002022 if (use_multibytecode(c))
2023 {
2024do_multibyte:
2025 ret = regnode(MULTIBYTECODE);
2026 regmbc(c);
2027 *flagp |= HASWIDTH | SIMPLE;
2028 break;
2029 }
2030
2031 ret = regnode(EXACTLY);
2032
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002033 // Append characters as long as:
2034 // - there is no following multi, we then need the character in
2035 // front of it as a single character operand
2036 // - not running into a Magic character
2037 // - "one_exactly" is not set
2038 // But always emit at least one character. Might be a Multi,
2039 // e.g., a "[" without matching "]".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002040 for (len = 0; c != NUL && (len == 0
2041 || (re_multi_type(peekchr()) == NOT_MULTI
2042 && !one_exactly
2043 && !is_Magic(c))); ++len)
2044 {
2045 c = no_Magic(c);
2046 if (has_mbyte)
2047 {
2048 regmbc(c);
2049 if (enc_utf8)
2050 {
2051 int l;
2052
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002053 // Need to get composing character too.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002054 for (;;)
2055 {
2056 l = utf_ptr2len(regparse);
2057 if (!UTF_COMPOSINGLIKE(regparse, regparse + l))
2058 break;
2059 regmbc(utf_ptr2char(regparse));
2060 skipchr();
2061 }
2062 }
2063 }
2064 else
2065 regc(c);
2066 c = getchr();
2067 }
2068 ungetchr();
2069
2070 regc(NUL);
2071 *flagp |= HASWIDTH;
2072 if (len == 1)
2073 *flagp |= SIMPLE;
2074 }
2075 break;
2076 }
2077
2078 return ret;
2079}
2080
2081/*
2082 * Parse something followed by possible [*+=].
2083 *
2084 * Note that the branching code sequences used for = and the general cases
2085 * of * and + are somewhat optimized: they use the same NOTHING node as
2086 * both the endmarker for their branch list and the body of the last branch.
2087 * It might seem that this node could be dispensed with entirely, but the
2088 * endmarker role is not redundant.
2089 */
2090 static char_u *
2091regpiece(int *flagp)
2092{
2093 char_u *ret;
2094 int op;
2095 char_u *next;
2096 int flags;
2097 long minval;
2098 long maxval;
2099
2100 ret = regatom(&flags);
2101 if (ret == NULL)
2102 return NULL;
2103
2104 op = peekchr();
2105 if (re_multi_type(op) == NOT_MULTI)
2106 {
2107 *flagp = flags;
2108 return ret;
2109 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002110 // default flags
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002111 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
2112
2113 skipchr();
2114 switch (op)
2115 {
2116 case Magic('*'):
2117 if (flags & SIMPLE)
2118 reginsert(STAR, ret);
2119 else
2120 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002121 // Emit x* as (x&|), where & means "self".
2122 reginsert(BRANCH, ret); // Either x
2123 regoptail(ret, regnode(BACK)); // and loop
2124 regoptail(ret, ret); // back
2125 regtail(ret, regnode(BRANCH)); // or
2126 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002127 }
2128 break;
2129
2130 case Magic('+'):
2131 if (flags & SIMPLE)
2132 reginsert(PLUS, ret);
2133 else
2134 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002135 // Emit x+ as x(&|), where & means "self".
2136 next = regnode(BRANCH); // Either
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002137 regtail(ret, next);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002138 regtail(regnode(BACK), ret); // loop back
2139 regtail(next, regnode(BRANCH)); // or
2140 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002141 }
2142 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2143 break;
2144
2145 case Magic('@'):
2146 {
2147 int lop = END;
2148 long nr;
2149
2150 nr = getdecchrs();
2151 switch (no_Magic(getchr()))
2152 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002153 case '=': lop = MATCH; break; // \@=
2154 case '!': lop = NOMATCH; break; // \@!
2155 case '>': lop = SUBPAT; break; // \@>
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002156 case '<': switch (no_Magic(getchr()))
2157 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002158 case '=': lop = BEHIND; break; // \@<=
2159 case '!': lop = NOBEHIND; break; // \@<!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002160 }
2161 }
2162 if (lop == END)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002163 EMSG2_RET_NULL(_(e_invalid_character_after_str_at),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002164 reg_magic == MAGIC_ALL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002165 // Look behind must match with behind_pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002166 if (lop == BEHIND || lop == NOBEHIND)
2167 {
2168 regtail(ret, regnode(BHPOS));
2169 *flagp |= HASLOOKBH;
2170 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002171 regtail(ret, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002172 if (lop == BEHIND || lop == NOBEHIND)
2173 {
2174 if (nr < 0)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002175 nr = 0; // no limit is same as zero limit
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002176 reginsert_nr(lop, nr, ret);
2177 }
2178 else
2179 reginsert(lop, ret);
2180 break;
2181 }
2182
2183 case Magic('?'):
2184 case Magic('='):
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002185 // Emit x= as (x|)
2186 reginsert(BRANCH, ret); // Either x
2187 regtail(ret, regnode(BRANCH)); // or
2188 next = regnode(NOTHING); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002189 regtail(ret, next);
2190 regoptail(ret, next);
2191 break;
2192
2193 case Magic('{'):
2194 if (!read_limits(&minval, &maxval))
2195 return NULL;
2196 if (flags & SIMPLE)
2197 {
2198 reginsert(BRACE_SIMPLE, ret);
2199 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2200 }
2201 else
2202 {
2203 if (num_complex_braces >= 10)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002204 EMSG2_RET_NULL(_(e_too_many_complex_str_curly),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002205 reg_magic == MAGIC_ALL);
2206 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
2207 regoptail(ret, regnode(BACK));
2208 regoptail(ret, ret);
2209 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2210 ++num_complex_braces;
2211 }
2212 if (minval > 0 && maxval > 0)
2213 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2214 break;
2215 }
2216 if (re_multi_type(peekchr()) != NOT_MULTI)
2217 {
2218 // Can't have a multi follow a multi.
2219 if (peekchr() == Magic('*'))
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00002220 EMSG2_RET_NULL(_(e_nested_str), reg_magic >= MAGIC_ON);
2221 EMSG3_RET_NULL(_(e_nested_str_chr), reg_magic == MAGIC_ALL,
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002222 no_Magic(peekchr()));
2223 }
2224
2225 return ret;
2226}
2227
2228/*
2229 * Parse one alternative of an | or & operator.
2230 * Implements the concatenation operator.
2231 */
2232 static char_u *
2233regconcat(int *flagp)
2234{
2235 char_u *first = NULL;
2236 char_u *chain = NULL;
2237 char_u *latest;
2238 int flags;
2239 int cont = TRUE;
2240
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002241 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002242
2243 while (cont)
2244 {
2245 switch (peekchr())
2246 {
2247 case NUL:
2248 case Magic('|'):
2249 case Magic('&'):
2250 case Magic(')'):
2251 cont = FALSE;
2252 break;
2253 case Magic('Z'):
2254 regflags |= RF_ICOMBINE;
2255 skipchr_keepstart();
2256 break;
2257 case Magic('c'):
2258 regflags |= RF_ICASE;
2259 skipchr_keepstart();
2260 break;
2261 case Magic('C'):
2262 regflags |= RF_NOICASE;
2263 skipchr_keepstart();
2264 break;
2265 case Magic('v'):
2266 reg_magic = MAGIC_ALL;
2267 skipchr_keepstart();
2268 curchr = -1;
2269 break;
2270 case Magic('m'):
2271 reg_magic = MAGIC_ON;
2272 skipchr_keepstart();
2273 curchr = -1;
2274 break;
2275 case Magic('M'):
2276 reg_magic = MAGIC_OFF;
2277 skipchr_keepstart();
2278 curchr = -1;
2279 break;
2280 case Magic('V'):
2281 reg_magic = MAGIC_NONE;
2282 skipchr_keepstart();
2283 curchr = -1;
2284 break;
2285 default:
2286 latest = regpiece(&flags);
2287 if (latest == NULL || reg_toolong)
2288 return NULL;
2289 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002290 if (chain == NULL) // First piece.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002291 *flagp |= flags & SPSTART;
2292 else
2293 regtail(chain, latest);
2294 chain = latest;
2295 if (first == NULL)
2296 first = latest;
2297 break;
2298 }
2299 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002300 if (first == NULL) // Loop ran zero times.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002301 first = regnode(NOTHING);
2302 return first;
2303}
2304
2305/*
2306 * Parse one alternative of an | operator.
2307 * Implements the & operator.
2308 */
2309 static char_u *
2310regbranch(int *flagp)
2311{
2312 char_u *ret;
2313 char_u *chain = NULL;
2314 char_u *latest;
2315 int flags;
2316
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002317 *flagp = WORST | HASNL; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002318
2319 ret = regnode(BRANCH);
2320 for (;;)
2321 {
2322 latest = regconcat(&flags);
2323 if (latest == NULL)
2324 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002325 // If one of the branches has width, the whole thing has. If one of
2326 // the branches anchors at start-of-line, the whole thing does.
2327 // If one of the branches uses look-behind, the whole thing does.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002328 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002329 // If one of the branches doesn't match a line-break, the whole thing
2330 // doesn't.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002331 *flagp &= ~HASNL | (flags & HASNL);
2332 if (chain != NULL)
2333 regtail(chain, latest);
2334 if (peekchr() != Magic('&'))
2335 break;
2336 skipchr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002337 regtail(latest, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002338 if (reg_toolong)
2339 break;
2340 reginsert(MATCH, latest);
2341 chain = latest;
2342 }
2343
2344 return ret;
2345}
2346
2347/*
2348 * Parse regular expression, i.e. main body or parenthesized thing.
2349 *
2350 * Caller must absorb opening parenthesis.
2351 *
2352 * Combining parenthesis handling with the base level of regular expression
2353 * is a trifle forced, but the need to tie the tails of the branches to what
2354 * follows makes it hard to avoid.
2355 */
2356 static char_u *
2357reg(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002358 int paren, // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002359 int *flagp)
2360{
2361 char_u *ret;
2362 char_u *br;
2363 char_u *ender;
2364 int parno = 0;
2365 int flags;
2366
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002367 *flagp = HASWIDTH; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002368
2369#ifdef FEAT_SYN_HL
2370 if (paren == REG_ZPAREN)
2371 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002372 // Make a ZOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002373 if (regnzpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002374 EMSG_RET_NULL(_(e_too_many_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002375 parno = regnzpar;
2376 regnzpar++;
2377 ret = regnode(ZOPEN + parno);
2378 }
2379 else
2380#endif
2381 if (paren == REG_PAREN)
2382 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002383 // Make a MOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002384 if (regnpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002385 EMSG2_RET_NULL(_(e_too_many_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002386 parno = regnpar;
2387 ++regnpar;
2388 ret = regnode(MOPEN + parno);
2389 }
2390 else if (paren == REG_NPAREN)
2391 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002392 // Make a NOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002393 ret = regnode(NOPEN);
2394 }
2395 else
2396 ret = NULL;
2397
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002398 // Pick up the branches, linking them together.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002399 br = regbranch(&flags);
2400 if (br == NULL)
2401 return NULL;
2402 if (ret != NULL)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002403 regtail(ret, br); // [MZ]OPEN -> first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002404 else
2405 ret = br;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002406 // If one of the branches can be zero-width, the whole thing can.
2407 // If one of the branches has * at start or matches a line-break, the
2408 // whole thing can.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002409 if (!(flags & HASWIDTH))
2410 *flagp &= ~HASWIDTH;
2411 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2412 while (peekchr() == Magic('|'))
2413 {
2414 skipchr();
2415 br = regbranch(&flags);
2416 if (br == NULL || reg_toolong)
2417 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002418 regtail(ret, br); // BRANCH -> BRANCH.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002419 if (!(flags & HASWIDTH))
2420 *flagp &= ~HASWIDTH;
2421 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2422 }
2423
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002424 // Make a closing node, and hook it on the end.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002425 ender = regnode(
2426#ifdef FEAT_SYN_HL
2427 paren == REG_ZPAREN ? ZCLOSE + parno :
2428#endif
2429 paren == REG_PAREN ? MCLOSE + parno :
2430 paren == REG_NPAREN ? NCLOSE : END);
2431 regtail(ret, ender);
2432
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002433 // Hook the tails of the branches to the closing node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002434 for (br = ret; br != NULL; br = regnext(br))
2435 regoptail(br, ender);
2436
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002437 // Check for proper termination.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002438 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2439 {
2440#ifdef FEAT_SYN_HL
2441 if (paren == REG_ZPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002442 EMSG_RET_NULL(_(e_unmatched_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002443 else
2444#endif
2445 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002446 EMSG2_RET_NULL(_(e_unmatched_str_percent_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002447 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002448 EMSG2_RET_NULL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002449 }
2450 else if (paren == REG_NOPAREN && peekchr() != NUL)
2451 {
2452 if (curchr == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002453 EMSG2_RET_NULL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002454 else
Bram Moolenaar74409f62022-01-01 15:58:22 +00002455 EMSG_RET_NULL(_(e_trailing_characters)); // "Can't happen".
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002456 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002457 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002458 // Here we set the flag allowing back references to this set of
2459 // parentheses.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002460 if (paren == REG_PAREN)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002461 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002462 return ret;
2463}
2464
2465/*
2466 * bt_regcomp() - compile a regular expression into internal code for the
2467 * traditional back track matcher.
2468 * Returns the program in allocated space. Returns NULL for an error.
2469 *
2470 * We can't allocate space until we know how big the compiled form will be,
2471 * but we can't compile it (and thus know how big it is) until we've got a
2472 * place to put the code. So we cheat: we compile it twice, once with code
2473 * generation turned off and size counting turned on, and once "for real".
2474 * This also means that we don't allocate space until we are sure that the
2475 * thing really will compile successfully, and we never have to move the
2476 * code and thus invalidate pointers into it. (Note that it has to be in
2477 * one piece because vim_free() must be able to free it all.)
2478 *
2479 * Whether upper/lower case is to be ignored is decided when executing the
2480 * program, it does not matter here.
2481 *
2482 * Beware that the optimization-preparation code in here knows about some
2483 * of the structure of the compiled regexp.
2484 * "re_flags": RE_MAGIC and/or RE_STRING.
2485 */
2486 static regprog_T *
2487bt_regcomp(char_u *expr, int re_flags)
2488{
2489 bt_regprog_T *r;
2490 char_u *scan;
2491 char_u *longest;
2492 int len;
2493 int flags;
2494
2495 if (expr == NULL)
Bram Moolenaare29a27f2021-07-20 21:07:36 +02002496 IEMSG_RET_NULL(_(e_null_argument));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002497
2498 init_class_tab();
2499
2500 // First pass: determine size, legality.
2501 regcomp_start(expr, re_flags);
2502 regcode = JUST_CALC_SIZE;
2503 regc(REGMAGIC);
2504 if (reg(REG_NOPAREN, &flags) == NULL)
2505 return NULL;
2506
2507 // Allocate space.
2508 r = alloc(offsetof(bt_regprog_T, program) + regsize);
2509 if (r == NULL)
2510 return NULL;
2511 r->re_in_use = FALSE;
2512
2513 // Second pass: emit code.
2514 regcomp_start(expr, re_flags);
2515 regcode = r->program;
2516 regc(REGMAGIC);
2517 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
2518 {
2519 vim_free(r);
2520 if (reg_toolong)
Bram Moolenaareaaac012022-01-02 17:00:40 +00002521 EMSG_RET_NULL(_(e_pattern_too_long));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002522 return NULL;
2523 }
2524
2525 // Dig out information for optimizations.
2526 r->regstart = NUL; // Worst-case defaults.
2527 r->reganch = 0;
2528 r->regmust = NULL;
2529 r->regmlen = 0;
2530 r->regflags = regflags;
2531 if (flags & HASNL)
2532 r->regflags |= RF_HASNL;
2533 if (flags & HASLOOKBH)
2534 r->regflags |= RF_LOOKBH;
2535#ifdef FEAT_SYN_HL
2536 // Remember whether this pattern has any \z specials in it.
2537 r->reghasz = re_has_z;
2538#endif
2539 scan = r->program + 1; // First BRANCH.
2540 if (OP(regnext(scan)) == END) // Only one top-level choice.
2541 {
2542 scan = OPERAND(scan);
2543
2544 // Starting-point info.
2545 if (OP(scan) == BOL || OP(scan) == RE_BOF)
2546 {
2547 r->reganch++;
2548 scan = regnext(scan);
2549 }
2550
2551 if (OP(scan) == EXACTLY)
2552 {
2553 if (has_mbyte)
2554 r->regstart = (*mb_ptr2char)(OPERAND(scan));
2555 else
2556 r->regstart = *OPERAND(scan);
2557 }
2558 else if ((OP(scan) == BOW
2559 || OP(scan) == EOW
2560 || OP(scan) == NOTHING
2561 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
2562 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
2563 && OP(regnext(scan)) == EXACTLY)
2564 {
2565 if (has_mbyte)
2566 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
2567 else
2568 r->regstart = *OPERAND(regnext(scan));
2569 }
2570
2571 // If there's something expensive in the r.e., find the longest
2572 // literal string that must appear and make it the regmust. Resolve
2573 // ties in favor of later strings, since the regstart check works
2574 // with the beginning of the r.e. and avoiding duplication
2575 // strengthens checking. Not a strong reason, but sufficient in the
2576 // absence of others.
2577
2578 // When the r.e. starts with BOW, it is faster to look for a regmust
2579 // first. Used a lot for "#" and "*" commands. (Added by mool).
2580 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
2581 && !(flags & HASNL))
2582 {
2583 longest = NULL;
2584 len = 0;
2585 for (; scan != NULL; scan = regnext(scan))
2586 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
2587 {
2588 longest = OPERAND(scan);
2589 len = (int)STRLEN(OPERAND(scan));
2590 }
2591 r->regmust = longest;
2592 r->regmlen = len;
2593 }
2594 }
2595#ifdef BT_REGEXP_DUMP
2596 regdump(expr, r);
2597#endif
2598 r->engine = &bt_regengine;
2599 return (regprog_T *)r;
2600}
2601
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Check if during the previous call to vim_regcomp the EOL item "$" has been
 * found.  This is messy, but it works fine.
 * Returns the current value of the file-level "had_eol" flag.
 */
    int
vim_regcomp_had_eol(void)
{
    return had_eol;
}
#endif
2613
2614/*
2615 * Get a number after a backslash that is inside [].
2616 * When nothing is recognized return a backslash.
2617 */
2618 static int
2619coll_get_char(void)
2620{
2621 long nr = -1;
2622
2623 switch (*regparse++)
2624 {
2625 case 'd': nr = getdecchrs(); break;
2626 case 'o': nr = getoctchrs(); break;
2627 case 'x': nr = gethexchrs(2); break;
2628 case 'u': nr = gethexchrs(4); break;
2629 case 'U': nr = gethexchrs(8); break;
2630 }
2631 if (nr < 0 || nr > INT_MAX)
2632 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002633 // If getting the number fails be backwards compatible: the character
2634 // is a backslash.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002635 --regparse;
2636 nr = '\\';
2637 }
2638 return nr;
2639}
2640
2641/*
2642 * Free a compiled regexp program, returned by bt_regcomp().
2643 */
2644 static void
2645bt_regfree(regprog_T *prog)
2646{
2647 vim_free(prog);
2648}
2649
// Advance rex.input with MB_PTR_ADV().
#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)
2651
/*
 * The arguments from BRACE_LIMITS are stored here.  They are actually local
 * to regmatch(), but they are here to reduce the amount of stack space used
 * (it can be called recursively many times).
 */
static long	bl_minval;
static long	bl_maxval;
2659
2660/*
2661 * Save the input line and position in a regsave_T.
2662 */
2663 static void
2664reg_save(regsave_T *save, garray_T *gap)
2665{
2666 if (REG_MULTI)
2667 {
2668 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
2669 save->rs_u.pos.lnum = rex.lnum;
2670 }
2671 else
2672 save->rs_u.ptr = rex.input;
2673 save->rs_len = gap->ga_len;
2674}
2675
2676/*
2677 * Restore the input line and position from a regsave_T.
2678 */
2679 static void
2680reg_restore(regsave_T *save, garray_T *gap)
2681{
2682 if (REG_MULTI)
2683 {
2684 if (rex.lnum != save->rs_u.pos.lnum)
2685 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002686 // only call reg_getline() when the line number changed to save
2687 // a bit of time
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002688 rex.lnum = save->rs_u.pos.lnum;
2689 rex.line = reg_getline(rex.lnum);
2690 }
2691 rex.input = rex.line + save->rs_u.pos.col;
2692 }
2693 else
2694 rex.input = save->rs_u.ptr;
2695 gap->ga_len = save->rs_len;
2696}
2697
2698/*
2699 * Return TRUE if current position is equal to saved position.
2700 */
2701 static int
2702reg_save_equal(regsave_T *save)
2703{
2704 if (REG_MULTI)
2705 return rex.lnum == save->rs_u.pos.lnum
2706 && rex.input == rex.line + save->rs_u.pos.col;
2707 return rex.input == save->rs_u.ptr;
2708}
2709
// Save the sub-expressions before attempting a match.
// Dispatches to save_se_multi() (position) when REG_MULTI is set and to
// save_se_one() (pointer) otherwise.  The expansion is parenthesized so the
// conditional cannot bind to neighbouring operators at the call site.
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))
2713
// After a failed match restore the sub-expressions.
// Writes the saved position back through "posp" when REG_MULTI is set,
// otherwise the saved pointer back through "pp".
#define restore_se(savep, posp, pp) { \
    if (REG_MULTI) \
        *(posp) = (savep)->se_u.pos; \
    else \
        *(pp) = (savep)->se_u.ptr; }
2720
2721/*
2722 * Tentatively set the sub-expression start to the current position (after
2723 * calling regmatch() they will have changed). Need to save the existing
2724 * values for when there is no match.
2725 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
2726 * depending on REG_MULTI.
2727 */
2728 static void
2729save_se_multi(save_se_T *savep, lpos_T *posp)
2730{
2731 savep->se_u.pos = *posp;
2732 posp->lnum = rex.lnum;
2733 posp->col = (colnr_T)(rex.input - rex.line);
2734}
2735
2736 static void
2737save_se_one(save_se_T *savep, char_u **pp)
2738{
2739 savep->se_u.ptr = *pp;
2740 *pp = rex.input;
2741}
2742
2743/*
2744 * regrepeat - repeatedly match something simple, return how many.
2745 * Advances rex.input (and rex.lnum) to just after the matched chars.
2746 */
2747 static int
2748regrepeat(
2749 char_u *p,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002750 long maxcount) // maximum number of matches allowed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002751{
2752 long count = 0;
2753 char_u *scan;
2754 char_u *opnd;
2755 int mask;
2756 int testval = 0;
2757
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002758 scan = rex.input; // Make local copy of rex.input for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002759 opnd = OPERAND(p);
2760 switch (OP(p))
2761 {
2762 case ANY:
2763 case ANY + ADD_NL:
2764 while (count < maxcount)
2765 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002766 // Matching anything means we continue until end-of-line (or
2767 // end-of-file for ANY + ADD_NL), only limited by maxcount.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002768 while (*scan != NUL && count < maxcount)
2769 {
2770 ++count;
2771 MB_PTR_ADV(scan);
2772 }
2773 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2774 || rex.reg_line_lbr || count == maxcount)
2775 break;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002776 ++count; // count the line-break
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002777 reg_nextline();
2778 scan = rex.input;
2779 if (got_int)
2780 break;
2781 }
2782 break;
2783
2784 case IDENT:
2785 case IDENT + ADD_NL:
2786 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002787 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002788 case SIDENT:
2789 case SIDENT + ADD_NL:
2790 while (count < maxcount)
2791 {
2792 if (vim_isIDc(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2793 {
2794 MB_PTR_ADV(scan);
2795 }
2796 else if (*scan == NUL)
2797 {
2798 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2799 || rex.reg_line_lbr)
2800 break;
2801 reg_nextline();
2802 scan = rex.input;
2803 if (got_int)
2804 break;
2805 }
2806 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2807 ++scan;
2808 else
2809 break;
2810 ++count;
2811 }
2812 break;
2813
2814 case KWORD:
2815 case KWORD + ADD_NL:
2816 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002817 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002818 case SKWORD:
2819 case SKWORD + ADD_NL:
2820 while (count < maxcount)
2821 {
2822 if (vim_iswordp_buf(scan, rex.reg_buf)
2823 && (testval || !VIM_ISDIGIT(*scan)))
2824 {
2825 MB_PTR_ADV(scan);
2826 }
2827 else if (*scan == NUL)
2828 {
2829 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2830 || rex.reg_line_lbr)
2831 break;
2832 reg_nextline();
2833 scan = rex.input;
2834 if (got_int)
2835 break;
2836 }
2837 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2838 ++scan;
2839 else
2840 break;
2841 ++count;
2842 }
2843 break;
2844
2845 case FNAME:
2846 case FNAME + ADD_NL:
2847 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002848 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002849 case SFNAME:
2850 case SFNAME + ADD_NL:
2851 while (count < maxcount)
2852 {
2853 if (vim_isfilec(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2854 {
2855 MB_PTR_ADV(scan);
2856 }
2857 else if (*scan == NUL)
2858 {
2859 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2860 || rex.reg_line_lbr)
2861 break;
2862 reg_nextline();
2863 scan = rex.input;
2864 if (got_int)
2865 break;
2866 }
2867 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2868 ++scan;
2869 else
2870 break;
2871 ++count;
2872 }
2873 break;
2874
2875 case PRINT:
2876 case PRINT + ADD_NL:
2877 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002878 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002879 case SPRINT:
2880 case SPRINT + ADD_NL:
2881 while (count < maxcount)
2882 {
2883 if (*scan == NUL)
2884 {
2885 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2886 || rex.reg_line_lbr)
2887 break;
2888 reg_nextline();
2889 scan = rex.input;
2890 if (got_int)
2891 break;
2892 }
2893 else if (vim_isprintc(PTR2CHAR(scan)) == 1
2894 && (testval || !VIM_ISDIGIT(*scan)))
2895 {
2896 MB_PTR_ADV(scan);
2897 }
2898 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2899 ++scan;
2900 else
2901 break;
2902 ++count;
2903 }
2904 break;
2905
2906 case WHITE:
2907 case WHITE + ADD_NL:
2908 testval = mask = RI_WHITE;
2909do_class:
2910 while (count < maxcount)
2911 {
2912 int l;
2913
2914 if (*scan == NUL)
2915 {
2916 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2917 || rex.reg_line_lbr)
2918 break;
2919 reg_nextline();
2920 scan = rex.input;
2921 if (got_int)
2922 break;
2923 }
2924 else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1)
2925 {
2926 if (testval != 0)
2927 break;
2928 scan += l;
2929 }
2930 else if ((class_tab[*scan] & mask) == testval)
2931 ++scan;
2932 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2933 ++scan;
2934 else
2935 break;
2936 ++count;
2937 }
2938 break;
2939
2940 case NWHITE:
2941 case NWHITE + ADD_NL:
2942 mask = RI_WHITE;
2943 goto do_class;
2944 case DIGIT:
2945 case DIGIT + ADD_NL:
2946 testval = mask = RI_DIGIT;
2947 goto do_class;
2948 case NDIGIT:
2949 case NDIGIT + ADD_NL:
2950 mask = RI_DIGIT;
2951 goto do_class;
2952 case HEX:
2953 case HEX + ADD_NL:
2954 testval = mask = RI_HEX;
2955 goto do_class;
2956 case NHEX:
2957 case NHEX + ADD_NL:
2958 mask = RI_HEX;
2959 goto do_class;
2960 case OCTAL:
2961 case OCTAL + ADD_NL:
2962 testval = mask = RI_OCTAL;
2963 goto do_class;
2964 case NOCTAL:
2965 case NOCTAL + ADD_NL:
2966 mask = RI_OCTAL;
2967 goto do_class;
2968 case WORD:
2969 case WORD + ADD_NL:
2970 testval = mask = RI_WORD;
2971 goto do_class;
2972 case NWORD:
2973 case NWORD + ADD_NL:
2974 mask = RI_WORD;
2975 goto do_class;
2976 case HEAD:
2977 case HEAD + ADD_NL:
2978 testval = mask = RI_HEAD;
2979 goto do_class;
2980 case NHEAD:
2981 case NHEAD + ADD_NL:
2982 mask = RI_HEAD;
2983 goto do_class;
2984 case ALPHA:
2985 case ALPHA + ADD_NL:
2986 testval = mask = RI_ALPHA;
2987 goto do_class;
2988 case NALPHA:
2989 case NALPHA + ADD_NL:
2990 mask = RI_ALPHA;
2991 goto do_class;
2992 case LOWER:
2993 case LOWER + ADD_NL:
2994 testval = mask = RI_LOWER;
2995 goto do_class;
2996 case NLOWER:
2997 case NLOWER + ADD_NL:
2998 mask = RI_LOWER;
2999 goto do_class;
3000 case UPPER:
3001 case UPPER + ADD_NL:
3002 testval = mask = RI_UPPER;
3003 goto do_class;
3004 case NUPPER:
3005 case NUPPER + ADD_NL:
3006 mask = RI_UPPER;
3007 goto do_class;
3008
3009 case EXACTLY:
3010 {
3011 int cu, cl;
3012
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003013 // This doesn't do a multi-byte character, because a MULTIBYTECODE
3014 // would have been used for it. It does handle single-byte
3015 // characters, such as latin1.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003016 if (rex.reg_ic)
3017 {
3018 cu = MB_TOUPPER(*opnd);
3019 cl = MB_TOLOWER(*opnd);
3020 while (count < maxcount && (*scan == cu || *scan == cl))
3021 {
3022 count++;
3023 scan++;
3024 }
3025 }
3026 else
3027 {
3028 cu = *opnd;
3029 while (count < maxcount && *scan == cu)
3030 {
3031 count++;
3032 scan++;
3033 }
3034 }
3035 break;
3036 }
3037
3038 case MULTIBYTECODE:
3039 {
3040 int i, len, cf = 0;
3041
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003042 // Safety check (just in case 'encoding' was changed since
3043 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003044 if ((len = (*mb_ptr2len)(opnd)) > 1)
3045 {
3046 if (rex.reg_ic && enc_utf8)
3047 cf = utf_fold(utf_ptr2char(opnd));
3048 while (count < maxcount && (*mb_ptr2len)(scan) >= len)
3049 {
3050 for (i = 0; i < len; ++i)
3051 if (opnd[i] != scan[i])
3052 break;
3053 if (i < len && (!rex.reg_ic || !enc_utf8
3054 || utf_fold(utf_ptr2char(scan)) != cf))
3055 break;
3056 scan += len;
3057 ++count;
3058 }
3059 }
3060 }
3061 break;
3062
3063 case ANYOF:
3064 case ANYOF + ADD_NL:
3065 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003066 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003067
3068 case ANYBUT:
3069 case ANYBUT + ADD_NL:
3070 while (count < maxcount)
3071 {
3072 int len;
3073
3074 if (*scan == NUL)
3075 {
3076 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
3077 || rex.reg_line_lbr)
3078 break;
3079 reg_nextline();
3080 scan = rex.input;
3081 if (got_int)
3082 break;
3083 }
3084 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
3085 ++scan;
3086 else if (has_mbyte && (len = (*mb_ptr2len)(scan)) > 1)
3087 {
3088 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
3089 break;
3090 scan += len;
3091 }
3092 else
3093 {
3094 if ((cstrchr(opnd, *scan) == NULL) == testval)
3095 break;
3096 ++scan;
3097 }
3098 ++count;
3099 }
3100 break;
3101
3102 case NEWL:
3103 while (count < maxcount
3104 && ((*scan == NUL && rex.lnum <= rex.reg_maxline
3105 && !rex.reg_line_lbr && REG_MULTI)
3106 || (*scan == '\n' && rex.reg_line_lbr)))
3107 {
3108 count++;
3109 if (rex.reg_line_lbr)
3110 ADVANCE_REGINPUT();
3111 else
3112 reg_nextline();
3113 scan = rex.input;
3114 if (got_int)
3115 break;
3116 }
3117 break;
3118
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003119 default: // Oh dear. Called inappropriately.
Bram Moolenaare29a27f2021-07-20 21:07:36 +02003120 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003121#ifdef DEBUG
3122 printf("Called regrepeat with op code %d\n", OP(p));
3123#endif
3124 break;
3125 }
3126
3127 rex.input = scan;
3128
3129 return (int)count;
3130}
3131
3132/*
3133 * Push an item onto the regstack.
3134 * Returns pointer to new item. Returns NULL when out of memory.
3135 */
3136 static regitem_T *
3137regstack_push(regstate_T state, char_u *scan)
3138{
3139 regitem_T *rp;
3140
3141 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
3142 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00003143 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003144 return NULL;
3145 }
3146 if (ga_grow(&regstack, sizeof(regitem_T)) == FAIL)
3147 return NULL;
3148
3149 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
3150 rp->rs_state = state;
3151 rp->rs_scan = scan;
3152
3153 regstack.ga_len += sizeof(regitem_T);
3154 return rp;
3155}
3156
3157/*
3158 * Pop an item from the regstack.
3159 */
3160 static void
3161regstack_pop(char_u **scan)
3162{
3163 regitem_T *rp;
3164
3165 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
3166 *scan = rp->rs_scan;
3167
3168 regstack.ga_len -= sizeof(regitem_T);
3169}
3170
3171/*
3172 * Save the current subexpr to "bp", so that they can be restored
3173 * later by restore_subexpr().
3174 */
3175 static void
3176save_subexpr(regbehind_T *bp)
3177{
3178 int i;
3179
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003180 // When "rex.need_clear_subexpr" is set we don't need to save the values,
3181 // only remember that this flag needs to be set again when restoring.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003182 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
3183 if (!rex.need_clear_subexpr)
3184 {
3185 for (i = 0; i < NSUBEXP; ++i)
3186 {
3187 if (REG_MULTI)
3188 {
3189 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
3190 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
3191 }
3192 else
3193 {
3194 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
3195 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
3196 }
3197 }
3198 }
3199}
3200
3201/*
3202 * Restore the subexpr from "bp".
3203 */
3204 static void
3205restore_subexpr(regbehind_T *bp)
3206{
3207 int i;
3208
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003209 // Only need to restore saved values when they are not to be cleared.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003210 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
3211 if (!rex.need_clear_subexpr)
3212 {
3213 for (i = 0; i < NSUBEXP; ++i)
3214 {
3215 if (REG_MULTI)
3216 {
3217 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
3218 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
3219 }
3220 else
3221 {
3222 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
3223 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
3224 }
3225 }
3226 }
3227}
3228
3229/*
3230 * regmatch - main matching routine
3231 *
3232 * Conceptually the strategy is simple: Check to see whether the current node
3233 * matches, push an item onto the regstack and loop to see whether the rest
3234 * matches, and then act accordingly. In practice we make some effort to
3235 * avoid using the regstack, in particular by going through "ordinary" nodes
3236 * (that don't need to know whether the rest of the match failed) by a nested
3237 * loop.
3238 *
3239 * Returns TRUE when there is a match. Leaves rex.input and rex.lnum just after
3240 * the last matched character.
3241 * Returns FALSE when there is no match. Leaves rex.input and rex.lnum in an
3242 * undefined state!
3243 */
3244 static int
3245regmatch(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003246 char_u *scan, // Current node.
3247 proftime_T *tm UNUSED, // timeout limit or NULL
3248 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003249{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003250 char_u *next; // Next node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003251 int op;
3252 int c;
3253 regitem_T *rp;
3254 int no;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003255 int status; // one of the RA_ values:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003256#ifdef FEAT_RELTIME
3257 int tm_count = 0;
3258#endif
3259
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003260 // Make "regstack" and "backpos" empty. They are allocated and freed in
3261 // bt_regexec_both() to reduce malloc()/free() calls.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003262 regstack.ga_len = 0;
3263 backpos.ga_len = 0;
3264
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003265 // Repeat until "regstack" is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003266 for (;;)
3267 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003268 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3269 // Allow interrupting them with CTRL-C.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003270 fast_breakcheck();
3271
3272#ifdef DEBUG
3273 if (scan != NULL && regnarrate)
3274 {
3275 mch_errmsg((char *)regprop(scan));
3276 mch_errmsg("(\n");
3277 }
3278#endif
3279
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003280 // Repeat for items that can be matched sequentially, without using the
3281 // regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003282 for (;;)
3283 {
3284 if (got_int || scan == NULL)
3285 {
3286 status = RA_FAIL;
3287 break;
3288 }
3289#ifdef FEAT_RELTIME
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003290 // Check for timeout once in a 100 times to avoid overhead.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003291 if (tm != NULL && ++tm_count == 100)
3292 {
3293 tm_count = 0;
3294 if (profile_passed_limit(tm))
3295 {
3296 if (timed_out != NULL)
3297 *timed_out = TRUE;
3298 status = RA_FAIL;
3299 break;
3300 }
3301 }
3302#endif
3303 status = RA_CONT;
3304
3305#ifdef DEBUG
3306 if (regnarrate)
3307 {
3308 mch_errmsg((char *)regprop(scan));
3309 mch_errmsg("...\n");
3310# ifdef FEAT_SYN_HL
3311 if (re_extmatch_in != NULL)
3312 {
3313 int i;
3314
3315 mch_errmsg(_("External submatches:\n"));
3316 for (i = 0; i < NSUBEXP; i++)
3317 {
3318 mch_errmsg(" \"");
3319 if (re_extmatch_in->matches[i] != NULL)
3320 mch_errmsg((char *)re_extmatch_in->matches[i]);
3321 mch_errmsg("\"\n");
3322 }
3323 }
3324# endif
3325 }
3326#endif
3327 next = regnext(scan);
3328
3329 op = OP(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003330 // Check for character class with NL added.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003331 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
3332 && *rex.input == NUL && rex.lnum <= rex.reg_maxline)
3333 {
3334 reg_nextline();
3335 }
3336 else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n')
3337 {
3338 ADVANCE_REGINPUT();
3339 }
3340 else
3341 {
3342 if (WITH_NL(op))
3343 op -= ADD_NL;
3344 if (has_mbyte)
3345 c = (*mb_ptr2char)(rex.input);
3346 else
3347 c = *rex.input;
3348 switch (op)
3349 {
3350 case BOL:
3351 if (rex.input != rex.line)
3352 status = RA_NOMATCH;
3353 break;
3354
3355 case EOL:
3356 if (c != NUL)
3357 status = RA_NOMATCH;
3358 break;
3359
3360 case RE_BOF:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003361 // We're not at the beginning of the file when below the first
3362 // line where we started, not at the start of the line or we
3363 // didn't start at the first line of the buffer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003364 if (rex.lnum != 0 || rex.input != rex.line
3365 || (REG_MULTI && rex.reg_firstlnum > 1))
3366 status = RA_NOMATCH;
3367 break;
3368
3369 case RE_EOF:
3370 if (rex.lnum != rex.reg_maxline || c != NUL)
3371 status = RA_NOMATCH;
3372 break;
3373
3374 case CURSOR:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003375 // Check if the buffer is in a window and compare the
3376 // rex.reg_win->w_cursor position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003377 if (rex.reg_win == NULL
3378 || (rex.lnum + rex.reg_firstlnum
3379 != rex.reg_win->w_cursor.lnum)
3380 || ((colnr_T)(rex.input - rex.line)
3381 != rex.reg_win->w_cursor.col))
3382 status = RA_NOMATCH;
3383 break;
3384
3385 case RE_MARK:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003386 // Compare the mark position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003387 {
3388 int mark = OPERAND(scan)[0];
3389 int cmp = OPERAND(scan)[1];
3390 pos_T *pos;
3391
3392 pos = getmark_buf(rex.reg_buf, mark, FALSE);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003393 if (pos == NULL // mark doesn't exist
Bram Moolenaar872bee52021-05-24 22:56:15 +02003394 || pos->lnum <= 0) // mark isn't set in reg_buf
3395 {
3396 status = RA_NOMATCH;
3397 }
3398 else
3399 {
3400 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3401 && pos->col == MAXCOL
3402 ? (colnr_T)STRLEN(reg_getline(
3403 pos->lnum - rex.reg_firstlnum))
3404 : pos->col;
3405
3406 if ((pos->lnum == rex.lnum + rex.reg_firstlnum
3407 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003408 ? (cmp == '<' || cmp == '>')
Bram Moolenaar872bee52021-05-24 22:56:15 +02003409 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003410 ? cmp != '>'
3411 : cmp != '<'))
3412 : (pos->lnum < rex.lnum + rex.reg_firstlnum
3413 ? cmp != '>'
3414 : cmp != '<')))
3415 status = RA_NOMATCH;
Bram Moolenaar872bee52021-05-24 22:56:15 +02003416 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003417 }
3418 break;
3419
3420 case RE_VISUAL:
3421 if (!reg_match_visual())
3422 status = RA_NOMATCH;
3423 break;
3424
3425 case RE_LNUM:
3426 if (!REG_MULTI || !re_num_cmp((long_u)(rex.lnum + rex.reg_firstlnum),
3427 scan))
3428 status = RA_NOMATCH;
3429 break;
3430
3431 case RE_COL:
3432 if (!re_num_cmp((long_u)(rex.input - rex.line) + 1, scan))
3433 status = RA_NOMATCH;
3434 break;
3435
3436 case RE_VCOL:
3437 if (!re_num_cmp((long_u)win_linetabsize(
3438 rex.reg_win == NULL ? curwin : rex.reg_win,
3439 rex.line, (colnr_T)(rex.input - rex.line)) + 1, scan))
3440 status = RA_NOMATCH;
3441 break;
3442
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003443 case BOW: // \<word; rex.input points to w
3444 if (c == NUL) // Can't match at end of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003445 status = RA_NOMATCH;
3446 else if (has_mbyte)
3447 {
3448 int this_class;
3449
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003450 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003451 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3452 if (this_class <= 1)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003453 status = RA_NOMATCH; // not on a word at all
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003454 else if (reg_prev_class() == this_class)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003455 status = RA_NOMATCH; // previous char is in same word
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003456 }
3457 else
3458 {
3459 if (!vim_iswordc_buf(c, rex.reg_buf) || (rex.input > rex.line
3460 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
3461 status = RA_NOMATCH;
3462 }
3463 break;
3464
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003465 case EOW: // word\>; rex.input points after d
3466 if (rex.input == rex.line) // Can't match at start of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003467 status = RA_NOMATCH;
3468 else if (has_mbyte)
3469 {
3470 int this_class, prev_class;
3471
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003472 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003473 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3474 prev_class = reg_prev_class();
3475 if (this_class == prev_class
3476 || prev_class == 0 || prev_class == 1)
3477 status = RA_NOMATCH;
3478 }
3479 else
3480 {
3481 if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
3482 || (rex.input[0] != NUL
3483 && vim_iswordc_buf(c, rex.reg_buf)))
3484 status = RA_NOMATCH;
3485 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003486 break; // Matched with EOW
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003487
3488 case ANY:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003489 // ANY does not match new lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003490 if (c == NUL)
3491 status = RA_NOMATCH;
3492 else
3493 ADVANCE_REGINPUT();
3494 break;
3495
3496 case IDENT:
3497 if (!vim_isIDc(c))
3498 status = RA_NOMATCH;
3499 else
3500 ADVANCE_REGINPUT();
3501 break;
3502
3503 case SIDENT:
3504 if (VIM_ISDIGIT(*rex.input) || !vim_isIDc(c))
3505 status = RA_NOMATCH;
3506 else
3507 ADVANCE_REGINPUT();
3508 break;
3509
3510 case KWORD:
3511 if (!vim_iswordp_buf(rex.input, rex.reg_buf))
3512 status = RA_NOMATCH;
3513 else
3514 ADVANCE_REGINPUT();
3515 break;
3516
3517 case SKWORD:
3518 if (VIM_ISDIGIT(*rex.input)
3519 || !vim_iswordp_buf(rex.input, rex.reg_buf))
3520 status = RA_NOMATCH;
3521 else
3522 ADVANCE_REGINPUT();
3523 break;
3524
3525 case FNAME:
3526 if (!vim_isfilec(c))
3527 status = RA_NOMATCH;
3528 else
3529 ADVANCE_REGINPUT();
3530 break;
3531
3532 case SFNAME:
3533 if (VIM_ISDIGIT(*rex.input) || !vim_isfilec(c))
3534 status = RA_NOMATCH;
3535 else
3536 ADVANCE_REGINPUT();
3537 break;
3538
3539 case PRINT:
3540 if (!vim_isprintc(PTR2CHAR(rex.input)))
3541 status = RA_NOMATCH;
3542 else
3543 ADVANCE_REGINPUT();
3544 break;
3545
3546 case SPRINT:
3547 if (VIM_ISDIGIT(*rex.input) || !vim_isprintc(PTR2CHAR(rex.input)))
3548 status = RA_NOMATCH;
3549 else
3550 ADVANCE_REGINPUT();
3551 break;
3552
3553 case WHITE:
3554 if (!VIM_ISWHITE(c))
3555 status = RA_NOMATCH;
3556 else
3557 ADVANCE_REGINPUT();
3558 break;
3559
3560 case NWHITE:
3561 if (c == NUL || VIM_ISWHITE(c))
3562 status = RA_NOMATCH;
3563 else
3564 ADVANCE_REGINPUT();
3565 break;
3566
3567 case DIGIT:
3568 if (!ri_digit(c))
3569 status = RA_NOMATCH;
3570 else
3571 ADVANCE_REGINPUT();
3572 break;
3573
3574 case NDIGIT:
3575 if (c == NUL || ri_digit(c))
3576 status = RA_NOMATCH;
3577 else
3578 ADVANCE_REGINPUT();
3579 break;
3580
3581 case HEX:
3582 if (!ri_hex(c))
3583 status = RA_NOMATCH;
3584 else
3585 ADVANCE_REGINPUT();
3586 break;
3587
3588 case NHEX:
3589 if (c == NUL || ri_hex(c))
3590 status = RA_NOMATCH;
3591 else
3592 ADVANCE_REGINPUT();
3593 break;
3594
3595 case OCTAL:
3596 if (!ri_octal(c))
3597 status = RA_NOMATCH;
3598 else
3599 ADVANCE_REGINPUT();
3600 break;
3601
3602 case NOCTAL:
3603 if (c == NUL || ri_octal(c))
3604 status = RA_NOMATCH;
3605 else
3606 ADVANCE_REGINPUT();
3607 break;
3608
3609 case WORD:
3610 if (!ri_word(c))
3611 status = RA_NOMATCH;
3612 else
3613 ADVANCE_REGINPUT();
3614 break;
3615
3616 case NWORD:
3617 if (c == NUL || ri_word(c))
3618 status = RA_NOMATCH;
3619 else
3620 ADVANCE_REGINPUT();
3621 break;
3622
3623 case HEAD:
3624 if (!ri_head(c))
3625 status = RA_NOMATCH;
3626 else
3627 ADVANCE_REGINPUT();
3628 break;
3629
3630 case NHEAD:
3631 if (c == NUL || ri_head(c))
3632 status = RA_NOMATCH;
3633 else
3634 ADVANCE_REGINPUT();
3635 break;
3636
3637 case ALPHA:
3638 if (!ri_alpha(c))
3639 status = RA_NOMATCH;
3640 else
3641 ADVANCE_REGINPUT();
3642 break;
3643
3644 case NALPHA:
3645 if (c == NUL || ri_alpha(c))
3646 status = RA_NOMATCH;
3647 else
3648 ADVANCE_REGINPUT();
3649 break;
3650
3651 case LOWER:
3652 if (!ri_lower(c))
3653 status = RA_NOMATCH;
3654 else
3655 ADVANCE_REGINPUT();
3656 break;
3657
3658 case NLOWER:
3659 if (c == NUL || ri_lower(c))
3660 status = RA_NOMATCH;
3661 else
3662 ADVANCE_REGINPUT();
3663 break;
3664
3665 case UPPER:
3666 if (!ri_upper(c))
3667 status = RA_NOMATCH;
3668 else
3669 ADVANCE_REGINPUT();
3670 break;
3671
3672 case NUPPER:
3673 if (c == NUL || ri_upper(c))
3674 status = RA_NOMATCH;
3675 else
3676 ADVANCE_REGINPUT();
3677 break;
3678
3679 case EXACTLY:
3680 {
3681 int len;
3682 char_u *opnd;
3683
3684 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003685 // Inline the first byte, for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003686 if (*opnd != *rex.input
3687 && (!rex.reg_ic
3688 || (!enc_utf8
3689 && MB_TOLOWER(*opnd) != MB_TOLOWER(*rex.input))))
3690 status = RA_NOMATCH;
3691 else if (*opnd == NUL)
3692 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003693 // match empty string always works; happens when "~" is
3694 // empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003695 }
3696 else
3697 {
3698 if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic))
3699 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003700 len = 1; // matched a single byte above
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003701 }
3702 else
3703 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003704 // Need to match first byte again for multi-byte.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003705 len = (int)STRLEN(opnd);
3706 if (cstrncmp(opnd, rex.input, &len) != 0)
3707 status = RA_NOMATCH;
3708 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003709 // Check for following composing character, unless %C
3710 // follows (skips over all composing chars).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003711 if (status != RA_NOMATCH
3712 && enc_utf8
3713 && UTF_COMPOSINGLIKE(rex.input, rex.input + len)
3714 && !rex.reg_icombine
3715 && OP(next) != RE_COMPOSING)
3716 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003717 // raaron: This code makes a composing character get
3718 // ignored, which is the correct behavior (sometimes)
3719 // for voweled Hebrew texts.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003720 status = RA_NOMATCH;
3721 }
3722 if (status != RA_NOMATCH)
3723 rex.input += len;
3724 }
3725 }
3726 break;
3727
3728 case ANYOF:
3729 case ANYBUT:
3730 if (c == NUL)
3731 status = RA_NOMATCH;
3732 else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3733 status = RA_NOMATCH;
3734 else
3735 ADVANCE_REGINPUT();
3736 break;
3737
3738 case MULTIBYTECODE:
3739 if (has_mbyte)
3740 {
3741 int i, len;
3742 char_u *opnd;
3743 int opndc = 0, inpc;
3744
3745 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003746 // Safety check (just in case 'encoding' was changed since
3747 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003748 if ((len = (*mb_ptr2len)(opnd)) < 2)
3749 {
3750 status = RA_NOMATCH;
3751 break;
3752 }
3753 if (enc_utf8)
3754 opndc = utf_ptr2char(opnd);
3755 if (enc_utf8 && utf_iscomposing(opndc))
3756 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003757 // When only a composing char is given match at any
3758 // position where that composing char appears.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003759 status = RA_NOMATCH;
3760 for (i = 0; rex.input[i] != NUL;
3761 i += utf_ptr2len(rex.input + i))
3762 {
3763 inpc = utf_ptr2char(rex.input + i);
3764 if (!utf_iscomposing(inpc))
3765 {
3766 if (i > 0)
3767 break;
3768 }
3769 else if (opndc == inpc)
3770 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003771 // Include all following composing chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003772 len = i + utfc_ptr2len(rex.input + i);
3773 status = RA_MATCH;
3774 break;
3775 }
3776 }
3777 }
3778 else
3779 for (i = 0; i < len; ++i)
3780 if (opnd[i] != rex.input[i])
3781 {
3782 status = RA_NOMATCH;
3783 break;
3784 }
3785 rex.input += len;
3786 }
3787 else
3788 status = RA_NOMATCH;
3789 break;
3790 case RE_COMPOSING:
3791 if (enc_utf8)
3792 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003793 // Skip composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003794 while (utf_iscomposing(utf_ptr2char(rex.input)))
3795 MB_CPTR_ADV(rex.input);
3796 }
3797 break;
3798
3799 case NOTHING:
3800 break;
3801
3802 case BACK:
3803 {
3804 int i;
3805 backpos_T *bp;
3806
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003807 // When we run into BACK we need to check if we don't keep
3808 // looping without matching any input. The second and later
3809 // times a BACK is encountered it fails if the input is still
3810 // at the same position as the previous time.
3811 // The positions are stored in "backpos" and found by the
3812 // current value of "scan", the position in the RE program.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003813 bp = (backpos_T *)backpos.ga_data;
3814 for (i = 0; i < backpos.ga_len; ++i)
3815 if (bp[i].bp_scan == scan)
3816 break;
3817 if (i == backpos.ga_len)
3818 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003819 // First time at this BACK, make room to store the pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003820 if (ga_grow(&backpos, 1) == FAIL)
3821 status = RA_FAIL;
3822 else
3823 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003824 // get "ga_data" again, it may have changed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003825 bp = (backpos_T *)backpos.ga_data;
3826 bp[i].bp_scan = scan;
3827 ++backpos.ga_len;
3828 }
3829 }
3830 else if (reg_save_equal(&bp[i].bp_pos))
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003831 // Still at same position as last time, fail.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003832 status = RA_NOMATCH;
3833
3834 if (status != RA_FAIL && status != RA_NOMATCH)
3835 reg_save(&bp[i].bp_pos, &backpos);
3836 }
3837 break;
3838
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003839 case MOPEN + 0: // Match start: \zs
3840 case MOPEN + 1: // \(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003841 case MOPEN + 2:
3842 case MOPEN + 3:
3843 case MOPEN + 4:
3844 case MOPEN + 5:
3845 case MOPEN + 6:
3846 case MOPEN + 7:
3847 case MOPEN + 8:
3848 case MOPEN + 9:
3849 {
3850 no = op - MOPEN;
3851 cleanup_subexpr();
3852 rp = regstack_push(RS_MOPEN, scan);
3853 if (rp == NULL)
3854 status = RA_FAIL;
3855 else
3856 {
3857 rp->rs_no = no;
3858 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
3859 &rex.reg_startp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003860 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003861 }
3862 }
3863 break;
3864
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003865 case NOPEN: // \%(
3866 case NCLOSE: // \) after \%(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003867 if (regstack_push(RS_NOPEN, scan) == NULL)
3868 status = RA_FAIL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003869 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003870 break;
3871
3872#ifdef FEAT_SYN_HL
3873 case ZOPEN + 1:
3874 case ZOPEN + 2:
3875 case ZOPEN + 3:
3876 case ZOPEN + 4:
3877 case ZOPEN + 5:
3878 case ZOPEN + 6:
3879 case ZOPEN + 7:
3880 case ZOPEN + 8:
3881 case ZOPEN + 9:
3882 {
3883 no = op - ZOPEN;
3884 cleanup_zsubexpr();
3885 rp = regstack_push(RS_ZOPEN, scan);
3886 if (rp == NULL)
3887 status = RA_FAIL;
3888 else
3889 {
3890 rp->rs_no = no;
3891 save_se(&rp->rs_un.sesave, &reg_startzpos[no],
3892 &reg_startzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003893 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003894 }
3895 }
3896 break;
3897#endif
3898
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003899 case MCLOSE + 0: // Match end: \ze
3900 case MCLOSE + 1: // \)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003901 case MCLOSE + 2:
3902 case MCLOSE + 3:
3903 case MCLOSE + 4:
3904 case MCLOSE + 5:
3905 case MCLOSE + 6:
3906 case MCLOSE + 7:
3907 case MCLOSE + 8:
3908 case MCLOSE + 9:
3909 {
3910 no = op - MCLOSE;
3911 cleanup_subexpr();
3912 rp = regstack_push(RS_MCLOSE, scan);
3913 if (rp == NULL)
3914 status = RA_FAIL;
3915 else
3916 {
3917 rp->rs_no = no;
3918 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no],
3919 &rex.reg_endp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003920 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003921 }
3922 }
3923 break;
3924
3925#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003926 case ZCLOSE + 1: // \) after \z(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003927 case ZCLOSE + 2:
3928 case ZCLOSE + 3:
3929 case ZCLOSE + 4:
3930 case ZCLOSE + 5:
3931 case ZCLOSE + 6:
3932 case ZCLOSE + 7:
3933 case ZCLOSE + 8:
3934 case ZCLOSE + 9:
3935 {
3936 no = op - ZCLOSE;
3937 cleanup_zsubexpr();
3938 rp = regstack_push(RS_ZCLOSE, scan);
3939 if (rp == NULL)
3940 status = RA_FAIL;
3941 else
3942 {
3943 rp->rs_no = no;
3944 save_se(&rp->rs_un.sesave, &reg_endzpos[no],
3945 &reg_endzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003946 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003947 }
3948 }
3949 break;
3950#endif
3951
3952 case BACKREF + 1:
3953 case BACKREF + 2:
3954 case BACKREF + 3:
3955 case BACKREF + 4:
3956 case BACKREF + 5:
3957 case BACKREF + 6:
3958 case BACKREF + 7:
3959 case BACKREF + 8:
3960 case BACKREF + 9:
3961 {
3962 int len;
3963
3964 no = op - BACKREF;
3965 cleanup_subexpr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003966 if (!REG_MULTI) // Single-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003967 {
3968 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL)
3969 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003970 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003971 len = 0;
3972 }
3973 else
3974 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003975 // Compare current input with back-ref in the same
3976 // line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003977 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
3978 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0)
3979 status = RA_NOMATCH;
3980 }
3981 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003982 else // Multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003983 {
3984 if (rex.reg_startpos[no].lnum < 0
3985 || rex.reg_endpos[no].lnum < 0)
3986 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003987 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003988 len = 0;
3989 }
3990 else
3991 {
3992 if (rex.reg_startpos[no].lnum == rex.lnum
3993 && rex.reg_endpos[no].lnum == rex.lnum)
3994 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003995 // Compare back-ref within the current line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003996 len = rex.reg_endpos[no].col
3997 - rex.reg_startpos[no].col;
3998 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
3999 rex.input, &len) != 0)
4000 status = RA_NOMATCH;
4001 }
4002 else
4003 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004004 // Messy situation: Need to compare between two
4005 // lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004006 int r = match_with_backref(
4007 rex.reg_startpos[no].lnum,
4008 rex.reg_startpos[no].col,
4009 rex.reg_endpos[no].lnum,
4010 rex.reg_endpos[no].col,
4011 &len);
4012
4013 if (r != RA_MATCH)
4014 status = r;
4015 }
4016 }
4017 }
4018
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004019 // Matched the backref, skip over it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004020 rex.input += len;
4021 }
4022 break;
4023
4024#ifdef FEAT_SYN_HL
4025 case ZREF + 1:
4026 case ZREF + 2:
4027 case ZREF + 3:
4028 case ZREF + 4:
4029 case ZREF + 5:
4030 case ZREF + 6:
4031 case ZREF + 7:
4032 case ZREF + 8:
4033 case ZREF + 9:
4034 {
4035 int len;
4036
4037 cleanup_zsubexpr();
4038 no = op - ZREF;
4039 if (re_extmatch_in != NULL
4040 && re_extmatch_in->matches[no] != NULL)
4041 {
4042 len = (int)STRLEN(re_extmatch_in->matches[no]);
4043 if (cstrncmp(re_extmatch_in->matches[no],
4044 rex.input, &len) != 0)
4045 status = RA_NOMATCH;
4046 else
4047 rex.input += len;
4048 }
4049 else
4050 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004051 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004052 }
4053 }
4054 break;
4055#endif
4056
4057 case BRANCH:
4058 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004059 if (OP(next) != BRANCH) // No choice.
4060 next = OPERAND(scan); // Avoid recursion.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004061 else
4062 {
4063 rp = regstack_push(RS_BRANCH, scan);
4064 if (rp == NULL)
4065 status = RA_FAIL;
4066 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004067 status = RA_BREAK; // rest is below
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004068 }
4069 }
4070 break;
4071
4072 case BRACE_LIMITS:
4073 {
4074 if (OP(next) == BRACE_SIMPLE)
4075 {
4076 bl_minval = OPERAND_MIN(scan);
4077 bl_maxval = OPERAND_MAX(scan);
4078 }
4079 else if (OP(next) >= BRACE_COMPLEX
4080 && OP(next) < BRACE_COMPLEX + 10)
4081 {
4082 no = OP(next) - BRACE_COMPLEX;
4083 brace_min[no] = OPERAND_MIN(scan);
4084 brace_max[no] = OPERAND_MAX(scan);
4085 brace_count[no] = 0;
4086 }
4087 else
4088 {
4089 internal_error("BRACE_LIMITS");
4090 status = RA_FAIL;
4091 }
4092 }
4093 break;
4094
4095 case BRACE_COMPLEX + 0:
4096 case BRACE_COMPLEX + 1:
4097 case BRACE_COMPLEX + 2:
4098 case BRACE_COMPLEX + 3:
4099 case BRACE_COMPLEX + 4:
4100 case BRACE_COMPLEX + 5:
4101 case BRACE_COMPLEX + 6:
4102 case BRACE_COMPLEX + 7:
4103 case BRACE_COMPLEX + 8:
4104 case BRACE_COMPLEX + 9:
4105 {
4106 no = op - BRACE_COMPLEX;
4107 ++brace_count[no];
4108
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004109 // If not matched enough times yet, try one more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004110 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4111 ? brace_min[no] : brace_max[no]))
4112 {
4113 rp = regstack_push(RS_BRCPLX_MORE, scan);
4114 if (rp == NULL)
4115 status = RA_FAIL;
4116 else
4117 {
4118 rp->rs_no = no;
4119 reg_save(&rp->rs_un.regsave, &backpos);
4120 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004121 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004122 }
4123 break;
4124 }
4125
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004126 // If matched enough times, may try matching some more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004127 if (brace_min[no] <= brace_max[no])
4128 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004129 // Range is the normal way around, use longest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004130 if (brace_count[no] <= brace_max[no])
4131 {
4132 rp = regstack_push(RS_BRCPLX_LONG, scan);
4133 if (rp == NULL)
4134 status = RA_FAIL;
4135 else
4136 {
4137 rp->rs_no = no;
4138 reg_save(&rp->rs_un.regsave, &backpos);
4139 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004140 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004141 }
4142 }
4143 }
4144 else
4145 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004146 // Range is backwards, use shortest match first
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004147 if (brace_count[no] <= brace_min[no])
4148 {
4149 rp = regstack_push(RS_BRCPLX_SHORT, scan);
4150 if (rp == NULL)
4151 status = RA_FAIL;
4152 else
4153 {
4154 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004155 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004156 }
4157 }
4158 }
4159 }
4160 break;
4161
4162 case BRACE_SIMPLE:
4163 case STAR:
4164 case PLUS:
4165 {
4166 regstar_T rst;
4167
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004168 // Lookahead to avoid useless match attempts when we know
4169 // what character comes next.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004170 if (OP(next) == EXACTLY)
4171 {
4172 rst.nextb = *OPERAND(next);
4173 if (rex.reg_ic)
4174 {
4175 if (MB_ISUPPER(rst.nextb))
4176 rst.nextb_ic = MB_TOLOWER(rst.nextb);
4177 else
4178 rst.nextb_ic = MB_TOUPPER(rst.nextb);
4179 }
4180 else
4181 rst.nextb_ic = rst.nextb;
4182 }
4183 else
4184 {
4185 rst.nextb = NUL;
4186 rst.nextb_ic = NUL;
4187 }
4188 if (op != BRACE_SIMPLE)
4189 {
4190 rst.minval = (op == STAR) ? 0 : 1;
4191 rst.maxval = MAX_LIMIT;
4192 }
4193 else
4194 {
4195 rst.minval = bl_minval;
4196 rst.maxval = bl_maxval;
4197 }
4198
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004199 // When maxval > minval, try matching as much as possible, up
4200 // to maxval. When maxval < minval, try matching at least the
4201 // minimal number (since the range is backwards, that's also
4202 // maxval!).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004203 rst.count = regrepeat(OPERAND(scan), rst.maxval);
4204 if (got_int)
4205 {
4206 status = RA_FAIL;
4207 break;
4208 }
4209 if (rst.minval <= rst.maxval
4210 ? rst.count >= rst.minval : rst.count >= rst.maxval)
4211 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004212 // It could match. Prepare for trying to match what
4213 // follows. The code is below. Parameters are stored in
4214 // a regstar_T on the regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004215 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4216 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004217 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004218 status = RA_FAIL;
4219 }
4220 else if (ga_grow(&regstack, sizeof(regstar_T)) == FAIL)
4221 status = RA_FAIL;
4222 else
4223 {
4224 regstack.ga_len += sizeof(regstar_T);
4225 rp = regstack_push(rst.minval <= rst.maxval
4226 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4227 if (rp == NULL)
4228 status = RA_FAIL;
4229 else
4230 {
4231 *(((regstar_T *)rp) - 1) = rst;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004232 status = RA_BREAK; // skip the restore bits
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004233 }
4234 }
4235 }
4236 else
4237 status = RA_NOMATCH;
4238
4239 }
4240 break;
4241
4242 case NOMATCH:
4243 case MATCH:
4244 case SUBPAT:
4245 rp = regstack_push(RS_NOMATCH, scan);
4246 if (rp == NULL)
4247 status = RA_FAIL;
4248 else
4249 {
4250 rp->rs_no = op;
4251 reg_save(&rp->rs_un.regsave, &backpos);
4252 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004253 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004254 }
4255 break;
4256
4257 case BEHIND:
4258 case NOBEHIND:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004259 // Need a bit of room to store extra positions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004260 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4261 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004262 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004263 status = RA_FAIL;
4264 }
4265 else if (ga_grow(&regstack, sizeof(regbehind_T)) == FAIL)
4266 status = RA_FAIL;
4267 else
4268 {
4269 regstack.ga_len += sizeof(regbehind_T);
4270 rp = regstack_push(RS_BEHIND1, scan);
4271 if (rp == NULL)
4272 status = RA_FAIL;
4273 else
4274 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004275 // Need to save the subexpr to be able to restore them
4276 // when there is a match but we don't use it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004277 save_subexpr(((regbehind_T *)rp) - 1);
4278
4279 rp->rs_no = op;
4280 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004281 // First try if what follows matches. If it does then we
4282 // check the behind match by looping.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004283 }
4284 }
4285 break;
4286
4287 case BHPOS:
4288 if (REG_MULTI)
4289 {
4290 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4291 || behind_pos.rs_u.pos.lnum != rex.lnum)
4292 status = RA_NOMATCH;
4293 }
4294 else if (behind_pos.rs_u.ptr != rex.input)
4295 status = RA_NOMATCH;
4296 break;
4297
4298 case NEWL:
4299 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4300 || rex.reg_line_lbr)
4301 && (c != '\n' || !rex.reg_line_lbr))
4302 status = RA_NOMATCH;
4303 else if (rex.reg_line_lbr)
4304 ADVANCE_REGINPUT();
4305 else
4306 reg_nextline();
4307 break;
4308
4309 case END:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004310 status = RA_MATCH; // Success!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004311 break;
4312
4313 default:
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004314 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004315#ifdef DEBUG
4316 printf("Illegal op code %d\n", op);
4317#endif
4318 status = RA_FAIL;
4319 break;
4320 }
4321 }
4322
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004323 // If we can't continue sequentially, break the inner loop.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004324 if (status != RA_CONT)
4325 break;
4326
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004327 // Continue in inner loop, advance to next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004328 scan = next;
4329
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004330 } // end of inner loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004331
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004332 // If there is something on the regstack execute the code for the state.
4333 // If the state is popped then loop and use the older state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004334 while (regstack.ga_len > 0 && status != RA_FAIL)
4335 {
4336 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4337 switch (rp->rs_state)
4338 {
4339 case RS_NOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004340 // Result is passed on as-is, simply pop the state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004341 regstack_pop(&scan);
4342 break;
4343
4344 case RS_MOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004345 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004346 if (status == RA_NOMATCH)
4347 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4348 &rex.reg_startp[rp->rs_no]);
4349 regstack_pop(&scan);
4350 break;
4351
4352#ifdef FEAT_SYN_HL
4353 case RS_ZOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004354 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004355 if (status == RA_NOMATCH)
4356 restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4357 &reg_startzp[rp->rs_no]);
4358 regstack_pop(&scan);
4359 break;
4360#endif
4361
4362 case RS_MCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004363 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004364 if (status == RA_NOMATCH)
4365 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4366 &rex.reg_endp[rp->rs_no]);
4367 regstack_pop(&scan);
4368 break;
4369
4370#ifdef FEAT_SYN_HL
4371 case RS_ZCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004372 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004373 if (status == RA_NOMATCH)
4374 restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4375 &reg_endzp[rp->rs_no]);
4376 regstack_pop(&scan);
4377 break;
4378#endif
4379
4380 case RS_BRANCH:
4381 if (status == RA_MATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004382 // this branch matched, use it
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004383 regstack_pop(&scan);
4384 else
4385 {
4386 if (status != RA_BREAK)
4387 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004388 // After a non-matching branch: try next one.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004389 reg_restore(&rp->rs_un.regsave, &backpos);
4390 scan = rp->rs_scan;
4391 }
4392 if (scan == NULL || OP(scan) != BRANCH)
4393 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004394 // no more branches, didn't find a match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004395 status = RA_NOMATCH;
4396 regstack_pop(&scan);
4397 }
4398 else
4399 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004400 // Prepare to try a branch.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004401 rp->rs_scan = regnext(scan);
4402 reg_save(&rp->rs_un.regsave, &backpos);
4403 scan = OPERAND(scan);
4404 }
4405 }
4406 break;
4407
4408 case RS_BRCPLX_MORE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004409 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004410 if (status == RA_NOMATCH)
4411 {
4412 reg_restore(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004413 --brace_count[rp->rs_no]; // decrement match count
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004414 }
4415 regstack_pop(&scan);
4416 break;
4417
4418 case RS_BRCPLX_LONG:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004419 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004420 if (status == RA_NOMATCH)
4421 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004422 // There was no match, but we did find enough matches.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004423 reg_restore(&rp->rs_un.regsave, &backpos);
4424 --brace_count[rp->rs_no];
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004425 // continue with the items after "\{}"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004426 status = RA_CONT;
4427 }
4428 regstack_pop(&scan);
4429 if (status == RA_CONT)
4430 scan = regnext(scan);
4431 break;
4432
4433 case RS_BRCPLX_SHORT:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004434 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004435 if (status == RA_NOMATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004436 // There was no match, try to match one more item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004437 reg_restore(&rp->rs_un.regsave, &backpos);
4438 regstack_pop(&scan);
4439 if (status == RA_NOMATCH)
4440 {
4441 scan = OPERAND(scan);
4442 status = RA_CONT;
4443 }
4444 break;
4445
4446 case RS_NOMATCH:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004447 // Pop the state. If the operand matches for NOMATCH or
4448 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4449 // except for SUBPAT, and continue with the next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004450 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4451 status = RA_NOMATCH;
4452 else
4453 {
4454 status = RA_CONT;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004455 if (rp->rs_no != SUBPAT) // zero-width
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004456 reg_restore(&rp->rs_un.regsave, &backpos);
4457 }
4458 regstack_pop(&scan);
4459 if (status == RA_CONT)
4460 scan = regnext(scan);
4461 break;
4462
4463 case RS_BEHIND1:
4464 if (status == RA_NOMATCH)
4465 {
4466 regstack_pop(&scan);
4467 regstack.ga_len -= sizeof(regbehind_T);
4468 }
4469 else
4470 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004471 // The stuff after BEHIND/NOBEHIND matches. Now try if
4472 // the behind part does (not) match before the current
4473 // position in the input. This must be done at every
4474 // position in the input and checking if the match ends at
4475 // the current position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004476
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004477 // save the position after the found match for next
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004478 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4479
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004480 // Start looking for a match with operand at the current
4481 // position. Go back one character until we find the
4482 // result, hitting the start of the line or the previous
4483 // line (for multi-line matching).
4484 // Set behind_pos to where the match should end, BHPOS
4485 // will match it. Save the current value.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004486 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4487 behind_pos = rp->rs_un.regsave;
4488
4489 rp->rs_state = RS_BEHIND2;
4490
4491 reg_restore(&rp->rs_un.regsave, &backpos);
4492 scan = OPERAND(rp->rs_scan) + 4;
4493 }
4494 break;
4495
4496 case RS_BEHIND2:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004497 // Looping for BEHIND / NOBEHIND match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004498 if (status == RA_MATCH && reg_save_equal(&behind_pos))
4499 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004500 // found a match that ends where "next" started
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004501 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4502 if (rp->rs_no == BEHIND)
4503 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4504 &backpos);
4505 else
4506 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004507 // But we didn't want a match. Need to restore the
4508 // subexpr, because what follows matched, so they have
4509 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004510 status = RA_NOMATCH;
4511 restore_subexpr(((regbehind_T *)rp) - 1);
4512 }
4513 regstack_pop(&scan);
4514 regstack.ga_len -= sizeof(regbehind_T);
4515 }
4516 else
4517 {
4518 long limit;
4519
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004520 // No match or a match that doesn't end where we want it: Go
4521 // back one character. May go to previous line once.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004522 no = OK;
4523 limit = OPERAND_MIN(rp->rs_scan);
4524 if (REG_MULTI)
4525 {
4526 if (limit > 0
4527 && ((rp->rs_un.regsave.rs_u.pos.lnum
4528 < behind_pos.rs_u.pos.lnum
4529 ? (colnr_T)STRLEN(rex.line)
4530 : behind_pos.rs_u.pos.col)
4531 - rp->rs_un.regsave.rs_u.pos.col >= limit))
4532 no = FAIL;
4533 else if (rp->rs_un.regsave.rs_u.pos.col == 0)
4534 {
4535 if (rp->rs_un.regsave.rs_u.pos.lnum
4536 < behind_pos.rs_u.pos.lnum
4537 || reg_getline(
4538 --rp->rs_un.regsave.rs_u.pos.lnum)
4539 == NULL)
4540 no = FAIL;
4541 else
4542 {
4543 reg_restore(&rp->rs_un.regsave, &backpos);
4544 rp->rs_un.regsave.rs_u.pos.col =
4545 (colnr_T)STRLEN(rex.line);
4546 }
4547 }
4548 else
4549 {
4550 if (has_mbyte)
4551 {
4552 char_u *line =
4553 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
4554
4555 rp->rs_un.regsave.rs_u.pos.col -=
4556 (*mb_head_off)(line, line
4557 + rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
4558 }
4559 else
4560 --rp->rs_un.regsave.rs_u.pos.col;
4561 }
4562 }
4563 else
4564 {
4565 if (rp->rs_un.regsave.rs_u.ptr == rex.line)
4566 no = FAIL;
4567 else
4568 {
4569 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
4570 if (limit > 0 && (long)(behind_pos.rs_u.ptr
4571 - rp->rs_un.regsave.rs_u.ptr) > limit)
4572 no = FAIL;
4573 }
4574 }
4575 if (no == OK)
4576 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004577 // Advanced, prepare for finding match again.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004578 reg_restore(&rp->rs_un.regsave, &backpos);
4579 scan = OPERAND(rp->rs_scan) + 4;
4580 if (status == RA_MATCH)
4581 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004582 // We did match, so subexpr may have been changed,
4583 // need to restore them for the next try.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004584 status = RA_NOMATCH;
4585 restore_subexpr(((regbehind_T *)rp) - 1);
4586 }
4587 }
4588 else
4589 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004590 // Can't advance. For NOBEHIND that's a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004591 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4592 if (rp->rs_no == NOBEHIND)
4593 {
4594 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4595 &backpos);
4596 status = RA_MATCH;
4597 }
4598 else
4599 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004600 // We do want a proper match. Need to restore the
4601 // subexpr if we had a match, because they may have
4602 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004603 if (status == RA_MATCH)
4604 {
4605 status = RA_NOMATCH;
4606 restore_subexpr(((regbehind_T *)rp) - 1);
4607 }
4608 }
4609 regstack_pop(&scan);
4610 regstack.ga_len -= sizeof(regbehind_T);
4611 }
4612 }
4613 break;
4614
4615 case RS_STAR_LONG:
4616 case RS_STAR_SHORT:
4617 {
4618 regstar_T *rst = ((regstar_T *)rp) - 1;
4619
4620 if (status == RA_MATCH)
4621 {
4622 regstack_pop(&scan);
4623 regstack.ga_len -= sizeof(regstar_T);
4624 break;
4625 }
4626
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004627 // Tried once already, restore input pointers.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004628 if (status != RA_BREAK)
4629 reg_restore(&rp->rs_un.regsave, &backpos);
4630
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004631 // Repeat until we found a position where it could match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004632 for (;;)
4633 {
4634 if (status != RA_BREAK)
4635 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004636 // Tried first position already, advance.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004637 if (rp->rs_state == RS_STAR_LONG)
4638 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004639 // Trying for longest match, but couldn't or
4640 // didn't match -- back up one char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004641 if (--rst->count < rst->minval)
4642 break;
4643 if (rex.input == rex.line)
4644 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004645 // backup to last char of previous line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004646 --rex.lnum;
4647 rex.line = reg_getline(rex.lnum);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004648 // Just in case regrepeat() didn't count
4649 // right.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004650 if (rex.line == NULL)
4651 break;
4652 rex.input = rex.line + STRLEN(rex.line);
4653 fast_breakcheck();
4654 }
4655 else
4656 MB_PTR_BACK(rex.line, rex.input);
4657 }
4658 else
4659 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004660 // Range is backwards, use shortest match first.
4661 // Careful: maxval and minval are exchanged!
4662 // Couldn't or didn't match: try advancing one
4663 // char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004664 if (rst->count == rst->minval
4665 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
4666 break;
4667 ++rst->count;
4668 }
4669 if (got_int)
4670 break;
4671 }
4672 else
4673 status = RA_NOMATCH;
4674
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004675 // If it could match, try it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004676 if (rst->nextb == NUL || *rex.input == rst->nextb
4677 || *rex.input == rst->nextb_ic)
4678 {
4679 reg_save(&rp->rs_un.regsave, &backpos);
4680 scan = regnext(rp->rs_scan);
4681 status = RA_CONT;
4682 break;
4683 }
4684 }
4685 if (status != RA_CONT)
4686 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004687 // Failed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004688 regstack_pop(&scan);
4689 regstack.ga_len -= sizeof(regstar_T);
4690 status = RA_NOMATCH;
4691 }
4692 }
4693 break;
4694 }
4695
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004696 // If we want to continue the inner loop or didn't pop a state
4697 // continue matching loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004698 if (status == RA_CONT || rp == (regitem_T *)
4699 ((char *)regstack.ga_data + regstack.ga_len) - 1)
4700 break;
4701 }
4702
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004703 // May need to continue with the inner loop, starting at "scan".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004704 if (status == RA_CONT)
4705 continue;
4706
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004707 // If the regstack is empty or something failed we are done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004708 if (regstack.ga_len == 0 || status == RA_FAIL)
4709 {
4710 if (scan == NULL)
4711 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004712 // We get here only if there's trouble -- normally "case END" is
4713 // the terminating point.
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004714 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004715#ifdef DEBUG
4716 printf("Premature EOL\n");
4717#endif
4718 }
4719 return (status == RA_MATCH);
4720 }
4721
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004722 } // End of loop until the regstack is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004723
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004724 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004725}
4726
4727/*
4728 * regtry - try match of "prog" with at rex.line["col"].
4729 * Returns 0 for failure, number of lines contained in the match otherwise.
4730 */
4731 static long
4732regtry(
4733 bt_regprog_T *prog,
4734 colnr_T col,
4735 proftime_T *tm, // timeout limit or NULL
4736 int *timed_out) // flag set on timeout or NULL
4737{
4738 rex.input = rex.line + col;
4739 rex.need_clear_subexpr = TRUE;
4740#ifdef FEAT_SYN_HL
4741 // Clear the external match subpointers if necessary.
4742 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
4743#endif
4744
4745 if (regmatch(prog->program + 1, tm, timed_out) == 0)
4746 return 0;
4747
4748 cleanup_subexpr();
4749 if (REG_MULTI)
4750 {
4751 if (rex.reg_startpos[0].lnum < 0)
4752 {
4753 rex.reg_startpos[0].lnum = 0;
4754 rex.reg_startpos[0].col = col;
4755 }
4756 if (rex.reg_endpos[0].lnum < 0)
4757 {
4758 rex.reg_endpos[0].lnum = rex.lnum;
4759 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
4760 }
4761 else
4762 // Use line number of "\ze".
4763 rex.lnum = rex.reg_endpos[0].lnum;
4764 }
4765 else
4766 {
4767 if (rex.reg_startp[0] == NULL)
4768 rex.reg_startp[0] = rex.line + col;
4769 if (rex.reg_endp[0] == NULL)
4770 rex.reg_endp[0] = rex.input;
4771 }
4772#ifdef FEAT_SYN_HL
4773 // Package any found \z(...\) matches for export. Default is none.
4774 unref_extmatch(re_extmatch_out);
4775 re_extmatch_out = NULL;
4776
4777 if (prog->reghasz == REX_SET)
4778 {
4779 int i;
4780
4781 cleanup_zsubexpr();
4782 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01004783 if (re_extmatch_out == NULL)
4784 return 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004785 for (i = 0; i < NSUBEXP; i++)
4786 {
4787 if (REG_MULTI)
4788 {
4789 // Only accept single line matches.
4790 if (reg_startzpos[i].lnum >= 0
4791 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
4792 && reg_endzpos[i].col >= reg_startzpos[i].col)
4793 re_extmatch_out->matches[i] =
4794 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
4795 + reg_startzpos[i].col,
4796 reg_endzpos[i].col - reg_startzpos[i].col);
4797 }
4798 else
4799 {
4800 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
4801 re_extmatch_out->matches[i] =
4802 vim_strnsave(reg_startzp[i],
Bram Moolenaar71ccd032020-06-12 22:59:11 +02004803 reg_endzp[i] - reg_startzp[i]);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004804 }
4805 }
4806 }
4807#endif
4808 return 1 + rex.lnum;
4809}
4810
/*
 * Match a regexp against a string ("line" points to the string) or multiple
 * lines (if "line" is NULL, use reg_getline()).
 *
 * "col" is the column in the first line where matching starts.
 * "tm" and "timed_out" are passed on to regtry(); with FEAT_RELTIME "tm" is
 * also checked in the unanchored loop below.
 *
 * The program is taken from rex.reg_mmatch (multi-line) or rex.reg_match
 * (single string), so one of these must have been set up by the caller.
 *
 * Returns 0 for failure, number of lines contained in the match otherwise.
 */
    static long
bt_regexec_both(
    char_u      *line,
    colnr_T     col,            // column to start looking for match
    proftime_T  *tm,            // timeout limit or NULL
    int         *timed_out)     // flag set on timeout or NULL
{
    bt_regprog_T    *prog;
    char_u          *s;
    long            retval = 0L;    // stays 0 on every early "goto theend"

    // Create "regstack" and "backpos" if they are not allocated yet.
    // We allocate *_INITIAL amount of bytes first and then set the grow size
    // to much bigger value to avoid many malloc calls in case of deep regular
    // expressions.
    if (regstack.ga_data == NULL)
    {
        // Use an item size of 1 byte, since we push different things
        // onto the regstack.
        ga_init2(&regstack, 1, REGSTACK_INITIAL);
        (void)ga_grow(&regstack, REGSTACK_INITIAL);
        regstack.ga_growsize = REGSTACK_INITIAL * 8;
    }

    if (backpos.ga_data == NULL)
    {
        ga_init2(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
        (void)ga_grow(&backpos, BACKPOS_INITIAL);
        backpos.ga_growsize = BACKPOS_INITIAL * 8;
    }

    // Pick up the program and the arrays that receive the sub-match
    // positions.  In multi-line mode the "line" argument is replaced by the
    // first line obtained from reg_getline().
    if (REG_MULTI)
    {
        prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
        line = reg_getline((linenr_T)0);
        rex.reg_startpos = rex.reg_mmatch->startpos;
        rex.reg_endpos = rex.reg_mmatch->endpos;
    }
    else
    {
        prog = (bt_regprog_T *)rex.reg_match->regprog;
        rex.reg_startp = rex.reg_match->startp;
        rex.reg_endp = rex.reg_match->endp;
    }

    // Be paranoid...
    if (prog == NULL || line == NULL)
    {
        iemsg(_(e_null_argument));
        goto theend;
    }

    // Check validity of program.
    if (prog_magic_wrong())
        goto theend;

    // If the start column is past the maximum column: no need to try.
    if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
        goto theend;

    // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
    if (prog->regflags & RF_ICASE)
        rex.reg_ic = TRUE;
    else if (prog->regflags & RF_NOICASE)
        rex.reg_ic = FALSE;

    // If pattern contains "\Z" overrule value of rex.reg_icombine
    if (prog->regflags & RF_ICOMBINE)
        rex.reg_icombine = TRUE;

    // If there is a "must appear" string, look for it.
    // The search runs over "line" from "col" on; when the string is not
    // found there the match is given up without running the program.
    if (prog->regmust != NULL)
    {
        int c;      // first character of the "must appear" string

        if (has_mbyte)
            c = (*mb_ptr2char)(prog->regmust);
        else
            c = *prog->regmust;
        s = line + col;

        // This is used very often, esp. for ":global".  Use three versions of
        // the loop to avoid overhead of conditions.
        // Each loop finds the next occurrence of "c" and then compares the
        // full string there; "s" is NULL afterwards when nothing matched.
        if (!rex.reg_ic && !has_mbyte)
            while ((s = vim_strbyte(s, c)) != NULL)
            {
                if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
                    break;              // Found it.
                ++s;
            }
        else if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
            while ((s = vim_strchr(s, c)) != NULL)
            {
                if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
                    break;              // Found it.
                MB_PTR_ADV(s);
            }
        else
            while ((s = cstrchr(s, c)) != NULL)
            {
                if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
                    break;              // Found it.
                MB_PTR_ADV(s);
            }
        if (s == NULL)          // Not present.
            goto theend;
    }

    // Set up the matching state used by regtry(): start on the first line.
    rex.line = line;
    rex.lnum = 0;
    reg_toolong = FALSE;

    // Simplest case:  Anchored match need be tried only once.
    if (prog->reganch)
    {
        int     c;      // character at the start column

        if (has_mbyte)
            c = (*mb_ptr2char)(rex.line + col);
        else
            c = rex.line[col];
        // Only run the program when there is no known start character, or
        // the character at "col" equals it (also comparing the case-folded
        // or lower-cased values when rex.reg_ic is set).
        if (prog->regstart == NUL
                || prog->regstart == c
                || (rex.reg_ic
                    && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
                        || (c < 255 && prog->regstart < 255 &&
                            MB_TOLOWER(prog->regstart) == MB_TOLOWER(c)))))
            retval = regtry(prog, col, tm, timed_out);
        else
            retval = 0;
    }
    else
    {
#ifdef FEAT_RELTIME
        int tm_count = 0;       // iterations since the last timeout check
#endif
        // Messy cases:  unanchored match.
        // Try each start column in turn until a match is found, the end of
        // the line or the maximum column is reached, or got_int is set.
        while (!got_int)
        {
            if (prog->regstart != NUL)
            {
                // Skip until the char we know it must start with.
                // Used often, do some work to avoid call overhead.
                if (!rex.reg_ic && !has_mbyte)
                    s = vim_strbyte(rex.line + col, prog->regstart);
                else
                    s = cstrchr(rex.line + col, prog->regstart);
                if (s == NULL)
                {
                    retval = 0;
                    break;
                }
                col = (int)(s - rex.line);
            }

            // Check for maximum column to try.
            if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
            {
                retval = 0;
                break;
            }

            retval = regtry(prog, col, tm, timed_out);
            if (retval > 0)
                break;

            // if not currently on the first line, get it again
            // (must happen before rex.line[col] is inspected below)
            if (rex.lnum != 0)
            {
                rex.lnum = 0;
                rex.line = reg_getline((linenr_T)0);
            }
            if (rex.line[col] == NUL)
                break;
            // Advance to the next character, not just the next byte.
            if (has_mbyte)
                col += (*mb_ptr2len)(rex.line + col);
            else
                ++col;
#ifdef FEAT_RELTIME
            // Check for timeout only once every twenty iterations to avoid
            // overhead.
            if (tm != NULL && ++tm_count == 20)
            {
                tm_count = 0;
                if (profile_passed_limit(tm))
                {
                    if (timed_out != NULL)
                        *timed_out = TRUE;
                    break;
                }
            }
#endif
        }
    }

theend:
    // Free "reg_tofree" when it's a bit big.
    // Free regstack and backpos if they are bigger than their initial size.
    if (reg_tofreelen > 400)
        VIM_CLEAR(reg_tofree);
    if (regstack.ga_maxlen > REGSTACK_INITIAL)
        ga_clear(&regstack);
    if (backpos.ga_maxlen > BACKPOS_INITIAL)
        ga_clear(&backpos);

    if (retval > 0)
    {
        // Make sure the end is never before the start.  Can happen when \zs
        // and \ze are used.
        if (REG_MULTI)
        {
            lpos_T *start = &rex.reg_mmatch->startpos[0];
            lpos_T *end = &rex.reg_mmatch->endpos[0];

            if (end->lnum < start->lnum
                        || (end->lnum == start->lnum && end->col < start->col))
                rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
        }
        else
        {
            if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
                rex.reg_match->endp[0] = rex.reg_match->startp[0];
        }
    }

    return retval;
}
5042
5043/*
5044 * Match a regexp against a string.
5045 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5046 * Uses curbuf for line count and 'iskeyword'.
5047 * if "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
5048 *
5049 * Returns 0 for failure, number of lines contained in the match otherwise.
5050 */
5051 static int
5052bt_regexec_nl(
5053 regmatch_T *rmp,
5054 char_u *line, // string to match against
5055 colnr_T col, // column to start looking for match
5056 int line_lbr)
5057{
5058 rex.reg_match = rmp;
5059 rex.reg_mmatch = NULL;
5060 rex.reg_maxline = 0;
5061 rex.reg_line_lbr = line_lbr;
5062 rex.reg_buf = curbuf;
5063 rex.reg_win = NULL;
5064 rex.reg_ic = rmp->rm_ic;
5065 rex.reg_icombine = FALSE;
5066 rex.reg_maxcol = 0;
5067
5068 return bt_regexec_both(line, col, NULL, NULL);
5069}
5070
5071/*
5072 * Match a regexp against multiple lines.
5073 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5074 * Uses curbuf for line count and 'iskeyword'.
5075 *
5076 * Return zero if there is no match. Return number of lines contained in the
5077 * match otherwise.
5078 */
5079 static long
5080bt_regexec_multi(
5081 regmmatch_T *rmp,
5082 win_T *win, // window in which to search or NULL
5083 buf_T *buf, // buffer in which to search
5084 linenr_T lnum, // nr of line to start looking for match
5085 colnr_T col, // column to start looking for match
5086 proftime_T *tm, // timeout limit or NULL
5087 int *timed_out) // flag set on timeout or NULL
5088{
Bram Moolenaarf4140482020-02-15 23:06:45 +01005089 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005090 return bt_regexec_both(NULL, col, tm, timed_out);
5091}
5092
5093/*
5094 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5095 */
5096 static int
5097re_num_cmp(long_u val, char_u *scan)
5098{
5099 long_u n = OPERAND_MIN(scan);
5100
5101 if (OPERAND_CMP(scan) == '>')
5102 return val > n;
5103 if (OPERAND_CMP(scan) == '<')
5104 return val < n;
5105 return val == n;
5106}
5107
5108#ifdef BT_REGEXP_DUMP
5109
5110/*
5111 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5112 */
5113 static void
5114regdump(char_u *pattern, bt_regprog_T *r)
5115{
5116 char_u *s;
5117 int op = EXACTLY; // Arbitrary non-END op.
5118 char_u *next;
5119 char_u *end = NULL;
5120 FILE *f;
5121
5122#ifdef BT_REGEXP_LOG
5123 f = fopen("bt_regexp_log.log", "a");
5124#else
5125 f = stdout;
5126#endif
5127 if (f == NULL)
5128 return;
5129 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);
5130
5131 s = r->program + 1;
5132 // Loop until we find the END that isn't before a referred next (an END
5133 // can also appear in a NOMATCH operand).
5134 while (op != END || s <= end)
5135 {
5136 op = OP(s);
5137 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); // Where, what.
5138 next = regnext(s);
5139 if (next == NULL) // Next ptr.
5140 fprintf(f, "(0)");
5141 else
5142 fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
5143 if (end < next)
5144 end = next;
5145 if (op == BRACE_LIMITS)
5146 {
5147 // Two ints
5148 fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5149 s += 8;
5150 }
5151 else if (op == BEHIND || op == NOBEHIND)
5152 {
5153 // one int
5154 fprintf(f, " count %ld", OPERAND_MIN(s));
5155 s += 4;
5156 }
5157 else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL)
5158 {
5159 // one int plus comparator
5160 fprintf(f, " count %ld", OPERAND_MIN(s));
5161 s += 5;
5162 }
5163 s += 3;
5164 if (op == ANYOF || op == ANYOF + ADD_NL
5165 || op == ANYBUT || op == ANYBUT + ADD_NL
5166 || op == EXACTLY)
5167 {
5168 // Literal string, where present.
5169 fprintf(f, "\nxxxxxxxxx\n");
5170 while (*s != NUL)
5171 fprintf(f, "%c", *s++);
5172 fprintf(f, "\nxxxxxxxxx\n");
5173 s++;
5174 }
5175 fprintf(f, "\r\n");
5176 }
5177
5178 // Header fields of interest.
5179 if (r->regstart != NUL)
5180 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
5181 ? (char *)transchar(r->regstart)
5182 : "multibyte", r->regstart);
5183 if (r->reganch)
5184 fprintf(f, "anchored; ");
5185 if (r->regmust != NULL)
5186 fprintf(f, "must have \"%s\"", r->regmust);
5187 fprintf(f, "\r\n");
5188
5189#ifdef BT_REGEXP_LOG
5190 fclose(f);
5191#endif
5192}
5193#endif // BT_REGEXP_DUMP
5194
5195#ifdef DEBUG
5196/*
5197 * regprop - printable representation of opcode
5198 */
5199 static char_u *
5200regprop(char_u *op)
5201{
5202 char *p;
5203 static char buf[50];
5204
5205 STRCPY(buf, ":");
5206
5207 switch ((int) OP(op))
5208 {
5209 case BOL:
5210 p = "BOL";
5211 break;
5212 case EOL:
5213 p = "EOL";
5214 break;
5215 case RE_BOF:
5216 p = "BOF";
5217 break;
5218 case RE_EOF:
5219 p = "EOF";
5220 break;
5221 case CURSOR:
5222 p = "CURSOR";
5223 break;
5224 case RE_VISUAL:
5225 p = "RE_VISUAL";
5226 break;
5227 case RE_LNUM:
5228 p = "RE_LNUM";
5229 break;
5230 case RE_MARK:
5231 p = "RE_MARK";
5232 break;
5233 case RE_COL:
5234 p = "RE_COL";
5235 break;
5236 case RE_VCOL:
5237 p = "RE_VCOL";
5238 break;
5239 case BOW:
5240 p = "BOW";
5241 break;
5242 case EOW:
5243 p = "EOW";
5244 break;
5245 case ANY:
5246 p = "ANY";
5247 break;
5248 case ANY + ADD_NL:
5249 p = "ANY+NL";
5250 break;
5251 case ANYOF:
5252 p = "ANYOF";
5253 break;
5254 case ANYOF + ADD_NL:
5255 p = "ANYOF+NL";
5256 break;
5257 case ANYBUT:
5258 p = "ANYBUT";
5259 break;
5260 case ANYBUT + ADD_NL:
5261 p = "ANYBUT+NL";
5262 break;
5263 case IDENT:
5264 p = "IDENT";
5265 break;
5266 case IDENT + ADD_NL:
5267 p = "IDENT+NL";
5268 break;
5269 case SIDENT:
5270 p = "SIDENT";
5271 break;
5272 case SIDENT + ADD_NL:
5273 p = "SIDENT+NL";
5274 break;
5275 case KWORD:
5276 p = "KWORD";
5277 break;
5278 case KWORD + ADD_NL:
5279 p = "KWORD+NL";
5280 break;
5281 case SKWORD:
5282 p = "SKWORD";
5283 break;
5284 case SKWORD + ADD_NL:
5285 p = "SKWORD+NL";
5286 break;
5287 case FNAME:
5288 p = "FNAME";
5289 break;
5290 case FNAME + ADD_NL:
5291 p = "FNAME+NL";
5292 break;
5293 case SFNAME:
5294 p = "SFNAME";
5295 break;
5296 case SFNAME + ADD_NL:
5297 p = "SFNAME+NL";
5298 break;
5299 case PRINT:
5300 p = "PRINT";
5301 break;
5302 case PRINT + ADD_NL:
5303 p = "PRINT+NL";
5304 break;
5305 case SPRINT:
5306 p = "SPRINT";
5307 break;
5308 case SPRINT + ADD_NL:
5309 p = "SPRINT+NL";
5310 break;
5311 case WHITE:
5312 p = "WHITE";
5313 break;
5314 case WHITE + ADD_NL:
5315 p = "WHITE+NL";
5316 break;
5317 case NWHITE:
5318 p = "NWHITE";
5319 break;
5320 case NWHITE + ADD_NL:
5321 p = "NWHITE+NL";
5322 break;
5323 case DIGIT:
5324 p = "DIGIT";
5325 break;
5326 case DIGIT + ADD_NL:
5327 p = "DIGIT+NL";
5328 break;
5329 case NDIGIT:
5330 p = "NDIGIT";
5331 break;
5332 case NDIGIT + ADD_NL:
5333 p = "NDIGIT+NL";
5334 break;
5335 case HEX:
5336 p = "HEX";
5337 break;
5338 case HEX + ADD_NL:
5339 p = "HEX+NL";
5340 break;
5341 case NHEX:
5342 p = "NHEX";
5343 break;
5344 case NHEX + ADD_NL:
5345 p = "NHEX+NL";
5346 break;
5347 case OCTAL:
5348 p = "OCTAL";
5349 break;
5350 case OCTAL + ADD_NL:
5351 p = "OCTAL+NL";
5352 break;
5353 case NOCTAL:
5354 p = "NOCTAL";
5355 break;
5356 case NOCTAL + ADD_NL:
5357 p = "NOCTAL+NL";
5358 break;
5359 case WORD:
5360 p = "WORD";
5361 break;
5362 case WORD + ADD_NL:
5363 p = "WORD+NL";
5364 break;
5365 case NWORD:
5366 p = "NWORD";
5367 break;
5368 case NWORD + ADD_NL:
5369 p = "NWORD+NL";
5370 break;
5371 case HEAD:
5372 p = "HEAD";
5373 break;
5374 case HEAD + ADD_NL:
5375 p = "HEAD+NL";
5376 break;
5377 case NHEAD:
5378 p = "NHEAD";
5379 break;
5380 case NHEAD + ADD_NL:
5381 p = "NHEAD+NL";
5382 break;
5383 case ALPHA:
5384 p = "ALPHA";
5385 break;
5386 case ALPHA + ADD_NL:
5387 p = "ALPHA+NL";
5388 break;
5389 case NALPHA:
5390 p = "NALPHA";
5391 break;
5392 case NALPHA + ADD_NL:
5393 p = "NALPHA+NL";
5394 break;
5395 case LOWER:
5396 p = "LOWER";
5397 break;
5398 case LOWER + ADD_NL:
5399 p = "LOWER+NL";
5400 break;
5401 case NLOWER:
5402 p = "NLOWER";
5403 break;
5404 case NLOWER + ADD_NL:
5405 p = "NLOWER+NL";
5406 break;
5407 case UPPER:
5408 p = "UPPER";
5409 break;
5410 case UPPER + ADD_NL:
5411 p = "UPPER+NL";
5412 break;
5413 case NUPPER:
5414 p = "NUPPER";
5415 break;
5416 case NUPPER + ADD_NL:
5417 p = "NUPPER+NL";
5418 break;
5419 case BRANCH:
5420 p = "BRANCH";
5421 break;
5422 case EXACTLY:
5423 p = "EXACTLY";
5424 break;
5425 case NOTHING:
5426 p = "NOTHING";
5427 break;
5428 case BACK:
5429 p = "BACK";
5430 break;
5431 case END:
5432 p = "END";
5433 break;
5434 case MOPEN + 0:
5435 p = "MATCH START";
5436 break;
5437 case MOPEN + 1:
5438 case MOPEN + 2:
5439 case MOPEN + 3:
5440 case MOPEN + 4:
5441 case MOPEN + 5:
5442 case MOPEN + 6:
5443 case MOPEN + 7:
5444 case MOPEN + 8:
5445 case MOPEN + 9:
5446 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5447 p = NULL;
5448 break;
5449 case MCLOSE + 0:
5450 p = "MATCH END";
5451 break;
5452 case MCLOSE + 1:
5453 case MCLOSE + 2:
5454 case MCLOSE + 3:
5455 case MCLOSE + 4:
5456 case MCLOSE + 5:
5457 case MCLOSE + 6:
5458 case MCLOSE + 7:
5459 case MCLOSE + 8:
5460 case MCLOSE + 9:
5461 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5462 p = NULL;
5463 break;
5464 case BACKREF + 1:
5465 case BACKREF + 2:
5466 case BACKREF + 3:
5467 case BACKREF + 4:
5468 case BACKREF + 5:
5469 case BACKREF + 6:
5470 case BACKREF + 7:
5471 case BACKREF + 8:
5472 case BACKREF + 9:
5473 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5474 p = NULL;
5475 break;
5476 case NOPEN:
5477 p = "NOPEN";
5478 break;
5479 case NCLOSE:
5480 p = "NCLOSE";
5481 break;
5482#ifdef FEAT_SYN_HL
5483 case ZOPEN + 1:
5484 case ZOPEN + 2:
5485 case ZOPEN + 3:
5486 case ZOPEN + 4:
5487 case ZOPEN + 5:
5488 case ZOPEN + 6:
5489 case ZOPEN + 7:
5490 case ZOPEN + 8:
5491 case ZOPEN + 9:
5492 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5493 p = NULL;
5494 break;
5495 case ZCLOSE + 1:
5496 case ZCLOSE + 2:
5497 case ZCLOSE + 3:
5498 case ZCLOSE + 4:
5499 case ZCLOSE + 5:
5500 case ZCLOSE + 6:
5501 case ZCLOSE + 7:
5502 case ZCLOSE + 8:
5503 case ZCLOSE + 9:
5504 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5505 p = NULL;
5506 break;
5507 case ZREF + 1:
5508 case ZREF + 2:
5509 case ZREF + 3:
5510 case ZREF + 4:
5511 case ZREF + 5:
5512 case ZREF + 6:
5513 case ZREF + 7:
5514 case ZREF + 8:
5515 case ZREF + 9:
5516 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5517 p = NULL;
5518 break;
5519#endif
5520 case STAR:
5521 p = "STAR";
5522 break;
5523 case PLUS:
5524 p = "PLUS";
5525 break;
5526 case NOMATCH:
5527 p = "NOMATCH";
5528 break;
5529 case MATCH:
5530 p = "MATCH";
5531 break;
5532 case BEHIND:
5533 p = "BEHIND";
5534 break;
5535 case NOBEHIND:
5536 p = "NOBEHIND";
5537 break;
5538 case SUBPAT:
5539 p = "SUBPAT";
5540 break;
5541 case BRACE_LIMITS:
5542 p = "BRACE_LIMITS";
5543 break;
5544 case BRACE_SIMPLE:
5545 p = "BRACE_SIMPLE";
5546 break;
5547 case BRACE_COMPLEX + 0:
5548 case BRACE_COMPLEX + 1:
5549 case BRACE_COMPLEX + 2:
5550 case BRACE_COMPLEX + 3:
5551 case BRACE_COMPLEX + 4:
5552 case BRACE_COMPLEX + 5:
5553 case BRACE_COMPLEX + 6:
5554 case BRACE_COMPLEX + 7:
5555 case BRACE_COMPLEX + 8:
5556 case BRACE_COMPLEX + 9:
5557 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5558 p = NULL;
5559 break;
5560 case MULTIBYTECODE:
5561 p = "MULTIBYTECODE";
5562 break;
5563 case NEWL:
5564 p = "NEWL";
5565 break;
5566 default:
5567 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5568 p = NULL;
5569 break;
5570 }
5571 if (p != NULL)
5572 STRCAT(buf, p);
5573 return (char_u *)buf;
5574}
5575#endif // DEBUG