/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * Backtracking regular expression implementation.
 *
 * This file is included in "regexp.c".
 *
 * NOTICE:
 *
 * This is NOT the original regular expression code as written by Henry
 * Spencer. This code has been modified specifically for use with the VIM
 * editor, and should not be used separately from Vim. If you want a good
 * regular expression library, get the original code. The copyright notice
 * that follows is from the original.
 *
 * END NOTICE
 *
 *      Copyright (c) 1986 by University of Toronto.
 *      Written by Henry Spencer. Not derived from licensed software.
 *
 *      Permission is granted to anyone to use this software for any
 *      purpose on any computer system, and to redistribute it freely,
 *      subject to the following restrictions:
 *
 *      1. The author is not responsible for the consequences of use of
 *              this software, no matter how awful, even if they arise
 *              from defects in it.
 *
 *      2. The origin of this software must not be misrepresented, either
 *              by explicit claim or by omission.
 *
 *      3. Altered versions must be plainly marked as such, and must not
 *              be misrepresented as being the original software.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions. Serious changes in
 * regular-expression syntax might require a total rethink.
 *
 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
 * Webb, Ciaran McCreesh and Bram Moolenaar.
 * Named character class support added by Walter Briscoe (1998 Jul 01)
 */

/*
 * The "internal use only" fields in regexp.h are present to pass info from
 * compile to execute that permits the execute phase to run lots faster on
 * simple cases. They are:
 *
 * regstart     char that must begin a match; NUL if none obvious; Can be a
 *              multi-byte character.
 * reganch      is the match anchored (at beginning-of-line only)?
 * regmust      string (pointer into program) that match must include, or NULL
 * regmlen      length of regmust string
 * regflags     RF_ values or'ed together
 *
 * Regstart and reganch permit very fast decisions on suitable starting points
 * for a match, cutting down the work a lot. Regmust permits fast rejection
 * of lines that cannot possibly match. The regmust tests are costly enough
 * that vim_regcomp() supplies a regmust only if the r.e. contains something
 * potentially expensive (at present, the only such thing detected is * or +
 * at the start of the r.e., which can involve a lot of backup). Regmlen is
 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
 * computing it anyway.
 */

/*
 * Structure for regexp "program". This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology). Each node is an opcode
 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
 * pointer with a BRANCH on both ends of it is connecting two alternatives.
 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence). The "next" pointer of a BRACE_COMPLEX
 * node points to the node after the stuff to be repeated.
 * The operand of some types of node is a literal string; for others, it is a
 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
 * is the first node of the branch.
 * (NB this is *not* a tree structure: the tail of the branch connects to the
 * thing following the set of BRANCHes.)
 *
 * pattern       is coded like:
 *
 *                        +-------------------+
 *                        |                   V
 * <aa>\|<bb>    BRANCH <aa> BRANCH <bb> --> END
 *                   |        ^  |            ^
 *                   +--------+  +------------+
 *
 *
 *                        +---------------+
 *                        V               |
 * <aa>*         BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
 *                   |      |                  ^                     ^
 *                   |      +------------------+                     |
 *                   +-----------------------------------------------+
 *
 *
 *                       +-------------------+
 *                       V                   |
 * <aa>\+        BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
 *                   |              |              ^                     ^
 *                   |              +--------------+                     |
 *                   +---------------------------------------------------+
 *
 *
 *                                         +---------------------+
 *                                         V                     |
 * <aa>\{}       BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
 *                   |                            |                   ^
 *                   |                            +-------------------+
 *                   +------------------------------------------------+
 *
 *
 * <aa>\@!<bb>   BRANCH NOMATCH <aa> --> END <bb> --> END
 *                   |     |                  ^        ^
 *                   |     +------------------+        |
 *                   +---------------------------------+
 *
 *                                               +-----------+
 *                                               |           V
 * \z[abc]       BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
 *                   |     |        |        |       ^                 ^
 *                   |     |        |        +-------+                 |
 *                   |     |        +----------------+                 |
 *                   |     +-------------------------+                 |
 *                   +-------------------------------------------------+
 *
 * They all start with a BRANCH for "\|" alternatives, even when there is only
 * one alternative.
 */

/*
 * The opcodes are:
 *
 * Each opcode is stored in the first byte of a program node.  The "opnd"
 * column tells what kind of operand (if any) follows the node header:
 * "node" is a sub-program, "str" is a literal string, "nr" is a number.
 */

// definition           number     opnd?  meaning
#define END             0       //        End of program or NOMATCH operand.
#define BOL             1       //        Match "" at beginning of line.
#define EOL             2       //        Match "" at end of line.
#define BRANCH          3       // node   Match this alternative, or the
                                //        next...
#define BACK            4       //        Match "", "next" ptr points backward.
#define EXACTLY         5       // str    Match this string.
#define NOTHING         6       //        Match empty string.
#define STAR            7       // node   Match this (simple) thing 0 or more
                                //        times.
#define PLUS            8       // node   Match this (simple) thing 1 or more
                                //        times.
#define MATCH           9       // node   match the operand zero-width
#define NOMATCH         10      // node   check for no match with operand
#define BEHIND          11      // node   look behind for a match with operand
#define NOBEHIND        12      // node   look behind for no match with operand
#define SUBPAT          13      // node   match the operand here
#define BRACE_SIMPLE    14      // node   Match this (simple) thing between m and
                                //        n times (\{m,n\}).
#define BOW             15      //        Match "" after [^a-zA-Z0-9_]
#define EOW             16      //        Match "" at    [^a-zA-Z0-9_]
#define BRACE_LIMITS    17      // nr nr  define the min & max for BRACE_SIMPLE
                                //        and BRACE_COMPLEX.
#define NEWL            18      //        Match line-break
#define BHPOS           19      //        End position for BEHIND or NOBEHIND


// character classes: 20-48 normal, 50-78 include a line-break
//
// Adding ADD_NL to a class opcode gives the variant that also matches a
// line-break.  FIRST_NL and LAST_NL are parenthesized so that they expand to
// a single value in any expression, not only inside WITH_NL().
#define ADD_NL          30
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20      //      Match any one character.
#define ANYOF           21      // str  Match any character in this string.
#define ANYBUT          22      // str  Match any character not in this
                                //      string.
#define IDENT           23      //      Match identifier char
#define SIDENT          24      //      Match identifier char but no digit
#define KWORD           25      //      Match keyword char
#define SKWORD          26      //      Match word char but no digit
#define FNAME           27      //      Match file name char
#define SFNAME          28      //      Match file name char but no digit
#define PRINT           29      //      Match printable char
#define SPRINT          30      //      Match printable char but no digit
#define WHITE           31      //      Match whitespace char
#define NWHITE          32      //      Match non-whitespace char
#define DIGIT           33      //      Match digit char
#define NDIGIT          34      //      Match non-digit char
#define HEX             35      //      Match hex char
#define NHEX            36      //      Match non-hex char
#define OCTAL           37      //      Match octal char
#define NOCTAL          38      //      Match non-octal char
#define WORD            39      //      Match word char
#define NWORD           40      //      Match non-word char
#define HEAD            41      //      Match head char
#define NHEAD           42      //      Match non-head char
#define ALPHA           43      //      Match alpha char
#define NALPHA          44      //      Match non-alpha char
#define LOWER           45      //      Match lowercase char
#define NLOWER          46      //      Match non-lowercase char
#define UPPER           47      //      Match uppercase char
#define NUPPER          48      //      Match non-uppercase char
#define LAST_NL         (NUPPER + ADD_NL)
// TRUE when "op" is one of the class opcodes that include a line-break.
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)

// Opcodes that come in groups of ten: the sub-expression number (0-9) is
// added to the base value.
#define MOPEN           80      // -89       Mark this point in input as start of
                                //           \( subexpr.  MOPEN + 0 marks start of
                                //           match.
#define MCLOSE          90      // -99       Analogous to MOPEN.  MCLOSE + 0 marks
                                //           end of match.
#define BACKREF         100     // -109 node Match same string again \1-\9

#ifdef FEAT_SYN_HL
# define ZOPEN          110     // -119      Mark this point in input as start of
                                //           \z( subexpr.
# define ZCLOSE         120     // -129      Analogous to ZOPEN.
# define ZREF           130     // -139 node Match external submatch \z1-\z9
#endif

#define BRACE_COMPLEX   140     // -149 node Match nodes between m & n times

#define NOPEN           150     //      Mark this point in input as start of
                                //      \%( subexpr.
#define NCLOSE          151     //      Analogous to NOPEN.

#define MULTIBYTECODE   200     // mbc  Match one multi-byte character
#define RE_BOF          201     //      Match "" at beginning of file.
#define RE_EOF          202     //      Match "" at end of file.
#define CURSOR          203     //      Match location of cursor.

#define RE_LNUM         204     // nr cmp  Match line number
#define RE_COL          205     // nr cmp  Match column number
#define RE_VCOL         206     // nr cmp  Match virtual column number

#define RE_MARK         207     // mark cmp  Match mark position
#define RE_VISUAL       208     //      Match Visual area
#define RE_COMPOSING    209     //      any composing characters

/*
 * Flags to be passed up and down.
 * These are bit values that can be or'ed together; WORST means no flag set.
 */
#define HASWIDTH        0x1     // Known never to match null string.
#define SIMPLE          0x2     // Simple enough to be STAR/PLUS operand.
#define SPSTART         0x4     // Starts with * or +.
#define HASNL           0x8     // Contains some \n.
#define HASLOOKBH       0x10    // Contains "\@<=" or "\@<!".
#define WORST           0       // Worst case.

Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200244static int num_complex_braces; // Complex \{...} count
245static char_u *regcode; // Code-emit pointer, or JUST_CALC_SIZE
246static long regsize; // Code size.
247static int reg_toolong; // TRUE when offset out of range
248static char_u had_endbrace[NSUBEXP]; // flags, TRUE if end of () found
249static long brace_min[10]; // Minimums for complex brace repeats
250static long brace_max[10]; // Maximums for complex brace repeats
251static int brace_count[10]; // Current counts for complex brace repeats
252static int one_exactly = FALSE; // only do one char for EXACTLY
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200253
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200254// When making changes to classchars also change nfa_classcodes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200255static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
256static int classcodes[] = {
257 ANY, IDENT, SIDENT, KWORD, SKWORD,
258 FNAME, SFNAME, PRINT, SPRINT,
259 WHITE, NWHITE, DIGIT, NDIGIT,
260 HEX, NHEX, OCTAL, NOCTAL,
261 WORD, NWORD, HEAD, NHEAD,
262 ALPHA, NALPHA, LOWER, NLOWER,
263 UPPER, NUPPER
264};
265
/*
 * When regcode is set to this value, code is not emitted and size is computed
 * instead.
 */
#define JUST_CALC_SIZE  ((char_u *) -1)

// Values for rs_state in regitem_T.
// The numeric values depend on FEAT_SYN_HL: the RS_Z* entries only exist
// when that feature is compiled in.
typedef enum regstate_E
{
    RS_NOPEN = 0        // NOPEN and NCLOSE
    , RS_MOPEN          // MOPEN + [0-9]
    , RS_MCLOSE         // MCLOSE + [0-9]
#ifdef FEAT_SYN_HL
    , RS_ZOPEN          // ZOPEN + [0-9]
    , RS_ZCLOSE         // ZCLOSE + [0-9]
#endif
    , RS_BRANCH         // BRANCH
    , RS_BRCPLX_MORE    // BRACE_COMPLEX and trying one more match
    , RS_BRCPLX_LONG    // BRACE_COMPLEX and trying longest match
    , RS_BRCPLX_SHORT   // BRACE_COMPLEX and trying shortest match
    , RS_NOMATCH        // NOMATCH
    , RS_BEHIND1        // BEHIND / NOBEHIND matching rest
    , RS_BEHIND2        // BEHIND / NOBEHIND matching behind part
    , RS_STAR_LONG      // STAR/PLUS/BRACE_SIMPLE longest match
    , RS_STAR_SHORT     // STAR/PLUS/BRACE_SIMPLE shortest match
} regstate_T;

293/*
294 * Structure used to save the current input state, when it needs to be
295 * restored after trying a match. Used by reg_save() and reg_restore().
296 * Also stores the length of "backpos".
297 */
298typedef struct
299{
300 union
301 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200302 char_u *ptr; // rex.input pointer, for single-line regexp
303 lpos_T pos; // rex.input pos, for multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200304 } rs_u;
305 int rs_len;
306} regsave_T;
307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200308// struct to save start/end pointer/position in for \(\)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200309typedef struct
310{
311 union
312 {
313 char_u *ptr;
314 lpos_T pos;
315 } se_u;
316} save_se_T;
317
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200318// used for BEHIND and NOBEHIND matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200319typedef struct regbehind_S
320{
321 regsave_T save_after;
322 regsave_T save_behind;
323 int save_need_clear_subexpr;
324 save_se_T save_start[NSUBEXP];
325 save_se_T save_end[NSUBEXP];
326} regbehind_T;
327
328/*
329 * When there are alternatives a regstate_T is put on the regstack to remember
330 * what we are doing.
331 * Before it may be another type of item, depending on rs_state, to remember
332 * more things.
333 */
334typedef struct regitem_S
335{
336 regstate_T rs_state; // what we are doing, one of RS_ above
337 short rs_no; // submatch nr or BEHIND/NOBEHIND
338 char_u *rs_scan; // current node in program
339 union
340 {
341 save_se_T sesave;
342 regsave_T regsave;
343 } rs_un; // room for saving rex.input
344} regitem_T;
345
346
// used for STAR, PLUS and BRACE_SIMPLE matching
typedef struct regstar_S
{
    int         nextb;      // next byte
    int         nextb_ic;   // next byte reverse case
    long        count;
    long        minval;
    long        maxval;
} regstar_T;

Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200357// used to store input position when a BACK was encountered, so that we now if
358// we made any progress since the last time.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200359typedef struct backpos_S
360{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200361 char_u *bp_scan; // "scan" where BACK was encountered
362 regsave_T bp_pos; // last input position
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363} backpos_T;
364
365/*
366 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
367 * to avoid invoking malloc() and free() often.
368 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
369 * or regbehind_T.
370 * "backpos_T" is a table with backpos_T for BACK
371 */
372static garray_T regstack = {0, 0, 0, 0, NULL};
373static garray_T backpos = {0, 0, 0, 0, NULL};
374
375static regsave_T behind_pos;
376
/*
 * Both for regstack and backpos tables we use the following strategy of
 * allocation (to reduce malloc/free calls):
 * - Initial size is fairly small.
 * - When needed, the tables are grown bigger (8 times at first, double after
 *   that).
 * - After executing the match we free the memory only if the array has grown.
 *   Thus the memory is kept allocated when it's at the initial size.
 * This makes it fast while not keeping a lot of memory allocated.
 * A three times speed increase was observed when using many simple patterns.
 */
#define REGSTACK_INITIAL        2048
#define BACKPOS_INITIAL         64

/*
 * Opcode notes:
 *
 * BRANCH       The set of branches constituting a single choice are hooked
 *              together with their "next" pointers, since precedence prevents
 *              anything being concatenated to any individual branch. The
 *              "next" pointer of the last BRANCH in a choice points to the
 *              thing following the whole choice. This is also where the
 *              final "next" pointer of each individual branch points; each
 *              branch starts with the operand node of a BRANCH node.
 *
 * BACK         Normal "next" pointers all implicitly point forward; BACK
 *              exists to make loop structures possible.
 *
 * STAR,PLUS    '=', and complex '*' and '+', are implemented as circular
 *              BRANCH structures using BACK. Simple cases (one character
 *              per match) are implemented with STAR and PLUS for speed
 *              and to minimize recursive plunges.
 *
 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
 *              node, and defines the min and max limits to be used for that
 *              node.
 *
 * MOPEN,MCLOSE ...are numbered at compile time.
 * ZOPEN,ZCLOSE ...ditto
 */

/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 8-bit bytes, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 *
 * In all macros below "p" points to the opcode byte of a node.
 */
// The opcode of the node.
#define OP(p)           ((int)*(p))
// The 16-bit "next" offset stored in bytes 1 and 2, high byte first.
#define NEXT(p)         (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
// Pointer to the first operand byte, right after the 3-byte node header.
#define OPERAND(p)      ((p) + 3)
// Obtain an operand that was stored as four bytes, MSB first.
#define OPERAND_MIN(p)  (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
                        + ((long)(p)[5] << 8) + (long)(p)[6])
// Obtain a second operand stored as four bytes.
#define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
// Obtain a second single-byte operand stored after a four bytes operand.
#define OPERAND_CMP(p)  ((p)[7])

439static char_u *reg(int paren, int *flagp);
440
441#ifdef BT_REGEXP_DUMP
442static void regdump(char_u *, bt_regprog_T *);
443#endif
444
445static int re_num_cmp(long_u val, char_u *scan);
446
447#ifdef DEBUG
448static char_u *regprop(char_u *);
449
450static int regnarrate = 0;
451#endif
452
453
454/*
455 * Setup to parse the regexp. Used once to get the length and once to do it.
456 */
457 static void
458regcomp_start(
459 char_u *expr,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200460 int re_flags) // see vim_regcomp()
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200461{
462 initchr(expr);
463 if (re_flags & RE_MAGIC)
464 reg_magic = MAGIC_ON;
465 else
466 reg_magic = MAGIC_OFF;
467 reg_string = (re_flags & RE_STRING);
468 reg_strict = (re_flags & RE_STRICT);
469 get_cpo_flags();
470
471 num_complex_braces = 0;
472 regnpar = 1;
Bram Moolenaara80faa82020-04-12 19:37:17 +0200473 CLEAR_FIELD(had_endbrace);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200474#ifdef FEAT_SYN_HL
475 regnzpar = 1;
476 re_has_z = 0;
477#endif
478 regsize = 0L;
479 reg_toolong = FALSE;
480 regflags = 0;
481#if defined(FEAT_SYN_HL) || defined(PROTO)
482 had_eol = FALSE;
483#endif
484}
485
486/*
487 * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for
488 * character "c".
489 */
490 static int
491use_multibytecode(int c)
492{
493 return has_mbyte && (*mb_char2len)(c) > 1
494 && (re_multi_type(peekchr()) != NOT_MULTI
495 || (enc_utf8 && utf_iscomposing(c)));
496}
497
498/*
499 * Emit (if appropriate) a byte of code
500 */
501 static void
502regc(int b)
503{
504 if (regcode == JUST_CALC_SIZE)
505 regsize++;
506 else
507 *regcode++ = b;
508}
509
510/*
511 * Emit (if appropriate) a multi-byte character of code
512 */
513 static void
514regmbc(int c)
515{
516 if (!has_mbyte && c > 0xff)
517 return;
518 if (regcode == JUST_CALC_SIZE)
519 regsize += (*mb_char2len)(c);
520 else
521 regcode += (*mb_char2bytes)(c, regcode);
522}
523
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200524
525/*
526 * Produce the bytes for equivalence class "c".
527 * Currently only handles latin1, latin9 and utf-8.
528 * NOTE: When changing this function, also change nfa_emit_equi_class()
529 */
530 static void
531reg_equi_class(int c)
532{
533 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
534 || STRCMP(p_enc, "iso-8859-15") == 0)
535 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200536 switch (c)
537 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200538 // Do not use '\300' style, it results in a negative number.
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200539 case 'A': case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc4:
540 case 0xc5: case 0x100: case 0x102: case 0x104: case 0x1cd:
541 case 0x1de: case 0x1e0: case 0x1fa: case 0x202: case 0x226:
542 case 0x23a: case 0x1e00: case 0x1ea0: case 0x1ea2: case 0x1ea4:
543 case 0x1ea6: case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
544 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
545 regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2);
546 regmbc(0xc3); regmbc(0xc4); regmbc(0xc5);
547 regmbc(0x100); regmbc(0x102); regmbc(0x104);
548 regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0);
549 regmbc(0x1fa); regmbc(0x202); regmbc(0x226);
550 regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0);
551 regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6);
552 regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac);
553 regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2);
554 regmbc(0x1eb4); regmbc(0x1eb6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200555 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200556 case 'B': case 0x181: case 0x243: case 0x1e02:
557 case 0x1e04: case 0x1e06:
558 regmbc('B');
559 regmbc(0x181); regmbc(0x243); regmbc(0x1e02);
560 regmbc(0x1e04); regmbc(0x1e06);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200561 return;
562 case 'C': case 0xc7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200563 case 0x106: case 0x108: case 0x10a: case 0x10c: case 0x187:
564 case 0x23b: case 0x1e08: case 0xa792:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200565 regmbc('C'); regmbc(0xc7);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200566 regmbc(0x106); regmbc(0x108); regmbc(0x10a);
567 regmbc(0x10c); regmbc(0x187); regmbc(0x23b);
568 regmbc(0x1e08); regmbc(0xa792);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200569 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200570 case 'D': case 0x10e: case 0x110: case 0x18a:
571 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
572 case 0x1e12:
573 regmbc('D'); regmbc(0x10e); regmbc(0x110);
574 regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c);
575 regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200576 return;
577 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200578 case 0x112: case 0x114: case 0x116: case 0x118: case 0x11a:
579 case 0x204: case 0x206: case 0x228: case 0x246: case 0x1e14:
580 case 0x1e16: case 0x1e18: case 0x1e1a: case 0x1e1c:
581 case 0x1eb8: case 0x1eba: case 0x1ebc: case 0x1ebe:
582 case 0x1ec0: case 0x1ec2: case 0x1ec4: case 0x1ec6:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200583 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200584 regmbc(0xca); regmbc(0xcb); regmbc(0x112);
585 regmbc(0x114); regmbc(0x116); regmbc(0x118);
586 regmbc(0x11a); regmbc(0x204); regmbc(0x206);
587 regmbc(0x228); regmbc(0x246); regmbc(0x1e14);
588 regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a);
589 regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba);
590 regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0);
591 regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200592 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200593 case 'F': case 0x191: case 0x1e1e: case 0xa798:
594 regmbc('F'); regmbc(0x191); regmbc(0x1e1e);
595 regmbc(0xa798);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200596 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200597 case 'G': case 0x11c: case 0x11e: case 0x120:
598 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
599 case 0x1f4: case 0x1e20: case 0xa7a0:
600 regmbc('G'); regmbc(0x11c); regmbc(0x11e);
601 regmbc(0x120); regmbc(0x122); regmbc(0x193);
602 regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4);
603 regmbc(0x1e20); regmbc(0xa7a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200604 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200605 case 'H': case 0x124: case 0x126: case 0x21e:
606 case 0x1e22: case 0x1e24: case 0x1e26:
607 case 0x1e28: case 0x1e2a: case 0x2c67:
608 regmbc('H'); regmbc(0x124); regmbc(0x126);
609 regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24);
610 regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a);
611 regmbc(0x2c67);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200612 return;
613 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200614 case 0x128: case 0x12a: case 0x12c: case 0x12e:
615 case 0x130: case 0x197: case 0x1cf: case 0x208:
616 case 0x20a: case 0x1e2c: case 0x1e2e: case 0x1ec8:
617 case 0x1eca:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200618 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200619 regmbc(0xce); regmbc(0xcf); regmbc(0x128);
620 regmbc(0x12a); regmbc(0x12c); regmbc(0x12e);
621 regmbc(0x130); regmbc(0x197); regmbc(0x1cf);
622 regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c);
623 regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200624 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200625 case 'J': case 0x134: case 0x248:
626 regmbc('J'); regmbc(0x134); regmbc(0x248);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200627 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200628 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
629 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
630 regmbc('K'); regmbc(0x136); regmbc(0x198);
631 regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32);
632 regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200633 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200634 case 'L': case 0x139: case 0x13b: case 0x13d: case 0x13f:
635 case 0x141: case 0x23d: case 0x1e36: case 0x1e38:
636 case 0x1e3a: case 0x1e3c: case 0x2c60:
637 regmbc('L'); regmbc(0x139); regmbc(0x13b);
638 regmbc(0x13d); regmbc(0x13f); regmbc(0x141);
639 regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38);
640 regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200641 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200642 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
643 regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40);
644 regmbc(0x1e42);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200645 return;
646 case 'N': case 0xd1:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200647 case 0x143: case 0x145: case 0x147: case 0x1f8:
648 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
649 case 0xa7a4:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200650 regmbc('N'); regmbc(0xd1);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200651 regmbc(0x143); regmbc(0x145); regmbc(0x147);
652 regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46);
653 regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200654 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200655 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd6:
656 case 0xd8: case 0x14c: case 0x14e: case 0x150: case 0x19f:
657 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec: case 0x1fe:
658 case 0x20c: case 0x20e: case 0x22a: case 0x22c: case 0x22e:
659 case 0x230: case 0x1e4c: case 0x1e4e: case 0x1e50: case 0x1e52:
660 case 0x1ecc: case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
661 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc: case 0x1ede:
662 case 0x1ee0: case 0x1ee2:
663 regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4);
664 regmbc(0xd5); regmbc(0xd6); regmbc(0xd8);
665 regmbc(0x14c); regmbc(0x14e); regmbc(0x150);
666 regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1);
667 regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe);
668 regmbc(0x20c); regmbc(0x20e); regmbc(0x22a);
669 regmbc(0x22c); regmbc(0x22e); regmbc(0x230);
670 regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50);
671 regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece);
672 regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4);
673 regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda);
674 regmbc(0x1edc); regmbc(0x1ede); regmbc(0x1ee0);
675 regmbc(0x1ee2);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200676 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200677 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
678 regmbc('P'); regmbc(0x1a4); regmbc(0x1e54);
679 regmbc(0x1e56); regmbc(0x2c63);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200680 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200681 case 'Q': case 0x24a:
682 regmbc('Q'); regmbc(0x24a);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200683 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200684 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
685 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
686 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
687 regmbc('R'); regmbc(0x154); regmbc(0x156);
688 regmbc(0x210); regmbc(0x212); regmbc(0x158);
689 regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a);
690 regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64);
691 regmbc(0xa7a6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200692 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200693 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
694 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
695 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
696 regmbc('S'); regmbc(0x15a); regmbc(0x15c);
697 regmbc(0x15e); regmbc(0x160); regmbc(0x218);
698 regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64);
699 regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e);
700 regmbc(0xa7a8);
701 return;
702 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
703 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
704 case 0x1e6e: case 0x1e70:
705 regmbc('T'); regmbc(0x162); regmbc(0x164);
706 regmbc(0x166); regmbc(0x1ac); regmbc(0x23e);
707 regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a);
708 regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200709 return;
710 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200711 case 0x168: case 0x16a: case 0x16c: case 0x16e:
712 case 0x170: case 0x172: case 0x1af: case 0x1d3:
713 case 0x1d5: case 0x1d7: case 0x1d9: case 0x1db:
714 case 0x214: case 0x216: case 0x244: case 0x1e72:
715 case 0x1e74: case 0x1e76: case 0x1e78: case 0x1e7a:
716 case 0x1ee4: case 0x1ee6: case 0x1ee8: case 0x1eea:
717 case 0x1eec: case 0x1eee: case 0x1ef0:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200718 regmbc('U'); regmbc(0xd9); regmbc(0xda);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200719 regmbc(0xdb); regmbc(0xdc); regmbc(0x168);
720 regmbc(0x16a); regmbc(0x16c); regmbc(0x16e);
721 regmbc(0x170); regmbc(0x172); regmbc(0x1af);
722 regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7);
723 regmbc(0x1d9); regmbc(0x1db); regmbc(0x214);
724 regmbc(0x216); regmbc(0x244); regmbc(0x1e72);
725 regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78);
726 regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6);
727 regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec);
728 regmbc(0x1eee); regmbc(0x1ef0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200729 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200730 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
731 regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c);
732 regmbc(0x1e7e);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200733 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200734 case 'W': case 0x174: case 0x1e80: case 0x1e82:
735 case 0x1e84: case 0x1e86: case 0x1e88:
736 regmbc('W'); regmbc(0x174); regmbc(0x1e80);
737 regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86);
738 regmbc(0x1e88);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200739 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200740 case 'X': case 0x1e8a: case 0x1e8c:
741 regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200742 return;
743 case 'Y': case 0xdd:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200744 case 0x176: case 0x178: case 0x1b3: case 0x232: case 0x24e:
745 case 0x1e8e: case 0x1ef2: case 0x1ef6: case 0x1ef4: case 0x1ef8:
746 regmbc('Y'); regmbc(0xdd); regmbc(0x176);
747 regmbc(0x178); regmbc(0x1b3); regmbc(0x232);
748 regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2);
749 regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200750 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200751 case 'Z': case 0x179: case 0x17b: case 0x17d: case 0x1b5:
752 case 0x1e90: case 0x1e92: case 0x1e94: case 0x2c6b:
753 regmbc('Z'); regmbc(0x179); regmbc(0x17b);
754 regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90);
755 regmbc(0x1e92); regmbc(0x1e94); regmbc(0x2c6b);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200756 return;
757 case 'a': case 0xe0: case 0xe1: case 0xe2:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200758 case 0xe3: case 0xe4: case 0xe5: case 0x101: case 0x103:
759 case 0x105: case 0x1ce: case 0x1df: case 0x1e1: case 0x1fb:
760 case 0x201: case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
761 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
762 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
763 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
764 case 0x1eb7: case 0x2c65:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200765 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
766 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200767 regmbc(0xe5); regmbc(0x101); regmbc(0x103);
768 regmbc(0x105); regmbc(0x1ce); regmbc(0x1df);
769 regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201);
770 regmbc(0x203); regmbc(0x227); regmbc(0x1d8f);
771 regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1);
772 regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7);
773 regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead);
774 regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3);
775 regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200776 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200777 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
778 case 0x1e03: case 0x1e05: case 0x1e07:
779 regmbc('b');
780 regmbc(0x180); regmbc(0x253); regmbc(0x1d6c);
781 regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05);
782 regmbc(0x1e07);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200783 return;
784 case 'c': case 0xe7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200785 case 0x107: case 0x109: case 0x10b: case 0x10d: case 0x188:
786 case 0x23c: case 0x1e09: case 0xa793: case 0xa794:
787 regmbc('c'); regmbc(0xe7); regmbc(0x107);
788 regmbc(0x109); regmbc(0x10b); regmbc(0x10d);
789 regmbc(0x188); regmbc(0x23c); regmbc(0x1e09);
790 regmbc(0xa793); regmbc(0xa794);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200791 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200792 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
793 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d:
794 case 0x1e0f: case 0x1e11: case 0x1e13:
795 regmbc('d'); regmbc(0x10f); regmbc(0x111);
796 regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81);
797 regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d);
798 regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200799 return;
800 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200801 case 0x113: case 0x115: case 0x117: case 0x119:
802 case 0x11b: case 0x205: case 0x207: case 0x229:
803 case 0x247: case 0x1d92: case 0x1e15: case 0x1e17:
804 case 0x1e19: case 0x1e1b: case 0x1eb9: case 0x1ebb:
805 case 0x1e1d: case 0x1ebd: case 0x1ebf: case 0x1ec1:
806 case 0x1ec3: case 0x1ec5: case 0x1ec7:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200807 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200808 regmbc(0xea); regmbc(0xeb); regmbc(0x113);
809 regmbc(0x115); regmbc(0x117); regmbc(0x119);
810 regmbc(0x11b); regmbc(0x205); regmbc(0x207);
811 regmbc(0x229); regmbc(0x247); regmbc(0x1d92);
812 regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19);
813 regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9);
814 regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf);
815 regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5);
816 regmbc(0x1ec7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200817 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200818 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
819 case 0x1e1f: case 0xa799:
820 regmbc('f'); regmbc(0x192); regmbc(0x1d6e);
821 regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799);
822 return;
823 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
824 case 0x1e5: case 0x1e7: case 0x260: case 0x1f5: case 0x1d83:
825 case 0x1e21: case 0xa7a1:
826 regmbc('g'); regmbc(0x11d); regmbc(0x11f);
827 regmbc(0x121); regmbc(0x123); regmbc(0x1e5);
828 regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260);
829 regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200830 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200831 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
832 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
833 case 0x1e96: case 0x2c68: case 0xa795:
834 regmbc('h'); regmbc(0x125); regmbc(0x127);
835 regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25);
836 regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b);
837 regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200838 return;
839 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200840 case 0x129: case 0x12b: case 0x12d: case 0x12f:
841 case 0x1d0: case 0x209: case 0x20b: case 0x268:
842 case 0x1d96: case 0x1e2d: case 0x1e2f: case 0x1ec9:
843 case 0x1ecb:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200844 regmbc('i'); regmbc(0xec); regmbc(0xed);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200845 regmbc(0xee); regmbc(0xef); regmbc(0x129);
846 regmbc(0x12b); regmbc(0x12d); regmbc(0x12f);
847 regmbc(0x1d0); regmbc(0x209); regmbc(0x20b);
848 regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d);
849 regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200850 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200851 case 'j': case 0x135: case 0x1f0: case 0x249:
852 regmbc('j'); regmbc(0x135); regmbc(0x1f0);
853 regmbc(0x249);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200854 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200855 case 'k': case 0x137: case 0x199: case 0x1e9:
856 case 0x1d84: case 0x1e31: case 0x1e33: case 0x1e35:
857 case 0x2c6a: case 0xa741:
858 regmbc('k'); regmbc(0x137); regmbc(0x199);
859 regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31);
860 regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a);
861 regmbc(0xa741);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200862 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200863 case 'l': case 0x13a: case 0x13c: case 0x13e:
864 case 0x140: case 0x142: case 0x19a: case 0x1e37:
865 case 0x1e39: case 0x1e3b: case 0x1e3d: case 0x2c61:
866 regmbc('l'); regmbc(0x13a); regmbc(0x13c);
867 regmbc(0x13e); regmbc(0x140); regmbc(0x142);
868 regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39);
869 regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200870 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200871 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
872 regmbc('m'); regmbc(0x1d6f); regmbc(0x1e3f);
873 regmbc(0x1e41); regmbc(0x1e43);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200874 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200875 case 'n': case 0xf1: case 0x144: case 0x146: case 0x148:
876 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87:
877 case 0x1e45: case 0x1e47: case 0x1e49: case 0x1e4b:
878 case 0xa7a5:
879 regmbc('n'); regmbc(0xf1); regmbc(0x144);
880 regmbc(0x146); regmbc(0x148); regmbc(0x149);
881 regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87);
882 regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49);
883 regmbc(0x1e4b); regmbc(0xa7a5);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200884 return;
885 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200886 case 0xf6: case 0xf8: case 0x14d: case 0x14f: case 0x151:
887 case 0x1a1: case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
888 case 0x20d: case 0x20f: case 0x22b: case 0x22d: case 0x22f:
889 case 0x231: case 0x275: case 0x1e4d: case 0x1e4f:
890 case 0x1e51: case 0x1e53: case 0x1ecd: case 0x1ecf:
891 case 0x1ed1: case 0x1ed3: case 0x1ed5: case 0x1ed7:
892 case 0x1ed9: case 0x1edb: case 0x1edd: case 0x1edf:
893 case 0x1ee1: case 0x1ee3:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200894 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
895 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200896 regmbc(0xf8); regmbc(0x14d); regmbc(0x14f);
897 regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2);
898 regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff);
899 regmbc(0x20d); regmbc(0x20f); regmbc(0x22b);
900 regmbc(0x22d); regmbc(0x22f); regmbc(0x231);
901 regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f);
902 regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd);
903 regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3);
904 regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9);
905 regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf);
906 regmbc(0x1ee1); regmbc(0x1ee3);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200907 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200908 case 'p': case 0x1a5: case 0x1d71: case 0x1d88: case 0x1d7d:
909 case 0x1e55: case 0x1e57:
910 regmbc('p'); regmbc(0x1a5); regmbc(0x1d71);
911 regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55);
912 regmbc(0x1e57);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200913 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200914 case 'q': case 0x24b: case 0x2a0:
915 regmbc('q'); regmbc(0x24b); regmbc(0x2a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200916 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200917 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
918 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
919 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
920 case 0xa7a7:
921 regmbc('r'); regmbc(0x155); regmbc(0x157);
922 regmbc(0x159); regmbc(0x211); regmbc(0x213);
923 regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73);
924 regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d);
925 regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f);
926 regmbc(0xa7a7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200927 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200928 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
929 case 0x1e61: case 0x219: case 0x23f: case 0x1d74: case 0x1d8a:
930 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
931 regmbc('s'); regmbc(0x15b); regmbc(0x15d);
932 regmbc(0x15f); regmbc(0x161); regmbc(0x23f);
933 regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a);
934 regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65);
935 regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9);
936 return;
937 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
938 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
939 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
940 regmbc('t'); regmbc(0x163); regmbc(0x165);
941 regmbc(0x167); regmbc(0x1ab); regmbc(0x21b);
942 regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75);
943 regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f);
944 regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200945 return;
946 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200947 case 0x169: case 0x16b: case 0x16d: case 0x16f:
948 case 0x171: case 0x173: case 0x1b0: case 0x1d4:
949 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc:
950 case 0x215: case 0x217: case 0x289: case 0x1e73:
951 case 0x1d7e: case 0x1d99: case 0x1e75: case 0x1e77:
952 case 0x1e79: case 0x1e7b: case 0x1ee5: case 0x1ee7:
953 case 0x1ee9: case 0x1eeb: case 0x1eed: case 0x1eef:
954 case 0x1ef1:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200955 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200956 regmbc(0xfb); regmbc(0xfc); regmbc(0x169);
957 regmbc(0x16b); regmbc(0x16d); regmbc(0x16f);
958 regmbc(0x171); regmbc(0x173); regmbc(0x1d6);
959 regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc);
960 regmbc(0x215); regmbc(0x217); regmbc(0x1b0);
961 regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e);
962 regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75);
963 regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b);
964 regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9);
965 regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef);
966 regmbc(0x1ef1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200967 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200968 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
969 regmbc('v'); regmbc(0x28b); regmbc(0x1d8c);
970 regmbc(0x1e7d); regmbc(0x1e7f);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200971 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200972 case 'w': case 0x175: case 0x1e81: case 0x1e83:
973 case 0x1e85: case 0x1e87: case 0x1e89: case 0x1e98:
974 regmbc('w'); regmbc(0x175); regmbc(0x1e81);
975 regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87);
976 regmbc(0x1e89); regmbc(0x1e98);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200977 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200978 case 'x': case 0x1e8b: case 0x1e8d:
979 regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200980 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200981 case 'y': case 0xfd: case 0xff: case 0x177: case 0x1b4:
982 case 0x233: case 0x24f: case 0x1e8f: case 0x1e99: case 0x1ef3:
983 case 0x1ef5: case 0x1ef7: case 0x1ef9:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200984 regmbc('y'); regmbc(0xfd); regmbc(0xff);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 regmbc(0x177); regmbc(0x1b4); regmbc(0x233);
986 regmbc(0x24f); regmbc(0x1e8f); regmbc(0x1e99);
987 regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7);
988 regmbc(0x1ef9);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200989 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200990 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
991 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
992 case 0x1e95: case 0x2c6c:
993 regmbc('z'); regmbc(0x17a); regmbc(0x17c);
994 regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76);
995 regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93);
996 regmbc(0x1e95); regmbc(0x2c6c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200997 return;
998 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200999 }
1000 regmbc(c);
1001}
1002
1003/*
1004 * Emit a node.
1005 * Return pointer to generated code.
1006 */
1007 static char_u *
1008regnode(int op)
1009{
1010 char_u *ret;
1011
1012 ret = regcode;
1013 if (ret == JUST_CALC_SIZE)
1014 regsize += 3;
1015 else
1016 {
1017 *regcode++ = op;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001018 *regcode++ = NUL; // Null "next" pointer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001019 *regcode++ = NUL;
1020 }
1021 return ret;
1022}
1023
1024/*
1025 * Write a long as four bytes at "p" and return pointer to the next char.
1026 */
1027 static char_u *
1028re_put_long(char_u *p, long_u val)
1029{
1030 *p++ = (char_u) ((val >> 24) & 0377);
1031 *p++ = (char_u) ((val >> 16) & 0377);
1032 *p++ = (char_u) ((val >> 8) & 0377);
1033 *p++ = (char_u) (val & 0377);
1034 return p;
1035}
1036
1037/*
1038 * regnext - dig the "next" pointer out of a node
1039 * Returns NULL when calculating size, when there is no next item and when
1040 * there is an error.
1041 */
1042 static char_u *
1043regnext(char_u *p)
1044{
1045 int offset;
1046
1047 if (p == JUST_CALC_SIZE || reg_toolong)
1048 return NULL;
1049
1050 offset = NEXT(p);
1051 if (offset == 0)
1052 return NULL;
1053
1054 if (OP(p) == BACK)
1055 return p - offset;
1056 else
1057 return p + offset;
1058}
1059
1060/*
1061 * Set the next-pointer at the end of a node chain.
1062 */
1063 static void
1064regtail(char_u *p, char_u *val)
1065{
1066 char_u *scan;
1067 char_u *temp;
1068 int offset;
1069
1070 if (p == JUST_CALC_SIZE)
1071 return;
1072
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001073 // Find last node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001074 scan = p;
1075 for (;;)
1076 {
1077 temp = regnext(scan);
1078 if (temp == NULL)
1079 break;
1080 scan = temp;
1081 }
1082
1083 if (OP(scan) == BACK)
1084 offset = (int)(scan - val);
1085 else
1086 offset = (int)(val - scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001087 // When the offset uses more than 16 bits it can no longer fit in the two
1088 // bytes available. Use a global flag to avoid having to check return
1089 // values in too many places.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001090 if (offset > 0xffff)
1091 reg_toolong = TRUE;
1092 else
1093 {
1094 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
1095 *(scan + 2) = (char_u) (offset & 0377);
1096 }
1097}
1098
1099/*
1100 * Like regtail, on item after a BRANCH; nop if none.
1101 */
1102 static void
1103regoptail(char_u *p, char_u *val)
1104{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001105 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001106 if (p == NULL || p == JUST_CALC_SIZE
1107 || (OP(p) != BRANCH
1108 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
1109 return;
1110 regtail(OPERAND(p), val);
1111}
1112
1113/*
1114 * Insert an operator in front of already-emitted operand
1115 *
1116 * Means relocating the operand.
1117 */
1118 static void
1119reginsert(int op, char_u *opnd)
1120{
1121 char_u *src;
1122 char_u *dst;
1123 char_u *place;
1124
1125 if (regcode == JUST_CALC_SIZE)
1126 {
1127 regsize += 3;
1128 return;
1129 }
1130 src = regcode;
1131 regcode += 3;
1132 dst = regcode;
1133 while (src > opnd)
1134 *--dst = *--src;
1135
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001136 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001137 *place++ = op;
1138 *place++ = NUL;
1139 *place = NUL;
1140}
1141
1142/*
1143 * Insert an operator in front of already-emitted operand.
1144 * Add a number to the operator.
1145 */
1146 static void
1147reginsert_nr(int op, long val, char_u *opnd)
1148{
1149 char_u *src;
1150 char_u *dst;
1151 char_u *place;
1152
1153 if (regcode == JUST_CALC_SIZE)
1154 {
1155 regsize += 7;
1156 return;
1157 }
1158 src = regcode;
1159 regcode += 7;
1160 dst = regcode;
1161 while (src > opnd)
1162 *--dst = *--src;
1163
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001164 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001165 *place++ = op;
1166 *place++ = NUL;
1167 *place++ = NUL;
1168 re_put_long(place, (long_u)val);
1169}
1170
1171/*
1172 * Insert an operator in front of already-emitted operand.
1173 * The operator has the given limit values as operands. Also set next pointer.
1174 *
1175 * Means relocating the operand.
1176 */
1177 static void
1178reginsert_limits(
1179 int op,
1180 long minval,
1181 long maxval,
1182 char_u *opnd)
1183{
1184 char_u *src;
1185 char_u *dst;
1186 char_u *place;
1187
1188 if (regcode == JUST_CALC_SIZE)
1189 {
1190 regsize += 11;
1191 return;
1192 }
1193 src = regcode;
1194 regcode += 11;
1195 dst = regcode;
1196 while (src > opnd)
1197 *--dst = *--src;
1198
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001199 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001200 *place++ = op;
1201 *place++ = NUL;
1202 *place++ = NUL;
1203 place = re_put_long(place, (long_u)minval);
1204 place = re_put_long(place, (long_u)maxval);
1205 regtail(opnd, place);
1206}
1207
1208/*
1209 * Return TRUE if the back reference is legal. We must have seen the close
1210 * brace.
1211 * TODO: Should also check that we don't refer to something that is repeated
1212 * (+*=): what instance of the repetition should we match?
1213 */
1214 static int
1215seen_endbrace(int refnum)
1216{
1217 if (!had_endbrace[refnum])
1218 {
1219 char_u *p;
1220
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001221 // Trick: check if "@<=" or "@<!" follows, in which case
1222 // the \1 can appear before the referenced match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001223 for (p = regparse; *p != NUL; ++p)
1224 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
1225 break;
1226 if (*p == NUL)
1227 {
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001228 emsg(_(e_illegal_back_reference));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001229 rc_did_emsg = TRUE;
1230 return FALSE;
1231 }
1232 }
1233 return TRUE;
1234}
1235
1236/*
1237 * Parse the lowest level.
1238 *
1239 * Optimization: gobbles an entire sequence of ordinary characters so that
1240 * it can turn them into a single node, which is smaller to store and
1241 * faster to run. Don't do this when one_exactly is set.
1242 */
1243 static char_u *
1244regatom(int *flagp)
1245{
1246 char_u *ret;
1247 int flags;
1248 int c;
1249 char_u *p;
1250 int extra = 0;
1251 int save_prev_at_start = prev_at_start;
1252
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001253 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001254
1255 c = getchr();
1256 switch (c)
1257 {
1258 case Magic('^'):
1259 ret = regnode(BOL);
1260 break;
1261
1262 case Magic('$'):
1263 ret = regnode(EOL);
1264#if defined(FEAT_SYN_HL) || defined(PROTO)
1265 had_eol = TRUE;
1266#endif
1267 break;
1268
1269 case Magic('<'):
1270 ret = regnode(BOW);
1271 break;
1272
1273 case Magic('>'):
1274 ret = regnode(EOW);
1275 break;
1276
1277 case Magic('_'):
1278 c = no_Magic(getchr());
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001279 if (c == '^') // "\_^" is start-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001280 {
1281 ret = regnode(BOL);
1282 break;
1283 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001284 if (c == '$') // "\_$" is end-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001285 {
1286 ret = regnode(EOL);
1287#if defined(FEAT_SYN_HL) || defined(PROTO)
1288 had_eol = TRUE;
1289#endif
1290 break;
1291 }
1292
1293 extra = ADD_NL;
1294 *flagp |= HASNL;
1295
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001296 // "\_[" is character range plus newline
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001297 if (c == '[')
1298 goto collection;
1299
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001300 // "\_x" is character class plus newline
1301 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001302
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001303 // Character classes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001304 case Magic('.'):
1305 case Magic('i'):
1306 case Magic('I'):
1307 case Magic('k'):
1308 case Magic('K'):
1309 case Magic('f'):
1310 case Magic('F'):
1311 case Magic('p'):
1312 case Magic('P'):
1313 case Magic('s'):
1314 case Magic('S'):
1315 case Magic('d'):
1316 case Magic('D'):
1317 case Magic('x'):
1318 case Magic('X'):
1319 case Magic('o'):
1320 case Magic('O'):
1321 case Magic('w'):
1322 case Magic('W'):
1323 case Magic('h'):
1324 case Magic('H'):
1325 case Magic('a'):
1326 case Magic('A'):
1327 case Magic('l'):
1328 case Magic('L'):
1329 case Magic('u'):
1330 case Magic('U'):
1331 p = vim_strchr(classchars, no_Magic(c));
1332 if (p == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001333 EMSG_RET_NULL(_(e_invalid_use_of_underscore));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001334
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001335 // When '.' is followed by a composing char ignore the dot, so that
1336 // the composing char is matched here.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001337 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1338 {
1339 c = getchr();
1340 goto do_multibyte;
1341 }
1342 ret = regnode(classcodes[p - classchars] + extra);
1343 *flagp |= HASWIDTH | SIMPLE;
1344 break;
1345
1346 case Magic('n'):
1347 if (reg_string)
1348 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001349 // In a string "\n" matches a newline character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001350 ret = regnode(EXACTLY);
1351 regc(NL);
1352 regc(NUL);
1353 *flagp |= HASWIDTH | SIMPLE;
1354 }
1355 else
1356 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001357 // In buffer text "\n" matches the end of a line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001358 ret = regnode(NEWL);
1359 *flagp |= HASWIDTH | HASNL;
1360 }
1361 break;
1362
1363 case Magic('('):
1364 if (one_exactly)
1365 EMSG_ONE_RET_NULL;
1366 ret = reg(REG_PAREN, &flags);
1367 if (ret == NULL)
1368 return NULL;
1369 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1370 break;
1371
1372 case NUL:
1373 case Magic('|'):
1374 case Magic('&'):
1375 case Magic(')'):
1376 if (one_exactly)
1377 EMSG_ONE_RET_NULL;
Bram Moolenaard0819d12021-12-31 23:15:53 +00001378 // Supposed to be caught earlier.
RestorerZ68ebcee2023-05-31 17:12:14 +01001379 IEMSG_RET_NULL(e_internal_error_in_regexp);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001380 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001381
1382 case Magic('='):
1383 case Magic('?'):
1384 case Magic('+'):
1385 case Magic('@'):
1386 case Magic('{'):
1387 case Magic('*'):
1388 c = no_Magic(c);
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001389 EMSG3_RET_NULL(_(e_str_chr_follows_nothing),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001390 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001391 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001392
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001393 case Magic('~'): // previous substitute pattern
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001394 if (reg_prev_sub != NULL)
1395 {
1396 char_u *lp;
1397
1398 ret = regnode(EXACTLY);
1399 lp = reg_prev_sub;
1400 while (*lp != NUL)
1401 regc(*lp++);
1402 regc(NUL);
1403 if (*reg_prev_sub != NUL)
1404 {
1405 *flagp |= HASWIDTH;
1406 if ((lp - reg_prev_sub) == 1)
1407 *flagp |= SIMPLE;
1408 }
1409 }
1410 else
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001411 EMSG_RET_NULL(_(e_no_previous_substitute_regular_expression));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001412 break;
1413
1414 case Magic('1'):
1415 case Magic('2'):
1416 case Magic('3'):
1417 case Magic('4'):
1418 case Magic('5'):
1419 case Magic('6'):
1420 case Magic('7'):
1421 case Magic('8'):
1422 case Magic('9'):
1423 {
1424 int refnum;
1425
1426 refnum = c - Magic('0');
1427 if (!seen_endbrace(refnum))
1428 return NULL;
1429 ret = regnode(BACKREF + refnum);
1430 }
1431 break;
1432
1433 case Magic('z'):
1434 {
1435 c = no_Magic(getchr());
1436 switch (c)
1437 {
1438#ifdef FEAT_SYN_HL
1439 case '(': if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001440 EMSG_RET_NULL(_(e_z_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001441 if (one_exactly)
1442 EMSG_ONE_RET_NULL;
1443 ret = reg(REG_ZPAREN, &flags);
1444 if (ret == NULL)
1445 return NULL;
1446 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1447 re_has_z = REX_SET;
1448 break;
1449
1450 case '1':
1451 case '2':
1452 case '3':
1453 case '4':
1454 case '5':
1455 case '6':
1456 case '7':
1457 case '8':
1458 case '9': if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001459 EMSG_RET_NULL(_(e_z1_z9_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001460 ret = regnode(ZREF + c - '0');
1461 re_has_z = REX_USE;
1462 break;
1463#endif
1464
1465 case 's': ret = regnode(MOPEN + 0);
1466 if (re_mult_next("\\zs") == FAIL)
1467 return NULL;
1468 break;
1469
1470 case 'e': ret = regnode(MCLOSE + 0);
1471 if (re_mult_next("\\ze") == FAIL)
1472 return NULL;
1473 break;
1474
Bram Moolenaarb2810f12022-01-08 21:38:52 +00001475 default: EMSG_RET_NULL(_(e_invalid_character_after_bsl_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001476 }
1477 }
1478 break;
1479
1480 case Magic('%'):
1481 {
1482 c = no_Magic(getchr());
1483 switch (c)
1484 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001485 // () without a back reference
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001486 case '(':
1487 if (one_exactly)
1488 EMSG_ONE_RET_NULL;
1489 ret = reg(REG_NPAREN, &flags);
1490 if (ret == NULL)
1491 return NULL;
1492 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1493 break;
1494
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001495 // Catch \%^ and \%$ regardless of where they appear in the
1496 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001497 case '^':
1498 ret = regnode(RE_BOF);
1499 break;
1500
1501 case '$':
1502 ret = regnode(RE_EOF);
1503 break;
1504
1505 case '#':
Christian Brabandt360da402022-05-18 15:04:02 +01001506 if (regparse[0] == '=' && regparse[1] >= 48
1507 && regparse[1] <= 50)
1508 {
1509 // misplaced \%#=1
1510 semsg(_(e_atom_engine_must_be_at_start_of_pattern),
1511 regparse[1]);
1512 return FAIL;
1513 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001514 ret = regnode(CURSOR);
1515 break;
1516
1517 case 'V':
1518 ret = regnode(RE_VISUAL);
1519 break;
1520
1521 case 'C':
1522 ret = regnode(RE_COMPOSING);
1523 break;
1524
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001525 // \%[abc]: Emit as a list of branches, all ending at the last
1526 // branch which matches nothing.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001527 case '[':
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001528 if (one_exactly) // doesn't nest
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001529 EMSG_ONE_RET_NULL;
1530 {
1531 char_u *lastbranch;
1532 char_u *lastnode = NULL;
1533 char_u *br;
1534
1535 ret = NULL;
1536 while ((c = getchr()) != ']')
1537 {
1538 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001539 EMSG2_RET_NULL(_(e_missing_sb_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001540 reg_magic == MAGIC_ALL);
1541 br = regnode(BRANCH);
1542 if (ret == NULL)
1543 ret = br;
1544 else
1545 {
1546 regtail(lastnode, br);
1547 if (reg_toolong)
1548 return NULL;
1549 }
1550
1551 ungetchr();
1552 one_exactly = TRUE;
1553 lastnode = regatom(flagp);
1554 one_exactly = FALSE;
1555 if (lastnode == NULL)
1556 return NULL;
1557 }
1558 if (ret == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001559 EMSG2_RET_NULL(_(e_empty_str_brackets),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001560 reg_magic == MAGIC_ALL);
1561 lastbranch = regnode(BRANCH);
1562 br = regnode(NOTHING);
1563 if (ret != JUST_CALC_SIZE)
1564 {
1565 regtail(lastnode, br);
1566 regtail(lastbranch, br);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001567 // connect all branches to the NOTHING
1568 // branch at the end
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001569 for (br = ret; br != lastnode; )
1570 {
1571 if (OP(br) == BRANCH)
1572 {
1573 regtail(br, lastbranch);
1574 if (reg_toolong)
1575 return NULL;
1576 br = OPERAND(br);
1577 }
1578 else
1579 br = regnext(br);
1580 }
1581 }
1582 *flagp &= ~(HASWIDTH | SIMPLE);
1583 break;
1584 }
1585
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001586 case 'd': // %d123 decimal
1587 case 'o': // %o123 octal
1588 case 'x': // %xab hex 2
1589 case 'u': // %uabcd hex 4
1590 case 'U': // %U1234abcd hex 8
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001591 {
1592 long i;
1593
1594 switch (c)
1595 {
1596 case 'd': i = getdecchrs(); break;
1597 case 'o': i = getoctchrs(); break;
1598 case 'x': i = gethexchrs(2); break;
1599 case 'u': i = gethexchrs(4); break;
1600 case 'U': i = gethexchrs(8); break;
1601 default: i = -1; break;
1602 }
1603
1604 if (i < 0 || i > INT_MAX)
1605 EMSG2_RET_NULL(
Bram Moolenaara6f79292022-01-04 21:30:47 +00001606 _(e_invalid_character_after_str_2),
1607 reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001608 if (use_multibytecode(i))
1609 ret = regnode(MULTIBYTECODE);
1610 else
1611 ret = regnode(EXACTLY);
1612 if (i == 0)
1613 regc(0x0a);
1614 else
1615 regmbc(i);
1616 regc(NUL);
1617 *flagp |= HASWIDTH;
1618 break;
1619 }
1620
1621 default:
1622 if (VIM_ISDIGIT(c) || c == '<' || c == '>'
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001623 || c == '\'' || c == '.')
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001624 {
1625 long_u n = 0;
1626 int cmp;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001627 int cur = FALSE;
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001628 int got_digit = FALSE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001629
1630 cmp = c;
1631 if (cmp == '<' || cmp == '>')
1632 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001633 if (no_Magic(c) == '.')
1634 {
1635 cur = TRUE;
1636 c = getchr();
1637 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001638 while (VIM_ISDIGIT(c))
1639 {
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001640 got_digit = TRUE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001641 n = n * 10 + (c - '0');
1642 c = getchr();
1643 }
Julio B46fa3c72024-03-28 10:23:37 +01001644 if (no_Magic(c) == '\'' && n == 0)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001645 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001646 // "\%'m", "\%<'m" and "\%>'m": Mark
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001647 c = getchr();
1648 ret = regnode(RE_MARK);
1649 if (ret == JUST_CALC_SIZE)
1650 regsize += 2;
1651 else
1652 {
1653 *regcode++ = c;
1654 *regcode++ = cmp;
1655 }
1656 break;
1657 }
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001658 else if ((c == 'l' || c == 'c' || c == 'v')
1659 && (cur || got_digit))
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001660 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001661 if (cur && n)
1662 {
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001663 semsg(_(e_regexp_number_after_dot_pos_search_chr),
1664 no_Magic(c));
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001665 rc_did_emsg = TRUE;
1666 return NULL;
1667 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001668 if (c == 'l')
1669 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001670 if (cur)
1671 n = curwin->w_cursor.lnum;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001672 ret = regnode(RE_LNUM);
1673 if (save_prev_at_start)
1674 at_start = TRUE;
1675 }
1676 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001677 {
1678 if (cur)
1679 {
1680 n = curwin->w_cursor.col;
1681 n++;
1682 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001683 ret = regnode(RE_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001684 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001685 else
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001686 {
1687 if (cur)
1688 {
1689 colnr_T vcol = 0;
1690
1691 getvvcol(curwin, &curwin->w_cursor,
1692 NULL, NULL, &vcol);
1693 ++vcol;
1694 n = vcol;
1695 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001696 ret = regnode(RE_VCOL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001697 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001698 if (ret == JUST_CALC_SIZE)
1699 regsize += 5;
1700 else
1701 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001702 // put the number and the optional
1703 // comparator after the opcode
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001704 regcode = re_put_long(regcode, n);
1705 *regcode++ = cmp;
1706 }
1707 break;
1708 }
1709 }
1710
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001711 EMSG2_RET_NULL(_(e_invalid_character_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001712 reg_magic == MAGIC_ALL);
1713 }
1714 }
1715 break;
1716
1717 case Magic('['):
1718collection:
1719 {
1720 char_u *lp;
1721
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001722 // If there is no matching ']', we assume the '[' is a normal
1723 // character. This makes 'incsearch' and ":help [" work.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001724 lp = skip_anyof(regparse);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001725 if (*lp == ']') // there is a matching ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001726 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001727 int startc = -1; // > 0 when next '-' is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001728 int endc;
1729
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001730 // In a character class, different parsing rules apply.
1731 // Not even \ is special anymore, nothing is.
1732 if (*regparse == '^') // Complement of range.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001733 {
1734 ret = regnode(ANYBUT + extra);
1735 regparse++;
1736 }
1737 else
1738 ret = regnode(ANYOF + extra);
1739
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001740 // At the start ']' and '-' mean the literal character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001741 if (*regparse == ']' || *regparse == '-')
1742 {
1743 startc = *regparse;
1744 regc(*regparse++);
1745 }
1746
1747 while (*regparse != NUL && *regparse != ']')
1748 {
1749 if (*regparse == '-')
1750 {
1751 ++regparse;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001752 // The '-' is not used for a range at the end and
1753 // after or before a '\n'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001754 if (*regparse == ']' || *regparse == NUL
1755 || startc == -1
1756 || (regparse[0] == '\\' && regparse[1] == 'n'))
1757 {
1758 regc('-');
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001759 startc = '-'; // [--x] is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001760 }
1761 else
1762 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001763 // Also accept "a-[.z.]"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001764 endc = 0;
1765 if (*regparse == '[')
1766 endc = get_coll_element(&regparse);
1767 if (endc == 0)
1768 {
1769 if (has_mbyte)
1770 endc = mb_ptr2char_adv(&regparse);
1771 else
1772 endc = *regparse++;
1773 }
1774
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001775 // Handle \o40, \x20 and \u20AC style sequences
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001776 if (endc == '\\' && !reg_cpo_lit && !reg_cpo_bsl)
1777 endc = coll_get_char();
1778
1779 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001780 EMSG_RET_NULL(_(e_reverse_range_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001781 if (has_mbyte && ((*mb_char2len)(startc) > 1
1782 || (*mb_char2len)(endc) > 1))
1783 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001784 // Limit to a range of 256 chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001785 if (endc > startc + 256)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001786 EMSG_RET_NULL(_(e_range_too_large_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001787 while (++startc <= endc)
1788 regmbc(startc);
1789 }
1790 else
1791 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001792 while (++startc <= endc)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00001793 regc(startc);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001794 }
1795 startc = -1;
1796 }
1797 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001798 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1799 // accepts "\t", "\e", etc., but only when the 'l' flag in
1800 // 'cpoptions' is not included.
1801 // Posix doesn't recognize backslash at all.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001802 else if (*regparse == '\\'
1803 && !reg_cpo_bsl
1804 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1805 || (!reg_cpo_lit
1806 && vim_strchr(REGEXP_ABBR,
1807 regparse[1]) != NULL)))
1808 {
1809 regparse++;
1810 if (*regparse == 'n')
1811 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001812 // '\n' in range: also match NL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001813 if (ret != JUST_CALC_SIZE)
1814 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001815 // Using \n inside [^] does not change what
1816 // matches. "[^\n]" is the same as ".".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001817 if (*ret == ANYOF)
1818 {
1819 *ret = ANYOF + ADD_NL;
1820 *flagp |= HASNL;
1821 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001822 // else: must have had a \n already
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001823 }
1824 regparse++;
1825 startc = -1;
1826 }
1827 else if (*regparse == 'd'
1828 || *regparse == 'o'
1829 || *regparse == 'x'
1830 || *regparse == 'u'
1831 || *regparse == 'U')
1832 {
1833 startc = coll_get_char();
1834 if (startc == 0)
1835 regc(0x0a);
1836 else
1837 regmbc(startc);
1838 }
1839 else
1840 {
1841 startc = backslash_trans(*regparse++);
1842 regc(startc);
1843 }
1844 }
1845 else if (*regparse == '[')
1846 {
1847 int c_class;
1848 int cu;
1849
1850 c_class = get_char_class(&regparse);
1851 startc = -1;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001852 // Characters assumed to be 8 bits!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001853 switch (c_class)
1854 {
1855 case CLASS_NONE:
1856 c_class = get_equi_class(&regparse);
1857 if (c_class != 0)
1858 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001859 // produce equivalence class
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001860 reg_equi_class(c_class);
1861 }
1862 else if ((c_class =
1863 get_coll_element(&regparse)) != 0)
1864 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001865 // produce a collating element
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001866 regmbc(c_class);
1867 }
1868 else
1869 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001870 // literal '[', allow [[-x] as a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001871 startc = *regparse++;
1872 regc(startc);
1873 }
1874 break;
1875 case CLASS_ALNUM:
1876 for (cu = 1; cu < 128; cu++)
1877 if (isalnum(cu))
1878 regmbc(cu);
1879 break;
1880 case CLASS_ALPHA:
1881 for (cu = 1; cu < 128; cu++)
1882 if (isalpha(cu))
1883 regmbc(cu);
1884 break;
1885 case CLASS_BLANK:
1886 regc(' ');
1887 regc('\t');
1888 break;
1889 case CLASS_CNTRL:
1890 for (cu = 1; cu <= 127; cu++)
1891 if (iscntrl(cu))
1892 regmbc(cu);
1893 break;
1894 case CLASS_DIGIT:
1895 for (cu = 1; cu <= 127; cu++)
1896 if (VIM_ISDIGIT(cu))
1897 regmbc(cu);
1898 break;
1899 case CLASS_GRAPH:
1900 for (cu = 1; cu <= 127; cu++)
1901 if (isgraph(cu))
1902 regmbc(cu);
1903 break;
1904 case CLASS_LOWER:
1905 for (cu = 1; cu <= 255; cu++)
1906 if (MB_ISLOWER(cu) && cu != 170
1907 && cu != 186)
1908 regmbc(cu);
1909 break;
1910 case CLASS_PRINT:
1911 for (cu = 1; cu <= 255; cu++)
1912 if (vim_isprintc(cu))
1913 regmbc(cu);
1914 break;
1915 case CLASS_PUNCT:
1916 for (cu = 1; cu < 128; cu++)
1917 if (ispunct(cu))
1918 regmbc(cu);
1919 break;
1920 case CLASS_SPACE:
1921 for (cu = 9; cu <= 13; cu++)
1922 regc(cu);
1923 regc(' ');
1924 break;
1925 case CLASS_UPPER:
1926 for (cu = 1; cu <= 255; cu++)
1927 if (MB_ISUPPER(cu))
1928 regmbc(cu);
1929 break;
1930 case CLASS_XDIGIT:
1931 for (cu = 1; cu <= 255; cu++)
1932 if (vim_isxdigit(cu))
1933 regmbc(cu);
1934 break;
1935 case CLASS_TAB:
1936 regc('\t');
1937 break;
1938 case CLASS_RETURN:
1939 regc('\r');
1940 break;
1941 case CLASS_BACKSPACE:
1942 regc('\b');
1943 break;
1944 case CLASS_ESCAPE:
1945 regc('\033');
1946 break;
1947 case CLASS_IDENT:
1948 for (cu = 1; cu <= 255; cu++)
1949 if (vim_isIDc(cu))
1950 regmbc(cu);
1951 break;
1952 case CLASS_KEYWORD:
1953 for (cu = 1; cu <= 255; cu++)
1954 if (reg_iswordc(cu))
1955 regmbc(cu);
1956 break;
1957 case CLASS_FNAME:
1958 for (cu = 1; cu <= 255; cu++)
1959 if (vim_isfilec(cu))
1960 regmbc(cu);
1961 break;
1962 }
1963 }
1964 else
1965 {
1966 if (has_mbyte)
1967 {
1968 int len;
1969
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001970 // produce a multibyte character, including any
1971 // following composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001972 startc = mb_ptr2char(regparse);
1973 len = (*mb_ptr2len)(regparse);
1974 if (enc_utf8 && utf_char2len(startc) != len)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001975 startc = -1; // composing chars
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001976 while (--len >= 0)
1977 regc(*regparse++);
1978 }
1979 else
1980 {
1981 startc = *regparse++;
1982 regc(startc);
1983 }
1984 }
1985 }
1986 regc(NUL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001987 prevchr_len = 1; // last char was the ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001988 if (*regparse != ']')
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001989 EMSG_RET_NULL(_(e_too_many_brackets)); // Cannot happen?
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001990 skipchr(); // let's be friends with the lexer again
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001991 *flagp |= HASWIDTH | SIMPLE;
1992 break;
1993 }
1994 else if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001995 EMSG2_RET_NULL(_(e_missing_rsb_after_str_lsb),
1996 reg_magic > MAGIC_OFF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001997 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001998 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001999
2000 default:
2001 {
2002 int len;
2003
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002004 // A multi-byte character is handled as a separate atom if it's
2005 // before a multi and when it's a composing char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002006 if (use_multibytecode(c))
2007 {
2008do_multibyte:
2009 ret = regnode(MULTIBYTECODE);
2010 regmbc(c);
2011 *flagp |= HASWIDTH | SIMPLE;
2012 break;
2013 }
2014
2015 ret = regnode(EXACTLY);
2016
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002017 // Append characters as long as:
2018 // - there is no following multi, we then need the character in
2019 // front of it as a single character operand
2020 // - not running into a Magic character
2021 // - "one_exactly" is not set
2022 // But always emit at least one character. Might be a Multi,
2023 // e.g., a "[" without matching "]".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002024 for (len = 0; c != NUL && (len == 0
2025 || (re_multi_type(peekchr()) == NOT_MULTI
2026 && !one_exactly
2027 && !is_Magic(c))); ++len)
2028 {
2029 c = no_Magic(c);
2030 if (has_mbyte)
2031 {
2032 regmbc(c);
2033 if (enc_utf8)
2034 {
2035 int l;
2036
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002037 // Need to get composing character too.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002038 for (;;)
2039 {
2040 l = utf_ptr2len(regparse);
2041 if (!UTF_COMPOSINGLIKE(regparse, regparse + l))
2042 break;
2043 regmbc(utf_ptr2char(regparse));
2044 skipchr();
2045 }
2046 }
2047 }
2048 else
2049 regc(c);
2050 c = getchr();
2051 }
2052 ungetchr();
2053
2054 regc(NUL);
2055 *flagp |= HASWIDTH;
2056 if (len == 1)
2057 *flagp |= SIMPLE;
2058 }
2059 break;
2060 }
2061
2062 return ret;
2063}
2064
2065/*
2066 * Parse something followed by possible [*+=].
2067 *
2068 * Note that the branching code sequences used for = and the general cases
2069 * of * and + are somewhat optimized: they use the same NOTHING node as
2070 * both the endmarker for their branch list and the body of the last branch.
2071 * It might seem that this node could be dispensed with entirely, but the
2072 * endmarker role is not redundant.
2073 */
2074 static char_u *
2075regpiece(int *flagp)
2076{
2077 char_u *ret;
2078 int op;
2079 char_u *next;
2080 int flags;
2081 long minval;
2082 long maxval;
2083
2084 ret = regatom(&flags);
2085 if (ret == NULL)
2086 return NULL;
2087
2088 op = peekchr();
2089 if (re_multi_type(op) == NOT_MULTI)
2090 {
2091 *flagp = flags;
2092 return ret;
2093 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002094 // default flags
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002095 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
2096
2097 skipchr();
2098 switch (op)
2099 {
2100 case Magic('*'):
2101 if (flags & SIMPLE)
2102 reginsert(STAR, ret);
2103 else
2104 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002105 // Emit x* as (x&|), where & means "self".
2106 reginsert(BRANCH, ret); // Either x
2107 regoptail(ret, regnode(BACK)); // and loop
2108 regoptail(ret, ret); // back
2109 regtail(ret, regnode(BRANCH)); // or
2110 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002111 }
2112 break;
2113
2114 case Magic('+'):
2115 if (flags & SIMPLE)
2116 reginsert(PLUS, ret);
2117 else
2118 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002119 // Emit x+ as x(&|), where & means "self".
2120 next = regnode(BRANCH); // Either
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002121 regtail(ret, next);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002122 regtail(regnode(BACK), ret); // loop back
2123 regtail(next, regnode(BRANCH)); // or
2124 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002125 }
2126 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2127 break;
2128
2129 case Magic('@'):
2130 {
2131 int lop = END;
2132 long nr;
2133
2134 nr = getdecchrs();
2135 switch (no_Magic(getchr()))
2136 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002137 case '=': lop = MATCH; break; // \@=
2138 case '!': lop = NOMATCH; break; // \@!
2139 case '>': lop = SUBPAT; break; // \@>
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002140 case '<': switch (no_Magic(getchr()))
2141 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002142 case '=': lop = BEHIND; break; // \@<=
2143 case '!': lop = NOBEHIND; break; // \@<!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002144 }
2145 }
2146 if (lop == END)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002147 EMSG2_RET_NULL(_(e_invalid_character_after_str_at),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002148 reg_magic == MAGIC_ALL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002149 // Look behind must match with behind_pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002150 if (lop == BEHIND || lop == NOBEHIND)
2151 {
2152 regtail(ret, regnode(BHPOS));
2153 *flagp |= HASLOOKBH;
2154 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002155 regtail(ret, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002156 if (lop == BEHIND || lop == NOBEHIND)
2157 {
2158 if (nr < 0)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002159 nr = 0; // no limit is same as zero limit
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002160 reginsert_nr(lop, nr, ret);
2161 }
2162 else
2163 reginsert(lop, ret);
2164 break;
2165 }
2166
2167 case Magic('?'):
2168 case Magic('='):
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002169 // Emit x= as (x|)
2170 reginsert(BRANCH, ret); // Either x
2171 regtail(ret, regnode(BRANCH)); // or
2172 next = regnode(NOTHING); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002173 regtail(ret, next);
2174 regoptail(ret, next);
2175 break;
2176
2177 case Magic('{'):
2178 if (!read_limits(&minval, &maxval))
2179 return NULL;
2180 if (flags & SIMPLE)
2181 {
2182 reginsert(BRACE_SIMPLE, ret);
2183 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2184 }
2185 else
2186 {
2187 if (num_complex_braces >= 10)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002188 EMSG2_RET_NULL(_(e_too_many_complex_str_curly),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002189 reg_magic == MAGIC_ALL);
2190 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
2191 regoptail(ret, regnode(BACK));
2192 regoptail(ret, ret);
2193 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2194 ++num_complex_braces;
2195 }
2196 if (minval > 0 && maxval > 0)
2197 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2198 break;
2199 }
2200 if (re_multi_type(peekchr()) != NOT_MULTI)
2201 {
2202 // Can't have a multi follow a multi.
2203 if (peekchr() == Magic('*'))
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00002204 EMSG2_RET_NULL(_(e_nested_str), reg_magic >= MAGIC_ON);
2205 EMSG3_RET_NULL(_(e_nested_str_chr), reg_magic == MAGIC_ALL,
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002206 no_Magic(peekchr()));
2207 }
2208
2209 return ret;
2210}
2211
2212/*
2213 * Parse one alternative of an | or & operator.
2214 * Implements the concatenation operator.
2215 */
2216 static char_u *
2217regconcat(int *flagp)
2218{
2219 char_u *first = NULL;
2220 char_u *chain = NULL;
2221 char_u *latest;
2222 int flags;
2223 int cont = TRUE;
2224
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002225 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002226
2227 while (cont)
2228 {
2229 switch (peekchr())
2230 {
2231 case NUL:
2232 case Magic('|'):
2233 case Magic('&'):
2234 case Magic(')'):
2235 cont = FALSE;
2236 break;
2237 case Magic('Z'):
2238 regflags |= RF_ICOMBINE;
2239 skipchr_keepstart();
2240 break;
2241 case Magic('c'):
2242 regflags |= RF_ICASE;
2243 skipchr_keepstart();
2244 break;
2245 case Magic('C'):
2246 regflags |= RF_NOICASE;
2247 skipchr_keepstart();
2248 break;
2249 case Magic('v'):
2250 reg_magic = MAGIC_ALL;
2251 skipchr_keepstart();
2252 curchr = -1;
2253 break;
2254 case Magic('m'):
2255 reg_magic = MAGIC_ON;
2256 skipchr_keepstart();
2257 curchr = -1;
2258 break;
2259 case Magic('M'):
2260 reg_magic = MAGIC_OFF;
2261 skipchr_keepstart();
2262 curchr = -1;
2263 break;
2264 case Magic('V'):
2265 reg_magic = MAGIC_NONE;
2266 skipchr_keepstart();
2267 curchr = -1;
2268 break;
2269 default:
2270 latest = regpiece(&flags);
2271 if (latest == NULL || reg_toolong)
2272 return NULL;
2273 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002274 if (chain == NULL) // First piece.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002275 *flagp |= flags & SPSTART;
2276 else
2277 regtail(chain, latest);
2278 chain = latest;
2279 if (first == NULL)
2280 first = latest;
2281 break;
2282 }
2283 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002284 if (first == NULL) // Loop ran zero times.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002285 first = regnode(NOTHING);
2286 return first;
2287}
2288
2289/*
2290 * Parse one alternative of an | operator.
2291 * Implements the & operator.
2292 */
2293 static char_u *
2294regbranch(int *flagp)
2295{
2296 char_u *ret;
2297 char_u *chain = NULL;
2298 char_u *latest;
2299 int flags;
2300
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002301 *flagp = WORST | HASNL; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002302
2303 ret = regnode(BRANCH);
2304 for (;;)
2305 {
2306 latest = regconcat(&flags);
2307 if (latest == NULL)
2308 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002309 // If one of the branches has width, the whole thing has. If one of
2310 // the branches anchors at start-of-line, the whole thing does.
2311 // If one of the branches uses look-behind, the whole thing does.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002312 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002313 // If one of the branches doesn't match a line-break, the whole thing
2314 // doesn't.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002315 *flagp &= ~HASNL | (flags & HASNL);
2316 if (chain != NULL)
2317 regtail(chain, latest);
2318 if (peekchr() != Magic('&'))
2319 break;
2320 skipchr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002321 regtail(latest, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002322 if (reg_toolong)
2323 break;
2324 reginsert(MATCH, latest);
2325 chain = latest;
2326 }
2327
2328 return ret;
2329}
2330
2331/*
2332 * Parse regular expression, i.e. main body or parenthesized thing.
2333 *
2334 * Caller must absorb opening parenthesis.
2335 *
2336 * Combining parenthesis handling with the base level of regular expression
2337 * is a trifle forced, but the need to tie the tails of the branches to what
2338 * follows makes it hard to avoid.
2339 */
2340 static char_u *
2341reg(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002342 int paren, // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002343 int *flagp)
2344{
2345 char_u *ret;
2346 char_u *br;
2347 char_u *ender;
2348 int parno = 0;
2349 int flags;
2350
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002351 *flagp = HASWIDTH; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002352
2353#ifdef FEAT_SYN_HL
2354 if (paren == REG_ZPAREN)
2355 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002356 // Make a ZOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002357 if (regnzpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002358 EMSG_RET_NULL(_(e_too_many_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002359 parno = regnzpar;
2360 regnzpar++;
2361 ret = regnode(ZOPEN + parno);
2362 }
2363 else
2364#endif
2365 if (paren == REG_PAREN)
2366 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002367 // Make a MOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002368 if (regnpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002369 EMSG2_RET_NULL(_(e_too_many_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002370 parno = regnpar;
2371 ++regnpar;
2372 ret = regnode(MOPEN + parno);
2373 }
2374 else if (paren == REG_NPAREN)
2375 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002376 // Make a NOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002377 ret = regnode(NOPEN);
2378 }
2379 else
2380 ret = NULL;
2381
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002382 // Pick up the branches, linking them together.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002383 br = regbranch(&flags);
2384 if (br == NULL)
2385 return NULL;
2386 if (ret != NULL)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002387 regtail(ret, br); // [MZ]OPEN -> first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002388 else
2389 ret = br;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002390 // If one of the branches can be zero-width, the whole thing can.
2391 // If one of the branches has * at start or matches a line-break, the
2392 // whole thing can.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002393 if (!(flags & HASWIDTH))
2394 *flagp &= ~HASWIDTH;
2395 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2396 while (peekchr() == Magic('|'))
2397 {
2398 skipchr();
2399 br = regbranch(&flags);
2400 if (br == NULL || reg_toolong)
2401 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002402 regtail(ret, br); // BRANCH -> BRANCH.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002403 if (!(flags & HASWIDTH))
2404 *flagp &= ~HASWIDTH;
2405 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2406 }
2407
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002408 // Make a closing node, and hook it on the end.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002409 ender = regnode(
2410#ifdef FEAT_SYN_HL
2411 paren == REG_ZPAREN ? ZCLOSE + parno :
2412#endif
2413 paren == REG_PAREN ? MCLOSE + parno :
2414 paren == REG_NPAREN ? NCLOSE : END);
2415 regtail(ret, ender);
2416
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002417 // Hook the tails of the branches to the closing node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002418 for (br = ret; br != NULL; br = regnext(br))
2419 regoptail(br, ender);
2420
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002421 // Check for proper termination.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002422 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2423 {
2424#ifdef FEAT_SYN_HL
2425 if (paren == REG_ZPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002426 EMSG_RET_NULL(_(e_unmatched_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002427 else
2428#endif
2429 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002430 EMSG2_RET_NULL(_(e_unmatched_str_percent_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002431 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002432 EMSG2_RET_NULL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002433 }
2434 else if (paren == REG_NOPAREN && peekchr() != NUL)
2435 {
2436 if (curchr == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002437 EMSG2_RET_NULL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002438 else
Bram Moolenaar74409f62022-01-01 15:58:22 +00002439 EMSG_RET_NULL(_(e_trailing_characters)); // "Can't happen".
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002440 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002441 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002442 // Here we set the flag allowing back references to this set of
2443 // parentheses.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002444 if (paren == REG_PAREN)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002445 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002446 return ret;
2447}
2448
2449/*
2450 * bt_regcomp() - compile a regular expression into internal code for the
2451 * traditional back track matcher.
2452 * Returns the program in allocated space. Returns NULL for an error.
2453 *
2454 * We can't allocate space until we know how big the compiled form will be,
2455 * but we can't compile it (and thus know how big it is) until we've got a
2456 * place to put the code. So we cheat: we compile it twice, once with code
2457 * generation turned off and size counting turned on, and once "for real".
2458 * This also means that we don't allocate space until we are sure that the
2459 * thing really will compile successfully, and we never have to move the
2460 * code and thus invalidate pointers into it. (Note that it has to be in
2461 * one piece because vim_free() must be able to free it all.)
2462 *
2463 * Whether upper/lower case is to be ignored is decided when executing the
2464 * program, it does not matter here.
2465 *
2466 * Beware that the optimization-preparation code in here knows about some
2467 * of the structure of the compiled regexp.
2468 * "re_flags": RE_MAGIC and/or RE_STRING.
2469 */
2470 static regprog_T *
2471bt_regcomp(char_u *expr, int re_flags)
2472{
2473 bt_regprog_T *r;
2474 char_u *scan;
2475 char_u *longest;
2476 int len;
2477 int flags;
2478
2479 if (expr == NULL)
RestorerZ68ebcee2023-05-31 17:12:14 +01002480 IEMSG_RET_NULL(e_null_argument);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002481
2482 init_class_tab();
2483
2484 // First pass: determine size, legality.
2485 regcomp_start(expr, re_flags);
2486 regcode = JUST_CALC_SIZE;
2487 regc(REGMAGIC);
2488 if (reg(REG_NOPAREN, &flags) == NULL)
2489 return NULL;
2490
2491 // Allocate space.
2492 r = alloc(offsetof(bt_regprog_T, program) + regsize);
2493 if (r == NULL)
2494 return NULL;
2495 r->re_in_use = FALSE;
2496
2497 // Second pass: emit code.
2498 regcomp_start(expr, re_flags);
2499 regcode = r->program;
2500 regc(REGMAGIC);
2501 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
2502 {
2503 vim_free(r);
2504 if (reg_toolong)
Bram Moolenaareaaac012022-01-02 17:00:40 +00002505 EMSG_RET_NULL(_(e_pattern_too_long));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002506 return NULL;
2507 }
2508
2509 // Dig out information for optimizations.
2510 r->regstart = NUL; // Worst-case defaults.
2511 r->reganch = 0;
2512 r->regmust = NULL;
2513 r->regmlen = 0;
2514 r->regflags = regflags;
2515 if (flags & HASNL)
2516 r->regflags |= RF_HASNL;
2517 if (flags & HASLOOKBH)
2518 r->regflags |= RF_LOOKBH;
2519#ifdef FEAT_SYN_HL
2520 // Remember whether this pattern has any \z specials in it.
2521 r->reghasz = re_has_z;
2522#endif
2523 scan = r->program + 1; // First BRANCH.
2524 if (OP(regnext(scan)) == END) // Only one top-level choice.
2525 {
2526 scan = OPERAND(scan);
2527
2528 // Starting-point info.
2529 if (OP(scan) == BOL || OP(scan) == RE_BOF)
2530 {
2531 r->reganch++;
2532 scan = regnext(scan);
2533 }
2534
2535 if (OP(scan) == EXACTLY)
2536 {
2537 if (has_mbyte)
2538 r->regstart = (*mb_ptr2char)(OPERAND(scan));
2539 else
2540 r->regstart = *OPERAND(scan);
2541 }
2542 else if ((OP(scan) == BOW
2543 || OP(scan) == EOW
2544 || OP(scan) == NOTHING
2545 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
2546 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
2547 && OP(regnext(scan)) == EXACTLY)
2548 {
2549 if (has_mbyte)
2550 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
2551 else
2552 r->regstart = *OPERAND(regnext(scan));
2553 }
2554
2555 // If there's something expensive in the r.e., find the longest
2556 // literal string that must appear and make it the regmust. Resolve
2557 // ties in favor of later strings, since the regstart check works
2558 // with the beginning of the r.e. and avoiding duplication
2559 // strengthens checking. Not a strong reason, but sufficient in the
2560 // absence of others.
2561
2562 // When the r.e. starts with BOW, it is faster to look for a regmust
2563 // first. Used a lot for "#" and "*" commands. (Added by mool).
2564 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
2565 && !(flags & HASNL))
2566 {
John Marriott82792db2024-05-12 00:07:17 +02002567 size_t scanlen;
2568
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002569 longest = NULL;
2570 len = 0;
2571 for (; scan != NULL; scan = regnext(scan))
John Marriott82792db2024-05-12 00:07:17 +02002572 {
2573 if (OP(scan) == EXACTLY)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002574 {
John Marriott82792db2024-05-12 00:07:17 +02002575 scanlen = STRLEN(OPERAND(scan));
2576 if (scanlen >= (size_t)len)
2577 {
2578 longest = OPERAND(scan);
2579 len = (int)scanlen;
2580 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002581 }
John Marriott82792db2024-05-12 00:07:17 +02002582 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002583 r->regmust = longest;
2584 r->regmlen = len;
2585 }
2586 }
2587#ifdef BT_REGEXP_DUMP
2588 regdump(expr, r);
2589#endif
2590 r->engine = &bt_regengine;
2591 return (regprog_T *)r;
2592}
2593
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Return the "had_eol" flag: whether the EOL item "$" was found during the
 * previous call to vim_regcomp().  This is messy, but it works fine.
 */
    int
vim_regcomp_had_eol(void)
{
    int seen_eol = had_eol;

    return seen_eol;
}
#endif
2605
2606/*
2607 * Get a number after a backslash that is inside [].
2608 * When nothing is recognized return a backslash.
2609 */
2610 static int
2611coll_get_char(void)
2612{
2613 long nr = -1;
2614
2615 switch (*regparse++)
2616 {
2617 case 'd': nr = getdecchrs(); break;
2618 case 'o': nr = getoctchrs(); break;
2619 case 'x': nr = gethexchrs(2); break;
2620 case 'u': nr = gethexchrs(4); break;
2621 case 'U': nr = gethexchrs(8); break;
2622 }
2623 if (nr < 0 || nr > INT_MAX)
2624 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002625 // If getting the number fails be backwards compatible: the character
2626 // is a backslash.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002627 --regparse;
2628 nr = '\\';
2629 }
2630 return nr;
2631}
2632
2633/*
2634 * Free a compiled regexp program, returned by bt_regcomp().
2635 */
2636 static void
2637bt_regfree(regprog_T *prog)
2638{
2639 vim_free(prog);
2640}
2641
// Advance "rex.input" with MB_PTR_ADV().
#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)

/*
 * Storage for the arguments from BRACE_LIMITS.  Conceptually these are local
 * to regmatch(); they are kept here to reduce the amount of stack space used
 * (it can be called recursively many times).
 */
static long bl_minval;
static long bl_maxval;
2651
2652/*
2653 * Save the input line and position in a regsave_T.
2654 */
2655 static void
2656reg_save(regsave_T *save, garray_T *gap)
2657{
2658 if (REG_MULTI)
2659 {
2660 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
2661 save->rs_u.pos.lnum = rex.lnum;
2662 }
2663 else
2664 save->rs_u.ptr = rex.input;
2665 save->rs_len = gap->ga_len;
2666}
2667
2668/*
2669 * Restore the input line and position from a regsave_T.
2670 */
2671 static void
2672reg_restore(regsave_T *save, garray_T *gap)
2673{
2674 if (REG_MULTI)
2675 {
2676 if (rex.lnum != save->rs_u.pos.lnum)
2677 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002678 // only call reg_getline() when the line number changed to save
2679 // a bit of time
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002680 rex.lnum = save->rs_u.pos.lnum;
2681 rex.line = reg_getline(rex.lnum);
2682 }
2683 rex.input = rex.line + save->rs_u.pos.col;
2684 }
2685 else
2686 rex.input = save->rs_u.ptr;
2687 gap->ga_len = save->rs_len;
2688}
2689
2690/*
2691 * Return TRUE if current position is equal to saved position.
2692 */
2693 static int
2694reg_save_equal(regsave_T *save)
2695{
2696 if (REG_MULTI)
2697 return rex.lnum == save->rs_u.pos.lnum
2698 && rex.input == rex.line + save->rs_u.pos.col;
2699 return rex.input == save->rs_u.ptr;
2700}
2701
// Save the sub-expressions before attempting a match.
// Dispatches to save_se_multi() (position "posp") when REG_MULTI is set and
// to save_se_one() (pointer "pp") otherwise.  The replacement list is
// parenthesized so that the conditional expression can't combine with
// operators at the place where the macro is used.
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))

// After a failed match restore the sub-expressions: "*posp" from the saved
// position when REG_MULTI is set, "*pp" from the saved pointer otherwise.
// NOTE: this expands to a brace block, not an expression; it must not be
// used as the unbraced body of an "if" that is followed by "else".
#define restore_se(savep, posp, pp) \
{ \
    if (REG_MULTI) \
        *(posp) = (savep)->se_u.pos; \
    else \
        *(pp) = (savep)->se_u.ptr; \
}
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002714
2715/*
2716 * Tentatively set the sub-expression start to the current position (after
2717 * calling regmatch() they will have changed). Need to save the existing
2718 * values for when there is no match.
2719 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
2720 * depending on REG_MULTI.
2721 */
2722 static void
2723save_se_multi(save_se_T *savep, lpos_T *posp)
2724{
2725 savep->se_u.pos = *posp;
2726 posp->lnum = rex.lnum;
2727 posp->col = (colnr_T)(rex.input - rex.line);
2728}
2729
2730 static void
2731save_se_one(save_se_T *savep, char_u **pp)
2732{
2733 savep->se_u.ptr = *pp;
2734 *pp = rex.input;
2735}
2736
2737/*
2738 * regrepeat - repeatedly match something simple, return how many.
2739 * Advances rex.input (and rex.lnum) to just after the matched chars.
2740 */
2741 static int
2742regrepeat(
2743 char_u *p,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002744 long maxcount) // maximum number of matches allowed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002745{
2746 long count = 0;
2747 char_u *scan;
2748 char_u *opnd;
2749 int mask;
2750 int testval = 0;
2751
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002752 scan = rex.input; // Make local copy of rex.input for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002753 opnd = OPERAND(p);
2754 switch (OP(p))
2755 {
2756 case ANY:
2757 case ANY + ADD_NL:
2758 while (count < maxcount)
2759 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002760 // Matching anything means we continue until end-of-line (or
2761 // end-of-file for ANY + ADD_NL), only limited by maxcount.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002762 while (*scan != NUL && count < maxcount)
2763 {
2764 ++count;
2765 MB_PTR_ADV(scan);
2766 }
2767 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2768 || rex.reg_line_lbr || count == maxcount)
2769 break;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002770 ++count; // count the line-break
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002771 reg_nextline();
2772 scan = rex.input;
2773 if (got_int)
2774 break;
2775 }
2776 break;
2777
2778 case IDENT:
2779 case IDENT + ADD_NL:
2780 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002781 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002782 case SIDENT:
2783 case SIDENT + ADD_NL:
2784 while (count < maxcount)
2785 {
2786 if (vim_isIDc(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2787 {
2788 MB_PTR_ADV(scan);
2789 }
2790 else if (*scan == NUL)
2791 {
2792 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2793 || rex.reg_line_lbr)
2794 break;
2795 reg_nextline();
2796 scan = rex.input;
2797 if (got_int)
2798 break;
2799 }
2800 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2801 ++scan;
2802 else
2803 break;
2804 ++count;
2805 }
2806 break;
2807
2808 case KWORD:
2809 case KWORD + ADD_NL:
2810 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002811 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002812 case SKWORD:
2813 case SKWORD + ADD_NL:
2814 while (count < maxcount)
2815 {
2816 if (vim_iswordp_buf(scan, rex.reg_buf)
2817 && (testval || !VIM_ISDIGIT(*scan)))
2818 {
2819 MB_PTR_ADV(scan);
2820 }
2821 else if (*scan == NUL)
2822 {
2823 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2824 || rex.reg_line_lbr)
2825 break;
2826 reg_nextline();
2827 scan = rex.input;
2828 if (got_int)
2829 break;
2830 }
2831 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2832 ++scan;
2833 else
2834 break;
2835 ++count;
2836 }
2837 break;
2838
2839 case FNAME:
2840 case FNAME + ADD_NL:
2841 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002842 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002843 case SFNAME:
2844 case SFNAME + ADD_NL:
2845 while (count < maxcount)
2846 {
2847 if (vim_isfilec(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2848 {
2849 MB_PTR_ADV(scan);
2850 }
2851 else if (*scan == NUL)
2852 {
2853 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2854 || rex.reg_line_lbr)
2855 break;
2856 reg_nextline();
2857 scan = rex.input;
2858 if (got_int)
2859 break;
2860 }
2861 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2862 ++scan;
2863 else
2864 break;
2865 ++count;
2866 }
2867 break;
2868
2869 case PRINT:
2870 case PRINT + ADD_NL:
2871 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002872 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002873 case SPRINT:
2874 case SPRINT + ADD_NL:
2875 while (count < maxcount)
2876 {
2877 if (*scan == NUL)
2878 {
2879 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2880 || rex.reg_line_lbr)
2881 break;
2882 reg_nextline();
2883 scan = rex.input;
2884 if (got_int)
2885 break;
2886 }
2887 else if (vim_isprintc(PTR2CHAR(scan)) == 1
2888 && (testval || !VIM_ISDIGIT(*scan)))
2889 {
2890 MB_PTR_ADV(scan);
2891 }
2892 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2893 ++scan;
2894 else
2895 break;
2896 ++count;
2897 }
2898 break;
2899
2900 case WHITE:
2901 case WHITE + ADD_NL:
2902 testval = mask = RI_WHITE;
2903do_class:
2904 while (count < maxcount)
2905 {
2906 int l;
2907
2908 if (*scan == NUL)
2909 {
2910 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2911 || rex.reg_line_lbr)
2912 break;
2913 reg_nextline();
2914 scan = rex.input;
2915 if (got_int)
2916 break;
2917 }
2918 else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1)
2919 {
2920 if (testval != 0)
2921 break;
2922 scan += l;
2923 }
2924 else if ((class_tab[*scan] & mask) == testval)
2925 ++scan;
2926 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2927 ++scan;
2928 else
2929 break;
2930 ++count;
2931 }
2932 break;
2933
2934 case NWHITE:
2935 case NWHITE + ADD_NL:
2936 mask = RI_WHITE;
2937 goto do_class;
2938 case DIGIT:
2939 case DIGIT + ADD_NL:
2940 testval = mask = RI_DIGIT;
2941 goto do_class;
2942 case NDIGIT:
2943 case NDIGIT + ADD_NL:
2944 mask = RI_DIGIT;
2945 goto do_class;
2946 case HEX:
2947 case HEX + ADD_NL:
2948 testval = mask = RI_HEX;
2949 goto do_class;
2950 case NHEX:
2951 case NHEX + ADD_NL:
2952 mask = RI_HEX;
2953 goto do_class;
2954 case OCTAL:
2955 case OCTAL + ADD_NL:
2956 testval = mask = RI_OCTAL;
2957 goto do_class;
2958 case NOCTAL:
2959 case NOCTAL + ADD_NL:
2960 mask = RI_OCTAL;
2961 goto do_class;
2962 case WORD:
2963 case WORD + ADD_NL:
2964 testval = mask = RI_WORD;
2965 goto do_class;
2966 case NWORD:
2967 case NWORD + ADD_NL:
2968 mask = RI_WORD;
2969 goto do_class;
2970 case HEAD:
2971 case HEAD + ADD_NL:
2972 testval = mask = RI_HEAD;
2973 goto do_class;
2974 case NHEAD:
2975 case NHEAD + ADD_NL:
2976 mask = RI_HEAD;
2977 goto do_class;
2978 case ALPHA:
2979 case ALPHA + ADD_NL:
2980 testval = mask = RI_ALPHA;
2981 goto do_class;
2982 case NALPHA:
2983 case NALPHA + ADD_NL:
2984 mask = RI_ALPHA;
2985 goto do_class;
2986 case LOWER:
2987 case LOWER + ADD_NL:
2988 testval = mask = RI_LOWER;
2989 goto do_class;
2990 case NLOWER:
2991 case NLOWER + ADD_NL:
2992 mask = RI_LOWER;
2993 goto do_class;
2994 case UPPER:
2995 case UPPER + ADD_NL:
2996 testval = mask = RI_UPPER;
2997 goto do_class;
2998 case NUPPER:
2999 case NUPPER + ADD_NL:
3000 mask = RI_UPPER;
3001 goto do_class;
3002
3003 case EXACTLY:
3004 {
3005 int cu, cl;
3006
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003007 // This doesn't do a multi-byte character, because a MULTIBYTECODE
3008 // would have been used for it. It does handle single-byte
3009 // characters, such as latin1.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003010 if (rex.reg_ic)
3011 {
3012 cu = MB_TOUPPER(*opnd);
3013 cl = MB_TOLOWER(*opnd);
3014 while (count < maxcount && (*scan == cu || *scan == cl))
3015 {
3016 count++;
3017 scan++;
3018 }
3019 }
3020 else
3021 {
3022 cu = *opnd;
3023 while (count < maxcount && *scan == cu)
3024 {
3025 count++;
3026 scan++;
3027 }
3028 }
3029 break;
3030 }
3031
3032 case MULTIBYTECODE:
3033 {
3034 int i, len, cf = 0;
3035
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003036 // Safety check (just in case 'encoding' was changed since
3037 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003038 if ((len = (*mb_ptr2len)(opnd)) > 1)
3039 {
3040 if (rex.reg_ic && enc_utf8)
3041 cf = utf_fold(utf_ptr2char(opnd));
3042 while (count < maxcount && (*mb_ptr2len)(scan) >= len)
3043 {
3044 for (i = 0; i < len; ++i)
3045 if (opnd[i] != scan[i])
3046 break;
3047 if (i < len && (!rex.reg_ic || !enc_utf8
3048 || utf_fold(utf_ptr2char(scan)) != cf))
3049 break;
3050 scan += len;
3051 ++count;
3052 }
3053 }
3054 }
3055 break;
3056
3057 case ANYOF:
3058 case ANYOF + ADD_NL:
3059 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003060 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003061
3062 case ANYBUT:
3063 case ANYBUT + ADD_NL:
3064 while (count < maxcount)
3065 {
3066 int len;
3067
3068 if (*scan == NUL)
3069 {
3070 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
3071 || rex.reg_line_lbr)
3072 break;
3073 reg_nextline();
3074 scan = rex.input;
3075 if (got_int)
3076 break;
3077 }
3078 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
3079 ++scan;
3080 else if (has_mbyte && (len = (*mb_ptr2len)(scan)) > 1)
3081 {
3082 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
3083 break;
3084 scan += len;
3085 }
3086 else
3087 {
3088 if ((cstrchr(opnd, *scan) == NULL) == testval)
3089 break;
3090 ++scan;
3091 }
3092 ++count;
3093 }
3094 break;
3095
3096 case NEWL:
3097 while (count < maxcount
3098 && ((*scan == NUL && rex.lnum <= rex.reg_maxline
3099 && !rex.reg_line_lbr && REG_MULTI)
3100 || (*scan == '\n' && rex.reg_line_lbr)))
3101 {
3102 count++;
3103 if (rex.reg_line_lbr)
3104 ADVANCE_REGINPUT();
3105 else
3106 reg_nextline();
3107 scan = rex.input;
3108 if (got_int)
3109 break;
3110 }
3111 break;
3112
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003113 default: // Oh dear. Called inappropriately.
RestorerZ68ebcee2023-05-31 17:12:14 +01003114 iemsg(e_corrupted_regexp_program);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003115#ifdef DEBUG
3116 printf("Called regrepeat with op code %d\n", OP(p));
3117#endif
3118 break;
3119 }
3120
3121 rex.input = scan;
3122
3123 return (int)count;
3124}
3125
3126/*
3127 * Push an item onto the regstack.
3128 * Returns pointer to new item. Returns NULL when out of memory.
3129 */
3130 static regitem_T *
3131regstack_push(regstate_T state, char_u *scan)
3132{
3133 regitem_T *rp;
3134
3135 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
3136 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00003137 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003138 return NULL;
3139 }
3140 if (ga_grow(&regstack, sizeof(regitem_T)) == FAIL)
3141 return NULL;
3142
3143 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
3144 rp->rs_state = state;
3145 rp->rs_scan = scan;
3146
3147 regstack.ga_len += sizeof(regitem_T);
3148 return rp;
3149}
3150
3151/*
3152 * Pop an item from the regstack.
3153 */
3154 static void
3155regstack_pop(char_u **scan)
3156{
3157 regitem_T *rp;
3158
3159 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
3160 *scan = rp->rs_scan;
3161
3162 regstack.ga_len -= sizeof(regitem_T);
3163}
3164
#ifdef FEAT_RELTIME
/*
 * Check if the timer expired ("*timeout_flag" is set), return TRUE if so.
 * When it did and "timed_out" is not NULL, "*timed_out" is set to TRUE.
 * With FEAT_EVAL a log message is written, but only when "*timed_out" was
 * not set yet.
 */
    static int
bt_did_time_out(int *timed_out)
{
    if (!*timeout_flag)
        return FALSE;

    if (timed_out == NULL)
        return TRUE;

# ifdef FEAT_EVAL
    if (!*timed_out)
        ch_log(NULL, "BT regexp timed out");
# endif
    *timed_out = TRUE;
    return TRUE;
}
#endif
3187
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003188/*
3189 * Save the current subexpr to "bp", so that they can be restored
3190 * later by restore_subexpr().
3191 */
3192 static void
3193save_subexpr(regbehind_T *bp)
3194{
3195 int i;
3196
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003197 // When "rex.need_clear_subexpr" is set we don't need to save the values,
3198 // only remember that this flag needs to be set again when restoring.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003199 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003200 if (rex.need_clear_subexpr)
3201 return;
3202
3203 for (i = 0; i < NSUBEXP; ++i)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003204 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003205 if (REG_MULTI)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003206 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003207 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
3208 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
3209 }
3210 else
3211 {
3212 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
3213 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003214 }
3215 }
3216}
3217
3218/*
3219 * Restore the subexpr from "bp".
3220 */
3221 static void
3222restore_subexpr(regbehind_T *bp)
3223{
3224 int i;
3225
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003226 // Only need to restore saved values when they are not to be cleared.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003227 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003228 if (rex.need_clear_subexpr)
3229 return;
3230
3231 for (i = 0; i < NSUBEXP; ++i)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003232 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003233 if (REG_MULTI)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003234 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003235 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
3236 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
3237 }
3238 else
3239 {
3240 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
3241 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003242 }
3243 }
3244}
3245
3246/*
3247 * regmatch - main matching routine
3248 *
3249 * Conceptually the strategy is simple: Check to see whether the current node
3250 * matches, push an item onto the regstack and loop to see whether the rest
3251 * matches, and then act accordingly. In practice we make some effort to
3252 * avoid using the regstack, in particular by going through "ordinary" nodes
3253 * (that don't need to know whether the rest of the match failed) by a nested
3254 * loop.
3255 *
3256 * Returns TRUE when there is a match. Leaves rex.input and rex.lnum just after
3257 * the last matched character.
3258 * Returns FALSE when there is no match. Leaves rex.input and rex.lnum in an
3259 * undefined state!
3260 */
3261 static int
3262regmatch(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003263 char_u *scan, // Current node.
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003264 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003265{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003266 char_u *next; // Next node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003267 int op;
3268 int c;
3269 regitem_T *rp;
3270 int no;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003271 int status; // one of the RA_ values:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003272
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003273 // Make "regstack" and "backpos" empty. They are allocated and freed in
3274 // bt_regexec_both() to reduce malloc()/free() calls.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003275 regstack.ga_len = 0;
3276 backpos.ga_len = 0;
3277
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003278 // Repeat until "regstack" is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003279 for (;;)
3280 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003281 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3282 // Allow interrupting them with CTRL-C.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003283 fast_breakcheck();
3284
3285#ifdef DEBUG
3286 if (scan != NULL && regnarrate)
3287 {
3288 mch_errmsg((char *)regprop(scan));
3289 mch_errmsg("(\n");
3290 }
3291#endif
3292
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003293 // Repeat for items that can be matched sequentially, without using the
3294 // regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003295 for (;;)
3296 {
3297 if (got_int || scan == NULL)
3298 {
3299 status = RA_FAIL;
3300 break;
3301 }
3302#ifdef FEAT_RELTIME
Bram Moolenaar616592e2022-06-17 15:17:10 +01003303 if (bt_did_time_out(timed_out))
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003304 {
Paul Ollis65745772022-06-05 16:55:54 +01003305 status = RA_FAIL;
3306 break;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003307 }
3308#endif
3309 status = RA_CONT;
3310
3311#ifdef DEBUG
3312 if (regnarrate)
3313 {
3314 mch_errmsg((char *)regprop(scan));
3315 mch_errmsg("...\n");
3316# ifdef FEAT_SYN_HL
3317 if (re_extmatch_in != NULL)
3318 {
3319 int i;
3320
3321 mch_errmsg(_("External submatches:\n"));
3322 for (i = 0; i < NSUBEXP; i++)
3323 {
3324 mch_errmsg(" \"");
3325 if (re_extmatch_in->matches[i] != NULL)
3326 mch_errmsg((char *)re_extmatch_in->matches[i]);
3327 mch_errmsg("\"\n");
3328 }
3329 }
3330# endif
3331 }
3332#endif
3333 next = regnext(scan);
3334
3335 op = OP(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003336 // Check for character class with NL added.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003337 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
Paul Ollis65745772022-06-05 16:55:54 +01003338 && *rex.input == NUL && rex.lnum <= rex.reg_maxline)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003339 {
3340 reg_nextline();
3341 }
3342 else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n')
3343 {
3344 ADVANCE_REGINPUT();
3345 }
3346 else
3347 {
3348 if (WITH_NL(op))
3349 op -= ADD_NL;
3350 if (has_mbyte)
3351 c = (*mb_ptr2char)(rex.input);
3352 else
3353 c = *rex.input;
3354 switch (op)
3355 {
3356 case BOL:
3357 if (rex.input != rex.line)
3358 status = RA_NOMATCH;
3359 break;
3360
3361 case EOL:
3362 if (c != NUL)
3363 status = RA_NOMATCH;
3364 break;
3365
3366 case RE_BOF:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003367 // We're not at the beginning of the file when below the first
3368 // line where we started, not at the start of the line or we
3369 // didn't start at the first line of the buffer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003370 if (rex.lnum != 0 || rex.input != rex.line
3371 || (REG_MULTI && rex.reg_firstlnum > 1))
3372 status = RA_NOMATCH;
3373 break;
3374
3375 case RE_EOF:
3376 if (rex.lnum != rex.reg_maxline || c != NUL)
3377 status = RA_NOMATCH;
3378 break;
3379
3380 case CURSOR:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003381 // Check if the buffer is in a window and compare the
3382 // rex.reg_win->w_cursor position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003383 if (rex.reg_win == NULL
3384 || (rex.lnum + rex.reg_firstlnum
3385 != rex.reg_win->w_cursor.lnum)
3386 || ((colnr_T)(rex.input - rex.line)
3387 != rex.reg_win->w_cursor.col))
3388 status = RA_NOMATCH;
3389 break;
3390
3391 case RE_MARK:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003392 // Compare the mark position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003393 {
3394 int mark = OPERAND(scan)[0];
3395 int cmp = OPERAND(scan)[1];
3396 pos_T *pos;
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003397 size_t col = REG_MULTI ? rex.input - rex.line : 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003398
3399 pos = getmark_buf(rex.reg_buf, mark, FALSE);
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003400
3401 // Line may have been freed, get it again.
3402 if (REG_MULTI)
3403 {
3404 rex.line = reg_getline(rex.lnum);
3405 rex.input = rex.line + col;
3406 }
3407
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003408 if (pos == NULL // mark doesn't exist
Bram Moolenaar872bee52021-05-24 22:56:15 +02003409 || pos->lnum <= 0) // mark isn't set in reg_buf
3410 {
3411 status = RA_NOMATCH;
3412 }
3413 else
3414 {
3415 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3416 && pos->col == MAXCOL
John Marriott82792db2024-05-12 00:07:17 +02003417 ? reg_getline_len(pos->lnum - rex.reg_firstlnum)
Bram Moolenaar872bee52021-05-24 22:56:15 +02003418 : pos->col;
3419
3420 if ((pos->lnum == rex.lnum + rex.reg_firstlnum
3421 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003422 ? (cmp == '<' || cmp == '>')
Bram Moolenaar872bee52021-05-24 22:56:15 +02003423 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003424 ? cmp != '>'
3425 : cmp != '<'))
3426 : (pos->lnum < rex.lnum + rex.reg_firstlnum
3427 ? cmp != '>'
3428 : cmp != '<')))
3429 status = RA_NOMATCH;
Bram Moolenaar872bee52021-05-24 22:56:15 +02003430 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003431 }
3432 break;
3433
3434 case RE_VISUAL:
3435 if (!reg_match_visual())
3436 status = RA_NOMATCH;
3437 break;
3438
3439 case RE_LNUM:
3440 if (!REG_MULTI || !re_num_cmp((long_u)(rex.lnum + rex.reg_firstlnum),
3441 scan))
3442 status = RA_NOMATCH;
3443 break;
3444
3445 case RE_COL:
3446 if (!re_num_cmp((long_u)(rex.input - rex.line) + 1, scan))
3447 status = RA_NOMATCH;
3448 break;
3449
3450 case RE_VCOL:
Bram Moolenaar13ed4942022-08-19 13:59:25 +01003451 {
3452 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaar753aead2022-09-08 12:17:06 +01003453 linenr_T lnum = REG_MULTI ? rex.reg_firstlnum + rex.lnum : 1;
3454 long_u vcol;
Bram Moolenaar13ed4942022-08-19 13:59:25 +01003455
Bram Moolenaar753aead2022-09-08 12:17:06 +01003456 if (REG_MULTI && (lnum <= 0
3457 || lnum > wp->w_buffer->b_ml.ml_line_count))
3458 lnum = 1;
3459 vcol = (long_u)win_linetabsize(wp, lnum, rex.line,
Bram Moolenaar13ed4942022-08-19 13:59:25 +01003460 (colnr_T)(rex.input - rex.line));
3461 if (!re_num_cmp(vcol + 1, scan))
3462 status = RA_NOMATCH;
3463 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003464 break;
3465
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003466 case BOW: // \<word; rex.input points to w
3467 if (c == NUL) // Can't match at end of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003468 status = RA_NOMATCH;
3469 else if (has_mbyte)
3470 {
3471 int this_class;
3472
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003473 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003474 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3475 if (this_class <= 1)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003476 status = RA_NOMATCH; // not on a word at all
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003477 else if (reg_prev_class() == this_class)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003478 status = RA_NOMATCH; // previous char is in same word
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003479 }
3480 else
3481 {
3482 if (!vim_iswordc_buf(c, rex.reg_buf) || (rex.input > rex.line
3483 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
3484 status = RA_NOMATCH;
3485 }
3486 break;
3487
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003488 case EOW: // word\>; rex.input points after d
3489 if (rex.input == rex.line) // Can't match at start of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003490 status = RA_NOMATCH;
3491 else if (has_mbyte)
3492 {
3493 int this_class, prev_class;
3494
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003495 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003496 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3497 prev_class = reg_prev_class();
3498 if (this_class == prev_class
3499 || prev_class == 0 || prev_class == 1)
3500 status = RA_NOMATCH;
3501 }
3502 else
3503 {
3504 if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
3505 || (rex.input[0] != NUL
3506 && vim_iswordc_buf(c, rex.reg_buf)))
3507 status = RA_NOMATCH;
3508 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003509 break; // Matched with EOW
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003510
3511 case ANY:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003512 // ANY does not match new lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003513 if (c == NUL)
3514 status = RA_NOMATCH;
3515 else
3516 ADVANCE_REGINPUT();
3517 break;
3518
3519 case IDENT:
3520 if (!vim_isIDc(c))
3521 status = RA_NOMATCH;
3522 else
3523 ADVANCE_REGINPUT();
3524 break;
3525
3526 case SIDENT:
3527 if (VIM_ISDIGIT(*rex.input) || !vim_isIDc(c))
3528 status = RA_NOMATCH;
3529 else
3530 ADVANCE_REGINPUT();
3531 break;
3532
3533 case KWORD:
3534 if (!vim_iswordp_buf(rex.input, rex.reg_buf))
3535 status = RA_NOMATCH;
3536 else
3537 ADVANCE_REGINPUT();
3538 break;
3539
3540 case SKWORD:
3541 if (VIM_ISDIGIT(*rex.input)
3542 || !vim_iswordp_buf(rex.input, rex.reg_buf))
3543 status = RA_NOMATCH;
3544 else
3545 ADVANCE_REGINPUT();
3546 break;
3547
3548 case FNAME:
3549 if (!vim_isfilec(c))
3550 status = RA_NOMATCH;
3551 else
3552 ADVANCE_REGINPUT();
3553 break;
3554
3555 case SFNAME:
3556 if (VIM_ISDIGIT(*rex.input) || !vim_isfilec(c))
3557 status = RA_NOMATCH;
3558 else
3559 ADVANCE_REGINPUT();
3560 break;
3561
3562 case PRINT:
3563 if (!vim_isprintc(PTR2CHAR(rex.input)))
3564 status = RA_NOMATCH;
3565 else
3566 ADVANCE_REGINPUT();
3567 break;
3568
3569 case SPRINT:
3570 if (VIM_ISDIGIT(*rex.input) || !vim_isprintc(PTR2CHAR(rex.input)))
3571 status = RA_NOMATCH;
3572 else
3573 ADVANCE_REGINPUT();
3574 break;
3575
3576 case WHITE:
3577 if (!VIM_ISWHITE(c))
3578 status = RA_NOMATCH;
3579 else
3580 ADVANCE_REGINPUT();
3581 break;
3582
3583 case NWHITE:
3584 if (c == NUL || VIM_ISWHITE(c))
3585 status = RA_NOMATCH;
3586 else
3587 ADVANCE_REGINPUT();
3588 break;
3589
3590 case DIGIT:
3591 if (!ri_digit(c))
3592 status = RA_NOMATCH;
3593 else
3594 ADVANCE_REGINPUT();
3595 break;
3596
3597 case NDIGIT:
3598 if (c == NUL || ri_digit(c))
3599 status = RA_NOMATCH;
3600 else
3601 ADVANCE_REGINPUT();
3602 break;
3603
3604 case HEX:
3605 if (!ri_hex(c))
3606 status = RA_NOMATCH;
3607 else
3608 ADVANCE_REGINPUT();
3609 break;
3610
3611 case NHEX:
3612 if (c == NUL || ri_hex(c))
3613 status = RA_NOMATCH;
3614 else
3615 ADVANCE_REGINPUT();
3616 break;
3617
3618 case OCTAL:
3619 if (!ri_octal(c))
3620 status = RA_NOMATCH;
3621 else
3622 ADVANCE_REGINPUT();
3623 break;
3624
3625 case NOCTAL:
3626 if (c == NUL || ri_octal(c))
3627 status = RA_NOMATCH;
3628 else
3629 ADVANCE_REGINPUT();
3630 break;
3631
3632 case WORD:
3633 if (!ri_word(c))
3634 status = RA_NOMATCH;
3635 else
3636 ADVANCE_REGINPUT();
3637 break;
3638
3639 case NWORD:
3640 if (c == NUL || ri_word(c))
3641 status = RA_NOMATCH;
3642 else
3643 ADVANCE_REGINPUT();
3644 break;
3645
3646 case HEAD:
3647 if (!ri_head(c))
3648 status = RA_NOMATCH;
3649 else
3650 ADVANCE_REGINPUT();
3651 break;
3652
3653 case NHEAD:
3654 if (c == NUL || ri_head(c))
3655 status = RA_NOMATCH;
3656 else
3657 ADVANCE_REGINPUT();
3658 break;
3659
3660 case ALPHA:
3661 if (!ri_alpha(c))
3662 status = RA_NOMATCH;
3663 else
3664 ADVANCE_REGINPUT();
3665 break;
3666
3667 case NALPHA:
3668 if (c == NUL || ri_alpha(c))
3669 status = RA_NOMATCH;
3670 else
3671 ADVANCE_REGINPUT();
3672 break;
3673
3674 case LOWER:
3675 if (!ri_lower(c))
3676 status = RA_NOMATCH;
3677 else
3678 ADVANCE_REGINPUT();
3679 break;
3680
3681 case NLOWER:
3682 if (c == NUL || ri_lower(c))
3683 status = RA_NOMATCH;
3684 else
3685 ADVANCE_REGINPUT();
3686 break;
3687
3688 case UPPER:
3689 if (!ri_upper(c))
3690 status = RA_NOMATCH;
3691 else
3692 ADVANCE_REGINPUT();
3693 break;
3694
3695 case NUPPER:
3696 if (c == NUL || ri_upper(c))
3697 status = RA_NOMATCH;
3698 else
3699 ADVANCE_REGINPUT();
3700 break;
3701
3702 case EXACTLY:
3703 {
3704 int len;
3705 char_u *opnd;
3706
3707 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003708 // Inline the first byte, for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003709 if (*opnd != *rex.input
3710 && (!rex.reg_ic
3711 || (!enc_utf8
3712 && MB_TOLOWER(*opnd) != MB_TOLOWER(*rex.input))))
3713 status = RA_NOMATCH;
3714 else if (*opnd == NUL)
3715 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003716 // match empty string always works; happens when "~" is
3717 // empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003718 }
3719 else
3720 {
3721 if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic))
3722 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003723 len = 1; // matched a single byte above
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003724 }
3725 else
3726 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003727 // Need to match first byte again for multi-byte.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003728 len = (int)STRLEN(opnd);
3729 if (cstrncmp(opnd, rex.input, &len) != 0)
3730 status = RA_NOMATCH;
3731 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003732 // Check for following composing character, unless %C
3733 // follows (skips over all composing chars).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003734 if (status != RA_NOMATCH
3735 && enc_utf8
3736 && UTF_COMPOSINGLIKE(rex.input, rex.input + len)
3737 && !rex.reg_icombine
3738 && OP(next) != RE_COMPOSING)
3739 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003740 // raaron: This code makes a composing character get
3741 // ignored, which is the correct behavior (sometimes)
3742 // for voweled Hebrew texts.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003743 status = RA_NOMATCH;
3744 }
3745 if (status != RA_NOMATCH)
3746 rex.input += len;
3747 }
3748 }
3749 break;
3750
3751 case ANYOF:
3752 case ANYBUT:
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01003753 {
3754 char_u *q = OPERAND(scan);
3755
3756 if (c == NUL)
3757 status = RA_NOMATCH;
3758 else if ((cstrchr(q, c) == NULL) == (op == ANYOF))
3759 status = RA_NOMATCH;
3760 else
3761 {
3762 // Check following combining characters
3763 int len = 0;
3764 int i;
3765
3766 if (enc_utf8)
3767 len = utfc_ptr2len(q) - utf_ptr2len(q);
3768
3769 MB_CPTR_ADV(rex.input);
3770 MB_CPTR_ADV(q);
3771
3772 if (!enc_utf8 || len == 0)
3773 break;
3774
3775 for (i = 0; i < len; ++i)
3776 if (q[i] != rex.input[i])
3777 {
3778 status = RA_NOMATCH;
3779 break;
3780 }
3781 rex.input += len;
3782 }
3783 break;
3784 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003785
3786 case MULTIBYTECODE:
3787 if (has_mbyte)
3788 {
3789 int i, len;
3790 char_u *opnd;
3791 int opndc = 0, inpc;
3792
3793 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003794 // Safety check (just in case 'encoding' was changed since
3795 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003796 if ((len = (*mb_ptr2len)(opnd)) < 2)
3797 {
3798 status = RA_NOMATCH;
3799 break;
3800 }
3801 if (enc_utf8)
3802 opndc = utf_ptr2char(opnd);
3803 if (enc_utf8 && utf_iscomposing(opndc))
3804 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003805 // When only a composing char is given match at any
3806 // position where that composing char appears.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003807 status = RA_NOMATCH;
3808 for (i = 0; rex.input[i] != NUL;
3809 i += utf_ptr2len(rex.input + i))
3810 {
3811 inpc = utf_ptr2char(rex.input + i);
3812 if (!utf_iscomposing(inpc))
3813 {
3814 if (i > 0)
3815 break;
3816 }
3817 else if (opndc == inpc)
3818 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003819 // Include all following composing chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003820 len = i + utfc_ptr2len(rex.input + i);
3821 status = RA_MATCH;
3822 break;
3823 }
3824 }
3825 }
Christian Brabandt22e8e122024-07-30 20:39:18 +02003826 else if (enc_utf8)
3827 {
3828 if (cstrncmp(opnd, rex.input, &len) != 0)
3829 {
3830 status = RA_NOMATCH;
3831 break;
3832 }
3833 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003834 else
3835 for (i = 0; i < len; ++i)
3836 if (opnd[i] != rex.input[i])
3837 {
3838 status = RA_NOMATCH;
3839 break;
3840 }
3841 rex.input += len;
3842 }
3843 else
3844 status = RA_NOMATCH;
3845 break;
3846 case RE_COMPOSING:
3847 if (enc_utf8)
3848 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003849 // Skip composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003850 while (utf_iscomposing(utf_ptr2char(rex.input)))
3851 MB_CPTR_ADV(rex.input);
3852 }
3853 break;
3854
3855 case NOTHING:
3856 break;
3857
3858 case BACK:
3859 {
3860 int i;
3861 backpos_T *bp;
3862
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003863 // When we run into BACK we need to check if we don't keep
3864 // looping without matching any input. The second and later
3865 // times a BACK is encountered it fails if the input is still
3866 // at the same position as the previous time.
3867 // The positions are stored in "backpos" and found by the
3868 // current value of "scan", the position in the RE program.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003869 bp = (backpos_T *)backpos.ga_data;
3870 for (i = 0; i < backpos.ga_len; ++i)
3871 if (bp[i].bp_scan == scan)
3872 break;
3873 if (i == backpos.ga_len)
3874 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003875 // First time at this BACK, make room to store the pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003876 if (ga_grow(&backpos, 1) == FAIL)
3877 status = RA_FAIL;
3878 else
3879 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003880 // get "ga_data" again, it may have changed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003881 bp = (backpos_T *)backpos.ga_data;
3882 bp[i].bp_scan = scan;
3883 ++backpos.ga_len;
3884 }
3885 }
3886 else if (reg_save_equal(&bp[i].bp_pos))
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003887 // Still at same position as last time, fail.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003888 status = RA_NOMATCH;
3889
3890 if (status != RA_FAIL && status != RA_NOMATCH)
3891 reg_save(&bp[i].bp_pos, &backpos);
3892 }
3893 break;
3894
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003895 case MOPEN + 0: // Match start: \zs
3896 case MOPEN + 1: // \(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003897 case MOPEN + 2:
3898 case MOPEN + 3:
3899 case MOPEN + 4:
3900 case MOPEN + 5:
3901 case MOPEN + 6:
3902 case MOPEN + 7:
3903 case MOPEN + 8:
3904 case MOPEN + 9:
3905 {
3906 no = op - MOPEN;
3907 cleanup_subexpr();
3908 rp = regstack_push(RS_MOPEN, scan);
3909 if (rp == NULL)
3910 status = RA_FAIL;
3911 else
3912 {
3913 rp->rs_no = no;
3914 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
3915 &rex.reg_startp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003916 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003917 }
3918 }
3919 break;
3920
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003921 case NOPEN: // \%(
3922 case NCLOSE: // \) after \%(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003923 if (regstack_push(RS_NOPEN, scan) == NULL)
3924 status = RA_FAIL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003925 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003926 break;
3927
3928#ifdef FEAT_SYN_HL
3929 case ZOPEN + 1:
3930 case ZOPEN + 2:
3931 case ZOPEN + 3:
3932 case ZOPEN + 4:
3933 case ZOPEN + 5:
3934 case ZOPEN + 6:
3935 case ZOPEN + 7:
3936 case ZOPEN + 8:
3937 case ZOPEN + 9:
3938 {
3939 no = op - ZOPEN;
3940 cleanup_zsubexpr();
3941 rp = regstack_push(RS_ZOPEN, scan);
3942 if (rp == NULL)
3943 status = RA_FAIL;
3944 else
3945 {
3946 rp->rs_no = no;
3947 save_se(&rp->rs_un.sesave, &reg_startzpos[no],
3948 &reg_startzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003949 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003950 }
3951 }
3952 break;
3953#endif
3954
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003955 case MCLOSE + 0: // Match end: \ze
3956 case MCLOSE + 1: // \)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003957 case MCLOSE + 2:
3958 case MCLOSE + 3:
3959 case MCLOSE + 4:
3960 case MCLOSE + 5:
3961 case MCLOSE + 6:
3962 case MCLOSE + 7:
3963 case MCLOSE + 8:
3964 case MCLOSE + 9:
3965 {
3966 no = op - MCLOSE;
3967 cleanup_subexpr();
3968 rp = regstack_push(RS_MCLOSE, scan);
3969 if (rp == NULL)
3970 status = RA_FAIL;
3971 else
3972 {
3973 rp->rs_no = no;
3974 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no],
3975 &rex.reg_endp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003976 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003977 }
3978 }
3979 break;
3980
3981#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003982 case ZCLOSE + 1: // \) after \z(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003983 case ZCLOSE + 2:
3984 case ZCLOSE + 3:
3985 case ZCLOSE + 4:
3986 case ZCLOSE + 5:
3987 case ZCLOSE + 6:
3988 case ZCLOSE + 7:
3989 case ZCLOSE + 8:
3990 case ZCLOSE + 9:
3991 {
3992 no = op - ZCLOSE;
3993 cleanup_zsubexpr();
3994 rp = regstack_push(RS_ZCLOSE, scan);
3995 if (rp == NULL)
3996 status = RA_FAIL;
3997 else
3998 {
3999 rp->rs_no = no;
4000 save_se(&rp->rs_un.sesave, &reg_endzpos[no],
4001 &reg_endzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004002 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004003 }
4004 }
4005 break;
4006#endif
4007
4008 case BACKREF + 1:
4009 case BACKREF + 2:
4010 case BACKREF + 3:
4011 case BACKREF + 4:
4012 case BACKREF + 5:
4013 case BACKREF + 6:
4014 case BACKREF + 7:
4015 case BACKREF + 8:
4016 case BACKREF + 9:
4017 {
4018 int len;
4019
4020 no = op - BACKREF;
4021 cleanup_subexpr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004022 if (!REG_MULTI) // Single-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004023 {
4024 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL)
4025 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004026 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004027 len = 0;
4028 }
4029 else
4030 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004031 // Compare current input with back-ref in the same
4032 // line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004033 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
4034 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0)
4035 status = RA_NOMATCH;
4036 }
4037 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004038 else // Multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004039 {
4040 if (rex.reg_startpos[no].lnum < 0
4041 || rex.reg_endpos[no].lnum < 0)
4042 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004043 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004044 len = 0;
4045 }
4046 else
4047 {
4048 if (rex.reg_startpos[no].lnum == rex.lnum
4049 && rex.reg_endpos[no].lnum == rex.lnum)
4050 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004051 // Compare back-ref within the current line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004052 len = rex.reg_endpos[no].col
4053 - rex.reg_startpos[no].col;
4054 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
4055 rex.input, &len) != 0)
4056 status = RA_NOMATCH;
4057 }
4058 else
4059 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004060 // Messy situation: Need to compare between two
4061 // lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004062 int r = match_with_backref(
4063 rex.reg_startpos[no].lnum,
4064 rex.reg_startpos[no].col,
4065 rex.reg_endpos[no].lnum,
4066 rex.reg_endpos[no].col,
4067 &len);
4068
4069 if (r != RA_MATCH)
4070 status = r;
4071 }
4072 }
4073 }
4074
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004075 // Matched the backref, skip over it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004076 rex.input += len;
4077 }
4078 break;
4079
4080#ifdef FEAT_SYN_HL
4081 case ZREF + 1:
4082 case ZREF + 2:
4083 case ZREF + 3:
4084 case ZREF + 4:
4085 case ZREF + 5:
4086 case ZREF + 6:
4087 case ZREF + 7:
4088 case ZREF + 8:
4089 case ZREF + 9:
4090 {
4091 int len;
4092
4093 cleanup_zsubexpr();
4094 no = op - ZREF;
4095 if (re_extmatch_in != NULL
4096 && re_extmatch_in->matches[no] != NULL)
4097 {
4098 len = (int)STRLEN(re_extmatch_in->matches[no]);
4099 if (cstrncmp(re_extmatch_in->matches[no],
4100 rex.input, &len) != 0)
4101 status = RA_NOMATCH;
4102 else
4103 rex.input += len;
4104 }
4105 else
4106 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004107 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004108 }
4109 }
4110 break;
4111#endif
4112
4113 case BRANCH:
4114 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004115 if (OP(next) != BRANCH) // No choice.
4116 next = OPERAND(scan); // Avoid recursion.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004117 else
4118 {
4119 rp = regstack_push(RS_BRANCH, scan);
4120 if (rp == NULL)
4121 status = RA_FAIL;
4122 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004123 status = RA_BREAK; // rest is below
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004124 }
4125 }
4126 break;
4127
4128 case BRACE_LIMITS:
4129 {
4130 if (OP(next) == BRACE_SIMPLE)
4131 {
4132 bl_minval = OPERAND_MIN(scan);
4133 bl_maxval = OPERAND_MAX(scan);
4134 }
4135 else if (OP(next) >= BRACE_COMPLEX
4136 && OP(next) < BRACE_COMPLEX + 10)
4137 {
4138 no = OP(next) - BRACE_COMPLEX;
4139 brace_min[no] = OPERAND_MIN(scan);
4140 brace_max[no] = OPERAND_MAX(scan);
4141 brace_count[no] = 0;
4142 }
4143 else
4144 {
4145 internal_error("BRACE_LIMITS");
4146 status = RA_FAIL;
4147 }
4148 }
4149 break;
4150
4151 case BRACE_COMPLEX + 0:
4152 case BRACE_COMPLEX + 1:
4153 case BRACE_COMPLEX + 2:
4154 case BRACE_COMPLEX + 3:
4155 case BRACE_COMPLEX + 4:
4156 case BRACE_COMPLEX + 5:
4157 case BRACE_COMPLEX + 6:
4158 case BRACE_COMPLEX + 7:
4159 case BRACE_COMPLEX + 8:
4160 case BRACE_COMPLEX + 9:
4161 {
4162 no = op - BRACE_COMPLEX;
4163 ++brace_count[no];
4164
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004165 // If not matched enough times yet, try one more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004166 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4167 ? brace_min[no] : brace_max[no]))
4168 {
4169 rp = regstack_push(RS_BRCPLX_MORE, scan);
4170 if (rp == NULL)
4171 status = RA_FAIL;
4172 else
4173 {
4174 rp->rs_no = no;
4175 reg_save(&rp->rs_un.regsave, &backpos);
4176 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004177 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004178 }
4179 break;
4180 }
4181
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004182 // If matched enough times, may try matching some more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004183 if (brace_min[no] <= brace_max[no])
4184 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004185 // Range is the normal way around, use longest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004186 if (brace_count[no] <= brace_max[no])
4187 {
4188 rp = regstack_push(RS_BRCPLX_LONG, scan);
4189 if (rp == NULL)
4190 status = RA_FAIL;
4191 else
4192 {
4193 rp->rs_no = no;
4194 reg_save(&rp->rs_un.regsave, &backpos);
4195 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004196 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004197 }
4198 }
4199 }
4200 else
4201 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004202 // Range is backwards, use shortest match first
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004203 if (brace_count[no] <= brace_min[no])
4204 {
4205 rp = regstack_push(RS_BRCPLX_SHORT, scan);
4206 if (rp == NULL)
4207 status = RA_FAIL;
4208 else
4209 {
4210 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004211 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004212 }
4213 }
4214 }
4215 }
4216 break;
4217
4218 case BRACE_SIMPLE:
4219 case STAR:
4220 case PLUS:
4221 {
4222 regstar_T rst;
4223
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004224 // Lookahead to avoid useless match attempts when we know
4225 // what character comes next.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004226 if (OP(next) == EXACTLY)
4227 {
4228 rst.nextb = *OPERAND(next);
4229 if (rex.reg_ic)
4230 {
4231 if (MB_ISUPPER(rst.nextb))
4232 rst.nextb_ic = MB_TOLOWER(rst.nextb);
4233 else
4234 rst.nextb_ic = MB_TOUPPER(rst.nextb);
4235 }
4236 else
4237 rst.nextb_ic = rst.nextb;
4238 }
4239 else
4240 {
4241 rst.nextb = NUL;
4242 rst.nextb_ic = NUL;
4243 }
4244 if (op != BRACE_SIMPLE)
4245 {
4246 rst.minval = (op == STAR) ? 0 : 1;
4247 rst.maxval = MAX_LIMIT;
4248 }
4249 else
4250 {
4251 rst.minval = bl_minval;
4252 rst.maxval = bl_maxval;
4253 }
4254
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004255 // When maxval > minval, try matching as much as possible, up
4256 // to maxval. When maxval < minval, try matching at least the
4257 // minimal number (since the range is backwards, that's also
4258 // maxval!).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004259 rst.count = regrepeat(OPERAND(scan), rst.maxval);
4260 if (got_int)
4261 {
4262 status = RA_FAIL;
4263 break;
4264 }
4265 if (rst.minval <= rst.maxval
4266 ? rst.count >= rst.minval : rst.count >= rst.maxval)
4267 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004268 // It could match. Prepare for trying to match what
4269 // follows. The code is below. Parameters are stored in
4270 // a regstar_T on the regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004271 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4272 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004273 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004274 status = RA_FAIL;
4275 }
4276 else if (ga_grow(&regstack, sizeof(regstar_T)) == FAIL)
4277 status = RA_FAIL;
4278 else
4279 {
4280 regstack.ga_len += sizeof(regstar_T);
4281 rp = regstack_push(rst.minval <= rst.maxval
4282 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4283 if (rp == NULL)
4284 status = RA_FAIL;
4285 else
4286 {
4287 *(((regstar_T *)rp) - 1) = rst;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004288 status = RA_BREAK; // skip the restore bits
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004289 }
4290 }
4291 }
4292 else
4293 status = RA_NOMATCH;
4294
4295 }
4296 break;
4297
4298 case NOMATCH:
4299 case MATCH:
4300 case SUBPAT:
4301 rp = regstack_push(RS_NOMATCH, scan);
4302 if (rp == NULL)
4303 status = RA_FAIL;
4304 else
4305 {
4306 rp->rs_no = op;
4307 reg_save(&rp->rs_un.regsave, &backpos);
4308 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004309 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004310 }
4311 break;
4312
4313 case BEHIND:
4314 case NOBEHIND:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004315 // Need a bit of room to store extra positions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004316 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4317 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004318 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004319 status = RA_FAIL;
4320 }
4321 else if (ga_grow(&regstack, sizeof(regbehind_T)) == FAIL)
4322 status = RA_FAIL;
4323 else
4324 {
4325 regstack.ga_len += sizeof(regbehind_T);
4326 rp = regstack_push(RS_BEHIND1, scan);
4327 if (rp == NULL)
4328 status = RA_FAIL;
4329 else
4330 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004331 // Need to save the subexpr to be able to restore them
4332 // when there is a match but we don't use it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004333 save_subexpr(((regbehind_T *)rp) - 1);
4334
4335 rp->rs_no = op;
4336 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004337 // First try if what follows matches. If it does then we
4338 // check the behind match by looping.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004339 }
4340 }
4341 break;
4342
4343 case BHPOS:
4344 if (REG_MULTI)
4345 {
4346 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4347 || behind_pos.rs_u.pos.lnum != rex.lnum)
4348 status = RA_NOMATCH;
4349 }
4350 else if (behind_pos.rs_u.ptr != rex.input)
4351 status = RA_NOMATCH;
4352 break;
4353
4354 case NEWL:
4355 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4356 || rex.reg_line_lbr)
4357 && (c != '\n' || !rex.reg_line_lbr))
4358 status = RA_NOMATCH;
4359 else if (rex.reg_line_lbr)
4360 ADVANCE_REGINPUT();
4361 else
4362 reg_nextline();
4363 break;
4364
4365 case END:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004366 status = RA_MATCH; // Success!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004367 break;
4368
4369 default:
RestorerZ68ebcee2023-05-31 17:12:14 +01004370 iemsg(e_corrupted_regexp_program);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004371#ifdef DEBUG
4372 printf("Illegal op code %d\n", op);
4373#endif
4374 status = RA_FAIL;
4375 break;
4376 }
4377 }
4378
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004379 // If we can't continue sequentially, break the inner loop.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004380 if (status != RA_CONT)
4381 break;
4382
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004383 // Continue in inner loop, advance to next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004384 scan = next;
4385
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004386 } // end of inner loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004387
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004388 // If there is something on the regstack execute the code for the state.
4389 // If the state is popped then loop and use the older state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004390 while (regstack.ga_len > 0 && status != RA_FAIL)
4391 {
4392 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4393 switch (rp->rs_state)
4394 {
4395 case RS_NOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004396 // Result is passed on as-is, simply pop the state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004397 regstack_pop(&scan);
4398 break;
4399
4400 case RS_MOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004401 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004402 if (status == RA_NOMATCH)
4403 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4404 &rex.reg_startp[rp->rs_no]);
4405 regstack_pop(&scan);
4406 break;
4407
4408#ifdef FEAT_SYN_HL
4409 case RS_ZOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004410 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004411 if (status == RA_NOMATCH)
4412 restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4413 &reg_startzp[rp->rs_no]);
4414 regstack_pop(&scan);
4415 break;
4416#endif
4417
4418 case RS_MCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004419 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004420 if (status == RA_NOMATCH)
4421 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4422 &rex.reg_endp[rp->rs_no]);
4423 regstack_pop(&scan);
4424 break;
4425
4426#ifdef FEAT_SYN_HL
4427 case RS_ZCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004428 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004429 if (status == RA_NOMATCH)
4430 restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4431 &reg_endzp[rp->rs_no]);
4432 regstack_pop(&scan);
4433 break;
4434#endif
4435
4436 case RS_BRANCH:
4437 if (status == RA_MATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004438 // this branch matched, use it
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004439 regstack_pop(&scan);
4440 else
4441 {
4442 if (status != RA_BREAK)
4443 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004444 // After a non-matching branch: try next one.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004445 reg_restore(&rp->rs_un.regsave, &backpos);
4446 scan = rp->rs_scan;
4447 }
4448 if (scan == NULL || OP(scan) != BRANCH)
4449 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004450 // no more branches, didn't find a match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004451 status = RA_NOMATCH;
4452 regstack_pop(&scan);
4453 }
4454 else
4455 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004456 // Prepare to try a branch.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004457 rp->rs_scan = regnext(scan);
4458 reg_save(&rp->rs_un.regsave, &backpos);
4459 scan = OPERAND(scan);
4460 }
4461 }
4462 break;
4463
4464 case RS_BRCPLX_MORE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004465 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004466 if (status == RA_NOMATCH)
4467 {
4468 reg_restore(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004469 --brace_count[rp->rs_no]; // decrement match count
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004470 }
4471 regstack_pop(&scan);
4472 break;
4473
4474 case RS_BRCPLX_LONG:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004475 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004476 if (status == RA_NOMATCH)
4477 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004478 // There was no match, but we did find enough matches.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004479 reg_restore(&rp->rs_un.regsave, &backpos);
4480 --brace_count[rp->rs_no];
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004481 // continue with the items after "\{}"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004482 status = RA_CONT;
4483 }
4484 regstack_pop(&scan);
4485 if (status == RA_CONT)
4486 scan = regnext(scan);
4487 break;
4488
4489 case RS_BRCPLX_SHORT:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004490 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004491 if (status == RA_NOMATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004492 // There was no match, try to match one more item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004493 reg_restore(&rp->rs_un.regsave, &backpos);
4494 regstack_pop(&scan);
4495 if (status == RA_NOMATCH)
4496 {
4497 scan = OPERAND(scan);
4498 status = RA_CONT;
4499 }
4500 break;
4501
4502 case RS_NOMATCH:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004503 // Pop the state. If the operand matches for NOMATCH or
4504 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4505 // except for SUBPAT, and continue with the next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004506 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4507 status = RA_NOMATCH;
4508 else
4509 {
4510 status = RA_CONT;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004511 if (rp->rs_no != SUBPAT) // zero-width
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004512 reg_restore(&rp->rs_un.regsave, &backpos);
4513 }
4514 regstack_pop(&scan);
4515 if (status == RA_CONT)
4516 scan = regnext(scan);
4517 break;
4518
4519 case RS_BEHIND1:
4520 if (status == RA_NOMATCH)
4521 {
4522 regstack_pop(&scan);
4523 regstack.ga_len -= sizeof(regbehind_T);
4524 }
4525 else
4526 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004527 // The stuff after BEHIND/NOBEHIND matches. Now try if
4528 // the behind part does (not) match before the current
4529 // position in the input. This must be done at every
4530 // position in the input and checking if the match ends at
4531 // the current position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004532
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004533 // save the position after the found match for next
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004534 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4535
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004536 // Start looking for a match with operand at the current
4537 // position. Go back one character until we find the
4538 // result, hitting the start of the line or the previous
4539 // line (for multi-line matching).
4540 // Set behind_pos to where the match should end, BHPOS
4541 // will match it. Save the current value.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004542 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4543 behind_pos = rp->rs_un.regsave;
4544
4545 rp->rs_state = RS_BEHIND2;
4546
4547 reg_restore(&rp->rs_un.regsave, &backpos);
4548 scan = OPERAND(rp->rs_scan) + 4;
4549 }
4550 break;
4551
4552 case RS_BEHIND2:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004553 // Looping for BEHIND / NOBEHIND match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004554 if (status == RA_MATCH && reg_save_equal(&behind_pos))
4555 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004556 // found a match that ends where "next" started
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004557 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4558 if (rp->rs_no == BEHIND)
4559 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4560 &backpos);
4561 else
4562 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004563 // But we didn't want a match. Need to restore the
4564 // subexpr, because what follows matched, so they have
4565 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004566 status = RA_NOMATCH;
4567 restore_subexpr(((regbehind_T *)rp) - 1);
4568 }
4569 regstack_pop(&scan);
4570 regstack.ga_len -= sizeof(regbehind_T);
4571 }
4572 else
4573 {
4574 long limit;
4575
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004576 // No match or a match that doesn't end where we want it: Go
4577 // back one character. May go to previous line once.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004578 no = OK;
4579 limit = OPERAND_MIN(rp->rs_scan);
4580 if (REG_MULTI)
4581 {
4582 if (limit > 0
4583 && ((rp->rs_un.regsave.rs_u.pos.lnum
4584 < behind_pos.rs_u.pos.lnum
4585 ? (colnr_T)STRLEN(rex.line)
4586 : behind_pos.rs_u.pos.col)
4587 - rp->rs_un.regsave.rs_u.pos.col >= limit))
4588 no = FAIL;
4589 else if (rp->rs_un.regsave.rs_u.pos.col == 0)
4590 {
4591 if (rp->rs_un.regsave.rs_u.pos.lnum
4592 < behind_pos.rs_u.pos.lnum
4593 || reg_getline(
4594 --rp->rs_un.regsave.rs_u.pos.lnum)
4595 == NULL)
4596 no = FAIL;
4597 else
4598 {
4599 reg_restore(&rp->rs_un.regsave, &backpos);
4600 rp->rs_un.regsave.rs_u.pos.col =
4601 (colnr_T)STRLEN(rex.line);
4602 }
4603 }
4604 else
4605 {
4606 if (has_mbyte)
4607 {
4608 char_u *line =
4609 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
4610
4611 rp->rs_un.regsave.rs_u.pos.col -=
4612 (*mb_head_off)(line, line
4613 + rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
4614 }
4615 else
4616 --rp->rs_un.regsave.rs_u.pos.col;
4617 }
4618 }
4619 else
4620 {
4621 if (rp->rs_un.regsave.rs_u.ptr == rex.line)
4622 no = FAIL;
4623 else
4624 {
4625 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
4626 if (limit > 0 && (long)(behind_pos.rs_u.ptr
4627 - rp->rs_un.regsave.rs_u.ptr) > limit)
4628 no = FAIL;
4629 }
4630 }
4631 if (no == OK)
4632 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004633 // Advanced, prepare for finding match again.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004634 reg_restore(&rp->rs_un.regsave, &backpos);
4635 scan = OPERAND(rp->rs_scan) + 4;
4636 if (status == RA_MATCH)
4637 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004638 // We did match, so subexpr may have been changed,
4639 // need to restore them for the next try.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004640 status = RA_NOMATCH;
4641 restore_subexpr(((regbehind_T *)rp) - 1);
4642 }
4643 }
4644 else
4645 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004646 // Can't advance. For NOBEHIND that's a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004647 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4648 if (rp->rs_no == NOBEHIND)
4649 {
4650 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4651 &backpos);
4652 status = RA_MATCH;
4653 }
4654 else
4655 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004656 // We do want a proper match. Need to restore the
4657 // subexpr if we had a match, because they may have
4658 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004659 if (status == RA_MATCH)
4660 {
4661 status = RA_NOMATCH;
4662 restore_subexpr(((regbehind_T *)rp) - 1);
4663 }
4664 }
4665 regstack_pop(&scan);
4666 regstack.ga_len -= sizeof(regbehind_T);
4667 }
4668 }
4669 break;
4670
4671 case RS_STAR_LONG:
4672 case RS_STAR_SHORT:
4673 {
4674 regstar_T *rst = ((regstar_T *)rp) - 1;
4675
4676 if (status == RA_MATCH)
4677 {
4678 regstack_pop(&scan);
4679 regstack.ga_len -= sizeof(regstar_T);
4680 break;
4681 }
4682
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004683 // Tried once already, restore input pointers.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004684 if (status != RA_BREAK)
4685 reg_restore(&rp->rs_un.regsave, &backpos);
4686
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004687 // Repeat until we found a position where it could match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004688 for (;;)
4689 {
4690 if (status != RA_BREAK)
4691 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004692 // Tried first position already, advance.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004693 if (rp->rs_state == RS_STAR_LONG)
4694 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004695 // Trying for longest match, but couldn't or
4696 // didn't match -- back up one char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004697 if (--rst->count < rst->minval)
4698 break;
4699 if (rex.input == rex.line)
4700 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004701 // backup to last char of previous line
Bram Moolenaar6456fae2022-02-22 13:37:31 +00004702 if (rex.lnum == 0)
4703 {
4704 status = RA_NOMATCH;
4705 break;
4706 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004707 --rex.lnum;
4708 rex.line = reg_getline(rex.lnum);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004709 // Just in case regrepeat() didn't count
4710 // right.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004711 if (rex.line == NULL)
4712 break;
John Marriott82792db2024-05-12 00:07:17 +02004713 rex.input = rex.line + reg_getline_len(rex.lnum);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004714 fast_breakcheck();
4715 }
4716 else
4717 MB_PTR_BACK(rex.line, rex.input);
4718 }
4719 else
4720 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004721 // Range is backwards, use shortest match first.
4722 // Careful: maxval and minval are exchanged!
4723 // Couldn't or didn't match: try advancing one
4724 // char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004725 if (rst->count == rst->minval
4726 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
4727 break;
4728 ++rst->count;
4729 }
4730 if (got_int)
4731 break;
4732 }
4733 else
4734 status = RA_NOMATCH;
4735
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004736 // If it could match, try it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004737 if (rst->nextb == NUL || *rex.input == rst->nextb
4738 || *rex.input == rst->nextb_ic)
4739 {
4740 reg_save(&rp->rs_un.regsave, &backpos);
4741 scan = regnext(rp->rs_scan);
4742 status = RA_CONT;
4743 break;
4744 }
4745 }
4746 if (status != RA_CONT)
4747 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004748 // Failed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004749 regstack_pop(&scan);
4750 regstack.ga_len -= sizeof(regstar_T);
4751 status = RA_NOMATCH;
4752 }
4753 }
4754 break;
4755 }
4756
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004757 // If we want to continue the inner loop or didn't pop a state
4758 // continue matching loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004759 if (status == RA_CONT || rp == (regitem_T *)
4760 ((char *)regstack.ga_data + regstack.ga_len) - 1)
4761 break;
Bram Moolenaar616592e2022-06-17 15:17:10 +01004762
4763#ifdef FEAT_RELTIME
4764 if (bt_did_time_out(timed_out))
4765 {
4766 status = RA_FAIL;
4767 break;
4768 }
4769#endif
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004770 }
4771
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004772 // May need to continue with the inner loop, starting at "scan".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004773 if (status == RA_CONT)
4774 continue;
4775
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004776 // If the regstack is empty or something failed we are done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004777 if (regstack.ga_len == 0 || status == RA_FAIL)
4778 {
4779 if (scan == NULL)
4780 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004781 // We get here only if there's trouble -- normally "case END" is
4782 // the terminating point.
RestorerZ68ebcee2023-05-31 17:12:14 +01004783 iemsg(e_corrupted_regexp_program);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004784#ifdef DEBUG
4785 printf("Premature EOL\n");
4786#endif
4787 }
4788 return (status == RA_MATCH);
4789 }
4790
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004791 } // End of loop until the regstack is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004792
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004793 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004794}
4795
4796/*
4797 * regtry - try match of "prog" with at rex.line["col"].
4798 * Returns 0 for failure, number of lines contained in the match otherwise.
4799 */
4800 static long
4801regtry(
4802 bt_regprog_T *prog,
4803 colnr_T col,
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004804 int *timed_out) // flag set on timeout or NULL
4805{
4806 rex.input = rex.line + col;
4807 rex.need_clear_subexpr = TRUE;
4808#ifdef FEAT_SYN_HL
4809 // Clear the external match subpointers if necessary.
4810 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
4811#endif
4812
Paul Ollis65745772022-06-05 16:55:54 +01004813 if (regmatch(prog->program + 1, timed_out) == 0)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004814 return 0;
4815
4816 cleanup_subexpr();
4817 if (REG_MULTI)
4818 {
4819 if (rex.reg_startpos[0].lnum < 0)
4820 {
4821 rex.reg_startpos[0].lnum = 0;
4822 rex.reg_startpos[0].col = col;
4823 }
4824 if (rex.reg_endpos[0].lnum < 0)
4825 {
4826 rex.reg_endpos[0].lnum = rex.lnum;
4827 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
4828 }
4829 else
4830 // Use line number of "\ze".
4831 rex.lnum = rex.reg_endpos[0].lnum;
4832 }
4833 else
4834 {
4835 if (rex.reg_startp[0] == NULL)
4836 rex.reg_startp[0] = rex.line + col;
4837 if (rex.reg_endp[0] == NULL)
4838 rex.reg_endp[0] = rex.input;
4839 }
4840#ifdef FEAT_SYN_HL
4841 // Package any found \z(...\) matches for export. Default is none.
4842 unref_extmatch(re_extmatch_out);
4843 re_extmatch_out = NULL;
4844
4845 if (prog->reghasz == REX_SET)
4846 {
4847 int i;
4848
4849 cleanup_zsubexpr();
4850 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01004851 if (re_extmatch_out == NULL)
4852 return 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004853 for (i = 0; i < NSUBEXP; i++)
4854 {
4855 if (REG_MULTI)
4856 {
4857 // Only accept single line matches.
4858 if (reg_startzpos[i].lnum >= 0
4859 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
4860 && reg_endzpos[i].col >= reg_startzpos[i].col)
4861 re_extmatch_out->matches[i] =
4862 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
4863 + reg_startzpos[i].col,
4864 reg_endzpos[i].col - reg_startzpos[i].col);
4865 }
4866 else
4867 {
4868 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
4869 re_extmatch_out->matches[i] =
4870 vim_strnsave(reg_startzp[i],
Bram Moolenaar71ccd032020-06-12 22:59:11 +02004871 reg_endzp[i] - reg_startzp[i]);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004872 }
4873 }
4874 }
4875#endif
4876 return 1 + rex.lnum;
4877}
4878
4879/*
4880 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02004881 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004882 * Returns 0 for failure, number of lines contained in the match otherwise.
4883 */
4884 static long
4885bt_regexec_both(
4886 char_u *line,
Bram Moolenaar01105b32022-11-26 11:47:10 +00004887 colnr_T startcol, // column to start looking for match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004888 int *timed_out) // flag set on timeout or NULL
4889{
4890 bt_regprog_T *prog;
4891 char_u *s;
Bram Moolenaar01105b32022-11-26 11:47:10 +00004892 colnr_T col = startcol;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004893 long retval = 0L;
4894
4895 // Create "regstack" and "backpos" if they are not allocated yet.
4896 // We allocate *_INITIAL amount of bytes first and then set the grow size
4897 // to much bigger value to avoid many malloc calls in case of deep regular
4898 // expressions.
4899 if (regstack.ga_data == NULL)
4900 {
4901 // Use an item size of 1 byte, since we push different things
4902 // onto the regstack.
4903 ga_init2(&regstack, 1, REGSTACK_INITIAL);
4904 (void)ga_grow(&regstack, REGSTACK_INITIAL);
4905 regstack.ga_growsize = REGSTACK_INITIAL * 8;
4906 }
4907
4908 if (backpos.ga_data == NULL)
4909 {
4910 ga_init2(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
4911 (void)ga_grow(&backpos, BACKPOS_INITIAL);
4912 backpos.ga_growsize = BACKPOS_INITIAL * 8;
4913 }
4914
4915 if (REG_MULTI)
4916 {
4917 prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
4918 line = reg_getline((linenr_T)0);
4919 rex.reg_startpos = rex.reg_mmatch->startpos;
4920 rex.reg_endpos = rex.reg_mmatch->endpos;
4921 }
4922 else
4923 {
4924 prog = (bt_regprog_T *)rex.reg_match->regprog;
4925 rex.reg_startp = rex.reg_match->startp;
4926 rex.reg_endp = rex.reg_match->endp;
4927 }
4928
4929 // Be paranoid...
4930 if (prog == NULL || line == NULL)
4931 {
RestorerZ68ebcee2023-05-31 17:12:14 +01004932 iemsg(e_null_argument);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004933 goto theend;
4934 }
4935
4936 // Check validity of program.
4937 if (prog_magic_wrong())
4938 goto theend;
4939
4940 // If the start column is past the maximum column: no need to try.
4941 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4942 goto theend;
4943
4944 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
4945 if (prog->regflags & RF_ICASE)
4946 rex.reg_ic = TRUE;
4947 else if (prog->regflags & RF_NOICASE)
4948 rex.reg_ic = FALSE;
4949
4950 // If pattern contains "\Z" overrule value of rex.reg_icombine
4951 if (prog->regflags & RF_ICOMBINE)
4952 rex.reg_icombine = TRUE;
4953
4954 // If there is a "must appear" string, look for it.
4955 if (prog->regmust != NULL)
4956 {
4957 int c;
4958
4959 if (has_mbyte)
4960 c = (*mb_ptr2char)(prog->regmust);
4961 else
4962 c = *prog->regmust;
4963 s = line + col;
4964
4965 // This is used very often, esp. for ":global". Use three versions of
4966 // the loop to avoid overhead of conditions.
4967 if (!rex.reg_ic && !has_mbyte)
4968 while ((s = vim_strbyte(s, c)) != NULL)
4969 {
4970 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4971 break; // Found it.
4972 ++s;
4973 }
4974 else if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
4975 while ((s = vim_strchr(s, c)) != NULL)
4976 {
4977 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4978 break; // Found it.
4979 MB_PTR_ADV(s);
4980 }
4981 else
4982 while ((s = cstrchr(s, c)) != NULL)
4983 {
4984 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4985 break; // Found it.
4986 MB_PTR_ADV(s);
4987 }
4988 if (s == NULL) // Not present.
4989 goto theend;
4990 }
4991
4992 rex.line = line;
4993 rex.lnum = 0;
4994 reg_toolong = FALSE;
4995
4996 // Simplest case: Anchored match need be tried only once.
4997 if (prog->reganch)
4998 {
4999 int c;
5000
5001 if (has_mbyte)
5002 c = (*mb_ptr2char)(rex.line + col);
5003 else
5004 c = rex.line[col];
5005 if (prog->regstart == NUL
5006 || prog->regstart == c
5007 || (rex.reg_ic
5008 && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
5009 || (c < 255 && prog->regstart < 255 &&
5010 MB_TOLOWER(prog->regstart) == MB_TOLOWER(c)))))
Paul Ollis65745772022-06-05 16:55:54 +01005011 retval = regtry(prog, col, timed_out);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005012 else
5013 retval = 0;
5014 }
5015 else
5016 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005017 // Messy cases: unanchored match.
5018 while (!got_int)
5019 {
5020 if (prog->regstart != NUL)
5021 {
5022 // Skip until the char we know it must start with.
5023 // Used often, do some work to avoid call overhead.
5024 if (!rex.reg_ic && !has_mbyte)
5025 s = vim_strbyte(rex.line + col, prog->regstart);
5026 else
5027 s = cstrchr(rex.line + col, prog->regstart);
5028 if (s == NULL)
5029 {
5030 retval = 0;
5031 break;
5032 }
5033 col = (int)(s - rex.line);
5034 }
5035
5036 // Check for maximum column to try.
5037 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
5038 {
5039 retval = 0;
5040 break;
5041 }
5042
Paul Ollis65745772022-06-05 16:55:54 +01005043 retval = regtry(prog, col, timed_out);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005044 if (retval > 0)
5045 break;
5046
5047 // if not currently on the first line, get it again
5048 if (rex.lnum != 0)
5049 {
5050 rex.lnum = 0;
5051 rex.line = reg_getline((linenr_T)0);
5052 }
5053 if (rex.line[col] == NUL)
5054 break;
5055 if (has_mbyte)
5056 col += (*mb_ptr2len)(rex.line + col);
5057 else
5058 ++col;
5059#ifdef FEAT_RELTIME
Bram Moolenaar616592e2022-06-17 15:17:10 +01005060 if (bt_did_time_out(timed_out))
Paul Ollis65745772022-06-05 16:55:54 +01005061 break;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005062#endif
5063 }
5064 }
5065
5066theend:
5067 // Free "reg_tofree" when it's a bit big.
5068 // Free regstack and backpos if they are bigger than their initial size.
5069 if (reg_tofreelen > 400)
5070 VIM_CLEAR(reg_tofree);
5071 if (regstack.ga_maxlen > REGSTACK_INITIAL)
5072 ga_clear(&regstack);
5073 if (backpos.ga_maxlen > BACKPOS_INITIAL)
5074 ga_clear(&backpos);
5075
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005076 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005077 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005078 // Make sure the end is never before the start. Can happen when \zs
5079 // and \ze are used.
5080 if (REG_MULTI)
5081 {
5082 lpos_T *start = &rex.reg_mmatch->startpos[0];
5083 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005084
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005085 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005086 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005087 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
Bram Moolenaar01105b32022-11-26 11:47:10 +00005088
5089 // startpos[0] may be set by "\zs", also return the column where
5090 // the whole pattern matched.
5091 rex.reg_mmatch->rmm_matchcol = col;
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005092 }
5093 else
5094 {
5095 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
5096 rex.reg_match->endp[0] = rex.reg_match->startp[0];
Bram Moolenaar01105b32022-11-26 11:47:10 +00005097
5098 // startpos[0] may be set by "\zs", also return the column where
5099 // the whole pattern matched.
5100 rex.reg_match->rm_matchcol = col;
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005101 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005102 }
5103
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005104 return retval;
5105}
5106
5107/*
5108 * Match a regexp against a string.
5109 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5110 * Uses curbuf for line count and 'iskeyword'.
5111 * if "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
5112 *
5113 * Returns 0 for failure, number of lines contained in the match otherwise.
5114 */
5115 static int
5116bt_regexec_nl(
5117 regmatch_T *rmp,
5118 char_u *line, // string to match against
5119 colnr_T col, // column to start looking for match
5120 int line_lbr)
5121{
5122 rex.reg_match = rmp;
5123 rex.reg_mmatch = NULL;
5124 rex.reg_maxline = 0;
5125 rex.reg_line_lbr = line_lbr;
5126 rex.reg_buf = curbuf;
5127 rex.reg_win = NULL;
5128 rex.reg_ic = rmp->rm_ic;
5129 rex.reg_icombine = FALSE;
5130 rex.reg_maxcol = 0;
5131
Paul Ollis65745772022-06-05 16:55:54 +01005132 return bt_regexec_both(line, col, NULL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005133}
5134
5135/*
5136 * Match a regexp against multiple lines.
5137 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5138 * Uses curbuf for line count and 'iskeyword'.
5139 *
5140 * Return zero if there is no match. Return number of lines contained in the
5141 * match otherwise.
5142 */
5143 static long
5144bt_regexec_multi(
5145 regmmatch_T *rmp,
5146 win_T *win, // window in which to search or NULL
5147 buf_T *buf, // buffer in which to search
5148 linenr_T lnum, // nr of line to start looking for match
5149 colnr_T col, // column to start looking for match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005150 int *timed_out) // flag set on timeout or NULL
5151{
Bram Moolenaarf4140482020-02-15 23:06:45 +01005152 init_regexec_multi(rmp, win, buf, lnum);
Paul Ollis65745772022-06-05 16:55:54 +01005153 return bt_regexec_both(NULL, col, timed_out);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005154}
5155
5156/*
5157 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5158 */
5159 static int
5160re_num_cmp(long_u val, char_u *scan)
5161{
5162 long_u n = OPERAND_MIN(scan);
5163
5164 if (OPERAND_CMP(scan) == '>')
5165 return val > n;
5166 if (OPERAND_CMP(scan) == '<')
5167 return val < n;
5168 return val == n;
5169}
5170
5171#ifdef BT_REGEXP_DUMP
5172
5173/*
5174 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5175 */
5176 static void
5177regdump(char_u *pattern, bt_regprog_T *r)
5178{
5179 char_u *s;
5180 int op = EXACTLY; // Arbitrary non-END op.
5181 char_u *next;
5182 char_u *end = NULL;
5183 FILE *f;
5184
5185#ifdef BT_REGEXP_LOG
5186 f = fopen("bt_regexp_log.log", "a");
5187#else
5188 f = stdout;
5189#endif
5190 if (f == NULL)
5191 return;
5192 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);
5193
5194 s = r->program + 1;
5195 // Loop until we find the END that isn't before a referred next (an END
5196 // can also appear in a NOMATCH operand).
5197 while (op != END || s <= end)
5198 {
5199 op = OP(s);
5200 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); // Where, what.
5201 next = regnext(s);
5202 if (next == NULL) // Next ptr.
5203 fprintf(f, "(0)");
5204 else
5205 fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
5206 if (end < next)
5207 end = next;
5208 if (op == BRACE_LIMITS)
5209 {
5210 // Two ints
5211 fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5212 s += 8;
5213 }
5214 else if (op == BEHIND || op == NOBEHIND)
5215 {
5216 // one int
5217 fprintf(f, " count %ld", OPERAND_MIN(s));
5218 s += 4;
5219 }
5220 else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL)
5221 {
5222 // one int plus comparator
5223 fprintf(f, " count %ld", OPERAND_MIN(s));
5224 s += 5;
5225 }
5226 s += 3;
5227 if (op == ANYOF || op == ANYOF + ADD_NL
5228 || op == ANYBUT || op == ANYBUT + ADD_NL
5229 || op == EXACTLY)
5230 {
5231 // Literal string, where present.
5232 fprintf(f, "\nxxxxxxxxx\n");
5233 while (*s != NUL)
5234 fprintf(f, "%c", *s++);
5235 fprintf(f, "\nxxxxxxxxx\n");
5236 s++;
5237 }
5238 fprintf(f, "\r\n");
5239 }
5240
5241 // Header fields of interest.
5242 if (r->regstart != NUL)
5243 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
5244 ? (char *)transchar(r->regstart)
5245 : "multibyte", r->regstart);
5246 if (r->reganch)
5247 fprintf(f, "anchored; ");
5248 if (r->regmust != NULL)
5249 fprintf(f, "must have \"%s\"", r->regmust);
5250 fprintf(f, "\r\n");
5251
5252#ifdef BT_REGEXP_LOG
5253 fclose(f);
5254#endif
5255}
5256#endif // BT_REGEXP_DUMP
5257
5258#ifdef DEBUG
5259/*
5260 * regprop - printable representation of opcode
5261 */
5262 static char_u *
5263regprop(char_u *op)
5264{
5265 char *p;
5266 static char buf[50];
John Marriott82792db2024-05-12 00:07:17 +02005267 static size_t buflen = 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005268
5269 STRCPY(buf, ":");
John Marriott82792db2024-05-12 00:07:17 +02005270 buflen = 1;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005271
5272 switch ((int) OP(op))
5273 {
5274 case BOL:
5275 p = "BOL";
5276 break;
5277 case EOL:
5278 p = "EOL";
5279 break;
5280 case RE_BOF:
5281 p = "BOF";
5282 break;
5283 case RE_EOF:
5284 p = "EOF";
5285 break;
5286 case CURSOR:
5287 p = "CURSOR";
5288 break;
5289 case RE_VISUAL:
5290 p = "RE_VISUAL";
5291 break;
5292 case RE_LNUM:
5293 p = "RE_LNUM";
5294 break;
5295 case RE_MARK:
5296 p = "RE_MARK";
5297 break;
5298 case RE_COL:
5299 p = "RE_COL";
5300 break;
5301 case RE_VCOL:
5302 p = "RE_VCOL";
5303 break;
5304 case BOW:
5305 p = "BOW";
5306 break;
5307 case EOW:
5308 p = "EOW";
5309 break;
5310 case ANY:
5311 p = "ANY";
5312 break;
5313 case ANY + ADD_NL:
5314 p = "ANY+NL";
5315 break;
5316 case ANYOF:
5317 p = "ANYOF";
5318 break;
5319 case ANYOF + ADD_NL:
5320 p = "ANYOF+NL";
5321 break;
5322 case ANYBUT:
5323 p = "ANYBUT";
5324 break;
5325 case ANYBUT + ADD_NL:
5326 p = "ANYBUT+NL";
5327 break;
5328 case IDENT:
5329 p = "IDENT";
5330 break;
5331 case IDENT + ADD_NL:
5332 p = "IDENT+NL";
5333 break;
5334 case SIDENT:
5335 p = "SIDENT";
5336 break;
5337 case SIDENT + ADD_NL:
5338 p = "SIDENT+NL";
5339 break;
5340 case KWORD:
5341 p = "KWORD";
5342 break;
5343 case KWORD + ADD_NL:
5344 p = "KWORD+NL";
5345 break;
5346 case SKWORD:
5347 p = "SKWORD";
5348 break;
5349 case SKWORD + ADD_NL:
5350 p = "SKWORD+NL";
5351 break;
5352 case FNAME:
5353 p = "FNAME";
5354 break;
5355 case FNAME + ADD_NL:
5356 p = "FNAME+NL";
5357 break;
5358 case SFNAME:
5359 p = "SFNAME";
5360 break;
5361 case SFNAME + ADD_NL:
5362 p = "SFNAME+NL";
5363 break;
5364 case PRINT:
5365 p = "PRINT";
5366 break;
5367 case PRINT + ADD_NL:
5368 p = "PRINT+NL";
5369 break;
5370 case SPRINT:
5371 p = "SPRINT";
5372 break;
5373 case SPRINT + ADD_NL:
5374 p = "SPRINT+NL";
5375 break;
5376 case WHITE:
5377 p = "WHITE";
5378 break;
5379 case WHITE + ADD_NL:
5380 p = "WHITE+NL";
5381 break;
5382 case NWHITE:
5383 p = "NWHITE";
5384 break;
5385 case NWHITE + ADD_NL:
5386 p = "NWHITE+NL";
5387 break;
5388 case DIGIT:
5389 p = "DIGIT";
5390 break;
5391 case DIGIT + ADD_NL:
5392 p = "DIGIT+NL";
5393 break;
5394 case NDIGIT:
5395 p = "NDIGIT";
5396 break;
5397 case NDIGIT + ADD_NL:
5398 p = "NDIGIT+NL";
5399 break;
5400 case HEX:
5401 p = "HEX";
5402 break;
5403 case HEX + ADD_NL:
5404 p = "HEX+NL";
5405 break;
5406 case NHEX:
5407 p = "NHEX";
5408 break;
5409 case NHEX + ADD_NL:
5410 p = "NHEX+NL";
5411 break;
5412 case OCTAL:
5413 p = "OCTAL";
5414 break;
5415 case OCTAL + ADD_NL:
5416 p = "OCTAL+NL";
5417 break;
5418 case NOCTAL:
5419 p = "NOCTAL";
5420 break;
5421 case NOCTAL + ADD_NL:
5422 p = "NOCTAL+NL";
5423 break;
5424 case WORD:
5425 p = "WORD";
5426 break;
5427 case WORD + ADD_NL:
5428 p = "WORD+NL";
5429 break;
5430 case NWORD:
5431 p = "NWORD";
5432 break;
5433 case NWORD + ADD_NL:
5434 p = "NWORD+NL";
5435 break;
5436 case HEAD:
5437 p = "HEAD";
5438 break;
5439 case HEAD + ADD_NL:
5440 p = "HEAD+NL";
5441 break;
5442 case NHEAD:
5443 p = "NHEAD";
5444 break;
5445 case NHEAD + ADD_NL:
5446 p = "NHEAD+NL";
5447 break;
5448 case ALPHA:
5449 p = "ALPHA";
5450 break;
5451 case ALPHA + ADD_NL:
5452 p = "ALPHA+NL";
5453 break;
5454 case NALPHA:
5455 p = "NALPHA";
5456 break;
5457 case NALPHA + ADD_NL:
5458 p = "NALPHA+NL";
5459 break;
5460 case LOWER:
5461 p = "LOWER";
5462 break;
5463 case LOWER + ADD_NL:
5464 p = "LOWER+NL";
5465 break;
5466 case NLOWER:
5467 p = "NLOWER";
5468 break;
5469 case NLOWER + ADD_NL:
5470 p = "NLOWER+NL";
5471 break;
5472 case UPPER:
5473 p = "UPPER";
5474 break;
5475 case UPPER + ADD_NL:
5476 p = "UPPER+NL";
5477 break;
5478 case NUPPER:
5479 p = "NUPPER";
5480 break;
5481 case NUPPER + ADD_NL:
5482 p = "NUPPER+NL";
5483 break;
5484 case BRANCH:
5485 p = "BRANCH";
5486 break;
5487 case EXACTLY:
5488 p = "EXACTLY";
5489 break;
5490 case NOTHING:
5491 p = "NOTHING";
5492 break;
5493 case BACK:
5494 p = "BACK";
5495 break;
5496 case END:
5497 p = "END";
5498 break;
5499 case MOPEN + 0:
5500 p = "MATCH START";
5501 break;
5502 case MOPEN + 1:
5503 case MOPEN + 2:
5504 case MOPEN + 3:
5505 case MOPEN + 4:
5506 case MOPEN + 5:
5507 case MOPEN + 6:
5508 case MOPEN + 7:
5509 case MOPEN + 8:
5510 case MOPEN + 9:
John Marriott82792db2024-05-12 00:07:17 +02005511 buflen += sprintf(buf + buflen, "MOPEN%d", OP(op) - MOPEN);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005512 p = NULL;
5513 break;
5514 case MCLOSE + 0:
5515 p = "MATCH END";
5516 break;
5517 case MCLOSE + 1:
5518 case MCLOSE + 2:
5519 case MCLOSE + 3:
5520 case MCLOSE + 4:
5521 case MCLOSE + 5:
5522 case MCLOSE + 6:
5523 case MCLOSE + 7:
5524 case MCLOSE + 8:
5525 case MCLOSE + 9:
John Marriott82792db2024-05-12 00:07:17 +02005526 buflen += sprintf(buf + buflen, "MCLOSE%d", OP(op) - MCLOSE);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005527 p = NULL;
5528 break;
5529 case BACKREF + 1:
5530 case BACKREF + 2:
5531 case BACKREF + 3:
5532 case BACKREF + 4:
5533 case BACKREF + 5:
5534 case BACKREF + 6:
5535 case BACKREF + 7:
5536 case BACKREF + 8:
5537 case BACKREF + 9:
John Marriott82792db2024-05-12 00:07:17 +02005538 buflen += sprintf(buf + buflen, "BACKREF%d", OP(op) - BACKREF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005539 p = NULL;
5540 break;
5541 case NOPEN:
5542 p = "NOPEN";
5543 break;
5544 case NCLOSE:
5545 p = "NCLOSE";
5546 break;
5547#ifdef FEAT_SYN_HL
5548 case ZOPEN + 1:
5549 case ZOPEN + 2:
5550 case ZOPEN + 3:
5551 case ZOPEN + 4:
5552 case ZOPEN + 5:
5553 case ZOPEN + 6:
5554 case ZOPEN + 7:
5555 case ZOPEN + 8:
5556 case ZOPEN + 9:
John Marriott82792db2024-05-12 00:07:17 +02005557 buflen += sprintf(buf + buflen, "ZOPEN%d", OP(op) - ZOPEN);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005558 p = NULL;
5559 break;
5560 case ZCLOSE + 1:
5561 case ZCLOSE + 2:
5562 case ZCLOSE + 3:
5563 case ZCLOSE + 4:
5564 case ZCLOSE + 5:
5565 case ZCLOSE + 6:
5566 case ZCLOSE + 7:
5567 case ZCLOSE + 8:
5568 case ZCLOSE + 9:
John Marriott82792db2024-05-12 00:07:17 +02005569 buflen += sprintf(buf + buflen, "ZCLOSE%d", OP(op) - ZCLOSE);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005570 p = NULL;
5571 break;
5572 case ZREF + 1:
5573 case ZREF + 2:
5574 case ZREF + 3:
5575 case ZREF + 4:
5576 case ZREF + 5:
5577 case ZREF + 6:
5578 case ZREF + 7:
5579 case ZREF + 8:
5580 case ZREF + 9:
Christian Brabandt60430242024-05-14 11:19:47 +02005581 buflen += sprintf(buf + buflen, "ZREF%d", OP(op) - ZREF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005582 p = NULL;
5583 break;
5584#endif
5585 case STAR:
5586 p = "STAR";
5587 break;
5588 case PLUS:
5589 p = "PLUS";
5590 break;
5591 case NOMATCH:
5592 p = "NOMATCH";
5593 break;
5594 case MATCH:
5595 p = "MATCH";
5596 break;
5597 case BEHIND:
5598 p = "BEHIND";
5599 break;
5600 case NOBEHIND:
5601 p = "NOBEHIND";
5602 break;
5603 case SUBPAT:
5604 p = "SUBPAT";
5605 break;
5606 case BRACE_LIMITS:
5607 p = "BRACE_LIMITS";
5608 break;
5609 case BRACE_SIMPLE:
5610 p = "BRACE_SIMPLE";
5611 break;
5612 case BRACE_COMPLEX + 0:
5613 case BRACE_COMPLEX + 1:
5614 case BRACE_COMPLEX + 2:
5615 case BRACE_COMPLEX + 3:
5616 case BRACE_COMPLEX + 4:
5617 case BRACE_COMPLEX + 5:
5618 case BRACE_COMPLEX + 6:
5619 case BRACE_COMPLEX + 7:
5620 case BRACE_COMPLEX + 8:
5621 case BRACE_COMPLEX + 9:
John Marriott82792db2024-05-12 00:07:17 +02005622 buflen += sprintf(buf + buflen, "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005623 p = NULL;
5624 break;
5625 case MULTIBYTECODE:
5626 p = "MULTIBYTECODE";
5627 break;
5628 case NEWL:
5629 p = "NEWL";
5630 break;
5631 default:
John Marriott82792db2024-05-12 00:07:17 +02005632 buflen += sprintf(buf + buflen, "corrupt %d", OP(op));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005633 p = NULL;
5634 break;
5635 }
5636 if (p != NULL)
John Marriott82792db2024-05-12 00:07:17 +02005637 STRCPY(buf + buflen, p);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005638 return (char_u *)buf;
5639}
5640#endif // DEBUG