blob: f4bd6c36d2c57c830af0f04fe1ca9b5e035100a4 [file] [log] [blame]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * Backtracking regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 *
7 * NOTICE:
8 *
9 * This is NOT the original regular expression code as written by Henry
10 * Spencer. This code has been modified specifically for use with the VIM
11 * editor, and should not be used separately from Vim. If you want a good
12 * regular expression library, get the original code. The copyright notice
13 * that follows is from the original.
14 *
15 * END NOTICE
16 *
17 * Copyright (c) 1986 by University of Toronto.
18 * Written by Henry Spencer. Not derived from licensed software.
19 *
20 * Permission is granted to anyone to use this software for any
21 * purpose on any computer system, and to redistribute it freely,
22 * subject to the following restrictions:
23 *
24 * 1. The author is not responsible for the consequences of use of
25 * this software, no matter how awful, even if they arise
26 * from defects in it.
27 *
28 * 2. The origin of this software must not be misrepresented, either
29 * by explicit claim or by omission.
30 *
31 * 3. Altered versions must be plainly marked as such, and must not
32 * be misrepresented as being the original software.
33 *
34 * Beware that some of this code is subtly aware of the way operator
35 * precedence is structured in regular expressions. Serious changes in
36 * regular-expression syntax might require a total rethink.
37 *
38 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
39 * Webb, Ciaran McCreesh and Bram Moolenaar.
40 * Named character class support added by Walter Briscoe (1998 Jul 01)
41 */
42
43/*
44 * The "internal use only" fields in regexp.h are present to pass info from
45 * compile to execute that permits the execute phase to run lots faster on
46 * simple cases. They are:
47 *
48 * regstart char that must begin a match; NUL if none obvious; Can be a
49 * multi-byte character.
50 * reganch is the match anchored (at beginning-of-line only)?
51 * regmust string (pointer into program) that match must include, or NULL
52 * regmlen length of regmust string
53 * regflags RF_ values or'ed together
54 *
55 * Regstart and reganch permit very fast decisions on suitable starting points
56 * for a match, cutting down the work a lot. Regmust permits fast rejection
57 * of lines that cannot possibly match. The regmust tests are costly enough
58 * that vim_regcomp() supplies a regmust only if the r.e. contains something
59 * potentially expensive (at present, the only such thing detected is * or +
60 * at the start of the r.e., which can involve a lot of backup). Regmlen is
61 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
62 * computing it anyway.
63 */
64
65/*
66 * Structure for regexp "program". This is essentially a linear encoding
67 * of a nondeterministic finite-state machine (aka syntax charts or
68 * "railroad normal form" in parsing technology). Each node is an opcode
69 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
 * pointer with a BRANCH on both ends of it is connecting two alternatives.
 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
75 * node points to the node after the stuff to be repeated.
76 * The operand of some types of node is a literal string; for others, it is a
77 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
78 * is the first node of the branch.
79 * (NB this is *not* a tree structure: the tail of the branch connects to the
80 * thing following the set of BRANCHes.)
81 *
82 * pattern is coded like:
83 *
84 * +-----------------+
85 * | V
86 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
87 * | ^ | ^
88 * +------+ +----------+
89 *
90 *
91 * +------------------+
92 * V |
93 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
94 * | | ^ ^
95 * | +---------------+ |
96 * +---------------------------------------------+
97 *
98 *
99 * +----------------------+
100 * V |
101 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
102 * | | ^ ^
103 * | +-----------+ |
104 * +--------------------------------------------------+
105 *
106 *
107 * +-------------------------+
108 * V |
109 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
110 * | | ^
111 * | +----------------+
112 * +-----------------------------------------------+
113 *
114 *
115 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
116 * | | ^ ^
117 * | +----------------+ |
118 * +--------------------------------+
119 *
120 * +---------+
121 * | V
122 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
123 * | | | | ^ ^
124 * | | | +-----+ |
125 * | | +----------------+ |
126 * | +---------------------------+ |
127 * +------------------------------------------------------+
128 *
129 * They all start with a BRANCH for "\|" alternatives, even when there is only
130 * one alternative.
131 */
132
/*
 * Opcodes of the regexp program, part one.
 * The "opnd" column names the operand that follows the node, when there is
 * one: "node" is a sub-program, "str" a literal string, "nr" a number.
 */
//      name          value     opnd    meaning
#define END             0   //          end of program or NOMATCH operand
#define BOL             1   //          match "" at beginning of line
#define EOL             2   //          match "" at end of line
#define BRANCH          3   // node     match this alternative, or the next...
#define BACK            4   //          match "", "next" ptr points backward
#define EXACTLY         5   // str      match this string
#define NOTHING         6   //          match empty string
#define STAR            7   // node     match this (simple) thing 0 or more
                            //          times
#define PLUS            8   // node     match this (simple) thing 1 or more
                            //          times
#define MATCH           9   // node     match the operand zero-width
#define NOMATCH         10  // node     check for no match with operand
#define BEHIND          11  // node     look behind for a match with operand
#define NOBEHIND        12  // node     look behind for no match with operand
#define SUBPAT          13  // node     match the operand here
#define BRACE_SIMPLE    14  // node     match this (simple) thing between m
                            //          and n times (\{m,n\}).
#define BOW             15  //          match "" after [^a-zA-Z0-9_]
#define EOW             16  //          match "" at [^a-zA-Z0-9_]
#define BRACE_LIMITS    17  // nr nr    define the min & max for BRACE_SIMPLE
                            //          and BRACE_COMPLEX
#define NEWL            18  //          match line-break
#define BHPOS           19  //          end position for BEHIND or NOBEHIND
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200163
164
// Character classes: opcodes 20-48 are the normal ones; adding ADD_NL to one
// of them gives the variant (50-78) that also matches a line-break.
#define ADD_NL          30
// FIRST_NL and LAST_NL are parenthesized so that they keep their value when
// used inside a larger expression (e.g. "2 * FIRST_NL" or "x - LAST_NL").
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20  //      match any one character
#define ANYOF           21  // str  match any character in this string
#define ANYBUT          22  // str  match any character not in this string
#define IDENT           23  //      match identifier char
#define SIDENT          24  //      match identifier char but no digit
#define KWORD           25  //      match keyword char
#define SKWORD          26  //      match word char but no digit
#define FNAME           27  //      match file name char
#define SFNAME          28  //      match file name char but no digit
#define PRINT           29  //      match printable char
#define SPRINT          30  //      match printable char but no digit
#define WHITE           31  //      match whitespace char
#define NWHITE          32  //      match non-whitespace char
#define DIGIT           33  //      match digit char
#define NDIGIT          34  //      match non-digit char
#define HEX             35  //      match hex char
#define NHEX            36  //      match non-hex char
#define OCTAL           37  //      match octal char
#define NOCTAL          38  //      match non-octal char
#define WORD            39  //      match word char
#define NWORD           40  //      match non-word char
#define HEAD            41  //      match head char
#define NHEAD           42  //      match non-head char
#define ALPHA           43  //      match alpha char
#define NALPHA          44  //      match non-alpha char
#define LOWER           45  //      match lowercase char
#define NLOWER          46  //      match non-lowercase char
#define UPPER           47  //      match uppercase char
#define NUPPER          48  //      match non-uppercase char
#define LAST_NL         (NUPPER + ADD_NL)
// TRUE when opcode "op" is one of the classes that include a line-break.
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
200
// Opcodes of the regexp program, part two.  Where a range is given the
// opcode stands for ten values: the base plus 0 to 9.
#define MOPEN           80  // 80-89        mark this point in input as start
                            //              of \( subexpr; MOPEN + 0 marks
                            //              start of match
#define MCLOSE          90  // 90-99        analogous to MOPEN; MCLOSE + 0
                            //              marks end of match
#define BACKREF         100 // 100-109 node match same string again \1-\9

#ifdef FEAT_SYN_HL
# define ZOPEN          110 // 110-119      mark this point in input as start
                            //              of \z( subexpr
# define ZCLOSE         120 // 120-129      analogous to ZOPEN
# define ZREF           130 // 130-139 node match external submatch \z1-\z9
#endif

#define BRACE_COMPLEX   140 // 140-149 node match nodes between m & n times

#define NOPEN           150 //              mark this point in input as start
                            //              of \%( subexpr
#define NCLOSE          151 //              analogous to NOPEN

#define MULTIBYTECODE   200 // mbc          match one multi-byte character
#define RE_BOF          201 //              match "" at beginning of file
#define RE_EOF          202 //              match "" at end of file
#define CURSOR          203 //              match location of cursor

#define RE_LNUM         204 // nr cmp       match line number
#define RE_COL          205 // nr cmp       match column number
#define RE_VCOL         206 // nr cmp       match virtual column number

#define RE_MARK         207 // mark cmp     match mark position
#define RE_VISUAL       208 //              match Visual area
#define RE_COMPOSING    209 //              any composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200233
/*
 * Flags to be passed up and down.  Each one is a separate bit, WORST is the
 * absence of all of them.
 */
#define HASWIDTH        0x01    // known never to match null string
#define SIMPLE          0x02    // simple enough to be STAR/PLUS operand
#define SPSTART         0x04    // starts with * or +
#define HASNL           0x08    // contains some \n
#define HASLOOKBH       0x10    // contains "\@<=" or "\@<!"
#define WORST           0       // worst case
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200243
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200244static int num_complex_braces; // Complex \{...} count
245static char_u *regcode; // Code-emit pointer, or JUST_CALC_SIZE
246static long regsize; // Code size.
247static int reg_toolong; // TRUE when offset out of range
248static char_u had_endbrace[NSUBEXP]; // flags, TRUE if end of () found
249static long brace_min[10]; // Minimums for complex brace repeats
250static long brace_max[10]; // Maximums for complex brace repeats
251static int brace_count[10]; // Current counts for complex brace repeats
252static int one_exactly = FALSE; // only do one char for EXACTLY
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200253
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200254// When making changes to classchars also change nfa_classcodes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200255static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
256static int classcodes[] = {
257 ANY, IDENT, SIDENT, KWORD, SKWORD,
258 FNAME, SFNAME, PRINT, SPRINT,
259 WHITE, NWHITE, DIGIT, NDIGIT,
260 HEX, NHEX, OCTAL, NOCTAL,
261 WORD, NWORD, HEAD, NHEAD,
262 ALPHA, NALPHA, LOWER, NLOWER,
263 UPPER, NUPPER
264};
265
/*
 * Special value for "regcode": while it is set no code is emitted, only the
 * size of the code is computed.
 */
#define JUST_CALC_SIZE  ((char_u *)-1)
271
// Values for rs_state in regitem_T.
typedef enum regstate_E
{
    RS_NOPEN = 0,       // NOPEN and NCLOSE
    RS_MOPEN,           // MOPEN + [0-9]
    RS_MCLOSE,          // MCLOSE + [0-9]
#ifdef FEAT_SYN_HL
    RS_ZOPEN,           // ZOPEN + [0-9]
    RS_ZCLOSE,          // ZCLOSE + [0-9]
#endif
    RS_BRANCH,          // BRANCH
    RS_BRCPLX_MORE,     // BRACE_COMPLEX and trying one more match
    RS_BRCPLX_LONG,     // BRACE_COMPLEX and trying longest match
    RS_BRCPLX_SHORT,    // BRACE_COMPLEX and trying shortest match
    RS_NOMATCH,         // NOMATCH
    RS_BEHIND1,         // BEHIND / NOBEHIND matching rest
    RS_BEHIND2,         // BEHIND / NOBEHIND matching behind part
    RS_STAR_LONG,       // STAR/PLUS/BRACE_SIMPLE longest match
    RS_STAR_SHORT       // STAR/PLUS/BRACE_SIMPLE shortest match
} regstate_T;
292
293/*
294 * Structure used to save the current input state, when it needs to be
295 * restored after trying a match. Used by reg_save() and reg_restore().
296 * Also stores the length of "backpos".
297 */
298typedef struct
299{
300 union
301 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200302 char_u *ptr; // rex.input pointer, for single-line regexp
303 lpos_T pos; // rex.input pos, for multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200304 } rs_u;
305 int rs_len;
306} regsave_T;
307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200308// struct to save start/end pointer/position in for \(\)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200309typedef struct
310{
311 union
312 {
313 char_u *ptr;
314 lpos_T pos;
315 } se_u;
316} save_se_T;
317
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200318// used for BEHIND and NOBEHIND matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200319typedef struct regbehind_S
320{
321 regsave_T save_after;
322 regsave_T save_behind;
323 int save_need_clear_subexpr;
324 save_se_T save_start[NSUBEXP];
325 save_se_T save_end[NSUBEXP];
326} regbehind_T;
327
328/*
329 * When there are alternatives a regstate_T is put on the regstack to remember
330 * what we are doing.
331 * Before it may be another type of item, depending on rs_state, to remember
332 * more things.
333 */
334typedef struct regitem_S
335{
336 regstate_T rs_state; // what we are doing, one of RS_ above
337 short rs_no; // submatch nr or BEHIND/NOBEHIND
338 char_u *rs_scan; // current node in program
339 union
340 {
341 save_se_T sesave;
342 regsave_T regsave;
343 } rs_un; // room for saving rex.input
344} regitem_T;
345
346
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200347// used for STAR, PLUS and BRACE_SIMPLE matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200348typedef struct regstar_S
349{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200350 int nextb; // next byte
351 int nextb_ic; // next byte reverse case
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200352 long count;
353 long minval;
354 long maxval;
355} regstar_T;
356
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200357// used to store input position when a BACK was encountered, so that we now if
358// we made any progress since the last time.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200359typedef struct backpos_S
360{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200361 char_u *bp_scan; // "scan" where BACK was encountered
362 regsave_T bp_pos; // last input position
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363} backpos_T;
364
365/*
366 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
367 * to avoid invoking malloc() and free() often.
368 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
369 * or regbehind_T.
370 * "backpos_T" is a table with backpos_T for BACK
371 */
372static garray_T regstack = {0, 0, 0, 0, NULL};
373static garray_T backpos = {0, 0, 0, 0, NULL};
374
375static regsave_T behind_pos;
376
/*
 * Allocation strategy for the regstack and backpos tables, meant to reduce
 * the number of malloc/free calls:
 * - The initial size is fairly small.
 * - The tables are grown when needed (8 times at first, double after that).
 * - After executing the match the memory is freed only if the array has
 *   grown.  Thus the memory is kept allocated when it's at the initial size.
 * This makes it fast while not keeping a lot of memory allocated.
 * A three times speed increase was observed when using many simple patterns.
 */
#define REGSTACK_INITIAL    2048
#define BACKPOS_INITIAL     64
390
391/*
392 * Opcode notes:
393 *
394 * BRANCH The set of branches constituting a single choice are hooked
395 * together with their "next" pointers, since precedence prevents
396 * anything being concatenated to any individual branch. The
397 * "next" pointer of the last BRANCH in a choice points to the
398 * thing following the whole choice. This is also where the
399 * final "next" pointer of each individual branch points; each
400 * branch starts with the operand node of a BRANCH node.
401 *
402 * BACK Normal "next" pointers all implicitly point forward; BACK
403 * exists to make loop structures possible.
404 *
405 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
406 * BRANCH structures using BACK. Simple cases (one character
407 * per match) are implemented with STAR and PLUS for speed
408 * and to minimize recursive plunges.
409 *
410 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
411 * node, and defines the min and max limits to be used for that
412 * node.
413 *
414 * MOPEN,MCLOSE ...are numbered at compile time.
415 * ZOPEN,ZCLOSE ...ditto
416 */
417
/*
 * Layout of a node: one char of opcode, then two chars of "next" pointer,
 * then the operand, if there is one.  (Note that much of the code generation
 * knows about this implicit relationship.)
 * The "next" pointer is stored as two 8-bit bytes, high order first.  Its
 * value is a positive offset from the opcode of the node containing it.
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 */
#define OP(p)           ((int)*(p))
#define NEXT(p)         ((((p)[1] & 0xff) << 8) + ((p)[2] & 0xff))
#define OPERAND(p)      ((p) + 3)
// The first operand, stored as four bytes, MSB first.
#define OPERAND_MIN(p)  (((long)(p)[3] << 24) \
                            + ((long)(p)[4] << 16) \
                            + ((long)(p)[5] << 8) \
                            + (long)(p)[6])
// The second operand, stored as four bytes right after the first one.
#define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
// A single-byte operand stored after a four bytes operand.
#define OPERAND_CMP(p)  ((p)[7])
438
439static char_u *reg(int paren, int *flagp);
440
441#ifdef BT_REGEXP_DUMP
442static void regdump(char_u *, bt_regprog_T *);
443#endif
444
445static int re_num_cmp(long_u val, char_u *scan);
446
447#ifdef DEBUG
448static char_u *regprop(char_u *);
449
450static int regnarrate = 0;
451#endif
452
453
454/*
455 * Setup to parse the regexp. Used once to get the length and once to do it.
456 */
457 static void
458regcomp_start(
459 char_u *expr,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200460 int re_flags) // see vim_regcomp()
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200461{
462 initchr(expr);
463 if (re_flags & RE_MAGIC)
464 reg_magic = MAGIC_ON;
465 else
466 reg_magic = MAGIC_OFF;
467 reg_string = (re_flags & RE_STRING);
468 reg_strict = (re_flags & RE_STRICT);
469 get_cpo_flags();
470
471 num_complex_braces = 0;
472 regnpar = 1;
Bram Moolenaara80faa82020-04-12 19:37:17 +0200473 CLEAR_FIELD(had_endbrace);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200474#ifdef FEAT_SYN_HL
475 regnzpar = 1;
476 re_has_z = 0;
477#endif
478 regsize = 0L;
479 reg_toolong = FALSE;
480 regflags = 0;
481#if defined(FEAT_SYN_HL) || defined(PROTO)
482 had_eol = FALSE;
483#endif
484}
485
486/*
487 * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for
488 * character "c".
489 */
490 static int
491use_multibytecode(int c)
492{
493 return has_mbyte && (*mb_char2len)(c) > 1
494 && (re_multi_type(peekchr()) != NOT_MULTI
495 || (enc_utf8 && utf_iscomposing(c)));
496}
497
498/*
499 * Emit (if appropriate) a byte of code
500 */
501 static void
502regc(int b)
503{
504 if (regcode == JUST_CALC_SIZE)
505 regsize++;
506 else
507 *regcode++ = b;
508}
509
510/*
511 * Emit (if appropriate) a multi-byte character of code
512 */
513 static void
514regmbc(int c)
515{
516 if (!has_mbyte && c > 0xff)
517 return;
518 if (regcode == JUST_CALC_SIZE)
519 regsize += (*mb_char2len)(c);
520 else
521 regcode += (*mb_char2bytes)(c, regcode);
522}
523
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200524
525/*
526 * Produce the bytes for equivalence class "c".
527 * Currently only handles latin1, latin9 and utf-8.
528 * NOTE: When changing this function, also change nfa_emit_equi_class()
529 */
530 static void
531reg_equi_class(int c)
532{
533 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
534 || STRCMP(p_enc, "iso-8859-15") == 0)
535 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200536 switch (c)
537 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200538 // Do not use '\300' style, it results in a negative number.
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200539 case 'A': case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc4:
540 case 0xc5: case 0x100: case 0x102: case 0x104: case 0x1cd:
541 case 0x1de: case 0x1e0: case 0x1fa: case 0x202: case 0x226:
542 case 0x23a: case 0x1e00: case 0x1ea0: case 0x1ea2: case 0x1ea4:
543 case 0x1ea6: case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
544 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
545 regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2);
546 regmbc(0xc3); regmbc(0xc4); regmbc(0xc5);
547 regmbc(0x100); regmbc(0x102); regmbc(0x104);
548 regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0);
549 regmbc(0x1fa); regmbc(0x202); regmbc(0x226);
550 regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0);
551 regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6);
552 regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac);
553 regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2);
554 regmbc(0x1eb4); regmbc(0x1eb6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200555 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200556 case 'B': case 0x181: case 0x243: case 0x1e02:
557 case 0x1e04: case 0x1e06:
558 regmbc('B');
559 regmbc(0x181); regmbc(0x243); regmbc(0x1e02);
560 regmbc(0x1e04); regmbc(0x1e06);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200561 return;
562 case 'C': case 0xc7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200563 case 0x106: case 0x108: case 0x10a: case 0x10c: case 0x187:
564 case 0x23b: case 0x1e08: case 0xa792:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200565 regmbc('C'); regmbc(0xc7);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200566 regmbc(0x106); regmbc(0x108); regmbc(0x10a);
567 regmbc(0x10c); regmbc(0x187); regmbc(0x23b);
568 regmbc(0x1e08); regmbc(0xa792);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200569 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200570 case 'D': case 0x10e: case 0x110: case 0x18a:
571 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
572 case 0x1e12:
573 regmbc('D'); regmbc(0x10e); regmbc(0x110);
574 regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c);
575 regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200576 return;
577 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200578 case 0x112: case 0x114: case 0x116: case 0x118: case 0x11a:
579 case 0x204: case 0x206: case 0x228: case 0x246: case 0x1e14:
580 case 0x1e16: case 0x1e18: case 0x1e1a: case 0x1e1c:
581 case 0x1eb8: case 0x1eba: case 0x1ebc: case 0x1ebe:
582 case 0x1ec0: case 0x1ec2: case 0x1ec4: case 0x1ec6:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200583 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200584 regmbc(0xca); regmbc(0xcb); regmbc(0x112);
585 regmbc(0x114); regmbc(0x116); regmbc(0x118);
586 regmbc(0x11a); regmbc(0x204); regmbc(0x206);
587 regmbc(0x228); regmbc(0x246); regmbc(0x1e14);
588 regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a);
589 regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba);
590 regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0);
591 regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200592 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200593 case 'F': case 0x191: case 0x1e1e: case 0xa798:
594 regmbc('F'); regmbc(0x191); regmbc(0x1e1e);
595 regmbc(0xa798);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200596 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200597 case 'G': case 0x11c: case 0x11e: case 0x120:
598 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
599 case 0x1f4: case 0x1e20: case 0xa7a0:
600 regmbc('G'); regmbc(0x11c); regmbc(0x11e);
601 regmbc(0x120); regmbc(0x122); regmbc(0x193);
602 regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4);
603 regmbc(0x1e20); regmbc(0xa7a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200604 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200605 case 'H': case 0x124: case 0x126: case 0x21e:
606 case 0x1e22: case 0x1e24: case 0x1e26:
607 case 0x1e28: case 0x1e2a: case 0x2c67:
608 regmbc('H'); regmbc(0x124); regmbc(0x126);
609 regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24);
610 regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a);
611 regmbc(0x2c67);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200612 return;
613 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200614 case 0x128: case 0x12a: case 0x12c: case 0x12e:
615 case 0x130: case 0x197: case 0x1cf: case 0x208:
616 case 0x20a: case 0x1e2c: case 0x1e2e: case 0x1ec8:
617 case 0x1eca:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200618 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200619 regmbc(0xce); regmbc(0xcf); regmbc(0x128);
620 regmbc(0x12a); regmbc(0x12c); regmbc(0x12e);
621 regmbc(0x130); regmbc(0x197); regmbc(0x1cf);
622 regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c);
623 regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200624 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200625 case 'J': case 0x134: case 0x248:
626 regmbc('J'); regmbc(0x134); regmbc(0x248);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200627 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200628 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
629 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
630 regmbc('K'); regmbc(0x136); regmbc(0x198);
631 regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32);
632 regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200633 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200634 case 'L': case 0x139: case 0x13b: case 0x13d: case 0x13f:
635 case 0x141: case 0x23d: case 0x1e36: case 0x1e38:
636 case 0x1e3a: case 0x1e3c: case 0x2c60:
637 regmbc('L'); regmbc(0x139); regmbc(0x13b);
638 regmbc(0x13d); regmbc(0x13f); regmbc(0x141);
639 regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38);
640 regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200641 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200642 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
643 regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40);
644 regmbc(0x1e42);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200645 return;
646 case 'N': case 0xd1:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200647 case 0x143: case 0x145: case 0x147: case 0x1f8:
648 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
649 case 0xa7a4:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200650 regmbc('N'); regmbc(0xd1);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200651 regmbc(0x143); regmbc(0x145); regmbc(0x147);
652 regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46);
653 regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200654 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200655 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd6:
656 case 0xd8: case 0x14c: case 0x14e: case 0x150: case 0x19f:
657 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec: case 0x1fe:
658 case 0x20c: case 0x20e: case 0x22a: case 0x22c: case 0x22e:
659 case 0x230: case 0x1e4c: case 0x1e4e: case 0x1e50: case 0x1e52:
660 case 0x1ecc: case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
661 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc: case 0x1ede:
662 case 0x1ee0: case 0x1ee2:
663 regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4);
664 regmbc(0xd5); regmbc(0xd6); regmbc(0xd8);
665 regmbc(0x14c); regmbc(0x14e); regmbc(0x150);
666 regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1);
667 regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe);
668 regmbc(0x20c); regmbc(0x20e); regmbc(0x22a);
669 regmbc(0x22c); regmbc(0x22e); regmbc(0x230);
670 regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50);
671 regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece);
672 regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4);
673 regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda);
674 regmbc(0x1edc); regmbc(0x1ede); regmbc(0x1ee0);
675 regmbc(0x1ee2);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200676 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200677 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
678 regmbc('P'); regmbc(0x1a4); regmbc(0x1e54);
679 regmbc(0x1e56); regmbc(0x2c63);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200680 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200681 case 'Q': case 0x24a:
682 regmbc('Q'); regmbc(0x24a);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200683 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200684 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
685 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
686 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
687 regmbc('R'); regmbc(0x154); regmbc(0x156);
688 regmbc(0x210); regmbc(0x212); regmbc(0x158);
689 regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a);
690 regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64);
691 regmbc(0xa7a6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200692 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200693 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
694 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
695 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
696 regmbc('S'); regmbc(0x15a); regmbc(0x15c);
697 regmbc(0x15e); regmbc(0x160); regmbc(0x218);
698 regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64);
699 regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e);
700 regmbc(0xa7a8);
701 return;
702 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
703 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
704 case 0x1e6e: case 0x1e70:
705 regmbc('T'); regmbc(0x162); regmbc(0x164);
706 regmbc(0x166); regmbc(0x1ac); regmbc(0x23e);
707 regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a);
708 regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200709 return;
710 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200711 case 0x168: case 0x16a: case 0x16c: case 0x16e:
712 case 0x170: case 0x172: case 0x1af: case 0x1d3:
713 case 0x1d5: case 0x1d7: case 0x1d9: case 0x1db:
714 case 0x214: case 0x216: case 0x244: case 0x1e72:
715 case 0x1e74: case 0x1e76: case 0x1e78: case 0x1e7a:
716 case 0x1ee4: case 0x1ee6: case 0x1ee8: case 0x1eea:
717 case 0x1eec: case 0x1eee: case 0x1ef0:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200718 regmbc('U'); regmbc(0xd9); regmbc(0xda);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200719 regmbc(0xdb); regmbc(0xdc); regmbc(0x168);
720 regmbc(0x16a); regmbc(0x16c); regmbc(0x16e);
721 regmbc(0x170); regmbc(0x172); regmbc(0x1af);
722 regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7);
723 regmbc(0x1d9); regmbc(0x1db); regmbc(0x214);
724 regmbc(0x216); regmbc(0x244); regmbc(0x1e72);
725 regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78);
726 regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6);
727 regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec);
728 regmbc(0x1eee); regmbc(0x1ef0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200729 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200730 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
731 regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c);
732 regmbc(0x1e7e);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200733 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200734 case 'W': case 0x174: case 0x1e80: case 0x1e82:
735 case 0x1e84: case 0x1e86: case 0x1e88:
736 regmbc('W'); regmbc(0x174); regmbc(0x1e80);
737 regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86);
738 regmbc(0x1e88);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200739 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200740 case 'X': case 0x1e8a: case 0x1e8c:
741 regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200742 return;
743 case 'Y': case 0xdd:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200744 case 0x176: case 0x178: case 0x1b3: case 0x232: case 0x24e:
745 case 0x1e8e: case 0x1ef2: case 0x1ef6: case 0x1ef4: case 0x1ef8:
746 regmbc('Y'); regmbc(0xdd); regmbc(0x176);
747 regmbc(0x178); regmbc(0x1b3); regmbc(0x232);
748 regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2);
749 regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200750 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200751 case 'Z': case 0x179: case 0x17b: case 0x17d: case 0x1b5:
752 case 0x1e90: case 0x1e92: case 0x1e94: case 0x2c6b:
753 regmbc('Z'); regmbc(0x179); regmbc(0x17b);
754 regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90);
755 regmbc(0x1e92); regmbc(0x1e94); regmbc(0x2c6b);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200756 return;
757 case 'a': case 0xe0: case 0xe1: case 0xe2:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200758 case 0xe3: case 0xe4: case 0xe5: case 0x101: case 0x103:
759 case 0x105: case 0x1ce: case 0x1df: case 0x1e1: case 0x1fb:
760 case 0x201: case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
761 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
762 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
763 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
764 case 0x1eb7: case 0x2c65:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200765 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
766 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200767 regmbc(0xe5); regmbc(0x101); regmbc(0x103);
768 regmbc(0x105); regmbc(0x1ce); regmbc(0x1df);
769 regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201);
770 regmbc(0x203); regmbc(0x227); regmbc(0x1d8f);
771 regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1);
772 regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7);
773 regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead);
774 regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3);
775 regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200776 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200777 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
778 case 0x1e03: case 0x1e05: case 0x1e07:
779 regmbc('b');
780 regmbc(0x180); regmbc(0x253); regmbc(0x1d6c);
781 regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05);
782 regmbc(0x1e07);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200783 return;
784 case 'c': case 0xe7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200785 case 0x107: case 0x109: case 0x10b: case 0x10d: case 0x188:
786 case 0x23c: case 0x1e09: case 0xa793: case 0xa794:
787 regmbc('c'); regmbc(0xe7); regmbc(0x107);
788 regmbc(0x109); regmbc(0x10b); regmbc(0x10d);
789 regmbc(0x188); regmbc(0x23c); regmbc(0x1e09);
790 regmbc(0xa793); regmbc(0xa794);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200791 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200792 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
793 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d:
794 case 0x1e0f: case 0x1e11: case 0x1e13:
795 regmbc('d'); regmbc(0x10f); regmbc(0x111);
796 regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81);
797 regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d);
798 regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200799 return;
800 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200801 case 0x113: case 0x115: case 0x117: case 0x119:
802 case 0x11b: case 0x205: case 0x207: case 0x229:
803 case 0x247: case 0x1d92: case 0x1e15: case 0x1e17:
804 case 0x1e19: case 0x1e1b: case 0x1eb9: case 0x1ebb:
805 case 0x1e1d: case 0x1ebd: case 0x1ebf: case 0x1ec1:
806 case 0x1ec3: case 0x1ec5: case 0x1ec7:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200807 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200808 regmbc(0xea); regmbc(0xeb); regmbc(0x113);
809 regmbc(0x115); regmbc(0x117); regmbc(0x119);
810 regmbc(0x11b); regmbc(0x205); regmbc(0x207);
811 regmbc(0x229); regmbc(0x247); regmbc(0x1d92);
812 regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19);
813 regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9);
814 regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf);
815 regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5);
816 regmbc(0x1ec7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200817 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200818 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
819 case 0x1e1f: case 0xa799:
820 regmbc('f'); regmbc(0x192); regmbc(0x1d6e);
821 regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799);
822 return;
823 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
824 case 0x1e5: case 0x1e7: case 0x260: case 0x1f5: case 0x1d83:
825 case 0x1e21: case 0xa7a1:
826 regmbc('g'); regmbc(0x11d); regmbc(0x11f);
827 regmbc(0x121); regmbc(0x123); regmbc(0x1e5);
828 regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260);
829 regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200830 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200831 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
832 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
833 case 0x1e96: case 0x2c68: case 0xa795:
834 regmbc('h'); regmbc(0x125); regmbc(0x127);
835 regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25);
836 regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b);
837 regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200838 return;
839 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200840 case 0x129: case 0x12b: case 0x12d: case 0x12f:
841 case 0x1d0: case 0x209: case 0x20b: case 0x268:
842 case 0x1d96: case 0x1e2d: case 0x1e2f: case 0x1ec9:
843 case 0x1ecb:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200844 regmbc('i'); regmbc(0xec); regmbc(0xed);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200845 regmbc(0xee); regmbc(0xef); regmbc(0x129);
846 regmbc(0x12b); regmbc(0x12d); regmbc(0x12f);
847 regmbc(0x1d0); regmbc(0x209); regmbc(0x20b);
848 regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d);
849 regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200850 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200851 case 'j': case 0x135: case 0x1f0: case 0x249:
852 regmbc('j'); regmbc(0x135); regmbc(0x1f0);
853 regmbc(0x249);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200854 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200855 case 'k': case 0x137: case 0x199: case 0x1e9:
856 case 0x1d84: case 0x1e31: case 0x1e33: case 0x1e35:
857 case 0x2c6a: case 0xa741:
858 regmbc('k'); regmbc(0x137); regmbc(0x199);
859 regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31);
860 regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a);
861 regmbc(0xa741);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200862 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200863 case 'l': case 0x13a: case 0x13c: case 0x13e:
864 case 0x140: case 0x142: case 0x19a: case 0x1e37:
865 case 0x1e39: case 0x1e3b: case 0x1e3d: case 0x2c61:
866 regmbc('l'); regmbc(0x13a); regmbc(0x13c);
867 regmbc(0x13e); regmbc(0x140); regmbc(0x142);
868 regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39);
869 regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200870 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200871 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
872 regmbc('m'); regmbc(0x1d6f); regmbc(0x1e3f);
873 regmbc(0x1e41); regmbc(0x1e43);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200874 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200875 case 'n': case 0xf1: case 0x144: case 0x146: case 0x148:
876 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87:
877 case 0x1e45: case 0x1e47: case 0x1e49: case 0x1e4b:
878 case 0xa7a5:
879 regmbc('n'); regmbc(0xf1); regmbc(0x144);
880 regmbc(0x146); regmbc(0x148); regmbc(0x149);
881 regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87);
882 regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49);
883 regmbc(0x1e4b); regmbc(0xa7a5);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200884 return;
885 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200886 case 0xf6: case 0xf8: case 0x14d: case 0x14f: case 0x151:
887 case 0x1a1: case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
888 case 0x20d: case 0x20f: case 0x22b: case 0x22d: case 0x22f:
889 case 0x231: case 0x275: case 0x1e4d: case 0x1e4f:
890 case 0x1e51: case 0x1e53: case 0x1ecd: case 0x1ecf:
891 case 0x1ed1: case 0x1ed3: case 0x1ed5: case 0x1ed7:
892 case 0x1ed9: case 0x1edb: case 0x1edd: case 0x1edf:
893 case 0x1ee1: case 0x1ee3:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200894 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
895 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200896 regmbc(0xf8); regmbc(0x14d); regmbc(0x14f);
897 regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2);
898 regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff);
899 regmbc(0x20d); regmbc(0x20f); regmbc(0x22b);
900 regmbc(0x22d); regmbc(0x22f); regmbc(0x231);
901 regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f);
902 regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd);
903 regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3);
904 regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9);
905 regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf);
906 regmbc(0x1ee1); regmbc(0x1ee3);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200907 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200908 case 'p': case 0x1a5: case 0x1d71: case 0x1d88: case 0x1d7d:
909 case 0x1e55: case 0x1e57:
910 regmbc('p'); regmbc(0x1a5); regmbc(0x1d71);
911 regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55);
912 regmbc(0x1e57);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200913 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200914 case 'q': case 0x24b: case 0x2a0:
915 regmbc('q'); regmbc(0x24b); regmbc(0x2a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200916 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200917 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
918 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
919 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
920 case 0xa7a7:
921 regmbc('r'); regmbc(0x155); regmbc(0x157);
922 regmbc(0x159); regmbc(0x211); regmbc(0x213);
923 regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73);
924 regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d);
925 regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f);
926 regmbc(0xa7a7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200927 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200928 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
929 case 0x1e61: case 0x219: case 0x23f: case 0x1d74: case 0x1d8a:
930 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
931 regmbc('s'); regmbc(0x15b); regmbc(0x15d);
932 regmbc(0x15f); regmbc(0x161); regmbc(0x23f);
933 regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a);
934 regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65);
935 regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9);
936 return;
937 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
938 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
939 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
940 regmbc('t'); regmbc(0x163); regmbc(0x165);
941 regmbc(0x167); regmbc(0x1ab); regmbc(0x21b);
942 regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75);
943 regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f);
944 regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200945 return;
946 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200947 case 0x169: case 0x16b: case 0x16d: case 0x16f:
948 case 0x171: case 0x173: case 0x1b0: case 0x1d4:
949 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc:
950 case 0x215: case 0x217: case 0x289: case 0x1e73:
951 case 0x1d7e: case 0x1d99: case 0x1e75: case 0x1e77:
952 case 0x1e79: case 0x1e7b: case 0x1ee5: case 0x1ee7:
953 case 0x1ee9: case 0x1eeb: case 0x1eed: case 0x1eef:
954 case 0x1ef1:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200955 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200956 regmbc(0xfb); regmbc(0xfc); regmbc(0x169);
957 regmbc(0x16b); regmbc(0x16d); regmbc(0x16f);
958 regmbc(0x171); regmbc(0x173); regmbc(0x1d6);
959 regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc);
960 regmbc(0x215); regmbc(0x217); regmbc(0x1b0);
961 regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e);
962 regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75);
963 regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b);
964 regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9);
965 regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef);
966 regmbc(0x1ef1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200967 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200968 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
969 regmbc('v'); regmbc(0x28b); regmbc(0x1d8c);
970 regmbc(0x1e7d); regmbc(0x1e7f);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200971 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200972 case 'w': case 0x175: case 0x1e81: case 0x1e83:
973 case 0x1e85: case 0x1e87: case 0x1e89: case 0x1e98:
974 regmbc('w'); regmbc(0x175); regmbc(0x1e81);
975 regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87);
976 regmbc(0x1e89); regmbc(0x1e98);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200977 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200978 case 'x': case 0x1e8b: case 0x1e8d:
979 regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200980 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200981 case 'y': case 0xfd: case 0xff: case 0x177: case 0x1b4:
982 case 0x233: case 0x24f: case 0x1e8f: case 0x1e99: case 0x1ef3:
983 case 0x1ef5: case 0x1ef7: case 0x1ef9:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200984 regmbc('y'); regmbc(0xfd); regmbc(0xff);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 regmbc(0x177); regmbc(0x1b4); regmbc(0x233);
986 regmbc(0x24f); regmbc(0x1e8f); regmbc(0x1e99);
987 regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7);
988 regmbc(0x1ef9);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200989 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200990 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
991 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
992 case 0x1e95: case 0x2c6c:
993 regmbc('z'); regmbc(0x17a); regmbc(0x17c);
994 regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76);
995 regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93);
996 regmbc(0x1e95); regmbc(0x2c6c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200997 return;
998 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200999 }
1000 regmbc(c);
1001}
1002
1003/*
1004 * Emit a node.
1005 * Return pointer to generated code.
1006 */
1007 static char_u *
1008regnode(int op)
1009{
1010 char_u *ret;
1011
1012 ret = regcode;
1013 if (ret == JUST_CALC_SIZE)
1014 regsize += 3;
1015 else
1016 {
1017 *regcode++ = op;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001018 *regcode++ = NUL; // Null "next" pointer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001019 *regcode++ = NUL;
1020 }
1021 return ret;
1022}
1023
1024/*
1025 * Write a long as four bytes at "p" and return pointer to the next char.
1026 */
1027 static char_u *
1028re_put_long(char_u *p, long_u val)
1029{
1030 *p++ = (char_u) ((val >> 24) & 0377);
1031 *p++ = (char_u) ((val >> 16) & 0377);
1032 *p++ = (char_u) ((val >> 8) & 0377);
1033 *p++ = (char_u) (val & 0377);
1034 return p;
1035}
1036
1037/*
1038 * regnext - dig the "next" pointer out of a node
1039 * Returns NULL when calculating size, when there is no next item and when
1040 * there is an error.
1041 */
1042 static char_u *
1043regnext(char_u *p)
1044{
1045 int offset;
1046
1047 if (p == JUST_CALC_SIZE || reg_toolong)
1048 return NULL;
1049
1050 offset = NEXT(p);
1051 if (offset == 0)
1052 return NULL;
1053
1054 if (OP(p) == BACK)
1055 return p - offset;
1056 else
1057 return p + offset;
1058}
1059
1060/*
1061 * Set the next-pointer at the end of a node chain.
1062 */
1063 static void
1064regtail(char_u *p, char_u *val)
1065{
1066 char_u *scan;
1067 char_u *temp;
1068 int offset;
1069
1070 if (p == JUST_CALC_SIZE)
1071 return;
1072
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001073 // Find last node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001074 scan = p;
1075 for (;;)
1076 {
1077 temp = regnext(scan);
1078 if (temp == NULL)
1079 break;
1080 scan = temp;
1081 }
1082
1083 if (OP(scan) == BACK)
1084 offset = (int)(scan - val);
1085 else
1086 offset = (int)(val - scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001087 // When the offset uses more than 16 bits it can no longer fit in the two
1088 // bytes available. Use a global flag to avoid having to check return
1089 // values in too many places.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001090 if (offset > 0xffff)
1091 reg_toolong = TRUE;
1092 else
1093 {
1094 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
1095 *(scan + 2) = (char_u) (offset & 0377);
1096 }
1097}
1098
1099/*
1100 * Like regtail, on item after a BRANCH; nop if none.
1101 */
1102 static void
1103regoptail(char_u *p, char_u *val)
1104{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001105 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001106 if (p == NULL || p == JUST_CALC_SIZE
1107 || (OP(p) != BRANCH
1108 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
1109 return;
1110 regtail(OPERAND(p), val);
1111}
1112
1113/*
1114 * Insert an operator in front of already-emitted operand
1115 *
1116 * Means relocating the operand.
1117 */
1118 static void
1119reginsert(int op, char_u *opnd)
1120{
1121 char_u *src;
1122 char_u *dst;
1123 char_u *place;
1124
1125 if (regcode == JUST_CALC_SIZE)
1126 {
1127 regsize += 3;
1128 return;
1129 }
1130 src = regcode;
1131 regcode += 3;
1132 dst = regcode;
1133 while (src > opnd)
1134 *--dst = *--src;
1135
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001136 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001137 *place++ = op;
1138 *place++ = NUL;
1139 *place = NUL;
1140}
1141
1142/*
1143 * Insert an operator in front of already-emitted operand.
1144 * Add a number to the operator.
1145 */
1146 static void
1147reginsert_nr(int op, long val, char_u *opnd)
1148{
1149 char_u *src;
1150 char_u *dst;
1151 char_u *place;
1152
1153 if (regcode == JUST_CALC_SIZE)
1154 {
1155 regsize += 7;
1156 return;
1157 }
1158 src = regcode;
1159 regcode += 7;
1160 dst = regcode;
1161 while (src > opnd)
1162 *--dst = *--src;
1163
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001164 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001165 *place++ = op;
1166 *place++ = NUL;
1167 *place++ = NUL;
1168 re_put_long(place, (long_u)val);
1169}
1170
1171/*
1172 * Insert an operator in front of already-emitted operand.
1173 * The operator has the given limit values as operands. Also set next pointer.
1174 *
1175 * Means relocating the operand.
1176 */
1177 static void
1178reginsert_limits(
1179 int op,
1180 long minval,
1181 long maxval,
1182 char_u *opnd)
1183{
1184 char_u *src;
1185 char_u *dst;
1186 char_u *place;
1187
1188 if (regcode == JUST_CALC_SIZE)
1189 {
1190 regsize += 11;
1191 return;
1192 }
1193 src = regcode;
1194 regcode += 11;
1195 dst = regcode;
1196 while (src > opnd)
1197 *--dst = *--src;
1198
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001199 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001200 *place++ = op;
1201 *place++ = NUL;
1202 *place++ = NUL;
1203 place = re_put_long(place, (long_u)minval);
1204 place = re_put_long(place, (long_u)maxval);
1205 regtail(opnd, place);
1206}
1207
1208/*
1209 * Return TRUE if the back reference is legal. We must have seen the close
1210 * brace.
1211 * TODO: Should also check that we don't refer to something that is repeated
1212 * (+*=): what instance of the repetition should we match?
1213 */
1214 static int
1215seen_endbrace(int refnum)
1216{
1217 if (!had_endbrace[refnum])
1218 {
1219 char_u *p;
1220
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001221 // Trick: check if "@<=" or "@<!" follows, in which case
1222 // the \1 can appear before the referenced match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001223 for (p = regparse; *p != NUL; ++p)
1224 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
1225 break;
1226 if (*p == NUL)
1227 {
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001228 emsg(_(e_illegal_back_reference));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001229 rc_did_emsg = TRUE;
1230 return FALSE;
1231 }
1232 }
1233 return TRUE;
1234}
1235
1236/*
1237 * Parse the lowest level.
1238 *
1239 * Optimization: gobbles an entire sequence of ordinary characters so that
1240 * it can turn them into a single node, which is smaller to store and
1241 * faster to run. Don't do this when one_exactly is set.
1242 */
1243 static char_u *
1244regatom(int *flagp)
1245{
1246 char_u *ret;
1247 int flags;
1248 int c;
1249 char_u *p;
1250 int extra = 0;
1251 int save_prev_at_start = prev_at_start;
1252
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001253 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001254
1255 c = getchr();
1256 switch (c)
1257 {
1258 case Magic('^'):
1259 ret = regnode(BOL);
1260 break;
1261
1262 case Magic('$'):
1263 ret = regnode(EOL);
1264#if defined(FEAT_SYN_HL) || defined(PROTO)
1265 had_eol = TRUE;
1266#endif
1267 break;
1268
1269 case Magic('<'):
1270 ret = regnode(BOW);
1271 break;
1272
1273 case Magic('>'):
1274 ret = regnode(EOW);
1275 break;
1276
1277 case Magic('_'):
1278 c = no_Magic(getchr());
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001279 if (c == '^') // "\_^" is start-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001280 {
1281 ret = regnode(BOL);
1282 break;
1283 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001284 if (c == '$') // "\_$" is end-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001285 {
1286 ret = regnode(EOL);
1287#if defined(FEAT_SYN_HL) || defined(PROTO)
1288 had_eol = TRUE;
1289#endif
1290 break;
1291 }
1292
1293 extra = ADD_NL;
1294 *flagp |= HASNL;
1295
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001296 // "\_[" is character range plus newline
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001297 if (c == '[')
1298 goto collection;
1299
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001300 // "\_x" is character class plus newline
1301 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001302
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001303 // Character classes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001304 case Magic('.'):
1305 case Magic('i'):
1306 case Magic('I'):
1307 case Magic('k'):
1308 case Magic('K'):
1309 case Magic('f'):
1310 case Magic('F'):
1311 case Magic('p'):
1312 case Magic('P'):
1313 case Magic('s'):
1314 case Magic('S'):
1315 case Magic('d'):
1316 case Magic('D'):
1317 case Magic('x'):
1318 case Magic('X'):
1319 case Magic('o'):
1320 case Magic('O'):
1321 case Magic('w'):
1322 case Magic('W'):
1323 case Magic('h'):
1324 case Magic('H'):
1325 case Magic('a'):
1326 case Magic('A'):
1327 case Magic('l'):
1328 case Magic('L'):
1329 case Magic('u'):
1330 case Magic('U'):
1331 p = vim_strchr(classchars, no_Magic(c));
1332 if (p == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001333 EMSG_RET_NULL(_(e_invalid_use_of_underscore));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001334
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001335 // When '.' is followed by a composing char ignore the dot, so that
1336 // the composing char is matched here.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001337 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1338 {
1339 c = getchr();
1340 goto do_multibyte;
1341 }
1342 ret = regnode(classcodes[p - classchars] + extra);
1343 *flagp |= HASWIDTH | SIMPLE;
1344 break;
1345
1346 case Magic('n'):
1347 if (reg_string)
1348 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001349 // In a string "\n" matches a newline character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001350 ret = regnode(EXACTLY);
1351 regc(NL);
1352 regc(NUL);
1353 *flagp |= HASWIDTH | SIMPLE;
1354 }
1355 else
1356 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001357 // In buffer text "\n" matches the end of a line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001358 ret = regnode(NEWL);
1359 *flagp |= HASWIDTH | HASNL;
1360 }
1361 break;
1362
1363 case Magic('('):
1364 if (one_exactly)
1365 EMSG_ONE_RET_NULL;
1366 ret = reg(REG_PAREN, &flags);
1367 if (ret == NULL)
1368 return NULL;
1369 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1370 break;
1371
1372 case NUL:
1373 case Magic('|'):
1374 case Magic('&'):
1375 case Magic(')'):
1376 if (one_exactly)
1377 EMSG_ONE_RET_NULL;
Bram Moolenaard0819d12021-12-31 23:15:53 +00001378 // Supposed to be caught earlier.
RestorerZ68ebcee2023-05-31 17:12:14 +01001379 IEMSG_RET_NULL(e_internal_error_in_regexp);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001380 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001381
1382 case Magic('='):
1383 case Magic('?'):
1384 case Magic('+'):
1385 case Magic('@'):
1386 case Magic('{'):
1387 case Magic('*'):
1388 c = no_Magic(c);
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001389 EMSG3_RET_NULL(_(e_str_chr_follows_nothing),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001390 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001391 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001392
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001393 case Magic('~'): // previous substitute pattern
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001394 if (reg_prev_sub != NULL)
1395 {
1396 char_u *lp;
1397
1398 ret = regnode(EXACTLY);
1399 lp = reg_prev_sub;
1400 while (*lp != NUL)
1401 regc(*lp++);
1402 regc(NUL);
1403 if (*reg_prev_sub != NUL)
1404 {
1405 *flagp |= HASWIDTH;
1406 if ((lp - reg_prev_sub) == 1)
1407 *flagp |= SIMPLE;
1408 }
1409 }
1410 else
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001411 EMSG_RET_NULL(_(e_no_previous_substitute_regular_expression));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001412 break;
1413
1414 case Magic('1'):
1415 case Magic('2'):
1416 case Magic('3'):
1417 case Magic('4'):
1418 case Magic('5'):
1419 case Magic('6'):
1420 case Magic('7'):
1421 case Magic('8'):
1422 case Magic('9'):
1423 {
1424 int refnum;
1425
1426 refnum = c - Magic('0');
1427 if (!seen_endbrace(refnum))
1428 return NULL;
1429 ret = regnode(BACKREF + refnum);
1430 }
1431 break;
1432
1433 case Magic('z'):
1434 {
1435 c = no_Magic(getchr());
1436 switch (c)
1437 {
1438#ifdef FEAT_SYN_HL
1439 case '(': if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001440 EMSG_RET_NULL(_(e_z_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001441 if (one_exactly)
1442 EMSG_ONE_RET_NULL;
1443 ret = reg(REG_ZPAREN, &flags);
1444 if (ret == NULL)
1445 return NULL;
1446 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1447 re_has_z = REX_SET;
1448 break;
1449
1450 case '1':
1451 case '2':
1452 case '3':
1453 case '4':
1454 case '5':
1455 case '6':
1456 case '7':
1457 case '8':
1458 case '9': if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001459 EMSG_RET_NULL(_(e_z1_z9_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001460 ret = regnode(ZREF + c - '0');
1461 re_has_z = REX_USE;
1462 break;
1463#endif
1464
1465 case 's': ret = regnode(MOPEN + 0);
1466 if (re_mult_next("\\zs") == FAIL)
1467 return NULL;
1468 break;
1469
1470 case 'e': ret = regnode(MCLOSE + 0);
1471 if (re_mult_next("\\ze") == FAIL)
1472 return NULL;
1473 break;
1474
Bram Moolenaarb2810f12022-01-08 21:38:52 +00001475 default: EMSG_RET_NULL(_(e_invalid_character_after_bsl_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001476 }
1477 }
1478 break;
1479
1480 case Magic('%'):
1481 {
1482 c = no_Magic(getchr());
1483 switch (c)
1484 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001485 // () without a back reference
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001486 case '(':
1487 if (one_exactly)
1488 EMSG_ONE_RET_NULL;
1489 ret = reg(REG_NPAREN, &flags);
1490 if (ret == NULL)
1491 return NULL;
1492 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1493 break;
1494
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001495 // Catch \%^ and \%$ regardless of where they appear in the
1496 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001497 case '^':
1498 ret = regnode(RE_BOF);
1499 break;
1500
1501 case '$':
1502 ret = regnode(RE_EOF);
1503 break;
1504
1505 case '#':
Christian Brabandt360da402022-05-18 15:04:02 +01001506 if (regparse[0] == '=' && regparse[1] >= 48
1507 && regparse[1] <= 50)
1508 {
1509 // misplaced \%#=1
1510 semsg(_(e_atom_engine_must_be_at_start_of_pattern),
1511 regparse[1]);
1512 return FAIL;
1513 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001514 ret = regnode(CURSOR);
1515 break;
1516
1517 case 'V':
1518 ret = regnode(RE_VISUAL);
1519 break;
1520
1521 case 'C':
1522 ret = regnode(RE_COMPOSING);
1523 break;
1524
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001525 // \%[abc]: Emit as a list of branches, all ending at the last
1526 // branch which matches nothing.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001527 case '[':
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001528 if (one_exactly) // doesn't nest
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001529 EMSG_ONE_RET_NULL;
1530 {
1531 char_u *lastbranch;
1532 char_u *lastnode = NULL;
1533 char_u *br;
1534
1535 ret = NULL;
1536 while ((c = getchr()) != ']')
1537 {
1538 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001539 EMSG2_RET_NULL(_(e_missing_sb_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001540 reg_magic == MAGIC_ALL);
1541 br = regnode(BRANCH);
1542 if (ret == NULL)
1543 ret = br;
1544 else
1545 {
1546 regtail(lastnode, br);
1547 if (reg_toolong)
1548 return NULL;
1549 }
1550
1551 ungetchr();
1552 one_exactly = TRUE;
1553 lastnode = regatom(flagp);
1554 one_exactly = FALSE;
1555 if (lastnode == NULL)
1556 return NULL;
1557 }
1558 if (ret == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001559 EMSG2_RET_NULL(_(e_empty_str_brackets),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001560 reg_magic == MAGIC_ALL);
1561 lastbranch = regnode(BRANCH);
1562 br = regnode(NOTHING);
1563 if (ret != JUST_CALC_SIZE)
1564 {
1565 regtail(lastnode, br);
1566 regtail(lastbranch, br);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001567 // connect all branches to the NOTHING
1568 // branch at the end
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001569 for (br = ret; br != lastnode; )
1570 {
1571 if (OP(br) == BRANCH)
1572 {
1573 regtail(br, lastbranch);
1574 if (reg_toolong)
1575 return NULL;
1576 br = OPERAND(br);
1577 }
1578 else
1579 br = regnext(br);
1580 }
1581 }
1582 *flagp &= ~(HASWIDTH | SIMPLE);
1583 break;
1584 }
1585
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001586 case 'd': // %d123 decimal
1587 case 'o': // %o123 octal
1588 case 'x': // %xab hex 2
1589 case 'u': // %uabcd hex 4
1590 case 'U': // %U1234abcd hex 8
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001591 {
Christian Brabandtf2b16982025-03-29 09:08:58 +01001592 vimlong_T i;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001593
1594 switch (c)
1595 {
1596 case 'd': i = getdecchrs(); break;
1597 case 'o': i = getoctchrs(); break;
1598 case 'x': i = gethexchrs(2); break;
1599 case 'u': i = gethexchrs(4); break;
1600 case 'U': i = gethexchrs(8); break;
1601 default: i = -1; break;
1602 }
1603
1604 if (i < 0 || i > INT_MAX)
1605 EMSG2_RET_NULL(
Bram Moolenaara6f79292022-01-04 21:30:47 +00001606 _(e_invalid_character_after_str_2),
1607 reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001608 if (use_multibytecode(i))
1609 ret = regnode(MULTIBYTECODE);
1610 else
1611 ret = regnode(EXACTLY);
1612 if (i == 0)
1613 regc(0x0a);
1614 else
Christian Brabandtf2b16982025-03-29 09:08:58 +01001615 regmbc((int)i);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001616 regc(NUL);
1617 *flagp |= HASWIDTH;
1618 break;
1619 }
1620
1621 default:
1622 if (VIM_ISDIGIT(c) || c == '<' || c == '>'
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001623 || c == '\'' || c == '.')
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001624 {
1625 long_u n = 0;
1626 int cmp;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001627 int cur = FALSE;
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001628 int got_digit = FALSE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001629
1630 cmp = c;
1631 if (cmp == '<' || cmp == '>')
1632 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001633 if (no_Magic(c) == '.')
1634 {
1635 cur = TRUE;
1636 c = getchr();
1637 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001638 while (VIM_ISDIGIT(c))
1639 {
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001640 got_digit = TRUE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001641 n = n * 10 + (c - '0');
1642 c = getchr();
1643 }
Julio B46fa3c72024-03-28 10:23:37 +01001644 if (no_Magic(c) == '\'' && n == 0)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001645 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001646 // "\%'m", "\%<'m" and "\%>'m": Mark
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001647 c = getchr();
1648 ret = regnode(RE_MARK);
1649 if (ret == JUST_CALC_SIZE)
1650 regsize += 2;
1651 else
1652 {
1653 *regcode++ = c;
1654 *regcode++ = cmp;
1655 }
1656 break;
1657 }
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001658 else if ((c == 'l' || c == 'c' || c == 'v')
1659 && (cur || got_digit))
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001660 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001661 if (cur && n)
1662 {
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001663 semsg(_(e_regexp_number_after_dot_pos_search_chr),
1664 no_Magic(c));
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001665 rc_did_emsg = TRUE;
1666 return NULL;
1667 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001668 if (c == 'l')
1669 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001670 if (cur)
1671 n = curwin->w_cursor.lnum;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001672 ret = regnode(RE_LNUM);
1673 if (save_prev_at_start)
1674 at_start = TRUE;
1675 }
1676 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001677 {
1678 if (cur)
1679 {
1680 n = curwin->w_cursor.col;
1681 n++;
1682 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001683 ret = regnode(RE_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001684 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001685 else
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001686 {
1687 if (cur)
1688 {
1689 colnr_T vcol = 0;
1690
1691 getvvcol(curwin, &curwin->w_cursor,
1692 NULL, NULL, &vcol);
1693 ++vcol;
1694 n = vcol;
1695 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001696 ret = regnode(RE_VCOL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001697 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001698 if (ret == JUST_CALC_SIZE)
1699 regsize += 5;
1700 else
1701 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001702 // put the number and the optional
1703 // comparator after the opcode
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001704 regcode = re_put_long(regcode, n);
1705 *regcode++ = cmp;
1706 }
1707 break;
1708 }
1709 }
1710
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001711 EMSG2_RET_NULL(_(e_invalid_character_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001712 reg_magic == MAGIC_ALL);
1713 }
1714 }
1715 break;
1716
1717 case Magic('['):
1718collection:
1719 {
1720 char_u *lp;
1721
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001722 // If there is no matching ']', we assume the '[' is a normal
1723 // character. This makes 'incsearch' and ":help [" work.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001724 lp = skip_anyof(regparse);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001725 if (*lp == ']') // there is a matching ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001726 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001727 int startc = -1; // > 0 when next '-' is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001728 int endc;
1729
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001730 // In a character class, different parsing rules apply.
1731 // Not even \ is special anymore, nothing is.
1732 if (*regparse == '^') // Complement of range.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001733 {
1734 ret = regnode(ANYBUT + extra);
1735 regparse++;
1736 }
1737 else
1738 ret = regnode(ANYOF + extra);
1739
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001740 // At the start ']' and '-' mean the literal character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001741 if (*regparse == ']' || *regparse == '-')
1742 {
1743 startc = *regparse;
1744 regc(*regparse++);
1745 }
1746
1747 while (*regparse != NUL && *regparse != ']')
1748 {
1749 if (*regparse == '-')
1750 {
1751 ++regparse;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001752 // The '-' is not used for a range at the end and
1753 // after or before a '\n'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001754 if (*regparse == ']' || *regparse == NUL
1755 || startc == -1
1756 || (regparse[0] == '\\' && regparse[1] == 'n'))
1757 {
1758 regc('-');
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001759 startc = '-'; // [--x] is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001760 }
1761 else
1762 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001763 // Also accept "a-[.z.]"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001764 endc = 0;
1765 if (*regparse == '[')
1766 endc = get_coll_element(&regparse);
1767 if (endc == 0)
1768 {
1769 if (has_mbyte)
1770 endc = mb_ptr2char_adv(&regparse);
1771 else
1772 endc = *regparse++;
1773 }
1774
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001775 // Handle \o40, \x20 and \u20AC style sequences
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001776 if (endc == '\\' && !reg_cpo_lit && !reg_cpo_bsl)
1777 endc = coll_get_char();
1778
1779 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001780 EMSG_RET_NULL(_(e_reverse_range_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001781 if (has_mbyte && ((*mb_char2len)(startc) > 1
1782 || (*mb_char2len)(endc) > 1))
1783 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001784 // Limit to a range of 256 chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001785 if (endc > startc + 256)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001786 EMSG_RET_NULL(_(e_range_too_large_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001787 while (++startc <= endc)
1788 regmbc(startc);
1789 }
1790 else
1791 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001792 while (++startc <= endc)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00001793 regc(startc);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001794 }
1795 startc = -1;
1796 }
1797 }
		    // Only "\]", "\^", "\-" and "\\" are special in Vi.  Vim
1799 // accepts "\t", "\e", etc., but only when the 'l' flag in
1800 // 'cpoptions' is not included.
1801 // Posix doesn't recognize backslash at all.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001802 else if (*regparse == '\\'
1803 && !reg_cpo_bsl
1804 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1805 || (!reg_cpo_lit
1806 && vim_strchr(REGEXP_ABBR,
1807 regparse[1]) != NULL)))
1808 {
1809 regparse++;
1810 if (*regparse == 'n')
1811 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001812 // '\n' in range: also match NL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001813 if (ret != JUST_CALC_SIZE)
1814 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001815 // Using \n inside [^] does not change what
1816 // matches. "[^\n]" is the same as ".".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001817 if (*ret == ANYOF)
1818 {
1819 *ret = ANYOF + ADD_NL;
1820 *flagp |= HASNL;
1821 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001822 // else: must have had a \n already
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001823 }
1824 regparse++;
1825 startc = -1;
1826 }
1827 else if (*regparse == 'd'
1828 || *regparse == 'o'
1829 || *regparse == 'x'
1830 || *regparse == 'u'
1831 || *regparse == 'U')
1832 {
1833 startc = coll_get_char();
Christian Brabandtf2b16982025-03-29 09:08:58 +01001834 // max UTF-8 Codepoint is U+10FFFF,
1835 // but allow values until INT_MAX
1836 if (startc == INT_MAX)
1837 EMSG_RET_NULL(_(e_unicode_val_too_large));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001838 if (startc == 0)
1839 regc(0x0a);
1840 else
1841 regmbc(startc);
1842 }
1843 else
1844 {
1845 startc = backslash_trans(*regparse++);
1846 regc(startc);
1847 }
1848 }
1849 else if (*regparse == '[')
1850 {
1851 int c_class;
1852 int cu;
1853
1854 c_class = get_char_class(&regparse);
1855 startc = -1;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001856 // Characters assumed to be 8 bits!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001857 switch (c_class)
1858 {
1859 case CLASS_NONE:
1860 c_class = get_equi_class(&regparse);
1861 if (c_class != 0)
1862 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001863 // produce equivalence class
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001864 reg_equi_class(c_class);
1865 }
1866 else if ((c_class =
1867 get_coll_element(&regparse)) != 0)
1868 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001869 // produce a collating element
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001870 regmbc(c_class);
1871 }
1872 else
1873 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001874 // literal '[', allow [[-x] as a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001875 startc = *regparse++;
1876 regc(startc);
1877 }
1878 break;
1879 case CLASS_ALNUM:
1880 for (cu = 1; cu < 128; cu++)
1881 if (isalnum(cu))
1882 regmbc(cu);
1883 break;
1884 case CLASS_ALPHA:
1885 for (cu = 1; cu < 128; cu++)
1886 if (isalpha(cu))
1887 regmbc(cu);
1888 break;
1889 case CLASS_BLANK:
1890 regc(' ');
1891 regc('\t');
1892 break;
1893 case CLASS_CNTRL:
1894 for (cu = 1; cu <= 127; cu++)
1895 if (iscntrl(cu))
1896 regmbc(cu);
1897 break;
1898 case CLASS_DIGIT:
1899 for (cu = 1; cu <= 127; cu++)
1900 if (VIM_ISDIGIT(cu))
1901 regmbc(cu);
1902 break;
1903 case CLASS_GRAPH:
1904 for (cu = 1; cu <= 127; cu++)
1905 if (isgraph(cu))
1906 regmbc(cu);
1907 break;
1908 case CLASS_LOWER:
1909 for (cu = 1; cu <= 255; cu++)
1910 if (MB_ISLOWER(cu) && cu != 170
1911 && cu != 186)
1912 regmbc(cu);
1913 break;
1914 case CLASS_PRINT:
1915 for (cu = 1; cu <= 255; cu++)
1916 if (vim_isprintc(cu))
1917 regmbc(cu);
1918 break;
1919 case CLASS_PUNCT:
1920 for (cu = 1; cu < 128; cu++)
1921 if (ispunct(cu))
1922 regmbc(cu);
1923 break;
1924 case CLASS_SPACE:
1925 for (cu = 9; cu <= 13; cu++)
1926 regc(cu);
1927 regc(' ');
1928 break;
1929 case CLASS_UPPER:
1930 for (cu = 1; cu <= 255; cu++)
1931 if (MB_ISUPPER(cu))
1932 regmbc(cu);
1933 break;
1934 case CLASS_XDIGIT:
1935 for (cu = 1; cu <= 255; cu++)
1936 if (vim_isxdigit(cu))
1937 regmbc(cu);
1938 break;
1939 case CLASS_TAB:
1940 regc('\t');
1941 break;
1942 case CLASS_RETURN:
1943 regc('\r');
1944 break;
1945 case CLASS_BACKSPACE:
1946 regc('\b');
1947 break;
1948 case CLASS_ESCAPE:
1949 regc('\033');
1950 break;
1951 case CLASS_IDENT:
1952 for (cu = 1; cu <= 255; cu++)
1953 if (vim_isIDc(cu))
1954 regmbc(cu);
1955 break;
1956 case CLASS_KEYWORD:
1957 for (cu = 1; cu <= 255; cu++)
1958 if (reg_iswordc(cu))
1959 regmbc(cu);
1960 break;
1961 case CLASS_FNAME:
1962 for (cu = 1; cu <= 255; cu++)
1963 if (vim_isfilec(cu))
1964 regmbc(cu);
1965 break;
1966 }
1967 }
1968 else
1969 {
1970 if (has_mbyte)
1971 {
1972 int len;
1973
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001974 // produce a multibyte character, including any
1975 // following composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001976 startc = mb_ptr2char(regparse);
1977 len = (*mb_ptr2len)(regparse);
1978 if (enc_utf8 && utf_char2len(startc) != len)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001979 startc = -1; // composing chars
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001980 while (--len >= 0)
1981 regc(*regparse++);
1982 }
1983 else
1984 {
1985 startc = *regparse++;
1986 regc(startc);
1987 }
1988 }
1989 }
1990 regc(NUL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001991 prevchr_len = 1; // last char was the ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001992 if (*regparse != ']')
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001993 EMSG_RET_NULL(_(e_too_many_brackets)); // Cannot happen?
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001994 skipchr(); // let's be friends with the lexer again
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001995 *flagp |= HASWIDTH | SIMPLE;
1996 break;
1997 }
1998 else if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001999 EMSG2_RET_NULL(_(e_missing_rsb_after_str_lsb),
2000 reg_magic > MAGIC_OFF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002001 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002002 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002003
2004 default:
2005 {
2006 int len;
2007
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002008 // A multi-byte character is handled as a separate atom if it's
2009 // before a multi and when it's a composing char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002010 if (use_multibytecode(c))
2011 {
2012do_multibyte:
2013 ret = regnode(MULTIBYTECODE);
2014 regmbc(c);
2015 *flagp |= HASWIDTH | SIMPLE;
2016 break;
2017 }
2018
2019 ret = regnode(EXACTLY);
2020
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002021 // Append characters as long as:
2022 // - there is no following multi, we then need the character in
2023 // front of it as a single character operand
2024 // - not running into a Magic character
2025 // - "one_exactly" is not set
2026 // But always emit at least one character. Might be a Multi,
2027 // e.g., a "[" without matching "]".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002028 for (len = 0; c != NUL && (len == 0
2029 || (re_multi_type(peekchr()) == NOT_MULTI
2030 && !one_exactly
2031 && !is_Magic(c))); ++len)
2032 {
2033 c = no_Magic(c);
2034 if (has_mbyte)
2035 {
2036 regmbc(c);
2037 if (enc_utf8)
2038 {
2039 int l;
2040
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002041 // Need to get composing character too.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002042 for (;;)
2043 {
2044 l = utf_ptr2len(regparse);
2045 if (!UTF_COMPOSINGLIKE(regparse, regparse + l))
2046 break;
2047 regmbc(utf_ptr2char(regparse));
2048 skipchr();
2049 }
2050 }
2051 }
2052 else
2053 regc(c);
2054 c = getchr();
2055 }
2056 ungetchr();
2057
2058 regc(NUL);
2059 *flagp |= HASWIDTH;
2060 if (len == 1)
2061 *flagp |= SIMPLE;
2062 }
2063 break;
2064 }
2065
2066 return ret;
2067}
2068
2069/*
2070 * Parse something followed by possible [*+=].
2071 *
2072 * Note that the branching code sequences used for = and the general cases
2073 * of * and + are somewhat optimized: they use the same NOTHING node as
2074 * both the endmarker for their branch list and the body of the last branch.
2075 * It might seem that this node could be dispensed with entirely, but the
2076 * endmarker role is not redundant.
2077 */
2078 static char_u *
2079regpiece(int *flagp)
2080{
2081 char_u *ret;
2082 int op;
2083 char_u *next;
2084 int flags;
2085 long minval;
2086 long maxval;
2087
2088 ret = regatom(&flags);
2089 if (ret == NULL)
2090 return NULL;
2091
2092 op = peekchr();
2093 if (re_multi_type(op) == NOT_MULTI)
2094 {
2095 *flagp = flags;
2096 return ret;
2097 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002098 // default flags
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002099 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
2100
2101 skipchr();
2102 switch (op)
2103 {
2104 case Magic('*'):
2105 if (flags & SIMPLE)
2106 reginsert(STAR, ret);
2107 else
2108 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002109 // Emit x* as (x&|), where & means "self".
2110 reginsert(BRANCH, ret); // Either x
2111 regoptail(ret, regnode(BACK)); // and loop
2112 regoptail(ret, ret); // back
2113 regtail(ret, regnode(BRANCH)); // or
2114 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002115 }
2116 break;
2117
2118 case Magic('+'):
2119 if (flags & SIMPLE)
2120 reginsert(PLUS, ret);
2121 else
2122 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002123 // Emit x+ as x(&|), where & means "self".
2124 next = regnode(BRANCH); // Either
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002125 regtail(ret, next);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002126 regtail(regnode(BACK), ret); // loop back
2127 regtail(next, regnode(BRANCH)); // or
2128 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002129 }
2130 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2131 break;
2132
2133 case Magic('@'):
2134 {
2135 int lop = END;
2136 long nr;
2137
Christian Brabandtf2b16982025-03-29 09:08:58 +01002138 nr = (long)getdecchrs();
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002139 switch (no_Magic(getchr()))
2140 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002141 case '=': lop = MATCH; break; // \@=
2142 case '!': lop = NOMATCH; break; // \@!
2143 case '>': lop = SUBPAT; break; // \@>
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002144 case '<': switch (no_Magic(getchr()))
2145 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002146 case '=': lop = BEHIND; break; // \@<=
2147 case '!': lop = NOBEHIND; break; // \@<!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002148 }
2149 }
2150 if (lop == END)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002151 EMSG2_RET_NULL(_(e_invalid_character_after_str_at),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002152 reg_magic == MAGIC_ALL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002153 // Look behind must match with behind_pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002154 if (lop == BEHIND || lop == NOBEHIND)
2155 {
2156 regtail(ret, regnode(BHPOS));
2157 *flagp |= HASLOOKBH;
2158 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002159 regtail(ret, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002160 if (lop == BEHIND || lop == NOBEHIND)
2161 {
2162 if (nr < 0)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002163 nr = 0; // no limit is same as zero limit
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002164 reginsert_nr(lop, nr, ret);
2165 }
2166 else
2167 reginsert(lop, ret);
2168 break;
2169 }
2170
2171 case Magic('?'):
2172 case Magic('='):
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002173 // Emit x= as (x|)
2174 reginsert(BRANCH, ret); // Either x
2175 regtail(ret, regnode(BRANCH)); // or
2176 next = regnode(NOTHING); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002177 regtail(ret, next);
2178 regoptail(ret, next);
2179 break;
2180
2181 case Magic('{'):
2182 if (!read_limits(&minval, &maxval))
2183 return NULL;
2184 if (flags & SIMPLE)
2185 {
2186 reginsert(BRACE_SIMPLE, ret);
2187 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2188 }
2189 else
2190 {
2191 if (num_complex_braces >= 10)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002192 EMSG2_RET_NULL(_(e_too_many_complex_str_curly),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002193 reg_magic == MAGIC_ALL);
2194 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
2195 regoptail(ret, regnode(BACK));
2196 regoptail(ret, ret);
2197 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2198 ++num_complex_braces;
2199 }
2200 if (minval > 0 && maxval > 0)
2201 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2202 break;
2203 }
2204 if (re_multi_type(peekchr()) != NOT_MULTI)
2205 {
2206 // Can't have a multi follow a multi.
2207 if (peekchr() == Magic('*'))
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00002208 EMSG2_RET_NULL(_(e_nested_str), reg_magic >= MAGIC_ON);
2209 EMSG3_RET_NULL(_(e_nested_str_chr), reg_magic == MAGIC_ALL,
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002210 no_Magic(peekchr()));
2211 }
2212
2213 return ret;
2214}
2215
2216/*
2217 * Parse one alternative of an | or & operator.
2218 * Implements the concatenation operator.
2219 */
2220 static char_u *
2221regconcat(int *flagp)
2222{
2223 char_u *first = NULL;
2224 char_u *chain = NULL;
2225 char_u *latest;
2226 int flags;
2227 int cont = TRUE;
2228
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002229 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002230
2231 while (cont)
2232 {
2233 switch (peekchr())
2234 {
2235 case NUL:
2236 case Magic('|'):
2237 case Magic('&'):
2238 case Magic(')'):
2239 cont = FALSE;
2240 break;
2241 case Magic('Z'):
2242 regflags |= RF_ICOMBINE;
2243 skipchr_keepstart();
2244 break;
2245 case Magic('c'):
2246 regflags |= RF_ICASE;
2247 skipchr_keepstart();
2248 break;
2249 case Magic('C'):
2250 regflags |= RF_NOICASE;
2251 skipchr_keepstart();
2252 break;
2253 case Magic('v'):
2254 reg_magic = MAGIC_ALL;
2255 skipchr_keepstart();
2256 curchr = -1;
2257 break;
2258 case Magic('m'):
2259 reg_magic = MAGIC_ON;
2260 skipchr_keepstart();
2261 curchr = -1;
2262 break;
2263 case Magic('M'):
2264 reg_magic = MAGIC_OFF;
2265 skipchr_keepstart();
2266 curchr = -1;
2267 break;
2268 case Magic('V'):
2269 reg_magic = MAGIC_NONE;
2270 skipchr_keepstart();
2271 curchr = -1;
2272 break;
2273 default:
2274 latest = regpiece(&flags);
2275 if (latest == NULL || reg_toolong)
2276 return NULL;
2277 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002278 if (chain == NULL) // First piece.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002279 *flagp |= flags & SPSTART;
2280 else
2281 regtail(chain, latest);
2282 chain = latest;
2283 if (first == NULL)
2284 first = latest;
2285 break;
2286 }
2287 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002288 if (first == NULL) // Loop ran zero times.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002289 first = regnode(NOTHING);
2290 return first;
2291}
2292
2293/*
2294 * Parse one alternative of an | operator.
2295 * Implements the & operator.
2296 */
2297 static char_u *
2298regbranch(int *flagp)
2299{
2300 char_u *ret;
2301 char_u *chain = NULL;
2302 char_u *latest;
2303 int flags;
2304
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002305 *flagp = WORST | HASNL; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002306
2307 ret = regnode(BRANCH);
2308 for (;;)
2309 {
2310 latest = regconcat(&flags);
2311 if (latest == NULL)
2312 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002313 // If one of the branches has width, the whole thing has. If one of
2314 // the branches anchors at start-of-line, the whole thing does.
2315 // If one of the branches uses look-behind, the whole thing does.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002316 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002317 // If one of the branches doesn't match a line-break, the whole thing
2318 // doesn't.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002319 *flagp &= ~HASNL | (flags & HASNL);
2320 if (chain != NULL)
2321 regtail(chain, latest);
2322 if (peekchr() != Magic('&'))
2323 break;
2324 skipchr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002325 regtail(latest, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002326 if (reg_toolong)
2327 break;
2328 reginsert(MATCH, latest);
2329 chain = latest;
2330 }
2331
2332 return ret;
2333}
2334
2335/*
2336 * Parse regular expression, i.e. main body or parenthesized thing.
2337 *
2338 * Caller must absorb opening parenthesis.
2339 *
2340 * Combining parenthesis handling with the base level of regular expression
2341 * is a trifle forced, but the need to tie the tails of the branches to what
2342 * follows makes it hard to avoid.
2343 */
2344 static char_u *
2345reg(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002346 int paren, // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002347 int *flagp)
2348{
2349 char_u *ret;
2350 char_u *br;
2351 char_u *ender;
2352 int parno = 0;
2353 int flags;
2354
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002355 *flagp = HASWIDTH; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002356
2357#ifdef FEAT_SYN_HL
2358 if (paren == REG_ZPAREN)
2359 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002360 // Make a ZOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002361 if (regnzpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002362 EMSG_RET_NULL(_(e_too_many_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002363 parno = regnzpar;
2364 regnzpar++;
2365 ret = regnode(ZOPEN + parno);
2366 }
2367 else
2368#endif
2369 if (paren == REG_PAREN)
2370 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002371 // Make a MOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002372 if (regnpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002373 EMSG2_RET_NULL(_(e_too_many_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002374 parno = regnpar;
2375 ++regnpar;
2376 ret = regnode(MOPEN + parno);
2377 }
2378 else if (paren == REG_NPAREN)
2379 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002380 // Make a NOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002381 ret = regnode(NOPEN);
2382 }
2383 else
2384 ret = NULL;
2385
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002386 // Pick up the branches, linking them together.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002387 br = regbranch(&flags);
2388 if (br == NULL)
2389 return NULL;
2390 if (ret != NULL)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002391 regtail(ret, br); // [MZ]OPEN -> first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002392 else
2393 ret = br;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002394 // If one of the branches can be zero-width, the whole thing can.
2395 // If one of the branches has * at start or matches a line-break, the
2396 // whole thing can.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002397 if (!(flags & HASWIDTH))
2398 *flagp &= ~HASWIDTH;
2399 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2400 while (peekchr() == Magic('|'))
2401 {
2402 skipchr();
2403 br = regbranch(&flags);
2404 if (br == NULL || reg_toolong)
2405 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002406 regtail(ret, br); // BRANCH -> BRANCH.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002407 if (!(flags & HASWIDTH))
2408 *flagp &= ~HASWIDTH;
2409 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2410 }
2411
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002412 // Make a closing node, and hook it on the end.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002413 ender = regnode(
2414#ifdef FEAT_SYN_HL
2415 paren == REG_ZPAREN ? ZCLOSE + parno :
2416#endif
2417 paren == REG_PAREN ? MCLOSE + parno :
2418 paren == REG_NPAREN ? NCLOSE : END);
2419 regtail(ret, ender);
2420
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002421 // Hook the tails of the branches to the closing node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002422 for (br = ret; br != NULL; br = regnext(br))
2423 regoptail(br, ender);
2424
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002425 // Check for proper termination.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002426 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2427 {
2428#ifdef FEAT_SYN_HL
2429 if (paren == REG_ZPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002430 EMSG_RET_NULL(_(e_unmatched_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002431 else
2432#endif
2433 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002434 EMSG2_RET_NULL(_(e_unmatched_str_percent_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002435 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002436 EMSG2_RET_NULL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002437 }
2438 else if (paren == REG_NOPAREN && peekchr() != NUL)
2439 {
2440 if (curchr == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002441 EMSG2_RET_NULL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002442 else
Bram Moolenaar74409f62022-01-01 15:58:22 +00002443 EMSG_RET_NULL(_(e_trailing_characters)); // "Can't happen".
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002444 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002445 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002446 // Here we set the flag allowing back references to this set of
2447 // parentheses.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002448 if (paren == REG_PAREN)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002449 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002450 return ret;
2451}
2452
2453/*
2454 * bt_regcomp() - compile a regular expression into internal code for the
2455 * traditional back track matcher.
2456 * Returns the program in allocated space. Returns NULL for an error.
2457 *
2458 * We can't allocate space until we know how big the compiled form will be,
2459 * but we can't compile it (and thus know how big it is) until we've got a
2460 * place to put the code. So we cheat: we compile it twice, once with code
2461 * generation turned off and size counting turned on, and once "for real".
2462 * This also means that we don't allocate space until we are sure that the
2463 * thing really will compile successfully, and we never have to move the
2464 * code and thus invalidate pointers into it. (Note that it has to be in
2465 * one piece because vim_free() must be able to free it all.)
2466 *
2467 * Whether upper/lower case is to be ignored is decided when executing the
2468 * program, it does not matter here.
2469 *
2470 * Beware that the optimization-preparation code in here knows about some
2471 * of the structure of the compiled regexp.
2472 * "re_flags": RE_MAGIC and/or RE_STRING.
2473 */
2474 static regprog_T *
2475bt_regcomp(char_u *expr, int re_flags)
2476{
2477 bt_regprog_T *r;
2478 char_u *scan;
2479 char_u *longest;
2480 int len;
2481 int flags;
2482
2483 if (expr == NULL)
RestorerZ68ebcee2023-05-31 17:12:14 +01002484 IEMSG_RET_NULL(e_null_argument);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002485
2486 init_class_tab();
2487
2488 // First pass: determine size, legality.
2489 regcomp_start(expr, re_flags);
2490 regcode = JUST_CALC_SIZE;
2491 regc(REGMAGIC);
2492 if (reg(REG_NOPAREN, &flags) == NULL)
2493 return NULL;
2494
2495 // Allocate space.
2496 r = alloc(offsetof(bt_regprog_T, program) + regsize);
2497 if (r == NULL)
2498 return NULL;
2499 r->re_in_use = FALSE;
2500
2501 // Second pass: emit code.
2502 regcomp_start(expr, re_flags);
2503 regcode = r->program;
2504 regc(REGMAGIC);
2505 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
2506 {
2507 vim_free(r);
2508 if (reg_toolong)
Bram Moolenaareaaac012022-01-02 17:00:40 +00002509 EMSG_RET_NULL(_(e_pattern_too_long));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002510 return NULL;
2511 }
2512
2513 // Dig out information for optimizations.
2514 r->regstart = NUL; // Worst-case defaults.
2515 r->reganch = 0;
2516 r->regmust = NULL;
2517 r->regmlen = 0;
2518 r->regflags = regflags;
2519 if (flags & HASNL)
2520 r->regflags |= RF_HASNL;
2521 if (flags & HASLOOKBH)
2522 r->regflags |= RF_LOOKBH;
2523#ifdef FEAT_SYN_HL
2524 // Remember whether this pattern has any \z specials in it.
2525 r->reghasz = re_has_z;
2526#endif
2527 scan = r->program + 1; // First BRANCH.
2528 if (OP(regnext(scan)) == END) // Only one top-level choice.
2529 {
2530 scan = OPERAND(scan);
2531
2532 // Starting-point info.
2533 if (OP(scan) == BOL || OP(scan) == RE_BOF)
2534 {
2535 r->reganch++;
2536 scan = regnext(scan);
2537 }
2538
2539 if (OP(scan) == EXACTLY)
2540 {
2541 if (has_mbyte)
2542 r->regstart = (*mb_ptr2char)(OPERAND(scan));
2543 else
2544 r->regstart = *OPERAND(scan);
2545 }
2546 else if ((OP(scan) == BOW
2547 || OP(scan) == EOW
2548 || OP(scan) == NOTHING
2549 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
2550 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
2551 && OP(regnext(scan)) == EXACTLY)
2552 {
2553 if (has_mbyte)
2554 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
2555 else
2556 r->regstart = *OPERAND(regnext(scan));
2557 }
2558
2559 // If there's something expensive in the r.e., find the longest
2560 // literal string that must appear and make it the regmust. Resolve
2561 // ties in favor of later strings, since the regstart check works
2562 // with the beginning of the r.e. and avoiding duplication
2563 // strengthens checking. Not a strong reason, but sufficient in the
2564 // absence of others.
2565
2566 // When the r.e. starts with BOW, it is faster to look for a regmust
2567 // first. Used a lot for "#" and "*" commands. (Added by mool).
2568 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
2569 && !(flags & HASNL))
2570 {
John Marriott82792db2024-05-12 00:07:17 +02002571 size_t scanlen;
2572
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002573 longest = NULL;
2574 len = 0;
2575 for (; scan != NULL; scan = regnext(scan))
John Marriott82792db2024-05-12 00:07:17 +02002576 {
2577 if (OP(scan) == EXACTLY)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002578 {
John Marriott82792db2024-05-12 00:07:17 +02002579 scanlen = STRLEN(OPERAND(scan));
2580 if (scanlen >= (size_t)len)
2581 {
2582 longest = OPERAND(scan);
2583 len = (int)scanlen;
2584 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002585 }
John Marriott82792db2024-05-12 00:07:17 +02002586 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002587 r->regmust = longest;
2588 r->regmlen = len;
2589 }
2590 }
2591#ifdef BT_REGEXP_DUMP
2592 regdump(expr, r);
2593#endif
2594 r->engine = &bt_regengine;
2595 return (regprog_T *)r;
2596}
2597
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Report whether the EOL item "$" was found during the previous call to
 * vim_regcomp().  The answer is simply the current value of "had_eol".
 * This is messy, but it works fine.
 */
    int
vim_regcomp_had_eol(void)
{
    return had_eol;
}
#endif
2609
2610/*
2611 * Get a number after a backslash that is inside [].
2612 * When nothing is recognized return a backslash.
2613 */
2614 static int
2615coll_get_char(void)
2616{
Christian Brabandtf2b16982025-03-29 09:08:58 +01002617 vimlong_T nr = -1;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002618
2619 switch (*regparse++)
2620 {
2621 case 'd': nr = getdecchrs(); break;
2622 case 'o': nr = getoctchrs(); break;
2623 case 'x': nr = gethexchrs(2); break;
2624 case 'u': nr = gethexchrs(4); break;
2625 case 'U': nr = gethexchrs(8); break;
2626 }
Christian Brabandtf2b16982025-03-29 09:08:58 +01002627 if (nr < 0)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002628 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002629 // If getting the number fails be backwards compatible: the character
2630 // is a backslash.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002631 --regparse;
2632 nr = '\\';
2633 }
Christian Brabandtf2b16982025-03-29 09:08:58 +01002634 if (nr > INT_MAX)
2635 nr = INT_MAX;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002636 return nr;
2637}
2638
2639/*
2640 * Free a compiled regexp program, returned by bt_regcomp().
2641 */
2642 static void
2643bt_regfree(regprog_T *prog)
2644{
2645 vim_free(prog);
2646}
2647
// Move rex.input forward over one (possibly multi-byte) character.
#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)

/*
 * Storage for the arguments from BRACE_LIMITS.  Conceptually these are
 * local to regmatch(); they are kept at file level to reduce the amount of
 * stack space used (it can be called recursively many times).
 */
static long	bl_minval;
static long	bl_maxval;
2657
2658/*
2659 * Save the input line and position in a regsave_T.
2660 */
2661 static void
2662reg_save(regsave_T *save, garray_T *gap)
2663{
2664 if (REG_MULTI)
2665 {
2666 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
2667 save->rs_u.pos.lnum = rex.lnum;
2668 }
2669 else
2670 save->rs_u.ptr = rex.input;
2671 save->rs_len = gap->ga_len;
2672}
2673
2674/*
2675 * Restore the input line and position from a regsave_T.
2676 */
2677 static void
2678reg_restore(regsave_T *save, garray_T *gap)
2679{
2680 if (REG_MULTI)
2681 {
2682 if (rex.lnum != save->rs_u.pos.lnum)
2683 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002684 // only call reg_getline() when the line number changed to save
2685 // a bit of time
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002686 rex.lnum = save->rs_u.pos.lnum;
2687 rex.line = reg_getline(rex.lnum);
2688 }
2689 rex.input = rex.line + save->rs_u.pos.col;
2690 }
2691 else
2692 rex.input = save->rs_u.ptr;
2693 gap->ga_len = save->rs_len;
2694}
2695
2696/*
2697 * Return TRUE if current position is equal to saved position.
2698 */
2699 static int
2700reg_save_equal(regsave_T *save)
2701{
2702 if (REG_MULTI)
2703 return rex.lnum == save->rs_u.pos.lnum
2704 && rex.input == rex.line + save->rs_u.pos.col;
2705 return rex.input == save->rs_u.ptr;
2706}
2707
// Save a sub-expression boundary before attempting a match.
// With REG_MULTI this calls save_se_multi() on the position "posp",
// otherwise save_se_one() on the pointer "pp".
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))

// After a failed match put back the value that save_se() stored in "savep".
// NOTE: this expands to a brace-enclosed block, not to an expression.
#define restore_se(savep, posp, pp) \
{ \
    if (REG_MULTI) \
	*(posp) = (savep)->se_u.pos; \
    else \
	*(pp) = (savep)->se_u.ptr; \
}
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002720
2721/*
2722 * Tentatively set the sub-expression start to the current position (after
2723 * calling regmatch() they will have changed). Need to save the existing
2724 * values for when there is no match.
2725 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
2726 * depending on REG_MULTI.
2727 */
2728 static void
2729save_se_multi(save_se_T *savep, lpos_T *posp)
2730{
2731 savep->se_u.pos = *posp;
2732 posp->lnum = rex.lnum;
2733 posp->col = (colnr_T)(rex.input - rex.line);
2734}
2735
2736 static void
2737save_se_one(save_se_T *savep, char_u **pp)
2738{
2739 savep->se_u.ptr = *pp;
2740 *pp = rex.input;
2741}
2742
2743/*
2744 * regrepeat - repeatedly match something simple, return how many.
2745 * Advances rex.input (and rex.lnum) to just after the matched chars.
2746 */
2747 static int
2748regrepeat(
2749 char_u *p,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002750 long maxcount) // maximum number of matches allowed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002751{
2752 long count = 0;
2753 char_u *scan;
2754 char_u *opnd;
2755 int mask;
2756 int testval = 0;
2757
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002758 scan = rex.input; // Make local copy of rex.input for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002759 opnd = OPERAND(p);
2760 switch (OP(p))
2761 {
2762 case ANY:
2763 case ANY + ADD_NL:
2764 while (count < maxcount)
2765 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002766 // Matching anything means we continue until end-of-line (or
2767 // end-of-file for ANY + ADD_NL), only limited by maxcount.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002768 while (*scan != NUL && count < maxcount)
2769 {
2770 ++count;
2771 MB_PTR_ADV(scan);
2772 }
2773 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2774 || rex.reg_line_lbr || count == maxcount)
2775 break;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002776 ++count; // count the line-break
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002777 reg_nextline();
2778 scan = rex.input;
2779 if (got_int)
2780 break;
2781 }
2782 break;
2783
2784 case IDENT:
2785 case IDENT + ADD_NL:
2786 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002787 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002788 case SIDENT:
2789 case SIDENT + ADD_NL:
2790 while (count < maxcount)
2791 {
2792 if (vim_isIDc(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2793 {
2794 MB_PTR_ADV(scan);
2795 }
2796 else if (*scan == NUL)
2797 {
2798 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2799 || rex.reg_line_lbr)
2800 break;
2801 reg_nextline();
2802 scan = rex.input;
2803 if (got_int)
2804 break;
2805 }
2806 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2807 ++scan;
2808 else
2809 break;
2810 ++count;
2811 }
2812 break;
2813
2814 case KWORD:
2815 case KWORD + ADD_NL:
2816 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002817 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002818 case SKWORD:
2819 case SKWORD + ADD_NL:
2820 while (count < maxcount)
2821 {
2822 if (vim_iswordp_buf(scan, rex.reg_buf)
2823 && (testval || !VIM_ISDIGIT(*scan)))
2824 {
2825 MB_PTR_ADV(scan);
2826 }
2827 else if (*scan == NUL)
2828 {
2829 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2830 || rex.reg_line_lbr)
2831 break;
2832 reg_nextline();
2833 scan = rex.input;
2834 if (got_int)
2835 break;
2836 }
2837 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2838 ++scan;
2839 else
2840 break;
2841 ++count;
2842 }
2843 break;
2844
2845 case FNAME:
2846 case FNAME + ADD_NL:
2847 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002848 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002849 case SFNAME:
2850 case SFNAME + ADD_NL:
2851 while (count < maxcount)
2852 {
2853 if (vim_isfilec(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2854 {
2855 MB_PTR_ADV(scan);
2856 }
2857 else if (*scan == NUL)
2858 {
2859 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2860 || rex.reg_line_lbr)
2861 break;
2862 reg_nextline();
2863 scan = rex.input;
2864 if (got_int)
2865 break;
2866 }
2867 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2868 ++scan;
2869 else
2870 break;
2871 ++count;
2872 }
2873 break;
2874
2875 case PRINT:
2876 case PRINT + ADD_NL:
2877 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002878 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002879 case SPRINT:
2880 case SPRINT + ADD_NL:
2881 while (count < maxcount)
2882 {
2883 if (*scan == NUL)
2884 {
2885 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2886 || rex.reg_line_lbr)
2887 break;
2888 reg_nextline();
2889 scan = rex.input;
2890 if (got_int)
2891 break;
2892 }
2893 else if (vim_isprintc(PTR2CHAR(scan)) == 1
2894 && (testval || !VIM_ISDIGIT(*scan)))
2895 {
2896 MB_PTR_ADV(scan);
2897 }
2898 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2899 ++scan;
2900 else
2901 break;
2902 ++count;
2903 }
2904 break;
2905
2906 case WHITE:
2907 case WHITE + ADD_NL:
2908 testval = mask = RI_WHITE;
2909do_class:
2910 while (count < maxcount)
2911 {
2912 int l;
2913
2914 if (*scan == NUL)
2915 {
2916 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2917 || rex.reg_line_lbr)
2918 break;
2919 reg_nextline();
2920 scan = rex.input;
2921 if (got_int)
2922 break;
2923 }
2924 else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1)
2925 {
2926 if (testval != 0)
2927 break;
2928 scan += l;
2929 }
2930 else if ((class_tab[*scan] & mask) == testval)
2931 ++scan;
2932 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2933 ++scan;
2934 else
2935 break;
2936 ++count;
2937 }
2938 break;
2939
2940 case NWHITE:
2941 case NWHITE + ADD_NL:
2942 mask = RI_WHITE;
2943 goto do_class;
2944 case DIGIT:
2945 case DIGIT + ADD_NL:
2946 testval = mask = RI_DIGIT;
2947 goto do_class;
2948 case NDIGIT:
2949 case NDIGIT + ADD_NL:
2950 mask = RI_DIGIT;
2951 goto do_class;
2952 case HEX:
2953 case HEX + ADD_NL:
2954 testval = mask = RI_HEX;
2955 goto do_class;
2956 case NHEX:
2957 case NHEX + ADD_NL:
2958 mask = RI_HEX;
2959 goto do_class;
2960 case OCTAL:
2961 case OCTAL + ADD_NL:
2962 testval = mask = RI_OCTAL;
2963 goto do_class;
2964 case NOCTAL:
2965 case NOCTAL + ADD_NL:
2966 mask = RI_OCTAL;
2967 goto do_class;
2968 case WORD:
2969 case WORD + ADD_NL:
2970 testval = mask = RI_WORD;
2971 goto do_class;
2972 case NWORD:
2973 case NWORD + ADD_NL:
2974 mask = RI_WORD;
2975 goto do_class;
2976 case HEAD:
2977 case HEAD + ADD_NL:
2978 testval = mask = RI_HEAD;
2979 goto do_class;
2980 case NHEAD:
2981 case NHEAD + ADD_NL:
2982 mask = RI_HEAD;
2983 goto do_class;
2984 case ALPHA:
2985 case ALPHA + ADD_NL:
2986 testval = mask = RI_ALPHA;
2987 goto do_class;
2988 case NALPHA:
2989 case NALPHA + ADD_NL:
2990 mask = RI_ALPHA;
2991 goto do_class;
2992 case LOWER:
2993 case LOWER + ADD_NL:
2994 testval = mask = RI_LOWER;
2995 goto do_class;
2996 case NLOWER:
2997 case NLOWER + ADD_NL:
2998 mask = RI_LOWER;
2999 goto do_class;
3000 case UPPER:
3001 case UPPER + ADD_NL:
3002 testval = mask = RI_UPPER;
3003 goto do_class;
3004 case NUPPER:
3005 case NUPPER + ADD_NL:
3006 mask = RI_UPPER;
3007 goto do_class;
3008
3009 case EXACTLY:
3010 {
3011 int cu, cl;
3012
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003013 // This doesn't do a multi-byte character, because a MULTIBYTECODE
3014 // would have been used for it. It does handle single-byte
3015 // characters, such as latin1.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003016 if (rex.reg_ic)
3017 {
3018 cu = MB_TOUPPER(*opnd);
3019 cl = MB_TOLOWER(*opnd);
3020 while (count < maxcount && (*scan == cu || *scan == cl))
3021 {
3022 count++;
3023 scan++;
3024 }
3025 }
3026 else
3027 {
3028 cu = *opnd;
3029 while (count < maxcount && *scan == cu)
3030 {
3031 count++;
3032 scan++;
3033 }
3034 }
3035 break;
3036 }
3037
3038 case MULTIBYTECODE:
3039 {
3040 int i, len, cf = 0;
3041
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003042 // Safety check (just in case 'encoding' was changed since
3043 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003044 if ((len = (*mb_ptr2len)(opnd)) > 1)
3045 {
3046 if (rex.reg_ic && enc_utf8)
3047 cf = utf_fold(utf_ptr2char(opnd));
3048 while (count < maxcount && (*mb_ptr2len)(scan) >= len)
3049 {
3050 for (i = 0; i < len; ++i)
3051 if (opnd[i] != scan[i])
3052 break;
3053 if (i < len && (!rex.reg_ic || !enc_utf8
3054 || utf_fold(utf_ptr2char(scan)) != cf))
3055 break;
3056 scan += len;
3057 ++count;
3058 }
3059 }
3060 }
3061 break;
3062
3063 case ANYOF:
3064 case ANYOF + ADD_NL:
3065 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003066 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003067
3068 case ANYBUT:
3069 case ANYBUT + ADD_NL:
3070 while (count < maxcount)
3071 {
3072 int len;
3073
3074 if (*scan == NUL)
3075 {
3076 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
3077 || rex.reg_line_lbr)
3078 break;
3079 reg_nextline();
3080 scan = rex.input;
3081 if (got_int)
3082 break;
3083 }
3084 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
3085 ++scan;
3086 else if (has_mbyte && (len = (*mb_ptr2len)(scan)) > 1)
3087 {
3088 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
3089 break;
3090 scan += len;
3091 }
3092 else
3093 {
3094 if ((cstrchr(opnd, *scan) == NULL) == testval)
3095 break;
3096 ++scan;
3097 }
3098 ++count;
3099 }
3100 break;
3101
3102 case NEWL:
3103 while (count < maxcount
3104 && ((*scan == NUL && rex.lnum <= rex.reg_maxline
3105 && !rex.reg_line_lbr && REG_MULTI)
3106 || (*scan == '\n' && rex.reg_line_lbr)))
3107 {
3108 count++;
3109 if (rex.reg_line_lbr)
3110 ADVANCE_REGINPUT();
3111 else
3112 reg_nextline();
3113 scan = rex.input;
3114 if (got_int)
3115 break;
3116 }
3117 break;
3118
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003119 default: // Oh dear. Called inappropriately.
RestorerZ68ebcee2023-05-31 17:12:14 +01003120 iemsg(e_corrupted_regexp_program);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003121#ifdef DEBUG
3122 printf("Called regrepeat with op code %d\n", OP(p));
3123#endif
3124 break;
3125 }
3126
3127 rex.input = scan;
3128
3129 return (int)count;
3130}
3131
3132/*
3133 * Push an item onto the regstack.
3134 * Returns pointer to new item. Returns NULL when out of memory.
3135 */
3136 static regitem_T *
3137regstack_push(regstate_T state, char_u *scan)
3138{
3139 regitem_T *rp;
3140
3141 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
3142 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00003143 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003144 return NULL;
3145 }
3146 if (ga_grow(&regstack, sizeof(regitem_T)) == FAIL)
3147 return NULL;
3148
3149 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
3150 rp->rs_state = state;
3151 rp->rs_scan = scan;
3152
3153 regstack.ga_len += sizeof(regitem_T);
3154 return rp;
3155}
3156
3157/*
3158 * Pop an item from the regstack.
3159 */
3160 static void
3161regstack_pop(char_u **scan)
3162{
3163 regitem_T *rp;
3164
3165 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
3166 *scan = rp->rs_scan;
3167
3168 regstack.ga_len -= sizeof(regitem_T);
3169}
3170
#ifdef FEAT_RELTIME
/*
 * Return TRUE when "*timeout_flag" is set, FALSE otherwise.
 * On a timeout "*timed_out" is set to TRUE, unless "timed_out" is NULL.
 * With FEAT_EVAL a message is logged if "*timed_out" was not set yet.
 */
    static int
bt_did_time_out(int *timed_out)
{
    if (!*timeout_flag)
	return FALSE;

    if (timed_out != NULL)
    {
# ifdef FEAT_EVAL
	if (!*timed_out)
	    ch_log(NULL, "BT regexp timed out");
# endif
	*timed_out = TRUE;
    }
    return TRUE;
}
#endif
3193
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003194/*
3195 * Save the current subexpr to "bp", so that they can be restored
3196 * later by restore_subexpr().
3197 */
3198 static void
3199save_subexpr(regbehind_T *bp)
3200{
3201 int i;
3202
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003203 // When "rex.need_clear_subexpr" is set we don't need to save the values,
3204 // only remember that this flag needs to be set again when restoring.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003205 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003206 if (rex.need_clear_subexpr)
3207 return;
3208
3209 for (i = 0; i < NSUBEXP; ++i)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003210 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003211 if (REG_MULTI)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003212 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003213 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
3214 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
3215 }
3216 else
3217 {
3218 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
3219 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003220 }
3221 }
3222}
3223
3224/*
3225 * Restore the subexpr from "bp".
3226 */
3227 static void
3228restore_subexpr(regbehind_T *bp)
3229{
3230 int i;
3231
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003232 // Only need to restore saved values when they are not to be cleared.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003233 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003234 if (rex.need_clear_subexpr)
3235 return;
3236
3237 for (i = 0; i < NSUBEXP; ++i)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003238 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003239 if (REG_MULTI)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003240 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00003241 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
3242 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
3243 }
3244 else
3245 {
3246 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
3247 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003248 }
3249 }
3250}
3251
3252/*
3253 * regmatch - main matching routine
3254 *
3255 * Conceptually the strategy is simple: Check to see whether the current node
3256 * matches, push an item onto the regstack and loop to see whether the rest
3257 * matches, and then act accordingly. In practice we make some effort to
3258 * avoid using the regstack, in particular by going through "ordinary" nodes
3259 * (that don't need to know whether the rest of the match failed) by a nested
3260 * loop.
3261 *
3262 * Returns TRUE when there is a match. Leaves rex.input and rex.lnum just after
3263 * the last matched character.
3264 * Returns FALSE when there is no match. Leaves rex.input and rex.lnum in an
3265 * undefined state!
3266 */
3267 static int
3268regmatch(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003269 char_u *scan, // Current node.
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003270 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003271{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003272 char_u *next; // Next node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003273 int op;
3274 int c;
3275 regitem_T *rp;
3276 int no;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003277 int status; // one of the RA_ values:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003278
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003279 // Make "regstack" and "backpos" empty. They are allocated and freed in
3280 // bt_regexec_both() to reduce malloc()/free() calls.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003281 regstack.ga_len = 0;
3282 backpos.ga_len = 0;
3283
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003284 // Repeat until "regstack" is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003285 for (;;)
3286 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003287 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3288 // Allow interrupting them with CTRL-C.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003289 fast_breakcheck();
3290
3291#ifdef DEBUG
3292 if (scan != NULL && regnarrate)
3293 {
3294 mch_errmsg((char *)regprop(scan));
3295 mch_errmsg("(\n");
3296 }
3297#endif
3298
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003299 // Repeat for items that can be matched sequentially, without using the
3300 // regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003301 for (;;)
3302 {
3303 if (got_int || scan == NULL)
3304 {
3305 status = RA_FAIL;
3306 break;
3307 }
3308#ifdef FEAT_RELTIME
Bram Moolenaar616592e2022-06-17 15:17:10 +01003309 if (bt_did_time_out(timed_out))
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003310 {
Paul Ollis65745772022-06-05 16:55:54 +01003311 status = RA_FAIL;
3312 break;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003313 }
3314#endif
3315 status = RA_CONT;
3316
3317#ifdef DEBUG
3318 if (regnarrate)
3319 {
3320 mch_errmsg((char *)regprop(scan));
3321 mch_errmsg("...\n");
3322# ifdef FEAT_SYN_HL
3323 if (re_extmatch_in != NULL)
3324 {
3325 int i;
3326
3327 mch_errmsg(_("External submatches:\n"));
3328 for (i = 0; i < NSUBEXP; i++)
3329 {
3330 mch_errmsg(" \"");
3331 if (re_extmatch_in->matches[i] != NULL)
3332 mch_errmsg((char *)re_extmatch_in->matches[i]);
3333 mch_errmsg("\"\n");
3334 }
3335 }
3336# endif
3337 }
3338#endif
3339 next = regnext(scan);
3340
3341 op = OP(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003342 // Check for character class with NL added.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003343 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
Paul Ollis65745772022-06-05 16:55:54 +01003344 && *rex.input == NUL && rex.lnum <= rex.reg_maxline)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003345 {
3346 reg_nextline();
3347 }
3348 else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n')
3349 {
3350 ADVANCE_REGINPUT();
3351 }
3352 else
3353 {
3354 if (WITH_NL(op))
3355 op -= ADD_NL;
3356 if (has_mbyte)
3357 c = (*mb_ptr2char)(rex.input);
3358 else
3359 c = *rex.input;
3360 switch (op)
3361 {
3362 case BOL:
3363 if (rex.input != rex.line)
3364 status = RA_NOMATCH;
3365 break;
3366
3367 case EOL:
3368 if (c != NUL)
3369 status = RA_NOMATCH;
3370 break;
3371
3372 case RE_BOF:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003373 // We're not at the beginning of the file when below the first
3374 // line where we started, not at the start of the line or we
3375 // didn't start at the first line of the buffer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003376 if (rex.lnum != 0 || rex.input != rex.line
3377 || (REG_MULTI && rex.reg_firstlnum > 1))
3378 status = RA_NOMATCH;
3379 break;
3380
3381 case RE_EOF:
3382 if (rex.lnum != rex.reg_maxline || c != NUL)
3383 status = RA_NOMATCH;
3384 break;
3385
3386 case CURSOR:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003387 // Check if the buffer is in a window and compare the
3388 // rex.reg_win->w_cursor position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003389 if (rex.reg_win == NULL
3390 || (rex.lnum + rex.reg_firstlnum
3391 != rex.reg_win->w_cursor.lnum)
3392 || ((colnr_T)(rex.input - rex.line)
3393 != rex.reg_win->w_cursor.col))
3394 status = RA_NOMATCH;
3395 break;
3396
3397 case RE_MARK:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003398 // Compare the mark position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003399 {
3400 int mark = OPERAND(scan)[0];
3401 int cmp = OPERAND(scan)[1];
3402 pos_T *pos;
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003403 size_t col = REG_MULTI ? rex.input - rex.line : 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003404
3405 pos = getmark_buf(rex.reg_buf, mark, FALSE);
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003406
3407 // Line may have been freed, get it again.
3408 if (REG_MULTI)
3409 {
3410 rex.line = reg_getline(rex.lnum);
3411 rex.input = rex.line + col;
3412 }
3413
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003414 if (pos == NULL // mark doesn't exist
Bram Moolenaar872bee52021-05-24 22:56:15 +02003415 || pos->lnum <= 0) // mark isn't set in reg_buf
3416 {
3417 status = RA_NOMATCH;
3418 }
3419 else
3420 {
3421 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3422 && pos->col == MAXCOL
John Marriott82792db2024-05-12 00:07:17 +02003423 ? reg_getline_len(pos->lnum - rex.reg_firstlnum)
Bram Moolenaar872bee52021-05-24 22:56:15 +02003424 : pos->col;
3425
3426 if ((pos->lnum == rex.lnum + rex.reg_firstlnum
3427 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003428 ? (cmp == '<' || cmp == '>')
Bram Moolenaar872bee52021-05-24 22:56:15 +02003429 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003430 ? cmp != '>'
3431 : cmp != '<'))
3432 : (pos->lnum < rex.lnum + rex.reg_firstlnum
3433 ? cmp != '>'
3434 : cmp != '<')))
3435 status = RA_NOMATCH;
Bram Moolenaar872bee52021-05-24 22:56:15 +02003436 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003437 }
3438 break;
3439
3440 case RE_VISUAL:
3441 if (!reg_match_visual())
3442 status = RA_NOMATCH;
3443 break;
3444
3445 case RE_LNUM:
3446 if (!REG_MULTI || !re_num_cmp((long_u)(rex.lnum + rex.reg_firstlnum),
3447 scan))
3448 status = RA_NOMATCH;
3449 break;
3450
3451 case RE_COL:
3452 if (!re_num_cmp((long_u)(rex.input - rex.line) + 1, scan))
3453 status = RA_NOMATCH;
3454 break;
3455
3456 case RE_VCOL:
Bram Moolenaar13ed4942022-08-19 13:59:25 +01003457 {
3458 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaar753aead2022-09-08 12:17:06 +01003459 linenr_T lnum = REG_MULTI ? rex.reg_firstlnum + rex.lnum : 1;
3460 long_u vcol;
Bram Moolenaar13ed4942022-08-19 13:59:25 +01003461
Bram Moolenaar753aead2022-09-08 12:17:06 +01003462 if (REG_MULTI && (lnum <= 0
3463 || lnum > wp->w_buffer->b_ml.ml_line_count))
3464 lnum = 1;
3465 vcol = (long_u)win_linetabsize(wp, lnum, rex.line,
Bram Moolenaar13ed4942022-08-19 13:59:25 +01003466 (colnr_T)(rex.input - rex.line));
3467 if (!re_num_cmp(vcol + 1, scan))
3468 status = RA_NOMATCH;
3469 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003470 break;
3471
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003472 case BOW: // \<word; rex.input points to w
3473 if (c == NUL) // Can't match at end of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003474 status = RA_NOMATCH;
3475 else if (has_mbyte)
3476 {
3477 int this_class;
3478
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003479 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003480 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3481 if (this_class <= 1)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003482 status = RA_NOMATCH; // not on a word at all
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003483 else if (reg_prev_class() == this_class)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003484 status = RA_NOMATCH; // previous char is in same word
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003485 }
3486 else
3487 {
3488 if (!vim_iswordc_buf(c, rex.reg_buf) || (rex.input > rex.line
3489 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
3490 status = RA_NOMATCH;
3491 }
3492 break;
3493
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003494 case EOW: // word\>; rex.input points after d
3495 if (rex.input == rex.line) // Can't match at start of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003496 status = RA_NOMATCH;
3497 else if (has_mbyte)
3498 {
3499 int this_class, prev_class;
3500
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003501 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003502 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3503 prev_class = reg_prev_class();
3504 if (this_class == prev_class
3505 || prev_class == 0 || prev_class == 1)
3506 status = RA_NOMATCH;
3507 }
3508 else
3509 {
3510 if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
3511 || (rex.input[0] != NUL
3512 && vim_iswordc_buf(c, rex.reg_buf)))
3513 status = RA_NOMATCH;
3514 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003515 break; // Matched with EOW
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003516
3517 case ANY:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003518 // ANY does not match new lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003519 if (c == NUL)
3520 status = RA_NOMATCH;
3521 else
3522 ADVANCE_REGINPUT();
3523 break;
3524
3525 case IDENT:
3526 if (!vim_isIDc(c))
3527 status = RA_NOMATCH;
3528 else
3529 ADVANCE_REGINPUT();
3530 break;
3531
3532 case SIDENT:
3533 if (VIM_ISDIGIT(*rex.input) || !vim_isIDc(c))
3534 status = RA_NOMATCH;
3535 else
3536 ADVANCE_REGINPUT();
3537 break;
3538
3539 case KWORD:
3540 if (!vim_iswordp_buf(rex.input, rex.reg_buf))
3541 status = RA_NOMATCH;
3542 else
3543 ADVANCE_REGINPUT();
3544 break;
3545
3546 case SKWORD:
3547 if (VIM_ISDIGIT(*rex.input)
3548 || !vim_iswordp_buf(rex.input, rex.reg_buf))
3549 status = RA_NOMATCH;
3550 else
3551 ADVANCE_REGINPUT();
3552 break;
3553
3554 case FNAME:
3555 if (!vim_isfilec(c))
3556 status = RA_NOMATCH;
3557 else
3558 ADVANCE_REGINPUT();
3559 break;
3560
3561 case SFNAME:
3562 if (VIM_ISDIGIT(*rex.input) || !vim_isfilec(c))
3563 status = RA_NOMATCH;
3564 else
3565 ADVANCE_REGINPUT();
3566 break;
3567
3568 case PRINT:
3569 if (!vim_isprintc(PTR2CHAR(rex.input)))
3570 status = RA_NOMATCH;
3571 else
3572 ADVANCE_REGINPUT();
3573 break;
3574
3575 case SPRINT:
3576 if (VIM_ISDIGIT(*rex.input) || !vim_isprintc(PTR2CHAR(rex.input)))
3577 status = RA_NOMATCH;
3578 else
3579 ADVANCE_REGINPUT();
3580 break;
3581
3582 case WHITE:
3583 if (!VIM_ISWHITE(c))
3584 status = RA_NOMATCH;
3585 else
3586 ADVANCE_REGINPUT();
3587 break;
3588
3589 case NWHITE:
3590 if (c == NUL || VIM_ISWHITE(c))
3591 status = RA_NOMATCH;
3592 else
3593 ADVANCE_REGINPUT();
3594 break;
3595
3596 case DIGIT:
3597 if (!ri_digit(c))
3598 status = RA_NOMATCH;
3599 else
3600 ADVANCE_REGINPUT();
3601 break;
3602
3603 case NDIGIT:
3604 if (c == NUL || ri_digit(c))
3605 status = RA_NOMATCH;
3606 else
3607 ADVANCE_REGINPUT();
3608 break;
3609
3610 case HEX:
3611 if (!ri_hex(c))
3612 status = RA_NOMATCH;
3613 else
3614 ADVANCE_REGINPUT();
3615 break;
3616
3617 case NHEX:
3618 if (c == NUL || ri_hex(c))
3619 status = RA_NOMATCH;
3620 else
3621 ADVANCE_REGINPUT();
3622 break;
3623
3624 case OCTAL:
3625 if (!ri_octal(c))
3626 status = RA_NOMATCH;
3627 else
3628 ADVANCE_REGINPUT();
3629 break;
3630
3631 case NOCTAL:
3632 if (c == NUL || ri_octal(c))
3633 status = RA_NOMATCH;
3634 else
3635 ADVANCE_REGINPUT();
3636 break;
3637
3638 case WORD:
3639 if (!ri_word(c))
3640 status = RA_NOMATCH;
3641 else
3642 ADVANCE_REGINPUT();
3643 break;
3644
3645 case NWORD:
3646 if (c == NUL || ri_word(c))
3647 status = RA_NOMATCH;
3648 else
3649 ADVANCE_REGINPUT();
3650 break;
3651
3652 case HEAD:
3653 if (!ri_head(c))
3654 status = RA_NOMATCH;
3655 else
3656 ADVANCE_REGINPUT();
3657 break;
3658
3659 case NHEAD:
3660 if (c == NUL || ri_head(c))
3661 status = RA_NOMATCH;
3662 else
3663 ADVANCE_REGINPUT();
3664 break;
3665
3666 case ALPHA:
3667 if (!ri_alpha(c))
3668 status = RA_NOMATCH;
3669 else
3670 ADVANCE_REGINPUT();
3671 break;
3672
3673 case NALPHA:
3674 if (c == NUL || ri_alpha(c))
3675 status = RA_NOMATCH;
3676 else
3677 ADVANCE_REGINPUT();
3678 break;
3679
3680 case LOWER:
3681 if (!ri_lower(c))
3682 status = RA_NOMATCH;
3683 else
3684 ADVANCE_REGINPUT();
3685 break;
3686
3687 case NLOWER:
3688 if (c == NUL || ri_lower(c))
3689 status = RA_NOMATCH;
3690 else
3691 ADVANCE_REGINPUT();
3692 break;
3693
3694 case UPPER:
3695 if (!ri_upper(c))
3696 status = RA_NOMATCH;
3697 else
3698 ADVANCE_REGINPUT();
3699 break;
3700
3701 case NUPPER:
3702 if (c == NUL || ri_upper(c))
3703 status = RA_NOMATCH;
3704 else
3705 ADVANCE_REGINPUT();
3706 break;
3707
3708 case EXACTLY:
3709 {
3710 int len;
3711 char_u *opnd;
3712
3713 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003714 // Inline the first byte, for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003715 if (*opnd != *rex.input
3716 && (!rex.reg_ic
3717 || (!enc_utf8
3718 && MB_TOLOWER(*opnd) != MB_TOLOWER(*rex.input))))
3719 status = RA_NOMATCH;
3720 else if (*opnd == NUL)
3721 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003722 // match empty string always works; happens when "~" is
3723 // empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003724 }
3725 else
3726 {
3727 if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic))
3728 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003729 len = 1; // matched a single byte above
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003730 }
3731 else
3732 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003733 // Need to match first byte again for multi-byte.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003734 len = (int)STRLEN(opnd);
3735 if (cstrncmp(opnd, rex.input, &len) != 0)
3736 status = RA_NOMATCH;
3737 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003738 // Check for following composing character, unless %C
3739 // follows (skips over all composing chars).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003740 if (status != RA_NOMATCH
3741 && enc_utf8
3742 && UTF_COMPOSINGLIKE(rex.input, rex.input + len)
3743 && !rex.reg_icombine
3744 && OP(next) != RE_COMPOSING)
3745 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003746 // raaron: This code makes a composing character get
3747 // ignored, which is the correct behavior (sometimes)
3748 // for voweled Hebrew texts.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003749 status = RA_NOMATCH;
3750 }
3751 if (status != RA_NOMATCH)
3752 rex.input += len;
3753 }
3754 }
3755 break;
3756
3757 case ANYOF:
3758 case ANYBUT:
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01003759 {
3760 char_u *q = OPERAND(scan);
3761
3762 if (c == NUL)
3763 status = RA_NOMATCH;
3764 else if ((cstrchr(q, c) == NULL) == (op == ANYOF))
3765 status = RA_NOMATCH;
3766 else
3767 {
3768 // Check following combining characters
3769 int len = 0;
3770 int i;
3771
3772 if (enc_utf8)
3773 len = utfc_ptr2len(q) - utf_ptr2len(q);
3774
3775 MB_CPTR_ADV(rex.input);
3776 MB_CPTR_ADV(q);
3777
3778 if (!enc_utf8 || len == 0)
3779 break;
3780
3781 for (i = 0; i < len; ++i)
3782 if (q[i] != rex.input[i])
3783 {
3784 status = RA_NOMATCH;
3785 break;
3786 }
3787 rex.input += len;
3788 }
3789 break;
3790 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003791
3792 case MULTIBYTECODE:
3793 if (has_mbyte)
3794 {
3795 int i, len;
3796 char_u *opnd;
3797 int opndc = 0, inpc;
3798
3799 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003800 // Safety check (just in case 'encoding' was changed since
3801 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003802 if ((len = (*mb_ptr2len)(opnd)) < 2)
3803 {
3804 status = RA_NOMATCH;
3805 break;
3806 }
3807 if (enc_utf8)
3808 opndc = utf_ptr2char(opnd);
3809 if (enc_utf8 && utf_iscomposing(opndc))
3810 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003811 // When only a composing char is given match at any
3812 // position where that composing char appears.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003813 status = RA_NOMATCH;
3814 for (i = 0; rex.input[i] != NUL;
3815 i += utf_ptr2len(rex.input + i))
3816 {
3817 inpc = utf_ptr2char(rex.input + i);
3818 if (!utf_iscomposing(inpc))
3819 {
3820 if (i > 0)
3821 break;
3822 }
3823 else if (opndc == inpc)
3824 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003825 // Include all following composing chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003826 len = i + utfc_ptr2len(rex.input + i);
3827 status = RA_MATCH;
3828 break;
3829 }
3830 }
3831 }
Christian Brabandt22e8e122024-07-30 20:39:18 +02003832 else if (enc_utf8)
3833 {
3834 if (cstrncmp(opnd, rex.input, &len) != 0)
3835 {
3836 status = RA_NOMATCH;
3837 break;
3838 }
3839 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003840 else
3841 for (i = 0; i < len; ++i)
3842 if (opnd[i] != rex.input[i])
3843 {
3844 status = RA_NOMATCH;
3845 break;
3846 }
3847 rex.input += len;
3848 }
3849 else
3850 status = RA_NOMATCH;
3851 break;
3852 case RE_COMPOSING:
3853 if (enc_utf8)
3854 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003855 // Skip composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003856 while (utf_iscomposing(utf_ptr2char(rex.input)))
3857 MB_CPTR_ADV(rex.input);
3858 }
3859 break;
3860
3861 case NOTHING:
3862 break;
3863
3864 case BACK:
3865 {
3866 int i;
3867 backpos_T *bp;
3868
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003869 // When we run into BACK we need to check if we don't keep
3870 // looping without matching any input. The second and later
3871 // times a BACK is encountered it fails if the input is still
3872 // at the same position as the previous time.
3873 // The positions are stored in "backpos" and found by the
3874 // current value of "scan", the position in the RE program.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003875 bp = (backpos_T *)backpos.ga_data;
3876 for (i = 0; i < backpos.ga_len; ++i)
3877 if (bp[i].bp_scan == scan)
3878 break;
3879 if (i == backpos.ga_len)
3880 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003881 // First time at this BACK, make room to store the pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003882 if (ga_grow(&backpos, 1) == FAIL)
3883 status = RA_FAIL;
3884 else
3885 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003886 // get "ga_data" again, it may have changed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003887 bp = (backpos_T *)backpos.ga_data;
3888 bp[i].bp_scan = scan;
3889 ++backpos.ga_len;
3890 }
3891 }
3892 else if (reg_save_equal(&bp[i].bp_pos))
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003893 // Still at same position as last time, fail.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003894 status = RA_NOMATCH;
3895
3896 if (status != RA_FAIL && status != RA_NOMATCH)
3897 reg_save(&bp[i].bp_pos, &backpos);
3898 }
3899 break;
3900
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003901 case MOPEN + 0: // Match start: \zs
3902 case MOPEN + 1: // \(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003903 case MOPEN + 2:
3904 case MOPEN + 3:
3905 case MOPEN + 4:
3906 case MOPEN + 5:
3907 case MOPEN + 6:
3908 case MOPEN + 7:
3909 case MOPEN + 8:
3910 case MOPEN + 9:
3911 {
3912 no = op - MOPEN;
3913 cleanup_subexpr();
3914 rp = regstack_push(RS_MOPEN, scan);
3915 if (rp == NULL)
3916 status = RA_FAIL;
3917 else
3918 {
3919 rp->rs_no = no;
3920 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
3921 &rex.reg_startp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003922 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003923 }
3924 }
3925 break;
3926
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003927 case NOPEN: // \%(
3928 case NCLOSE: // \) after \%(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003929 if (regstack_push(RS_NOPEN, scan) == NULL)
3930 status = RA_FAIL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003931 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003932 break;
3933
3934#ifdef FEAT_SYN_HL
3935 case ZOPEN + 1:
3936 case ZOPEN + 2:
3937 case ZOPEN + 3:
3938 case ZOPEN + 4:
3939 case ZOPEN + 5:
3940 case ZOPEN + 6:
3941 case ZOPEN + 7:
3942 case ZOPEN + 8:
3943 case ZOPEN + 9:
3944 {
3945 no = op - ZOPEN;
3946 cleanup_zsubexpr();
3947 rp = regstack_push(RS_ZOPEN, scan);
3948 if (rp == NULL)
3949 status = RA_FAIL;
3950 else
3951 {
3952 rp->rs_no = no;
3953 save_se(&rp->rs_un.sesave, &reg_startzpos[no],
3954 &reg_startzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003955 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003956 }
3957 }
3958 break;
3959#endif
3960
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003961 case MCLOSE + 0: // Match end: \ze
3962 case MCLOSE + 1: // \)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003963 case MCLOSE + 2:
3964 case MCLOSE + 3:
3965 case MCLOSE + 4:
3966 case MCLOSE + 5:
3967 case MCLOSE + 6:
3968 case MCLOSE + 7:
3969 case MCLOSE + 8:
3970 case MCLOSE + 9:
3971 {
3972 no = op - MCLOSE;
3973 cleanup_subexpr();
3974 rp = regstack_push(RS_MCLOSE, scan);
3975 if (rp == NULL)
3976 status = RA_FAIL;
3977 else
3978 {
3979 rp->rs_no = no;
3980 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no],
3981 &rex.reg_endp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003982 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003983 }
3984 }
3985 break;
3986
3987#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003988 case ZCLOSE + 1: // \) after \z(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003989 case ZCLOSE + 2:
3990 case ZCLOSE + 3:
3991 case ZCLOSE + 4:
3992 case ZCLOSE + 5:
3993 case ZCLOSE + 6:
3994 case ZCLOSE + 7:
3995 case ZCLOSE + 8:
3996 case ZCLOSE + 9:
3997 {
3998 no = op - ZCLOSE;
3999 cleanup_zsubexpr();
4000 rp = regstack_push(RS_ZCLOSE, scan);
4001 if (rp == NULL)
4002 status = RA_FAIL;
4003 else
4004 {
4005 rp->rs_no = no;
4006 save_se(&rp->rs_un.sesave, &reg_endzpos[no],
4007 &reg_endzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004008 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004009 }
4010 }
4011 break;
4012#endif
4013
4014 case BACKREF + 1:
4015 case BACKREF + 2:
4016 case BACKREF + 3:
4017 case BACKREF + 4:
4018 case BACKREF + 5:
4019 case BACKREF + 6:
4020 case BACKREF + 7:
4021 case BACKREF + 8:
4022 case BACKREF + 9:
4023 {
4024 int len;
4025
4026 no = op - BACKREF;
4027 cleanup_subexpr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004028 if (!REG_MULTI) // Single-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004029 {
4030 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL)
4031 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004032 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004033 len = 0;
4034 }
4035 else
4036 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004037 // Compare current input with back-ref in the same
4038 // line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004039 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
4040 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0)
4041 status = RA_NOMATCH;
4042 }
4043 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004044 else // Multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004045 {
4046 if (rex.reg_startpos[no].lnum < 0
4047 || rex.reg_endpos[no].lnum < 0)
4048 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004049 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004050 len = 0;
4051 }
4052 else
4053 {
4054 if (rex.reg_startpos[no].lnum == rex.lnum
4055 && rex.reg_endpos[no].lnum == rex.lnum)
4056 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004057 // Compare back-ref within the current line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004058 len = rex.reg_endpos[no].col
4059 - rex.reg_startpos[no].col;
4060 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
4061 rex.input, &len) != 0)
4062 status = RA_NOMATCH;
4063 }
4064 else
4065 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004066 // Messy situation: Need to compare between two
4067 // lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004068 int r = match_with_backref(
4069 rex.reg_startpos[no].lnum,
4070 rex.reg_startpos[no].col,
4071 rex.reg_endpos[no].lnum,
4072 rex.reg_endpos[no].col,
4073 &len);
4074
4075 if (r != RA_MATCH)
4076 status = r;
4077 }
4078 }
4079 }
4080
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004081 // Matched the backref, skip over it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004082 rex.input += len;
4083 }
4084 break;
4085
4086#ifdef FEAT_SYN_HL
4087 case ZREF + 1:
4088 case ZREF + 2:
4089 case ZREF + 3:
4090 case ZREF + 4:
4091 case ZREF + 5:
4092 case ZREF + 6:
4093 case ZREF + 7:
4094 case ZREF + 8:
4095 case ZREF + 9:
4096 {
4097 int len;
4098
4099 cleanup_zsubexpr();
4100 no = op - ZREF;
4101 if (re_extmatch_in != NULL
4102 && re_extmatch_in->matches[no] != NULL)
4103 {
4104 len = (int)STRLEN(re_extmatch_in->matches[no]);
4105 if (cstrncmp(re_extmatch_in->matches[no],
4106 rex.input, &len) != 0)
4107 status = RA_NOMATCH;
4108 else
4109 rex.input += len;
4110 }
4111 else
4112 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004113 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004114 }
4115 }
4116 break;
4117#endif
4118
4119 case BRANCH:
4120 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004121 if (OP(next) != BRANCH) // No choice.
4122 next = OPERAND(scan); // Avoid recursion.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004123 else
4124 {
4125 rp = regstack_push(RS_BRANCH, scan);
4126 if (rp == NULL)
4127 status = RA_FAIL;
4128 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004129 status = RA_BREAK; // rest is below
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004130 }
4131 }
4132 break;
4133
4134 case BRACE_LIMITS:
4135 {
4136 if (OP(next) == BRACE_SIMPLE)
4137 {
4138 bl_minval = OPERAND_MIN(scan);
4139 bl_maxval = OPERAND_MAX(scan);
4140 }
4141 else if (OP(next) >= BRACE_COMPLEX
4142 && OP(next) < BRACE_COMPLEX + 10)
4143 {
4144 no = OP(next) - BRACE_COMPLEX;
4145 brace_min[no] = OPERAND_MIN(scan);
4146 brace_max[no] = OPERAND_MAX(scan);
4147 brace_count[no] = 0;
4148 }
4149 else
4150 {
4151 internal_error("BRACE_LIMITS");
4152 status = RA_FAIL;
4153 }
4154 }
4155 break;
4156
4157 case BRACE_COMPLEX + 0:
4158 case BRACE_COMPLEX + 1:
4159 case BRACE_COMPLEX + 2:
4160 case BRACE_COMPLEX + 3:
4161 case BRACE_COMPLEX + 4:
4162 case BRACE_COMPLEX + 5:
4163 case BRACE_COMPLEX + 6:
4164 case BRACE_COMPLEX + 7:
4165 case BRACE_COMPLEX + 8:
4166 case BRACE_COMPLEX + 9:
4167 {
4168 no = op - BRACE_COMPLEX;
4169 ++brace_count[no];
4170
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004171 // If not matched enough times yet, try one more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004172 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4173 ? brace_min[no] : brace_max[no]))
4174 {
4175 rp = regstack_push(RS_BRCPLX_MORE, scan);
4176 if (rp == NULL)
4177 status = RA_FAIL;
4178 else
4179 {
4180 rp->rs_no = no;
4181 reg_save(&rp->rs_un.regsave, &backpos);
4182 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004183 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004184 }
4185 break;
4186 }
4187
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004188 // If matched enough times, may try matching some more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004189 if (brace_min[no] <= brace_max[no])
4190 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004191 // Range is the normal way around, use longest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004192 if (brace_count[no] <= brace_max[no])
4193 {
4194 rp = regstack_push(RS_BRCPLX_LONG, scan);
4195 if (rp == NULL)
4196 status = RA_FAIL;
4197 else
4198 {
4199 rp->rs_no = no;
4200 reg_save(&rp->rs_un.regsave, &backpos);
4201 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004202 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004203 }
4204 }
4205 }
4206 else
4207 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004208 // Range is backwards, use shortest match first
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004209 if (brace_count[no] <= brace_min[no])
4210 {
4211 rp = regstack_push(RS_BRCPLX_SHORT, scan);
4212 if (rp == NULL)
4213 status = RA_FAIL;
4214 else
4215 {
4216 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004217 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004218 }
4219 }
4220 }
4221 }
4222 break;
4223
4224 case BRACE_SIMPLE:
4225 case STAR:
4226 case PLUS:
4227 {
4228 regstar_T rst;
4229
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004230 // Lookahead to avoid useless match attempts when we know
4231 // what character comes next.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004232 if (OP(next) == EXACTLY)
4233 {
4234 rst.nextb = *OPERAND(next);
4235 if (rex.reg_ic)
4236 {
4237 if (MB_ISUPPER(rst.nextb))
4238 rst.nextb_ic = MB_TOLOWER(rst.nextb);
4239 else
4240 rst.nextb_ic = MB_TOUPPER(rst.nextb);
4241 }
4242 else
4243 rst.nextb_ic = rst.nextb;
4244 }
4245 else
4246 {
4247 rst.nextb = NUL;
4248 rst.nextb_ic = NUL;
4249 }
4250 if (op != BRACE_SIMPLE)
4251 {
4252 rst.minval = (op == STAR) ? 0 : 1;
4253 rst.maxval = MAX_LIMIT;
4254 }
4255 else
4256 {
4257 rst.minval = bl_minval;
4258 rst.maxval = bl_maxval;
4259 }
4260
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004261 // When maxval > minval, try matching as much as possible, up
4262 // to maxval. When maxval < minval, try matching at least the
4263 // minimal number (since the range is backwards, that's also
4264 // maxval!).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004265 rst.count = regrepeat(OPERAND(scan), rst.maxval);
4266 if (got_int)
4267 {
4268 status = RA_FAIL;
4269 break;
4270 }
4271 if (rst.minval <= rst.maxval
4272 ? rst.count >= rst.minval : rst.count >= rst.maxval)
4273 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004274 // It could match. Prepare for trying to match what
4275 // follows. The code is below. Parameters are stored in
4276 // a regstar_T on the regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004277 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4278 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004279 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004280 status = RA_FAIL;
4281 }
4282 else if (ga_grow(&regstack, sizeof(regstar_T)) == FAIL)
4283 status = RA_FAIL;
4284 else
4285 {
4286 regstack.ga_len += sizeof(regstar_T);
4287 rp = regstack_push(rst.minval <= rst.maxval
4288 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4289 if (rp == NULL)
4290 status = RA_FAIL;
4291 else
4292 {
4293 *(((regstar_T *)rp) - 1) = rst;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004294 status = RA_BREAK; // skip the restore bits
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004295 }
4296 }
4297 }
4298 else
4299 status = RA_NOMATCH;
4300
4301 }
4302 break;
4303
4304 case NOMATCH:
4305 case MATCH:
4306 case SUBPAT:
4307 rp = regstack_push(RS_NOMATCH, scan);
4308 if (rp == NULL)
4309 status = RA_FAIL;
4310 else
4311 {
4312 rp->rs_no = op;
4313 reg_save(&rp->rs_un.regsave, &backpos);
4314 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004315 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004316 }
4317 break;
4318
4319 case BEHIND:
4320 case NOBEHIND:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004321 // Need a bit of room to store extra positions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004322 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4323 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004324 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004325 status = RA_FAIL;
4326 }
4327 else if (ga_grow(&regstack, sizeof(regbehind_T)) == FAIL)
4328 status = RA_FAIL;
4329 else
4330 {
4331 regstack.ga_len += sizeof(regbehind_T);
4332 rp = regstack_push(RS_BEHIND1, scan);
4333 if (rp == NULL)
4334 status = RA_FAIL;
4335 else
4336 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004337 // Need to save the subexpr to be able to restore them
4338 // when there is a match but we don't use it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004339 save_subexpr(((regbehind_T *)rp) - 1);
4340
4341 rp->rs_no = op;
4342 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004343 // First try if what follows matches. If it does then we
4344 // check the behind match by looping.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004345 }
4346 }
4347 break;
4348
4349 case BHPOS:
4350 if (REG_MULTI)
4351 {
4352 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4353 || behind_pos.rs_u.pos.lnum != rex.lnum)
4354 status = RA_NOMATCH;
4355 }
4356 else if (behind_pos.rs_u.ptr != rex.input)
4357 status = RA_NOMATCH;
4358 break;
4359
4360 case NEWL:
4361 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4362 || rex.reg_line_lbr)
4363 && (c != '\n' || !rex.reg_line_lbr))
4364 status = RA_NOMATCH;
4365 else if (rex.reg_line_lbr)
4366 ADVANCE_REGINPUT();
4367 else
4368 reg_nextline();
4369 break;
4370
4371 case END:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004372 status = RA_MATCH; // Success!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004373 break;
4374
4375 default:
RestorerZ68ebcee2023-05-31 17:12:14 +01004376 iemsg(e_corrupted_regexp_program);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004377#ifdef DEBUG
4378 printf("Illegal op code %d\n", op);
4379#endif
4380 status = RA_FAIL;
4381 break;
4382 }
4383 }
4384
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004385 // If we can't continue sequentially, break the inner loop.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004386 if (status != RA_CONT)
4387 break;
4388
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004389 // Continue in inner loop, advance to next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004390 scan = next;
4391
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004392 } // end of inner loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004393
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004394 // If there is something on the regstack execute the code for the state.
4395 // If the state is popped then loop and use the older state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004396 while (regstack.ga_len > 0 && status != RA_FAIL)
4397 {
4398 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4399 switch (rp->rs_state)
4400 {
4401 case RS_NOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004402 // Result is passed on as-is, simply pop the state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004403 regstack_pop(&scan);
4404 break;
4405
4406 case RS_MOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004407 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004408 if (status == RA_NOMATCH)
4409 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4410 &rex.reg_startp[rp->rs_no]);
4411 regstack_pop(&scan);
4412 break;
4413
4414#ifdef FEAT_SYN_HL
4415 case RS_ZOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004416 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004417 if (status == RA_NOMATCH)
4418 restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4419 &reg_startzp[rp->rs_no]);
4420 regstack_pop(&scan);
4421 break;
4422#endif
4423
4424 case RS_MCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004425 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004426 if (status == RA_NOMATCH)
4427 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4428 &rex.reg_endp[rp->rs_no]);
4429 regstack_pop(&scan);
4430 break;
4431
4432#ifdef FEAT_SYN_HL
4433 case RS_ZCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004434 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004435 if (status == RA_NOMATCH)
4436 restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4437 &reg_endzp[rp->rs_no]);
4438 regstack_pop(&scan);
4439 break;
4440#endif
4441
4442 case RS_BRANCH:
4443 if (status == RA_MATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004444 // this branch matched, use it
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004445 regstack_pop(&scan);
4446 else
4447 {
4448 if (status != RA_BREAK)
4449 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004450 // After a non-matching branch: try next one.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004451 reg_restore(&rp->rs_un.regsave, &backpos);
4452 scan = rp->rs_scan;
4453 }
4454 if (scan == NULL || OP(scan) != BRANCH)
4455 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004456 // no more branches, didn't find a match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004457 status = RA_NOMATCH;
4458 regstack_pop(&scan);
4459 }
4460 else
4461 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004462 // Prepare to try a branch.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004463 rp->rs_scan = regnext(scan);
4464 reg_save(&rp->rs_un.regsave, &backpos);
4465 scan = OPERAND(scan);
4466 }
4467 }
4468 break;
4469
4470 case RS_BRCPLX_MORE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004471 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004472 if (status == RA_NOMATCH)
4473 {
4474 reg_restore(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004475 --brace_count[rp->rs_no]; // decrement match count
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004476 }
4477 regstack_pop(&scan);
4478 break;
4479
4480 case RS_BRCPLX_LONG:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004481 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004482 if (status == RA_NOMATCH)
4483 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004484 // There was no match, but we did find enough matches.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004485 reg_restore(&rp->rs_un.regsave, &backpos);
4486 --brace_count[rp->rs_no];
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004487 // continue with the items after "\{}"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004488 status = RA_CONT;
4489 }
4490 regstack_pop(&scan);
4491 if (status == RA_CONT)
4492 scan = regnext(scan);
4493 break;
4494
4495 case RS_BRCPLX_SHORT:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004496 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004497 if (status == RA_NOMATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004498 // There was no match, try to match one more item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004499 reg_restore(&rp->rs_un.regsave, &backpos);
4500 regstack_pop(&scan);
4501 if (status == RA_NOMATCH)
4502 {
4503 scan = OPERAND(scan);
4504 status = RA_CONT;
4505 }
4506 break;
4507
4508 case RS_NOMATCH:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004509 // Pop the state. If the operand matches for NOMATCH or
4510 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4511 // except for SUBPAT, and continue with the next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004512 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4513 status = RA_NOMATCH;
4514 else
4515 {
4516 status = RA_CONT;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004517 if (rp->rs_no != SUBPAT) // zero-width
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004518 reg_restore(&rp->rs_un.regsave, &backpos);
4519 }
4520 regstack_pop(&scan);
4521 if (status == RA_CONT)
4522 scan = regnext(scan);
4523 break;
4524
4525 case RS_BEHIND1:
4526 if (status == RA_NOMATCH)
4527 {
4528 regstack_pop(&scan);
4529 regstack.ga_len -= sizeof(regbehind_T);
4530 }
4531 else
4532 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004533 // The stuff after BEHIND/NOBEHIND matches. Now try if
4534 // the behind part does (not) match before the current
4535 // position in the input. This must be done at every
4536 // position in the input and checking if the match ends at
4537 // the current position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004538
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004539 // save the position after the found match for next
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004540 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4541
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004542 // Start looking for a match with operand at the current
4543 // position. Go back one character until we find the
4544 // result, hitting the start of the line or the previous
4545 // line (for multi-line matching).
4546 // Set behind_pos to where the match should end, BHPOS
4547 // will match it. Save the current value.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004548 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4549 behind_pos = rp->rs_un.regsave;
4550
4551 rp->rs_state = RS_BEHIND2;
4552
4553 reg_restore(&rp->rs_un.regsave, &backpos);
4554 scan = OPERAND(rp->rs_scan) + 4;
4555 }
4556 break;
4557
4558 case RS_BEHIND2:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004559 // Looping for BEHIND / NOBEHIND match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004560 if (status == RA_MATCH && reg_save_equal(&behind_pos))
4561 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004562 // found a match that ends where "next" started
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004563 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4564 if (rp->rs_no == BEHIND)
4565 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4566 &backpos);
4567 else
4568 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004569 // But we didn't want a match. Need to restore the
4570 // subexpr, because what follows matched, so they have
4571 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004572 status = RA_NOMATCH;
4573 restore_subexpr(((regbehind_T *)rp) - 1);
4574 }
4575 regstack_pop(&scan);
4576 regstack.ga_len -= sizeof(regbehind_T);
4577 }
4578 else
4579 {
4580 long limit;
4581
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004582 // No match or a match that doesn't end where we want it: Go
4583 // back one character. May go to previous line once.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004584 no = OK;
4585 limit = OPERAND_MIN(rp->rs_scan);
4586 if (REG_MULTI)
4587 {
4588 if (limit > 0
4589 && ((rp->rs_un.regsave.rs_u.pos.lnum
4590 < behind_pos.rs_u.pos.lnum
4591 ? (colnr_T)STRLEN(rex.line)
4592 : behind_pos.rs_u.pos.col)
4593 - rp->rs_un.regsave.rs_u.pos.col >= limit))
4594 no = FAIL;
4595 else if (rp->rs_un.regsave.rs_u.pos.col == 0)
4596 {
4597 if (rp->rs_un.regsave.rs_u.pos.lnum
4598 < behind_pos.rs_u.pos.lnum
4599 || reg_getline(
4600 --rp->rs_un.regsave.rs_u.pos.lnum)
4601 == NULL)
4602 no = FAIL;
4603 else
4604 {
4605 reg_restore(&rp->rs_un.regsave, &backpos);
4606 rp->rs_un.regsave.rs_u.pos.col =
4607 (colnr_T)STRLEN(rex.line);
4608 }
4609 }
4610 else
4611 {
4612 if (has_mbyte)
4613 {
4614 char_u *line =
4615 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
4616
4617 rp->rs_un.regsave.rs_u.pos.col -=
4618 (*mb_head_off)(line, line
4619 + rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
4620 }
4621 else
4622 --rp->rs_un.regsave.rs_u.pos.col;
4623 }
4624 }
4625 else
4626 {
4627 if (rp->rs_un.regsave.rs_u.ptr == rex.line)
4628 no = FAIL;
4629 else
4630 {
4631 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
4632 if (limit > 0 && (long)(behind_pos.rs_u.ptr
4633 - rp->rs_un.regsave.rs_u.ptr) > limit)
4634 no = FAIL;
4635 }
4636 }
4637 if (no == OK)
4638 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004639 // Advanced, prepare for finding match again.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004640 reg_restore(&rp->rs_un.regsave, &backpos);
4641 scan = OPERAND(rp->rs_scan) + 4;
4642 if (status == RA_MATCH)
4643 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004644 // We did match, so subexpr may have been changed,
4645 // need to restore them for the next try.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004646 status = RA_NOMATCH;
4647 restore_subexpr(((regbehind_T *)rp) - 1);
4648 }
4649 }
4650 else
4651 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004652 // Can't advance. For NOBEHIND that's a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004653 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4654 if (rp->rs_no == NOBEHIND)
4655 {
4656 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4657 &backpos);
4658 status = RA_MATCH;
4659 }
4660 else
4661 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004662 // We do want a proper match. Need to restore the
4663 // subexpr if we had a match, because they may have
4664 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004665 if (status == RA_MATCH)
4666 {
4667 status = RA_NOMATCH;
4668 restore_subexpr(((regbehind_T *)rp) - 1);
4669 }
4670 }
4671 regstack_pop(&scan);
4672 regstack.ga_len -= sizeof(regbehind_T);
4673 }
4674 }
4675 break;
4676
4677 case RS_STAR_LONG:
4678 case RS_STAR_SHORT:
4679 {
4680 regstar_T *rst = ((regstar_T *)rp) - 1;
4681
4682 if (status == RA_MATCH)
4683 {
4684 regstack_pop(&scan);
4685 regstack.ga_len -= sizeof(regstar_T);
4686 break;
4687 }
4688
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004689 // Tried once already, restore input pointers.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004690 if (status != RA_BREAK)
4691 reg_restore(&rp->rs_un.regsave, &backpos);
4692
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004693 // Repeat until we found a position where it could match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004694 for (;;)
4695 {
4696 if (status != RA_BREAK)
4697 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004698 // Tried first position already, advance.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004699 if (rp->rs_state == RS_STAR_LONG)
4700 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004701 // Trying for longest match, but couldn't or
4702 // didn't match -- back up one char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004703 if (--rst->count < rst->minval)
4704 break;
4705 if (rex.input == rex.line)
4706 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004707 // backup to last char of previous line
Bram Moolenaar6456fae2022-02-22 13:37:31 +00004708 if (rex.lnum == 0)
4709 {
4710 status = RA_NOMATCH;
4711 break;
4712 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004713 --rex.lnum;
4714 rex.line = reg_getline(rex.lnum);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004715 // Just in case regrepeat() didn't count
4716 // right.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004717 if (rex.line == NULL)
4718 break;
John Marriott82792db2024-05-12 00:07:17 +02004719 rex.input = rex.line + reg_getline_len(rex.lnum);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004720 fast_breakcheck();
4721 }
4722 else
4723 MB_PTR_BACK(rex.line, rex.input);
4724 }
4725 else
4726 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004727 // Range is backwards, use shortest match first.
4728 // Careful: maxval and minval are exchanged!
4729 // Couldn't or didn't match: try advancing one
4730 // char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004731 if (rst->count == rst->minval
4732 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
4733 break;
4734 ++rst->count;
4735 }
4736 if (got_int)
4737 break;
4738 }
4739 else
4740 status = RA_NOMATCH;
4741
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004742 // If it could match, try it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004743 if (rst->nextb == NUL || *rex.input == rst->nextb
4744 || *rex.input == rst->nextb_ic)
4745 {
4746 reg_save(&rp->rs_un.regsave, &backpos);
4747 scan = regnext(rp->rs_scan);
4748 status = RA_CONT;
4749 break;
4750 }
4751 }
4752 if (status != RA_CONT)
4753 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004754 // Failed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004755 regstack_pop(&scan);
4756 regstack.ga_len -= sizeof(regstar_T);
4757 status = RA_NOMATCH;
4758 }
4759 }
4760 break;
4761 }
4762
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004763 // If we want to continue the inner loop or didn't pop a state
4764 // continue matching loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004765 if (status == RA_CONT || rp == (regitem_T *)
4766 ((char *)regstack.ga_data + regstack.ga_len) - 1)
4767 break;
Bram Moolenaar616592e2022-06-17 15:17:10 +01004768
4769#ifdef FEAT_RELTIME
4770 if (bt_did_time_out(timed_out))
4771 {
4772 status = RA_FAIL;
4773 break;
4774 }
4775#endif
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004776 }
4777
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004778 // May need to continue with the inner loop, starting at "scan".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004779 if (status == RA_CONT)
4780 continue;
4781
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004782 // If the regstack is empty or something failed we are done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004783 if (regstack.ga_len == 0 || status == RA_FAIL)
4784 {
4785 if (scan == NULL)
4786 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004787 // We get here only if there's trouble -- normally "case END" is
4788 // the terminating point.
RestorerZ68ebcee2023-05-31 17:12:14 +01004789 iemsg(e_corrupted_regexp_program);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004790#ifdef DEBUG
4791 printf("Premature EOL\n");
4792#endif
4793 }
4794 return (status == RA_MATCH);
4795 }
4796
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004797 } // End of loop until the regstack is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004798
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004799 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004800}
4801
4802/*
4803 * regtry - try match of "prog" with at rex.line["col"].
4804 * Returns 0 for failure, number of lines contained in the match otherwise.
4805 */
4806 static long
4807regtry(
4808 bt_regprog_T *prog,
4809 colnr_T col,
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004810 int *timed_out) // flag set on timeout or NULL
4811{
4812 rex.input = rex.line + col;
4813 rex.need_clear_subexpr = TRUE;
4814#ifdef FEAT_SYN_HL
4815 // Clear the external match subpointers if necessary.
4816 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
4817#endif
4818
Paul Ollis65745772022-06-05 16:55:54 +01004819 if (regmatch(prog->program + 1, timed_out) == 0)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004820 return 0;
4821
4822 cleanup_subexpr();
4823 if (REG_MULTI)
4824 {
4825 if (rex.reg_startpos[0].lnum < 0)
4826 {
4827 rex.reg_startpos[0].lnum = 0;
4828 rex.reg_startpos[0].col = col;
4829 }
4830 if (rex.reg_endpos[0].lnum < 0)
4831 {
4832 rex.reg_endpos[0].lnum = rex.lnum;
4833 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
4834 }
4835 else
4836 // Use line number of "\ze".
4837 rex.lnum = rex.reg_endpos[0].lnum;
4838 }
4839 else
4840 {
4841 if (rex.reg_startp[0] == NULL)
4842 rex.reg_startp[0] = rex.line + col;
4843 if (rex.reg_endp[0] == NULL)
4844 rex.reg_endp[0] = rex.input;
4845 }
4846#ifdef FEAT_SYN_HL
4847 // Package any found \z(...\) matches for export. Default is none.
4848 unref_extmatch(re_extmatch_out);
4849 re_extmatch_out = NULL;
4850
4851 if (prog->reghasz == REX_SET)
4852 {
4853 int i;
4854
4855 cleanup_zsubexpr();
4856 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01004857 if (re_extmatch_out == NULL)
4858 return 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004859 for (i = 0; i < NSUBEXP; i++)
4860 {
4861 if (REG_MULTI)
4862 {
4863 // Only accept single line matches.
4864 if (reg_startzpos[i].lnum >= 0
4865 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
4866 && reg_endzpos[i].col >= reg_startzpos[i].col)
4867 re_extmatch_out->matches[i] =
4868 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
4869 + reg_startzpos[i].col,
4870 reg_endzpos[i].col - reg_startzpos[i].col);
4871 }
4872 else
4873 {
4874 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
4875 re_extmatch_out->matches[i] =
4876 vim_strnsave(reg_startzp[i],
Bram Moolenaar71ccd032020-06-12 22:59:11 +02004877 reg_endzp[i] - reg_startzp[i]);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004878 }
4879 }
4880 }
4881#endif
4882 return 1 + rex.lnum;
4883}
4884
4885/*
4886 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02004887 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004888 * Returns 0 for failure, number of lines contained in the match otherwise.
4889 */
4890 static long
4891bt_regexec_both(
4892 char_u *line,
Bram Moolenaar01105b32022-11-26 11:47:10 +00004893 colnr_T startcol, // column to start looking for match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004894 int *timed_out) // flag set on timeout or NULL
4895{
4896 bt_regprog_T *prog;
4897 char_u *s;
Bram Moolenaar01105b32022-11-26 11:47:10 +00004898 colnr_T col = startcol;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004899 long retval = 0L;
4900
4901 // Create "regstack" and "backpos" if they are not allocated yet.
4902 // We allocate *_INITIAL amount of bytes first and then set the grow size
4903 // to much bigger value to avoid many malloc calls in case of deep regular
4904 // expressions.
4905 if (regstack.ga_data == NULL)
4906 {
4907 // Use an item size of 1 byte, since we push different things
4908 // onto the regstack.
4909 ga_init2(&regstack, 1, REGSTACK_INITIAL);
4910 (void)ga_grow(&regstack, REGSTACK_INITIAL);
4911 regstack.ga_growsize = REGSTACK_INITIAL * 8;
4912 }
4913
4914 if (backpos.ga_data == NULL)
4915 {
4916 ga_init2(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
4917 (void)ga_grow(&backpos, BACKPOS_INITIAL);
4918 backpos.ga_growsize = BACKPOS_INITIAL * 8;
4919 }
4920
4921 if (REG_MULTI)
4922 {
4923 prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
4924 line = reg_getline((linenr_T)0);
4925 rex.reg_startpos = rex.reg_mmatch->startpos;
4926 rex.reg_endpos = rex.reg_mmatch->endpos;
4927 }
4928 else
4929 {
4930 prog = (bt_regprog_T *)rex.reg_match->regprog;
4931 rex.reg_startp = rex.reg_match->startp;
4932 rex.reg_endp = rex.reg_match->endp;
4933 }
4934
4935 // Be paranoid...
4936 if (prog == NULL || line == NULL)
4937 {
RestorerZ68ebcee2023-05-31 17:12:14 +01004938 iemsg(e_null_argument);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004939 goto theend;
4940 }
4941
4942 // Check validity of program.
4943 if (prog_magic_wrong())
4944 goto theend;
4945
4946 // If the start column is past the maximum column: no need to try.
4947 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4948 goto theend;
4949
4950 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
4951 if (prog->regflags & RF_ICASE)
4952 rex.reg_ic = TRUE;
4953 else if (prog->regflags & RF_NOICASE)
4954 rex.reg_ic = FALSE;
4955
4956 // If pattern contains "\Z" overrule value of rex.reg_icombine
4957 if (prog->regflags & RF_ICOMBINE)
4958 rex.reg_icombine = TRUE;
4959
4960 // If there is a "must appear" string, look for it.
4961 if (prog->regmust != NULL)
4962 {
4963 int c;
4964
4965 if (has_mbyte)
4966 c = (*mb_ptr2char)(prog->regmust);
4967 else
4968 c = *prog->regmust;
4969 s = line + col;
4970
4971 // This is used very often, esp. for ":global". Use three versions of
4972 // the loop to avoid overhead of conditions.
4973 if (!rex.reg_ic && !has_mbyte)
4974 while ((s = vim_strbyte(s, c)) != NULL)
4975 {
4976 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4977 break; // Found it.
4978 ++s;
4979 }
4980 else if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
4981 while ((s = vim_strchr(s, c)) != NULL)
4982 {
4983 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4984 break; // Found it.
4985 MB_PTR_ADV(s);
4986 }
4987 else
4988 while ((s = cstrchr(s, c)) != NULL)
4989 {
4990 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4991 break; // Found it.
4992 MB_PTR_ADV(s);
4993 }
4994 if (s == NULL) // Not present.
4995 goto theend;
4996 }
4997
4998 rex.line = line;
4999 rex.lnum = 0;
5000 reg_toolong = FALSE;
5001
5002 // Simplest case: Anchored match need be tried only once.
5003 if (prog->reganch)
5004 {
5005 int c;
5006
5007 if (has_mbyte)
5008 c = (*mb_ptr2char)(rex.line + col);
5009 else
5010 c = rex.line[col];
5011 if (prog->regstart == NUL
5012 || prog->regstart == c
5013 || (rex.reg_ic
5014 && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
5015 || (c < 255 && prog->regstart < 255 &&
5016 MB_TOLOWER(prog->regstart) == MB_TOLOWER(c)))))
Paul Ollis65745772022-06-05 16:55:54 +01005017 retval = regtry(prog, col, timed_out);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005018 else
5019 retval = 0;
5020 }
5021 else
5022 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005023 // Messy cases: unanchored match.
5024 while (!got_int)
5025 {
5026 if (prog->regstart != NUL)
5027 {
5028 // Skip until the char we know it must start with.
5029 // Used often, do some work to avoid call overhead.
5030 if (!rex.reg_ic && !has_mbyte)
5031 s = vim_strbyte(rex.line + col, prog->regstart);
5032 else
5033 s = cstrchr(rex.line + col, prog->regstart);
5034 if (s == NULL)
5035 {
5036 retval = 0;
5037 break;
5038 }
5039 col = (int)(s - rex.line);
5040 }
5041
5042 // Check for maximum column to try.
5043 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
5044 {
5045 retval = 0;
5046 break;
5047 }
5048
Paul Ollis65745772022-06-05 16:55:54 +01005049 retval = regtry(prog, col, timed_out);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005050 if (retval > 0)
5051 break;
5052
5053 // if not currently on the first line, get it again
5054 if (rex.lnum != 0)
5055 {
5056 rex.lnum = 0;
5057 rex.line = reg_getline((linenr_T)0);
5058 }
5059 if (rex.line[col] == NUL)
5060 break;
5061 if (has_mbyte)
5062 col += (*mb_ptr2len)(rex.line + col);
5063 else
5064 ++col;
5065#ifdef FEAT_RELTIME
Bram Moolenaar616592e2022-06-17 15:17:10 +01005066 if (bt_did_time_out(timed_out))
Paul Ollis65745772022-06-05 16:55:54 +01005067 break;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005068#endif
5069 }
5070 }
5071
5072theend:
5073 // Free "reg_tofree" when it's a bit big.
5074 // Free regstack and backpos if they are bigger than their initial size.
5075 if (reg_tofreelen > 400)
5076 VIM_CLEAR(reg_tofree);
5077 if (regstack.ga_maxlen > REGSTACK_INITIAL)
5078 ga_clear(&regstack);
5079 if (backpos.ga_maxlen > BACKPOS_INITIAL)
5080 ga_clear(&backpos);
5081
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005082 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005083 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005084 // Make sure the end is never before the start. Can happen when \zs
5085 // and \ze are used.
5086 if (REG_MULTI)
5087 {
5088 lpos_T *start = &rex.reg_mmatch->startpos[0];
5089 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005090
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005091 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005092 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005093 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
Bram Moolenaar01105b32022-11-26 11:47:10 +00005094
5095 // startpos[0] may be set by "\zs", also return the column where
5096 // the whole pattern matched.
5097 rex.reg_mmatch->rmm_matchcol = col;
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005098 }
5099 else
5100 {
5101 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
5102 rex.reg_match->endp[0] = rex.reg_match->startp[0];
Bram Moolenaar01105b32022-11-26 11:47:10 +00005103
5104 // startpos[0] may be set by "\zs", also return the column where
5105 // the whole pattern matched.
5106 rex.reg_match->rm_matchcol = col;
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005107 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005108 }
5109
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005110 return retval;
5111}
5112
5113/*
5114 * Match a regexp against a string.
5115 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5116 * Uses curbuf for line count and 'iskeyword'.
5117 * if "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
5118 *
5119 * Returns 0 for failure, number of lines contained in the match otherwise.
5120 */
5121 static int
5122bt_regexec_nl(
5123 regmatch_T *rmp,
5124 char_u *line, // string to match against
5125 colnr_T col, // column to start looking for match
5126 int line_lbr)
5127{
5128 rex.reg_match = rmp;
5129 rex.reg_mmatch = NULL;
5130 rex.reg_maxline = 0;
5131 rex.reg_line_lbr = line_lbr;
5132 rex.reg_buf = curbuf;
5133 rex.reg_win = NULL;
5134 rex.reg_ic = rmp->rm_ic;
5135 rex.reg_icombine = FALSE;
5136 rex.reg_maxcol = 0;
5137
Paul Ollis65745772022-06-05 16:55:54 +01005138 return bt_regexec_both(line, col, NULL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005139}
5140
5141/*
5142 * Match a regexp against multiple lines.
5143 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5144 * Uses curbuf for line count and 'iskeyword'.
5145 *
5146 * Return zero if there is no match. Return number of lines contained in the
5147 * match otherwise.
5148 */
5149 static long
5150bt_regexec_multi(
5151 regmmatch_T *rmp,
5152 win_T *win, // window in which to search or NULL
5153 buf_T *buf, // buffer in which to search
5154 linenr_T lnum, // nr of line to start looking for match
5155 colnr_T col, // column to start looking for match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005156 int *timed_out) // flag set on timeout or NULL
5157{
Bram Moolenaarf4140482020-02-15 23:06:45 +01005158 init_regexec_multi(rmp, win, buf, lnum);
Paul Ollis65745772022-06-05 16:55:54 +01005159 return bt_regexec_both(NULL, col, timed_out);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005160}
5161
5162/*
5163 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5164 */
5165 static int
5166re_num_cmp(long_u val, char_u *scan)
5167{
5168 long_u n = OPERAND_MIN(scan);
5169
5170 if (OPERAND_CMP(scan) == '>')
5171 return val > n;
5172 if (OPERAND_CMP(scan) == '<')
5173 return val < n;
5174 return val == n;
5175}
5176
5177#ifdef BT_REGEXP_DUMP
5178
5179/*
5180 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5181 */
5182 static void
5183regdump(char_u *pattern, bt_regprog_T *r)
5184{
5185 char_u *s;
5186 int op = EXACTLY; // Arbitrary non-END op.
5187 char_u *next;
5188 char_u *end = NULL;
5189 FILE *f;
5190
5191#ifdef BT_REGEXP_LOG
5192 f = fopen("bt_regexp_log.log", "a");
5193#else
5194 f = stdout;
5195#endif
5196 if (f == NULL)
5197 return;
5198 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);
5199
5200 s = r->program + 1;
5201 // Loop until we find the END that isn't before a referred next (an END
5202 // can also appear in a NOMATCH operand).
5203 while (op != END || s <= end)
5204 {
5205 op = OP(s);
5206 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); // Where, what.
5207 next = regnext(s);
5208 if (next == NULL) // Next ptr.
5209 fprintf(f, "(0)");
5210 else
5211 fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
5212 if (end < next)
5213 end = next;
5214 if (op == BRACE_LIMITS)
5215 {
5216 // Two ints
5217 fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5218 s += 8;
5219 }
5220 else if (op == BEHIND || op == NOBEHIND)
5221 {
5222 // one int
5223 fprintf(f, " count %ld", OPERAND_MIN(s));
5224 s += 4;
5225 }
5226 else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL)
5227 {
5228 // one int plus comparator
5229 fprintf(f, " count %ld", OPERAND_MIN(s));
5230 s += 5;
5231 }
5232 s += 3;
5233 if (op == ANYOF || op == ANYOF + ADD_NL
5234 || op == ANYBUT || op == ANYBUT + ADD_NL
5235 || op == EXACTLY)
5236 {
5237 // Literal string, where present.
5238 fprintf(f, "\nxxxxxxxxx\n");
5239 while (*s != NUL)
5240 fprintf(f, "%c", *s++);
5241 fprintf(f, "\nxxxxxxxxx\n");
5242 s++;
5243 }
5244 fprintf(f, "\r\n");
5245 }
5246
5247 // Header fields of interest.
5248 if (r->regstart != NUL)
5249 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
5250 ? (char *)transchar(r->regstart)
5251 : "multibyte", r->regstart);
5252 if (r->reganch)
5253 fprintf(f, "anchored; ");
5254 if (r->regmust != NULL)
5255 fprintf(f, "must have \"%s\"", r->regmust);
5256 fprintf(f, "\r\n");
5257
5258#ifdef BT_REGEXP_LOG
5259 fclose(f);
5260#endif
5261}
5262#endif // BT_REGEXP_DUMP
5263
5264#ifdef DEBUG
5265/*
5266 * regprop - printable representation of opcode
5267 */
5268 static char_u *
5269regprop(char_u *op)
5270{
5271 char *p;
5272 static char buf[50];
John Marriott82792db2024-05-12 00:07:17 +02005273 static size_t buflen = 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005274
5275 STRCPY(buf, ":");
John Marriott82792db2024-05-12 00:07:17 +02005276 buflen = 1;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005277
5278 switch ((int) OP(op))
5279 {
5280 case BOL:
5281 p = "BOL";
5282 break;
5283 case EOL:
5284 p = "EOL";
5285 break;
5286 case RE_BOF:
5287 p = "BOF";
5288 break;
5289 case RE_EOF:
5290 p = "EOF";
5291 break;
5292 case CURSOR:
5293 p = "CURSOR";
5294 break;
5295 case RE_VISUAL:
5296 p = "RE_VISUAL";
5297 break;
5298 case RE_LNUM:
5299 p = "RE_LNUM";
5300 break;
5301 case RE_MARK:
5302 p = "RE_MARK";
5303 break;
5304 case RE_COL:
5305 p = "RE_COL";
5306 break;
5307 case RE_VCOL:
5308 p = "RE_VCOL";
5309 break;
5310 case BOW:
5311 p = "BOW";
5312 break;
5313 case EOW:
5314 p = "EOW";
5315 break;
5316 case ANY:
5317 p = "ANY";
5318 break;
5319 case ANY + ADD_NL:
5320 p = "ANY+NL";
5321 break;
5322 case ANYOF:
5323 p = "ANYOF";
5324 break;
5325 case ANYOF + ADD_NL:
5326 p = "ANYOF+NL";
5327 break;
5328 case ANYBUT:
5329 p = "ANYBUT";
5330 break;
5331 case ANYBUT + ADD_NL:
5332 p = "ANYBUT+NL";
5333 break;
5334 case IDENT:
5335 p = "IDENT";
5336 break;
5337 case IDENT + ADD_NL:
5338 p = "IDENT+NL";
5339 break;
5340 case SIDENT:
5341 p = "SIDENT";
5342 break;
5343 case SIDENT + ADD_NL:
5344 p = "SIDENT+NL";
5345 break;
5346 case KWORD:
5347 p = "KWORD";
5348 break;
5349 case KWORD + ADD_NL:
5350 p = "KWORD+NL";
5351 break;
5352 case SKWORD:
5353 p = "SKWORD";
5354 break;
5355 case SKWORD + ADD_NL:
5356 p = "SKWORD+NL";
5357 break;
5358 case FNAME:
5359 p = "FNAME";
5360 break;
5361 case FNAME + ADD_NL:
5362 p = "FNAME+NL";
5363 break;
5364 case SFNAME:
5365 p = "SFNAME";
5366 break;
5367 case SFNAME + ADD_NL:
5368 p = "SFNAME+NL";
5369 break;
5370 case PRINT:
5371 p = "PRINT";
5372 break;
5373 case PRINT + ADD_NL:
5374 p = "PRINT+NL";
5375 break;
5376 case SPRINT:
5377 p = "SPRINT";
5378 break;
5379 case SPRINT + ADD_NL:
5380 p = "SPRINT+NL";
5381 break;
5382 case WHITE:
5383 p = "WHITE";
5384 break;
5385 case WHITE + ADD_NL:
5386 p = "WHITE+NL";
5387 break;
5388 case NWHITE:
5389 p = "NWHITE";
5390 break;
5391 case NWHITE + ADD_NL:
5392 p = "NWHITE+NL";
5393 break;
5394 case DIGIT:
5395 p = "DIGIT";
5396 break;
5397 case DIGIT + ADD_NL:
5398 p = "DIGIT+NL";
5399 break;
5400 case NDIGIT:
5401 p = "NDIGIT";
5402 break;
5403 case NDIGIT + ADD_NL:
5404 p = "NDIGIT+NL";
5405 break;
5406 case HEX:
5407 p = "HEX";
5408 break;
5409 case HEX + ADD_NL:
5410 p = "HEX+NL";
5411 break;
5412 case NHEX:
5413 p = "NHEX";
5414 break;
5415 case NHEX + ADD_NL:
5416 p = "NHEX+NL";
5417 break;
5418 case OCTAL:
5419 p = "OCTAL";
5420 break;
5421 case OCTAL + ADD_NL:
5422 p = "OCTAL+NL";
5423 break;
5424 case NOCTAL:
5425 p = "NOCTAL";
5426 break;
5427 case NOCTAL + ADD_NL:
5428 p = "NOCTAL+NL";
5429 break;
5430 case WORD:
5431 p = "WORD";
5432 break;
5433 case WORD + ADD_NL:
5434 p = "WORD+NL";
5435 break;
5436 case NWORD:
5437 p = "NWORD";
5438 break;
5439 case NWORD + ADD_NL:
5440 p = "NWORD+NL";
5441 break;
5442 case HEAD:
5443 p = "HEAD";
5444 break;
5445 case HEAD + ADD_NL:
5446 p = "HEAD+NL";
5447 break;
5448 case NHEAD:
5449 p = "NHEAD";
5450 break;
5451 case NHEAD + ADD_NL:
5452 p = "NHEAD+NL";
5453 break;
5454 case ALPHA:
5455 p = "ALPHA";
5456 break;
5457 case ALPHA + ADD_NL:
5458 p = "ALPHA+NL";
5459 break;
5460 case NALPHA:
5461 p = "NALPHA";
5462 break;
5463 case NALPHA + ADD_NL:
5464 p = "NALPHA+NL";
5465 break;
5466 case LOWER:
5467 p = "LOWER";
5468 break;
5469 case LOWER + ADD_NL:
5470 p = "LOWER+NL";
5471 break;
5472 case NLOWER:
5473 p = "NLOWER";
5474 break;
5475 case NLOWER + ADD_NL:
5476 p = "NLOWER+NL";
5477 break;
5478 case UPPER:
5479 p = "UPPER";
5480 break;
5481 case UPPER + ADD_NL:
5482 p = "UPPER+NL";
5483 break;
5484 case NUPPER:
5485 p = "NUPPER";
5486 break;
5487 case NUPPER + ADD_NL:
5488 p = "NUPPER+NL";
5489 break;
5490 case BRANCH:
5491 p = "BRANCH";
5492 break;
5493 case EXACTLY:
5494 p = "EXACTLY";
5495 break;
5496 case NOTHING:
5497 p = "NOTHING";
5498 break;
5499 case BACK:
5500 p = "BACK";
5501 break;
5502 case END:
5503 p = "END";
5504 break;
5505 case MOPEN + 0:
5506 p = "MATCH START";
5507 break;
5508 case MOPEN + 1:
5509 case MOPEN + 2:
5510 case MOPEN + 3:
5511 case MOPEN + 4:
5512 case MOPEN + 5:
5513 case MOPEN + 6:
5514 case MOPEN + 7:
5515 case MOPEN + 8:
5516 case MOPEN + 9:
John Marriott82792db2024-05-12 00:07:17 +02005517 buflen += sprintf(buf + buflen, "MOPEN%d", OP(op) - MOPEN);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005518 p = NULL;
5519 break;
5520 case MCLOSE + 0:
5521 p = "MATCH END";
5522 break;
5523 case MCLOSE + 1:
5524 case MCLOSE + 2:
5525 case MCLOSE + 3:
5526 case MCLOSE + 4:
5527 case MCLOSE + 5:
5528 case MCLOSE + 6:
5529 case MCLOSE + 7:
5530 case MCLOSE + 8:
5531 case MCLOSE + 9:
John Marriott82792db2024-05-12 00:07:17 +02005532 buflen += sprintf(buf + buflen, "MCLOSE%d", OP(op) - MCLOSE);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005533 p = NULL;
5534 break;
5535 case BACKREF + 1:
5536 case BACKREF + 2:
5537 case BACKREF + 3:
5538 case BACKREF + 4:
5539 case BACKREF + 5:
5540 case BACKREF + 6:
5541 case BACKREF + 7:
5542 case BACKREF + 8:
5543 case BACKREF + 9:
John Marriott82792db2024-05-12 00:07:17 +02005544 buflen += sprintf(buf + buflen, "BACKREF%d", OP(op) - BACKREF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005545 p = NULL;
5546 break;
5547 case NOPEN:
5548 p = "NOPEN";
5549 break;
5550 case NCLOSE:
5551 p = "NCLOSE";
5552 break;
5553#ifdef FEAT_SYN_HL
5554 case ZOPEN + 1:
5555 case ZOPEN + 2:
5556 case ZOPEN + 3:
5557 case ZOPEN + 4:
5558 case ZOPEN + 5:
5559 case ZOPEN + 6:
5560 case ZOPEN + 7:
5561 case ZOPEN + 8:
5562 case ZOPEN + 9:
John Marriott82792db2024-05-12 00:07:17 +02005563 buflen += sprintf(buf + buflen, "ZOPEN%d", OP(op) - ZOPEN);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005564 p = NULL;
5565 break;
5566 case ZCLOSE + 1:
5567 case ZCLOSE + 2:
5568 case ZCLOSE + 3:
5569 case ZCLOSE + 4:
5570 case ZCLOSE + 5:
5571 case ZCLOSE + 6:
5572 case ZCLOSE + 7:
5573 case ZCLOSE + 8:
5574 case ZCLOSE + 9:
John Marriott82792db2024-05-12 00:07:17 +02005575 buflen += sprintf(buf + buflen, "ZCLOSE%d", OP(op) - ZCLOSE);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005576 p = NULL;
5577 break;
5578 case ZREF + 1:
5579 case ZREF + 2:
5580 case ZREF + 3:
5581 case ZREF + 4:
5582 case ZREF + 5:
5583 case ZREF + 6:
5584 case ZREF + 7:
5585 case ZREF + 8:
5586 case ZREF + 9:
Christian Brabandt60430242024-05-14 11:19:47 +02005587 buflen += sprintf(buf + buflen, "ZREF%d", OP(op) - ZREF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005588 p = NULL;
5589 break;
5590#endif
5591 case STAR:
5592 p = "STAR";
5593 break;
5594 case PLUS:
5595 p = "PLUS";
5596 break;
5597 case NOMATCH:
5598 p = "NOMATCH";
5599 break;
5600 case MATCH:
5601 p = "MATCH";
5602 break;
5603 case BEHIND:
5604 p = "BEHIND";
5605 break;
5606 case NOBEHIND:
5607 p = "NOBEHIND";
5608 break;
5609 case SUBPAT:
5610 p = "SUBPAT";
5611 break;
5612 case BRACE_LIMITS:
5613 p = "BRACE_LIMITS";
5614 break;
5615 case BRACE_SIMPLE:
5616 p = "BRACE_SIMPLE";
5617 break;
5618 case BRACE_COMPLEX + 0:
5619 case BRACE_COMPLEX + 1:
5620 case BRACE_COMPLEX + 2:
5621 case BRACE_COMPLEX + 3:
5622 case BRACE_COMPLEX + 4:
5623 case BRACE_COMPLEX + 5:
5624 case BRACE_COMPLEX + 6:
5625 case BRACE_COMPLEX + 7:
5626 case BRACE_COMPLEX + 8:
5627 case BRACE_COMPLEX + 9:
John Marriott82792db2024-05-12 00:07:17 +02005628 buflen += sprintf(buf + buflen, "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005629 p = NULL;
5630 break;
5631 case MULTIBYTECODE:
5632 p = "MULTIBYTECODE";
5633 break;
5634 case NEWL:
5635 p = "NEWL";
5636 break;
5637 default:
John Marriott82792db2024-05-12 00:07:17 +02005638 buflen += sprintf(buf + buflen, "corrupt %d", OP(op));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005639 p = NULL;
5640 break;
5641 }
5642 if (p != NULL)
John Marriott82792db2024-05-12 00:07:17 +02005643 STRCPY(buf + buflen, p);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005644 return (char_u *)buf;
5645}
5646#endif // DEBUG