blob: 0b3487fa0ac50c2391099e474604149a97e8f04b [file] [log] [blame]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * Backtracking regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 *
7 * NOTICE:
8 *
9 * This is NOT the original regular expression code as written by Henry
10 * Spencer. This code has been modified specifically for use with the VIM
11 * editor, and should not be used separately from Vim. If you want a good
12 * regular expression library, get the original code. The copyright notice
13 * that follows is from the original.
14 *
15 * END NOTICE
16 *
17 * Copyright (c) 1986 by University of Toronto.
18 * Written by Henry Spencer. Not derived from licensed software.
19 *
20 * Permission is granted to anyone to use this software for any
21 * purpose on any computer system, and to redistribute it freely,
22 * subject to the following restrictions:
23 *
24 * 1. The author is not responsible for the consequences of use of
25 * this software, no matter how awful, even if they arise
26 * from defects in it.
27 *
28 * 2. The origin of this software must not be misrepresented, either
29 * by explicit claim or by omission.
30 *
31 * 3. Altered versions must be plainly marked as such, and must not
32 * be misrepresented as being the original software.
33 *
34 * Beware that some of this code is subtly aware of the way operator
35 * precedence is structured in regular expressions. Serious changes in
36 * regular-expression syntax might require a total rethink.
37 *
38 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
39 * Webb, Ciaran McCreesh and Bram Moolenaar.
40 * Named character class support added by Walter Briscoe (1998 Jul 01)
41 */
42
43/*
44 * The "internal use only" fields in regexp.h are present to pass info from
45 * compile to execute that permits the execute phase to run lots faster on
46 * simple cases. They are:
47 *
48 * regstart char that must begin a match; NUL if none obvious; Can be a
49 * multi-byte character.
50 * reganch is the match anchored (at beginning-of-line only)?
51 * regmust string (pointer into program) that match must include, or NULL
52 * regmlen length of regmust string
53 * regflags RF_ values or'ed together
54 *
55 * Regstart and reganch permit very fast decisions on suitable starting points
56 * for a match, cutting down the work a lot. Regmust permits fast rejection
57 * of lines that cannot possibly match. The regmust tests are costly enough
58 * that vim_regcomp() supplies a regmust only if the r.e. contains something
59 * potentially expensive (at present, the only such thing detected is * or +
60 * at the start of the r.e., which can involve a lot of backup). Regmlen is
61 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
62 * computing it anyway.
63 */
64
65/*
66 * Structure for regexp "program". This is essentially a linear encoding
67 * of a nondeterministic finite-state machine (aka syntax charts or
68 * "railroad normal form" in parsing technology). Each node is an opcode
69 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
 * pointer with a BRANCH on both ends of it is connecting two alternatives.
 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
 * node points to the node after the stuff to be repeated.
76 * The operand of some types of node is a literal string; for others, it is a
77 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
78 * is the first node of the branch.
79 * (NB this is *not* a tree structure: the tail of the branch connects to the
80 * thing following the set of BRANCHes.)
81 *
82 * pattern is coded like:
83 *
84 * +-----------------+
85 * | V
86 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
87 * | ^ | ^
88 * +------+ +----------+
89 *
90 *
91 * +------------------+
92 * V |
93 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
94 * | | ^ ^
95 * | +---------------+ |
96 * +---------------------------------------------+
97 *
98 *
99 * +----------------------+
100 * V |
101 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
102 * | | ^ ^
103 * | +-----------+ |
104 * +--------------------------------------------------+
105 *
106 *
107 * +-------------------------+
108 * V |
109 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
110 * | | ^
111 * | +----------------+
112 * +-----------------------------------------------+
113 *
114 *
115 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
116 * | | ^ ^
117 * | +----------------+ |
118 * +--------------------------------+
119 *
120 * +---------+
121 * | V
122 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
123 * | | | | ^ ^
124 * | | | +-----+ |
125 * | | +----------------+ |
126 * | +---------------------------+ |
127 * +------------------------------------------------------+
128 *
129 * They all start with a BRANCH for "\|" alternatives, even when there is only
130 * one alternative.
131 */
132
/*
 * The opcodes are:
 *
 * Each opcode is the first byte of a program node.  The "opnd?" column tells
 * what kind of operand, if any, follows the node: "node" is a sub-program,
 * "str" a literal string and "nr" a number.
 */

// definition           number     opnd?    meaning
#define END             0       //      End of program or NOMATCH operand.
#define BOL             1       //      Match "" at beginning of line.
#define EOL             2       //      Match "" at end of line.
#define BRANCH          3       // node Match this alternative, or the
                                //      next...
#define BACK            4       //      Match "", "next" ptr points backward.
#define EXACTLY         5       // str  Match this string.
#define NOTHING         6       //      Match empty string.
#define STAR            7       // node Match this (simple) thing 0 or more
                                //      times.
#define PLUS            8       // node Match this (simple) thing 1 or more
                                //      times.
#define MATCH           9       // node match the operand zero-width
#define NOMATCH         10      // node check for no match with operand
#define BEHIND          11      // node look behind for a match with operand
#define NOBEHIND        12      // node look behind for no match with operand
#define SUBPAT          13      // node match the operand here
#define BRACE_SIMPLE    14      // node Match this (simple) thing between m and
                                //      n times (\{m,n\}).
#define BOW             15      //      Match "" after [^a-zA-Z0-9_]
#define EOW             16      //      Match "" at [^a-zA-Z0-9_]
#define BRACE_LIMITS    17      // nr nr  define the min & max for BRACE_SIMPLE
                                //      and BRACE_COMPLEX.
#define NEWL            18      //      Match line-break
#define BHPOS           19      //      End position for BEHIND or NOBEHIND
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200163
164
// character classes: 20-48 normal, 50-78 include a line-break
//
// A class opcode plus ADD_NL is the variant of that class that also matches
// a line-break.  FIRST_NL and LAST_NL delimit the range of those variants and
// are parenthesized so that they expand safely inside any expression.
#define ADD_NL          30
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20      //      Match any one character.
#define ANYOF           21      // str  Match any character in this string.
#define ANYBUT          22      // str  Match any character not in this
                                //      string.
#define IDENT           23      //      Match identifier char
#define SIDENT          24      //      Match identifier char but no digit
#define KWORD           25      //      Match keyword char
#define SKWORD          26      //      Match word char but no digit
#define FNAME           27      //      Match file name char
#define SFNAME          28      //      Match file name char but no digit
#define PRINT           29      //      Match printable char
#define SPRINT          30      //      Match printable char but no digit
#define WHITE           31      //      Match whitespace char
#define NWHITE          32      //      Match non-whitespace char
#define DIGIT           33      //      Match digit char
#define NDIGIT          34      //      Match non-digit char
#define HEX             35      //      Match hex char
#define NHEX            36      //      Match non-hex char
#define OCTAL           37      //      Match octal char
#define NOCTAL          38      //      Match non-octal char
#define WORD            39      //      Match word char
#define NWORD           40      //      Match non-word char
#define HEAD            41      //      Match head char
#define NHEAD           42      //      Match non-head char
#define ALPHA           43      //      Match alpha char
#define NALPHA          44      //      Match non-alpha char
#define LOWER           45      //      Match lowercase char
#define NLOWER          46      //      Match non-lowercase char
#define UPPER           47      //      Match uppercase char
#define NUPPER          48      //      Match non-uppercase char
#define LAST_NL         (NUPPER + ADD_NL)
// TRUE when "op" is one of the class opcodes that include a line-break.
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
200
// Opcodes that come in groups of ten: the sub-expression number is added to
// the base value, as the "-89" style ranges below indicate.
#define MOPEN           80      // -89  Mark this point in input as start of
                                //      \( subexpr.  MOPEN + 0 marks start of
                                //      match.
#define MCLOSE          90      // -99  Analogous to MOPEN.  MCLOSE + 0 marks
                                //      end of match.
#define BACKREF         100     // -109 node Match same string again \1-\9

#ifdef FEAT_SYN_HL
# define ZOPEN          110     // -119 Mark this point in input as start of
                                //      \z( subexpr.
# define ZCLOSE         120     // -129 Analogous to ZOPEN.
# define ZREF           130     // -139 node Match external submatch \z1-\z9
#endif

#define BRACE_COMPLEX   140     // -149 node Match nodes between m & n times

#define NOPEN           150     //      Mark this point in input as start of
                                //      \%( subexpr.
#define NCLOSE          151     //      Analogous to NOPEN.

#define MULTIBYTECODE   200     // mbc  Match one multi-byte character
#define RE_BOF          201     //      Match "" at beginning of file.
#define RE_EOF          202     //      Match "" at end of file.
#define CURSOR          203     //      Match location of cursor.

#define RE_LNUM         204     // nr cmp  Match line number
#define RE_COL          205     // nr cmp  Match column number
#define RE_VCOL         206     // nr cmp  Match virtual column number

#define RE_MARK         207     // mark cmp  Match mark position
#define RE_VISUAL       208     //      Match Visual area
#define RE_COMPOSING    209     //      any composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200233
/*
 * Flags to be passed up and down.
 * Each flag is a distinct bit so that they can be or'ed together; WORST is
 * the value with no flag set.
 */
#define HASWIDTH        0x1     // Known never to match null string.
#define SIMPLE          0x2     // Simple enough to be STAR/PLUS operand.
#define SPSTART         0x4     // Starts with * or +.
#define HASNL           0x8     // Contains some \n.
#define HASLOOKBH       0x10    // Contains "\@<=" or "\@<!".
#define WORST           0       // Worst case.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200243
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200244static int num_complex_braces; // Complex \{...} count
245static char_u *regcode; // Code-emit pointer, or JUST_CALC_SIZE
246static long regsize; // Code size.
247static int reg_toolong; // TRUE when offset out of range
248static char_u had_endbrace[NSUBEXP]; // flags, TRUE if end of () found
249static long brace_min[10]; // Minimums for complex brace repeats
250static long brace_max[10]; // Maximums for complex brace repeats
251static int brace_count[10]; // Current counts for complex brace repeats
252static int one_exactly = FALSE; // only do one char for EXACTLY
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200253
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200254// When making changes to classchars also change nfa_classcodes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200255static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
256static int classcodes[] = {
257 ANY, IDENT, SIDENT, KWORD, SKWORD,
258 FNAME, SFNAME, PRINT, SPRINT,
259 WHITE, NWHITE, DIGIT, NDIGIT,
260 HEX, NHEX, OCTAL, NOCTAL,
261 WORD, NWORD, HEAD, NHEAD,
262 ALPHA, NALPHA, LOWER, NLOWER,
263 UPPER, NUPPER
264};
265
/*
 * When regcode is set to this value, code is not emitted and size is computed
 * instead.
 */
#define JUST_CALC_SIZE  ((char_u *) -1)
271
// Values for rs_state in regitem_T.
// The numbering starts at zero; the RS_ZOPEN and RS_ZCLOSE values only exist
// when FEAT_SYN_HL is defined, which shifts the values that follow them.
typedef enum regstate_E
{
    RS_NOPEN = 0        // NOPEN and NCLOSE
    , RS_MOPEN          // MOPEN + [0-9]
    , RS_MCLOSE         // MCLOSE + [0-9]
#ifdef FEAT_SYN_HL
    , RS_ZOPEN          // ZOPEN + [0-9]
    , RS_ZCLOSE         // ZCLOSE + [0-9]
#endif
    , RS_BRANCH         // BRANCH
    , RS_BRCPLX_MORE    // BRACE_COMPLEX and trying one more match
    , RS_BRCPLX_LONG    // BRACE_COMPLEX and trying longest match
    , RS_BRCPLX_SHORT   // BRACE_COMPLEX and trying shortest match
    , RS_NOMATCH        // NOMATCH
    , RS_BEHIND1        // BEHIND / NOBEHIND matching rest
    , RS_BEHIND2        // BEHIND / NOBEHIND matching behind part
    , RS_STAR_LONG      // STAR/PLUS/BRACE_SIMPLE longest match
    , RS_STAR_SHORT     // STAR/PLUS/BRACE_SIMPLE shortest match
} regstate_T;
292
293/*
294 * Structure used to save the current input state, when it needs to be
295 * restored after trying a match. Used by reg_save() and reg_restore().
296 * Also stores the length of "backpos".
297 */
298typedef struct
299{
300 union
301 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200302 char_u *ptr; // rex.input pointer, for single-line regexp
303 lpos_T pos; // rex.input pos, for multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200304 } rs_u;
305 int rs_len;
306} regsave_T;
307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200308// struct to save start/end pointer/position in for \(\)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200309typedef struct
310{
311 union
312 {
313 char_u *ptr;
314 lpos_T pos;
315 } se_u;
316} save_se_T;
317
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200318// used for BEHIND and NOBEHIND matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200319typedef struct regbehind_S
320{
321 regsave_T save_after;
322 regsave_T save_behind;
323 int save_need_clear_subexpr;
324 save_se_T save_start[NSUBEXP];
325 save_se_T save_end[NSUBEXP];
326} regbehind_T;
327
328/*
329 * When there are alternatives a regstate_T is put on the regstack to remember
330 * what we are doing.
331 * Before it may be another type of item, depending on rs_state, to remember
332 * more things.
333 */
334typedef struct regitem_S
335{
336 regstate_T rs_state; // what we are doing, one of RS_ above
337 short rs_no; // submatch nr or BEHIND/NOBEHIND
338 char_u *rs_scan; // current node in program
339 union
340 {
341 save_se_T sesave;
342 regsave_T regsave;
343 } rs_un; // room for saving rex.input
344} regitem_T;
345
346
// used for STAR, PLUS and BRACE_SIMPLE matching
typedef struct regstar_S
{
    int		nextb;		// next byte
    int		nextb_ic;	// next byte reverse case
    long	count;
    long	minval;
    long	maxval;
} regstar_T;
356
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200357// used to store input position when a BACK was encountered, so that we now if
358// we made any progress since the last time.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200359typedef struct backpos_S
360{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200361 char_u *bp_scan; // "scan" where BACK was encountered
362 regsave_T bp_pos; // last input position
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363} backpos_T;
364
365/*
366 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
367 * to avoid invoking malloc() and free() often.
368 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
369 * or regbehind_T.
370 * "backpos_T" is a table with backpos_T for BACK
371 */
372static garray_T regstack = {0, 0, 0, 0, NULL};
373static garray_T backpos = {0, 0, 0, 0, NULL};
374
375static regsave_T behind_pos;
376
/*
 * Both for regstack and backpos tables we use the following strategy of
 * allocation (to reduce malloc/free calls):
 * - Initial size is fairly small.
 * - When needed, the tables are grown bigger (8 times at first, double after
 *   that).
 * - After executing the match we free the memory only if the array has grown.
 *   Thus the memory is kept allocated when it's at the initial size.
 * This makes it fast while not keeping a lot of memory allocated.
 * A three times speed increase was observed when using many simple patterns.
 */
#define REGSTACK_INITIAL        2048
#define BACKPOS_INITIAL         64
390
391/*
392 * Opcode notes:
393 *
394 * BRANCH The set of branches constituting a single choice are hooked
395 * together with their "next" pointers, since precedence prevents
396 * anything being concatenated to any individual branch. The
397 * "next" pointer of the last BRANCH in a choice points to the
398 * thing following the whole choice. This is also where the
399 * final "next" pointer of each individual branch points; each
400 * branch starts with the operand node of a BRANCH node.
401 *
402 * BACK Normal "next" pointers all implicitly point forward; BACK
403 * exists to make loop structures possible.
404 *
405 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
406 * BRANCH structures using BACK. Simple cases (one character
407 * per match) are implemented with STAR and PLUS for speed
408 * and to minimize recursive plunges.
409 *
410 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
411 * node, and defines the min and max limits to be used for that
412 * node.
413 *
414 * MOPEN,MCLOSE ...are numbered at compile time.
415 * ZOPEN,ZCLOSE ...ditto
416 */
417
/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 8-bit bytes, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 *
 * All macros below take a pointer to the first byte (the opcode) of a node.
 */
// The opcode of the node.
#define OP(p)           ((int)*(p))
// The 16-bit "next" offset, stored in bytes 1 and 2.
#define NEXT(p)         (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
// Pointer to the operand, which starts right after the three header bytes.
#define OPERAND(p)      ((p) + 3)
// Obtain an operand that was stored as four bytes, MSB first.
#define OPERAND_MIN(p)  (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
			+ ((long)(p)[5] << 8) + (long)(p)[6])
// Obtain a second operand stored as four bytes.
#define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
// Obtain a second single-byte operand stored after a four bytes operand.
#define OPERAND_CMP(p)  ((p)[7])
438
439static char_u *reg(int paren, int *flagp);
440
441#ifdef BT_REGEXP_DUMP
442static void regdump(char_u *, bt_regprog_T *);
443#endif
444
445static int re_num_cmp(long_u val, char_u *scan);
446
447#ifdef DEBUG
448static char_u *regprop(char_u *);
449
450static int regnarrate = 0;
451#endif
452
453
454/*
455 * Setup to parse the regexp. Used once to get the length and once to do it.
456 */
457 static void
458regcomp_start(
459 char_u *expr,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200460 int re_flags) // see vim_regcomp()
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200461{
462 initchr(expr);
463 if (re_flags & RE_MAGIC)
464 reg_magic = MAGIC_ON;
465 else
466 reg_magic = MAGIC_OFF;
467 reg_string = (re_flags & RE_STRING);
468 reg_strict = (re_flags & RE_STRICT);
469 get_cpo_flags();
470
471 num_complex_braces = 0;
472 regnpar = 1;
Bram Moolenaara80faa82020-04-12 19:37:17 +0200473 CLEAR_FIELD(had_endbrace);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200474#ifdef FEAT_SYN_HL
475 regnzpar = 1;
476 re_has_z = 0;
477#endif
478 regsize = 0L;
479 reg_toolong = FALSE;
480 regflags = 0;
481#if defined(FEAT_SYN_HL) || defined(PROTO)
482 had_eol = FALSE;
483#endif
484}
485
486/*
487 * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for
488 * character "c".
489 */
490 static int
491use_multibytecode(int c)
492{
493 return has_mbyte && (*mb_char2len)(c) > 1
494 && (re_multi_type(peekchr()) != NOT_MULTI
495 || (enc_utf8 && utf_iscomposing(c)));
496}
497
498/*
499 * Emit (if appropriate) a byte of code
500 */
501 static void
502regc(int b)
503{
504 if (regcode == JUST_CALC_SIZE)
505 regsize++;
506 else
507 *regcode++ = b;
508}
509
510/*
511 * Emit (if appropriate) a multi-byte character of code
512 */
513 static void
514regmbc(int c)
515{
516 if (!has_mbyte && c > 0xff)
517 return;
518 if (regcode == JUST_CALC_SIZE)
519 regsize += (*mb_char2len)(c);
520 else
521 regcode += (*mb_char2bytes)(c, regcode);
522}
523
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200524
525/*
526 * Produce the bytes for equivalence class "c".
527 * Currently only handles latin1, latin9 and utf-8.
528 * NOTE: When changing this function, also change nfa_emit_equi_class()
529 */
530 static void
531reg_equi_class(int c)
532{
533 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
534 || STRCMP(p_enc, "iso-8859-15") == 0)
535 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200536 switch (c)
537 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200538 // Do not use '\300' style, it results in a negative number.
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200539 case 'A': case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc4:
540 case 0xc5: case 0x100: case 0x102: case 0x104: case 0x1cd:
541 case 0x1de: case 0x1e0: case 0x1fa: case 0x202: case 0x226:
542 case 0x23a: case 0x1e00: case 0x1ea0: case 0x1ea2: case 0x1ea4:
543 case 0x1ea6: case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
544 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
545 regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2);
546 regmbc(0xc3); regmbc(0xc4); regmbc(0xc5);
547 regmbc(0x100); regmbc(0x102); regmbc(0x104);
548 regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0);
549 regmbc(0x1fa); regmbc(0x202); regmbc(0x226);
550 regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0);
551 regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6);
552 regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac);
553 regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2);
554 regmbc(0x1eb4); regmbc(0x1eb6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200555 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200556 case 'B': case 0x181: case 0x243: case 0x1e02:
557 case 0x1e04: case 0x1e06:
558 regmbc('B');
559 regmbc(0x181); regmbc(0x243); regmbc(0x1e02);
560 regmbc(0x1e04); regmbc(0x1e06);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200561 return;
562 case 'C': case 0xc7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200563 case 0x106: case 0x108: case 0x10a: case 0x10c: case 0x187:
564 case 0x23b: case 0x1e08: case 0xa792:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200565 regmbc('C'); regmbc(0xc7);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200566 regmbc(0x106); regmbc(0x108); regmbc(0x10a);
567 regmbc(0x10c); regmbc(0x187); regmbc(0x23b);
568 regmbc(0x1e08); regmbc(0xa792);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200569 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200570 case 'D': case 0x10e: case 0x110: case 0x18a:
571 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
572 case 0x1e12:
573 regmbc('D'); regmbc(0x10e); regmbc(0x110);
574 regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c);
575 regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200576 return;
577 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200578 case 0x112: case 0x114: case 0x116: case 0x118: case 0x11a:
579 case 0x204: case 0x206: case 0x228: case 0x246: case 0x1e14:
580 case 0x1e16: case 0x1e18: case 0x1e1a: case 0x1e1c:
581 case 0x1eb8: case 0x1eba: case 0x1ebc: case 0x1ebe:
582 case 0x1ec0: case 0x1ec2: case 0x1ec4: case 0x1ec6:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200583 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200584 regmbc(0xca); regmbc(0xcb); regmbc(0x112);
585 regmbc(0x114); regmbc(0x116); regmbc(0x118);
586 regmbc(0x11a); regmbc(0x204); regmbc(0x206);
587 regmbc(0x228); regmbc(0x246); regmbc(0x1e14);
588 regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a);
589 regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba);
590 regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0);
591 regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200592 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200593 case 'F': case 0x191: case 0x1e1e: case 0xa798:
594 regmbc('F'); regmbc(0x191); regmbc(0x1e1e);
595 regmbc(0xa798);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200596 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200597 case 'G': case 0x11c: case 0x11e: case 0x120:
598 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
599 case 0x1f4: case 0x1e20: case 0xa7a0:
600 regmbc('G'); regmbc(0x11c); regmbc(0x11e);
601 regmbc(0x120); regmbc(0x122); regmbc(0x193);
602 regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4);
603 regmbc(0x1e20); regmbc(0xa7a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200604 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200605 case 'H': case 0x124: case 0x126: case 0x21e:
606 case 0x1e22: case 0x1e24: case 0x1e26:
607 case 0x1e28: case 0x1e2a: case 0x2c67:
608 regmbc('H'); regmbc(0x124); regmbc(0x126);
609 regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24);
610 regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a);
611 regmbc(0x2c67);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200612 return;
613 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200614 case 0x128: case 0x12a: case 0x12c: case 0x12e:
615 case 0x130: case 0x197: case 0x1cf: case 0x208:
616 case 0x20a: case 0x1e2c: case 0x1e2e: case 0x1ec8:
617 case 0x1eca:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200618 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200619 regmbc(0xce); regmbc(0xcf); regmbc(0x128);
620 regmbc(0x12a); regmbc(0x12c); regmbc(0x12e);
621 regmbc(0x130); regmbc(0x197); regmbc(0x1cf);
622 regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c);
623 regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200624 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200625 case 'J': case 0x134: case 0x248:
626 regmbc('J'); regmbc(0x134); regmbc(0x248);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200627 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200628 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
629 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
630 regmbc('K'); regmbc(0x136); regmbc(0x198);
631 regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32);
632 regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200633 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200634 case 'L': case 0x139: case 0x13b: case 0x13d: case 0x13f:
635 case 0x141: case 0x23d: case 0x1e36: case 0x1e38:
636 case 0x1e3a: case 0x1e3c: case 0x2c60:
637 regmbc('L'); regmbc(0x139); regmbc(0x13b);
638 regmbc(0x13d); regmbc(0x13f); regmbc(0x141);
639 regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38);
640 regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200641 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200642 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
643 regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40);
644 regmbc(0x1e42);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200645 return;
646 case 'N': case 0xd1:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200647 case 0x143: case 0x145: case 0x147: case 0x1f8:
648 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
649 case 0xa7a4:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200650 regmbc('N'); regmbc(0xd1);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200651 regmbc(0x143); regmbc(0x145); regmbc(0x147);
652 regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46);
653 regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200654 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200655 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd6:
656 case 0xd8: case 0x14c: case 0x14e: case 0x150: case 0x19f:
657 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec: case 0x1fe:
658 case 0x20c: case 0x20e: case 0x22a: case 0x22c: case 0x22e:
659 case 0x230: case 0x1e4c: case 0x1e4e: case 0x1e50: case 0x1e52:
660 case 0x1ecc: case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
661 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc: case 0x1ede:
662 case 0x1ee0: case 0x1ee2:
663 regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4);
664 regmbc(0xd5); regmbc(0xd6); regmbc(0xd8);
665 regmbc(0x14c); regmbc(0x14e); regmbc(0x150);
666 regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1);
667 regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe);
668 regmbc(0x20c); regmbc(0x20e); regmbc(0x22a);
669 regmbc(0x22c); regmbc(0x22e); regmbc(0x230);
670 regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50);
671 regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece);
672 regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4);
673 regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda);
674 regmbc(0x1edc); regmbc(0x1ede); regmbc(0x1ee0);
675 regmbc(0x1ee2);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200676 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200677 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
678 regmbc('P'); regmbc(0x1a4); regmbc(0x1e54);
679 regmbc(0x1e56); regmbc(0x2c63);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200680 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200681 case 'Q': case 0x24a:
682 regmbc('Q'); regmbc(0x24a);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200683 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200684 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
685 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
686 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
687 regmbc('R'); regmbc(0x154); regmbc(0x156);
688 regmbc(0x210); regmbc(0x212); regmbc(0x158);
689 regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a);
690 regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64);
691 regmbc(0xa7a6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200692 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200693 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
694 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
695 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
696 regmbc('S'); regmbc(0x15a); regmbc(0x15c);
697 regmbc(0x15e); regmbc(0x160); regmbc(0x218);
698 regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64);
699 regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e);
700 regmbc(0xa7a8);
701 return;
702 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
703 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
704 case 0x1e6e: case 0x1e70:
705 regmbc('T'); regmbc(0x162); regmbc(0x164);
706 regmbc(0x166); regmbc(0x1ac); regmbc(0x23e);
707 regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a);
708 regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200709 return;
710 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200711 case 0x168: case 0x16a: case 0x16c: case 0x16e:
712 case 0x170: case 0x172: case 0x1af: case 0x1d3:
713 case 0x1d5: case 0x1d7: case 0x1d9: case 0x1db:
714 case 0x214: case 0x216: case 0x244: case 0x1e72:
715 case 0x1e74: case 0x1e76: case 0x1e78: case 0x1e7a:
716 case 0x1ee4: case 0x1ee6: case 0x1ee8: case 0x1eea:
717 case 0x1eec: case 0x1eee: case 0x1ef0:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200718 regmbc('U'); regmbc(0xd9); regmbc(0xda);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200719 regmbc(0xdb); regmbc(0xdc); regmbc(0x168);
720 regmbc(0x16a); regmbc(0x16c); regmbc(0x16e);
721 regmbc(0x170); regmbc(0x172); regmbc(0x1af);
722 regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7);
723 regmbc(0x1d9); regmbc(0x1db); regmbc(0x214);
724 regmbc(0x216); regmbc(0x244); regmbc(0x1e72);
725 regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78);
726 regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6);
727 regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec);
728 regmbc(0x1eee); regmbc(0x1ef0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200729 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200730 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
731 regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c);
732 regmbc(0x1e7e);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200733 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200734 case 'W': case 0x174: case 0x1e80: case 0x1e82:
735 case 0x1e84: case 0x1e86: case 0x1e88:
736 regmbc('W'); regmbc(0x174); regmbc(0x1e80);
737 regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86);
738 regmbc(0x1e88);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200739 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200740 case 'X': case 0x1e8a: case 0x1e8c:
741 regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200742 return;
743 case 'Y': case 0xdd:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200744 case 0x176: case 0x178: case 0x1b3: case 0x232: case 0x24e:
745 case 0x1e8e: case 0x1ef2: case 0x1ef6: case 0x1ef4: case 0x1ef8:
746 regmbc('Y'); regmbc(0xdd); regmbc(0x176);
747 regmbc(0x178); regmbc(0x1b3); regmbc(0x232);
748 regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2);
749 regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200750 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200751 case 'Z': case 0x179: case 0x17b: case 0x17d: case 0x1b5:
752 case 0x1e90: case 0x1e92: case 0x1e94: case 0x2c6b:
753 regmbc('Z'); regmbc(0x179); regmbc(0x17b);
754 regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90);
755 regmbc(0x1e92); regmbc(0x1e94); regmbc(0x2c6b);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200756 return;
757 case 'a': case 0xe0: case 0xe1: case 0xe2:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200758 case 0xe3: case 0xe4: case 0xe5: case 0x101: case 0x103:
759 case 0x105: case 0x1ce: case 0x1df: case 0x1e1: case 0x1fb:
760 case 0x201: case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
761 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
762 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
763 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
764 case 0x1eb7: case 0x2c65:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200765 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
766 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200767 regmbc(0xe5); regmbc(0x101); regmbc(0x103);
768 regmbc(0x105); regmbc(0x1ce); regmbc(0x1df);
769 regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201);
770 regmbc(0x203); regmbc(0x227); regmbc(0x1d8f);
771 regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1);
772 regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7);
773 regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead);
774 regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3);
775 regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200776 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200777 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
778 case 0x1e03: case 0x1e05: case 0x1e07:
779 regmbc('b');
780 regmbc(0x180); regmbc(0x253); regmbc(0x1d6c);
781 regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05);
782 regmbc(0x1e07);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200783 return;
784 case 'c': case 0xe7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200785 case 0x107: case 0x109: case 0x10b: case 0x10d: case 0x188:
786 case 0x23c: case 0x1e09: case 0xa793: case 0xa794:
787 regmbc('c'); regmbc(0xe7); regmbc(0x107);
788 regmbc(0x109); regmbc(0x10b); regmbc(0x10d);
789 regmbc(0x188); regmbc(0x23c); regmbc(0x1e09);
790 regmbc(0xa793); regmbc(0xa794);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200791 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200792 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
793 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d:
794 case 0x1e0f: case 0x1e11: case 0x1e13:
795 regmbc('d'); regmbc(0x10f); regmbc(0x111);
796 regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81);
797 regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d);
798 regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200799 return;
800 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200801 case 0x113: case 0x115: case 0x117: case 0x119:
802 case 0x11b: case 0x205: case 0x207: case 0x229:
803 case 0x247: case 0x1d92: case 0x1e15: case 0x1e17:
804 case 0x1e19: case 0x1e1b: case 0x1eb9: case 0x1ebb:
805 case 0x1e1d: case 0x1ebd: case 0x1ebf: case 0x1ec1:
806 case 0x1ec3: case 0x1ec5: case 0x1ec7:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200807 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200808 regmbc(0xea); regmbc(0xeb); regmbc(0x113);
809 regmbc(0x115); regmbc(0x117); regmbc(0x119);
810 regmbc(0x11b); regmbc(0x205); regmbc(0x207);
811 regmbc(0x229); regmbc(0x247); regmbc(0x1d92);
812 regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19);
813 regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9);
814 regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf);
815 regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5);
816 regmbc(0x1ec7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200817 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200818 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
819 case 0x1e1f: case 0xa799:
820 regmbc('f'); regmbc(0x192); regmbc(0x1d6e);
821 regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799);
822 return;
823 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
824 case 0x1e5: case 0x1e7: case 0x260: case 0x1f5: case 0x1d83:
825 case 0x1e21: case 0xa7a1:
826 regmbc('g'); regmbc(0x11d); regmbc(0x11f);
827 regmbc(0x121); regmbc(0x123); regmbc(0x1e5);
828 regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260);
829 regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200830 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200831 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
832 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
833 case 0x1e96: case 0x2c68: case 0xa795:
834 regmbc('h'); regmbc(0x125); regmbc(0x127);
835 regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25);
836 regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b);
837 regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200838 return;
839 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200840 case 0x129: case 0x12b: case 0x12d: case 0x12f:
841 case 0x1d0: case 0x209: case 0x20b: case 0x268:
842 case 0x1d96: case 0x1e2d: case 0x1e2f: case 0x1ec9:
843 case 0x1ecb:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200844 regmbc('i'); regmbc(0xec); regmbc(0xed);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200845 regmbc(0xee); regmbc(0xef); regmbc(0x129);
846 regmbc(0x12b); regmbc(0x12d); regmbc(0x12f);
847 regmbc(0x1d0); regmbc(0x209); regmbc(0x20b);
848 regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d);
849 regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200850 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200851 case 'j': case 0x135: case 0x1f0: case 0x249:
852 regmbc('j'); regmbc(0x135); regmbc(0x1f0);
853 regmbc(0x249);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200854 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200855 case 'k': case 0x137: case 0x199: case 0x1e9:
856 case 0x1d84: case 0x1e31: case 0x1e33: case 0x1e35:
857 case 0x2c6a: case 0xa741:
858 regmbc('k'); regmbc(0x137); regmbc(0x199);
859 regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31);
860 regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a);
861 regmbc(0xa741);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200862 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200863 case 'l': case 0x13a: case 0x13c: case 0x13e:
864 case 0x140: case 0x142: case 0x19a: case 0x1e37:
865 case 0x1e39: case 0x1e3b: case 0x1e3d: case 0x2c61:
866 regmbc('l'); regmbc(0x13a); regmbc(0x13c);
867 regmbc(0x13e); regmbc(0x140); regmbc(0x142);
868 regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39);
869 regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200870 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200871 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
872 regmbc('m'); regmbc(0x1d6f); regmbc(0x1e3f);
873 regmbc(0x1e41); regmbc(0x1e43);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200874 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200875 case 'n': case 0xf1: case 0x144: case 0x146: case 0x148:
876 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87:
877 case 0x1e45: case 0x1e47: case 0x1e49: case 0x1e4b:
878 case 0xa7a5:
879 regmbc('n'); regmbc(0xf1); regmbc(0x144);
880 regmbc(0x146); regmbc(0x148); regmbc(0x149);
881 regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87);
882 regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49);
883 regmbc(0x1e4b); regmbc(0xa7a5);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200884 return;
885 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200886 case 0xf6: case 0xf8: case 0x14d: case 0x14f: case 0x151:
887 case 0x1a1: case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
888 case 0x20d: case 0x20f: case 0x22b: case 0x22d: case 0x22f:
889 case 0x231: case 0x275: case 0x1e4d: case 0x1e4f:
890 case 0x1e51: case 0x1e53: case 0x1ecd: case 0x1ecf:
891 case 0x1ed1: case 0x1ed3: case 0x1ed5: case 0x1ed7:
892 case 0x1ed9: case 0x1edb: case 0x1edd: case 0x1edf:
893 case 0x1ee1: case 0x1ee3:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200894 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
895 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200896 regmbc(0xf8); regmbc(0x14d); regmbc(0x14f);
897 regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2);
898 regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff);
899 regmbc(0x20d); regmbc(0x20f); regmbc(0x22b);
900 regmbc(0x22d); regmbc(0x22f); regmbc(0x231);
901 regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f);
902 regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd);
903 regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3);
904 regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9);
905 regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf);
906 regmbc(0x1ee1); regmbc(0x1ee3);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200907 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200908 case 'p': case 0x1a5: case 0x1d71: case 0x1d88: case 0x1d7d:
909 case 0x1e55: case 0x1e57:
910 regmbc('p'); regmbc(0x1a5); regmbc(0x1d71);
911 regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55);
912 regmbc(0x1e57);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200913 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200914 case 'q': case 0x24b: case 0x2a0:
915 regmbc('q'); regmbc(0x24b); regmbc(0x2a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200916 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200917 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
918 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
919 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
920 case 0xa7a7:
921 regmbc('r'); regmbc(0x155); regmbc(0x157);
922 regmbc(0x159); regmbc(0x211); regmbc(0x213);
923 regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73);
924 regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d);
925 regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f);
926 regmbc(0xa7a7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200927 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200928 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
929 case 0x1e61: case 0x219: case 0x23f: case 0x1d74: case 0x1d8a:
930 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
931 regmbc('s'); regmbc(0x15b); regmbc(0x15d);
932 regmbc(0x15f); regmbc(0x161); regmbc(0x23f);
933 regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a);
934 regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65);
935 regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9);
936 return;
937 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
938 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
939 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
940 regmbc('t'); regmbc(0x163); regmbc(0x165);
941 regmbc(0x167); regmbc(0x1ab); regmbc(0x21b);
942 regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75);
943 regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f);
944 regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200945 return;
946 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200947 case 0x169: case 0x16b: case 0x16d: case 0x16f:
948 case 0x171: case 0x173: case 0x1b0: case 0x1d4:
949 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc:
950 case 0x215: case 0x217: case 0x289: case 0x1e73:
951 case 0x1d7e: case 0x1d99: case 0x1e75: case 0x1e77:
952 case 0x1e79: case 0x1e7b: case 0x1ee5: case 0x1ee7:
953 case 0x1ee9: case 0x1eeb: case 0x1eed: case 0x1eef:
954 case 0x1ef1:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200955 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200956 regmbc(0xfb); regmbc(0xfc); regmbc(0x169);
957 regmbc(0x16b); regmbc(0x16d); regmbc(0x16f);
958 regmbc(0x171); regmbc(0x173); regmbc(0x1d6);
959 regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc);
960 regmbc(0x215); regmbc(0x217); regmbc(0x1b0);
961 regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e);
962 regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75);
963 regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b);
964 regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9);
965 regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef);
966 regmbc(0x1ef1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200967 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200968 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
969 regmbc('v'); regmbc(0x28b); regmbc(0x1d8c);
970 regmbc(0x1e7d); regmbc(0x1e7f);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200971 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200972 case 'w': case 0x175: case 0x1e81: case 0x1e83:
973 case 0x1e85: case 0x1e87: case 0x1e89: case 0x1e98:
974 regmbc('w'); regmbc(0x175); regmbc(0x1e81);
975 regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87);
976 regmbc(0x1e89); regmbc(0x1e98);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200977 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200978 case 'x': case 0x1e8b: case 0x1e8d:
979 regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200980 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200981 case 'y': case 0xfd: case 0xff: case 0x177: case 0x1b4:
982 case 0x233: case 0x24f: case 0x1e8f: case 0x1e99: case 0x1ef3:
983 case 0x1ef5: case 0x1ef7: case 0x1ef9:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200984 regmbc('y'); regmbc(0xfd); regmbc(0xff);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 regmbc(0x177); regmbc(0x1b4); regmbc(0x233);
986 regmbc(0x24f); regmbc(0x1e8f); regmbc(0x1e99);
987 regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7);
988 regmbc(0x1ef9);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200989 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200990 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
991 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
992 case 0x1e95: case 0x2c6c:
993 regmbc('z'); regmbc(0x17a); regmbc(0x17c);
994 regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76);
995 regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93);
996 regmbc(0x1e95); regmbc(0x2c6c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200997 return;
998 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200999 }
1000 regmbc(c);
1001}
1002
1003/*
1004 * Emit a node.
1005 * Return pointer to generated code.
1006 */
1007 static char_u *
1008regnode(int op)
1009{
1010 char_u *ret;
1011
1012 ret = regcode;
1013 if (ret == JUST_CALC_SIZE)
1014 regsize += 3;
1015 else
1016 {
1017 *regcode++ = op;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001018 *regcode++ = NUL; // Null "next" pointer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001019 *regcode++ = NUL;
1020 }
1021 return ret;
1022}
1023
1024/*
1025 * Write a long as four bytes at "p" and return pointer to the next char.
1026 */
1027 static char_u *
1028re_put_long(char_u *p, long_u val)
1029{
1030 *p++ = (char_u) ((val >> 24) & 0377);
1031 *p++ = (char_u) ((val >> 16) & 0377);
1032 *p++ = (char_u) ((val >> 8) & 0377);
1033 *p++ = (char_u) (val & 0377);
1034 return p;
1035}
1036
1037/*
1038 * regnext - dig the "next" pointer out of a node
1039 * Returns NULL when calculating size, when there is no next item and when
1040 * there is an error.
1041 */
1042 static char_u *
1043regnext(char_u *p)
1044{
1045 int offset;
1046
1047 if (p == JUST_CALC_SIZE || reg_toolong)
1048 return NULL;
1049
1050 offset = NEXT(p);
1051 if (offset == 0)
1052 return NULL;
1053
1054 if (OP(p) == BACK)
1055 return p - offset;
1056 else
1057 return p + offset;
1058}
1059
1060/*
1061 * Set the next-pointer at the end of a node chain.
1062 */
1063 static void
1064regtail(char_u *p, char_u *val)
1065{
1066 char_u *scan;
1067 char_u *temp;
1068 int offset;
1069
1070 if (p == JUST_CALC_SIZE)
1071 return;
1072
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001073 // Find last node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001074 scan = p;
1075 for (;;)
1076 {
1077 temp = regnext(scan);
1078 if (temp == NULL)
1079 break;
1080 scan = temp;
1081 }
1082
1083 if (OP(scan) == BACK)
1084 offset = (int)(scan - val);
1085 else
1086 offset = (int)(val - scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001087 // When the offset uses more than 16 bits it can no longer fit in the two
1088 // bytes available. Use a global flag to avoid having to check return
1089 // values in too many places.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001090 if (offset > 0xffff)
1091 reg_toolong = TRUE;
1092 else
1093 {
1094 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
1095 *(scan + 2) = (char_u) (offset & 0377);
1096 }
1097}
1098
1099/*
1100 * Like regtail, on item after a BRANCH; nop if none.
1101 */
1102 static void
1103regoptail(char_u *p, char_u *val)
1104{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001105 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001106 if (p == NULL || p == JUST_CALC_SIZE
1107 || (OP(p) != BRANCH
1108 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
1109 return;
1110 regtail(OPERAND(p), val);
1111}
1112
1113/*
1114 * Insert an operator in front of already-emitted operand
1115 *
1116 * Means relocating the operand.
1117 */
1118 static void
1119reginsert(int op, char_u *opnd)
1120{
1121 char_u *src;
1122 char_u *dst;
1123 char_u *place;
1124
1125 if (regcode == JUST_CALC_SIZE)
1126 {
1127 regsize += 3;
1128 return;
1129 }
1130 src = regcode;
1131 regcode += 3;
1132 dst = regcode;
1133 while (src > opnd)
1134 *--dst = *--src;
1135
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001136 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001137 *place++ = op;
1138 *place++ = NUL;
1139 *place = NUL;
1140}
1141
1142/*
1143 * Insert an operator in front of already-emitted operand.
1144 * Add a number to the operator.
1145 */
1146 static void
1147reginsert_nr(int op, long val, char_u *opnd)
1148{
1149 char_u *src;
1150 char_u *dst;
1151 char_u *place;
1152
1153 if (regcode == JUST_CALC_SIZE)
1154 {
1155 regsize += 7;
1156 return;
1157 }
1158 src = regcode;
1159 regcode += 7;
1160 dst = regcode;
1161 while (src > opnd)
1162 *--dst = *--src;
1163
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001164 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001165 *place++ = op;
1166 *place++ = NUL;
1167 *place++ = NUL;
1168 re_put_long(place, (long_u)val);
1169}
1170
1171/*
1172 * Insert an operator in front of already-emitted operand.
1173 * The operator has the given limit values as operands. Also set next pointer.
1174 *
1175 * Means relocating the operand.
1176 */
1177 static void
1178reginsert_limits(
1179 int op,
1180 long minval,
1181 long maxval,
1182 char_u *opnd)
1183{
1184 char_u *src;
1185 char_u *dst;
1186 char_u *place;
1187
1188 if (regcode == JUST_CALC_SIZE)
1189 {
1190 regsize += 11;
1191 return;
1192 }
1193 src = regcode;
1194 regcode += 11;
1195 dst = regcode;
1196 while (src > opnd)
1197 *--dst = *--src;
1198
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001199 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001200 *place++ = op;
1201 *place++ = NUL;
1202 *place++ = NUL;
1203 place = re_put_long(place, (long_u)minval);
1204 place = re_put_long(place, (long_u)maxval);
1205 regtail(opnd, place);
1206}
1207
1208/*
1209 * Return TRUE if the back reference is legal. We must have seen the close
1210 * brace.
1211 * TODO: Should also check that we don't refer to something that is repeated
1212 * (+*=): what instance of the repetition should we match?
1213 */
1214 static int
1215seen_endbrace(int refnum)
1216{
1217 if (!had_endbrace[refnum])
1218 {
1219 char_u *p;
1220
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001221 // Trick: check if "@<=" or "@<!" follows, in which case
1222 // the \1 can appear before the referenced match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001223 for (p = regparse; *p != NUL; ++p)
1224 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
1225 break;
1226 if (*p == NUL)
1227 {
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001228 emsg(_(e_illegal_back_reference));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001229 rc_did_emsg = TRUE;
1230 return FALSE;
1231 }
1232 }
1233 return TRUE;
1234}
1235
1236/*
1237 * Parse the lowest level.
1238 *
1239 * Optimization: gobbles an entire sequence of ordinary characters so that
1240 * it can turn them into a single node, which is smaller to store and
1241 * faster to run. Don't do this when one_exactly is set.
1242 */
1243 static char_u *
1244regatom(int *flagp)
1245{
1246 char_u *ret;
1247 int flags;
1248 int c;
1249 char_u *p;
1250 int extra = 0;
1251 int save_prev_at_start = prev_at_start;
1252
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001253 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001254
1255 c = getchr();
1256 switch (c)
1257 {
1258 case Magic('^'):
1259 ret = regnode(BOL);
1260 break;
1261
1262 case Magic('$'):
1263 ret = regnode(EOL);
1264#if defined(FEAT_SYN_HL) || defined(PROTO)
1265 had_eol = TRUE;
1266#endif
1267 break;
1268
1269 case Magic('<'):
1270 ret = regnode(BOW);
1271 break;
1272
1273 case Magic('>'):
1274 ret = regnode(EOW);
1275 break;
1276
1277 case Magic('_'):
1278 c = no_Magic(getchr());
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001279 if (c == '^') // "\_^" is start-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001280 {
1281 ret = regnode(BOL);
1282 break;
1283 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001284 if (c == '$') // "\_$" is end-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001285 {
1286 ret = regnode(EOL);
1287#if defined(FEAT_SYN_HL) || defined(PROTO)
1288 had_eol = TRUE;
1289#endif
1290 break;
1291 }
1292
1293 extra = ADD_NL;
1294 *flagp |= HASNL;
1295
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001296 // "\_[" is character range plus newline
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001297 if (c == '[')
1298 goto collection;
1299
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001300 // "\_x" is character class plus newline
1301 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001302
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001303 // Character classes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001304 case Magic('.'):
1305 case Magic('i'):
1306 case Magic('I'):
1307 case Magic('k'):
1308 case Magic('K'):
1309 case Magic('f'):
1310 case Magic('F'):
1311 case Magic('p'):
1312 case Magic('P'):
1313 case Magic('s'):
1314 case Magic('S'):
1315 case Magic('d'):
1316 case Magic('D'):
1317 case Magic('x'):
1318 case Magic('X'):
1319 case Magic('o'):
1320 case Magic('O'):
1321 case Magic('w'):
1322 case Magic('W'):
1323 case Magic('h'):
1324 case Magic('H'):
1325 case Magic('a'):
1326 case Magic('A'):
1327 case Magic('l'):
1328 case Magic('L'):
1329 case Magic('u'):
1330 case Magic('U'):
1331 p = vim_strchr(classchars, no_Magic(c));
1332 if (p == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001333 EMSG_RET_NULL(_(e_invalid_use_of_underscore));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001334
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001335 // When '.' is followed by a composing char ignore the dot, so that
1336 // the composing char is matched here.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001337 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1338 {
1339 c = getchr();
1340 goto do_multibyte;
1341 }
1342 ret = regnode(classcodes[p - classchars] + extra);
1343 *flagp |= HASWIDTH | SIMPLE;
1344 break;
1345
1346 case Magic('n'):
1347 if (reg_string)
1348 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001349 // In a string "\n" matches a newline character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001350 ret = regnode(EXACTLY);
1351 regc(NL);
1352 regc(NUL);
1353 *flagp |= HASWIDTH | SIMPLE;
1354 }
1355 else
1356 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001357 // In buffer text "\n" matches the end of a line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001358 ret = regnode(NEWL);
1359 *flagp |= HASWIDTH | HASNL;
1360 }
1361 break;
1362
1363 case Magic('('):
1364 if (one_exactly)
1365 EMSG_ONE_RET_NULL;
1366 ret = reg(REG_PAREN, &flags);
1367 if (ret == NULL)
1368 return NULL;
1369 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1370 break;
1371
1372 case NUL:
1373 case Magic('|'):
1374 case Magic('&'):
1375 case Magic(')'):
1376 if (one_exactly)
1377 EMSG_ONE_RET_NULL;
Bram Moolenaard0819d12021-12-31 23:15:53 +00001378 // Supposed to be caught earlier.
1379 IEMSG_RET_NULL(_(e_internal_error_in_regexp));
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001380 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001381
1382 case Magic('='):
1383 case Magic('?'):
1384 case Magic('+'):
1385 case Magic('@'):
1386 case Magic('{'):
1387 case Magic('*'):
1388 c = no_Magic(c);
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001389 EMSG3_RET_NULL(_(e_str_chr_follows_nothing),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001390 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001391 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001392
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001393 case Magic('~'): // previous substitute pattern
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001394 if (reg_prev_sub != NULL)
1395 {
1396 char_u *lp;
1397
1398 ret = regnode(EXACTLY);
1399 lp = reg_prev_sub;
1400 while (*lp != NUL)
1401 regc(*lp++);
1402 regc(NUL);
1403 if (*reg_prev_sub != NUL)
1404 {
1405 *flagp |= HASWIDTH;
1406 if ((lp - reg_prev_sub) == 1)
1407 *flagp |= SIMPLE;
1408 }
1409 }
1410 else
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001411 EMSG_RET_NULL(_(e_no_previous_substitute_regular_expression));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001412 break;
1413
1414 case Magic('1'):
1415 case Magic('2'):
1416 case Magic('3'):
1417 case Magic('4'):
1418 case Magic('5'):
1419 case Magic('6'):
1420 case Magic('7'):
1421 case Magic('8'):
1422 case Magic('9'):
1423 {
1424 int refnum;
1425
1426 refnum = c - Magic('0');
1427 if (!seen_endbrace(refnum))
1428 return NULL;
1429 ret = regnode(BACKREF + refnum);
1430 }
1431 break;
1432
1433 case Magic('z'):
1434 {
1435 c = no_Magic(getchr());
1436 switch (c)
1437 {
1438#ifdef FEAT_SYN_HL
1439 case '(': if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001440 EMSG_RET_NULL(_(e_z_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001441 if (one_exactly)
1442 EMSG_ONE_RET_NULL;
1443 ret = reg(REG_ZPAREN, &flags);
1444 if (ret == NULL)
1445 return NULL;
1446 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1447 re_has_z = REX_SET;
1448 break;
1449
1450 case '1':
1451 case '2':
1452 case '3':
1453 case '4':
1454 case '5':
1455 case '6':
1456 case '7':
1457 case '8':
1458 case '9': if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001459 EMSG_RET_NULL(_(e_z1_z9_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001460 ret = regnode(ZREF + c - '0');
1461 re_has_z = REX_USE;
1462 break;
1463#endif
1464
1465 case 's': ret = regnode(MOPEN + 0);
1466 if (re_mult_next("\\zs") == FAIL)
1467 return NULL;
1468 break;
1469
1470 case 'e': ret = regnode(MCLOSE + 0);
1471 if (re_mult_next("\\ze") == FAIL)
1472 return NULL;
1473 break;
1474
Bram Moolenaarb2810f12022-01-08 21:38:52 +00001475 default: EMSG_RET_NULL(_(e_invalid_character_after_bsl_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001476 }
1477 }
1478 break;
1479
1480 case Magic('%'):
1481 {
1482 c = no_Magic(getchr());
1483 switch (c)
1484 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001485 // () without a back reference
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001486 case '(':
1487 if (one_exactly)
1488 EMSG_ONE_RET_NULL;
1489 ret = reg(REG_NPAREN, &flags);
1490 if (ret == NULL)
1491 return NULL;
1492 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1493 break;
1494
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001495 // Catch \%^ and \%$ regardless of where they appear in the
1496 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001497 case '^':
1498 ret = regnode(RE_BOF);
1499 break;
1500
1501 case '$':
1502 ret = regnode(RE_EOF);
1503 break;
1504
1505 case '#':
1506 ret = regnode(CURSOR);
1507 break;
1508
1509 case 'V':
1510 ret = regnode(RE_VISUAL);
1511 break;
1512
1513 case 'C':
1514 ret = regnode(RE_COMPOSING);
1515 break;
1516
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001517 // \%[abc]: Emit as a list of branches, all ending at the last
1518 // branch which matches nothing.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001519 case '[':
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001520 if (one_exactly) // doesn't nest
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001521 EMSG_ONE_RET_NULL;
1522 {
1523 char_u *lastbranch;
1524 char_u *lastnode = NULL;
1525 char_u *br;
1526
1527 ret = NULL;
1528 while ((c = getchr()) != ']')
1529 {
1530 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001531 EMSG2_RET_NULL(_(e_missing_sb_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001532 reg_magic == MAGIC_ALL);
1533 br = regnode(BRANCH);
1534 if (ret == NULL)
1535 ret = br;
1536 else
1537 {
1538 regtail(lastnode, br);
1539 if (reg_toolong)
1540 return NULL;
1541 }
1542
1543 ungetchr();
1544 one_exactly = TRUE;
1545 lastnode = regatom(flagp);
1546 one_exactly = FALSE;
1547 if (lastnode == NULL)
1548 return NULL;
1549 }
1550 if (ret == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001551 EMSG2_RET_NULL(_(e_empty_str_brackets),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001552 reg_magic == MAGIC_ALL);
1553 lastbranch = regnode(BRANCH);
1554 br = regnode(NOTHING);
1555 if (ret != JUST_CALC_SIZE)
1556 {
1557 regtail(lastnode, br);
1558 regtail(lastbranch, br);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001559 // connect all branches to the NOTHING
1560 // branch at the end
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001561 for (br = ret; br != lastnode; )
1562 {
1563 if (OP(br) == BRANCH)
1564 {
1565 regtail(br, lastbranch);
1566 if (reg_toolong)
1567 return NULL;
1568 br = OPERAND(br);
1569 }
1570 else
1571 br = regnext(br);
1572 }
1573 }
1574 *flagp &= ~(HASWIDTH | SIMPLE);
1575 break;
1576 }
1577
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001578 case 'd': // %d123 decimal
1579 case 'o': // %o123 octal
1580 case 'x': // %xab hex 2
1581 case 'u': // %uabcd hex 4
1582 case 'U': // %U1234abcd hex 8
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001583 {
1584 long i;
1585
1586 switch (c)
1587 {
1588 case 'd': i = getdecchrs(); break;
1589 case 'o': i = getoctchrs(); break;
1590 case 'x': i = gethexchrs(2); break;
1591 case 'u': i = gethexchrs(4); break;
1592 case 'U': i = gethexchrs(8); break;
1593 default: i = -1; break;
1594 }
1595
1596 if (i < 0 || i > INT_MAX)
1597 EMSG2_RET_NULL(
Bram Moolenaara6f79292022-01-04 21:30:47 +00001598 _(e_invalid_character_after_str_2),
1599 reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001600 if (use_multibytecode(i))
1601 ret = regnode(MULTIBYTECODE);
1602 else
1603 ret = regnode(EXACTLY);
1604 if (i == 0)
1605 regc(0x0a);
1606 else
1607 regmbc(i);
1608 regc(NUL);
1609 *flagp |= HASWIDTH;
1610 break;
1611 }
1612
1613 default:
1614 if (VIM_ISDIGIT(c) || c == '<' || c == '>'
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001615 || c == '\'' || c == '.')
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001616 {
1617 long_u n = 0;
1618 int cmp;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001619 int cur = FALSE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001620
1621 cmp = c;
1622 if (cmp == '<' || cmp == '>')
1623 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001624 if (no_Magic(c) == '.')
1625 {
1626 cur = TRUE;
1627 c = getchr();
1628 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001629 while (VIM_ISDIGIT(c))
1630 {
1631 n = n * 10 + (c - '0');
1632 c = getchr();
1633 }
1634 if (c == '\'' && n == 0)
1635 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001636 // "\%'m", "\%<'m" and "\%>'m": Mark
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001637 c = getchr();
1638 ret = regnode(RE_MARK);
1639 if (ret == JUST_CALC_SIZE)
1640 regsize += 2;
1641 else
1642 {
1643 *regcode++ = c;
1644 *regcode++ = cmp;
1645 }
1646 break;
1647 }
1648 else if (c == 'l' || c == 'c' || c == 'v')
1649 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001650 if (cur && n)
1651 {
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001652 semsg(_(e_regexp_number_after_dot_pos_search_chr),
1653 no_Magic(c));
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001654 rc_did_emsg = TRUE;
1655 return NULL;
1656 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001657 if (c == 'l')
1658 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001659 if (cur)
1660 n = curwin->w_cursor.lnum;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001661 ret = regnode(RE_LNUM);
1662 if (save_prev_at_start)
1663 at_start = TRUE;
1664 }
1665 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001666 {
1667 if (cur)
1668 {
1669 n = curwin->w_cursor.col;
1670 n++;
1671 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001672 ret = regnode(RE_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001673 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001674 else
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001675 {
1676 if (cur)
1677 {
1678 colnr_T vcol = 0;
1679
1680 getvvcol(curwin, &curwin->w_cursor,
1681 NULL, NULL, &vcol);
1682 ++vcol;
1683 n = vcol;
1684 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001685 ret = regnode(RE_VCOL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001686 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001687 if (ret == JUST_CALC_SIZE)
1688 regsize += 5;
1689 else
1690 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001691 // put the number and the optional
1692 // comparator after the opcode
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001693 regcode = re_put_long(regcode, n);
1694 *regcode++ = cmp;
1695 }
1696 break;
1697 }
1698 }
1699
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001700 EMSG2_RET_NULL(_(e_invalid_character_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001701 reg_magic == MAGIC_ALL);
1702 }
1703 }
1704 break;
1705
1706 case Magic('['):
1707collection:
1708 {
1709 char_u *lp;
1710
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001711 // If there is no matching ']', we assume the '[' is a normal
1712 // character. This makes 'incsearch' and ":help [" work.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001713 lp = skip_anyof(regparse);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001714 if (*lp == ']') // there is a matching ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001715 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001716 int startc = -1; // > 0 when next '-' is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001717 int endc;
1718
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001719 // In a character class, different parsing rules apply.
1720 // Not even \ is special anymore, nothing is.
1721 if (*regparse == '^') // Complement of range.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001722 {
1723 ret = regnode(ANYBUT + extra);
1724 regparse++;
1725 }
1726 else
1727 ret = regnode(ANYOF + extra);
1728
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001729 // At the start ']' and '-' mean the literal character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001730 if (*regparse == ']' || *regparse == '-')
1731 {
1732 startc = *regparse;
1733 regc(*regparse++);
1734 }
1735
1736 while (*regparse != NUL && *regparse != ']')
1737 {
1738 if (*regparse == '-')
1739 {
1740 ++regparse;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001741 // The '-' is not used for a range at the end and
1742 // after or before a '\n'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001743 if (*regparse == ']' || *regparse == NUL
1744 || startc == -1
1745 || (regparse[0] == '\\' && regparse[1] == 'n'))
1746 {
1747 regc('-');
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001748 startc = '-'; // [--x] is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001749 }
1750 else
1751 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001752 // Also accept "a-[.z.]"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001753 endc = 0;
1754 if (*regparse == '[')
1755 endc = get_coll_element(&regparse);
1756 if (endc == 0)
1757 {
1758 if (has_mbyte)
1759 endc = mb_ptr2char_adv(&regparse);
1760 else
1761 endc = *regparse++;
1762 }
1763
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001764 // Handle \o40, \x20 and \u20AC style sequences
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001765 if (endc == '\\' && !reg_cpo_lit && !reg_cpo_bsl)
1766 endc = coll_get_char();
1767
1768 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001769 EMSG_RET_NULL(_(e_reverse_range_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001770 if (has_mbyte && ((*mb_char2len)(startc) > 1
1771 || (*mb_char2len)(endc) > 1))
1772 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001773 // Limit to a range of 256 chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001774 if (endc > startc + 256)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001775 EMSG_RET_NULL(_(e_range_too_large_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001776 while (++startc <= endc)
1777 regmbc(startc);
1778 }
1779 else
1780 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001781 while (++startc <= endc)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00001782 regc(startc);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001783 }
1784 startc = -1;
1785 }
1786 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001787 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1788 // accepts "\t", "\e", etc., but only when the 'l' flag in
1789 // 'cpoptions' is not included.
1790 // Posix doesn't recognize backslash at all.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001791 else if (*regparse == '\\'
1792 && !reg_cpo_bsl
1793 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1794 || (!reg_cpo_lit
1795 && vim_strchr(REGEXP_ABBR,
1796 regparse[1]) != NULL)))
1797 {
1798 regparse++;
1799 if (*regparse == 'n')
1800 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001801 // '\n' in range: also match NL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001802 if (ret != JUST_CALC_SIZE)
1803 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001804 // Using \n inside [^] does not change what
1805 // matches. "[^\n]" is the same as ".".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001806 if (*ret == ANYOF)
1807 {
1808 *ret = ANYOF + ADD_NL;
1809 *flagp |= HASNL;
1810 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001811 // else: must have had a \n already
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001812 }
1813 regparse++;
1814 startc = -1;
1815 }
1816 else if (*regparse == 'd'
1817 || *regparse == 'o'
1818 || *regparse == 'x'
1819 || *regparse == 'u'
1820 || *regparse == 'U')
1821 {
1822 startc = coll_get_char();
1823 if (startc == 0)
1824 regc(0x0a);
1825 else
1826 regmbc(startc);
1827 }
1828 else
1829 {
1830 startc = backslash_trans(*regparse++);
1831 regc(startc);
1832 }
1833 }
1834 else if (*regparse == '[')
1835 {
1836 int c_class;
1837 int cu;
1838
1839 c_class = get_char_class(&regparse);
1840 startc = -1;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001841 // Characters assumed to be 8 bits!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001842 switch (c_class)
1843 {
1844 case CLASS_NONE:
1845 c_class = get_equi_class(&regparse);
1846 if (c_class != 0)
1847 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001848 // produce equivalence class
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001849 reg_equi_class(c_class);
1850 }
1851 else if ((c_class =
1852 get_coll_element(&regparse)) != 0)
1853 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001854 // produce a collating element
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001855 regmbc(c_class);
1856 }
1857 else
1858 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001859 // literal '[', allow [[-x] as a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001860 startc = *regparse++;
1861 regc(startc);
1862 }
1863 break;
1864 case CLASS_ALNUM:
1865 for (cu = 1; cu < 128; cu++)
1866 if (isalnum(cu))
1867 regmbc(cu);
1868 break;
1869 case CLASS_ALPHA:
1870 for (cu = 1; cu < 128; cu++)
1871 if (isalpha(cu))
1872 regmbc(cu);
1873 break;
1874 case CLASS_BLANK:
1875 regc(' ');
1876 regc('\t');
1877 break;
1878 case CLASS_CNTRL:
1879 for (cu = 1; cu <= 127; cu++)
1880 if (iscntrl(cu))
1881 regmbc(cu);
1882 break;
1883 case CLASS_DIGIT:
1884 for (cu = 1; cu <= 127; cu++)
1885 if (VIM_ISDIGIT(cu))
1886 regmbc(cu);
1887 break;
1888 case CLASS_GRAPH:
1889 for (cu = 1; cu <= 127; cu++)
1890 if (isgraph(cu))
1891 regmbc(cu);
1892 break;
1893 case CLASS_LOWER:
1894 for (cu = 1; cu <= 255; cu++)
1895 if (MB_ISLOWER(cu) && cu != 170
1896 && cu != 186)
1897 regmbc(cu);
1898 break;
1899 case CLASS_PRINT:
1900 for (cu = 1; cu <= 255; cu++)
1901 if (vim_isprintc(cu))
1902 regmbc(cu);
1903 break;
1904 case CLASS_PUNCT:
1905 for (cu = 1; cu < 128; cu++)
1906 if (ispunct(cu))
1907 regmbc(cu);
1908 break;
1909 case CLASS_SPACE:
1910 for (cu = 9; cu <= 13; cu++)
1911 regc(cu);
1912 regc(' ');
1913 break;
1914 case CLASS_UPPER:
1915 for (cu = 1; cu <= 255; cu++)
1916 if (MB_ISUPPER(cu))
1917 regmbc(cu);
1918 break;
1919 case CLASS_XDIGIT:
1920 for (cu = 1; cu <= 255; cu++)
1921 if (vim_isxdigit(cu))
1922 regmbc(cu);
1923 break;
1924 case CLASS_TAB:
1925 regc('\t');
1926 break;
1927 case CLASS_RETURN:
1928 regc('\r');
1929 break;
1930 case CLASS_BACKSPACE:
1931 regc('\b');
1932 break;
1933 case CLASS_ESCAPE:
1934 regc('\033');
1935 break;
1936 case CLASS_IDENT:
1937 for (cu = 1; cu <= 255; cu++)
1938 if (vim_isIDc(cu))
1939 regmbc(cu);
1940 break;
1941 case CLASS_KEYWORD:
1942 for (cu = 1; cu <= 255; cu++)
1943 if (reg_iswordc(cu))
1944 regmbc(cu);
1945 break;
1946 case CLASS_FNAME:
1947 for (cu = 1; cu <= 255; cu++)
1948 if (vim_isfilec(cu))
1949 regmbc(cu);
1950 break;
1951 }
1952 }
1953 else
1954 {
1955 if (has_mbyte)
1956 {
1957 int len;
1958
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001959 // produce a multibyte character, including any
1960 // following composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001961 startc = mb_ptr2char(regparse);
1962 len = (*mb_ptr2len)(regparse);
1963 if (enc_utf8 && utf_char2len(startc) != len)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001964 startc = -1; // composing chars
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001965 while (--len >= 0)
1966 regc(*regparse++);
1967 }
1968 else
1969 {
1970 startc = *regparse++;
1971 regc(startc);
1972 }
1973 }
1974 }
1975 regc(NUL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001976 prevchr_len = 1; // last char was the ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001977 if (*regparse != ']')
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001978 EMSG_RET_NULL(_(e_too_many_brackets)); // Cannot happen?
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001979 skipchr(); // let's be friends with the lexer again
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001980 *flagp |= HASWIDTH | SIMPLE;
1981 break;
1982 }
1983 else if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001984 EMSG2_RET_NULL(_(e_missing_rsb_after_str_lsb),
1985 reg_magic > MAGIC_OFF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001986 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001987 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001988
1989 default:
1990 {
1991 int len;
1992
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001993 // A multi-byte character is handled as a separate atom if it's
1994 // before a multi and when it's a composing char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001995 if (use_multibytecode(c))
1996 {
1997do_multibyte:
1998 ret = regnode(MULTIBYTECODE);
1999 regmbc(c);
2000 *flagp |= HASWIDTH | SIMPLE;
2001 break;
2002 }
2003
2004 ret = regnode(EXACTLY);
2005
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002006 // Append characters as long as:
2007 // - there is no following multi, we then need the character in
2008 // front of it as a single character operand
2009 // - not running into a Magic character
2010 // - "one_exactly" is not set
2011 // But always emit at least one character. Might be a Multi,
2012 // e.g., a "[" without matching "]".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002013 for (len = 0; c != NUL && (len == 0
2014 || (re_multi_type(peekchr()) == NOT_MULTI
2015 && !one_exactly
2016 && !is_Magic(c))); ++len)
2017 {
2018 c = no_Magic(c);
2019 if (has_mbyte)
2020 {
2021 regmbc(c);
2022 if (enc_utf8)
2023 {
2024 int l;
2025
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002026 // Need to get composing character too.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002027 for (;;)
2028 {
2029 l = utf_ptr2len(regparse);
2030 if (!UTF_COMPOSINGLIKE(regparse, regparse + l))
2031 break;
2032 regmbc(utf_ptr2char(regparse));
2033 skipchr();
2034 }
2035 }
2036 }
2037 else
2038 regc(c);
2039 c = getchr();
2040 }
2041 ungetchr();
2042
2043 regc(NUL);
2044 *flagp |= HASWIDTH;
2045 if (len == 1)
2046 *flagp |= SIMPLE;
2047 }
2048 break;
2049 }
2050
2051 return ret;
2052}
2053
2054/*
2055 * Parse something followed by possible [*+=].
2056 *
2057 * Note that the branching code sequences used for = and the general cases
2058 * of * and + are somewhat optimized: they use the same NOTHING node as
2059 * both the endmarker for their branch list and the body of the last branch.
2060 * It might seem that this node could be dispensed with entirely, but the
2061 * endmarker role is not redundant.
2062 */
2063 static char_u *
2064regpiece(int *flagp)
2065{
2066 char_u *ret;
2067 int op;
2068 char_u *next;
2069 int flags;
2070 long minval;
2071 long maxval;
2072
2073 ret = regatom(&flags);
2074 if (ret == NULL)
2075 return NULL;
2076
2077 op = peekchr();
2078 if (re_multi_type(op) == NOT_MULTI)
2079 {
2080 *flagp = flags;
2081 return ret;
2082 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002083 // default flags
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002084 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
2085
2086 skipchr();
2087 switch (op)
2088 {
2089 case Magic('*'):
2090 if (flags & SIMPLE)
2091 reginsert(STAR, ret);
2092 else
2093 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002094 // Emit x* as (x&|), where & means "self".
2095 reginsert(BRANCH, ret); // Either x
2096 regoptail(ret, regnode(BACK)); // and loop
2097 regoptail(ret, ret); // back
2098 regtail(ret, regnode(BRANCH)); // or
2099 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002100 }
2101 break;
2102
2103 case Magic('+'):
2104 if (flags & SIMPLE)
2105 reginsert(PLUS, ret);
2106 else
2107 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002108 // Emit x+ as x(&|), where & means "self".
2109 next = regnode(BRANCH); // Either
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002110 regtail(ret, next);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002111 regtail(regnode(BACK), ret); // loop back
2112 regtail(next, regnode(BRANCH)); // or
2113 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002114 }
2115 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2116 break;
2117
2118 case Magic('@'):
2119 {
2120 int lop = END;
2121 long nr;
2122
2123 nr = getdecchrs();
2124 switch (no_Magic(getchr()))
2125 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002126 case '=': lop = MATCH; break; // \@=
2127 case '!': lop = NOMATCH; break; // \@!
2128 case '>': lop = SUBPAT; break; // \@>
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002129 case '<': switch (no_Magic(getchr()))
2130 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002131 case '=': lop = BEHIND; break; // \@<=
2132 case '!': lop = NOBEHIND; break; // \@<!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002133 }
2134 }
2135 if (lop == END)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002136 EMSG2_RET_NULL(_(e_invalid_character_after_str_at),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002137 reg_magic == MAGIC_ALL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002138 // Look behind must match with behind_pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002139 if (lop == BEHIND || lop == NOBEHIND)
2140 {
2141 regtail(ret, regnode(BHPOS));
2142 *flagp |= HASLOOKBH;
2143 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002144 regtail(ret, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002145 if (lop == BEHIND || lop == NOBEHIND)
2146 {
2147 if (nr < 0)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002148 nr = 0; // no limit is same as zero limit
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002149 reginsert_nr(lop, nr, ret);
2150 }
2151 else
2152 reginsert(lop, ret);
2153 break;
2154 }
2155
2156 case Magic('?'):
2157 case Magic('='):
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002158 // Emit x= as (x|)
2159 reginsert(BRANCH, ret); // Either x
2160 regtail(ret, regnode(BRANCH)); // or
2161 next = regnode(NOTHING); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002162 regtail(ret, next);
2163 regoptail(ret, next);
2164 break;
2165
2166 case Magic('{'):
2167 if (!read_limits(&minval, &maxval))
2168 return NULL;
2169 if (flags & SIMPLE)
2170 {
2171 reginsert(BRACE_SIMPLE, ret);
2172 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2173 }
2174 else
2175 {
2176 if (num_complex_braces >= 10)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002177 EMSG2_RET_NULL(_(e_too_many_complex_str_curly),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002178 reg_magic == MAGIC_ALL);
2179 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
2180 regoptail(ret, regnode(BACK));
2181 regoptail(ret, ret);
2182 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2183 ++num_complex_braces;
2184 }
2185 if (minval > 0 && maxval > 0)
2186 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2187 break;
2188 }
2189 if (re_multi_type(peekchr()) != NOT_MULTI)
2190 {
2191 // Can't have a multi follow a multi.
2192 if (peekchr() == Magic('*'))
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00002193 EMSG2_RET_NULL(_(e_nested_str), reg_magic >= MAGIC_ON);
2194 EMSG3_RET_NULL(_(e_nested_str_chr), reg_magic == MAGIC_ALL,
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002195 no_Magic(peekchr()));
2196 }
2197
2198 return ret;
2199}
2200
2201/*
2202 * Parse one alternative of an | or & operator.
2203 * Implements the concatenation operator.
2204 */
2205 static char_u *
2206regconcat(int *flagp)
2207{
2208 char_u *first = NULL;
2209 char_u *chain = NULL;
2210 char_u *latest;
2211 int flags;
2212 int cont = TRUE;
2213
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002214 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002215
2216 while (cont)
2217 {
2218 switch (peekchr())
2219 {
2220 case NUL:
2221 case Magic('|'):
2222 case Magic('&'):
2223 case Magic(')'):
2224 cont = FALSE;
2225 break;
2226 case Magic('Z'):
2227 regflags |= RF_ICOMBINE;
2228 skipchr_keepstart();
2229 break;
2230 case Magic('c'):
2231 regflags |= RF_ICASE;
2232 skipchr_keepstart();
2233 break;
2234 case Magic('C'):
2235 regflags |= RF_NOICASE;
2236 skipchr_keepstart();
2237 break;
2238 case Magic('v'):
2239 reg_magic = MAGIC_ALL;
2240 skipchr_keepstart();
2241 curchr = -1;
2242 break;
2243 case Magic('m'):
2244 reg_magic = MAGIC_ON;
2245 skipchr_keepstart();
2246 curchr = -1;
2247 break;
2248 case Magic('M'):
2249 reg_magic = MAGIC_OFF;
2250 skipchr_keepstart();
2251 curchr = -1;
2252 break;
2253 case Magic('V'):
2254 reg_magic = MAGIC_NONE;
2255 skipchr_keepstart();
2256 curchr = -1;
2257 break;
2258 default:
2259 latest = regpiece(&flags);
2260 if (latest == NULL || reg_toolong)
2261 return NULL;
2262 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002263 if (chain == NULL) // First piece.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002264 *flagp |= flags & SPSTART;
2265 else
2266 regtail(chain, latest);
2267 chain = latest;
2268 if (first == NULL)
2269 first = latest;
2270 break;
2271 }
2272 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002273 if (first == NULL) // Loop ran zero times.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002274 first = regnode(NOTHING);
2275 return first;
2276}
2277
2278/*
2279 * Parse one alternative of an | operator.
2280 * Implements the & operator.
2281 */
2282 static char_u *
2283regbranch(int *flagp)
2284{
2285 char_u *ret;
2286 char_u *chain = NULL;
2287 char_u *latest;
2288 int flags;
2289
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002290 *flagp = WORST | HASNL; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002291
2292 ret = regnode(BRANCH);
2293 for (;;)
2294 {
2295 latest = regconcat(&flags);
2296 if (latest == NULL)
2297 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002298 // If one of the branches has width, the whole thing has. If one of
2299 // the branches anchors at start-of-line, the whole thing does.
2300 // If one of the branches uses look-behind, the whole thing does.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002301 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002302 // If one of the branches doesn't match a line-break, the whole thing
2303 // doesn't.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002304 *flagp &= ~HASNL | (flags & HASNL);
2305 if (chain != NULL)
2306 regtail(chain, latest);
2307 if (peekchr() != Magic('&'))
2308 break;
2309 skipchr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002310 regtail(latest, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002311 if (reg_toolong)
2312 break;
2313 reginsert(MATCH, latest);
2314 chain = latest;
2315 }
2316
2317 return ret;
2318}
2319
2320/*
2321 * Parse regular expression, i.e. main body or parenthesized thing.
2322 *
2323 * Caller must absorb opening parenthesis.
2324 *
2325 * Combining parenthesis handling with the base level of regular expression
2326 * is a trifle forced, but the need to tie the tails of the branches to what
2327 * follows makes it hard to avoid.
2328 */
2329 static char_u *
2330reg(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002331 int paren, // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002332 int *flagp)
2333{
2334 char_u *ret;
2335 char_u *br;
2336 char_u *ender;
2337 int parno = 0;
2338 int flags;
2339
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002340 *flagp = HASWIDTH; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002341
2342#ifdef FEAT_SYN_HL
2343 if (paren == REG_ZPAREN)
2344 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002345 // Make a ZOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002346 if (regnzpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002347 EMSG_RET_NULL(_(e_too_many_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002348 parno = regnzpar;
2349 regnzpar++;
2350 ret = regnode(ZOPEN + parno);
2351 }
2352 else
2353#endif
2354 if (paren == REG_PAREN)
2355 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002356 // Make a MOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002357 if (regnpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002358 EMSG2_RET_NULL(_(e_too_many_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002359 parno = regnpar;
2360 ++regnpar;
2361 ret = regnode(MOPEN + parno);
2362 }
2363 else if (paren == REG_NPAREN)
2364 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002365 // Make a NOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002366 ret = regnode(NOPEN);
2367 }
2368 else
2369 ret = NULL;
2370
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002371 // Pick up the branches, linking them together.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002372 br = regbranch(&flags);
2373 if (br == NULL)
2374 return NULL;
2375 if (ret != NULL)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002376 regtail(ret, br); // [MZ]OPEN -> first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002377 else
2378 ret = br;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002379 // If one of the branches can be zero-width, the whole thing can.
2380 // If one of the branches has * at start or matches a line-break, the
2381 // whole thing can.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002382 if (!(flags & HASWIDTH))
2383 *flagp &= ~HASWIDTH;
2384 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2385 while (peekchr() == Magic('|'))
2386 {
2387 skipchr();
2388 br = regbranch(&flags);
2389 if (br == NULL || reg_toolong)
2390 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002391 regtail(ret, br); // BRANCH -> BRANCH.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002392 if (!(flags & HASWIDTH))
2393 *flagp &= ~HASWIDTH;
2394 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2395 }
2396
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002397 // Make a closing node, and hook it on the end.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002398 ender = regnode(
2399#ifdef FEAT_SYN_HL
2400 paren == REG_ZPAREN ? ZCLOSE + parno :
2401#endif
2402 paren == REG_PAREN ? MCLOSE + parno :
2403 paren == REG_NPAREN ? NCLOSE : END);
2404 regtail(ret, ender);
2405
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002406 // Hook the tails of the branches to the closing node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002407 for (br = ret; br != NULL; br = regnext(br))
2408 regoptail(br, ender);
2409
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002410 // Check for proper termination.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002411 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2412 {
2413#ifdef FEAT_SYN_HL
2414 if (paren == REG_ZPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002415 EMSG_RET_NULL(_(e_unmatched_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002416 else
2417#endif
2418 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002419 EMSG2_RET_NULL(_(e_unmatched_str_percent_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002420 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002421 EMSG2_RET_NULL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002422 }
2423 else if (paren == REG_NOPAREN && peekchr() != NUL)
2424 {
2425 if (curchr == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002426 EMSG2_RET_NULL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002427 else
Bram Moolenaar74409f62022-01-01 15:58:22 +00002428 EMSG_RET_NULL(_(e_trailing_characters)); // "Can't happen".
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002429 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002430 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002431 // Here we set the flag allowing back references to this set of
2432 // parentheses.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002433 if (paren == REG_PAREN)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002434 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002435 return ret;
2436}
2437
2438/*
2439 * bt_regcomp() - compile a regular expression into internal code for the
2440 * traditional back track matcher.
2441 * Returns the program in allocated space. Returns NULL for an error.
2442 *
2443 * We can't allocate space until we know how big the compiled form will be,
2444 * but we can't compile it (and thus know how big it is) until we've got a
2445 * place to put the code. So we cheat: we compile it twice, once with code
2446 * generation turned off and size counting turned on, and once "for real".
2447 * This also means that we don't allocate space until we are sure that the
2448 * thing really will compile successfully, and we never have to move the
2449 * code and thus invalidate pointers into it. (Note that it has to be in
2450 * one piece because vim_free() must be able to free it all.)
2451 *
2452 * Whether upper/lower case is to be ignored is decided when executing the
2453 * program, it does not matter here.
2454 *
2455 * Beware that the optimization-preparation code in here knows about some
2456 * of the structure of the compiled regexp.
2457 * "re_flags": RE_MAGIC and/or RE_STRING.
2458 */
2459 static regprog_T *
2460bt_regcomp(char_u *expr, int re_flags)
2461{
2462 bt_regprog_T *r;
2463 char_u *scan;
2464 char_u *longest;
2465 int len;
2466 int flags;
2467
2468 if (expr == NULL)
Bram Moolenaare29a27f2021-07-20 21:07:36 +02002469 IEMSG_RET_NULL(_(e_null_argument));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002470
2471 init_class_tab();
2472
2473 // First pass: determine size, legality.
2474 regcomp_start(expr, re_flags);
2475 regcode = JUST_CALC_SIZE;
2476 regc(REGMAGIC);
2477 if (reg(REG_NOPAREN, &flags) == NULL)
2478 return NULL;
2479
2480 // Allocate space.
2481 r = alloc(offsetof(bt_regprog_T, program) + regsize);
2482 if (r == NULL)
2483 return NULL;
2484 r->re_in_use = FALSE;
2485
2486 // Second pass: emit code.
2487 regcomp_start(expr, re_flags);
2488 regcode = r->program;
2489 regc(REGMAGIC);
2490 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
2491 {
2492 vim_free(r);
2493 if (reg_toolong)
Bram Moolenaareaaac012022-01-02 17:00:40 +00002494 EMSG_RET_NULL(_(e_pattern_too_long));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002495 return NULL;
2496 }
2497
2498 // Dig out information for optimizations.
2499 r->regstart = NUL; // Worst-case defaults.
2500 r->reganch = 0;
2501 r->regmust = NULL;
2502 r->regmlen = 0;
2503 r->regflags = regflags;
2504 if (flags & HASNL)
2505 r->regflags |= RF_HASNL;
2506 if (flags & HASLOOKBH)
2507 r->regflags |= RF_LOOKBH;
2508#ifdef FEAT_SYN_HL
2509 // Remember whether this pattern has any \z specials in it.
2510 r->reghasz = re_has_z;
2511#endif
2512 scan = r->program + 1; // First BRANCH.
2513 if (OP(regnext(scan)) == END) // Only one top-level choice.
2514 {
2515 scan = OPERAND(scan);
2516
2517 // Starting-point info.
2518 if (OP(scan) == BOL || OP(scan) == RE_BOF)
2519 {
2520 r->reganch++;
2521 scan = regnext(scan);
2522 }
2523
2524 if (OP(scan) == EXACTLY)
2525 {
2526 if (has_mbyte)
2527 r->regstart = (*mb_ptr2char)(OPERAND(scan));
2528 else
2529 r->regstart = *OPERAND(scan);
2530 }
2531 else if ((OP(scan) == BOW
2532 || OP(scan) == EOW
2533 || OP(scan) == NOTHING
2534 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
2535 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
2536 && OP(regnext(scan)) == EXACTLY)
2537 {
2538 if (has_mbyte)
2539 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
2540 else
2541 r->regstart = *OPERAND(regnext(scan));
2542 }
2543
2544 // If there's something expensive in the r.e., find the longest
2545 // literal string that must appear and make it the regmust. Resolve
2546 // ties in favor of later strings, since the regstart check works
2547 // with the beginning of the r.e. and avoiding duplication
2548 // strengthens checking. Not a strong reason, but sufficient in the
2549 // absence of others.
2550
2551 // When the r.e. starts with BOW, it is faster to look for a regmust
2552 // first. Used a lot for "#" and "*" commands. (Added by mool).
2553 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
2554 && !(flags & HASNL))
2555 {
2556 longest = NULL;
2557 len = 0;
2558 for (; scan != NULL; scan = regnext(scan))
2559 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
2560 {
2561 longest = OPERAND(scan);
2562 len = (int)STRLEN(OPERAND(scan));
2563 }
2564 r->regmust = longest;
2565 r->regmlen = len;
2566 }
2567 }
2568#ifdef BT_REGEXP_DUMP
2569 regdump(expr, r);
2570#endif
2571 r->engine = &bt_regengine;
2572 return (regprog_T *)r;
2573}
2574
2575#if defined(FEAT_SYN_HL) || defined(PROTO)
2576/*
2577 * Check if during the previous call to vim_regcomp the EOL item "$" has been
2578 * found. This is messy, but it works fine.
2579 */
2580 int
2581vim_regcomp_had_eol(void)
2582{
2583 return had_eol;
2584}
2585#endif
2586
2587/*
2588 * Get a number after a backslash that is inside [].
2589 * When nothing is recognized return a backslash.
2590 */
2591 static int
2592coll_get_char(void)
2593{
2594 long nr = -1;
2595
2596 switch (*regparse++)
2597 {
2598 case 'd': nr = getdecchrs(); break;
2599 case 'o': nr = getoctchrs(); break;
2600 case 'x': nr = gethexchrs(2); break;
2601 case 'u': nr = gethexchrs(4); break;
2602 case 'U': nr = gethexchrs(8); break;
2603 }
2604 if (nr < 0 || nr > INT_MAX)
2605 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002606 // If getting the number fails be backwards compatible: the character
2607 // is a backslash.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002608 --regparse;
2609 nr = '\\';
2610 }
2611 return nr;
2612}
2613
2614/*
2615 * Free a compiled regexp program, returned by bt_regcomp().
2616 */
2617 static void
2618bt_regfree(regprog_T *prog)
2619{
2620 vim_free(prog);
2621}
2622
// Step rex.input forward using MB_PTR_ADV().
#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)
2624
/*
 * Storage for the arguments from BRACE_LIMITS.  Logically these belong to
 * regmatch(); they are kept at file level to reduce the amount of stack
 * space used (it can be called recursively many times).
 */
static long     bl_minval;
static long     bl_maxval;
2632
2633/*
2634 * Save the input line and position in a regsave_T.
2635 */
2636 static void
2637reg_save(regsave_T *save, garray_T *gap)
2638{
2639 if (REG_MULTI)
2640 {
2641 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
2642 save->rs_u.pos.lnum = rex.lnum;
2643 }
2644 else
2645 save->rs_u.ptr = rex.input;
2646 save->rs_len = gap->ga_len;
2647}
2648
2649/*
2650 * Restore the input line and position from a regsave_T.
2651 */
2652 static void
2653reg_restore(regsave_T *save, garray_T *gap)
2654{
2655 if (REG_MULTI)
2656 {
2657 if (rex.lnum != save->rs_u.pos.lnum)
2658 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002659 // only call reg_getline() when the line number changed to save
2660 // a bit of time
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002661 rex.lnum = save->rs_u.pos.lnum;
2662 rex.line = reg_getline(rex.lnum);
2663 }
2664 rex.input = rex.line + save->rs_u.pos.col;
2665 }
2666 else
2667 rex.input = save->rs_u.ptr;
2668 gap->ga_len = save->rs_len;
2669}
2670
2671/*
2672 * Return TRUE if current position is equal to saved position.
2673 */
2674 static int
2675reg_save_equal(regsave_T *save)
2676{
2677 if (REG_MULTI)
2678 return rex.lnum == save->rs_u.pos.lnum
2679 && rex.input == rex.line + save->rs_u.pos.col;
2680 return rex.input == save->rs_u.ptr;
2681}
2682
// Save the sub-expressions before attempting a match.
// Uses save_se_multi() with "posp" when REG_MULTI is set, save_se_one() with
// "pp" otherwise.  The expansion is wrapped in parentheses so that operators
// around the macro call cannot re-associate with the conditional expression.
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))

// After a failed match restore the sub-expressions.
// Writes the saved position back through "posp" when REG_MULTI is set, the
// saved pointer through "pp" otherwise.
// NOTE: this expands to a braced block, not an expression.
#define restore_se(savep, posp, pp) { \
    if (REG_MULTI) \
        *(posp) = (savep)->se_u.pos; \
    else \
        *(pp) = (savep)->se_u.ptr; }
2693
2694/*
2695 * Tentatively set the sub-expression start to the current position (after
2696 * calling regmatch() they will have changed). Need to save the existing
2697 * values for when there is no match.
2698 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
2699 * depending on REG_MULTI.
2700 */
2701 static void
2702save_se_multi(save_se_T *savep, lpos_T *posp)
2703{
2704 savep->se_u.pos = *posp;
2705 posp->lnum = rex.lnum;
2706 posp->col = (colnr_T)(rex.input - rex.line);
2707}
2708
2709 static void
2710save_se_one(save_se_T *savep, char_u **pp)
2711{
2712 savep->se_u.ptr = *pp;
2713 *pp = rex.input;
2714}
2715
2716/*
2717 * regrepeat - repeatedly match something simple, return how many.
2718 * Advances rex.input (and rex.lnum) to just after the matched chars.
2719 */
2720 static int
2721regrepeat(
2722 char_u *p,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002723 long maxcount) // maximum number of matches allowed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002724{
2725 long count = 0;
2726 char_u *scan;
2727 char_u *opnd;
2728 int mask;
2729 int testval = 0;
2730
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002731 scan = rex.input; // Make local copy of rex.input for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002732 opnd = OPERAND(p);
2733 switch (OP(p))
2734 {
2735 case ANY:
2736 case ANY + ADD_NL:
2737 while (count < maxcount)
2738 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002739 // Matching anything means we continue until end-of-line (or
2740 // end-of-file for ANY + ADD_NL), only limited by maxcount.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002741 while (*scan != NUL && count < maxcount)
2742 {
2743 ++count;
2744 MB_PTR_ADV(scan);
2745 }
2746 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2747 || rex.reg_line_lbr || count == maxcount)
2748 break;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002749 ++count; // count the line-break
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002750 reg_nextline();
2751 scan = rex.input;
2752 if (got_int)
2753 break;
2754 }
2755 break;
2756
2757 case IDENT:
2758 case IDENT + ADD_NL:
2759 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002760 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002761 case SIDENT:
2762 case SIDENT + ADD_NL:
2763 while (count < maxcount)
2764 {
2765 if (vim_isIDc(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2766 {
2767 MB_PTR_ADV(scan);
2768 }
2769 else if (*scan == NUL)
2770 {
2771 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2772 || rex.reg_line_lbr)
2773 break;
2774 reg_nextline();
2775 scan = rex.input;
2776 if (got_int)
2777 break;
2778 }
2779 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2780 ++scan;
2781 else
2782 break;
2783 ++count;
2784 }
2785 break;
2786
2787 case KWORD:
2788 case KWORD + ADD_NL:
2789 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002790 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002791 case SKWORD:
2792 case SKWORD + ADD_NL:
2793 while (count < maxcount)
2794 {
2795 if (vim_iswordp_buf(scan, rex.reg_buf)
2796 && (testval || !VIM_ISDIGIT(*scan)))
2797 {
2798 MB_PTR_ADV(scan);
2799 }
2800 else if (*scan == NUL)
2801 {
2802 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2803 || rex.reg_line_lbr)
2804 break;
2805 reg_nextline();
2806 scan = rex.input;
2807 if (got_int)
2808 break;
2809 }
2810 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2811 ++scan;
2812 else
2813 break;
2814 ++count;
2815 }
2816 break;
2817
2818 case FNAME:
2819 case FNAME + ADD_NL:
2820 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002821 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002822 case SFNAME:
2823 case SFNAME + ADD_NL:
2824 while (count < maxcount)
2825 {
2826 if (vim_isfilec(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
2827 {
2828 MB_PTR_ADV(scan);
2829 }
2830 else if (*scan == NUL)
2831 {
2832 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2833 || rex.reg_line_lbr)
2834 break;
2835 reg_nextline();
2836 scan = rex.input;
2837 if (got_int)
2838 break;
2839 }
2840 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2841 ++scan;
2842 else
2843 break;
2844 ++count;
2845 }
2846 break;
2847
2848 case PRINT:
2849 case PRINT + ADD_NL:
2850 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002851 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002852 case SPRINT:
2853 case SPRINT + ADD_NL:
2854 while (count < maxcount)
2855 {
2856 if (*scan == NUL)
2857 {
2858 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2859 || rex.reg_line_lbr)
2860 break;
2861 reg_nextline();
2862 scan = rex.input;
2863 if (got_int)
2864 break;
2865 }
2866 else if (vim_isprintc(PTR2CHAR(scan)) == 1
2867 && (testval || !VIM_ISDIGIT(*scan)))
2868 {
2869 MB_PTR_ADV(scan);
2870 }
2871 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2872 ++scan;
2873 else
2874 break;
2875 ++count;
2876 }
2877 break;
2878
2879 case WHITE:
2880 case WHITE + ADD_NL:
2881 testval = mask = RI_WHITE;
2882do_class:
2883 while (count < maxcount)
2884 {
2885 int l;
2886
2887 if (*scan == NUL)
2888 {
2889 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
2890 || rex.reg_line_lbr)
2891 break;
2892 reg_nextline();
2893 scan = rex.input;
2894 if (got_int)
2895 break;
2896 }
2897 else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1)
2898 {
2899 if (testval != 0)
2900 break;
2901 scan += l;
2902 }
2903 else if ((class_tab[*scan] & mask) == testval)
2904 ++scan;
2905 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
2906 ++scan;
2907 else
2908 break;
2909 ++count;
2910 }
2911 break;
2912
2913 case NWHITE:
2914 case NWHITE + ADD_NL:
2915 mask = RI_WHITE;
2916 goto do_class;
2917 case DIGIT:
2918 case DIGIT + ADD_NL:
2919 testval = mask = RI_DIGIT;
2920 goto do_class;
2921 case NDIGIT:
2922 case NDIGIT + ADD_NL:
2923 mask = RI_DIGIT;
2924 goto do_class;
2925 case HEX:
2926 case HEX + ADD_NL:
2927 testval = mask = RI_HEX;
2928 goto do_class;
2929 case NHEX:
2930 case NHEX + ADD_NL:
2931 mask = RI_HEX;
2932 goto do_class;
2933 case OCTAL:
2934 case OCTAL + ADD_NL:
2935 testval = mask = RI_OCTAL;
2936 goto do_class;
2937 case NOCTAL:
2938 case NOCTAL + ADD_NL:
2939 mask = RI_OCTAL;
2940 goto do_class;
2941 case WORD:
2942 case WORD + ADD_NL:
2943 testval = mask = RI_WORD;
2944 goto do_class;
2945 case NWORD:
2946 case NWORD + ADD_NL:
2947 mask = RI_WORD;
2948 goto do_class;
2949 case HEAD:
2950 case HEAD + ADD_NL:
2951 testval = mask = RI_HEAD;
2952 goto do_class;
2953 case NHEAD:
2954 case NHEAD + ADD_NL:
2955 mask = RI_HEAD;
2956 goto do_class;
2957 case ALPHA:
2958 case ALPHA + ADD_NL:
2959 testval = mask = RI_ALPHA;
2960 goto do_class;
2961 case NALPHA:
2962 case NALPHA + ADD_NL:
2963 mask = RI_ALPHA;
2964 goto do_class;
2965 case LOWER:
2966 case LOWER + ADD_NL:
2967 testval = mask = RI_LOWER;
2968 goto do_class;
2969 case NLOWER:
2970 case NLOWER + ADD_NL:
2971 mask = RI_LOWER;
2972 goto do_class;
2973 case UPPER:
2974 case UPPER + ADD_NL:
2975 testval = mask = RI_UPPER;
2976 goto do_class;
2977 case NUPPER:
2978 case NUPPER + ADD_NL:
2979 mask = RI_UPPER;
2980 goto do_class;
2981
2982 case EXACTLY:
2983 {
2984 int cu, cl;
2985
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002986 // This doesn't do a multi-byte character, because a MULTIBYTECODE
2987 // would have been used for it. It does handle single-byte
2988 // characters, such as latin1.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002989 if (rex.reg_ic)
2990 {
2991 cu = MB_TOUPPER(*opnd);
2992 cl = MB_TOLOWER(*opnd);
2993 while (count < maxcount && (*scan == cu || *scan == cl))
2994 {
2995 count++;
2996 scan++;
2997 }
2998 }
2999 else
3000 {
3001 cu = *opnd;
3002 while (count < maxcount && *scan == cu)
3003 {
3004 count++;
3005 scan++;
3006 }
3007 }
3008 break;
3009 }
3010
3011 case MULTIBYTECODE:
3012 {
3013 int i, len, cf = 0;
3014
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003015 // Safety check (just in case 'encoding' was changed since
3016 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003017 if ((len = (*mb_ptr2len)(opnd)) > 1)
3018 {
3019 if (rex.reg_ic && enc_utf8)
3020 cf = utf_fold(utf_ptr2char(opnd));
3021 while (count < maxcount && (*mb_ptr2len)(scan) >= len)
3022 {
3023 for (i = 0; i < len; ++i)
3024 if (opnd[i] != scan[i])
3025 break;
3026 if (i < len && (!rex.reg_ic || !enc_utf8
3027 || utf_fold(utf_ptr2char(scan)) != cf))
3028 break;
3029 scan += len;
3030 ++count;
3031 }
3032 }
3033 }
3034 break;
3035
3036 case ANYOF:
3037 case ANYOF + ADD_NL:
3038 testval = TRUE;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003039 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003040
3041 case ANYBUT:
3042 case ANYBUT + ADD_NL:
3043 while (count < maxcount)
3044 {
3045 int len;
3046
3047 if (*scan == NUL)
3048 {
3049 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
3050 || rex.reg_line_lbr)
3051 break;
3052 reg_nextline();
3053 scan = rex.input;
3054 if (got_int)
3055 break;
3056 }
3057 else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
3058 ++scan;
3059 else if (has_mbyte && (len = (*mb_ptr2len)(scan)) > 1)
3060 {
3061 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
3062 break;
3063 scan += len;
3064 }
3065 else
3066 {
3067 if ((cstrchr(opnd, *scan) == NULL) == testval)
3068 break;
3069 ++scan;
3070 }
3071 ++count;
3072 }
3073 break;
3074
3075 case NEWL:
3076 while (count < maxcount
3077 && ((*scan == NUL && rex.lnum <= rex.reg_maxline
3078 && !rex.reg_line_lbr && REG_MULTI)
3079 || (*scan == '\n' && rex.reg_line_lbr)))
3080 {
3081 count++;
3082 if (rex.reg_line_lbr)
3083 ADVANCE_REGINPUT();
3084 else
3085 reg_nextline();
3086 scan = rex.input;
3087 if (got_int)
3088 break;
3089 }
3090 break;
3091
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003092 default: // Oh dear. Called inappropriately.
Bram Moolenaare29a27f2021-07-20 21:07:36 +02003093 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003094#ifdef DEBUG
3095 printf("Called regrepeat with op code %d\n", OP(p));
3096#endif
3097 break;
3098 }
3099
3100 rex.input = scan;
3101
3102 return (int)count;
3103}
3104
3105/*
3106 * Push an item onto the regstack.
3107 * Returns pointer to new item. Returns NULL when out of memory.
3108 */
3109 static regitem_T *
3110regstack_push(regstate_T state, char_u *scan)
3111{
3112 regitem_T *rp;
3113
3114 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
3115 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00003116 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003117 return NULL;
3118 }
3119 if (ga_grow(&regstack, sizeof(regitem_T)) == FAIL)
3120 return NULL;
3121
3122 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
3123 rp->rs_state = state;
3124 rp->rs_scan = scan;
3125
3126 regstack.ga_len += sizeof(regitem_T);
3127 return rp;
3128}
3129
3130/*
3131 * Pop an item from the regstack.
3132 */
3133 static void
3134regstack_pop(char_u **scan)
3135{
3136 regitem_T *rp;
3137
3138 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
3139 *scan = rp->rs_scan;
3140
3141 regstack.ga_len -= sizeof(regitem_T);
3142}
3143
3144/*
3145 * Save the current subexpr to "bp", so that they can be restored
3146 * later by restore_subexpr().
3147 */
3148 static void
3149save_subexpr(regbehind_T *bp)
3150{
3151 int i;
3152
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003153 // When "rex.need_clear_subexpr" is set we don't need to save the values,
3154 // only remember that this flag needs to be set again when restoring.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003155 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
3156 if (!rex.need_clear_subexpr)
3157 {
3158 for (i = 0; i < NSUBEXP; ++i)
3159 {
3160 if (REG_MULTI)
3161 {
3162 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
3163 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
3164 }
3165 else
3166 {
3167 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
3168 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
3169 }
3170 }
3171 }
3172}
3173
3174/*
3175 * Restore the subexpr from "bp".
3176 */
3177 static void
3178restore_subexpr(regbehind_T *bp)
3179{
3180 int i;
3181
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003182 // Only need to restore saved values when they are not to be cleared.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003183 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
3184 if (!rex.need_clear_subexpr)
3185 {
3186 for (i = 0; i < NSUBEXP; ++i)
3187 {
3188 if (REG_MULTI)
3189 {
3190 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
3191 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
3192 }
3193 else
3194 {
3195 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
3196 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
3197 }
3198 }
3199 }
3200}
3201
3202/*
3203 * regmatch - main matching routine
3204 *
3205 * Conceptually the strategy is simple: Check to see whether the current node
3206 * matches, push an item onto the regstack and loop to see whether the rest
3207 * matches, and then act accordingly. In practice we make some effort to
3208 * avoid using the regstack, in particular by going through "ordinary" nodes
3209 * (that don't need to know whether the rest of the match failed) by a nested
3210 * loop.
3211 *
3212 * Returns TRUE when there is a match. Leaves rex.input and rex.lnum just after
3213 * the last matched character.
3214 * Returns FALSE when there is no match. Leaves rex.input and rex.lnum in an
3215 * undefined state!
3216 */
3217 static int
3218regmatch(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003219 char_u *scan, // Current node.
3220 proftime_T *tm UNUSED, // timeout limit or NULL
3221 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003222{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003223 char_u *next; // Next node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003224 int op;
3225 int c;
3226 regitem_T *rp;
3227 int no;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003228 int status; // one of the RA_ values:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003229#ifdef FEAT_RELTIME
3230 int tm_count = 0;
3231#endif
3232
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003233 // Make "regstack" and "backpos" empty. They are allocated and freed in
3234 // bt_regexec_both() to reduce malloc()/free() calls.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003235 regstack.ga_len = 0;
3236 backpos.ga_len = 0;
3237
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003238 // Repeat until "regstack" is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003239 for (;;)
3240 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003241 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3242 // Allow interrupting them with CTRL-C.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003243 fast_breakcheck();
3244
3245#ifdef DEBUG
3246 if (scan != NULL && regnarrate)
3247 {
3248 mch_errmsg((char *)regprop(scan));
3249 mch_errmsg("(\n");
3250 }
3251#endif
3252
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003253 // Repeat for items that can be matched sequentially, without using the
3254 // regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003255 for (;;)
3256 {
3257 if (got_int || scan == NULL)
3258 {
3259 status = RA_FAIL;
3260 break;
3261 }
3262#ifdef FEAT_RELTIME
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003263 // Check for timeout once in a 100 times to avoid overhead.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003264 if (tm != NULL && ++tm_count == 100)
3265 {
3266 tm_count = 0;
3267 if (profile_passed_limit(tm))
3268 {
3269 if (timed_out != NULL)
3270 *timed_out = TRUE;
3271 status = RA_FAIL;
3272 break;
3273 }
3274 }
3275#endif
3276 status = RA_CONT;
3277
3278#ifdef DEBUG
3279 if (regnarrate)
3280 {
3281 mch_errmsg((char *)regprop(scan));
3282 mch_errmsg("...\n");
3283# ifdef FEAT_SYN_HL
3284 if (re_extmatch_in != NULL)
3285 {
3286 int i;
3287
3288 mch_errmsg(_("External submatches:\n"));
3289 for (i = 0; i < NSUBEXP; i++)
3290 {
3291 mch_errmsg(" \"");
3292 if (re_extmatch_in->matches[i] != NULL)
3293 mch_errmsg((char *)re_extmatch_in->matches[i]);
3294 mch_errmsg("\"\n");
3295 }
3296 }
3297# endif
3298 }
3299#endif
3300 next = regnext(scan);
3301
3302 op = OP(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003303 // Check for character class with NL added.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003304 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
3305 && *rex.input == NUL && rex.lnum <= rex.reg_maxline)
3306 {
3307 reg_nextline();
3308 }
3309 else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n')
3310 {
3311 ADVANCE_REGINPUT();
3312 }
3313 else
3314 {
3315 if (WITH_NL(op))
3316 op -= ADD_NL;
3317 if (has_mbyte)
3318 c = (*mb_ptr2char)(rex.input);
3319 else
3320 c = *rex.input;
3321 switch (op)
3322 {
3323 case BOL:
3324 if (rex.input != rex.line)
3325 status = RA_NOMATCH;
3326 break;
3327
3328 case EOL:
3329 if (c != NUL)
3330 status = RA_NOMATCH;
3331 break;
3332
3333 case RE_BOF:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003334 // We're not at the beginning of the file when below the first
3335 // line where we started, not at the start of the line or we
3336 // didn't start at the first line of the buffer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003337 if (rex.lnum != 0 || rex.input != rex.line
3338 || (REG_MULTI && rex.reg_firstlnum > 1))
3339 status = RA_NOMATCH;
3340 break;
3341
3342 case RE_EOF:
3343 if (rex.lnum != rex.reg_maxline || c != NUL)
3344 status = RA_NOMATCH;
3345 break;
3346
3347 case CURSOR:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003348 // Check if the buffer is in a window and compare the
3349 // rex.reg_win->w_cursor position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003350 if (rex.reg_win == NULL
3351 || (rex.lnum + rex.reg_firstlnum
3352 != rex.reg_win->w_cursor.lnum)
3353 || ((colnr_T)(rex.input - rex.line)
3354 != rex.reg_win->w_cursor.col))
3355 status = RA_NOMATCH;
3356 break;
3357
3358 case RE_MARK:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003359 // Compare the mark position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003360 {
3361 int mark = OPERAND(scan)[0];
3362 int cmp = OPERAND(scan)[1];
3363 pos_T *pos;
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003364 size_t col = REG_MULTI ? rex.input - rex.line : 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003365
3366 pos = getmark_buf(rex.reg_buf, mark, FALSE);
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003367
3368 // Line may have been freed, get it again.
3369 if (REG_MULTI)
3370 {
3371 rex.line = reg_getline(rex.lnum);
3372 rex.input = rex.line + col;
3373 }
3374
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003375 if (pos == NULL // mark doesn't exist
Bram Moolenaar872bee52021-05-24 22:56:15 +02003376 || pos->lnum <= 0) // mark isn't set in reg_buf
3377 {
3378 status = RA_NOMATCH;
3379 }
3380 else
3381 {
3382 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3383 && pos->col == MAXCOL
3384 ? (colnr_T)STRLEN(reg_getline(
3385 pos->lnum - rex.reg_firstlnum))
3386 : pos->col;
3387
3388 if ((pos->lnum == rex.lnum + rex.reg_firstlnum
3389 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003390 ? (cmp == '<' || cmp == '>')
Bram Moolenaar872bee52021-05-24 22:56:15 +02003391 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003392 ? cmp != '>'
3393 : cmp != '<'))
3394 : (pos->lnum < rex.lnum + rex.reg_firstlnum
3395 ? cmp != '>'
3396 : cmp != '<')))
3397 status = RA_NOMATCH;
Bram Moolenaar872bee52021-05-24 22:56:15 +02003398 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003399 }
3400 break;
3401
3402 case RE_VISUAL:
3403 if (!reg_match_visual())
3404 status = RA_NOMATCH;
3405 break;
3406
3407 case RE_LNUM:
3408 if (!REG_MULTI || !re_num_cmp((long_u)(rex.lnum + rex.reg_firstlnum),
3409 scan))
3410 status = RA_NOMATCH;
3411 break;
3412
3413 case RE_COL:
3414 if (!re_num_cmp((long_u)(rex.input - rex.line) + 1, scan))
3415 status = RA_NOMATCH;
3416 break;
3417
3418 case RE_VCOL:
3419 if (!re_num_cmp((long_u)win_linetabsize(
3420 rex.reg_win == NULL ? curwin : rex.reg_win,
3421 rex.line, (colnr_T)(rex.input - rex.line)) + 1, scan))
3422 status = RA_NOMATCH;
3423 break;
3424
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003425 case BOW: // \<word; rex.input points to w
3426 if (c == NUL) // Can't match at end of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003427 status = RA_NOMATCH;
3428 else if (has_mbyte)
3429 {
3430 int this_class;
3431
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003432 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003433 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3434 if (this_class <= 1)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003435 status = RA_NOMATCH; // not on a word at all
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003436 else if (reg_prev_class() == this_class)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003437 status = RA_NOMATCH; // previous char is in same word
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003438 }
3439 else
3440 {
3441 if (!vim_iswordc_buf(c, rex.reg_buf) || (rex.input > rex.line
3442 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
3443 status = RA_NOMATCH;
3444 }
3445 break;
3446
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003447 case EOW: // word\>; rex.input points after d
3448 if (rex.input == rex.line) // Can't match at start of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003449 status = RA_NOMATCH;
3450 else if (has_mbyte)
3451 {
3452 int this_class, prev_class;
3453
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003454 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003455 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3456 prev_class = reg_prev_class();
3457 if (this_class == prev_class
3458 || prev_class == 0 || prev_class == 1)
3459 status = RA_NOMATCH;
3460 }
3461 else
3462 {
3463 if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
3464 || (rex.input[0] != NUL
3465 && vim_iswordc_buf(c, rex.reg_buf)))
3466 status = RA_NOMATCH;
3467 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003468 break; // Matched with EOW
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003469
3470 case ANY:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003471 // ANY does not match new lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003472 if (c == NUL)
3473 status = RA_NOMATCH;
3474 else
3475 ADVANCE_REGINPUT();
3476 break;
3477
3478 case IDENT:
3479 if (!vim_isIDc(c))
3480 status = RA_NOMATCH;
3481 else
3482 ADVANCE_REGINPUT();
3483 break;
3484
3485 case SIDENT:
3486 if (VIM_ISDIGIT(*rex.input) || !vim_isIDc(c))
3487 status = RA_NOMATCH;
3488 else
3489 ADVANCE_REGINPUT();
3490 break;
3491
3492 case KWORD:
3493 if (!vim_iswordp_buf(rex.input, rex.reg_buf))
3494 status = RA_NOMATCH;
3495 else
3496 ADVANCE_REGINPUT();
3497 break;
3498
3499 case SKWORD:
3500 if (VIM_ISDIGIT(*rex.input)
3501 || !vim_iswordp_buf(rex.input, rex.reg_buf))
3502 status = RA_NOMATCH;
3503 else
3504 ADVANCE_REGINPUT();
3505 break;
3506
3507 case FNAME:
3508 if (!vim_isfilec(c))
3509 status = RA_NOMATCH;
3510 else
3511 ADVANCE_REGINPUT();
3512 break;
3513
3514 case SFNAME:
3515 if (VIM_ISDIGIT(*rex.input) || !vim_isfilec(c))
3516 status = RA_NOMATCH;
3517 else
3518 ADVANCE_REGINPUT();
3519 break;
3520
3521 case PRINT:
3522 if (!vim_isprintc(PTR2CHAR(rex.input)))
3523 status = RA_NOMATCH;
3524 else
3525 ADVANCE_REGINPUT();
3526 break;
3527
3528 case SPRINT:
3529 if (VIM_ISDIGIT(*rex.input) || !vim_isprintc(PTR2CHAR(rex.input)))
3530 status = RA_NOMATCH;
3531 else
3532 ADVANCE_REGINPUT();
3533 break;
3534
3535 case WHITE:
3536 if (!VIM_ISWHITE(c))
3537 status = RA_NOMATCH;
3538 else
3539 ADVANCE_REGINPUT();
3540 break;
3541
3542 case NWHITE:
3543 if (c == NUL || VIM_ISWHITE(c))
3544 status = RA_NOMATCH;
3545 else
3546 ADVANCE_REGINPUT();
3547 break;
3548
3549 case DIGIT:
3550 if (!ri_digit(c))
3551 status = RA_NOMATCH;
3552 else
3553 ADVANCE_REGINPUT();
3554 break;
3555
3556 case NDIGIT:
3557 if (c == NUL || ri_digit(c))
3558 status = RA_NOMATCH;
3559 else
3560 ADVANCE_REGINPUT();
3561 break;
3562
3563 case HEX:
3564 if (!ri_hex(c))
3565 status = RA_NOMATCH;
3566 else
3567 ADVANCE_REGINPUT();
3568 break;
3569
3570 case NHEX:
3571 if (c == NUL || ri_hex(c))
3572 status = RA_NOMATCH;
3573 else
3574 ADVANCE_REGINPUT();
3575 break;
3576
3577 case OCTAL:
3578 if (!ri_octal(c))
3579 status = RA_NOMATCH;
3580 else
3581 ADVANCE_REGINPUT();
3582 break;
3583
3584 case NOCTAL:
3585 if (c == NUL || ri_octal(c))
3586 status = RA_NOMATCH;
3587 else
3588 ADVANCE_REGINPUT();
3589 break;
3590
3591 case WORD:
3592 if (!ri_word(c))
3593 status = RA_NOMATCH;
3594 else
3595 ADVANCE_REGINPUT();
3596 break;
3597
3598 case NWORD:
3599 if (c == NUL || ri_word(c))
3600 status = RA_NOMATCH;
3601 else
3602 ADVANCE_REGINPUT();
3603 break;
3604
3605 case HEAD:
3606 if (!ri_head(c))
3607 status = RA_NOMATCH;
3608 else
3609 ADVANCE_REGINPUT();
3610 break;
3611
3612 case NHEAD:
3613 if (c == NUL || ri_head(c))
3614 status = RA_NOMATCH;
3615 else
3616 ADVANCE_REGINPUT();
3617 break;
3618
3619 case ALPHA:
3620 if (!ri_alpha(c))
3621 status = RA_NOMATCH;
3622 else
3623 ADVANCE_REGINPUT();
3624 break;
3625
3626 case NALPHA:
3627 if (c == NUL || ri_alpha(c))
3628 status = RA_NOMATCH;
3629 else
3630 ADVANCE_REGINPUT();
3631 break;
3632
3633 case LOWER:
3634 if (!ri_lower(c))
3635 status = RA_NOMATCH;
3636 else
3637 ADVANCE_REGINPUT();
3638 break;
3639
3640 case NLOWER:
3641 if (c == NUL || ri_lower(c))
3642 status = RA_NOMATCH;
3643 else
3644 ADVANCE_REGINPUT();
3645 break;
3646
3647 case UPPER:
3648 if (!ri_upper(c))
3649 status = RA_NOMATCH;
3650 else
3651 ADVANCE_REGINPUT();
3652 break;
3653
3654 case NUPPER:
3655 if (c == NUL || ri_upper(c))
3656 status = RA_NOMATCH;
3657 else
3658 ADVANCE_REGINPUT();
3659 break;
3660
3661 case EXACTLY:
3662 {
3663 int len;
3664 char_u *opnd;
3665
3666 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003667 // Inline the first byte, for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003668 if (*opnd != *rex.input
3669 && (!rex.reg_ic
3670 || (!enc_utf8
3671 && MB_TOLOWER(*opnd) != MB_TOLOWER(*rex.input))))
3672 status = RA_NOMATCH;
3673 else if (*opnd == NUL)
3674 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003675 // match empty string always works; happens when "~" is
3676 // empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003677 }
3678 else
3679 {
3680 if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic))
3681 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003682 len = 1; // matched a single byte above
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003683 }
3684 else
3685 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003686 // Need to match first byte again for multi-byte.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003687 len = (int)STRLEN(opnd);
3688 if (cstrncmp(opnd, rex.input, &len) != 0)
3689 status = RA_NOMATCH;
3690 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003691 // Check for following composing character, unless %C
3692 // follows (skips over all composing chars).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003693 if (status != RA_NOMATCH
3694 && enc_utf8
3695 && UTF_COMPOSINGLIKE(rex.input, rex.input + len)
3696 && !rex.reg_icombine
3697 && OP(next) != RE_COMPOSING)
3698 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003699 // raaron: This code makes a composing character get
3700 // ignored, which is the correct behavior (sometimes)
3701 // for voweled Hebrew texts.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003702 status = RA_NOMATCH;
3703 }
3704 if (status != RA_NOMATCH)
3705 rex.input += len;
3706 }
3707 }
3708 break;
3709
3710 case ANYOF:
3711 case ANYBUT:
3712 if (c == NUL)
3713 status = RA_NOMATCH;
3714 else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3715 status = RA_NOMATCH;
3716 else
3717 ADVANCE_REGINPUT();
3718 break;
3719
3720 case MULTIBYTECODE:
3721 if (has_mbyte)
3722 {
3723 int i, len;
3724 char_u *opnd;
3725 int opndc = 0, inpc;
3726
3727 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003728 // Safety check (just in case 'encoding' was changed since
3729 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003730 if ((len = (*mb_ptr2len)(opnd)) < 2)
3731 {
3732 status = RA_NOMATCH;
3733 break;
3734 }
3735 if (enc_utf8)
3736 opndc = utf_ptr2char(opnd);
3737 if (enc_utf8 && utf_iscomposing(opndc))
3738 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003739 // When only a composing char is given match at any
3740 // position where that composing char appears.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003741 status = RA_NOMATCH;
3742 for (i = 0; rex.input[i] != NUL;
3743 i += utf_ptr2len(rex.input + i))
3744 {
3745 inpc = utf_ptr2char(rex.input + i);
3746 if (!utf_iscomposing(inpc))
3747 {
3748 if (i > 0)
3749 break;
3750 }
3751 else if (opndc == inpc)
3752 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003753 // Include all following composing chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003754 len = i + utfc_ptr2len(rex.input + i);
3755 status = RA_MATCH;
3756 break;
3757 }
3758 }
3759 }
3760 else
3761 for (i = 0; i < len; ++i)
3762 if (opnd[i] != rex.input[i])
3763 {
3764 status = RA_NOMATCH;
3765 break;
3766 }
3767 rex.input += len;
3768 }
3769 else
3770 status = RA_NOMATCH;
3771 break;
3772 case RE_COMPOSING:
3773 if (enc_utf8)
3774 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003775 // Skip composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003776 while (utf_iscomposing(utf_ptr2char(rex.input)))
3777 MB_CPTR_ADV(rex.input);
3778 }
3779 break;
3780
3781 case NOTHING:
3782 break;
3783
3784 case BACK:
3785 {
3786 int i;
3787 backpos_T *bp;
3788
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003789 // When we run into BACK we need to check if we don't keep
3790 // looping without matching any input. The second and later
3791 // times a BACK is encountered it fails if the input is still
3792 // at the same position as the previous time.
3793 // The positions are stored in "backpos" and found by the
3794 // current value of "scan", the position in the RE program.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003795 bp = (backpos_T *)backpos.ga_data;
3796 for (i = 0; i < backpos.ga_len; ++i)
3797 if (bp[i].bp_scan == scan)
3798 break;
3799 if (i == backpos.ga_len)
3800 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003801 // First time at this BACK, make room to store the pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003802 if (ga_grow(&backpos, 1) == FAIL)
3803 status = RA_FAIL;
3804 else
3805 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003806 // get "ga_data" again, it may have changed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003807 bp = (backpos_T *)backpos.ga_data;
3808 bp[i].bp_scan = scan;
3809 ++backpos.ga_len;
3810 }
3811 }
3812 else if (reg_save_equal(&bp[i].bp_pos))
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003813 // Still at same position as last time, fail.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003814 status = RA_NOMATCH;
3815
3816 if (status != RA_FAIL && status != RA_NOMATCH)
3817 reg_save(&bp[i].bp_pos, &backpos);
3818 }
3819 break;
3820
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003821 case MOPEN + 0: // Match start: \zs
3822 case MOPEN + 1: // \(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003823 case MOPEN + 2:
3824 case MOPEN + 3:
3825 case MOPEN + 4:
3826 case MOPEN + 5:
3827 case MOPEN + 6:
3828 case MOPEN + 7:
3829 case MOPEN + 8:
3830 case MOPEN + 9:
3831 {
3832 no = op - MOPEN;
3833 cleanup_subexpr();
3834 rp = regstack_push(RS_MOPEN, scan);
3835 if (rp == NULL)
3836 status = RA_FAIL;
3837 else
3838 {
3839 rp->rs_no = no;
3840 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
3841 &rex.reg_startp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003842 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003843 }
3844 }
3845 break;
3846
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003847 case NOPEN: // \%(
3848 case NCLOSE: // \) after \%(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003849 if (regstack_push(RS_NOPEN, scan) == NULL)
3850 status = RA_FAIL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003851 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003852 break;
3853
3854#ifdef FEAT_SYN_HL
3855 case ZOPEN + 1:
3856 case ZOPEN + 2:
3857 case ZOPEN + 3:
3858 case ZOPEN + 4:
3859 case ZOPEN + 5:
3860 case ZOPEN + 6:
3861 case ZOPEN + 7:
3862 case ZOPEN + 8:
3863 case ZOPEN + 9:
3864 {
3865 no = op - ZOPEN;
3866 cleanup_zsubexpr();
3867 rp = regstack_push(RS_ZOPEN, scan);
3868 if (rp == NULL)
3869 status = RA_FAIL;
3870 else
3871 {
3872 rp->rs_no = no;
3873 save_se(&rp->rs_un.sesave, &reg_startzpos[no],
3874 &reg_startzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003875 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003876 }
3877 }
3878 break;
3879#endif
3880
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003881 case MCLOSE + 0: // Match end: \ze
3882 case MCLOSE + 1: // \)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003883 case MCLOSE + 2:
3884 case MCLOSE + 3:
3885 case MCLOSE + 4:
3886 case MCLOSE + 5:
3887 case MCLOSE + 6:
3888 case MCLOSE + 7:
3889 case MCLOSE + 8:
3890 case MCLOSE + 9:
3891 {
3892 no = op - MCLOSE;
3893 cleanup_subexpr();
3894 rp = regstack_push(RS_MCLOSE, scan);
3895 if (rp == NULL)
3896 status = RA_FAIL;
3897 else
3898 {
3899 rp->rs_no = no;
3900 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no],
3901 &rex.reg_endp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003902 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003903 }
3904 }
3905 break;
3906
3907#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003908 case ZCLOSE + 1: // \) after \z(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003909 case ZCLOSE + 2:
3910 case ZCLOSE + 3:
3911 case ZCLOSE + 4:
3912 case ZCLOSE + 5:
3913 case ZCLOSE + 6:
3914 case ZCLOSE + 7:
3915 case ZCLOSE + 8:
3916 case ZCLOSE + 9:
3917 {
3918 no = op - ZCLOSE;
3919 cleanup_zsubexpr();
3920 rp = regstack_push(RS_ZCLOSE, scan);
3921 if (rp == NULL)
3922 status = RA_FAIL;
3923 else
3924 {
3925 rp->rs_no = no;
3926 save_se(&rp->rs_un.sesave, &reg_endzpos[no],
3927 &reg_endzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003928 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003929 }
3930 }
3931 break;
3932#endif
3933
3934 case BACKREF + 1:
3935 case BACKREF + 2:
3936 case BACKREF + 3:
3937 case BACKREF + 4:
3938 case BACKREF + 5:
3939 case BACKREF + 6:
3940 case BACKREF + 7:
3941 case BACKREF + 8:
3942 case BACKREF + 9:
3943 {
3944 int len;
3945
3946 no = op - BACKREF;
3947 cleanup_subexpr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003948 if (!REG_MULTI) // Single-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003949 {
3950 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL)
3951 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003952 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003953 len = 0;
3954 }
3955 else
3956 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003957 // Compare current input with back-ref in the same
3958 // line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003959 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
3960 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0)
3961 status = RA_NOMATCH;
3962 }
3963 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003964 else // Multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003965 {
3966 if (rex.reg_startpos[no].lnum < 0
3967 || rex.reg_endpos[no].lnum < 0)
3968 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003969 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003970 len = 0;
3971 }
3972 else
3973 {
3974 if (rex.reg_startpos[no].lnum == rex.lnum
3975 && rex.reg_endpos[no].lnum == rex.lnum)
3976 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003977 // Compare back-ref within the current line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003978 len = rex.reg_endpos[no].col
3979 - rex.reg_startpos[no].col;
3980 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
3981 rex.input, &len) != 0)
3982 status = RA_NOMATCH;
3983 }
3984 else
3985 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003986 // Messy situation: Need to compare between two
3987 // lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003988 int r = match_with_backref(
3989 rex.reg_startpos[no].lnum,
3990 rex.reg_startpos[no].col,
3991 rex.reg_endpos[no].lnum,
3992 rex.reg_endpos[no].col,
3993 &len);
3994
3995 if (r != RA_MATCH)
3996 status = r;
3997 }
3998 }
3999 }
4000
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004001 // Matched the backref, skip over it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004002 rex.input += len;
4003 }
4004 break;
4005
4006#ifdef FEAT_SYN_HL
4007 case ZREF + 1:
4008 case ZREF + 2:
4009 case ZREF + 3:
4010 case ZREF + 4:
4011 case ZREF + 5:
4012 case ZREF + 6:
4013 case ZREF + 7:
4014 case ZREF + 8:
4015 case ZREF + 9:
4016 {
4017 int len;
4018
4019 cleanup_zsubexpr();
4020 no = op - ZREF;
4021 if (re_extmatch_in != NULL
4022 && re_extmatch_in->matches[no] != NULL)
4023 {
4024 len = (int)STRLEN(re_extmatch_in->matches[no]);
4025 if (cstrncmp(re_extmatch_in->matches[no],
4026 rex.input, &len) != 0)
4027 status = RA_NOMATCH;
4028 else
4029 rex.input += len;
4030 }
4031 else
4032 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004033 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004034 }
4035 }
4036 break;
4037#endif
4038
4039 case BRANCH:
4040 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004041 if (OP(next) != BRANCH) // No choice.
4042 next = OPERAND(scan); // Avoid recursion.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004043 else
4044 {
4045 rp = regstack_push(RS_BRANCH, scan);
4046 if (rp == NULL)
4047 status = RA_FAIL;
4048 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004049 status = RA_BREAK; // rest is below
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004050 }
4051 }
4052 break;
4053
4054 case BRACE_LIMITS:
4055 {
4056 if (OP(next) == BRACE_SIMPLE)
4057 {
4058 bl_minval = OPERAND_MIN(scan);
4059 bl_maxval = OPERAND_MAX(scan);
4060 }
4061 else if (OP(next) >= BRACE_COMPLEX
4062 && OP(next) < BRACE_COMPLEX + 10)
4063 {
4064 no = OP(next) - BRACE_COMPLEX;
4065 brace_min[no] = OPERAND_MIN(scan);
4066 brace_max[no] = OPERAND_MAX(scan);
4067 brace_count[no] = 0;
4068 }
4069 else
4070 {
4071 internal_error("BRACE_LIMITS");
4072 status = RA_FAIL;
4073 }
4074 }
4075 break;
4076
4077 case BRACE_COMPLEX + 0:
4078 case BRACE_COMPLEX + 1:
4079 case BRACE_COMPLEX + 2:
4080 case BRACE_COMPLEX + 3:
4081 case BRACE_COMPLEX + 4:
4082 case BRACE_COMPLEX + 5:
4083 case BRACE_COMPLEX + 6:
4084 case BRACE_COMPLEX + 7:
4085 case BRACE_COMPLEX + 8:
4086 case BRACE_COMPLEX + 9:
4087 {
4088 no = op - BRACE_COMPLEX;
4089 ++brace_count[no];
4090
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004091 // If not matched enough times yet, try one more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004092 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4093 ? brace_min[no] : brace_max[no]))
4094 {
4095 rp = regstack_push(RS_BRCPLX_MORE, scan);
4096 if (rp == NULL)
4097 status = RA_FAIL;
4098 else
4099 {
4100 rp->rs_no = no;
4101 reg_save(&rp->rs_un.regsave, &backpos);
4102 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004103 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004104 }
4105 break;
4106 }
4107
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004108 // If matched enough times, may try matching some more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004109 if (brace_min[no] <= brace_max[no])
4110 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004111 // Range is the normal way around, use longest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004112 if (brace_count[no] <= brace_max[no])
4113 {
4114 rp = regstack_push(RS_BRCPLX_LONG, scan);
4115 if (rp == NULL)
4116 status = RA_FAIL;
4117 else
4118 {
4119 rp->rs_no = no;
4120 reg_save(&rp->rs_un.regsave, &backpos);
4121 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004122 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004123 }
4124 }
4125 }
4126 else
4127 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004128 // Range is backwards, use shortest match first
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004129 if (brace_count[no] <= brace_min[no])
4130 {
4131 rp = regstack_push(RS_BRCPLX_SHORT, scan);
4132 if (rp == NULL)
4133 status = RA_FAIL;
4134 else
4135 {
4136 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004137 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004138 }
4139 }
4140 }
4141 }
4142 break;
4143
4144 case BRACE_SIMPLE:
4145 case STAR:
4146 case PLUS:
4147 {
4148 regstar_T rst;
4149
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004150 // Lookahead to avoid useless match attempts when we know
4151 // what character comes next.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004152 if (OP(next) == EXACTLY)
4153 {
4154 rst.nextb = *OPERAND(next);
4155 if (rex.reg_ic)
4156 {
4157 if (MB_ISUPPER(rst.nextb))
4158 rst.nextb_ic = MB_TOLOWER(rst.nextb);
4159 else
4160 rst.nextb_ic = MB_TOUPPER(rst.nextb);
4161 }
4162 else
4163 rst.nextb_ic = rst.nextb;
4164 }
4165 else
4166 {
4167 rst.nextb = NUL;
4168 rst.nextb_ic = NUL;
4169 }
4170 if (op != BRACE_SIMPLE)
4171 {
4172 rst.minval = (op == STAR) ? 0 : 1;
4173 rst.maxval = MAX_LIMIT;
4174 }
4175 else
4176 {
4177 rst.minval = bl_minval;
4178 rst.maxval = bl_maxval;
4179 }
4180
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004181 // When maxval > minval, try matching as much as possible, up
4182 // to maxval. When maxval < minval, try matching at least the
4183 // minimal number (since the range is backwards, that's also
4184 // maxval!).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004185 rst.count = regrepeat(OPERAND(scan), rst.maxval);
4186 if (got_int)
4187 {
4188 status = RA_FAIL;
4189 break;
4190 }
4191 if (rst.minval <= rst.maxval
4192 ? rst.count >= rst.minval : rst.count >= rst.maxval)
4193 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004194 // It could match. Prepare for trying to match what
4195 // follows. The code is below. Parameters are stored in
4196 // a regstar_T on the regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004197 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4198 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004199 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004200 status = RA_FAIL;
4201 }
4202 else if (ga_grow(&regstack, sizeof(regstar_T)) == FAIL)
4203 status = RA_FAIL;
4204 else
4205 {
4206 regstack.ga_len += sizeof(regstar_T);
4207 rp = regstack_push(rst.minval <= rst.maxval
4208 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4209 if (rp == NULL)
4210 status = RA_FAIL;
4211 else
4212 {
4213 *(((regstar_T *)rp) - 1) = rst;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004214 status = RA_BREAK; // skip the restore bits
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004215 }
4216 }
4217 }
4218 else
4219 status = RA_NOMATCH;
4220
4221 }
4222 break;
4223
4224 case NOMATCH:
4225 case MATCH:
4226 case SUBPAT:
4227 rp = regstack_push(RS_NOMATCH, scan);
4228 if (rp == NULL)
4229 status = RA_FAIL;
4230 else
4231 {
4232 rp->rs_no = op;
4233 reg_save(&rp->rs_un.regsave, &backpos);
4234 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004235 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004236 }
4237 break;
4238
4239 case BEHIND:
4240 case NOBEHIND:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004241 // Need a bit of room to store extra positions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004242 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4243 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004244 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004245 status = RA_FAIL;
4246 }
4247 else if (ga_grow(&regstack, sizeof(regbehind_T)) == FAIL)
4248 status = RA_FAIL;
4249 else
4250 {
4251 regstack.ga_len += sizeof(regbehind_T);
4252 rp = regstack_push(RS_BEHIND1, scan);
4253 if (rp == NULL)
4254 status = RA_FAIL;
4255 else
4256 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004257 // Need to save the subexpr to be able to restore them
4258 // when there is a match but we don't use it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004259 save_subexpr(((regbehind_T *)rp) - 1);
4260
4261 rp->rs_no = op;
4262 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004263 // First try if what follows matches. If it does then we
4264 // check the behind match by looping.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004265 }
4266 }
4267 break;
4268
4269 case BHPOS:
4270 if (REG_MULTI)
4271 {
4272 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4273 || behind_pos.rs_u.pos.lnum != rex.lnum)
4274 status = RA_NOMATCH;
4275 }
4276 else if (behind_pos.rs_u.ptr != rex.input)
4277 status = RA_NOMATCH;
4278 break;
4279
4280 case NEWL:
4281 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4282 || rex.reg_line_lbr)
4283 && (c != '\n' || !rex.reg_line_lbr))
4284 status = RA_NOMATCH;
4285 else if (rex.reg_line_lbr)
4286 ADVANCE_REGINPUT();
4287 else
4288 reg_nextline();
4289 break;
4290
4291 case END:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004292 status = RA_MATCH; // Success!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004293 break;
4294
4295 default:
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004296 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004297#ifdef DEBUG
4298 printf("Illegal op code %d\n", op);
4299#endif
4300 status = RA_FAIL;
4301 break;
4302 }
4303 }
4304
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004305 // If we can't continue sequentially, break the inner loop.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004306 if (status != RA_CONT)
4307 break;
4308
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004309 // Continue in inner loop, advance to next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004310 scan = next;
4311
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004312 } // end of inner loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004313
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004314 // If there is something on the regstack execute the code for the state.
4315 // If the state is popped then loop and use the older state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004316 while (regstack.ga_len > 0 && status != RA_FAIL)
4317 {
4318 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4319 switch (rp->rs_state)
4320 {
4321 case RS_NOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004322 // Result is passed on as-is, simply pop the state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004323 regstack_pop(&scan);
4324 break;
4325
4326 case RS_MOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004327 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004328 if (status == RA_NOMATCH)
4329 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4330 &rex.reg_startp[rp->rs_no]);
4331 regstack_pop(&scan);
4332 break;
4333
4334#ifdef FEAT_SYN_HL
4335 case RS_ZOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004336 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004337 if (status == RA_NOMATCH)
4338 restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4339 &reg_startzp[rp->rs_no]);
4340 regstack_pop(&scan);
4341 break;
4342#endif
4343
4344 case RS_MCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004345 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004346 if (status == RA_NOMATCH)
4347 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4348 &rex.reg_endp[rp->rs_no]);
4349 regstack_pop(&scan);
4350 break;
4351
4352#ifdef FEAT_SYN_HL
4353 case RS_ZCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004354 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004355 if (status == RA_NOMATCH)
4356 restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4357 &reg_endzp[rp->rs_no]);
4358 regstack_pop(&scan);
4359 break;
4360#endif
4361
4362 case RS_BRANCH:
4363 if (status == RA_MATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004364 // this branch matched, use it
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004365 regstack_pop(&scan);
4366 else
4367 {
4368 if (status != RA_BREAK)
4369 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004370 // After a non-matching branch: try next one.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004371 reg_restore(&rp->rs_un.regsave, &backpos);
4372 scan = rp->rs_scan;
4373 }
4374 if (scan == NULL || OP(scan) != BRANCH)
4375 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004376 // no more branches, didn't find a match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004377 status = RA_NOMATCH;
4378 regstack_pop(&scan);
4379 }
4380 else
4381 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004382 // Prepare to try a branch.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004383 rp->rs_scan = regnext(scan);
4384 reg_save(&rp->rs_un.regsave, &backpos);
4385 scan = OPERAND(scan);
4386 }
4387 }
4388 break;
4389
4390 case RS_BRCPLX_MORE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004391 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004392 if (status == RA_NOMATCH)
4393 {
4394 reg_restore(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004395 --brace_count[rp->rs_no]; // decrement match count
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004396 }
4397 regstack_pop(&scan);
4398 break;
4399
4400 case RS_BRCPLX_LONG:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004401 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004402 if (status == RA_NOMATCH)
4403 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004404 // There was no match, but we did find enough matches.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004405 reg_restore(&rp->rs_un.regsave, &backpos);
4406 --brace_count[rp->rs_no];
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004407 // continue with the items after "\{}"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004408 status = RA_CONT;
4409 }
4410 regstack_pop(&scan);
4411 if (status == RA_CONT)
4412 scan = regnext(scan);
4413 break;
4414
4415 case RS_BRCPLX_SHORT:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004416 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004417 if (status == RA_NOMATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004418 // There was no match, try to match one more item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004419 reg_restore(&rp->rs_un.regsave, &backpos);
4420 regstack_pop(&scan);
4421 if (status == RA_NOMATCH)
4422 {
4423 scan = OPERAND(scan);
4424 status = RA_CONT;
4425 }
4426 break;
4427
4428 case RS_NOMATCH:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004429 // Pop the state. If the operand matches for NOMATCH or
4430 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4431 // except for SUBPAT, and continue with the next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004432 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4433 status = RA_NOMATCH;
4434 else
4435 {
4436 status = RA_CONT;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004437 if (rp->rs_no != SUBPAT) // zero-width
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004438 reg_restore(&rp->rs_un.regsave, &backpos);
4439 }
4440 regstack_pop(&scan);
4441 if (status == RA_CONT)
4442 scan = regnext(scan);
4443 break;
4444
4445 case RS_BEHIND1:
4446 if (status == RA_NOMATCH)
4447 {
4448 regstack_pop(&scan);
4449 regstack.ga_len -= sizeof(regbehind_T);
4450 }
4451 else
4452 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004453 // The stuff after BEHIND/NOBEHIND matches. Now try if
4454 // the behind part does (not) match before the current
4455 // position in the input. This must be done at every
4456 // position in the input and checking if the match ends at
4457 // the current position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004458
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004459 // save the position after the found match for next
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004460 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4461
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004462 // Start looking for a match with operand at the current
4463 // position. Go back one character until we find the
4464 // result, hitting the start of the line or the previous
4465 // line (for multi-line matching).
4466 // Set behind_pos to where the match should end, BHPOS
4467 // will match it. Save the current value.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004468 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4469 behind_pos = rp->rs_un.regsave;
4470
4471 rp->rs_state = RS_BEHIND2;
4472
4473 reg_restore(&rp->rs_un.regsave, &backpos);
4474 scan = OPERAND(rp->rs_scan) + 4;
4475 }
4476 break;
4477
4478 case RS_BEHIND2:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004479 // Looping for BEHIND / NOBEHIND match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004480 if (status == RA_MATCH && reg_save_equal(&behind_pos))
4481 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004482 // found a match that ends where "next" started
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004483 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4484 if (rp->rs_no == BEHIND)
4485 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4486 &backpos);
4487 else
4488 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004489 // But we didn't want a match. Need to restore the
4490 // subexpr, because what follows matched, so they have
4491 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004492 status = RA_NOMATCH;
4493 restore_subexpr(((regbehind_T *)rp) - 1);
4494 }
4495 regstack_pop(&scan);
4496 regstack.ga_len -= sizeof(regbehind_T);
4497 }
4498 else
4499 {
4500 long limit;
4501
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004502 // No match or a match that doesn't end where we want it: Go
4503 // back one character. May go to previous line once.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004504 no = OK;
4505 limit = OPERAND_MIN(rp->rs_scan);
4506 if (REG_MULTI)
4507 {
4508 if (limit > 0
4509 && ((rp->rs_un.regsave.rs_u.pos.lnum
4510 < behind_pos.rs_u.pos.lnum
4511 ? (colnr_T)STRLEN(rex.line)
4512 : behind_pos.rs_u.pos.col)
4513 - rp->rs_un.regsave.rs_u.pos.col >= limit))
4514 no = FAIL;
4515 else if (rp->rs_un.regsave.rs_u.pos.col == 0)
4516 {
4517 if (rp->rs_un.regsave.rs_u.pos.lnum
4518 < behind_pos.rs_u.pos.lnum
4519 || reg_getline(
4520 --rp->rs_un.regsave.rs_u.pos.lnum)
4521 == NULL)
4522 no = FAIL;
4523 else
4524 {
4525 reg_restore(&rp->rs_un.regsave, &backpos);
4526 rp->rs_un.regsave.rs_u.pos.col =
4527 (colnr_T)STRLEN(rex.line);
4528 }
4529 }
4530 else
4531 {
4532 if (has_mbyte)
4533 {
4534 char_u *line =
4535 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
4536
4537 rp->rs_un.regsave.rs_u.pos.col -=
4538 (*mb_head_off)(line, line
4539 + rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
4540 }
4541 else
4542 --rp->rs_un.regsave.rs_u.pos.col;
4543 }
4544 }
4545 else
4546 {
4547 if (rp->rs_un.regsave.rs_u.ptr == rex.line)
4548 no = FAIL;
4549 else
4550 {
4551 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
4552 if (limit > 0 && (long)(behind_pos.rs_u.ptr
4553 - rp->rs_un.regsave.rs_u.ptr) > limit)
4554 no = FAIL;
4555 }
4556 }
4557 if (no == OK)
4558 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004559 // Advanced, prepare for finding match again.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004560 reg_restore(&rp->rs_un.regsave, &backpos);
4561 scan = OPERAND(rp->rs_scan) + 4;
4562 if (status == RA_MATCH)
4563 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004564 // We did match, so subexpr may have been changed,
4565 // need to restore them for the next try.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004566 status = RA_NOMATCH;
4567 restore_subexpr(((regbehind_T *)rp) - 1);
4568 }
4569 }
4570 else
4571 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004572 // Can't advance. For NOBEHIND that's a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004573 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4574 if (rp->rs_no == NOBEHIND)
4575 {
4576 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4577 &backpos);
4578 status = RA_MATCH;
4579 }
4580 else
4581 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004582 // We do want a proper match. Need to restore the
4583 // subexpr if we had a match, because they may have
4584 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004585 if (status == RA_MATCH)
4586 {
4587 status = RA_NOMATCH;
4588 restore_subexpr(((regbehind_T *)rp) - 1);
4589 }
4590 }
4591 regstack_pop(&scan);
4592 regstack.ga_len -= sizeof(regbehind_T);
4593 }
4594 }
4595 break;
4596
4597 case RS_STAR_LONG:
4598 case RS_STAR_SHORT:
4599 {
4600 regstar_T *rst = ((regstar_T *)rp) - 1;
4601
4602 if (status == RA_MATCH)
4603 {
4604 regstack_pop(&scan);
4605 regstack.ga_len -= sizeof(regstar_T);
4606 break;
4607 }
4608
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004609 // Tried once already, restore input pointers.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004610 if (status != RA_BREAK)
4611 reg_restore(&rp->rs_un.regsave, &backpos);
4612
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004613 // Repeat until we found a position where it could match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004614 for (;;)
4615 {
4616 if (status != RA_BREAK)
4617 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004618 // Tried first position already, advance.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004619 if (rp->rs_state == RS_STAR_LONG)
4620 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004621 // Trying for longest match, but couldn't or
4622 // didn't match -- back up one char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004623 if (--rst->count < rst->minval)
4624 break;
4625 if (rex.input == rex.line)
4626 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004627 // backup to last char of previous line
Bram Moolenaar6456fae2022-02-22 13:37:31 +00004628 if (rex.lnum == 0)
4629 {
4630 status = RA_NOMATCH;
4631 break;
4632 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004633 --rex.lnum;
4634 rex.line = reg_getline(rex.lnum);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004635 // Just in case regrepeat() didn't count
4636 // right.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004637 if (rex.line == NULL)
4638 break;
4639 rex.input = rex.line + STRLEN(rex.line);
4640 fast_breakcheck();
4641 }
4642 else
4643 MB_PTR_BACK(rex.line, rex.input);
4644 }
4645 else
4646 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004647 // Range is backwards, use shortest match first.
4648 // Careful: maxval and minval are exchanged!
4649 // Couldn't or didn't match: try advancing one
4650 // char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004651 if (rst->count == rst->minval
4652 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
4653 break;
4654 ++rst->count;
4655 }
4656 if (got_int)
4657 break;
4658 }
4659 else
4660 status = RA_NOMATCH;
4661
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004662 // If it could match, try it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004663 if (rst->nextb == NUL || *rex.input == rst->nextb
4664 || *rex.input == rst->nextb_ic)
4665 {
4666 reg_save(&rp->rs_un.regsave, &backpos);
4667 scan = regnext(rp->rs_scan);
4668 status = RA_CONT;
4669 break;
4670 }
4671 }
4672 if (status != RA_CONT)
4673 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004674 // Failed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004675 regstack_pop(&scan);
4676 regstack.ga_len -= sizeof(regstar_T);
4677 status = RA_NOMATCH;
4678 }
4679 }
4680 break;
4681 }
4682
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004683 // If we want to continue the inner loop or didn't pop a state
4684 // continue matching loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004685 if (status == RA_CONT || rp == (regitem_T *)
4686 ((char *)regstack.ga_data + regstack.ga_len) - 1)
4687 break;
4688 }
4689
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004690 // May need to continue with the inner loop, starting at "scan".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004691 if (status == RA_CONT)
4692 continue;
4693
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004694 // If the regstack is empty or something failed we are done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004695 if (regstack.ga_len == 0 || status == RA_FAIL)
4696 {
4697 if (scan == NULL)
4698 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004699 // We get here only if there's trouble -- normally "case END" is
4700 // the terminating point.
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004701 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004702#ifdef DEBUG
4703 printf("Premature EOL\n");
4704#endif
4705 }
4706 return (status == RA_MATCH);
4707 }
4708
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004709 } // End of loop until the regstack is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004710
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004711 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004712}
4713
4714/*
4715 * regtry - try match of "prog" with at rex.line["col"].
4716 * Returns 0 for failure, number of lines contained in the match otherwise.
4717 */
4718 static long
4719regtry(
4720 bt_regprog_T *prog,
4721 colnr_T col,
4722 proftime_T *tm, // timeout limit or NULL
4723 int *timed_out) // flag set on timeout or NULL
4724{
4725 rex.input = rex.line + col;
4726 rex.need_clear_subexpr = TRUE;
4727#ifdef FEAT_SYN_HL
4728 // Clear the external match subpointers if necessary.
4729 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
4730#endif
4731
4732 if (regmatch(prog->program + 1, tm, timed_out) == 0)
4733 return 0;
4734
4735 cleanup_subexpr();
4736 if (REG_MULTI)
4737 {
4738 if (rex.reg_startpos[0].lnum < 0)
4739 {
4740 rex.reg_startpos[0].lnum = 0;
4741 rex.reg_startpos[0].col = col;
4742 }
4743 if (rex.reg_endpos[0].lnum < 0)
4744 {
4745 rex.reg_endpos[0].lnum = rex.lnum;
4746 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
4747 }
4748 else
4749 // Use line number of "\ze".
4750 rex.lnum = rex.reg_endpos[0].lnum;
4751 }
4752 else
4753 {
4754 if (rex.reg_startp[0] == NULL)
4755 rex.reg_startp[0] = rex.line + col;
4756 if (rex.reg_endp[0] == NULL)
4757 rex.reg_endp[0] = rex.input;
4758 }
4759#ifdef FEAT_SYN_HL
4760 // Package any found \z(...\) matches for export. Default is none.
4761 unref_extmatch(re_extmatch_out);
4762 re_extmatch_out = NULL;
4763
4764 if (prog->reghasz == REX_SET)
4765 {
4766 int i;
4767
4768 cleanup_zsubexpr();
4769 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01004770 if (re_extmatch_out == NULL)
4771 return 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004772 for (i = 0; i < NSUBEXP; i++)
4773 {
4774 if (REG_MULTI)
4775 {
4776 // Only accept single line matches.
4777 if (reg_startzpos[i].lnum >= 0
4778 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
4779 && reg_endzpos[i].col >= reg_startzpos[i].col)
4780 re_extmatch_out->matches[i] =
4781 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
4782 + reg_startzpos[i].col,
4783 reg_endzpos[i].col - reg_startzpos[i].col);
4784 }
4785 else
4786 {
4787 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
4788 re_extmatch_out->matches[i] =
4789 vim_strnsave(reg_startzp[i],
Bram Moolenaar71ccd032020-06-12 22:59:11 +02004790 reg_endzp[i] - reg_startzp[i]);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004791 }
4792 }
4793 }
4794#endif
4795 return 1 + rex.lnum;
4796}
4797
4798/*
4799 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02004800 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004801 * Returns 0 for failure, number of lines contained in the match otherwise.
4802 */
4803 static long
4804bt_regexec_both(
4805 char_u *line,
4806 colnr_T col, // column to start looking for match
4807 proftime_T *tm, // timeout limit or NULL
4808 int *timed_out) // flag set on timeout or NULL
4809{
4810 bt_regprog_T *prog;
4811 char_u *s;
4812 long retval = 0L;
4813
4814 // Create "regstack" and "backpos" if they are not allocated yet.
4815 // We allocate *_INITIAL amount of bytes first and then set the grow size
4816 // to much bigger value to avoid many malloc calls in case of deep regular
4817 // expressions.
4818 if (regstack.ga_data == NULL)
4819 {
4820 // Use an item size of 1 byte, since we push different things
4821 // onto the regstack.
4822 ga_init2(&regstack, 1, REGSTACK_INITIAL);
4823 (void)ga_grow(&regstack, REGSTACK_INITIAL);
4824 regstack.ga_growsize = REGSTACK_INITIAL * 8;
4825 }
4826
4827 if (backpos.ga_data == NULL)
4828 {
4829 ga_init2(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
4830 (void)ga_grow(&backpos, BACKPOS_INITIAL);
4831 backpos.ga_growsize = BACKPOS_INITIAL * 8;
4832 }
4833
4834 if (REG_MULTI)
4835 {
4836 prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
4837 line = reg_getline((linenr_T)0);
4838 rex.reg_startpos = rex.reg_mmatch->startpos;
4839 rex.reg_endpos = rex.reg_mmatch->endpos;
4840 }
4841 else
4842 {
4843 prog = (bt_regprog_T *)rex.reg_match->regprog;
4844 rex.reg_startp = rex.reg_match->startp;
4845 rex.reg_endp = rex.reg_match->endp;
4846 }
4847
4848 // Be paranoid...
4849 if (prog == NULL || line == NULL)
4850 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004851 iemsg(_(e_null_argument));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004852 goto theend;
4853 }
4854
4855 // Check validity of program.
4856 if (prog_magic_wrong())
4857 goto theend;
4858
4859 // If the start column is past the maximum column: no need to try.
4860 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4861 goto theend;
4862
4863 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
4864 if (prog->regflags & RF_ICASE)
4865 rex.reg_ic = TRUE;
4866 else if (prog->regflags & RF_NOICASE)
4867 rex.reg_ic = FALSE;
4868
4869 // If pattern contains "\Z" overrule value of rex.reg_icombine
4870 if (prog->regflags & RF_ICOMBINE)
4871 rex.reg_icombine = TRUE;
4872
4873 // If there is a "must appear" string, look for it.
4874 if (prog->regmust != NULL)
4875 {
4876 int c;
4877
4878 if (has_mbyte)
4879 c = (*mb_ptr2char)(prog->regmust);
4880 else
4881 c = *prog->regmust;
4882 s = line + col;
4883
4884 // This is used very often, esp. for ":global". Use three versions of
4885 // the loop to avoid overhead of conditions.
4886 if (!rex.reg_ic && !has_mbyte)
4887 while ((s = vim_strbyte(s, c)) != NULL)
4888 {
4889 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4890 break; // Found it.
4891 ++s;
4892 }
4893 else if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
4894 while ((s = vim_strchr(s, c)) != NULL)
4895 {
4896 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4897 break; // Found it.
4898 MB_PTR_ADV(s);
4899 }
4900 else
4901 while ((s = cstrchr(s, c)) != NULL)
4902 {
4903 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4904 break; // Found it.
4905 MB_PTR_ADV(s);
4906 }
4907 if (s == NULL) // Not present.
4908 goto theend;
4909 }
4910
4911 rex.line = line;
4912 rex.lnum = 0;
4913 reg_toolong = FALSE;
4914
4915 // Simplest case: Anchored match need be tried only once.
4916 if (prog->reganch)
4917 {
4918 int c;
4919
4920 if (has_mbyte)
4921 c = (*mb_ptr2char)(rex.line + col);
4922 else
4923 c = rex.line[col];
4924 if (prog->regstart == NUL
4925 || prog->regstart == c
4926 || (rex.reg_ic
4927 && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
4928 || (c < 255 && prog->regstart < 255 &&
4929 MB_TOLOWER(prog->regstart) == MB_TOLOWER(c)))))
4930 retval = regtry(prog, col, tm, timed_out);
4931 else
4932 retval = 0;
4933 }
4934 else
4935 {
4936#ifdef FEAT_RELTIME
4937 int tm_count = 0;
4938#endif
4939 // Messy cases: unanchored match.
4940 while (!got_int)
4941 {
4942 if (prog->regstart != NUL)
4943 {
4944 // Skip until the char we know it must start with.
4945 // Used often, do some work to avoid call overhead.
4946 if (!rex.reg_ic && !has_mbyte)
4947 s = vim_strbyte(rex.line + col, prog->regstart);
4948 else
4949 s = cstrchr(rex.line + col, prog->regstart);
4950 if (s == NULL)
4951 {
4952 retval = 0;
4953 break;
4954 }
4955 col = (int)(s - rex.line);
4956 }
4957
4958 // Check for maximum column to try.
4959 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4960 {
4961 retval = 0;
4962 break;
4963 }
4964
4965 retval = regtry(prog, col, tm, timed_out);
4966 if (retval > 0)
4967 break;
4968
4969 // if not currently on the first line, get it again
4970 if (rex.lnum != 0)
4971 {
4972 rex.lnum = 0;
4973 rex.line = reg_getline((linenr_T)0);
4974 }
4975 if (rex.line[col] == NUL)
4976 break;
4977 if (has_mbyte)
4978 col += (*mb_ptr2len)(rex.line + col);
4979 else
4980 ++col;
4981#ifdef FEAT_RELTIME
4982 // Check for timeout once in a twenty times to avoid overhead.
4983 if (tm != NULL && ++tm_count == 20)
4984 {
4985 tm_count = 0;
4986 if (profile_passed_limit(tm))
4987 {
4988 if (timed_out != NULL)
4989 *timed_out = TRUE;
4990 break;
4991 }
4992 }
4993#endif
4994 }
4995 }
4996
4997theend:
4998 // Free "reg_tofree" when it's a bit big.
4999 // Free regstack and backpos if they are bigger than their initial size.
5000 if (reg_tofreelen > 400)
5001 VIM_CLEAR(reg_tofree);
5002 if (regstack.ga_maxlen > REGSTACK_INITIAL)
5003 ga_clear(&regstack);
5004 if (backpos.ga_maxlen > BACKPOS_INITIAL)
5005 ga_clear(&backpos);
5006
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005007 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005008 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005009 // Make sure the end is never before the start. Can happen when \zs
5010 // and \ze are used.
5011 if (REG_MULTI)
5012 {
5013 lpos_T *start = &rex.reg_mmatch->startpos[0];
5014 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005015
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005016 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005017 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005018 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
5019 }
5020 else
5021 {
5022 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
5023 rex.reg_match->endp[0] = rex.reg_match->startp[0];
5024 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005025 }
5026
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005027 return retval;
5028}
5029
5030/*
5031 * Match a regexp against a string.
5032 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5033 * Uses curbuf for line count and 'iskeyword'.
5034 * if "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
5035 *
5036 * Returns 0 for failure, number of lines contained in the match otherwise.
5037 */
5038 static int
5039bt_regexec_nl(
5040 regmatch_T *rmp,
5041 char_u *line, // string to match against
5042 colnr_T col, // column to start looking for match
5043 int line_lbr)
5044{
5045 rex.reg_match = rmp;
5046 rex.reg_mmatch = NULL;
5047 rex.reg_maxline = 0;
5048 rex.reg_line_lbr = line_lbr;
5049 rex.reg_buf = curbuf;
5050 rex.reg_win = NULL;
5051 rex.reg_ic = rmp->rm_ic;
5052 rex.reg_icombine = FALSE;
5053 rex.reg_maxcol = 0;
5054
5055 return bt_regexec_both(line, col, NULL, NULL);
5056}
5057
5058/*
5059 * Match a regexp against multiple lines.
5060 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5061 * Uses curbuf for line count and 'iskeyword'.
5062 *
5063 * Return zero if there is no match. Return number of lines contained in the
5064 * match otherwise.
5065 */
5066 static long
5067bt_regexec_multi(
5068 regmmatch_T *rmp,
5069 win_T *win, // window in which to search or NULL
5070 buf_T *buf, // buffer in which to search
5071 linenr_T lnum, // nr of line to start looking for match
5072 colnr_T col, // column to start looking for match
5073 proftime_T *tm, // timeout limit or NULL
5074 int *timed_out) // flag set on timeout or NULL
5075{
Bram Moolenaarf4140482020-02-15 23:06:45 +01005076 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005077 return bt_regexec_both(NULL, col, tm, timed_out);
5078}
5079
5080/*
5081 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5082 */
5083 static int
5084re_num_cmp(long_u val, char_u *scan)
5085{
5086 long_u n = OPERAND_MIN(scan);
5087
5088 if (OPERAND_CMP(scan) == '>')
5089 return val > n;
5090 if (OPERAND_CMP(scan) == '<')
5091 return val < n;
5092 return val == n;
5093}
5094
#ifdef BT_REGEXP_DUMP

/*
 * regdump - dump a regexp in vaguely comprehensible form
 *
 * "pattern" is the text of the pattern, "r" the compiled program.
 * The dump goes to stdout, unless BT_REGEXP_LOG is defined: then it is
 * appended to the file "bt_regexp_log.log".  Nothing is written when that
 * file cannot be opened.
 */
    static void
regdump(char_u *pattern, bt_regprog_T *r)
{
    char_u  *s;
    int     op = EXACTLY;       // Arbitrary non-END op.
    char_u  *next;
    char_u  *end = NULL;        // furthest "next" pointer seen so far
    FILE    *f;

#ifdef BT_REGEXP_LOG
    f = fopen("bt_regexp_log.log", "a");
#else
    f = stdout;
#endif
    if (f == NULL)
        return;
    fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);

    // Loop until we find the END that isn't before a referred next (an END
    // can also appear in a NOMATCH operand).
    for (s = r->program + 1; op != END || s <= end; )
    {
        op = OP(s);
        fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); // Where, what.

        // Offset of the next item, zero when there is none.
        next = regnext(s);
        if (next != NULL)
            fprintf(f, "(%d)", (int)(next - r->program));
        else
            fprintf(f, "(0)");
        if (end < next)
            end = next;

        // Print the numeric operands and skip over them.
        switch (op)
        {
            case BRACE_LIMITS:
                // Two ints
                fprintf(f, " minval %ld, maxval %ld",
                                          OPERAND_MIN(s), OPERAND_MAX(s));
                s += 8;
                break;

            case BEHIND:
            case NOBEHIND:
                // one int
                fprintf(f, " count %ld", OPERAND_MIN(s));
                s += 4;
                break;

            case RE_LNUM:
            case RE_COL:
            case RE_VCOL:
                // one int plus comparator
                fprintf(f, " count %ld", OPERAND_MIN(s));
                s += 5;
                break;

            default:
                break;
        }

        // Skip the opcode and the "next" pointer.
        s += 3;

        if (op == ANYOF || op == ANYOF + ADD_NL
                || op == ANYBUT || op == ANYBUT + ADD_NL
                || op == EXACTLY)
        {
            // Literal string, where present.
            fprintf(f, "\nxxxxxxxxx\n");
            for ( ; *s != NUL; ++s)
                fprintf(f, "%c", *s);
            fprintf(f, "\nxxxxxxxxx\n");
            s++;                // skip the terminating NUL
        }
        fprintf(f, "\r\n");
    }

    // Header fields of interest.
    if (r->regstart != NUL)
        fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
                ? (char *)transchar(r->regstart)
                : "multibyte", r->regstart);
    if (r->reganch)
        fprintf(f, "anchored; ");
    if (r->regmust != NULL)
        fprintf(f, "must have \"%s\"", r->regmust);
    fprintf(f, "\r\n");

#ifdef BT_REGEXP_LOG
    fclose(f);
#endif
}
#endif      // BT_REGEXP_DUMP
5181
#ifdef DEBUG
/*
 * regprop - printable representation of opcode
 *
 * Returns a pointer to a static buffer holding ":" followed by the name of
 * the opcode at "op"; the buffer is overwritten by the next call.
 * An opcode that is not recognized is shown as "corrupt" plus its number.
 */
    static char_u *
regprop(char_u *op)
{
    // Opcodes that are shown with a fixed name.  When "with_nl" is non-zero
    // the opcode also exists with ADD_NL added, which is shown as the name
    // followed by "+NL".
    static const struct
    {
        int     code;
        char    *name;
        int     with_nl;
    } fixed[] =
    {
        {BOL,           "BOL",          0},
        {EOL,           "EOL",          0},
        {RE_BOF,        "BOF",          0},
        {RE_EOF,        "EOF",          0},
        {CURSOR,        "CURSOR",       0},
        {RE_VISUAL,     "RE_VISUAL",    0},
        {RE_LNUM,       "RE_LNUM",      0},
        {RE_MARK,       "RE_MARK",      0},
        {RE_COL,        "RE_COL",       0},
        {RE_VCOL,       "RE_VCOL",      0},
        {BOW,           "BOW",          0},
        {EOW,           "EOW",          0},
        {ANY,           "ANY",          1},
        {ANYOF,         "ANYOF",        1},
        {ANYBUT,        "ANYBUT",       1},
        {IDENT,         "IDENT",        1},
        {SIDENT,        "SIDENT",       1},
        {KWORD,         "KWORD",        1},
        {SKWORD,        "SKWORD",       1},
        {FNAME,         "FNAME",        1},
        {SFNAME,        "SFNAME",       1},
        {PRINT,         "PRINT",        1},
        {SPRINT,        "SPRINT",       1},
        {WHITE,         "WHITE",        1},
        {NWHITE,        "NWHITE",       1},
        {DIGIT,         "DIGIT",        1},
        {NDIGIT,        "NDIGIT",       1},
        {HEX,           "HEX",          1},
        {NHEX,          "NHEX",         1},
        {OCTAL,         "OCTAL",        1},
        {NOCTAL,        "NOCTAL",       1},
        {WORD,          "WORD",         1},
        {NWORD,         "NWORD",        1},
        {HEAD,          "HEAD",         1},
        {NHEAD,         "NHEAD",        1},
        {ALPHA,         "ALPHA",        1},
        {NALPHA,        "NALPHA",       1},
        {LOWER,         "LOWER",        1},
        {NLOWER,        "NLOWER",       1},
        {UPPER,         "UPPER",        1},
        {NUPPER,        "NUPPER",       1},
        {BRANCH,        "BRANCH",       0},
        {EXACTLY,       "EXACTLY",      0},
        {NOTHING,       "NOTHING",      0},
        {BACK,          "BACK",         0},
        {END,           "END",          0},
        {MOPEN + 0,     "MATCH START",  0},
        {MCLOSE + 0,    "MATCH END",    0},
        {NOPEN,         "NOPEN",        0},
        {NCLOSE,        "NCLOSE",       0},
        {STAR,          "STAR",         0},
        {PLUS,          "PLUS",         0},
        {NOMATCH,       "NOMATCH",      0},
        {MATCH,         "MATCH",        0},
        {BEHIND,        "BEHIND",       0},
        {NOBEHIND,      "NOBEHIND",     0},
        {SUBPAT,        "SUBPAT",       0},
        {BRACE_LIMITS,  "BRACE_LIMITS", 0},
        {BRACE_SIMPLE,  "BRACE_SIMPLE", 0},
        {MULTIBYTECODE, "MULTIBYTECODE", 0},
        {NEWL,          "NEWL",         0},
    };
    // Opcode families: "base + lowest" up to "base + 9" are shown as the name
    // followed by the offset from "base".
    static const struct
    {
        int     base;
        int     lowest;
        char    *name;
    } numbered[] =
    {
        {MOPEN,         1, "MOPEN"},
        {MCLOSE,        1, "MCLOSE"},
        {BACKREF,       1, "BACKREF"},
#ifdef FEAT_SYN_HL
        {ZOPEN,         1, "ZOPEN"},
        {ZCLOSE,        1, "ZCLOSE"},
        {ZREF,          1, "ZREF"},
#endif
        {BRACE_COMPLEX, 0, "BRACE_COMPLEX"},
    };
    static char buf[50];
    int         code = (int) OP(op);
    int         i;

    STRCPY(buf, ":");

    for (i = 0; i < (int)(sizeof(fixed) / sizeof(fixed[0])); ++i)
    {
        if (code == fixed[i].code)
        {
            STRCAT(buf, fixed[i].name);
            return (char_u *)buf;
        }
        if (fixed[i].with_nl && code == fixed[i].code + ADD_NL)
        {
            STRCAT(buf, fixed[i].name);
            STRCAT(buf, "+NL");
            return (char_u *)buf;
        }
    }

    for (i = 0; i < (int)(sizeof(numbered) / sizeof(numbered[0])); ++i)
    {
        if (code >= numbered[i].base + numbered[i].lowest
                && code <= numbered[i].base + 9)
        {
            sprintf(buf + STRLEN(buf), "%s%d", numbered[i].name,
                                                   code - numbered[i].base);
            return (char_u *)buf;
        }
    }

    // Not an opcode we know about.
    sprintf(buf + STRLEN(buf), "corrupt %d", code);
    return (char_u *)buf;
}
#endif      // DEBUG