blob: 18bad80548158d0d28c628450dec849462c58f4c [file] [log] [blame]
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001/* vi:set ts=8 sts=4 sw=4 noet:
2 *
3 * Backtracking regular expression implementation.
4 *
5 * This file is included in "regexp.c".
6 *
7 * NOTICE:
8 *
9 * This is NOT the original regular expression code as written by Henry
10 * Spencer. This code has been modified specifically for use with the VIM
11 * editor, and should not be used separately from Vim. If you want a good
12 * regular expression library, get the original code. The copyright notice
13 * that follows is from the original.
14 *
15 * END NOTICE
16 *
17 * Copyright (c) 1986 by University of Toronto.
18 * Written by Henry Spencer. Not derived from licensed software.
19 *
20 * Permission is granted to anyone to use this software for any
21 * purpose on any computer system, and to redistribute it freely,
22 * subject to the following restrictions:
23 *
24 * 1. The author is not responsible for the consequences of use of
25 * this software, no matter how awful, even if they arise
26 * from defects in it.
27 *
28 * 2. The origin of this software must not be misrepresented, either
29 * by explicit claim or by omission.
30 *
31 * 3. Altered versions must be plainly marked as such, and must not
32 * be misrepresented as being the original software.
33 *
34 * Beware that some of this code is subtly aware of the way operator
35 * precedence is structured in regular expressions. Serious changes in
36 * regular-expression syntax might require a total rethink.
37 *
38 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
39 * Webb, Ciaran McCreesh and Bram Moolenaar.
40 * Named character class support added by Walter Briscoe (1998 Jul 01)
41 */
42
43/*
44 * The "internal use only" fields in regexp.h are present to pass info from
45 * compile to execute that permits the execute phase to run lots faster on
46 * simple cases. They are:
47 *
48 * regstart char that must begin a match; NUL if none obvious; Can be a
49 * multi-byte character.
50 * reganch is the match anchored (at beginning-of-line only)?
51 * regmust string (pointer into program) that match must include, or NULL
52 * regmlen length of regmust string
53 * regflags RF_ values or'ed together
54 *
55 * Regstart and reganch permit very fast decisions on suitable starting points
56 * for a match, cutting down the work a lot. Regmust permits fast rejection
57 * of lines that cannot possibly match. The regmust tests are costly enough
58 * that vim_regcomp() supplies a regmust only if the r.e. contains something
59 * potentially expensive (at present, the only such thing detected is * or +
60 * at the start of the r.e., which can involve a lot of backup). Regmlen is
61 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
62 * computing it anyway.
63 */
64
65/*
66 * Structure for regexp "program". This is essentially a linear encoding
67 * of a nondeterministic finite-state machine (aka syntax charts or
68 * "railroad normal form" in parsing technology). Each node is an opcode
69 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
71 * pointer with a BRANCH on both ends of it is connecting two alternatives.
72 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
73 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
75 * node points to the node after the stuff to be repeated.
76 * The operand of some types of node is a literal string; for others, it is a
77 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
78 * is the first node of the branch.
79 * (NB this is *not* a tree structure: the tail of the branch connects to the
80 * thing following the set of BRANCHes.)
81 *
82 * pattern is coded like:
83 *
84 * +-----------------+
85 * | V
86 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
87 * | ^ | ^
88 * +------+ +----------+
89 *
90 *
91 * +------------------+
92 * V |
93 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
94 * | | ^ ^
95 * | +---------------+ |
96 * +---------------------------------------------+
97 *
98 *
99 * +----------------------+
100 * V |
101 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
102 * | | ^ ^
103 * | +-----------+ |
104 * +--------------------------------------------------+
105 *
106 *
107 * +-------------------------+
108 * V |
109 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
110 * | | ^
111 * | +----------------+
112 * +-----------------------------------------------+
113 *
114 *
115 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
116 * | | ^ ^
117 * | +----------------+ |
118 * +--------------------------------+
119 *
120 * +---------+
121 * | V
122 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
123 * | | | | ^ ^
124 * | | | +-----+ |
125 * | | +----------------+ |
126 * | +---------------------------+ |
127 * +------------------------------------------------------+
128 *
129 * They all start with a BRANCH for "\|" alternatives, even when there is only
130 * one alternative.
131 */
132
133/*
134 * The opcodes are:
135 */
136
// Basic opcodes.  Columns: name, number, operand kind (if any), meaning.
// definition		number	   opnd?  meaning
#define END		0	//	  End of program or NOMATCH operand.
#define BOL		1	//	  Match "" at beginning of line.
#define EOL		2	//	  Match "" at end of line.
#define BRANCH		3	// node	  Match this alternative, or the
				//	  next...
#define BACK		4	//	  Match "", "next" ptr points backward.
#define EXACTLY		5	// str	  Match this string.
#define NOTHING		6	//	  Match empty string.
#define STAR		7	// node	  Match this (simple) thing 0 or more
				//	  times.
#define PLUS		8	// node	  Match this (simple) thing 1 or more
				//	  times.
#define MATCH		9	// node	  match the operand zero-width
#define NOMATCH		10	// node	  check for no match with operand
#define BEHIND		11	// node	  look behind for a match with operand
#define NOBEHIND	12	// node	  look behind for no match with operand
#define SUBPAT		13	// node	  match the operand here
#define BRACE_SIMPLE	14	// node	  Match this (simple) thing between m and
				//	  n times (\{m,n\}).
#define BOW		15	//	  Match "" after [^a-zA-Z0-9_]
#define EOW		16	//	  Match "" at [^a-zA-Z0-9_]
#define BRACE_LIMITS	17	// nr nr  define the min & max for BRACE_SIMPLE
				//	  and BRACE_COMPLEX.
#define NEWL		18	//	  Match line-break
#define BHPOS		19	//	  End position for BEHIND or NOBEHIND
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200163
164
// Character classes: 20-48 normal, 50-78 include a line-break.
// Adding ADD_NL to a normal class opcode gives the variant that also matches
// a line-break.  FIRST_NL and LAST_NL are fully parenthesized so that they
// keep their value inside any larger expression.
#define ADD_NL		30
#define FIRST_NL	(ANY + ADD_NL)
#define ANY		20	//	Match any one character.
#define ANYOF		21	// str	Match any character in this string.
#define ANYBUT		22	// str	Match any character not in this
				//	string.
#define IDENT		23	//	Match identifier char
#define SIDENT		24	//	Match identifier char but no digit
#define KWORD		25	//	Match keyword char
#define SKWORD		26	//	Match word char but no digit
#define FNAME		27	//	Match file name char
#define SFNAME		28	//	Match file name char but no digit
#define PRINT		29	//	Match printable char
#define SPRINT		30	//	Match printable char but no digit
#define WHITE		31	//	Match whitespace char
#define NWHITE		32	//	Match non-whitespace char
#define DIGIT		33	//	Match digit char
#define NDIGIT		34	//	Match non-digit char
#define HEX		35	//	Match hex char
#define NHEX		36	//	Match non-hex char
#define OCTAL		37	//	Match octal char
#define NOCTAL		38	//	Match non-octal char
#define WORD		39	//	Match word char
#define NWORD		40	//	Match non-word char
#define HEAD		41	//	Match head char
#define NHEAD		42	//	Match non-head char
#define ALPHA		43	//	Match alpha char
#define NALPHA		44	//	Match non-alpha char
#define LOWER		45	//	Match lowercase char
#define NLOWER		46	//	Match non-lowercase char
#define UPPER		47	//	Match uppercase char
#define NUPPER		48	//	Match non-uppercase char
#define LAST_NL		(NUPPER + ADD_NL)
// TRUE when "op" is one of the line-break-including class opcodes.
#define WITH_NL(op)	((op) >= FIRST_NL && (op) <= LAST_NL)
200
// Opcodes that come in groups of ten: the sub-expression number is added to
// the base value given here.
#define MOPEN		80  // -89	 Mark this point in input as start of
			    //		 \( subexpr.  MOPEN + 0 marks start of
			    //		 match.
#define MCLOSE		90  // -99	 Analogous to MOPEN.  MCLOSE + 0 marks
			    //		 end of match.
#define BACKREF		100 // -109 node Match same string again \1-\9

#ifdef FEAT_SYN_HL
# define ZOPEN		110 // -119	 Mark this point in input as start of
			    //		 \z( subexpr.
# define ZCLOSE		120 // -129	 Analogous to ZOPEN.
# define ZREF		130 // -139 node Match external submatch \z1-\z9
#endif

#define BRACE_COMPLEX	140 // -149 node Match nodes between m & n times

#define NOPEN		150 //		 Mark this point in input as start of
			    //		 \%( subexpr.
#define NCLOSE		151 //		 Analogous to NOPEN.

#define MULTIBYTECODE	200 // mbc	 Match one multi-byte character
#define RE_BOF		201 //		 Match "" at beginning of file.
#define RE_EOF		202 //		 Match "" at end of file.
#define CURSOR		203 //		 Match location of cursor.

#define RE_LNUM		204 // nr cmp	 Match line number
#define RE_COL		205 // nr cmp	 Match column number
#define RE_VCOL		206 // nr cmp	 Match virtual column number

#define RE_MARK		207 // mark cmp  Match mark position
#define RE_VISUAL	208 //		 Match Visual area
#define RE_COMPOSING	209 //		 any composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200233
/*
 * Flags to be passed up and down.
 * Each flag is a separate bit so that several can be or'ed together; WORST
 * is the value with no bit set.
 */
#define HASWIDTH	0x1	// Known never to match null string.
#define SIMPLE		0x2	// Simple enough to be STAR/PLUS operand.
#define SPSTART		0x4	// Starts with * or +.
#define HASNL		0x8	// Contains some \n.
#define HASLOOKBH	0x10	// Contains "\@<=" or "\@<!".
#define WORST		0	// Worst case.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200243
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200244static int num_complex_braces; // Complex \{...} count
245static char_u *regcode; // Code-emit pointer, or JUST_CALC_SIZE
246static long regsize; // Code size.
247static int reg_toolong; // TRUE when offset out of range
248static char_u had_endbrace[NSUBEXP]; // flags, TRUE if end of () found
249static long brace_min[10]; // Minimums for complex brace repeats
250static long brace_max[10]; // Maximums for complex brace repeats
251static int brace_count[10]; // Current counts for complex brace repeats
252static int one_exactly = FALSE; // only do one char for EXACTLY
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200253
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200254// When making changes to classchars also change nfa_classcodes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200255static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
256static int classcodes[] = {
257 ANY, IDENT, SIDENT, KWORD, SKWORD,
258 FNAME, SFNAME, PRINT, SPRINT,
259 WHITE, NWHITE, DIGIT, NDIGIT,
260 HEX, NHEX, OCTAL, NOCTAL,
261 WORD, NWORD, HEAD, NHEAD,
262 ALPHA, NALPHA, LOWER, NLOWER,
263 UPPER, NUPPER
264};
265
/*
 * Sentinel value for "regcode": while regcode equals this, no code is
 * emitted and only the required size is added up.
 */
#define JUST_CALC_SIZE	((char_u *) -1)
271
// Values for rs_state in regitem_T.
typedef enum regstate_E
{
    RS_NOPEN = 0,	// NOPEN and NCLOSE
    RS_MOPEN,		// MOPEN + [0-9]
    RS_MCLOSE,		// MCLOSE + [0-9]
#ifdef FEAT_SYN_HL
    RS_ZOPEN,		// ZOPEN + [0-9]
    RS_ZCLOSE,		// ZCLOSE + [0-9]
#endif
    RS_BRANCH,		// BRANCH
    RS_BRCPLX_MORE,	// BRACE_COMPLEX and trying one more match
    RS_BRCPLX_LONG,	// BRACE_COMPLEX and trying longest match
    RS_BRCPLX_SHORT,	// BRACE_COMPLEX and trying shortest match
    RS_NOMATCH,		// NOMATCH
    RS_BEHIND1,		// BEHIND / NOBEHIND matching rest
    RS_BEHIND2,		// BEHIND / NOBEHIND matching behind part
    RS_STAR_LONG,	// STAR/PLUS/BRACE_SIMPLE longest match
    RS_STAR_SHORT	// STAR/PLUS/BRACE_SIMPLE shortest match
} regstate_T;
292
293/*
294 * Structure used to save the current input state, when it needs to be
295 * restored after trying a match. Used by reg_save() and reg_restore().
296 * Also stores the length of "backpos".
297 */
298typedef struct
299{
300 union
301 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200302 char_u *ptr; // rex.input pointer, for single-line regexp
303 lpos_T pos; // rex.input pos, for multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200304 } rs_u;
305 int rs_len;
306} regsave_T;
307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200308// struct to save start/end pointer/position in for \(\)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200309typedef struct
310{
311 union
312 {
313 char_u *ptr;
314 lpos_T pos;
315 } se_u;
316} save_se_T;
317
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200318// used for BEHIND and NOBEHIND matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200319typedef struct regbehind_S
320{
321 regsave_T save_after;
322 regsave_T save_behind;
323 int save_need_clear_subexpr;
324 save_se_T save_start[NSUBEXP];
325 save_se_T save_end[NSUBEXP];
326} regbehind_T;
327
328/*
329 * When there are alternatives a regstate_T is put on the regstack to remember
330 * what we are doing.
331 * Before it may be another type of item, depending on rs_state, to remember
332 * more things.
333 */
334typedef struct regitem_S
335{
336 regstate_T rs_state; // what we are doing, one of RS_ above
337 short rs_no; // submatch nr or BEHIND/NOBEHIND
338 char_u *rs_scan; // current node in program
339 union
340 {
341 save_se_T sesave;
342 regsave_T regsave;
343 } rs_un; // room for saving rex.input
344} regitem_T;
345
346
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200347// used for STAR, PLUS and BRACE_SIMPLE matching
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200348typedef struct regstar_S
349{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200350 int nextb; // next byte
351 int nextb_ic; // next byte reverse case
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200352 long count;
353 long minval;
354 long maxval;
355} regstar_T;
356
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200357// used to store input position when a BACK was encountered, so that we now if
358// we made any progress since the last time.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200359typedef struct backpos_S
360{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200361 char_u *bp_scan; // "scan" where BACK was encountered
362 regsave_T bp_pos; // last input position
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363} backpos_T;
364
365/*
366 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
367 * to avoid invoking malloc() and free() often.
368 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
369 * or regbehind_T.
370 * "backpos_T" is a table with backpos_T for BACK
371 */
372static garray_T regstack = {0, 0, 0, 0, NULL};
373static garray_T backpos = {0, 0, 0, 0, NULL};
374
375static regsave_T behind_pos;
376
/*
 * Allocation strategy shared by the regstack and backpos tables, meant to
 * reduce the number of malloc/free calls:
 * - Initial size is fairly small.
 * - When needed, the tables are grown bigger (8 times at first, double after
 *   that).
 * - After executing the match we free the memory only if the array has grown.
 *   Thus the memory is kept allocated when it's at the initial size.
 * This makes it fast while not keeping a lot of memory allocated.
 * A three times speed increase was observed when using many simple patterns.
 */
#define REGSTACK_INITIAL	2048
#define BACKPOS_INITIAL		64
390
391/*
392 * Opcode notes:
393 *
394 * BRANCH The set of branches constituting a single choice are hooked
395 * together with their "next" pointers, since precedence prevents
396 * anything being concatenated to any individual branch. The
397 * "next" pointer of the last BRANCH in a choice points to the
398 * thing following the whole choice. This is also where the
399 * final "next" pointer of each individual branch points; each
400 * branch starts with the operand node of a BRANCH node.
401 *
402 * BACK Normal "next" pointers all implicitly point forward; BACK
403 * exists to make loop structures possible.
404 *
405 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
406 * BRANCH structures using BACK. Simple cases (one character
407 * per match) are implemented with STAR and PLUS for speed
408 * and to minimize recursive plunges.
409 *
410 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
411 * node, and defines the min and max limits to be used for that
412 * node.
413 *
414 * MOPEN,MCLOSE ...are numbered at compile time.
415 * ZOPEN,ZCLOSE ...ditto
416 */
417
/*
 * Node layout: one byte of opcode followed by two bytes of "next" pointer.
 * The "next" pointer is stored as two 8-bit bytes, high order first, and is
 * a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 */
// Opcode of the node at "p".
#define OP(p)		((int)*(p))
// The 16-bit "next" offset stored in bytes 1 and 2 of the node.
#define NEXT(p)		((((p)[1] & 0377) << 8) | ((p)[2] & 0377))
// Start of the operand: the first byte after the three-byte node header.
#define OPERAND(p)	((p) + 3)
// Obtain an operand that was stored as four bytes, MSB first.
#define OPERAND_MIN(p)	(((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
			+ ((long)(p)[5] << 8) + (long)(p)[6])
// Obtain a second operand stored as four bytes.
#define OPERAND_MAX(p)	OPERAND_MIN((p) + 4)
// Obtain a second single-byte operand stored after a four bytes operand.
#define OPERAND_CMP(p)	(p)[7]
438
439static char_u *reg(int paren, int *flagp);
440
441#ifdef BT_REGEXP_DUMP
442static void regdump(char_u *, bt_regprog_T *);
443#endif
444
445static int re_num_cmp(long_u val, char_u *scan);
446
447#ifdef DEBUG
448static char_u *regprop(char_u *);
449
450static int regnarrate = 0;
451#endif
452
453
454/*
455 * Setup to parse the regexp. Used once to get the length and once to do it.
456 */
457 static void
458regcomp_start(
459 char_u *expr,
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200460 int re_flags) // see vim_regcomp()
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200461{
462 initchr(expr);
463 if (re_flags & RE_MAGIC)
464 reg_magic = MAGIC_ON;
465 else
466 reg_magic = MAGIC_OFF;
467 reg_string = (re_flags & RE_STRING);
468 reg_strict = (re_flags & RE_STRICT);
469 get_cpo_flags();
470
471 num_complex_braces = 0;
472 regnpar = 1;
Bram Moolenaara80faa82020-04-12 19:37:17 +0200473 CLEAR_FIELD(had_endbrace);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200474#ifdef FEAT_SYN_HL
475 regnzpar = 1;
476 re_has_z = 0;
477#endif
478 regsize = 0L;
479 reg_toolong = FALSE;
480 regflags = 0;
481#if defined(FEAT_SYN_HL) || defined(PROTO)
482 had_eol = FALSE;
483#endif
484}
485
486/*
487 * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for
488 * character "c".
489 */
490 static int
491use_multibytecode(int c)
492{
493 return has_mbyte && (*mb_char2len)(c) > 1
494 && (re_multi_type(peekchr()) != NOT_MULTI
495 || (enc_utf8 && utf_iscomposing(c)));
496}
497
498/*
499 * Emit (if appropriate) a byte of code
500 */
501 static void
502regc(int b)
503{
504 if (regcode == JUST_CALC_SIZE)
505 regsize++;
506 else
507 *regcode++ = b;
508}
509
510/*
511 * Emit (if appropriate) a multi-byte character of code
512 */
513 static void
514regmbc(int c)
515{
516 if (!has_mbyte && c > 0xff)
517 return;
518 if (regcode == JUST_CALC_SIZE)
519 regsize += (*mb_char2len)(c);
520 else
521 regcode += (*mb_char2bytes)(c, regcode);
522}
523
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200524
525/*
526 * Produce the bytes for equivalence class "c".
527 * Currently only handles latin1, latin9 and utf-8.
528 * NOTE: When changing this function, also change nfa_emit_equi_class()
529 */
530 static void
531reg_equi_class(int c)
532{
533 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
534 || STRCMP(p_enc, "iso-8859-15") == 0)
535 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200536 switch (c)
537 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +0200538 // Do not use '\300' style, it results in a negative number.
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200539 case 'A': case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc4:
540 case 0xc5: case 0x100: case 0x102: case 0x104: case 0x1cd:
541 case 0x1de: case 0x1e0: case 0x1fa: case 0x202: case 0x226:
542 case 0x23a: case 0x1e00: case 0x1ea0: case 0x1ea2: case 0x1ea4:
543 case 0x1ea6: case 0x1ea8: case 0x1eaa: case 0x1eac: case 0x1eae:
544 case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
545 regmbc('A'); regmbc(0xc0); regmbc(0xc1); regmbc(0xc2);
546 regmbc(0xc3); regmbc(0xc4); regmbc(0xc5);
547 regmbc(0x100); regmbc(0x102); regmbc(0x104);
548 regmbc(0x1cd); regmbc(0x1de); regmbc(0x1e0);
549 regmbc(0x1fa); regmbc(0x202); regmbc(0x226);
550 regmbc(0x23a); regmbc(0x1e00); regmbc(0x1ea0);
551 regmbc(0x1ea2); regmbc(0x1ea4); regmbc(0x1ea6);
552 regmbc(0x1ea8); regmbc(0x1eaa); regmbc(0x1eac);
553 regmbc(0x1eae); regmbc(0x1eb0); regmbc(0x1eb2);
554 regmbc(0x1eb4); regmbc(0x1eb6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200555 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200556 case 'B': case 0x181: case 0x243: case 0x1e02:
557 case 0x1e04: case 0x1e06:
558 regmbc('B');
559 regmbc(0x181); regmbc(0x243); regmbc(0x1e02);
560 regmbc(0x1e04); regmbc(0x1e06);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200561 return;
562 case 'C': case 0xc7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200563 case 0x106: case 0x108: case 0x10a: case 0x10c: case 0x187:
564 case 0x23b: case 0x1e08: case 0xa792:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200565 regmbc('C'); regmbc(0xc7);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200566 regmbc(0x106); regmbc(0x108); regmbc(0x10a);
567 regmbc(0x10c); regmbc(0x187); regmbc(0x23b);
568 regmbc(0x1e08); regmbc(0xa792);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200569 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200570 case 'D': case 0x10e: case 0x110: case 0x18a:
571 case 0x1e0a: case 0x1e0c: case 0x1e0e: case 0x1e10:
572 case 0x1e12:
573 regmbc('D'); regmbc(0x10e); regmbc(0x110);
574 regmbc(0x18a); regmbc(0x1e0a); regmbc(0x1e0c);
575 regmbc(0x1e0e); regmbc(0x1e10); regmbc(0x1e12);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200576 return;
577 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200578 case 0x112: case 0x114: case 0x116: case 0x118: case 0x11a:
579 case 0x204: case 0x206: case 0x228: case 0x246: case 0x1e14:
580 case 0x1e16: case 0x1e18: case 0x1e1a: case 0x1e1c:
581 case 0x1eb8: case 0x1eba: case 0x1ebc: case 0x1ebe:
582 case 0x1ec0: case 0x1ec2: case 0x1ec4: case 0x1ec6:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200583 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200584 regmbc(0xca); regmbc(0xcb); regmbc(0x112);
585 regmbc(0x114); regmbc(0x116); regmbc(0x118);
586 regmbc(0x11a); regmbc(0x204); regmbc(0x206);
587 regmbc(0x228); regmbc(0x246); regmbc(0x1e14);
588 regmbc(0x1e16); regmbc(0x1e18); regmbc(0x1e1a);
589 regmbc(0x1e1c); regmbc(0x1eb8); regmbc(0x1eba);
590 regmbc(0x1ebc); regmbc(0x1ebe); regmbc(0x1ec0);
591 regmbc(0x1ec2); regmbc(0x1ec4); regmbc(0x1ec6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200592 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200593 case 'F': case 0x191: case 0x1e1e: case 0xa798:
594 regmbc('F'); regmbc(0x191); regmbc(0x1e1e);
595 regmbc(0xa798);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200596 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200597 case 'G': case 0x11c: case 0x11e: case 0x120:
598 case 0x122: case 0x193: case 0x1e4: case 0x1e6:
599 case 0x1f4: case 0x1e20: case 0xa7a0:
600 regmbc('G'); regmbc(0x11c); regmbc(0x11e);
601 regmbc(0x120); regmbc(0x122); regmbc(0x193);
602 regmbc(0x1e4); regmbc(0x1e6); regmbc(0x1f4);
603 regmbc(0x1e20); regmbc(0xa7a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200604 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200605 case 'H': case 0x124: case 0x126: case 0x21e:
606 case 0x1e22: case 0x1e24: case 0x1e26:
607 case 0x1e28: case 0x1e2a: case 0x2c67:
608 regmbc('H'); regmbc(0x124); regmbc(0x126);
609 regmbc(0x21e); regmbc(0x1e22); regmbc(0x1e24);
610 regmbc(0x1e26); regmbc(0x1e28); regmbc(0x1e2a);
611 regmbc(0x2c67);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200612 return;
613 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200614 case 0x128: case 0x12a: case 0x12c: case 0x12e:
615 case 0x130: case 0x197: case 0x1cf: case 0x208:
616 case 0x20a: case 0x1e2c: case 0x1e2e: case 0x1ec8:
617 case 0x1eca:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200618 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200619 regmbc(0xce); regmbc(0xcf); regmbc(0x128);
620 regmbc(0x12a); regmbc(0x12c); regmbc(0x12e);
621 regmbc(0x130); regmbc(0x197); regmbc(0x1cf);
622 regmbc(0x208); regmbc(0x20a); regmbc(0x1e2c);
623 regmbc(0x1e2e); regmbc(0x1ec8); regmbc(0x1eca);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200624 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200625 case 'J': case 0x134: case 0x248:
626 regmbc('J'); regmbc(0x134); regmbc(0x248);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200627 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200628 case 'K': case 0x136: case 0x198: case 0x1e8: case 0x1e30:
629 case 0x1e32: case 0x1e34: case 0x2c69: case 0xa740:
630 regmbc('K'); regmbc(0x136); regmbc(0x198);
631 regmbc(0x1e8); regmbc(0x1e30); regmbc(0x1e32);
632 regmbc(0x1e34); regmbc(0x2c69); regmbc(0xa740);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200633 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200634 case 'L': case 0x139: case 0x13b: case 0x13d: case 0x13f:
635 case 0x141: case 0x23d: case 0x1e36: case 0x1e38:
636 case 0x1e3a: case 0x1e3c: case 0x2c60:
637 regmbc('L'); regmbc(0x139); regmbc(0x13b);
638 regmbc(0x13d); regmbc(0x13f); regmbc(0x141);
639 regmbc(0x23d); regmbc(0x1e36); regmbc(0x1e38);
640 regmbc(0x1e3a); regmbc(0x1e3c); regmbc(0x2c60);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200641 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200642 case 'M': case 0x1e3e: case 0x1e40: case 0x1e42:
643 regmbc('M'); regmbc(0x1e3e); regmbc(0x1e40);
644 regmbc(0x1e42);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200645 return;
646 case 'N': case 0xd1:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200647 case 0x143: case 0x145: case 0x147: case 0x1f8:
648 case 0x1e44: case 0x1e46: case 0x1e48: case 0x1e4a:
649 case 0xa7a4:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200650 regmbc('N'); regmbc(0xd1);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200651 regmbc(0x143); regmbc(0x145); regmbc(0x147);
652 regmbc(0x1f8); regmbc(0x1e44); regmbc(0x1e46);
653 regmbc(0x1e48); regmbc(0x1e4a); regmbc(0xa7a4);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200654 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200655 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd6:
656 case 0xd8: case 0x14c: case 0x14e: case 0x150: case 0x19f:
657 case 0x1a0: case 0x1d1: case 0x1ea: case 0x1ec: case 0x1fe:
658 case 0x20c: case 0x20e: case 0x22a: case 0x22c: case 0x22e:
659 case 0x230: case 0x1e4c: case 0x1e4e: case 0x1e50: case 0x1e52:
660 case 0x1ecc: case 0x1ece: case 0x1ed0: case 0x1ed2: case 0x1ed4:
661 case 0x1ed6: case 0x1ed8: case 0x1eda: case 0x1edc: case 0x1ede:
662 case 0x1ee0: case 0x1ee2:
663 regmbc('O'); regmbc(0xd2); regmbc(0xd3); regmbc(0xd4);
664 regmbc(0xd5); regmbc(0xd6); regmbc(0xd8);
665 regmbc(0x14c); regmbc(0x14e); regmbc(0x150);
666 regmbc(0x19f); regmbc(0x1a0); regmbc(0x1d1);
667 regmbc(0x1ea); regmbc(0x1ec); regmbc(0x1fe);
668 regmbc(0x20c); regmbc(0x20e); regmbc(0x22a);
669 regmbc(0x22c); regmbc(0x22e); regmbc(0x230);
670 regmbc(0x1e4c); regmbc(0x1e4e); regmbc(0x1e50);
671 regmbc(0x1e52); regmbc(0x1ecc); regmbc(0x1ece);
672 regmbc(0x1ed0); regmbc(0x1ed2); regmbc(0x1ed4);
673 regmbc(0x1ed6); regmbc(0x1ed8); regmbc(0x1eda);
674 regmbc(0x1edc); regmbc(0x1ede); regmbc(0x1ee0);
675 regmbc(0x1ee2);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200676 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200677 case 'P': case 0x1a4: case 0x1e54: case 0x1e56: case 0x2c63:
678 regmbc('P'); regmbc(0x1a4); regmbc(0x1e54);
679 regmbc(0x1e56); regmbc(0x2c63);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200680 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200681 case 'Q': case 0x24a:
682 regmbc('Q'); regmbc(0x24a);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200683 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200684 case 'R': case 0x154: case 0x156: case 0x158: case 0x210:
685 case 0x212: case 0x24c: case 0x1e58: case 0x1e5a:
686 case 0x1e5c: case 0x1e5e: case 0x2c64: case 0xa7a6:
687 regmbc('R'); regmbc(0x154); regmbc(0x156);
688 regmbc(0x210); regmbc(0x212); regmbc(0x158);
689 regmbc(0x24c); regmbc(0x1e58); regmbc(0x1e5a);
690 regmbc(0x1e5c); regmbc(0x1e5e); regmbc(0x2c64);
691 regmbc(0xa7a6);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200692 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200693 case 'S': case 0x15a: case 0x15c: case 0x15e: case 0x160:
694 case 0x218: case 0x1e60: case 0x1e62: case 0x1e64:
695 case 0x1e66: case 0x1e68: case 0x2c7e: case 0xa7a8:
696 regmbc('S'); regmbc(0x15a); regmbc(0x15c);
697 regmbc(0x15e); regmbc(0x160); regmbc(0x218);
698 regmbc(0x1e60); regmbc(0x1e62); regmbc(0x1e64);
699 regmbc(0x1e66); regmbc(0x1e68); regmbc(0x2c7e);
700 regmbc(0xa7a8);
701 return;
702 case 'T': case 0x162: case 0x164: case 0x166: case 0x1ac:
703 case 0x1ae: case 0x21a: case 0x23e: case 0x1e6a: case 0x1e6c:
704 case 0x1e6e: case 0x1e70:
705 regmbc('T'); regmbc(0x162); regmbc(0x164);
706 regmbc(0x166); regmbc(0x1ac); regmbc(0x23e);
707 regmbc(0x1ae); regmbc(0x21a); regmbc(0x1e6a);
708 regmbc(0x1e6c); regmbc(0x1e6e); regmbc(0x1e70);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200709 return;
710 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200711 case 0x168: case 0x16a: case 0x16c: case 0x16e:
712 case 0x170: case 0x172: case 0x1af: case 0x1d3:
713 case 0x1d5: case 0x1d7: case 0x1d9: case 0x1db:
714 case 0x214: case 0x216: case 0x244: case 0x1e72:
715 case 0x1e74: case 0x1e76: case 0x1e78: case 0x1e7a:
716 case 0x1ee4: case 0x1ee6: case 0x1ee8: case 0x1eea:
717 case 0x1eec: case 0x1eee: case 0x1ef0:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200718 regmbc('U'); regmbc(0xd9); regmbc(0xda);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200719 regmbc(0xdb); regmbc(0xdc); regmbc(0x168);
720 regmbc(0x16a); regmbc(0x16c); regmbc(0x16e);
721 regmbc(0x170); regmbc(0x172); regmbc(0x1af);
722 regmbc(0x1d3); regmbc(0x1d5); regmbc(0x1d7);
723 regmbc(0x1d9); regmbc(0x1db); regmbc(0x214);
724 regmbc(0x216); regmbc(0x244); regmbc(0x1e72);
725 regmbc(0x1e74); regmbc(0x1e76); regmbc(0x1e78);
726 regmbc(0x1e7a); regmbc(0x1ee4); regmbc(0x1ee6);
727 regmbc(0x1ee8); regmbc(0x1eea); regmbc(0x1eec);
728 regmbc(0x1eee); regmbc(0x1ef0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200729 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200730 case 'V': case 0x1b2: case 0x1e7c: case 0x1e7e:
731 regmbc('V'); regmbc(0x1b2); regmbc(0x1e7c);
732 regmbc(0x1e7e);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200733 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200734 case 'W': case 0x174: case 0x1e80: case 0x1e82:
735 case 0x1e84: case 0x1e86: case 0x1e88:
736 regmbc('W'); regmbc(0x174); regmbc(0x1e80);
737 regmbc(0x1e82); regmbc(0x1e84); regmbc(0x1e86);
738 regmbc(0x1e88);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200739 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200740 case 'X': case 0x1e8a: case 0x1e8c:
741 regmbc('X'); regmbc(0x1e8a); regmbc(0x1e8c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200742 return;
743 case 'Y': case 0xdd:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200744 case 0x176: case 0x178: case 0x1b3: case 0x232: case 0x24e:
745 case 0x1e8e: case 0x1ef2: case 0x1ef6: case 0x1ef4: case 0x1ef8:
746 regmbc('Y'); regmbc(0xdd); regmbc(0x176);
747 regmbc(0x178); regmbc(0x1b3); regmbc(0x232);
748 regmbc(0x24e); regmbc(0x1e8e); regmbc(0x1ef2);
749 regmbc(0x1ef4); regmbc(0x1ef6); regmbc(0x1ef8);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200750 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200751 case 'Z': case 0x179: case 0x17b: case 0x17d: case 0x1b5:
752 case 0x1e90: case 0x1e92: case 0x1e94: case 0x2c6b:
753 regmbc('Z'); regmbc(0x179); regmbc(0x17b);
754 regmbc(0x17d); regmbc(0x1b5); regmbc(0x1e90);
755 regmbc(0x1e92); regmbc(0x1e94); regmbc(0x2c6b);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200756 return;
757 case 'a': case 0xe0: case 0xe1: case 0xe2:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200758 case 0xe3: case 0xe4: case 0xe5: case 0x101: case 0x103:
759 case 0x105: case 0x1ce: case 0x1df: case 0x1e1: case 0x1fb:
760 case 0x201: case 0x203: case 0x227: case 0x1d8f: case 0x1e01:
761 case 0x1e9a: case 0x1ea1: case 0x1ea3: case 0x1ea5:
762 case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
763 case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5:
764 case 0x1eb7: case 0x2c65:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200765 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
766 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200767 regmbc(0xe5); regmbc(0x101); regmbc(0x103);
768 regmbc(0x105); regmbc(0x1ce); regmbc(0x1df);
769 regmbc(0x1e1); regmbc(0x1fb); regmbc(0x201);
770 regmbc(0x203); regmbc(0x227); regmbc(0x1d8f);
771 regmbc(0x1e01); regmbc(0x1e9a); regmbc(0x1ea1);
772 regmbc(0x1ea3); regmbc(0x1ea5); regmbc(0x1ea7);
773 regmbc(0x1ea9); regmbc(0x1eab); regmbc(0x1ead);
774 regmbc(0x1eaf); regmbc(0x1eb1); regmbc(0x1eb3);
775 regmbc(0x1eb5); regmbc(0x1eb7); regmbc(0x2c65);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200776 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200777 case 'b': case 0x180: case 0x253: case 0x1d6c: case 0x1d80:
778 case 0x1e03: case 0x1e05: case 0x1e07:
779 regmbc('b');
780 regmbc(0x180); regmbc(0x253); regmbc(0x1d6c);
781 regmbc(0x1d80); regmbc(0x1e03); regmbc(0x1e05);
782 regmbc(0x1e07);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200783 return;
784 case 'c': case 0xe7:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200785 case 0x107: case 0x109: case 0x10b: case 0x10d: case 0x188:
786 case 0x23c: case 0x1e09: case 0xa793: case 0xa794:
787 regmbc('c'); regmbc(0xe7); regmbc(0x107);
788 regmbc(0x109); regmbc(0x10b); regmbc(0x10d);
789 regmbc(0x188); regmbc(0x23c); regmbc(0x1e09);
790 regmbc(0xa793); regmbc(0xa794);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200791 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200792 case 'd': case 0x10f: case 0x111: case 0x257: case 0x1d6d:
793 case 0x1d81: case 0x1d91: case 0x1e0b: case 0x1e0d:
794 case 0x1e0f: case 0x1e11: case 0x1e13:
795 regmbc('d'); regmbc(0x10f); regmbc(0x111);
796 regmbc(0x257); regmbc(0x1d6d); regmbc(0x1d81);
797 regmbc(0x1d91); regmbc(0x1e0b); regmbc(0x1e0d);
798 regmbc(0x1e0f); regmbc(0x1e11); regmbc(0x1e13);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200799 return;
800 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200801 case 0x113: case 0x115: case 0x117: case 0x119:
802 case 0x11b: case 0x205: case 0x207: case 0x229:
803 case 0x247: case 0x1d92: case 0x1e15: case 0x1e17:
804 case 0x1e19: case 0x1e1b: case 0x1eb9: case 0x1ebb:
805 case 0x1e1d: case 0x1ebd: case 0x1ebf: case 0x1ec1:
806 case 0x1ec3: case 0x1ec5: case 0x1ec7:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200807 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200808 regmbc(0xea); regmbc(0xeb); regmbc(0x113);
809 regmbc(0x115); regmbc(0x117); regmbc(0x119);
810 regmbc(0x11b); regmbc(0x205); regmbc(0x207);
811 regmbc(0x229); regmbc(0x247); regmbc(0x1d92);
812 regmbc(0x1e15); regmbc(0x1e17); regmbc(0x1e19);
813 regmbc(0x1e1b); regmbc(0x1e1d); regmbc(0x1eb9);
814 regmbc(0x1ebb); regmbc(0x1ebd); regmbc(0x1ebf);
815 regmbc(0x1ec1); regmbc(0x1ec3); regmbc(0x1ec5);
816 regmbc(0x1ec7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200817 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200818 case 'f': case 0x192: case 0x1d6e: case 0x1d82:
819 case 0x1e1f: case 0xa799:
820 regmbc('f'); regmbc(0x192); regmbc(0x1d6e);
821 regmbc(0x1d82); regmbc(0x1e1f); regmbc(0xa799);
822 return;
823 case 'g': case 0x11d: case 0x11f: case 0x121: case 0x123:
824 case 0x1e5: case 0x1e7: case 0x260: case 0x1f5: case 0x1d83:
825 case 0x1e21: case 0xa7a1:
826 regmbc('g'); regmbc(0x11d); regmbc(0x11f);
827 regmbc(0x121); regmbc(0x123); regmbc(0x1e5);
828 regmbc(0x1e7); regmbc(0x1f5); regmbc(0x260);
829 regmbc(0x1d83); regmbc(0x1e21); regmbc(0xa7a1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200830 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200831 case 'h': case 0x125: case 0x127: case 0x21f: case 0x1e23:
832 case 0x1e25: case 0x1e27: case 0x1e29: case 0x1e2b:
833 case 0x1e96: case 0x2c68: case 0xa795:
834 regmbc('h'); regmbc(0x125); regmbc(0x127);
835 regmbc(0x21f); regmbc(0x1e23); regmbc(0x1e25);
836 regmbc(0x1e27); regmbc(0x1e29); regmbc(0x1e2b);
837 regmbc(0x1e96); regmbc(0x2c68); regmbc(0xa795);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200838 return;
839 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200840 case 0x129: case 0x12b: case 0x12d: case 0x12f:
841 case 0x1d0: case 0x209: case 0x20b: case 0x268:
842 case 0x1d96: case 0x1e2d: case 0x1e2f: case 0x1ec9:
843 case 0x1ecb:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200844 regmbc('i'); regmbc(0xec); regmbc(0xed);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200845 regmbc(0xee); regmbc(0xef); regmbc(0x129);
846 regmbc(0x12b); regmbc(0x12d); regmbc(0x12f);
847 regmbc(0x1d0); regmbc(0x209); regmbc(0x20b);
848 regmbc(0x268); regmbc(0x1d96); regmbc(0x1e2d);
849 regmbc(0x1e2f); regmbc(0x1ec9); regmbc(0x1ecb);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200850 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200851 case 'j': case 0x135: case 0x1f0: case 0x249:
852 regmbc('j'); regmbc(0x135); regmbc(0x1f0);
853 regmbc(0x249);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200854 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200855 case 'k': case 0x137: case 0x199: case 0x1e9:
856 case 0x1d84: case 0x1e31: case 0x1e33: case 0x1e35:
857 case 0x2c6a: case 0xa741:
858 regmbc('k'); regmbc(0x137); regmbc(0x199);
859 regmbc(0x1e9); regmbc(0x1d84); regmbc(0x1e31);
860 regmbc(0x1e33); regmbc(0x1e35); regmbc(0x2c6a);
861 regmbc(0xa741);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200862 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200863 case 'l': case 0x13a: case 0x13c: case 0x13e:
864 case 0x140: case 0x142: case 0x19a: case 0x1e37:
865 case 0x1e39: case 0x1e3b: case 0x1e3d: case 0x2c61:
866 regmbc('l'); regmbc(0x13a); regmbc(0x13c);
867 regmbc(0x13e); regmbc(0x140); regmbc(0x142);
868 regmbc(0x19a); regmbc(0x1e37); regmbc(0x1e39);
869 regmbc(0x1e3b); regmbc(0x1e3d); regmbc(0x2c61);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200870 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200871 case 'm': case 0x1d6f: case 0x1e3f: case 0x1e41: case 0x1e43:
872 regmbc('m'); regmbc(0x1d6f); regmbc(0x1e3f);
873 regmbc(0x1e41); regmbc(0x1e43);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200874 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200875 case 'n': case 0xf1: case 0x144: case 0x146: case 0x148:
876 case 0x149: case 0x1f9: case 0x1d70: case 0x1d87:
877 case 0x1e45: case 0x1e47: case 0x1e49: case 0x1e4b:
878 case 0xa7a5:
879 regmbc('n'); regmbc(0xf1); regmbc(0x144);
880 regmbc(0x146); regmbc(0x148); regmbc(0x149);
881 regmbc(0x1f9); regmbc(0x1d70); regmbc(0x1d87);
882 regmbc(0x1e45); regmbc(0x1e47); regmbc(0x1e49);
883 regmbc(0x1e4b); regmbc(0xa7a5);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200884 return;
885 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200886 case 0xf6: case 0xf8: case 0x14d: case 0x14f: case 0x151:
887 case 0x1a1: case 0x1d2: case 0x1eb: case 0x1ed: case 0x1ff:
888 case 0x20d: case 0x20f: case 0x22b: case 0x22d: case 0x22f:
889 case 0x231: case 0x275: case 0x1e4d: case 0x1e4f:
890 case 0x1e51: case 0x1e53: case 0x1ecd: case 0x1ecf:
891 case 0x1ed1: case 0x1ed3: case 0x1ed5: case 0x1ed7:
892 case 0x1ed9: case 0x1edb: case 0x1edd: case 0x1edf:
893 case 0x1ee1: case 0x1ee3:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200894 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
895 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200896 regmbc(0xf8); regmbc(0x14d); regmbc(0x14f);
897 regmbc(0x151); regmbc(0x1a1); regmbc(0x1d2);
898 regmbc(0x1eb); regmbc(0x1ed); regmbc(0x1ff);
899 regmbc(0x20d); regmbc(0x20f); regmbc(0x22b);
900 regmbc(0x22d); regmbc(0x22f); regmbc(0x231);
901 regmbc(0x275); regmbc(0x1e4d); regmbc(0x1e4f);
902 regmbc(0x1e51); regmbc(0x1e53); regmbc(0x1ecd);
903 regmbc(0x1ecf); regmbc(0x1ed1); regmbc(0x1ed3);
904 regmbc(0x1ed5); regmbc(0x1ed7); regmbc(0x1ed9);
905 regmbc(0x1edb); regmbc(0x1edd); regmbc(0x1edf);
906 regmbc(0x1ee1); regmbc(0x1ee3);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200907 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200908 case 'p': case 0x1a5: case 0x1d71: case 0x1d88: case 0x1d7d:
909 case 0x1e55: case 0x1e57:
910 regmbc('p'); regmbc(0x1a5); regmbc(0x1d71);
911 regmbc(0x1d7d); regmbc(0x1d88); regmbc(0x1e55);
912 regmbc(0x1e57);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200913 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200914 case 'q': case 0x24b: case 0x2a0:
915 regmbc('q'); regmbc(0x24b); regmbc(0x2a0);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200916 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200917 case 'r': case 0x155: case 0x157: case 0x159: case 0x211:
918 case 0x213: case 0x24d: case 0x27d: case 0x1d72: case 0x1d73:
919 case 0x1d89: case 0x1e59: case 0x1e5b: case 0x1e5d: case 0x1e5f:
920 case 0xa7a7:
921 regmbc('r'); regmbc(0x155); regmbc(0x157);
922 regmbc(0x159); regmbc(0x211); regmbc(0x213);
923 regmbc(0x24d); regmbc(0x1d72); regmbc(0x1d73);
924 regmbc(0x1d89); regmbc(0x1e59); regmbc(0x27d);
925 regmbc(0x1e5b); regmbc(0x1e5d); regmbc(0x1e5f);
926 regmbc(0xa7a7);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200927 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200928 case 's': case 0x15b: case 0x15d: case 0x15f: case 0x161:
929 case 0x1e61: case 0x219: case 0x23f: case 0x1d74: case 0x1d8a:
930 case 0x1e63: case 0x1e65: case 0x1e67: case 0x1e69: case 0xa7a9:
931 regmbc('s'); regmbc(0x15b); regmbc(0x15d);
932 regmbc(0x15f); regmbc(0x161); regmbc(0x23f);
933 regmbc(0x219); regmbc(0x1d74); regmbc(0x1d8a);
934 regmbc(0x1e61); regmbc(0x1e63); regmbc(0x1e65);
935 regmbc(0x1e67); regmbc(0x1e69); regmbc(0xa7a9);
936 return;
937 case 't': case 0x163: case 0x165: case 0x167: case 0x1ab:
938 case 0x1ad: case 0x21b: case 0x288: case 0x1d75: case 0x1e6b:
939 case 0x1e6d: case 0x1e6f: case 0x1e71: case 0x1e97: case 0x2c66:
940 regmbc('t'); regmbc(0x163); regmbc(0x165);
941 regmbc(0x167); regmbc(0x1ab); regmbc(0x21b);
942 regmbc(0x1ad); regmbc(0x288); regmbc(0x1d75);
943 regmbc(0x1e6b); regmbc(0x1e6d); regmbc(0x1e6f);
944 regmbc(0x1e71); regmbc(0x1e97); regmbc(0x2c66);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200945 return;
946 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200947 case 0x169: case 0x16b: case 0x16d: case 0x16f:
948 case 0x171: case 0x173: case 0x1b0: case 0x1d4:
949 case 0x1d6: case 0x1d8: case 0x1da: case 0x1dc:
950 case 0x215: case 0x217: case 0x289: case 0x1e73:
951 case 0x1d7e: case 0x1d99: case 0x1e75: case 0x1e77:
952 case 0x1e79: case 0x1e7b: case 0x1ee5: case 0x1ee7:
953 case 0x1ee9: case 0x1eeb: case 0x1eed: case 0x1eef:
954 case 0x1ef1:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200955 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200956 regmbc(0xfb); regmbc(0xfc); regmbc(0x169);
957 regmbc(0x16b); regmbc(0x16d); regmbc(0x16f);
958 regmbc(0x171); regmbc(0x173); regmbc(0x1d6);
959 regmbc(0x1d8); regmbc(0x1da); regmbc(0x1dc);
960 regmbc(0x215); regmbc(0x217); regmbc(0x1b0);
961 regmbc(0x1d4); regmbc(0x289); regmbc(0x1d7e);
962 regmbc(0x1d99); regmbc(0x1e73); regmbc(0x1e75);
963 regmbc(0x1e77); regmbc(0x1e79); regmbc(0x1e7b);
964 regmbc(0x1ee5); regmbc(0x1ee7); regmbc(0x1ee9);
965 regmbc(0x1eeb); regmbc(0x1eed); regmbc(0x1eef);
966 regmbc(0x1ef1);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200967 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200968 case 'v': case 0x28b: case 0x1d8c: case 0x1e7d: case 0x1e7f:
969 regmbc('v'); regmbc(0x28b); regmbc(0x1d8c);
970 regmbc(0x1e7d); regmbc(0x1e7f);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200971 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200972 case 'w': case 0x175: case 0x1e81: case 0x1e83:
973 case 0x1e85: case 0x1e87: case 0x1e89: case 0x1e98:
974 regmbc('w'); regmbc(0x175); regmbc(0x1e81);
975 regmbc(0x1e83); regmbc(0x1e85); regmbc(0x1e87);
976 regmbc(0x1e89); regmbc(0x1e98);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200977 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200978 case 'x': case 0x1e8b: case 0x1e8d:
979 regmbc('x'); regmbc(0x1e8b); regmbc(0x1e8d);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200980 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200981 case 'y': case 0xfd: case 0xff: case 0x177: case 0x1b4:
982 case 0x233: case 0x24f: case 0x1e8f: case 0x1e99: case 0x1ef3:
983 case 0x1ef5: case 0x1ef7: case 0x1ef9:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200984 regmbc('y'); regmbc(0xfd); regmbc(0xff);
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200985 regmbc(0x177); regmbc(0x1b4); regmbc(0x233);
986 regmbc(0x24f); regmbc(0x1e8f); regmbc(0x1e99);
987 regmbc(0x1ef3); regmbc(0x1ef5); regmbc(0x1ef7);
988 regmbc(0x1ef9);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200989 return;
Bram Moolenaar0b94e292021-04-05 13:59:53 +0200990 case 'z': case 0x17a: case 0x17c: case 0x17e: case 0x1b6:
991 case 0x1d76: case 0x1d8e: case 0x1e91: case 0x1e93:
992 case 0x1e95: case 0x2c6c:
993 regmbc('z'); regmbc(0x17a); regmbc(0x17c);
994 regmbc(0x17e); regmbc(0x1b6); regmbc(0x1d76);
995 regmbc(0x1d8e); regmbc(0x1e91); regmbc(0x1e93);
996 regmbc(0x1e95); regmbc(0x2c6c);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200997 return;
998 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200999 }
1000 regmbc(c);
1001}
1002
1003/*
1004 * Emit a node.
1005 * Return pointer to generated code.
1006 */
1007 static char_u *
1008regnode(int op)
1009{
1010 char_u *ret;
1011
1012 ret = regcode;
1013 if (ret == JUST_CALC_SIZE)
1014 regsize += 3;
1015 else
1016 {
1017 *regcode++ = op;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001018 *regcode++ = NUL; // Null "next" pointer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001019 *regcode++ = NUL;
1020 }
1021 return ret;
1022}
1023
1024/*
1025 * Write a long as four bytes at "p" and return pointer to the next char.
1026 */
1027 static char_u *
1028re_put_long(char_u *p, long_u val)
1029{
1030 *p++ = (char_u) ((val >> 24) & 0377);
1031 *p++ = (char_u) ((val >> 16) & 0377);
1032 *p++ = (char_u) ((val >> 8) & 0377);
1033 *p++ = (char_u) (val & 0377);
1034 return p;
1035}
1036
1037/*
1038 * regnext - dig the "next" pointer out of a node
1039 * Returns NULL when calculating size, when there is no next item and when
1040 * there is an error.
1041 */
1042 static char_u *
1043regnext(char_u *p)
1044{
1045 int offset;
1046
1047 if (p == JUST_CALC_SIZE || reg_toolong)
1048 return NULL;
1049
1050 offset = NEXT(p);
1051 if (offset == 0)
1052 return NULL;
1053
1054 if (OP(p) == BACK)
1055 return p - offset;
1056 else
1057 return p + offset;
1058}
1059
1060/*
1061 * Set the next-pointer at the end of a node chain.
1062 */
1063 static void
1064regtail(char_u *p, char_u *val)
1065{
1066 char_u *scan;
1067 char_u *temp;
1068 int offset;
1069
1070 if (p == JUST_CALC_SIZE)
1071 return;
1072
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001073 // Find last node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001074 scan = p;
1075 for (;;)
1076 {
1077 temp = regnext(scan);
1078 if (temp == NULL)
1079 break;
1080 scan = temp;
1081 }
1082
1083 if (OP(scan) == BACK)
1084 offset = (int)(scan - val);
1085 else
1086 offset = (int)(val - scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001087 // When the offset uses more than 16 bits it can no longer fit in the two
1088 // bytes available. Use a global flag to avoid having to check return
1089 // values in too many places.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001090 if (offset > 0xffff)
1091 reg_toolong = TRUE;
1092 else
1093 {
1094 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
1095 *(scan + 2) = (char_u) (offset & 0377);
1096 }
1097}
1098
1099/*
1100 * Like regtail, on item after a BRANCH; nop if none.
1101 */
1102 static void
1103regoptail(char_u *p, char_u *val)
1104{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001105 // When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001106 if (p == NULL || p == JUST_CALC_SIZE
1107 || (OP(p) != BRANCH
1108 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
1109 return;
1110 regtail(OPERAND(p), val);
1111}
1112
1113/*
1114 * Insert an operator in front of already-emitted operand
1115 *
1116 * Means relocating the operand.
1117 */
1118 static void
1119reginsert(int op, char_u *opnd)
1120{
1121 char_u *src;
1122 char_u *dst;
1123 char_u *place;
1124
1125 if (regcode == JUST_CALC_SIZE)
1126 {
1127 regsize += 3;
1128 return;
1129 }
1130 src = regcode;
1131 regcode += 3;
1132 dst = regcode;
1133 while (src > opnd)
1134 *--dst = *--src;
1135
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001136 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001137 *place++ = op;
1138 *place++ = NUL;
1139 *place = NUL;
1140}
1141
1142/*
1143 * Insert an operator in front of already-emitted operand.
1144 * Add a number to the operator.
1145 */
1146 static void
1147reginsert_nr(int op, long val, char_u *opnd)
1148{
1149 char_u *src;
1150 char_u *dst;
1151 char_u *place;
1152
1153 if (regcode == JUST_CALC_SIZE)
1154 {
1155 regsize += 7;
1156 return;
1157 }
1158 src = regcode;
1159 regcode += 7;
1160 dst = regcode;
1161 while (src > opnd)
1162 *--dst = *--src;
1163
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001164 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001165 *place++ = op;
1166 *place++ = NUL;
1167 *place++ = NUL;
1168 re_put_long(place, (long_u)val);
1169}
1170
1171/*
1172 * Insert an operator in front of already-emitted operand.
1173 * The operator has the given limit values as operands. Also set next pointer.
1174 *
1175 * Means relocating the operand.
1176 */
1177 static void
1178reginsert_limits(
1179 int op,
1180 long minval,
1181 long maxval,
1182 char_u *opnd)
1183{
1184 char_u *src;
1185 char_u *dst;
1186 char_u *place;
1187
1188 if (regcode == JUST_CALC_SIZE)
1189 {
1190 regsize += 11;
1191 return;
1192 }
1193 src = regcode;
1194 regcode += 11;
1195 dst = regcode;
1196 while (src > opnd)
1197 *--dst = *--src;
1198
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001199 place = opnd; // Op node, where operand used to be.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001200 *place++ = op;
1201 *place++ = NUL;
1202 *place++ = NUL;
1203 place = re_put_long(place, (long_u)minval);
1204 place = re_put_long(place, (long_u)maxval);
1205 regtail(opnd, place);
1206}
1207
1208/*
1209 * Return TRUE if the back reference is legal. We must have seen the close
1210 * brace.
1211 * TODO: Should also check that we don't refer to something that is repeated
1212 * (+*=): what instance of the repetition should we match?
1213 */
1214 static int
1215seen_endbrace(int refnum)
1216{
1217 if (!had_endbrace[refnum])
1218 {
1219 char_u *p;
1220
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001221 // Trick: check if "@<=" or "@<!" follows, in which case
1222 // the \1 can appear before the referenced match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001223 for (p = regparse; *p != NUL; ++p)
1224 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
1225 break;
1226 if (*p == NUL)
1227 {
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001228 emsg(_(e_illegal_back_reference));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001229 rc_did_emsg = TRUE;
1230 return FALSE;
1231 }
1232 }
1233 return TRUE;
1234}
1235
1236/*
1237 * Parse the lowest level.
1238 *
1239 * Optimization: gobbles an entire sequence of ordinary characters so that
1240 * it can turn them into a single node, which is smaller to store and
1241 * faster to run. Don't do this when one_exactly is set.
1242 */
1243 static char_u *
1244regatom(int *flagp)
1245{
1246 char_u *ret;
1247 int flags;
1248 int c;
1249 char_u *p;
1250 int extra = 0;
1251 int save_prev_at_start = prev_at_start;
1252
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001253 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001254
1255 c = getchr();
1256 switch (c)
1257 {
1258 case Magic('^'):
1259 ret = regnode(BOL);
1260 break;
1261
1262 case Magic('$'):
1263 ret = regnode(EOL);
1264#if defined(FEAT_SYN_HL) || defined(PROTO)
1265 had_eol = TRUE;
1266#endif
1267 break;
1268
1269 case Magic('<'):
1270 ret = regnode(BOW);
1271 break;
1272
1273 case Magic('>'):
1274 ret = regnode(EOW);
1275 break;
1276
1277 case Magic('_'):
1278 c = no_Magic(getchr());
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001279 if (c == '^') // "\_^" is start-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001280 {
1281 ret = regnode(BOL);
1282 break;
1283 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001284 if (c == '$') // "\_$" is end-of-line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001285 {
1286 ret = regnode(EOL);
1287#if defined(FEAT_SYN_HL) || defined(PROTO)
1288 had_eol = TRUE;
1289#endif
1290 break;
1291 }
1292
1293 extra = ADD_NL;
1294 *flagp |= HASNL;
1295
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001296 // "\_[" is character range plus newline
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001297 if (c == '[')
1298 goto collection;
1299
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001300 // "\_x" is character class plus newline
1301 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001302
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001303 // Character classes.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001304 case Magic('.'):
1305 case Magic('i'):
1306 case Magic('I'):
1307 case Magic('k'):
1308 case Magic('K'):
1309 case Magic('f'):
1310 case Magic('F'):
1311 case Magic('p'):
1312 case Magic('P'):
1313 case Magic('s'):
1314 case Magic('S'):
1315 case Magic('d'):
1316 case Magic('D'):
1317 case Magic('x'):
1318 case Magic('X'):
1319 case Magic('o'):
1320 case Magic('O'):
1321 case Magic('w'):
1322 case Magic('W'):
1323 case Magic('h'):
1324 case Magic('H'):
1325 case Magic('a'):
1326 case Magic('A'):
1327 case Magic('l'):
1328 case Magic('L'):
1329 case Magic('u'):
1330 case Magic('U'):
1331 p = vim_strchr(classchars, no_Magic(c));
1332 if (p == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001333 EMSG_RET_NULL(_(e_invalid_use_of_underscore));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001334
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001335 // When '.' is followed by a composing char ignore the dot, so that
1336 // the composing char is matched here.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001337 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr()))
1338 {
1339 c = getchr();
1340 goto do_multibyte;
1341 }
1342 ret = regnode(classcodes[p - classchars] + extra);
1343 *flagp |= HASWIDTH | SIMPLE;
1344 break;
1345
1346 case Magic('n'):
1347 if (reg_string)
1348 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001349 // In a string "\n" matches a newline character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001350 ret = regnode(EXACTLY);
1351 regc(NL);
1352 regc(NUL);
1353 *flagp |= HASWIDTH | SIMPLE;
1354 }
1355 else
1356 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001357 // In buffer text "\n" matches the end of a line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001358 ret = regnode(NEWL);
1359 *flagp |= HASWIDTH | HASNL;
1360 }
1361 break;
1362
1363 case Magic('('):
1364 if (one_exactly)
1365 EMSG_ONE_RET_NULL;
1366 ret = reg(REG_PAREN, &flags);
1367 if (ret == NULL)
1368 return NULL;
1369 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1370 break;
1371
1372 case NUL:
1373 case Magic('|'):
1374 case Magic('&'):
1375 case Magic(')'):
1376 if (one_exactly)
1377 EMSG_ONE_RET_NULL;
Bram Moolenaard0819d12021-12-31 23:15:53 +00001378 // Supposed to be caught earlier.
1379 IEMSG_RET_NULL(_(e_internal_error_in_regexp));
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001380 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001381
1382 case Magic('='):
1383 case Magic('?'):
1384 case Magic('+'):
1385 case Magic('@'):
1386 case Magic('{'):
1387 case Magic('*'):
1388 c = no_Magic(c);
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001389 EMSG3_RET_NULL(_(e_str_chr_follows_nothing),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001390 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL), c);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001391 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001392
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001393 case Magic('~'): // previous substitute pattern
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001394 if (reg_prev_sub != NULL)
1395 {
1396 char_u *lp;
1397
1398 ret = regnode(EXACTLY);
1399 lp = reg_prev_sub;
1400 while (*lp != NUL)
1401 regc(*lp++);
1402 regc(NUL);
1403 if (*reg_prev_sub != NUL)
1404 {
1405 *flagp |= HASWIDTH;
1406 if ((lp - reg_prev_sub) == 1)
1407 *flagp |= SIMPLE;
1408 }
1409 }
1410 else
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001411 EMSG_RET_NULL(_(e_no_previous_substitute_regular_expression));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001412 break;
1413
1414 case Magic('1'):
1415 case Magic('2'):
1416 case Magic('3'):
1417 case Magic('4'):
1418 case Magic('5'):
1419 case Magic('6'):
1420 case Magic('7'):
1421 case Magic('8'):
1422 case Magic('9'):
1423 {
1424 int refnum;
1425
1426 refnum = c - Magic('0');
1427 if (!seen_endbrace(refnum))
1428 return NULL;
1429 ret = regnode(BACKREF + refnum);
1430 }
1431 break;
1432
1433 case Magic('z'):
1434 {
1435 c = no_Magic(getchr());
1436 switch (c)
1437 {
1438#ifdef FEAT_SYN_HL
1439 case '(': if ((reg_do_extmatch & REX_SET) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001440 EMSG_RET_NULL(_(e_z_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001441 if (one_exactly)
1442 EMSG_ONE_RET_NULL;
1443 ret = reg(REG_ZPAREN, &flags);
1444 if (ret == NULL)
1445 return NULL;
1446 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1447 re_has_z = REX_SET;
1448 break;
1449
1450 case '1':
1451 case '2':
1452 case '3':
1453 case '4':
1454 case '5':
1455 case '6':
1456 case '7':
1457 case '8':
1458 case '9': if ((reg_do_extmatch & REX_USE) == 0)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001459 EMSG_RET_NULL(_(e_z1_z9_not_allowed_here));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001460 ret = regnode(ZREF + c - '0');
1461 re_has_z = REX_USE;
1462 break;
1463#endif
1464
1465 case 's': ret = regnode(MOPEN + 0);
1466 if (re_mult_next("\\zs") == FAIL)
1467 return NULL;
1468 break;
1469
1470 case 'e': ret = regnode(MCLOSE + 0);
1471 if (re_mult_next("\\ze") == FAIL)
1472 return NULL;
1473 break;
1474
Bram Moolenaarb2810f12022-01-08 21:38:52 +00001475 default: EMSG_RET_NULL(_(e_invalid_character_after_bsl_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001476 }
1477 }
1478 break;
1479
1480 case Magic('%'):
1481 {
1482 c = no_Magic(getchr());
1483 switch (c)
1484 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001485 // () without a back reference
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001486 case '(':
1487 if (one_exactly)
1488 EMSG_ONE_RET_NULL;
1489 ret = reg(REG_NPAREN, &flags);
1490 if (ret == NULL)
1491 return NULL;
1492 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1493 break;
1494
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001495 // Catch \%^ and \%$ regardless of where they appear in the
1496 // pattern -- regardless of whether or not it makes sense.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001497 case '^':
1498 ret = regnode(RE_BOF);
1499 break;
1500
1501 case '$':
1502 ret = regnode(RE_EOF);
1503 break;
1504
1505 case '#':
1506 ret = regnode(CURSOR);
1507 break;
1508
1509 case 'V':
1510 ret = regnode(RE_VISUAL);
1511 break;
1512
1513 case 'C':
1514 ret = regnode(RE_COMPOSING);
1515 break;
1516
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001517 // \%[abc]: Emit as a list of branches, all ending at the last
1518 // branch which matches nothing.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001519 case '[':
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001520 if (one_exactly) // doesn't nest
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001521 EMSG_ONE_RET_NULL;
1522 {
1523 char_u *lastbranch;
1524 char_u *lastnode = NULL;
1525 char_u *br;
1526
1527 ret = NULL;
1528 while ((c = getchr()) != ']')
1529 {
1530 if (c == NUL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001531 EMSG2_RET_NULL(_(e_missing_sb_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001532 reg_magic == MAGIC_ALL);
1533 br = regnode(BRANCH);
1534 if (ret == NULL)
1535 ret = br;
1536 else
1537 {
1538 regtail(lastnode, br);
1539 if (reg_toolong)
1540 return NULL;
1541 }
1542
1543 ungetchr();
1544 one_exactly = TRUE;
1545 lastnode = regatom(flagp);
1546 one_exactly = FALSE;
1547 if (lastnode == NULL)
1548 return NULL;
1549 }
1550 if (ret == NULL)
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001551 EMSG2_RET_NULL(_(e_empty_str_brackets),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001552 reg_magic == MAGIC_ALL);
1553 lastbranch = regnode(BRANCH);
1554 br = regnode(NOTHING);
1555 if (ret != JUST_CALC_SIZE)
1556 {
1557 regtail(lastnode, br);
1558 regtail(lastbranch, br);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001559 // connect all branches to the NOTHING
1560 // branch at the end
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001561 for (br = ret; br != lastnode; )
1562 {
1563 if (OP(br) == BRANCH)
1564 {
1565 regtail(br, lastbranch);
1566 if (reg_toolong)
1567 return NULL;
1568 br = OPERAND(br);
1569 }
1570 else
1571 br = regnext(br);
1572 }
1573 }
1574 *flagp &= ~(HASWIDTH | SIMPLE);
1575 break;
1576 }
1577
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001578 case 'd': // %d123 decimal
1579 case 'o': // %o123 octal
1580 case 'x': // %xab hex 2
1581 case 'u': // %uabcd hex 4
1582 case 'U': // %U1234abcd hex 8
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001583 {
1584 long i;
1585
1586 switch (c)
1587 {
1588 case 'd': i = getdecchrs(); break;
1589 case 'o': i = getoctchrs(); break;
1590 case 'x': i = gethexchrs(2); break;
1591 case 'u': i = gethexchrs(4); break;
1592 case 'U': i = gethexchrs(8); break;
1593 default: i = -1; break;
1594 }
1595
1596 if (i < 0 || i > INT_MAX)
1597 EMSG2_RET_NULL(
Bram Moolenaara6f79292022-01-04 21:30:47 +00001598 _(e_invalid_character_after_str_2),
1599 reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001600 if (use_multibytecode(i))
1601 ret = regnode(MULTIBYTECODE);
1602 else
1603 ret = regnode(EXACTLY);
1604 if (i == 0)
1605 regc(0x0a);
1606 else
1607 regmbc(i);
1608 regc(NUL);
1609 *flagp |= HASWIDTH;
1610 break;
1611 }
1612
1613 default:
1614 if (VIM_ISDIGIT(c) || c == '<' || c == '>'
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001615 || c == '\'' || c == '.')
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001616 {
1617 long_u n = 0;
1618 int cmp;
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001619 int cur = FALSE;
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001620 int got_digit = FALSE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001621
1622 cmp = c;
1623 if (cmp == '<' || cmp == '>')
1624 c = getchr();
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001625 if (no_Magic(c) == '.')
1626 {
1627 cur = TRUE;
1628 c = getchr();
1629 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001630 while (VIM_ISDIGIT(c))
1631 {
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001632 got_digit = TRUE;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001633 n = n * 10 + (c - '0');
1634 c = getchr();
1635 }
1636 if (c == '\'' && n == 0)
1637 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001638 // "\%'m", "\%<'m" and "\%>'m": Mark
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001639 c = getchr();
1640 ret = regnode(RE_MARK);
1641 if (ret == JUST_CALC_SIZE)
1642 regsize += 2;
1643 else
1644 {
1645 *regcode++ = c;
1646 *regcode++ = cmp;
1647 }
1648 break;
1649 }
Bram Moolenaar72bb10d2022-04-05 14:00:32 +01001650 else if ((c == 'l' || c == 'c' || c == 'v')
1651 && (cur || got_digit))
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001652 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001653 if (cur && n)
1654 {
Bram Moolenaar91ff3d42022-04-04 18:32:32 +01001655 semsg(_(e_regexp_number_after_dot_pos_search_chr),
1656 no_Magic(c));
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001657 rc_did_emsg = TRUE;
1658 return NULL;
1659 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001660 if (c == 'l')
1661 {
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001662 if (cur)
1663 n = curwin->w_cursor.lnum;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001664 ret = regnode(RE_LNUM);
1665 if (save_prev_at_start)
1666 at_start = TRUE;
1667 }
1668 else if (c == 'c')
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001669 {
1670 if (cur)
1671 {
1672 n = curwin->w_cursor.col;
1673 n++;
1674 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001675 ret = regnode(RE_COL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001676 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001677 else
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001678 {
1679 if (cur)
1680 {
1681 colnr_T vcol = 0;
1682
1683 getvvcol(curwin, &curwin->w_cursor,
1684 NULL, NULL, &vcol);
1685 ++vcol;
1686 n = vcol;
1687 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001688 ret = regnode(RE_VCOL);
Bram Moolenaar04db26b2021-07-05 20:15:23 +02001689 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001690 if (ret == JUST_CALC_SIZE)
1691 regsize += 5;
1692 else
1693 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001694 // put the number and the optional
1695 // comparator after the opcode
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001696 regcode = re_put_long(regcode, n);
1697 *regcode++ = cmp;
1698 }
1699 break;
1700 }
1701 }
1702
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001703 EMSG2_RET_NULL(_(e_invalid_character_after_str),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001704 reg_magic == MAGIC_ALL);
1705 }
1706 }
1707 break;
1708
1709 case Magic('['):
1710collection:
1711 {
1712 char_u *lp;
1713
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001714 // If there is no matching ']', we assume the '[' is a normal
1715 // character. This makes 'incsearch' and ":help [" work.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001716 lp = skip_anyof(regparse);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001717 if (*lp == ']') // there is a matching ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001718 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001719 int startc = -1; // > 0 when next '-' is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001720 int endc;
1721
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001722 // In a character class, different parsing rules apply.
1723 // Not even \ is special anymore, nothing is.
1724 if (*regparse == '^') // Complement of range.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001725 {
1726 ret = regnode(ANYBUT + extra);
1727 regparse++;
1728 }
1729 else
1730 ret = regnode(ANYOF + extra);
1731
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001732 // At the start ']' and '-' mean the literal character.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001733 if (*regparse == ']' || *regparse == '-')
1734 {
1735 startc = *regparse;
1736 regc(*regparse++);
1737 }
1738
1739 while (*regparse != NUL && *regparse != ']')
1740 {
1741 if (*regparse == '-')
1742 {
1743 ++regparse;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001744 // The '-' is not used for a range at the end and
1745 // after or before a '\n'.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001746 if (*regparse == ']' || *regparse == NUL
1747 || startc == -1
1748 || (regparse[0] == '\\' && regparse[1] == 'n'))
1749 {
1750 regc('-');
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001751 startc = '-'; // [--x] is a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001752 }
1753 else
1754 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001755 // Also accept "a-[.z.]"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001756 endc = 0;
1757 if (*regparse == '[')
1758 endc = get_coll_element(&regparse);
1759 if (endc == 0)
1760 {
1761 if (has_mbyte)
1762 endc = mb_ptr2char_adv(&regparse);
1763 else
1764 endc = *regparse++;
1765 }
1766
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001767 // Handle \o40, \x20 and \u20AC style sequences
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001768 if (endc == '\\' && !reg_cpo_lit && !reg_cpo_bsl)
1769 endc = coll_get_char();
1770
1771 if (startc > endc)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001772 EMSG_RET_NULL(_(e_reverse_range_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001773 if (has_mbyte && ((*mb_char2len)(startc) > 1
1774 || (*mb_char2len)(endc) > 1))
1775 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001776 // Limit to a range of 256 chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001777 if (endc > startc + 256)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001778 EMSG_RET_NULL(_(e_range_too_large_in_character_class));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001779 while (++startc <= endc)
1780 regmbc(startc);
1781 }
1782 else
1783 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001784 while (++startc <= endc)
Bram Moolenaar424bcae2022-01-31 14:59:41 +00001785 regc(startc);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001786 }
1787 startc = -1;
1788 }
1789 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001790 // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1791 // accepts "\t", "\e", etc., but only when the 'l' flag in
1792 // 'cpoptions' is not included.
1793 // Posix doesn't recognize backslash at all.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001794 else if (*regparse == '\\'
1795 && !reg_cpo_bsl
1796 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1797 || (!reg_cpo_lit
1798 && vim_strchr(REGEXP_ABBR,
1799 regparse[1]) != NULL)))
1800 {
1801 regparse++;
1802 if (*regparse == 'n')
1803 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001804 // '\n' in range: also match NL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001805 if (ret != JUST_CALC_SIZE)
1806 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001807 // Using \n inside [^] does not change what
1808 // matches. "[^\n]" is the same as ".".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001809 if (*ret == ANYOF)
1810 {
1811 *ret = ANYOF + ADD_NL;
1812 *flagp |= HASNL;
1813 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001814 // else: must have had a \n already
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001815 }
1816 regparse++;
1817 startc = -1;
1818 }
1819 else if (*regparse == 'd'
1820 || *regparse == 'o'
1821 || *regparse == 'x'
1822 || *regparse == 'u'
1823 || *regparse == 'U')
1824 {
1825 startc = coll_get_char();
1826 if (startc == 0)
1827 regc(0x0a);
1828 else
1829 regmbc(startc);
1830 }
1831 else
1832 {
1833 startc = backslash_trans(*regparse++);
1834 regc(startc);
1835 }
1836 }
1837 else if (*regparse == '[')
1838 {
1839 int c_class;
1840 int cu;
1841
1842 c_class = get_char_class(&regparse);
1843 startc = -1;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001844 // Characters assumed to be 8 bits!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001845 switch (c_class)
1846 {
1847 case CLASS_NONE:
1848 c_class = get_equi_class(&regparse);
1849 if (c_class != 0)
1850 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001851 // produce equivalence class
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001852 reg_equi_class(c_class);
1853 }
1854 else if ((c_class =
1855 get_coll_element(&regparse)) != 0)
1856 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001857 // produce a collating element
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001858 regmbc(c_class);
1859 }
1860 else
1861 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001862 // literal '[', allow [[-x] as a range
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001863 startc = *regparse++;
1864 regc(startc);
1865 }
1866 break;
1867 case CLASS_ALNUM:
1868 for (cu = 1; cu < 128; cu++)
1869 if (isalnum(cu))
1870 regmbc(cu);
1871 break;
1872 case CLASS_ALPHA:
1873 for (cu = 1; cu < 128; cu++)
1874 if (isalpha(cu))
1875 regmbc(cu);
1876 break;
1877 case CLASS_BLANK:
1878 regc(' ');
1879 regc('\t');
1880 break;
1881 case CLASS_CNTRL:
1882 for (cu = 1; cu <= 127; cu++)
1883 if (iscntrl(cu))
1884 regmbc(cu);
1885 break;
1886 case CLASS_DIGIT:
1887 for (cu = 1; cu <= 127; cu++)
1888 if (VIM_ISDIGIT(cu))
1889 regmbc(cu);
1890 break;
1891 case CLASS_GRAPH:
1892 for (cu = 1; cu <= 127; cu++)
1893 if (isgraph(cu))
1894 regmbc(cu);
1895 break;
1896 case CLASS_LOWER:
1897 for (cu = 1; cu <= 255; cu++)
1898 if (MB_ISLOWER(cu) && cu != 170
1899 && cu != 186)
1900 regmbc(cu);
1901 break;
1902 case CLASS_PRINT:
1903 for (cu = 1; cu <= 255; cu++)
1904 if (vim_isprintc(cu))
1905 regmbc(cu);
1906 break;
1907 case CLASS_PUNCT:
1908 for (cu = 1; cu < 128; cu++)
1909 if (ispunct(cu))
1910 regmbc(cu);
1911 break;
1912 case CLASS_SPACE:
1913 for (cu = 9; cu <= 13; cu++)
1914 regc(cu);
1915 regc(' ');
1916 break;
1917 case CLASS_UPPER:
1918 for (cu = 1; cu <= 255; cu++)
1919 if (MB_ISUPPER(cu))
1920 regmbc(cu);
1921 break;
1922 case CLASS_XDIGIT:
1923 for (cu = 1; cu <= 255; cu++)
1924 if (vim_isxdigit(cu))
1925 regmbc(cu);
1926 break;
1927 case CLASS_TAB:
1928 regc('\t');
1929 break;
1930 case CLASS_RETURN:
1931 regc('\r');
1932 break;
1933 case CLASS_BACKSPACE:
1934 regc('\b');
1935 break;
1936 case CLASS_ESCAPE:
1937 regc('\033');
1938 break;
1939 case CLASS_IDENT:
1940 for (cu = 1; cu <= 255; cu++)
1941 if (vim_isIDc(cu))
1942 regmbc(cu);
1943 break;
1944 case CLASS_KEYWORD:
1945 for (cu = 1; cu <= 255; cu++)
1946 if (reg_iswordc(cu))
1947 regmbc(cu);
1948 break;
1949 case CLASS_FNAME:
1950 for (cu = 1; cu <= 255; cu++)
1951 if (vim_isfilec(cu))
1952 regmbc(cu);
1953 break;
1954 }
1955 }
1956 else
1957 {
1958 if (has_mbyte)
1959 {
1960 int len;
1961
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001962 // produce a multibyte character, including any
1963 // following composing characters
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001964 startc = mb_ptr2char(regparse);
1965 len = (*mb_ptr2len)(regparse);
1966 if (enc_utf8 && utf_char2len(startc) != len)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001967 startc = -1; // composing chars
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001968 while (--len >= 0)
1969 regc(*regparse++);
1970 }
1971 else
1972 {
1973 startc = *regparse++;
1974 regc(startc);
1975 }
1976 }
1977 }
1978 regc(NUL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001979 prevchr_len = 1; // last char was the ']'
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001980 if (*regparse != ']')
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00001981 EMSG_RET_NULL(_(e_too_many_brackets)); // Cannot happen?
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001982 skipchr(); // let's be friends with the lexer again
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001983 *flagp |= HASWIDTH | SIMPLE;
1984 break;
1985 }
1986 else if (reg_strict)
Bram Moolenaar677658a2022-01-05 16:09:06 +00001987 EMSG2_RET_NULL(_(e_missing_rsb_after_str_lsb),
1988 reg_magic > MAGIC_OFF);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001989 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001990 // FALLTHROUGH
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001991
1992 default:
1993 {
1994 int len;
1995
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02001996 // A multi-byte character is handled as a separate atom if it's
1997 // before a multi and when it's a composing char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001998 if (use_multibytecode(c))
1999 {
2000do_multibyte:
2001 ret = regnode(MULTIBYTECODE);
2002 regmbc(c);
2003 *flagp |= HASWIDTH | SIMPLE;
2004 break;
2005 }
2006
2007 ret = regnode(EXACTLY);
2008
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002009 // Append characters as long as:
2010 // - there is no following multi, we then need the character in
2011 // front of it as a single character operand
2012 // - not running into a Magic character
2013 // - "one_exactly" is not set
2014 // But always emit at least one character. Might be a Multi,
2015 // e.g., a "[" without matching "]".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002016 for (len = 0; c != NUL && (len == 0
2017 || (re_multi_type(peekchr()) == NOT_MULTI
2018 && !one_exactly
2019 && !is_Magic(c))); ++len)
2020 {
2021 c = no_Magic(c);
2022 if (has_mbyte)
2023 {
2024 regmbc(c);
2025 if (enc_utf8)
2026 {
2027 int l;
2028
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002029 // Need to get composing character too.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002030 for (;;)
2031 {
2032 l = utf_ptr2len(regparse);
2033 if (!UTF_COMPOSINGLIKE(regparse, regparse + l))
2034 break;
2035 regmbc(utf_ptr2char(regparse));
2036 skipchr();
2037 }
2038 }
2039 }
2040 else
2041 regc(c);
2042 c = getchr();
2043 }
2044 ungetchr();
2045
2046 regc(NUL);
2047 *flagp |= HASWIDTH;
2048 if (len == 1)
2049 *flagp |= SIMPLE;
2050 }
2051 break;
2052 }
2053
2054 return ret;
2055}
2056
2057/*
2058 * Parse something followed by possible [*+=].
2059 *
2060 * Note that the branching code sequences used for = and the general cases
2061 * of * and + are somewhat optimized: they use the same NOTHING node as
2062 * both the endmarker for their branch list and the body of the last branch.
2063 * It might seem that this node could be dispensed with entirely, but the
2064 * endmarker role is not redundant.
2065 */
2066 static char_u *
2067regpiece(int *flagp)
2068{
2069 char_u *ret;
2070 int op;
2071 char_u *next;
2072 int flags;
2073 long minval;
2074 long maxval;
2075
2076 ret = regatom(&flags);
2077 if (ret == NULL)
2078 return NULL;
2079
2080 op = peekchr();
2081 if (re_multi_type(op) == NOT_MULTI)
2082 {
2083 *flagp = flags;
2084 return ret;
2085 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002086 // default flags
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002087 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
2088
2089 skipchr();
2090 switch (op)
2091 {
2092 case Magic('*'):
2093 if (flags & SIMPLE)
2094 reginsert(STAR, ret);
2095 else
2096 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002097 // Emit x* as (x&|), where & means "self".
2098 reginsert(BRANCH, ret); // Either x
2099 regoptail(ret, regnode(BACK)); // and loop
2100 regoptail(ret, ret); // back
2101 regtail(ret, regnode(BRANCH)); // or
2102 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002103 }
2104 break;
2105
2106 case Magic('+'):
2107 if (flags & SIMPLE)
2108 reginsert(PLUS, ret);
2109 else
2110 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002111 // Emit x+ as x(&|), where & means "self".
2112 next = regnode(BRANCH); // Either
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002113 regtail(ret, next);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002114 regtail(regnode(BACK), ret); // loop back
2115 regtail(next, regnode(BRANCH)); // or
2116 regtail(ret, regnode(NOTHING)); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002117 }
2118 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2119 break;
2120
2121 case Magic('@'):
2122 {
2123 int lop = END;
2124 long nr;
2125
2126 nr = getdecchrs();
2127 switch (no_Magic(getchr()))
2128 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002129 case '=': lop = MATCH; break; // \@=
2130 case '!': lop = NOMATCH; break; // \@!
2131 case '>': lop = SUBPAT; break; // \@>
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002132 case '<': switch (no_Magic(getchr()))
2133 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002134 case '=': lop = BEHIND; break; // \@<=
2135 case '!': lop = NOBEHIND; break; // \@<!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002136 }
2137 }
2138 if (lop == END)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002139 EMSG2_RET_NULL(_(e_invalid_character_after_str_at),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002140 reg_magic == MAGIC_ALL);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002141 // Look behind must match with behind_pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002142 if (lop == BEHIND || lop == NOBEHIND)
2143 {
2144 regtail(ret, regnode(BHPOS));
2145 *flagp |= HASLOOKBH;
2146 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002147 regtail(ret, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002148 if (lop == BEHIND || lop == NOBEHIND)
2149 {
2150 if (nr < 0)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002151 nr = 0; // no limit is same as zero limit
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002152 reginsert_nr(lop, nr, ret);
2153 }
2154 else
2155 reginsert(lop, ret);
2156 break;
2157 }
2158
2159 case Magic('?'):
2160 case Magic('='):
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002161 // Emit x= as (x|)
2162 reginsert(BRANCH, ret); // Either x
2163 regtail(ret, regnode(BRANCH)); // or
2164 next = regnode(NOTHING); // null.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002165 regtail(ret, next);
2166 regoptail(ret, next);
2167 break;
2168
2169 case Magic('{'):
2170 if (!read_limits(&minval, &maxval))
2171 return NULL;
2172 if (flags & SIMPLE)
2173 {
2174 reginsert(BRACE_SIMPLE, ret);
2175 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2176 }
2177 else
2178 {
2179 if (num_complex_braces >= 10)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002180 EMSG2_RET_NULL(_(e_too_many_complex_str_curly),
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002181 reg_magic == MAGIC_ALL);
2182 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
2183 regoptail(ret, regnode(BACK));
2184 regoptail(ret, ret);
2185 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
2186 ++num_complex_braces;
2187 }
2188 if (minval > 0 && maxval > 0)
2189 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
2190 break;
2191 }
2192 if (re_multi_type(peekchr()) != NOT_MULTI)
2193 {
2194 // Can't have a multi follow a multi.
2195 if (peekchr() == Magic('*'))
Bram Moolenaar12f3c1b2021-12-05 21:46:34 +00002196 EMSG2_RET_NULL(_(e_nested_str), reg_magic >= MAGIC_ON);
2197 EMSG3_RET_NULL(_(e_nested_str_chr), reg_magic == MAGIC_ALL,
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002198 no_Magic(peekchr()));
2199 }
2200
2201 return ret;
2202}
2203
2204/*
2205 * Parse one alternative of an | or & operator.
2206 * Implements the concatenation operator.
2207 */
2208 static char_u *
2209regconcat(int *flagp)
2210{
2211 char_u *first = NULL;
2212 char_u *chain = NULL;
2213 char_u *latest;
2214 int flags;
2215 int cont = TRUE;
2216
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002217 *flagp = WORST; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002218
2219 while (cont)
2220 {
2221 switch (peekchr())
2222 {
2223 case NUL:
2224 case Magic('|'):
2225 case Magic('&'):
2226 case Magic(')'):
2227 cont = FALSE;
2228 break;
2229 case Magic('Z'):
2230 regflags |= RF_ICOMBINE;
2231 skipchr_keepstart();
2232 break;
2233 case Magic('c'):
2234 regflags |= RF_ICASE;
2235 skipchr_keepstart();
2236 break;
2237 case Magic('C'):
2238 regflags |= RF_NOICASE;
2239 skipchr_keepstart();
2240 break;
2241 case Magic('v'):
2242 reg_magic = MAGIC_ALL;
2243 skipchr_keepstart();
2244 curchr = -1;
2245 break;
2246 case Magic('m'):
2247 reg_magic = MAGIC_ON;
2248 skipchr_keepstart();
2249 curchr = -1;
2250 break;
2251 case Magic('M'):
2252 reg_magic = MAGIC_OFF;
2253 skipchr_keepstart();
2254 curchr = -1;
2255 break;
2256 case Magic('V'):
2257 reg_magic = MAGIC_NONE;
2258 skipchr_keepstart();
2259 curchr = -1;
2260 break;
2261 default:
2262 latest = regpiece(&flags);
2263 if (latest == NULL || reg_toolong)
2264 return NULL;
2265 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002266 if (chain == NULL) // First piece.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002267 *flagp |= flags & SPSTART;
2268 else
2269 regtail(chain, latest);
2270 chain = latest;
2271 if (first == NULL)
2272 first = latest;
2273 break;
2274 }
2275 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002276 if (first == NULL) // Loop ran zero times.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002277 first = regnode(NOTHING);
2278 return first;
2279}
2280
2281/*
2282 * Parse one alternative of an | operator.
2283 * Implements the & operator.
2284 */
2285 static char_u *
2286regbranch(int *flagp)
2287{
2288 char_u *ret;
2289 char_u *chain = NULL;
2290 char_u *latest;
2291 int flags;
2292
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002293 *flagp = WORST | HASNL; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002294
2295 ret = regnode(BRANCH);
2296 for (;;)
2297 {
2298 latest = regconcat(&flags);
2299 if (latest == NULL)
2300 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002301 // If one of the branches has width, the whole thing has. If one of
2302 // the branches anchors at start-of-line, the whole thing does.
2303 // If one of the branches uses look-behind, the whole thing does.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002304 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002305 // If one of the branches doesn't match a line-break, the whole thing
2306 // doesn't.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002307 *flagp &= ~HASNL | (flags & HASNL);
2308 if (chain != NULL)
2309 regtail(chain, latest);
2310 if (peekchr() != Magic('&'))
2311 break;
2312 skipchr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002313 regtail(latest, regnode(END)); // operand ends
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002314 if (reg_toolong)
2315 break;
2316 reginsert(MATCH, latest);
2317 chain = latest;
2318 }
2319
2320 return ret;
2321}
2322
2323/*
2324 * Parse regular expression, i.e. main body or parenthesized thing.
2325 *
2326 * Caller must absorb opening parenthesis.
2327 *
2328 * Combining parenthesis handling with the base level of regular expression
2329 * is a trifle forced, but the need to tie the tails of the branches to what
2330 * follows makes it hard to avoid.
2331 */
2332 static char_u *
2333reg(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002334 int paren, // REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002335 int *flagp)
2336{
2337 char_u *ret;
2338 char_u *br;
2339 char_u *ender;
2340 int parno = 0;
2341 int flags;
2342
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002343 *flagp = HASWIDTH; // Tentatively.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002344
2345#ifdef FEAT_SYN_HL
2346 if (paren == REG_ZPAREN)
2347 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002348 // Make a ZOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002349 if (regnzpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002350 EMSG_RET_NULL(_(e_too_many_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002351 parno = regnzpar;
2352 regnzpar++;
2353 ret = regnode(ZOPEN + parno);
2354 }
2355 else
2356#endif
2357 if (paren == REG_PAREN)
2358 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002359 // Make a MOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002360 if (regnpar >= NSUBEXP)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002361 EMSG2_RET_NULL(_(e_too_many_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002362 parno = regnpar;
2363 ++regnpar;
2364 ret = regnode(MOPEN + parno);
2365 }
2366 else if (paren == REG_NPAREN)
2367 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002368 // Make a NOPEN node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002369 ret = regnode(NOPEN);
2370 }
2371 else
2372 ret = NULL;
2373
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002374 // Pick up the branches, linking them together.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002375 br = regbranch(&flags);
2376 if (br == NULL)
2377 return NULL;
2378 if (ret != NULL)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002379 regtail(ret, br); // [MZ]OPEN -> first.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002380 else
2381 ret = br;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002382 // If one of the branches can be zero-width, the whole thing can.
2383 // If one of the branches has * at start or matches a line-break, the
2384 // whole thing can.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002385 if (!(flags & HASWIDTH))
2386 *flagp &= ~HASWIDTH;
2387 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2388 while (peekchr() == Magic('|'))
2389 {
2390 skipchr();
2391 br = regbranch(&flags);
2392 if (br == NULL || reg_toolong)
2393 return NULL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002394 regtail(ret, br); // BRANCH -> BRANCH.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002395 if (!(flags & HASWIDTH))
2396 *flagp &= ~HASWIDTH;
2397 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
2398 }
2399
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002400 // Make a closing node, and hook it on the end.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002401 ender = regnode(
2402#ifdef FEAT_SYN_HL
2403 paren == REG_ZPAREN ? ZCLOSE + parno :
2404#endif
2405 paren == REG_PAREN ? MCLOSE + parno :
2406 paren == REG_NPAREN ? NCLOSE : END);
2407 regtail(ret, ender);
2408
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002409 // Hook the tails of the branches to the closing node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002410 for (br = ret; br != NULL; br = regnext(br))
2411 regoptail(br, ender);
2412
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002413 // Check for proper termination.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002414 if (paren != REG_NOPAREN && getchr() != Magic(')'))
2415 {
2416#ifdef FEAT_SYN_HL
2417 if (paren == REG_ZPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002418 EMSG_RET_NULL(_(e_unmatched_z));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002419 else
2420#endif
2421 if (paren == REG_NPAREN)
Bram Moolenaard8e44472021-07-21 22:20:33 +02002422 EMSG2_RET_NULL(_(e_unmatched_str_percent_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002423 else
Bram Moolenaard8e44472021-07-21 22:20:33 +02002424 EMSG2_RET_NULL(_(e_unmatched_str_open), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002425 }
2426 else if (paren == REG_NOPAREN && peekchr() != NUL)
2427 {
2428 if (curchr == Magic(')'))
Bram Moolenaard8e44472021-07-21 22:20:33 +02002429 EMSG2_RET_NULL(_(e_unmatched_str_close), reg_magic == MAGIC_ALL);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002430 else
Bram Moolenaar74409f62022-01-01 15:58:22 +00002431 EMSG_RET_NULL(_(e_trailing_characters)); // "Can't happen".
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002432 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002433 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002434 // Here we set the flag allowing back references to this set of
2435 // parentheses.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002436 if (paren == REG_PAREN)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002437 had_endbrace[parno] = TRUE; // have seen the close paren
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002438 return ret;
2439}
2440
2441/*
2442 * bt_regcomp() - compile a regular expression into internal code for the
2443 * traditional back track matcher.
2444 * Returns the program in allocated space. Returns NULL for an error.
2445 *
2446 * We can't allocate space until we know how big the compiled form will be,
2447 * but we can't compile it (and thus know how big it is) until we've got a
2448 * place to put the code. So we cheat: we compile it twice, once with code
2449 * generation turned off and size counting turned on, and once "for real".
2450 * This also means that we don't allocate space until we are sure that the
2451 * thing really will compile successfully, and we never have to move the
2452 * code and thus invalidate pointers into it. (Note that it has to be in
2453 * one piece because vim_free() must be able to free it all.)
2454 *
2455 * Whether upper/lower case is to be ignored is decided when executing the
2456 * program, it does not matter here.
2457 *
2458 * Beware that the optimization-preparation code in here knows about some
2459 * of the structure of the compiled regexp.
2460 * "re_flags": RE_MAGIC and/or RE_STRING.
2461 */
2462 static regprog_T *
2463bt_regcomp(char_u *expr, int re_flags)
2464{
2465 bt_regprog_T *r;
2466 char_u *scan;
2467 char_u *longest;
2468 int len;
2469 int flags;
2470
2471 if (expr == NULL)
Bram Moolenaare29a27f2021-07-20 21:07:36 +02002472 IEMSG_RET_NULL(_(e_null_argument));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002473
2474 init_class_tab();
2475
2476 // First pass: determine size, legality.
2477 regcomp_start(expr, re_flags);
2478 regcode = JUST_CALC_SIZE;
2479 regc(REGMAGIC);
2480 if (reg(REG_NOPAREN, &flags) == NULL)
2481 return NULL;
2482
2483 // Allocate space.
2484 r = alloc(offsetof(bt_regprog_T, program) + regsize);
2485 if (r == NULL)
2486 return NULL;
2487 r->re_in_use = FALSE;
2488
2489 // Second pass: emit code.
2490 regcomp_start(expr, re_flags);
2491 regcode = r->program;
2492 regc(REGMAGIC);
2493 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
2494 {
2495 vim_free(r);
2496 if (reg_toolong)
Bram Moolenaareaaac012022-01-02 17:00:40 +00002497 EMSG_RET_NULL(_(e_pattern_too_long));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002498 return NULL;
2499 }
2500
2501 // Dig out information for optimizations.
2502 r->regstart = NUL; // Worst-case defaults.
2503 r->reganch = 0;
2504 r->regmust = NULL;
2505 r->regmlen = 0;
2506 r->regflags = regflags;
2507 if (flags & HASNL)
2508 r->regflags |= RF_HASNL;
2509 if (flags & HASLOOKBH)
2510 r->regflags |= RF_LOOKBH;
2511#ifdef FEAT_SYN_HL
2512 // Remember whether this pattern has any \z specials in it.
2513 r->reghasz = re_has_z;
2514#endif
2515 scan = r->program + 1; // First BRANCH.
2516 if (OP(regnext(scan)) == END) // Only one top-level choice.
2517 {
2518 scan = OPERAND(scan);
2519
2520 // Starting-point info.
2521 if (OP(scan) == BOL || OP(scan) == RE_BOF)
2522 {
2523 r->reganch++;
2524 scan = regnext(scan);
2525 }
2526
2527 if (OP(scan) == EXACTLY)
2528 {
2529 if (has_mbyte)
2530 r->regstart = (*mb_ptr2char)(OPERAND(scan));
2531 else
2532 r->regstart = *OPERAND(scan);
2533 }
2534 else if ((OP(scan) == BOW
2535 || OP(scan) == EOW
2536 || OP(scan) == NOTHING
2537 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
2538 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
2539 && OP(regnext(scan)) == EXACTLY)
2540 {
2541 if (has_mbyte)
2542 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
2543 else
2544 r->regstart = *OPERAND(regnext(scan));
2545 }
2546
2547 // If there's something expensive in the r.e., find the longest
2548 // literal string that must appear and make it the regmust. Resolve
2549 // ties in favor of later strings, since the regstart check works
2550 // with the beginning of the r.e. and avoiding duplication
2551 // strengthens checking. Not a strong reason, but sufficient in the
2552 // absence of others.
2553
2554 // When the r.e. starts with BOW, it is faster to look for a regmust
2555 // first. Used a lot for "#" and "*" commands. (Added by mool).
2556 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
2557 && !(flags & HASNL))
2558 {
2559 longest = NULL;
2560 len = 0;
2561 for (; scan != NULL; scan = regnext(scan))
2562 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
2563 {
2564 longest = OPERAND(scan);
2565 len = (int)STRLEN(OPERAND(scan));
2566 }
2567 r->regmust = longest;
2568 r->regmlen = len;
2569 }
2570 }
2571#ifdef BT_REGEXP_DUMP
2572 regdump(expr, r);
2573#endif
2574 r->engine = &bt_regengine;
2575 return (regprog_T *)r;
2576}
2577
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Check if during the previous call to vim_regcomp the EOL item "$" has been
 * found.  This is messy, but it works fine.
 *
 * Simply returns the current value of the "had_eol" flag; the flag itself is
 * maintained elsewhere.
 */
    int
vim_regcomp_had_eol(void)
{
    return had_eol;
}
#endif
2589
2590/*
2591 * Get a number after a backslash that is inside [].
2592 * When nothing is recognized return a backslash.
2593 */
2594 static int
2595coll_get_char(void)
2596{
2597 long nr = -1;
2598
2599 switch (*regparse++)
2600 {
2601 case 'd': nr = getdecchrs(); break;
2602 case 'o': nr = getoctchrs(); break;
2603 case 'x': nr = gethexchrs(2); break;
2604 case 'u': nr = gethexchrs(4); break;
2605 case 'U': nr = gethexchrs(8); break;
2606 }
2607 if (nr < 0 || nr > INT_MAX)
2608 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002609 // If getting the number fails be backwards compatible: the character
2610 // is a backslash.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002611 --regparse;
2612 nr = '\\';
2613 }
2614 return nr;
2615}
2616
/*
 * Free a compiled regexp program, returned by bt_regcomp().
 * bt_regcomp() allocates the header and the program as one chunk, thus a
 * single vim_free() is all that is needed.
 */
    static void
bt_regfree(regprog_T *prog)
{
    vim_free(prog);
}
2625
// Move rex.input forward with MB_PTR_ADV().
#define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)

/*
 * The arguments from BRACE_LIMITS are stored here.  They are actually local
 * to regmatch(), but they are here to reduce the amount of stack space used
 * (it can be called recursively many times).
 */
static long     bl_minval;
static long     bl_maxval;
2635
2636/*
2637 * Save the input line and position in a regsave_T.
2638 */
2639 static void
2640reg_save(regsave_T *save, garray_T *gap)
2641{
2642 if (REG_MULTI)
2643 {
2644 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
2645 save->rs_u.pos.lnum = rex.lnum;
2646 }
2647 else
2648 save->rs_u.ptr = rex.input;
2649 save->rs_len = gap->ga_len;
2650}
2651
2652/*
2653 * Restore the input line and position from a regsave_T.
2654 */
2655 static void
2656reg_restore(regsave_T *save, garray_T *gap)
2657{
2658 if (REG_MULTI)
2659 {
2660 if (rex.lnum != save->rs_u.pos.lnum)
2661 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02002662 // only call reg_getline() when the line number changed to save
2663 // a bit of time
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002664 rex.lnum = save->rs_u.pos.lnum;
2665 rex.line = reg_getline(rex.lnum);
2666 }
2667 rex.input = rex.line + save->rs_u.pos.col;
2668 }
2669 else
2670 rex.input = save->rs_u.ptr;
2671 gap->ga_len = save->rs_len;
2672}
2673
2674/*
2675 * Return TRUE if current position is equal to saved position.
2676 */
2677 static int
2678reg_save_equal(regsave_T *save)
2679{
2680 if (REG_MULTI)
2681 return rex.lnum == save->rs_u.pos.lnum
2682 && rex.input == rex.line + save->rs_u.pos.col;
2683 return rex.input == save->rs_u.ptr;
2684}
2685
// Save a sub-expression boundary in "savep" before attempting a match and
// set it to the current position.  "posp" is used for a multi-line match,
// "pp" otherwise.
#define save_se(savep, posp, pp) \
    (REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp)))

// After a failed match put back the sub-expression boundary from "savep".
// NOTE: expands to a { } block, not an expression.
#define restore_se(savep, posp, pp) { \
    if (!REG_MULTI) \
        *(pp) = (savep)->se_u.ptr; \
    else \
        *(posp) = (savep)->se_u.pos; }
2696
2697/*
2698 * Tentatively set the sub-expression start to the current position (after
2699 * calling regmatch() they will have changed). Need to save the existing
2700 * values for when there is no match.
2701 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
2702 * depending on REG_MULTI.
2703 */
2704 static void
2705save_se_multi(save_se_T *savep, lpos_T *posp)
2706{
2707 savep->se_u.pos = *posp;
2708 posp->lnum = rex.lnum;
2709 posp->col = (colnr_T)(rex.input - rex.line);
2710}
2711
2712 static void
2713save_se_one(save_se_T *savep, char_u **pp)
2714{
2715 savep->se_u.ptr = *pp;
2716 *pp = rex.input;
2717}
2718
/*
 * regrepeat - repeatedly match something simple, return how many.
 * Advances rex.input (and rex.lnum) to just after the matched chars.
 *
 * "p" is the node of the item to match, its operand (if any) follows it.
 * At most "maxcount" matches are made.
 *
 * For the items that can include a line break (WITH_NL()) a line break counts
 * as one match: in a multi-line match the next line is fetched with
 * reg_nextline(), when rex.reg_line_lbr is set a '\n' in the text is skipped.
 * Matching stops early when "got_int" is set.
 *
 * When "p" is not an item that can be handled here an internal error is
 * given and zero is returned.
 */
    static int
regrepeat(
    char_u      *p,
    long        maxcount)   // maximum number of matches allowed
{
    long        count = 0;      // number of matches so far
    char_u      *scan;          // current position in the text
    char_u      *opnd;          // operand of "p"
    int         mask;           // class_tab[] bit to test, used at do_class
    int         testval = 0;    // do_class: value "class_tab[c] & mask" must
                                // have; elsewhere: TRUE when a digit is
                                // accepted (non-S items) or for ANYOF

    scan = rex.input;       // Make local copy of rex.input for speed.
    opnd = OPERAND(p);
    switch (OP(p))
    {
      case ANY:
      case ANY + ADD_NL:
        while (count < maxcount)
        {
            // Matching anything means we continue until end-of-line (or
            // end-of-file for ANY + ADD_NL), only limited by maxcount.
            while (*scan != NUL && count < maxcount)
            {
                ++count;
                MB_PTR_ADV(scan);
            }
            // Stop unless the line break can be matched as well.
            if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
                                      || rex.reg_line_lbr || count == maxcount)
                break;
            ++count;            // count the line-break
            reg_nextline();
            scan = rex.input;
            if (got_int)
                break;
        }
        break;

      case IDENT:
      case IDENT + ADD_NL:
        testval = TRUE;         // a digit is accepted
        // FALLTHROUGH
      case SIDENT:
      case SIDENT + ADD_NL:
        while (count < maxcount)
        {
            if (vim_isIDc(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
            {
                MB_PTR_ADV(scan);
            }
            else if (*scan == NUL)
            {
                // At end of line: continue in the next line if allowed.
                if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
                                                          || rex.reg_line_lbr)
                    break;
                reg_nextline();
                scan = rex.input;
                if (got_int)
                    break;
            }
            else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
                ++scan;
            else
                break;
            ++count;
        }
        break;

      case KWORD:
      case KWORD + ADD_NL:
        testval = TRUE;         // a digit is accepted
        // FALLTHROUGH
      case SKWORD:
      case SKWORD + ADD_NL:
        while (count < maxcount)
        {
            if (vim_iswordp_buf(scan, rex.reg_buf)
                                          && (testval || !VIM_ISDIGIT(*scan)))
            {
                MB_PTR_ADV(scan);
            }
            else if (*scan == NUL)
            {
                // At end of line: continue in the next line if allowed.
                if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
                                                          || rex.reg_line_lbr)
                    break;
                reg_nextline();
                scan = rex.input;
                if (got_int)
                    break;
            }
            else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
                ++scan;
            else
                break;
            ++count;
        }
        break;

      case FNAME:
      case FNAME + ADD_NL:
        testval = TRUE;         // a digit is accepted
        // FALLTHROUGH
      case SFNAME:
      case SFNAME + ADD_NL:
        while (count < maxcount)
        {
            if (vim_isfilec(PTR2CHAR(scan)) && (testval || !VIM_ISDIGIT(*scan)))
            {
                MB_PTR_ADV(scan);
            }
            else if (*scan == NUL)
            {
                // At end of line: continue in the next line if allowed.
                if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
                                                          || rex.reg_line_lbr)
                    break;
                reg_nextline();
                scan = rex.input;
                if (got_int)
                    break;
            }
            else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
                ++scan;
            else
                break;
            ++count;
        }
        break;

      case PRINT:
      case PRINT + ADD_NL:
        testval = TRUE;         // a digit is accepted
        // FALLTHROUGH
      case SPRINT:
      case SPRINT + ADD_NL:
        while (count < maxcount)
        {
            if (*scan == NUL)
            {
                // At end of line: continue in the next line if allowed.
                if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
                                                          || rex.reg_line_lbr)
                    break;
                reg_nextline();
                scan = rex.input;
                if (got_int)
                    break;
            }
            else if (vim_isprintc(PTR2CHAR(scan)) == 1
                                          && (testval || !VIM_ISDIGIT(*scan)))
            {
                MB_PTR_ADV(scan);
            }
            else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
                ++scan;
            else
                break;
            ++count;
        }
        break;

      case WHITE:
      case WHITE + ADD_NL:
        testval = mask = RI_WHITE;
        // Shared loop for all the classes that use class_tab[].  The cases
        // below set "mask" to the bit for the class and jump here.  For a
        // positive class "testval" equals "mask" (bit must be set), for a
        // negated class "testval" is still zero (bit must not be set).
do_class:
        while (count < maxcount)
        {
            int         l;

            if (*scan == NUL)
            {
                // At end of line: continue in the next line if allowed.
                if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
                                                          || rex.reg_line_lbr)
                    break;
                reg_nextline();
                scan = rex.input;
                if (got_int)
                    break;
            }
            else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1)
            {
                // A multi-byte character is not looked up in class_tab[]: it
                // never matches a positive class and always matches a
                // negated one.
                if (testval != 0)
                    break;
                scan += l;
            }
            else if ((class_tab[*scan] & mask) == testval)
                ++scan;
            else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
                ++scan;
            else
                break;
            ++count;
        }
        break;

      case NWHITE:
      case NWHITE + ADD_NL:
        mask = RI_WHITE;
        goto do_class;
      case DIGIT:
      case DIGIT + ADD_NL:
        testval = mask = RI_DIGIT;
        goto do_class;
      case NDIGIT:
      case NDIGIT + ADD_NL:
        mask = RI_DIGIT;
        goto do_class;
      case HEX:
      case HEX + ADD_NL:
        testval = mask = RI_HEX;
        goto do_class;
      case NHEX:
      case NHEX + ADD_NL:
        mask = RI_HEX;
        goto do_class;
      case OCTAL:
      case OCTAL + ADD_NL:
        testval = mask = RI_OCTAL;
        goto do_class;
      case NOCTAL:
      case NOCTAL + ADD_NL:
        mask = RI_OCTAL;
        goto do_class;
      case WORD:
      case WORD + ADD_NL:
        testval = mask = RI_WORD;
        goto do_class;
      case NWORD:
      case NWORD + ADD_NL:
        mask = RI_WORD;
        goto do_class;
      case HEAD:
      case HEAD + ADD_NL:
        testval = mask = RI_HEAD;
        goto do_class;
      case NHEAD:
      case NHEAD + ADD_NL:
        mask = RI_HEAD;
        goto do_class;
      case ALPHA:
      case ALPHA + ADD_NL:
        testval = mask = RI_ALPHA;
        goto do_class;
      case NALPHA:
      case NALPHA + ADD_NL:
        mask = RI_ALPHA;
        goto do_class;
      case LOWER:
      case LOWER + ADD_NL:
        testval = mask = RI_LOWER;
        goto do_class;
      case NLOWER:
      case NLOWER + ADD_NL:
        mask = RI_LOWER;
        goto do_class;
      case UPPER:
      case UPPER + ADD_NL:
        testval = mask = RI_UPPER;
        goto do_class;
      case NUPPER:
      case NUPPER + ADD_NL:
        mask = RI_UPPER;
        goto do_class;

      case EXACTLY:
        {
            int     cu, cl;     // upper and lower case form of the byte

            // This doesn't do a multi-byte character, because a MULTIBYTECODE
            // would have been used for it.  It does handle single-byte
            // characters, such as latin1.
            // Only the first byte of the operand is compared.
            if (rex.reg_ic)
            {
                cu = MB_TOUPPER(*opnd);
                cl = MB_TOLOWER(*opnd);
                while (count < maxcount && (*scan == cu || *scan == cl))
                {
                    count++;
                    scan++;
                }
            }
            else
            {
                cu = *opnd;
                while (count < maxcount && *scan == cu)
                {
                    count++;
                    scan++;
                }
            }
            break;
        }

      case MULTIBYTECODE:
        {
            int         i, len, cf = 0;

            // Safety check (just in case 'encoding' was changed since
            // compiling the program).
            if ((len = (*mb_ptr2len)(opnd)) > 1)
            {
                // When ignoring case with utf-8 compare the folded
                // characters if the bytes differ.
                if (rex.reg_ic && enc_utf8)
                    cf = utf_fold(utf_ptr2char(opnd));
                while (count < maxcount && (*mb_ptr2len)(scan) >= len)
                {
                    for (i = 0; i < len; ++i)
                        if (opnd[i] != scan[i])
                            break;
                    // "i < len" means the bytes are not equal.
                    if (i < len && (!rex.reg_ic || !enc_utf8
                                     || utf_fold(utf_ptr2char(scan)) != cf))
                        break;
                    scan += len;
                    ++count;
                }
            }
        }
        break;

      case ANYOF:
      case ANYOF + ADD_NL:
        testval = TRUE;         // character must be in the operand
        // FALLTHROUGH

      case ANYBUT:
      case ANYBUT + ADD_NL:
        while (count < maxcount)
        {
            int len;

            if (*scan == NUL)
            {
                // At end of line: continue in the next line if allowed.
                if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
                                                          || rex.reg_line_lbr)
                    break;
                reg_nextline();
                scan = rex.input;
                if (got_int)
                    break;
            }
            else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
                ++scan;
            else if (has_mbyte && (len = (*mb_ptr2len)(scan)) > 1)
            {
                // ANYOF: stop when not found; ANYBUT: stop when found.
                if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
                    break;
                scan += len;
            }
            else
            {
                // ANYOF: stop when not found; ANYBUT: stop when found.
                if ((cstrchr(opnd, *scan) == NULL) == testval)
                    break;
                ++scan;
            }
            ++count;
        }
        break;

      case NEWL:
        // Match line breaks only: the end of a line in a multi-line match,
        // or a '\n' in the text when rex.reg_line_lbr is set.
        while (count < maxcount
                && ((*scan == NUL && rex.lnum <= rex.reg_maxline
                                       && !rex.reg_line_lbr && REG_MULTI)
                    || (*scan == '\n' && rex.reg_line_lbr)))
        {
            count++;
            if (rex.reg_line_lbr)
                ADVANCE_REGINPUT();
            else
                reg_nextline();
            scan = rex.input;
            if (got_int)
                break;
        }
        break;

      default:                  // Oh dear.  Called inappropriately.
        iemsg(_(e_corrupted_regexp_program));
#ifdef DEBUG
        printf("Called regrepeat with op code %d\n", OP(p));
#endif
        break;
    }

    // Store the local copy back, the caller continues from here.
    rex.input = scan;

    return (int)count;
}
3107
3108/*
3109 * Push an item onto the regstack.
3110 * Returns pointer to new item. Returns NULL when out of memory.
3111 */
3112 static regitem_T *
3113regstack_push(regstate_T state, char_u *scan)
3114{
3115 regitem_T *rp;
3116
3117 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
3118 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00003119 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003120 return NULL;
3121 }
3122 if (ga_grow(&regstack, sizeof(regitem_T)) == FAIL)
3123 return NULL;
3124
3125 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
3126 rp->rs_state = state;
3127 rp->rs_scan = scan;
3128
3129 regstack.ga_len += sizeof(regitem_T);
3130 return rp;
3131}
3132
3133/*
3134 * Pop an item from the regstack.
3135 */
3136 static void
3137regstack_pop(char_u **scan)
3138{
3139 regitem_T *rp;
3140
3141 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
3142 *scan = rp->rs_scan;
3143
3144 regstack.ga_len -= sizeof(regitem_T);
3145}
3146
3147/*
3148 * Save the current subexpr to "bp", so that they can be restored
3149 * later by restore_subexpr().
3150 */
3151 static void
3152save_subexpr(regbehind_T *bp)
3153{
3154 int i;
3155
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003156 // When "rex.need_clear_subexpr" is set we don't need to save the values,
3157 // only remember that this flag needs to be set again when restoring.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003158 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
3159 if (!rex.need_clear_subexpr)
3160 {
3161 for (i = 0; i < NSUBEXP; ++i)
3162 {
3163 if (REG_MULTI)
3164 {
3165 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
3166 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
3167 }
3168 else
3169 {
3170 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
3171 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
3172 }
3173 }
3174 }
3175}
3176
3177/*
3178 * Restore the subexpr from "bp".
3179 */
3180 static void
3181restore_subexpr(regbehind_T *bp)
3182{
3183 int i;
3184
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003185 // Only need to restore saved values when they are not to be cleared.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003186 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
3187 if (!rex.need_clear_subexpr)
3188 {
3189 for (i = 0; i < NSUBEXP; ++i)
3190 {
3191 if (REG_MULTI)
3192 {
3193 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
3194 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
3195 }
3196 else
3197 {
3198 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
3199 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
3200 }
3201 }
3202 }
3203}
3204
3205/*
3206 * regmatch - main matching routine
3207 *
3208 * Conceptually the strategy is simple: Check to see whether the current node
3209 * matches, push an item onto the regstack and loop to see whether the rest
3210 * matches, and then act accordingly. In practice we make some effort to
3211 * avoid using the regstack, in particular by going through "ordinary" nodes
3212 * (that don't need to know whether the rest of the match failed) by a nested
3213 * loop.
3214 *
3215 * Returns TRUE when there is a match. Leaves rex.input and rex.lnum just after
3216 * the last matched character.
3217 * Returns FALSE when there is no match. Leaves rex.input and rex.lnum in an
3218 * undefined state!
3219 */
3220 static int
3221regmatch(
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003222 char_u *scan, // Current node.
3223 proftime_T *tm UNUSED, // timeout limit or NULL
3224 int *timed_out UNUSED) // flag set on timeout or NULL
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003225{
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003226 char_u *next; // Next node.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003227 int op;
3228 int c;
3229 regitem_T *rp;
3230 int no;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003231 int status; // one of the RA_ values:
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003232#ifdef FEAT_RELTIME
3233 int tm_count = 0;
3234#endif
3235
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003236 // Make "regstack" and "backpos" empty. They are allocated and freed in
3237 // bt_regexec_both() to reduce malloc()/free() calls.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003238 regstack.ga_len = 0;
3239 backpos.ga_len = 0;
3240
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003241 // Repeat until "regstack" is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003242 for (;;)
3243 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003244 // Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3245 // Allow interrupting them with CTRL-C.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003246 fast_breakcheck();
3247
3248#ifdef DEBUG
3249 if (scan != NULL && regnarrate)
3250 {
3251 mch_errmsg((char *)regprop(scan));
3252 mch_errmsg("(\n");
3253 }
3254#endif
3255
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003256 // Repeat for items that can be matched sequentially, without using the
3257 // regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003258 for (;;)
3259 {
3260 if (got_int || scan == NULL)
3261 {
3262 status = RA_FAIL;
3263 break;
3264 }
3265#ifdef FEAT_RELTIME
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003266 // Check for timeout once in a 100 times to avoid overhead.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003267 if (tm != NULL && ++tm_count == 100)
3268 {
3269 tm_count = 0;
3270 if (profile_passed_limit(tm))
3271 {
3272 if (timed_out != NULL)
3273 *timed_out = TRUE;
3274 status = RA_FAIL;
3275 break;
3276 }
3277 }
3278#endif
3279 status = RA_CONT;
3280
3281#ifdef DEBUG
3282 if (regnarrate)
3283 {
3284 mch_errmsg((char *)regprop(scan));
3285 mch_errmsg("...\n");
3286# ifdef FEAT_SYN_HL
3287 if (re_extmatch_in != NULL)
3288 {
3289 int i;
3290
3291 mch_errmsg(_("External submatches:\n"));
3292 for (i = 0; i < NSUBEXP; i++)
3293 {
3294 mch_errmsg(" \"");
3295 if (re_extmatch_in->matches[i] != NULL)
3296 mch_errmsg((char *)re_extmatch_in->matches[i]);
3297 mch_errmsg("\"\n");
3298 }
3299 }
3300# endif
3301 }
3302#endif
3303 next = regnext(scan);
3304
3305 op = OP(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003306 // Check for character class with NL added.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003307 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
3308 && *rex.input == NUL && rex.lnum <= rex.reg_maxline)
3309 {
3310 reg_nextline();
3311 }
3312 else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n')
3313 {
3314 ADVANCE_REGINPUT();
3315 }
3316 else
3317 {
3318 if (WITH_NL(op))
3319 op -= ADD_NL;
3320 if (has_mbyte)
3321 c = (*mb_ptr2char)(rex.input);
3322 else
3323 c = *rex.input;
3324 switch (op)
3325 {
3326 case BOL:
3327 if (rex.input != rex.line)
3328 status = RA_NOMATCH;
3329 break;
3330
3331 case EOL:
3332 if (c != NUL)
3333 status = RA_NOMATCH;
3334 break;
3335
3336 case RE_BOF:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003337 // We're not at the beginning of the file when below the first
3338 // line where we started, not at the start of the line or we
3339 // didn't start at the first line of the buffer.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003340 if (rex.lnum != 0 || rex.input != rex.line
3341 || (REG_MULTI && rex.reg_firstlnum > 1))
3342 status = RA_NOMATCH;
3343 break;
3344
3345 case RE_EOF:
3346 if (rex.lnum != rex.reg_maxline || c != NUL)
3347 status = RA_NOMATCH;
3348 break;
3349
3350 case CURSOR:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003351 // Check if the buffer is in a window and compare the
3352 // rex.reg_win->w_cursor position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003353 if (rex.reg_win == NULL
3354 || (rex.lnum + rex.reg_firstlnum
3355 != rex.reg_win->w_cursor.lnum)
3356 || ((colnr_T)(rex.input - rex.line)
3357 != rex.reg_win->w_cursor.col))
3358 status = RA_NOMATCH;
3359 break;
3360
3361 case RE_MARK:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003362 // Compare the mark position to the match position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003363 {
3364 int mark = OPERAND(scan)[0];
3365 int cmp = OPERAND(scan)[1];
3366 pos_T *pos;
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003367 size_t col = REG_MULTI ? rex.input - rex.line : 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003368
3369 pos = getmark_buf(rex.reg_buf, mark, FALSE);
Bram Moolenaarb55986c2022-03-29 13:24:58 +01003370
3371 // Line may have been freed, get it again.
3372 if (REG_MULTI)
3373 {
3374 rex.line = reg_getline(rex.lnum);
3375 rex.input = rex.line + col;
3376 }
3377
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003378 if (pos == NULL // mark doesn't exist
Bram Moolenaar872bee52021-05-24 22:56:15 +02003379 || pos->lnum <= 0) // mark isn't set in reg_buf
3380 {
3381 status = RA_NOMATCH;
3382 }
3383 else
3384 {
3385 colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3386 && pos->col == MAXCOL
3387 ? (colnr_T)STRLEN(reg_getline(
3388 pos->lnum - rex.reg_firstlnum))
3389 : pos->col;
3390
3391 if ((pos->lnum == rex.lnum + rex.reg_firstlnum
3392 ? (pos_col == (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003393 ? (cmp == '<' || cmp == '>')
Bram Moolenaar872bee52021-05-24 22:56:15 +02003394 : (pos_col < (colnr_T)(rex.input - rex.line)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003395 ? cmp != '>'
3396 : cmp != '<'))
3397 : (pos->lnum < rex.lnum + rex.reg_firstlnum
3398 ? cmp != '>'
3399 : cmp != '<')))
3400 status = RA_NOMATCH;
Bram Moolenaar872bee52021-05-24 22:56:15 +02003401 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003402 }
3403 break;
3404
3405 case RE_VISUAL:
3406 if (!reg_match_visual())
3407 status = RA_NOMATCH;
3408 break;
3409
3410 case RE_LNUM:
3411 if (!REG_MULTI || !re_num_cmp((long_u)(rex.lnum + rex.reg_firstlnum),
3412 scan))
3413 status = RA_NOMATCH;
3414 break;
3415
3416 case RE_COL:
3417 if (!re_num_cmp((long_u)(rex.input - rex.line) + 1, scan))
3418 status = RA_NOMATCH;
3419 break;
3420
3421 case RE_VCOL:
3422 if (!re_num_cmp((long_u)win_linetabsize(
3423 rex.reg_win == NULL ? curwin : rex.reg_win,
3424 rex.line, (colnr_T)(rex.input - rex.line)) + 1, scan))
3425 status = RA_NOMATCH;
3426 break;
3427
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003428 case BOW: // \<word; rex.input points to w
3429 if (c == NUL) // Can't match at end of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003430 status = RA_NOMATCH;
3431 else if (has_mbyte)
3432 {
3433 int this_class;
3434
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003435 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003436 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3437 if (this_class <= 1)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003438 status = RA_NOMATCH; // not on a word at all
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003439 else if (reg_prev_class() == this_class)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003440 status = RA_NOMATCH; // previous char is in same word
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003441 }
3442 else
3443 {
3444 if (!vim_iswordc_buf(c, rex.reg_buf) || (rex.input > rex.line
3445 && vim_iswordc_buf(rex.input[-1], rex.reg_buf)))
3446 status = RA_NOMATCH;
3447 }
3448 break;
3449
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003450 case EOW: // word\>; rex.input points after d
3451 if (rex.input == rex.line) // Can't match at start of line
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003452 status = RA_NOMATCH;
3453 else if (has_mbyte)
3454 {
3455 int this_class, prev_class;
3456
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003457 // Get class of current and previous char (if it exists).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003458 this_class = mb_get_class_buf(rex.input, rex.reg_buf);
3459 prev_class = reg_prev_class();
3460 if (this_class == prev_class
3461 || prev_class == 0 || prev_class == 1)
3462 status = RA_NOMATCH;
3463 }
3464 else
3465 {
3466 if (!vim_iswordc_buf(rex.input[-1], rex.reg_buf)
3467 || (rex.input[0] != NUL
3468 && vim_iswordc_buf(c, rex.reg_buf)))
3469 status = RA_NOMATCH;
3470 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003471 break; // Matched with EOW
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003472
3473 case ANY:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003474 // ANY does not match new lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003475 if (c == NUL)
3476 status = RA_NOMATCH;
3477 else
3478 ADVANCE_REGINPUT();
3479 break;
3480
3481 case IDENT:
3482 if (!vim_isIDc(c))
3483 status = RA_NOMATCH;
3484 else
3485 ADVANCE_REGINPUT();
3486 break;
3487
3488 case SIDENT:
3489 if (VIM_ISDIGIT(*rex.input) || !vim_isIDc(c))
3490 status = RA_NOMATCH;
3491 else
3492 ADVANCE_REGINPUT();
3493 break;
3494
3495 case KWORD:
3496 if (!vim_iswordp_buf(rex.input, rex.reg_buf))
3497 status = RA_NOMATCH;
3498 else
3499 ADVANCE_REGINPUT();
3500 break;
3501
3502 case SKWORD:
3503 if (VIM_ISDIGIT(*rex.input)
3504 || !vim_iswordp_buf(rex.input, rex.reg_buf))
3505 status = RA_NOMATCH;
3506 else
3507 ADVANCE_REGINPUT();
3508 break;
3509
3510 case FNAME:
3511 if (!vim_isfilec(c))
3512 status = RA_NOMATCH;
3513 else
3514 ADVANCE_REGINPUT();
3515 break;
3516
3517 case SFNAME:
3518 if (VIM_ISDIGIT(*rex.input) || !vim_isfilec(c))
3519 status = RA_NOMATCH;
3520 else
3521 ADVANCE_REGINPUT();
3522 break;
3523
3524 case PRINT:
3525 if (!vim_isprintc(PTR2CHAR(rex.input)))
3526 status = RA_NOMATCH;
3527 else
3528 ADVANCE_REGINPUT();
3529 break;
3530
3531 case SPRINT:
3532 if (VIM_ISDIGIT(*rex.input) || !vim_isprintc(PTR2CHAR(rex.input)))
3533 status = RA_NOMATCH;
3534 else
3535 ADVANCE_REGINPUT();
3536 break;
3537
3538 case WHITE:
3539 if (!VIM_ISWHITE(c))
3540 status = RA_NOMATCH;
3541 else
3542 ADVANCE_REGINPUT();
3543 break;
3544
3545 case NWHITE:
3546 if (c == NUL || VIM_ISWHITE(c))
3547 status = RA_NOMATCH;
3548 else
3549 ADVANCE_REGINPUT();
3550 break;
3551
3552 case DIGIT:
3553 if (!ri_digit(c))
3554 status = RA_NOMATCH;
3555 else
3556 ADVANCE_REGINPUT();
3557 break;
3558
3559 case NDIGIT:
3560 if (c == NUL || ri_digit(c))
3561 status = RA_NOMATCH;
3562 else
3563 ADVANCE_REGINPUT();
3564 break;
3565
3566 case HEX:
3567 if (!ri_hex(c))
3568 status = RA_NOMATCH;
3569 else
3570 ADVANCE_REGINPUT();
3571 break;
3572
3573 case NHEX:
3574 if (c == NUL || ri_hex(c))
3575 status = RA_NOMATCH;
3576 else
3577 ADVANCE_REGINPUT();
3578 break;
3579
3580 case OCTAL:
3581 if (!ri_octal(c))
3582 status = RA_NOMATCH;
3583 else
3584 ADVANCE_REGINPUT();
3585 break;
3586
3587 case NOCTAL:
3588 if (c == NUL || ri_octal(c))
3589 status = RA_NOMATCH;
3590 else
3591 ADVANCE_REGINPUT();
3592 break;
3593
3594 case WORD:
3595 if (!ri_word(c))
3596 status = RA_NOMATCH;
3597 else
3598 ADVANCE_REGINPUT();
3599 break;
3600
3601 case NWORD:
3602 if (c == NUL || ri_word(c))
3603 status = RA_NOMATCH;
3604 else
3605 ADVANCE_REGINPUT();
3606 break;
3607
3608 case HEAD:
3609 if (!ri_head(c))
3610 status = RA_NOMATCH;
3611 else
3612 ADVANCE_REGINPUT();
3613 break;
3614
3615 case NHEAD:
3616 if (c == NUL || ri_head(c))
3617 status = RA_NOMATCH;
3618 else
3619 ADVANCE_REGINPUT();
3620 break;
3621
3622 case ALPHA:
3623 if (!ri_alpha(c))
3624 status = RA_NOMATCH;
3625 else
3626 ADVANCE_REGINPUT();
3627 break;
3628
3629 case NALPHA:
3630 if (c == NUL || ri_alpha(c))
3631 status = RA_NOMATCH;
3632 else
3633 ADVANCE_REGINPUT();
3634 break;
3635
3636 case LOWER:
3637 if (!ri_lower(c))
3638 status = RA_NOMATCH;
3639 else
3640 ADVANCE_REGINPUT();
3641 break;
3642
3643 case NLOWER:
3644 if (c == NUL || ri_lower(c))
3645 status = RA_NOMATCH;
3646 else
3647 ADVANCE_REGINPUT();
3648 break;
3649
3650 case UPPER:
3651 if (!ri_upper(c))
3652 status = RA_NOMATCH;
3653 else
3654 ADVANCE_REGINPUT();
3655 break;
3656
3657 case NUPPER:
3658 if (c == NUL || ri_upper(c))
3659 status = RA_NOMATCH;
3660 else
3661 ADVANCE_REGINPUT();
3662 break;
3663
3664 case EXACTLY:
3665 {
3666 int len;
3667 char_u *opnd;
3668
3669 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003670 // Inline the first byte, for speed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003671 if (*opnd != *rex.input
3672 && (!rex.reg_ic
3673 || (!enc_utf8
3674 && MB_TOLOWER(*opnd) != MB_TOLOWER(*rex.input))))
3675 status = RA_NOMATCH;
3676 else if (*opnd == NUL)
3677 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003678 // match empty string always works; happens when "~" is
3679 // empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003680 }
3681 else
3682 {
3683 if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic))
3684 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003685 len = 1; // matched a single byte above
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003686 }
3687 else
3688 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003689 // Need to match first byte again for multi-byte.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003690 len = (int)STRLEN(opnd);
3691 if (cstrncmp(opnd, rex.input, &len) != 0)
3692 status = RA_NOMATCH;
3693 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003694 // Check for following composing character, unless %C
3695 // follows (skips over all composing chars).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003696 if (status != RA_NOMATCH
3697 && enc_utf8
3698 && UTF_COMPOSINGLIKE(rex.input, rex.input + len)
3699 && !rex.reg_icombine
3700 && OP(next) != RE_COMPOSING)
3701 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003702 // raaron: This code makes a composing character get
3703 // ignored, which is the correct behavior (sometimes)
3704 // for voweled Hebrew texts.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003705 status = RA_NOMATCH;
3706 }
3707 if (status != RA_NOMATCH)
3708 rex.input += len;
3709 }
3710 }
3711 break;
3712
3713 case ANYOF:
3714 case ANYBUT:
3715 if (c == NUL)
3716 status = RA_NOMATCH;
3717 else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3718 status = RA_NOMATCH;
3719 else
3720 ADVANCE_REGINPUT();
3721 break;
3722
3723 case MULTIBYTECODE:
3724 if (has_mbyte)
3725 {
3726 int i, len;
3727 char_u *opnd;
3728 int opndc = 0, inpc;
3729
3730 opnd = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003731 // Safety check (just in case 'encoding' was changed since
3732 // compiling the program).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003733 if ((len = (*mb_ptr2len)(opnd)) < 2)
3734 {
3735 status = RA_NOMATCH;
3736 break;
3737 }
3738 if (enc_utf8)
3739 opndc = utf_ptr2char(opnd);
3740 if (enc_utf8 && utf_iscomposing(opndc))
3741 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003742 // When only a composing char is given match at any
3743 // position where that composing char appears.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003744 status = RA_NOMATCH;
3745 for (i = 0; rex.input[i] != NUL;
3746 i += utf_ptr2len(rex.input + i))
3747 {
3748 inpc = utf_ptr2char(rex.input + i);
3749 if (!utf_iscomposing(inpc))
3750 {
3751 if (i > 0)
3752 break;
3753 }
3754 else if (opndc == inpc)
3755 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003756 // Include all following composing chars.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003757 len = i + utfc_ptr2len(rex.input + i);
3758 status = RA_MATCH;
3759 break;
3760 }
3761 }
3762 }
3763 else
3764 for (i = 0; i < len; ++i)
3765 if (opnd[i] != rex.input[i])
3766 {
3767 status = RA_NOMATCH;
3768 break;
3769 }
3770 rex.input += len;
3771 }
3772 else
3773 status = RA_NOMATCH;
3774 break;
3775 case RE_COMPOSING:
3776 if (enc_utf8)
3777 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003778 // Skip composing characters.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003779 while (utf_iscomposing(utf_ptr2char(rex.input)))
3780 MB_CPTR_ADV(rex.input);
3781 }
3782 break;
3783
3784 case NOTHING:
3785 break;
3786
3787 case BACK:
3788 {
3789 int i;
3790 backpos_T *bp;
3791
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003792 // When we run into BACK we need to check if we don't keep
3793 // looping without matching any input. The second and later
3794 // times a BACK is encountered it fails if the input is still
3795 // at the same position as the previous time.
3796 // The positions are stored in "backpos" and found by the
3797 // current value of "scan", the position in the RE program.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003798 bp = (backpos_T *)backpos.ga_data;
3799 for (i = 0; i < backpos.ga_len; ++i)
3800 if (bp[i].bp_scan == scan)
3801 break;
3802 if (i == backpos.ga_len)
3803 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003804 // First time at this BACK, make room to store the pos.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003805 if (ga_grow(&backpos, 1) == FAIL)
3806 status = RA_FAIL;
3807 else
3808 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003809 // get "ga_data" again, it may have changed
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003810 bp = (backpos_T *)backpos.ga_data;
3811 bp[i].bp_scan = scan;
3812 ++backpos.ga_len;
3813 }
3814 }
3815 else if (reg_save_equal(&bp[i].bp_pos))
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003816 // Still at same position as last time, fail.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003817 status = RA_NOMATCH;
3818
3819 if (status != RA_FAIL && status != RA_NOMATCH)
3820 reg_save(&bp[i].bp_pos, &backpos);
3821 }
3822 break;
3823
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003824 case MOPEN + 0: // Match start: \zs
3825 case MOPEN + 1: // \(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003826 case MOPEN + 2:
3827 case MOPEN + 3:
3828 case MOPEN + 4:
3829 case MOPEN + 5:
3830 case MOPEN + 6:
3831 case MOPEN + 7:
3832 case MOPEN + 8:
3833 case MOPEN + 9:
3834 {
3835 no = op - MOPEN;
3836 cleanup_subexpr();
3837 rp = regstack_push(RS_MOPEN, scan);
3838 if (rp == NULL)
3839 status = RA_FAIL;
3840 else
3841 {
3842 rp->rs_no = no;
3843 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
3844 &rex.reg_startp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003845 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003846 }
3847 }
3848 break;
3849
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003850 case NOPEN: // \%(
3851 case NCLOSE: // \) after \%(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003852 if (regstack_push(RS_NOPEN, scan) == NULL)
3853 status = RA_FAIL;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003854 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003855 break;
3856
3857#ifdef FEAT_SYN_HL
3858 case ZOPEN + 1:
3859 case ZOPEN + 2:
3860 case ZOPEN + 3:
3861 case ZOPEN + 4:
3862 case ZOPEN + 5:
3863 case ZOPEN + 6:
3864 case ZOPEN + 7:
3865 case ZOPEN + 8:
3866 case ZOPEN + 9:
3867 {
3868 no = op - ZOPEN;
3869 cleanup_zsubexpr();
3870 rp = regstack_push(RS_ZOPEN, scan);
3871 if (rp == NULL)
3872 status = RA_FAIL;
3873 else
3874 {
3875 rp->rs_no = no;
3876 save_se(&rp->rs_un.sesave, &reg_startzpos[no],
3877 &reg_startzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003878 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003879 }
3880 }
3881 break;
3882#endif
3883
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003884 case MCLOSE + 0: // Match end: \ze
3885 case MCLOSE + 1: // \)
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003886 case MCLOSE + 2:
3887 case MCLOSE + 3:
3888 case MCLOSE + 4:
3889 case MCLOSE + 5:
3890 case MCLOSE + 6:
3891 case MCLOSE + 7:
3892 case MCLOSE + 8:
3893 case MCLOSE + 9:
3894 {
3895 no = op - MCLOSE;
3896 cleanup_subexpr();
3897 rp = regstack_push(RS_MCLOSE, scan);
3898 if (rp == NULL)
3899 status = RA_FAIL;
3900 else
3901 {
3902 rp->rs_no = no;
3903 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no],
3904 &rex.reg_endp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003905 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003906 }
3907 }
3908 break;
3909
3910#ifdef FEAT_SYN_HL
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003911 case ZCLOSE + 1: // \) after \z(
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003912 case ZCLOSE + 2:
3913 case ZCLOSE + 3:
3914 case ZCLOSE + 4:
3915 case ZCLOSE + 5:
3916 case ZCLOSE + 6:
3917 case ZCLOSE + 7:
3918 case ZCLOSE + 8:
3919 case ZCLOSE + 9:
3920 {
3921 no = op - ZCLOSE;
3922 cleanup_zsubexpr();
3923 rp = regstack_push(RS_ZCLOSE, scan);
3924 if (rp == NULL)
3925 status = RA_FAIL;
3926 else
3927 {
3928 rp->rs_no = no;
3929 save_se(&rp->rs_un.sesave, &reg_endzpos[no],
3930 &reg_endzp[no]);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003931 // We simply continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003932 }
3933 }
3934 break;
3935#endif
3936
3937 case BACKREF + 1:
3938 case BACKREF + 2:
3939 case BACKREF + 3:
3940 case BACKREF + 4:
3941 case BACKREF + 5:
3942 case BACKREF + 6:
3943 case BACKREF + 7:
3944 case BACKREF + 8:
3945 case BACKREF + 9:
3946 {
3947 int len;
3948
3949 no = op - BACKREF;
3950 cleanup_subexpr();
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003951 if (!REG_MULTI) // Single-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003952 {
3953 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL)
3954 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003955 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003956 len = 0;
3957 }
3958 else
3959 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003960 // Compare current input with back-ref in the same
3961 // line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003962 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
3963 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0)
3964 status = RA_NOMATCH;
3965 }
3966 }
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003967 else // Multi-line regexp
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003968 {
3969 if (rex.reg_startpos[no].lnum < 0
3970 || rex.reg_endpos[no].lnum < 0)
3971 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003972 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003973 len = 0;
3974 }
3975 else
3976 {
3977 if (rex.reg_startpos[no].lnum == rex.lnum
3978 && rex.reg_endpos[no].lnum == rex.lnum)
3979 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003980 // Compare back-ref within the current line.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003981 len = rex.reg_endpos[no].col
3982 - rex.reg_startpos[no].col;
3983 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
3984 rex.input, &len) != 0)
3985 status = RA_NOMATCH;
3986 }
3987 else
3988 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02003989 // Messy situation: Need to compare between two
3990 // lines.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02003991 int r = match_with_backref(
3992 rex.reg_startpos[no].lnum,
3993 rex.reg_startpos[no].col,
3994 rex.reg_endpos[no].lnum,
3995 rex.reg_endpos[no].col,
3996 &len);
3997
3998 if (r != RA_MATCH)
3999 status = r;
4000 }
4001 }
4002 }
4003
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004004 // Matched the backref, skip over it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004005 rex.input += len;
4006 }
4007 break;
4008
4009#ifdef FEAT_SYN_HL
4010 case ZREF + 1:
4011 case ZREF + 2:
4012 case ZREF + 3:
4013 case ZREF + 4:
4014 case ZREF + 5:
4015 case ZREF + 6:
4016 case ZREF + 7:
4017 case ZREF + 8:
4018 case ZREF + 9:
4019 {
4020 int len;
4021
4022 cleanup_zsubexpr();
4023 no = op - ZREF;
4024 if (re_extmatch_in != NULL
4025 && re_extmatch_in->matches[no] != NULL)
4026 {
4027 len = (int)STRLEN(re_extmatch_in->matches[no]);
4028 if (cstrncmp(re_extmatch_in->matches[no],
4029 rex.input, &len) != 0)
4030 status = RA_NOMATCH;
4031 else
4032 rex.input += len;
4033 }
4034 else
4035 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004036 // Backref was not set: Match an empty string.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004037 }
4038 }
4039 break;
4040#endif
4041
4042 case BRANCH:
4043 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004044 if (OP(next) != BRANCH) // No choice.
4045 next = OPERAND(scan); // Avoid recursion.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004046 else
4047 {
4048 rp = regstack_push(RS_BRANCH, scan);
4049 if (rp == NULL)
4050 status = RA_FAIL;
4051 else
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004052 status = RA_BREAK; // rest is below
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004053 }
4054 }
4055 break;
4056
4057 case BRACE_LIMITS:
4058 {
4059 if (OP(next) == BRACE_SIMPLE)
4060 {
4061 bl_minval = OPERAND_MIN(scan);
4062 bl_maxval = OPERAND_MAX(scan);
4063 }
4064 else if (OP(next) >= BRACE_COMPLEX
4065 && OP(next) < BRACE_COMPLEX + 10)
4066 {
4067 no = OP(next) - BRACE_COMPLEX;
4068 brace_min[no] = OPERAND_MIN(scan);
4069 brace_max[no] = OPERAND_MAX(scan);
4070 brace_count[no] = 0;
4071 }
4072 else
4073 {
4074 internal_error("BRACE_LIMITS");
4075 status = RA_FAIL;
4076 }
4077 }
4078 break;
4079
4080 case BRACE_COMPLEX + 0:
4081 case BRACE_COMPLEX + 1:
4082 case BRACE_COMPLEX + 2:
4083 case BRACE_COMPLEX + 3:
4084 case BRACE_COMPLEX + 4:
4085 case BRACE_COMPLEX + 5:
4086 case BRACE_COMPLEX + 6:
4087 case BRACE_COMPLEX + 7:
4088 case BRACE_COMPLEX + 8:
4089 case BRACE_COMPLEX + 9:
4090 {
4091 no = op - BRACE_COMPLEX;
4092 ++brace_count[no];
4093
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004094 // If not matched enough times yet, try one more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004095 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4096 ? brace_min[no] : brace_max[no]))
4097 {
4098 rp = regstack_push(RS_BRCPLX_MORE, scan);
4099 if (rp == NULL)
4100 status = RA_FAIL;
4101 else
4102 {
4103 rp->rs_no = no;
4104 reg_save(&rp->rs_un.regsave, &backpos);
4105 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004106 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004107 }
4108 break;
4109 }
4110
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004111 // If matched enough times, may try matching some more
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004112 if (brace_min[no] <= brace_max[no])
4113 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004114 // Range is the normal way around, use longest match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004115 if (brace_count[no] <= brace_max[no])
4116 {
4117 rp = regstack_push(RS_BRCPLX_LONG, scan);
4118 if (rp == NULL)
4119 status = RA_FAIL;
4120 else
4121 {
4122 rp->rs_no = no;
4123 reg_save(&rp->rs_un.regsave, &backpos);
4124 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004125 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004126 }
4127 }
4128 }
4129 else
4130 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004131 // Range is backwards, use shortest match first
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004132 if (brace_count[no] <= brace_min[no])
4133 {
4134 rp = regstack_push(RS_BRCPLX_SHORT, scan);
4135 if (rp == NULL)
4136 status = RA_FAIL;
4137 else
4138 {
4139 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004140 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004141 }
4142 }
4143 }
4144 }
4145 break;
4146
4147 case BRACE_SIMPLE:
4148 case STAR:
4149 case PLUS:
4150 {
4151 regstar_T rst;
4152
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004153 // Lookahead to avoid useless match attempts when we know
4154 // what character comes next.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004155 if (OP(next) == EXACTLY)
4156 {
4157 rst.nextb = *OPERAND(next);
4158 if (rex.reg_ic)
4159 {
4160 if (MB_ISUPPER(rst.nextb))
4161 rst.nextb_ic = MB_TOLOWER(rst.nextb);
4162 else
4163 rst.nextb_ic = MB_TOUPPER(rst.nextb);
4164 }
4165 else
4166 rst.nextb_ic = rst.nextb;
4167 }
4168 else
4169 {
4170 rst.nextb = NUL;
4171 rst.nextb_ic = NUL;
4172 }
4173 if (op != BRACE_SIMPLE)
4174 {
4175 rst.minval = (op == STAR) ? 0 : 1;
4176 rst.maxval = MAX_LIMIT;
4177 }
4178 else
4179 {
4180 rst.minval = bl_minval;
4181 rst.maxval = bl_maxval;
4182 }
4183
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004184 // When maxval > minval, try matching as much as possible, up
4185 // to maxval. When maxval < minval, try matching at least the
4186 // minimal number (since the range is backwards, that's also
4187 // maxval!).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004188 rst.count = regrepeat(OPERAND(scan), rst.maxval);
4189 if (got_int)
4190 {
4191 status = RA_FAIL;
4192 break;
4193 }
4194 if (rst.minval <= rst.maxval
4195 ? rst.count >= rst.minval : rst.count >= rst.maxval)
4196 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004197 // It could match. Prepare for trying to match what
4198 // follows. The code is below. Parameters are stored in
4199 // a regstar_T on the regstack.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004200 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4201 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004202 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004203 status = RA_FAIL;
4204 }
4205 else if (ga_grow(&regstack, sizeof(regstar_T)) == FAIL)
4206 status = RA_FAIL;
4207 else
4208 {
4209 regstack.ga_len += sizeof(regstar_T);
4210 rp = regstack_push(rst.minval <= rst.maxval
4211 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4212 if (rp == NULL)
4213 status = RA_FAIL;
4214 else
4215 {
4216 *(((regstar_T *)rp) - 1) = rst;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004217 status = RA_BREAK; // skip the restore bits
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004218 }
4219 }
4220 }
4221 else
4222 status = RA_NOMATCH;
4223
4224 }
4225 break;
4226
4227 case NOMATCH:
4228 case MATCH:
4229 case SUBPAT:
4230 rp = regstack_push(RS_NOMATCH, scan);
4231 if (rp == NULL)
4232 status = RA_FAIL;
4233 else
4234 {
4235 rp->rs_no = op;
4236 reg_save(&rp->rs_un.regsave, &backpos);
4237 next = OPERAND(scan);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004238 // We continue and handle the result when done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004239 }
4240 break;
4241
4242 case BEHIND:
4243 case NOBEHIND:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004244 // Need a bit of room to store extra positions.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004245 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp)
4246 {
Bram Moolenaar74409f62022-01-01 15:58:22 +00004247 emsg(_(e_pattern_uses_more_memory_than_maxmempattern));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004248 status = RA_FAIL;
4249 }
4250 else if (ga_grow(&regstack, sizeof(regbehind_T)) == FAIL)
4251 status = RA_FAIL;
4252 else
4253 {
4254 regstack.ga_len += sizeof(regbehind_T);
4255 rp = regstack_push(RS_BEHIND1, scan);
4256 if (rp == NULL)
4257 status = RA_FAIL;
4258 else
4259 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004260 // Need to save the subexpr to be able to restore them
4261 // when there is a match but we don't use it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004262 save_subexpr(((regbehind_T *)rp) - 1);
4263
4264 rp->rs_no = op;
4265 reg_save(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004266 // First try if what follows matches. If it does then we
4267 // check the behind match by looping.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004268 }
4269 }
4270 break;
4271
4272 case BHPOS:
4273 if (REG_MULTI)
4274 {
4275 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4276 || behind_pos.rs_u.pos.lnum != rex.lnum)
4277 status = RA_NOMATCH;
4278 }
4279 else if (behind_pos.rs_u.ptr != rex.input)
4280 status = RA_NOMATCH;
4281 break;
4282
4283 case NEWL:
4284 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4285 || rex.reg_line_lbr)
4286 && (c != '\n' || !rex.reg_line_lbr))
4287 status = RA_NOMATCH;
4288 else if (rex.reg_line_lbr)
4289 ADVANCE_REGINPUT();
4290 else
4291 reg_nextline();
4292 break;
4293
4294 case END:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004295 status = RA_MATCH; // Success!
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004296 break;
4297
4298 default:
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004299 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004300#ifdef DEBUG
4301 printf("Illegal op code %d\n", op);
4302#endif
4303 status = RA_FAIL;
4304 break;
4305 }
4306 }
4307
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004308 // If we can't continue sequentially, break the inner loop.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004309 if (status != RA_CONT)
4310 break;
4311
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004312 // Continue in inner loop, advance to next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004313 scan = next;
4314
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004315 } // end of inner loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004316
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004317 // If there is something on the regstack execute the code for the state.
4318 // If the state is popped then loop and use the older state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004319 while (regstack.ga_len > 0 && status != RA_FAIL)
4320 {
4321 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4322 switch (rp->rs_state)
4323 {
4324 case RS_NOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004325 // Result is passed on as-is, simply pop the state.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004326 regstack_pop(&scan);
4327 break;
4328
4329 case RS_MOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004330 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004331 if (status == RA_NOMATCH)
4332 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4333 &rex.reg_startp[rp->rs_no]);
4334 regstack_pop(&scan);
4335 break;
4336
4337#ifdef FEAT_SYN_HL
4338 case RS_ZOPEN:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004339 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004340 if (status == RA_NOMATCH)
4341 restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4342 &reg_startzp[rp->rs_no]);
4343 regstack_pop(&scan);
4344 break;
4345#endif
4346
4347 case RS_MCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004348 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004349 if (status == RA_NOMATCH)
4350 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4351 &rex.reg_endp[rp->rs_no]);
4352 regstack_pop(&scan);
4353 break;
4354
4355#ifdef FEAT_SYN_HL
4356 case RS_ZCLOSE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004357 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004358 if (status == RA_NOMATCH)
4359 restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4360 &reg_endzp[rp->rs_no]);
4361 regstack_pop(&scan);
4362 break;
4363#endif
4364
4365 case RS_BRANCH:
4366 if (status == RA_MATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004367 // this branch matched, use it
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004368 regstack_pop(&scan);
4369 else
4370 {
4371 if (status != RA_BREAK)
4372 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004373 // After a non-matching branch: try next one.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004374 reg_restore(&rp->rs_un.regsave, &backpos);
4375 scan = rp->rs_scan;
4376 }
4377 if (scan == NULL || OP(scan) != BRANCH)
4378 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004379 // no more branches, didn't find a match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004380 status = RA_NOMATCH;
4381 regstack_pop(&scan);
4382 }
4383 else
4384 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004385 // Prepare to try a branch.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004386 rp->rs_scan = regnext(scan);
4387 reg_save(&rp->rs_un.regsave, &backpos);
4388 scan = OPERAND(scan);
4389 }
4390 }
4391 break;
4392
4393 case RS_BRCPLX_MORE:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004394 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004395 if (status == RA_NOMATCH)
4396 {
4397 reg_restore(&rp->rs_un.regsave, &backpos);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004398 --brace_count[rp->rs_no]; // decrement match count
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004399 }
4400 regstack_pop(&scan);
4401 break;
4402
4403 case RS_BRCPLX_LONG:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004404 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004405 if (status == RA_NOMATCH)
4406 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004407 // There was no match, but we did find enough matches.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004408 reg_restore(&rp->rs_un.regsave, &backpos);
4409 --brace_count[rp->rs_no];
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004410 // continue with the items after "\{}"
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004411 status = RA_CONT;
4412 }
4413 regstack_pop(&scan);
4414 if (status == RA_CONT)
4415 scan = regnext(scan);
4416 break;
4417
4418 case RS_BRCPLX_SHORT:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004419 // Pop the state. Restore pointers when there is no match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004420 if (status == RA_NOMATCH)
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004421 // There was no match, try to match one more item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004422 reg_restore(&rp->rs_un.regsave, &backpos);
4423 regstack_pop(&scan);
4424 if (status == RA_NOMATCH)
4425 {
4426 scan = OPERAND(scan);
4427 status = RA_CONT;
4428 }
4429 break;
4430
4431 case RS_NOMATCH:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004432 // Pop the state. If the operand matches for NOMATCH or
4433 // doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4434 // except for SUBPAT, and continue with the next item.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004435 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4436 status = RA_NOMATCH;
4437 else
4438 {
4439 status = RA_CONT;
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004440 if (rp->rs_no != SUBPAT) // zero-width
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004441 reg_restore(&rp->rs_un.regsave, &backpos);
4442 }
4443 regstack_pop(&scan);
4444 if (status == RA_CONT)
4445 scan = regnext(scan);
4446 break;
4447
4448 case RS_BEHIND1:
4449 if (status == RA_NOMATCH)
4450 {
4451 regstack_pop(&scan);
4452 regstack.ga_len -= sizeof(regbehind_T);
4453 }
4454 else
4455 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004456 // The stuff after BEHIND/NOBEHIND matches. Now try if
4457 // the behind part does (not) match before the current
4458 // position in the input. This must be done at every
4459 // position in the input and checking if the match ends at
4460 // the current position.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004461
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004462 // save the position after the found match for next
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004463 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4464
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004465 // Start looking for a match with operand at the current
4466 // position. Go back one character until we find the
4467 // result, hitting the start of the line or the previous
4468 // line (for multi-line matching).
4469 // Set behind_pos to where the match should end, BHPOS
4470 // will match it. Save the current value.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004471 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4472 behind_pos = rp->rs_un.regsave;
4473
4474 rp->rs_state = RS_BEHIND2;
4475
4476 reg_restore(&rp->rs_un.regsave, &backpos);
4477 scan = OPERAND(rp->rs_scan) + 4;
4478 }
4479 break;
4480
4481 case RS_BEHIND2:
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004482 // Looping for BEHIND / NOBEHIND match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004483 if (status == RA_MATCH && reg_save_equal(&behind_pos))
4484 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004485 // found a match that ends where "next" started
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004486 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4487 if (rp->rs_no == BEHIND)
4488 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4489 &backpos);
4490 else
4491 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004492 // But we didn't want a match. Need to restore the
4493 // subexpr, because what follows matched, so they have
4494 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004495 status = RA_NOMATCH;
4496 restore_subexpr(((regbehind_T *)rp) - 1);
4497 }
4498 regstack_pop(&scan);
4499 regstack.ga_len -= sizeof(regbehind_T);
4500 }
4501 else
4502 {
4503 long limit;
4504
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004505 // No match or a match that doesn't end where we want it: Go
4506 // back one character. May go to previous line once.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004507 no = OK;
4508 limit = OPERAND_MIN(rp->rs_scan);
4509 if (REG_MULTI)
4510 {
4511 if (limit > 0
4512 && ((rp->rs_un.regsave.rs_u.pos.lnum
4513 < behind_pos.rs_u.pos.lnum
4514 ? (colnr_T)STRLEN(rex.line)
4515 : behind_pos.rs_u.pos.col)
4516 - rp->rs_un.regsave.rs_u.pos.col >= limit))
4517 no = FAIL;
4518 else if (rp->rs_un.regsave.rs_u.pos.col == 0)
4519 {
4520 if (rp->rs_un.regsave.rs_u.pos.lnum
4521 < behind_pos.rs_u.pos.lnum
4522 || reg_getline(
4523 --rp->rs_un.regsave.rs_u.pos.lnum)
4524 == NULL)
4525 no = FAIL;
4526 else
4527 {
4528 reg_restore(&rp->rs_un.regsave, &backpos);
4529 rp->rs_un.regsave.rs_u.pos.col =
4530 (colnr_T)STRLEN(rex.line);
4531 }
4532 }
4533 else
4534 {
4535 if (has_mbyte)
4536 {
4537 char_u *line =
4538 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
4539
4540 rp->rs_un.regsave.rs_u.pos.col -=
4541 (*mb_head_off)(line, line
4542 + rp->rs_un.regsave.rs_u.pos.col - 1) + 1;
4543 }
4544 else
4545 --rp->rs_un.regsave.rs_u.pos.col;
4546 }
4547 }
4548 else
4549 {
4550 if (rp->rs_un.regsave.rs_u.ptr == rex.line)
4551 no = FAIL;
4552 else
4553 {
4554 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
4555 if (limit > 0 && (long)(behind_pos.rs_u.ptr
4556 - rp->rs_un.regsave.rs_u.ptr) > limit)
4557 no = FAIL;
4558 }
4559 }
4560 if (no == OK)
4561 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004562 // Advanced, prepare for finding match again.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004563 reg_restore(&rp->rs_un.regsave, &backpos);
4564 scan = OPERAND(rp->rs_scan) + 4;
4565 if (status == RA_MATCH)
4566 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004567 // We did match, so subexpr may have been changed,
4568 // need to restore them for the next try.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004569 status = RA_NOMATCH;
4570 restore_subexpr(((regbehind_T *)rp) - 1);
4571 }
4572 }
4573 else
4574 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004575 // Can't advance. For NOBEHIND that's a match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004576 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4577 if (rp->rs_no == NOBEHIND)
4578 {
4579 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4580 &backpos);
4581 status = RA_MATCH;
4582 }
4583 else
4584 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004585 // We do want a proper match. Need to restore the
4586 // subexpr if we had a match, because they may have
4587 // been set.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004588 if (status == RA_MATCH)
4589 {
4590 status = RA_NOMATCH;
4591 restore_subexpr(((regbehind_T *)rp) - 1);
4592 }
4593 }
4594 regstack_pop(&scan);
4595 regstack.ga_len -= sizeof(regbehind_T);
4596 }
4597 }
4598 break;
4599
4600 case RS_STAR_LONG:
4601 case RS_STAR_SHORT:
4602 {
4603 regstar_T *rst = ((regstar_T *)rp) - 1;
4604
4605 if (status == RA_MATCH)
4606 {
4607 regstack_pop(&scan);
4608 regstack.ga_len -= sizeof(regstar_T);
4609 break;
4610 }
4611
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004612 // Tried once already, restore input pointers.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004613 if (status != RA_BREAK)
4614 reg_restore(&rp->rs_un.regsave, &backpos);
4615
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004616 // Repeat until we found a position where it could match.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004617 for (;;)
4618 {
4619 if (status != RA_BREAK)
4620 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004621 // Tried first position already, advance.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004622 if (rp->rs_state == RS_STAR_LONG)
4623 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004624 // Trying for longest match, but couldn't or
4625 // didn't match -- back up one char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004626 if (--rst->count < rst->minval)
4627 break;
4628 if (rex.input == rex.line)
4629 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004630 // backup to last char of previous line
Bram Moolenaar6456fae2022-02-22 13:37:31 +00004631 if (rex.lnum == 0)
4632 {
4633 status = RA_NOMATCH;
4634 break;
4635 }
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004636 --rex.lnum;
4637 rex.line = reg_getline(rex.lnum);
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004638 // Just in case regrepeat() didn't count
4639 // right.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004640 if (rex.line == NULL)
4641 break;
4642 rex.input = rex.line + STRLEN(rex.line);
4643 fast_breakcheck();
4644 }
4645 else
4646 MB_PTR_BACK(rex.line, rex.input);
4647 }
4648 else
4649 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004650 // Range is backwards, use shortest match first.
4651 // Careful: maxval and minval are exchanged!
4652 // Couldn't or didn't match: try advancing one
4653 // char.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004654 if (rst->count == rst->minval
4655 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
4656 break;
4657 ++rst->count;
4658 }
4659 if (got_int)
4660 break;
4661 }
4662 else
4663 status = RA_NOMATCH;
4664
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004665 // If it could match, try it.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004666 if (rst->nextb == NUL || *rex.input == rst->nextb
4667 || *rex.input == rst->nextb_ic)
4668 {
4669 reg_save(&rp->rs_un.regsave, &backpos);
4670 scan = regnext(rp->rs_scan);
4671 status = RA_CONT;
4672 break;
4673 }
4674 }
4675 if (status != RA_CONT)
4676 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004677 // Failed.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004678 regstack_pop(&scan);
4679 regstack.ga_len -= sizeof(regstar_T);
4680 status = RA_NOMATCH;
4681 }
4682 }
4683 break;
4684 }
4685
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004686 // If we want to continue the inner loop or didn't pop a state
4687 // continue matching loop
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004688 if (status == RA_CONT || rp == (regitem_T *)
4689 ((char *)regstack.ga_data + regstack.ga_len) - 1)
4690 break;
4691 }
4692
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004693 // May need to continue with the inner loop, starting at "scan".
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004694 if (status == RA_CONT)
4695 continue;
4696
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004697 // If the regstack is empty or something failed we are done.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004698 if (regstack.ga_len == 0 || status == RA_FAIL)
4699 {
4700 if (scan == NULL)
4701 {
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004702 // We get here only if there's trouble -- normally "case END" is
4703 // the terminating point.
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004704 iemsg(_(e_corrupted_regexp_program));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004705#ifdef DEBUG
4706 printf("Premature EOL\n");
4707#endif
4708 }
4709 return (status == RA_MATCH);
4710 }
4711
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004712 } // End of loop until the regstack is empty.
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004713
Bram Moolenaar9490b9a2019-09-08 17:20:12 +02004714 // NOTREACHED
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004715}
4716
4717/*
4718 * regtry - try match of "prog" with at rex.line["col"].
4719 * Returns 0 for failure, number of lines contained in the match otherwise.
4720 */
4721 static long
4722regtry(
4723 bt_regprog_T *prog,
4724 colnr_T col,
4725 proftime_T *tm, // timeout limit or NULL
4726 int *timed_out) // flag set on timeout or NULL
4727{
4728 rex.input = rex.line + col;
4729 rex.need_clear_subexpr = TRUE;
4730#ifdef FEAT_SYN_HL
4731 // Clear the external match subpointers if necessary.
4732 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
4733#endif
4734
4735 if (regmatch(prog->program + 1, tm, timed_out) == 0)
4736 return 0;
4737
4738 cleanup_subexpr();
4739 if (REG_MULTI)
4740 {
4741 if (rex.reg_startpos[0].lnum < 0)
4742 {
4743 rex.reg_startpos[0].lnum = 0;
4744 rex.reg_startpos[0].col = col;
4745 }
4746 if (rex.reg_endpos[0].lnum < 0)
4747 {
4748 rex.reg_endpos[0].lnum = rex.lnum;
4749 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
4750 }
4751 else
4752 // Use line number of "\ze".
4753 rex.lnum = rex.reg_endpos[0].lnum;
4754 }
4755 else
4756 {
4757 if (rex.reg_startp[0] == NULL)
4758 rex.reg_startp[0] = rex.line + col;
4759 if (rex.reg_endp[0] == NULL)
4760 rex.reg_endp[0] = rex.input;
4761 }
4762#ifdef FEAT_SYN_HL
4763 // Package any found \z(...\) matches for export. Default is none.
4764 unref_extmatch(re_extmatch_out);
4765 re_extmatch_out = NULL;
4766
4767 if (prog->reghasz == REX_SET)
4768 {
4769 int i;
4770
4771 cleanup_zsubexpr();
4772 re_extmatch_out = make_extmatch();
Bram Moolenaar7c77b342019-12-22 19:40:40 +01004773 if (re_extmatch_out == NULL)
4774 return 0;
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004775 for (i = 0; i < NSUBEXP; i++)
4776 {
4777 if (REG_MULTI)
4778 {
4779 // Only accept single line matches.
4780 if (reg_startzpos[i].lnum >= 0
4781 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
4782 && reg_endzpos[i].col >= reg_startzpos[i].col)
4783 re_extmatch_out->matches[i] =
4784 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
4785 + reg_startzpos[i].col,
4786 reg_endzpos[i].col - reg_startzpos[i].col);
4787 }
4788 else
4789 {
4790 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
4791 re_extmatch_out->matches[i] =
4792 vim_strnsave(reg_startzp[i],
Bram Moolenaar71ccd032020-06-12 22:59:11 +02004793 reg_endzp[i] - reg_startzp[i]);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004794 }
4795 }
4796 }
4797#endif
4798 return 1 + rex.lnum;
4799}
4800
4801/*
4802 * Match a regexp against a string ("line" points to the string) or multiple
Bram Moolenaardf365142021-05-03 20:01:45 +02004803 * lines (if "line" is NULL, use reg_getline()).
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004804 * Returns 0 for failure, number of lines contained in the match otherwise.
4805 */
4806 static long
4807bt_regexec_both(
4808 char_u *line,
4809 colnr_T col, // column to start looking for match
4810 proftime_T *tm, // timeout limit or NULL
4811 int *timed_out) // flag set on timeout or NULL
4812{
4813 bt_regprog_T *prog;
4814 char_u *s;
4815 long retval = 0L;
4816
4817 // Create "regstack" and "backpos" if they are not allocated yet.
4818 // We allocate *_INITIAL amount of bytes first and then set the grow size
4819 // to much bigger value to avoid many malloc calls in case of deep regular
4820 // expressions.
4821 if (regstack.ga_data == NULL)
4822 {
4823 // Use an item size of 1 byte, since we push different things
4824 // onto the regstack.
4825 ga_init2(&regstack, 1, REGSTACK_INITIAL);
4826 (void)ga_grow(&regstack, REGSTACK_INITIAL);
4827 regstack.ga_growsize = REGSTACK_INITIAL * 8;
4828 }
4829
4830 if (backpos.ga_data == NULL)
4831 {
4832 ga_init2(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
4833 (void)ga_grow(&backpos, BACKPOS_INITIAL);
4834 backpos.ga_growsize = BACKPOS_INITIAL * 8;
4835 }
4836
4837 if (REG_MULTI)
4838 {
4839 prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
4840 line = reg_getline((linenr_T)0);
4841 rex.reg_startpos = rex.reg_mmatch->startpos;
4842 rex.reg_endpos = rex.reg_mmatch->endpos;
4843 }
4844 else
4845 {
4846 prog = (bt_regprog_T *)rex.reg_match->regprog;
4847 rex.reg_startp = rex.reg_match->startp;
4848 rex.reg_endp = rex.reg_match->endp;
4849 }
4850
4851 // Be paranoid...
4852 if (prog == NULL || line == NULL)
4853 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02004854 iemsg(_(e_null_argument));
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02004855 goto theend;
4856 }
4857
4858 // Check validity of program.
4859 if (prog_magic_wrong())
4860 goto theend;
4861
4862 // If the start column is past the maximum column: no need to try.
4863 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4864 goto theend;
4865
4866 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
4867 if (prog->regflags & RF_ICASE)
4868 rex.reg_ic = TRUE;
4869 else if (prog->regflags & RF_NOICASE)
4870 rex.reg_ic = FALSE;
4871
4872 // If pattern contains "\Z" overrule value of rex.reg_icombine
4873 if (prog->regflags & RF_ICOMBINE)
4874 rex.reg_icombine = TRUE;
4875
4876 // If there is a "must appear" string, look for it.
4877 if (prog->regmust != NULL)
4878 {
4879 int c;
4880
4881 if (has_mbyte)
4882 c = (*mb_ptr2char)(prog->regmust);
4883 else
4884 c = *prog->regmust;
4885 s = line + col;
4886
4887 // This is used very often, esp. for ":global". Use three versions of
4888 // the loop to avoid overhead of conditions.
4889 if (!rex.reg_ic && !has_mbyte)
4890 while ((s = vim_strbyte(s, c)) != NULL)
4891 {
4892 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4893 break; // Found it.
4894 ++s;
4895 }
4896 else if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
4897 while ((s = vim_strchr(s, c)) != NULL)
4898 {
4899 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4900 break; // Found it.
4901 MB_PTR_ADV(s);
4902 }
4903 else
4904 while ((s = cstrchr(s, c)) != NULL)
4905 {
4906 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
4907 break; // Found it.
4908 MB_PTR_ADV(s);
4909 }
4910 if (s == NULL) // Not present.
4911 goto theend;
4912 }
4913
4914 rex.line = line;
4915 rex.lnum = 0;
4916 reg_toolong = FALSE;
4917
4918 // Simplest case: Anchored match need be tried only once.
4919 if (prog->reganch)
4920 {
4921 int c;
4922
4923 if (has_mbyte)
4924 c = (*mb_ptr2char)(rex.line + col);
4925 else
4926 c = rex.line[col];
4927 if (prog->regstart == NUL
4928 || prog->regstart == c
4929 || (rex.reg_ic
4930 && (((enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
4931 || (c < 255 && prog->regstart < 255 &&
4932 MB_TOLOWER(prog->regstart) == MB_TOLOWER(c)))))
4933 retval = regtry(prog, col, tm, timed_out);
4934 else
4935 retval = 0;
4936 }
4937 else
4938 {
4939#ifdef FEAT_RELTIME
4940 int tm_count = 0;
4941#endif
4942 // Messy cases: unanchored match.
4943 while (!got_int)
4944 {
4945 if (prog->regstart != NUL)
4946 {
4947 // Skip until the char we know it must start with.
4948 // Used often, do some work to avoid call overhead.
4949 if (!rex.reg_ic && !has_mbyte)
4950 s = vim_strbyte(rex.line + col, prog->regstart);
4951 else
4952 s = cstrchr(rex.line + col, prog->regstart);
4953 if (s == NULL)
4954 {
4955 retval = 0;
4956 break;
4957 }
4958 col = (int)(s - rex.line);
4959 }
4960
4961 // Check for maximum column to try.
4962 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol)
4963 {
4964 retval = 0;
4965 break;
4966 }
4967
4968 retval = regtry(prog, col, tm, timed_out);
4969 if (retval > 0)
4970 break;
4971
4972 // if not currently on the first line, get it again
4973 if (rex.lnum != 0)
4974 {
4975 rex.lnum = 0;
4976 rex.line = reg_getline((linenr_T)0);
4977 }
4978 if (rex.line[col] == NUL)
4979 break;
4980 if (has_mbyte)
4981 col += (*mb_ptr2len)(rex.line + col);
4982 else
4983 ++col;
4984#ifdef FEAT_RELTIME
4985 // Check for timeout once in a twenty times to avoid overhead.
4986 if (tm != NULL && ++tm_count == 20)
4987 {
4988 tm_count = 0;
4989 if (profile_passed_limit(tm))
4990 {
4991 if (timed_out != NULL)
4992 *timed_out = TRUE;
4993 break;
4994 }
4995 }
4996#endif
4997 }
4998 }
4999
5000theend:
5001 // Free "reg_tofree" when it's a bit big.
5002 // Free regstack and backpos if they are bigger than their initial size.
5003 if (reg_tofreelen > 400)
5004 VIM_CLEAR(reg_tofree);
5005 if (regstack.ga_maxlen > REGSTACK_INITIAL)
5006 ga_clear(&regstack);
5007 if (backpos.ga_maxlen > BACKPOS_INITIAL)
5008 ga_clear(&backpos);
5009
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005010 if (retval > 0)
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005011 {
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005012 // Make sure the end is never before the start. Can happen when \zs
5013 // and \ze are used.
5014 if (REG_MULTI)
5015 {
5016 lpos_T *start = &rex.reg_mmatch->startpos[0];
5017 lpos_T *end = &rex.reg_mmatch->endpos[0];
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005018
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005019 if (end->lnum < start->lnum
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005020 || (end->lnum == start->lnum && end->col < start->col))
Bram Moolenaara3d10a52020-12-21 18:24:00 +01005021 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
5022 }
5023 else
5024 {
5025 if (rex.reg_match->endp[0] < rex.reg_match->startp[0])
5026 rex.reg_match->endp[0] = rex.reg_match->startp[0];
5027 }
Bram Moolenaara7a691c2020-12-09 16:36:04 +01005028 }
5029
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005030 return retval;
5031}
5032
5033/*
5034 * Match a regexp against a string.
5035 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5036 * Uses curbuf for line count and 'iskeyword'.
5037 * if "line_lbr" is TRUE consider a "\n" in "line" to be a line break.
5038 *
5039 * Returns 0 for failure, number of lines contained in the match otherwise.
5040 */
5041 static int
5042bt_regexec_nl(
5043 regmatch_T *rmp,
5044 char_u *line, // string to match against
5045 colnr_T col, // column to start looking for match
5046 int line_lbr)
5047{
5048 rex.reg_match = rmp;
5049 rex.reg_mmatch = NULL;
5050 rex.reg_maxline = 0;
5051 rex.reg_line_lbr = line_lbr;
5052 rex.reg_buf = curbuf;
5053 rex.reg_win = NULL;
5054 rex.reg_ic = rmp->rm_ic;
5055 rex.reg_icombine = FALSE;
5056 rex.reg_maxcol = 0;
5057
5058 return bt_regexec_both(line, col, NULL, NULL);
5059}
5060
5061/*
5062 * Match a regexp against multiple lines.
5063 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
5064 * Uses curbuf for line count and 'iskeyword'.
5065 *
5066 * Return zero if there is no match. Return number of lines contained in the
5067 * match otherwise.
5068 */
5069 static long
5070bt_regexec_multi(
5071 regmmatch_T *rmp,
5072 win_T *win, // window in which to search or NULL
5073 buf_T *buf, // buffer in which to search
5074 linenr_T lnum, // nr of line to start looking for match
5075 colnr_T col, // column to start looking for match
5076 proftime_T *tm, // timeout limit or NULL
5077 int *timed_out) // flag set on timeout or NULL
5078{
Bram Moolenaarf4140482020-02-15 23:06:45 +01005079 init_regexec_multi(rmp, win, buf, lnum);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02005080 return bt_regexec_both(NULL, col, tm, timed_out);
5081}
5082
5083/*
5084 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5085 */
5086 static int
5087re_num_cmp(long_u val, char_u *scan)
5088{
5089 long_u n = OPERAND_MIN(scan);
5090
5091 if (OPERAND_CMP(scan) == '>')
5092 return val > n;
5093 if (OPERAND_CMP(scan) == '<')
5094 return val < n;
5095 return val == n;
5096}
5097
5098#ifdef BT_REGEXP_DUMP
5099
5100/*
5101 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5102 */
5103 static void
5104regdump(char_u *pattern, bt_regprog_T *r)
5105{
5106 char_u *s;
5107 int op = EXACTLY; // Arbitrary non-END op.
5108 char_u *next;
5109 char_u *end = NULL;
5110 FILE *f;
5111
5112#ifdef BT_REGEXP_LOG
5113 f = fopen("bt_regexp_log.log", "a");
5114#else
5115 f = stdout;
5116#endif
5117 if (f == NULL)
5118 return;
5119 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern);
5120
5121 s = r->program + 1;
5122 // Loop until we find the END that isn't before a referred next (an END
5123 // can also appear in a NOMATCH operand).
5124 while (op != END || s <= end)
5125 {
5126 op = OP(s);
5127 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); // Where, what.
5128 next = regnext(s);
5129 if (next == NULL) // Next ptr.
5130 fprintf(f, "(0)");
5131 else
5132 fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
5133 if (end < next)
5134 end = next;
5135 if (op == BRACE_LIMITS)
5136 {
5137 // Two ints
5138 fprintf(f, " minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5139 s += 8;
5140 }
5141 else if (op == BEHIND || op == NOBEHIND)
5142 {
5143 // one int
5144 fprintf(f, " count %ld", OPERAND_MIN(s));
5145 s += 4;
5146 }
5147 else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL)
5148 {
5149 // one int plus comparator
5150 fprintf(f, " count %ld", OPERAND_MIN(s));
5151 s += 5;
5152 }
5153 s += 3;
5154 if (op == ANYOF || op == ANYOF + ADD_NL
5155 || op == ANYBUT || op == ANYBUT + ADD_NL
5156 || op == EXACTLY)
5157 {
5158 // Literal string, where present.
5159 fprintf(f, "\nxxxxxxxxx\n");
5160 while (*s != NUL)
5161 fprintf(f, "%c", *s++);
5162 fprintf(f, "\nxxxxxxxxx\n");
5163 s++;
5164 }
5165 fprintf(f, "\r\n");
5166 }
5167
5168 // Header fields of interest.
5169 if (r->regstart != NUL)
5170 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
5171 ? (char *)transchar(r->regstart)
5172 : "multibyte", r->regstart);
5173 if (r->reganch)
5174 fprintf(f, "anchored; ");
5175 if (r->regmust != NULL)
5176 fprintf(f, "must have \"%s\"", r->regmust);
5177 fprintf(f, "\r\n");
5178
5179#ifdef BT_REGEXP_LOG
5180 fclose(f);
5181#endif
5182}
5183#endif // BT_REGEXP_DUMP
5184
5185#ifdef DEBUG
5186/*
5187 * regprop - printable representation of opcode
5188 */
5189 static char_u *
5190regprop(char_u *op)
5191{
5192 char *p;
5193 static char buf[50];
5194
5195 STRCPY(buf, ":");
5196
5197 switch ((int) OP(op))
5198 {
5199 case BOL:
5200 p = "BOL";
5201 break;
5202 case EOL:
5203 p = "EOL";
5204 break;
5205 case RE_BOF:
5206 p = "BOF";
5207 break;
5208 case RE_EOF:
5209 p = "EOF";
5210 break;
5211 case CURSOR:
5212 p = "CURSOR";
5213 break;
5214 case RE_VISUAL:
5215 p = "RE_VISUAL";
5216 break;
5217 case RE_LNUM:
5218 p = "RE_LNUM";
5219 break;
5220 case RE_MARK:
5221 p = "RE_MARK";
5222 break;
5223 case RE_COL:
5224 p = "RE_COL";
5225 break;
5226 case RE_VCOL:
5227 p = "RE_VCOL";
5228 break;
5229 case BOW:
5230 p = "BOW";
5231 break;
5232 case EOW:
5233 p = "EOW";
5234 break;
5235 case ANY:
5236 p = "ANY";
5237 break;
5238 case ANY + ADD_NL:
5239 p = "ANY+NL";
5240 break;
5241 case ANYOF:
5242 p = "ANYOF";
5243 break;
5244 case ANYOF + ADD_NL:
5245 p = "ANYOF+NL";
5246 break;
5247 case ANYBUT:
5248 p = "ANYBUT";
5249 break;
5250 case ANYBUT + ADD_NL:
5251 p = "ANYBUT+NL";
5252 break;
5253 case IDENT:
5254 p = "IDENT";
5255 break;
5256 case IDENT + ADD_NL:
5257 p = "IDENT+NL";
5258 break;
5259 case SIDENT:
5260 p = "SIDENT";
5261 break;
5262 case SIDENT + ADD_NL:
5263 p = "SIDENT+NL";
5264 break;
5265 case KWORD:
5266 p = "KWORD";
5267 break;
5268 case KWORD + ADD_NL:
5269 p = "KWORD+NL";
5270 break;
5271 case SKWORD:
5272 p = "SKWORD";
5273 break;
5274 case SKWORD + ADD_NL:
5275 p = "SKWORD+NL";
5276 break;
5277 case FNAME:
5278 p = "FNAME";
5279 break;
5280 case FNAME + ADD_NL:
5281 p = "FNAME+NL";
5282 break;
5283 case SFNAME:
5284 p = "SFNAME";
5285 break;
5286 case SFNAME + ADD_NL:
5287 p = "SFNAME+NL";
5288 break;
5289 case PRINT:
5290 p = "PRINT";
5291 break;
5292 case PRINT + ADD_NL:
5293 p = "PRINT+NL";
5294 break;
5295 case SPRINT:
5296 p = "SPRINT";
5297 break;
5298 case SPRINT + ADD_NL:
5299 p = "SPRINT+NL";
5300 break;
5301 case WHITE:
5302 p = "WHITE";
5303 break;
5304 case WHITE + ADD_NL:
5305 p = "WHITE+NL";
5306 break;
5307 case NWHITE:
5308 p = "NWHITE";
5309 break;
5310 case NWHITE + ADD_NL:
5311 p = "NWHITE+NL";
5312 break;
5313 case DIGIT:
5314 p = "DIGIT";
5315 break;
5316 case DIGIT + ADD_NL:
5317 p = "DIGIT+NL";
5318 break;
5319 case NDIGIT:
5320 p = "NDIGIT";
5321 break;
5322 case NDIGIT + ADD_NL:
5323 p = "NDIGIT+NL";
5324 break;
5325 case HEX:
5326 p = "HEX";
5327 break;
5328 case HEX + ADD_NL:
5329 p = "HEX+NL";
5330 break;
5331 case NHEX:
5332 p = "NHEX";
5333 break;
5334 case NHEX + ADD_NL:
5335 p = "NHEX+NL";
5336 break;
5337 case OCTAL:
5338 p = "OCTAL";
5339 break;
5340 case OCTAL + ADD_NL:
5341 p = "OCTAL+NL";
5342 break;
5343 case NOCTAL:
5344 p = "NOCTAL";
5345 break;
5346 case NOCTAL + ADD_NL:
5347 p = "NOCTAL+NL";
5348 break;
5349 case WORD:
5350 p = "WORD";
5351 break;
5352 case WORD + ADD_NL:
5353 p = "WORD+NL";
5354 break;
5355 case NWORD:
5356 p = "NWORD";
5357 break;
5358 case NWORD + ADD_NL:
5359 p = "NWORD+NL";
5360 break;
5361 case HEAD:
5362 p = "HEAD";
5363 break;
5364 case HEAD + ADD_NL:
5365 p = "HEAD+NL";
5366 break;
5367 case NHEAD:
5368 p = "NHEAD";
5369 break;
5370 case NHEAD + ADD_NL:
5371 p = "NHEAD+NL";
5372 break;
5373 case ALPHA:
5374 p = "ALPHA";
5375 break;
5376 case ALPHA + ADD_NL:
5377 p = "ALPHA+NL";
5378 break;
5379 case NALPHA:
5380 p = "NALPHA";
5381 break;
5382 case NALPHA + ADD_NL:
5383 p = "NALPHA+NL";
5384 break;
5385 case LOWER:
5386 p = "LOWER";
5387 break;
5388 case LOWER + ADD_NL:
5389 p = "LOWER+NL";
5390 break;
5391 case NLOWER:
5392 p = "NLOWER";
5393 break;
5394 case NLOWER + ADD_NL:
5395 p = "NLOWER+NL";
5396 break;
5397 case UPPER:
5398 p = "UPPER";
5399 break;
5400 case UPPER + ADD_NL:
5401 p = "UPPER+NL";
5402 break;
5403 case NUPPER:
5404 p = "NUPPER";
5405 break;
5406 case NUPPER + ADD_NL:
5407 p = "NUPPER+NL";
5408 break;
5409 case BRANCH:
5410 p = "BRANCH";
5411 break;
5412 case EXACTLY:
5413 p = "EXACTLY";
5414 break;
5415 case NOTHING:
5416 p = "NOTHING";
5417 break;
5418 case BACK:
5419 p = "BACK";
5420 break;
5421 case END:
5422 p = "END";
5423 break;
5424 case MOPEN + 0:
5425 p = "MATCH START";
5426 break;
5427 case MOPEN + 1:
5428 case MOPEN + 2:
5429 case MOPEN + 3:
5430 case MOPEN + 4:
5431 case MOPEN + 5:
5432 case MOPEN + 6:
5433 case MOPEN + 7:
5434 case MOPEN + 8:
5435 case MOPEN + 9:
5436 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5437 p = NULL;
5438 break;
5439 case MCLOSE + 0:
5440 p = "MATCH END";
5441 break;
5442 case MCLOSE + 1:
5443 case MCLOSE + 2:
5444 case MCLOSE + 3:
5445 case MCLOSE + 4:
5446 case MCLOSE + 5:
5447 case MCLOSE + 6:
5448 case MCLOSE + 7:
5449 case MCLOSE + 8:
5450 case MCLOSE + 9:
5451 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5452 p = NULL;
5453 break;
5454 case BACKREF + 1:
5455 case BACKREF + 2:
5456 case BACKREF + 3:
5457 case BACKREF + 4:
5458 case BACKREF + 5:
5459 case BACKREF + 6:
5460 case BACKREF + 7:
5461 case BACKREF + 8:
5462 case BACKREF + 9:
5463 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5464 p = NULL;
5465 break;
5466 case NOPEN:
5467 p = "NOPEN";
5468 break;
5469 case NCLOSE:
5470 p = "NCLOSE";
5471 break;
5472#ifdef FEAT_SYN_HL
5473 case ZOPEN + 1:
5474 case ZOPEN + 2:
5475 case ZOPEN + 3:
5476 case ZOPEN + 4:
5477 case ZOPEN + 5:
5478 case ZOPEN + 6:
5479 case ZOPEN + 7:
5480 case ZOPEN + 8:
5481 case ZOPEN + 9:
5482 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5483 p = NULL;
5484 break;
5485 case ZCLOSE + 1:
5486 case ZCLOSE + 2:
5487 case ZCLOSE + 3:
5488 case ZCLOSE + 4:
5489 case ZCLOSE + 5:
5490 case ZCLOSE + 6:
5491 case ZCLOSE + 7:
5492 case ZCLOSE + 8:
5493 case ZCLOSE + 9:
5494 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5495 p = NULL;
5496 break;
5497 case ZREF + 1:
5498 case ZREF + 2:
5499 case ZREF + 3:
5500 case ZREF + 4:
5501 case ZREF + 5:
5502 case ZREF + 6:
5503 case ZREF + 7:
5504 case ZREF + 8:
5505 case ZREF + 9:
5506 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5507 p = NULL;
5508 break;
5509#endif
5510 case STAR:
5511 p = "STAR";
5512 break;
5513 case PLUS:
5514 p = "PLUS";
5515 break;
5516 case NOMATCH:
5517 p = "NOMATCH";
5518 break;
5519 case MATCH:
5520 p = "MATCH";
5521 break;
5522 case BEHIND:
5523 p = "BEHIND";
5524 break;
5525 case NOBEHIND:
5526 p = "NOBEHIND";
5527 break;
5528 case SUBPAT:
5529 p = "SUBPAT";
5530 break;
5531 case BRACE_LIMITS:
5532 p = "BRACE_LIMITS";
5533 break;
5534 case BRACE_SIMPLE:
5535 p = "BRACE_SIMPLE";
5536 break;
5537 case BRACE_COMPLEX + 0:
5538 case BRACE_COMPLEX + 1:
5539 case BRACE_COMPLEX + 2:
5540 case BRACE_COMPLEX + 3:
5541 case BRACE_COMPLEX + 4:
5542 case BRACE_COMPLEX + 5:
5543 case BRACE_COMPLEX + 6:
5544 case BRACE_COMPLEX + 7:
5545 case BRACE_COMPLEX + 8:
5546 case BRACE_COMPLEX + 9:
5547 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5548 p = NULL;
5549 break;
5550 case MULTIBYTECODE:
5551 p = "MULTIBYTECODE";
5552 break;
5553 case NEWL:
5554 p = "NEWL";
5555 break;
5556 default:
5557 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5558 p = NULL;
5559 break;
5560 }
5561 if (p != NULL)
5562 STRCAT(buf, p);
5563 return (char_u *)buf;
5564}
5565#endif // DEBUG