/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
 */
5
Bram Moolenaarc2d09c92019-04-25 20:07:51 +02006// By default: do not create debugging logs or files related to regular
7// expressions, even when compiling with -DDEBUG.
8// Uncomment the second line to get the regexp debugging.
9#undef DEBUG
10// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020014#ifdef DEBUG
Bram Moolenaar63d9e732019-12-05 21:10:38 +010015// show/save debugging data when BT engine is used
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020016# define BT_REGEXP_DUMP
Bram Moolenaar63d9e732019-12-05 21:10:38 +010017// save the debugging data to a file instead of displaying it
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020018# define BT_REGEXP_LOG
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +020019# define BT_REGEXP_DEBUG_LOG
20# define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020021#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning, they do not match literally.
 * A Magic character is stored as a negative number, which separates it from
 * literal characters (possibly multi-byte).  Only ASCII characters can be
 * Magic.
 */
#define Magic(x)        ((int)(x) - 256)
#define un_Magic(x)     ((x) + 256)
#define is_Magic(x)     ((x) < 0)

/*
 * Return the literal character for "x": a Magic value is converted back to
 * its character, any other value is returned unchanged.
 */
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}

/*
 * Flip the magicness of "x": a Magic value becomes the literal character and
 * a literal character becomes Magic.
 */
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47
/*
 * The first byte of the BT regexp internal "program" is this magic number;
 * the start node begins in the second byte.  It is used to catch the most
 * severe mutilation of the program by the caller.
 */
#define REGMAGIC        0234

/*
 * Utility definitions.
 */
#define UCHARAT(p)      ((int)*(char_u *)(p))

// Used for an error (down from) vim_regcomp(): give the error message, set
// rc_did_emsg and return NULL (the _RET_NULL variants) or FAIL (the _RET_FAIL
// variants).  The EMSG2/EMSG3 variants pass "" to the message when "c" is
// non-zero and "\" otherwise.
#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_(e_invalid_item_in_str_brackets), reg_magic == MAGIC_ALL)


#define MAX_LIMIT       (32767L << 16L)

// return values for re_multi_type()
#define NOT_MULTI       0
#define MULTI_ONE       1
#define MULTI_MULT      2

// return values for regmatch()
#define RA_FAIL         1       // something failed, abort
#define RA_CONT         2       // continue in inner loop
#define RA_BREAK        3       // break inner loop
#define RA_MATCH        4       // successful match
#define RA_NOMATCH      5       // didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020084
Bram Moolenaar071d4272004-06-13 20:20:40 +000085/*
86 * Return NOT_MULTI if c is not a "multi" operator.
87 * Return MULTI_ONE if c is a single "multi" operator.
88 * Return MULTI_MULT if c is a multi "multi" operator.
89 */
90 static int
Bram Moolenaar05540972016-01-30 20:31:25 +010091re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +000092{
93 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
94 return MULTI_ONE;
95 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
96 return MULTI_MULT;
97 return NOT_MULTI;
98}
99
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000100static char_u *reg_prev_sub = NULL;
101
Bram Moolenaar071d4272004-06-13 20:20:40 +0000102/*
103 * REGEXP_INRANGE contains all characters which are always special in a []
104 * range after '\'.
105 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
106 * These are:
107 * \n - New line (NL).
108 * \r - Carriage Return (CR).
109 * \t - Tab (TAB).
110 * \e - Escape (ESC).
111 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000112 * \d - Character code in decimal, eg \d123
113 * \o - Character code in octal, eg \o80
114 * \x - Character code in hex, eg \x4a
115 * \u - Multibyte character code, eg \u20ac
116 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000117 */
118static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000119static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000120
Bram Moolenaar071d4272004-06-13 20:20:40 +0000121/*
122 * Translate '\x' to its control character, except "\n", which is Magic.
123 */
124 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100125backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000126{
127 switch (c)
128 {
129 case 'r': return CAR;
130 case 't': return TAB;
131 case 'e': return ESC;
132 case 'b': return BS;
133 }
134 return c;
135}
136
137/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000138 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000139 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
140 * recognized. Otherwise "pp" is advanced to after the item.
141 */
142 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100143get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000144{
145 static const char *(class_names[]) =
146 {
147 "alnum:]",
148#define CLASS_ALNUM 0
149 "alpha:]",
150#define CLASS_ALPHA 1
151 "blank:]",
152#define CLASS_BLANK 2
153 "cntrl:]",
154#define CLASS_CNTRL 3
155 "digit:]",
156#define CLASS_DIGIT 4
157 "graph:]",
158#define CLASS_GRAPH 5
159 "lower:]",
160#define CLASS_LOWER 6
161 "print:]",
162#define CLASS_PRINT 7
163 "punct:]",
164#define CLASS_PUNCT 8
165 "space:]",
166#define CLASS_SPACE 9
167 "upper:]",
168#define CLASS_UPPER 10
169 "xdigit:]",
170#define CLASS_XDIGIT 11
171 "tab:]",
172#define CLASS_TAB 12
173 "return:]",
174#define CLASS_RETURN 13
175 "backspace:]",
176#define CLASS_BACKSPACE 14
177 "escape:]",
178#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100179 "ident:]",
180#define CLASS_IDENT 16
181 "keyword:]",
182#define CLASS_KEYWORD 17
183 "fname:]",
184#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000185 };
186#define CLASS_NONE 99
187 int i;
188
189 if ((*pp)[1] == ':')
190 {
K.Takataeeec2542021-06-02 13:28:16 +0200191 for (i = 0; i < (int)ARRAY_LENGTH(class_names); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000192 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
193 {
194 *pp += STRLEN(class_names[i]) + 2;
195 return i;
196 }
197 }
198 return CLASS_NONE;
199}
200
/*
 * Specific version of character class functions.
 * Using a table to keep this fast: class_tab[] has one entry per byte value,
 * holding the RI_ flags that apply to that character.
 */
static short    class_tab[256];

#define RI_DIGIT    0x01
#define RI_HEX      0x02
#define RI_OCTAL    0x04
#define RI_WORD     0x08
#define RI_HEAD     0x10
#define RI_ALPHA    0x20
#define RI_LOWER    0x40
#define RI_UPPER    0x80
#define RI_WHITE    0x100
216
217 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100218init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000219{
220 int i;
221 static int done = FALSE;
222
223 if (done)
224 return;
225
226 for (i = 0; i < 256; ++i)
227 {
228 if (i >= '0' && i <= '7')
229 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
230 else if (i >= '8' && i <= '9')
231 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
232 else if (i >= 'a' && i <= 'f')
233 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000234 else if (i >= 'g' && i <= 'z')
Bram Moolenaar071d4272004-06-13 20:20:40 +0000235 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
236 else if (i >= 'A' && i <= 'F')
237 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000238 else if (i >= 'G' && i <= 'Z')
Bram Moolenaar071d4272004-06-13 20:20:40 +0000239 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
240 else if (i == '_')
241 class_tab[i] = RI_WORD + RI_HEAD;
242 else
243 class_tab[i] = 0;
244 }
245 class_tab[' '] |= RI_WHITE;
246 class_tab['\t'] |= RI_WHITE;
247 done = TRUE;
248}
249
// Character class tests using class_tab[].  A character of 0x100 or above is
// outside the table and never matches.
#define ri_digit(c)     ((c) < 0x100 && (class_tab[c] & RI_DIGIT))
#define ri_hex(c)       ((c) < 0x100 && (class_tab[c] & RI_HEX))
#define ri_octal(c)     ((c) < 0x100 && (class_tab[c] & RI_OCTAL))
#define ri_word(c)      ((c) < 0x100 && (class_tab[c] & RI_WORD))
#define ri_head(c)      ((c) < 0x100 && (class_tab[c] & RI_HEAD))
#define ri_alpha(c)     ((c) < 0x100 && (class_tab[c] & RI_ALPHA))
#define ri_lower(c)     ((c) < 0x100 && (class_tab[c] & RI_LOWER))
#define ri_upper(c)     ((c) < 0x100 && (class_tab[c] & RI_UPPER))
#define ri_white(c)     ((c) < 0x100 && (class_tab[c] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000259
// Flags for regflags; each one is a separate bit.
#define RF_ICASE    1       // ignore case
#define RF_NOICASE  2       // don't ignore case
#define RF_HASNL    4       // can match a NL
#define RF_ICOMBINE 8       // ignore combining characters
#define RF_LOOKBH   16      // uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000266
267/*
268 * Global work variables for vim_regcomp().
269 */
270
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100271static char_u *regparse; // Input-scan pointer.
272static int regnpar; // () count.
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100273static int wants_nfa; // regex should use NFA engine
Bram Moolenaar071d4272004-06-13 20:20:40 +0000274#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100275static int regnzpar; // \z() count.
276static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000277#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100278static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000279#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100280static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000281#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000282
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100283static magic_T reg_magic; // magicness of the pattern
Bram Moolenaar071d4272004-06-13 20:20:40 +0000284
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100285static int reg_string; // matching with a string instead of a buffer
286 // line
287static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000288
289/*
290 * META contains all characters that may be magic, except '^' and '$'.
291 */
292
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100293// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000294static char_u META_flags[] = {
295 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
296 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100297// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000298 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100299// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000300 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100301// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000302 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100303// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000304 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100305// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000306 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100307// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000308 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
309};
Bram Moolenaar071d4272004-06-13 20:20:40 +0000310
// Look-ahead state of the pattern lexer.
static int      curchr;         // currently parsed character
// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, e.g. in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int      prevchr;
static int      prevprevchr;    // previous-previous character
static int      nextchr;        // used for ungetchr()

// arguments for reg()
#define REG_NOPAREN     0       // toplevel reg()
#define REG_PAREN       1       // \(\)
#define REG_ZPAREN      2       // \z(\)
#define REG_NPAREN      3       // \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000324
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200325typedef struct
326{
327 char_u *regparse;
328 int prevchr_len;
329 int curchr;
330 int prevchr;
331 int prevprevchr;
332 int nextchr;
333 int at_start;
334 int prev_at_start;
335 int regnpar;
336} parse_state_T;
337
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100338static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100339static int getchr(void);
340static void skipchr_keepstart(void);
341static int peekchr(void);
342static void skipchr(void);
343static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100344static long gethexchrs(int maxinputlen);
345static long getoctchrs(void);
346static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100347static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100348static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200349static int cstrncmp(char_u *s1, char_u *s2, int *n);
350static char_u *cstrchr(char_u *, int);
351static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100352static int reg_iswordc(int);
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100353#ifdef FEAT_EVAL
354static void report_re_switch(char_u *pat);
355#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000356
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200357static regengine_T bt_regengine;
358static regengine_T nfa_regengine;
359
Bram Moolenaar071d4272004-06-13 20:20:40 +0000360/*
361 * Return TRUE if compiled regular expression "prog" can match a line break.
362 */
363 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100364re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000365{
366 return (prog->regflags & RF_HASNL);
367}
368
369/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000370 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
371 * Returns a character representing the class. Zero means that no item was
372 * recognized. Otherwise "pp" is advanced to after the item.
373 */
374 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100375get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000376{
377 int c;
378 int l = 1;
379 char_u *p = *pp;
380
Bram Moolenaar985079c2019-02-16 17:07:47 +0100381 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000382 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000383 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000384 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000385 if (p[l + 2] == '=' && p[l + 3] == ']')
386 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000387 if (has_mbyte)
388 c = mb_ptr2char(p + 2);
389 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000390 c = p[2];
391 *pp += l + 4;
392 return c;
393 }
394 }
395 return 0;
396}
397
398/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000399 * Check for a collating element "[.a.]". "pp" points to the '['.
400 * Returns a character. Zero means that no item was recognized. Otherwise
401 * "pp" is advanced to after the item.
402 * Currently only single characters are recognized!
403 */
404 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100405get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000406{
407 int c;
408 int l = 1;
409 char_u *p = *pp;
410
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100411 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000412 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000413 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000414 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000415 if (p[l + 2] == '.' && p[l + 3] == ']')
416 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000417 if (has_mbyte)
418 c = mb_ptr2char(p + 2);
419 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000420 c = p[2];
421 *pp += l + 4;
422 return c;
423 }
424 }
425 return 0;
426}
427
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100428static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
429static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200430
431 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100432get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200433{
434 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
435 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
436}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000437
438/*
439 * Skip over a "[]" range.
440 * "p" must point to the character after the '['.
441 * The returned pointer is on the matching ']', or the terminating NUL.
442 */
443 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100444skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000445{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000446 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000447
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100448 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000449 ++p;
450 if (*p == ']' || *p == '-')
451 ++p;
452 while (*p != NUL && *p != ']')
453 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000454 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000455 p += l;
456 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000457 if (*p == '-')
458 {
459 ++p;
460 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100461 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000462 }
463 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200464 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000465 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200466 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000467 p += 2;
468 else if (*p == '[')
469 {
470 if (get_char_class(&p) == CLASS_NONE
471 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200472 && get_coll_element(&p) == 0
473 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100474 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000475 }
476 else
477 ++p;
478 }
479
480 return p;
481}
482
483/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000484 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200485 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000486 * Take care of characters with a backslash in front of it.
487 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000488 */
489 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100490skip_regexp(
491 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200492 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200493 int magic)
494{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100495 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200496}
497
498/*
499 * Call skip_regexp() and when the delimiter does not match give an error and
500 * return NULL.
501 */
502 char_u *
503skip_regexp_err(
504 char_u *startp,
505 int delim,
506 int magic)
507{
508 char_u *p = skip_regexp(startp, delim, magic);
509
510 if (*p != delim)
511 {
Bram Moolenaara6f79292022-01-04 21:30:47 +0000512 semsg(_(e_missing_delimiter_after_search_pattern_str), startp);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200513 return NULL;
514 }
515 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200516}
517
518/*
519 * skip_regexp() with extra arguments:
520 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
521 * expression and change "\?" to "?". If "*newp" is not NULL the expression
522 * is changed in-place.
523 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100524 * If "magic_val" is not NULL, returns the effective magicness of the pattern
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200525 */
526 char_u *
527skip_regexp_ex(
528 char_u *startp,
529 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100530 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200531 char_u **newp,
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100532 int *dropped,
533 magic_T *magic_val)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000534{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100535 magic_T mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000536 char_u *p = startp;
537
538 if (magic)
539 mymagic = MAGIC_ON;
540 else
541 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200542 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000543
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100544 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000545 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100546 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000547 break;
548 if ((p[0] == '[' && mymagic >= MAGIC_ON)
549 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
550 {
551 p = skip_anyof(p + 1);
552 if (p[0] == NUL)
553 break;
554 }
555 else if (p[0] == '\\' && p[1] != NUL)
556 {
557 if (dirc == '?' && newp != NULL && p[1] == '?')
558 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100559 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000560 if (*newp == NULL)
561 {
562 *newp = vim_strsave(startp);
563 if (*newp != NULL)
564 p = *newp + (p - startp);
565 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200566 if (dropped != NULL)
567 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000568 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000569 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000570 else
571 ++p;
572 }
573 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100574 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000575 if (*p == 'v')
576 mymagic = MAGIC_ALL;
577 else if (*p == 'V')
578 mymagic = MAGIC_NONE;
579 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000580 }
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100581 if (magic_val != NULL)
582 *magic_val = mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000583 return p;
584}
585
/*
 * Functions for getting characters from the regexp input.
 */
static int      prevchr_len;    // byte length of previous char
static int      at_start;       // True when on the first character
static int      prev_at_start;  // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100592
Bram Moolenaar071d4272004-06-13 20:20:40 +0000593/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200594 * Start parsing at "str".
595 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000596 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100597initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000598{
599 regparse = str;
600 prevchr_len = 0;
601 curchr = prevprevchr = prevchr = nextchr = -1;
602 at_start = TRUE;
603 prev_at_start = FALSE;
604}
605
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200606/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200607 * Save the current parse state, so that it can be restored and parsing
608 * starts in the same state again.
609 */
610 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100611save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200612{
613 ps->regparse = regparse;
614 ps->prevchr_len = prevchr_len;
615 ps->curchr = curchr;
616 ps->prevchr = prevchr;
617 ps->prevprevchr = prevprevchr;
618 ps->nextchr = nextchr;
619 ps->at_start = at_start;
620 ps->prev_at_start = prev_at_start;
621 ps->regnpar = regnpar;
622}
623
624/*
625 * Restore a previously saved parse state.
626 */
627 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100628restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200629{
630 regparse = ps->regparse;
631 prevchr_len = ps->prevchr_len;
632 curchr = ps->curchr;
633 prevchr = ps->prevchr;
634 prevprevchr = ps->prevprevchr;
635 nextchr = ps->nextchr;
636 at_start = ps->at_start;
637 prev_at_start = ps->prev_at_start;
638 regnpar = ps->regnpar;
639}
640
641
642/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200643 * Get the next character without advancing.
644 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000645 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100646peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000647{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000648 static int after_slash = FALSE;
649
Bram Moolenaar071d4272004-06-13 20:20:40 +0000650 if (curchr == -1)
651 {
652 switch (curchr = regparse[0])
653 {
654 case '.':
655 case '[':
656 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100657 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000658 if (reg_magic >= MAGIC_ON)
659 curchr = Magic(curchr);
660 break;
661 case '(':
662 case ')':
663 case '{':
664 case '%':
665 case '+':
666 case '=':
667 case '?':
668 case '@':
669 case '!':
670 case '&':
671 case '|':
672 case '<':
673 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100674 case '#': // future ext.
675 case '"': // future ext.
676 case '\'': // future ext.
677 case ',': // future ext.
678 case '-': // future ext.
679 case ':': // future ext.
680 case ';': // future ext.
681 case '`': // future ext.
682 case '/': // Can't be used in / command
683 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000684 if (reg_magic == MAGIC_ALL)
685 curchr = Magic(curchr);
686 break;
687 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100688 // * is not magic as the very first character, eg "?*ptr", when
689 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
690 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000691 if (reg_magic >= MAGIC_ON
692 && !at_start
693 && !(prev_at_start && prevchr == Magic('^'))
694 && (after_slash
695 || (prevchr != Magic('(')
696 && prevchr != Magic('&')
697 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000698 curchr = Magic('*');
699 break;
700 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100701 // '^' is only magic as the very first character and if it's after
702 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000703 if (reg_magic >= MAGIC_OFF
704 && (at_start
705 || reg_magic == MAGIC_ALL
706 || prevchr == Magic('(')
707 || prevchr == Magic('|')
708 || prevchr == Magic('&')
709 || prevchr == Magic('n')
710 || (no_Magic(prevchr) == '('
711 && prevprevchr == Magic('%'))))
712 {
713 curchr = Magic('^');
714 at_start = TRUE;
715 prev_at_start = FALSE;
716 }
717 break;
718 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100719 // '$' is only magic as the very last char and if it's in front of
720 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000721 if (reg_magic >= MAGIC_OFF)
722 {
723 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200724 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000725
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100726 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000727 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200728 || p[1] == 'm' || p[1] == 'M'
729 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
730 {
731 if (p[1] == 'v')
732 is_magic_all = TRUE;
733 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
734 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000735 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200736 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000737 if (p[0] == NUL
738 || (p[0] == '\\'
739 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
740 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200741 || (is_magic_all
742 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000743 || reg_magic == MAGIC_ALL)
744 curchr = Magic('$');
745 }
746 break;
747 case '\\':
748 {
749 int c = regparse[1];
750
751 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100752 curchr = '\\'; // trailing '\'
Bram Moolenaar424bcae2022-01-31 14:59:41 +0000753 else if (c <= '~' && META_flags[c])
Bram Moolenaar071d4272004-06-13 20:20:40 +0000754 {
755 /*
756 * META contains everything that may be magic sometimes,
757 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200758 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000759 * magicness. Therefore, \ is so meta-magic that it is
760 * not in META.
761 */
762 curchr = -1;
763 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100764 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000765 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000766 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000767 peekchr();
768 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000769 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000770 curchr = toggle_Magic(curchr);
771 }
772 else if (vim_strchr(REGEXP_ABBR, c))
773 {
774 /*
775 * Handle abbreviations, like "\t" for TAB -- webb
776 */
777 curchr = backslash_trans(c);
778 }
779 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
780 curchr = toggle_Magic(c);
781 else
782 {
783 /*
784 * Next character can never be (made) magic?
785 * Then backslashing it won't do anything.
786 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000787 if (has_mbyte)
788 curchr = (*mb_ptr2char)(regparse + 1);
789 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000790 curchr = c;
791 }
792 break;
793 }
794
Bram Moolenaar071d4272004-06-13 20:20:40 +0000795 default:
796 if (has_mbyte)
797 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000798 }
799 }
800
801 return curchr;
802}
803
804/*
805 * Eat one lexed character. Do this in a way that we can undo it.
806 */
807 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100808skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000809{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100810 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000811 if (*regparse == '\\')
812 prevchr_len = 1;
813 else
814 prevchr_len = 0;
815 if (regparse[prevchr_len] != NUL)
816 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000817 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100818 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000819 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000820 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000821 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000822 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000823 ++prevchr_len;
824 }
825 regparse += prevchr_len;
826 prev_at_start = at_start;
827 at_start = FALSE;
828 prevprevchr = prevchr;
829 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100830 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000831 nextchr = -1;
832}
833
834/*
835 * Skip a character while keeping the value of prev_at_start for at_start.
836 * prevchr and prevprevchr are also kept.
837 */
838 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100839skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000840{
841 int as = prev_at_start;
842 int pr = prevchr;
843 int prpr = prevprevchr;
844
845 skipchr();
846 at_start = as;
847 prevchr = pr;
848 prevprevchr = prpr;
849}
850
/*
 * Lexical analyzer entry point: return the next character of the pattern
 * (as seen by peekchr(), which knows about magic) and consume it.
 */
    static int
getchr(void)
{
    int lexed = peekchr();

    skipchr();
    return lexed;
}
863
864/*
865 * put character back. Works only once!
866 */
867 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100868ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000869{
870 nextchr = curchr;
871 curchr = prevchr;
872 prevchr = prevprevchr;
873 at_start = prev_at_start;
874 prev_at_start = FALSE;
875
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100876 // Backup regparse, so that it's at the same position as before the
877 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000878 regparse -= prevchr_len;
879}
880
881/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000882 * Get and return the value of the hex string at the current position.
883 * Return -1 if there is no valid hex number.
884 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000885 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000886 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000887 * The parameter controls the maximum number of input characters. This will be
888 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
889 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100890 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100891gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000892{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100893 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000894 int c;
895 int i;
896
897 for (i = 0; i < maxinputlen; ++i)
898 {
899 c = regparse[0];
900 if (!vim_isxdigit(c))
901 break;
902 nr <<= 4;
903 nr |= hex2nr(c);
904 ++regparse;
905 }
906
907 if (i == 0)
908 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100909 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000910}
911
912/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200913 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000914 * current position. Return -1 for invalid. Consumes all digits.
915 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100916 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100917getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000918{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100919 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000920 int c;
921 int i;
922
923 for (i = 0; ; ++i)
924 {
925 c = regparse[0];
926 if (c < '0' || c > '9')
927 break;
928 nr *= 10;
929 nr += c - '0';
930 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100931 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000932 }
933
934 if (i == 0)
935 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100936 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000937}
938
939/*
940 * get and return the value of the octal string immediately after the current
941 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
942 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
943 * treat 8 or 9 as recognised characters. Position is updated:
944 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000945 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000946 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100947 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100948getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000949{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100950 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000951 int c;
952 int i;
953
954 for (i = 0; i < 3 && nr < 040; ++i)
955 {
956 c = regparse[0];
957 if (c < '0' || c > '7')
958 break;
959 nr <<= 3;
960 nr |= hex2nr(c);
961 ++regparse;
962 }
963
964 if (i == 0)
965 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100966 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000967}
968
969/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000970 * read_limits - Read two integers to be taken as a minimum and maximum.
971 * If the first character is '-', then the range is reversed.
972 * Should end with 'end'. If minval is missing, zero is default, if maxval is
973 * missing, a very big number is the default.
974 */
975 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100976read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000977{
978 int reverse = FALSE;
979 char_u *first_char;
980 long tmp;
981
982 if (*regparse == '-')
983 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100984 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +0000985 regparse++;
986 reverse = TRUE;
987 }
988 first_char = regparse;
989 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100990 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +0000991 {
992 if (vim_isdigit(*++regparse))
993 *maxval = getdigits(&regparse);
994 else
995 *maxval = MAX_LIMIT;
996 }
997 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100998 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +0000999 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001000 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001001 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001002 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001003 if (*regparse != '}')
Bram Moolenaar1d423ef2022-01-02 21:26:16 +00001004 EMSG2_RET_FAIL(_(e_syntax_error_in_str_curlies),
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001005 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001006
1007 /*
1008 * Reverse the range if there was a '-', or make sure it is in the right
1009 * order otherwise.
1010 */
1011 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1012 {
1013 tmp = *minval;
1014 *minval = *maxval;
1015 *maxval = tmp;
1016 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001017 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001018 return OK;
1019}
1020
1021/*
1022 * vim_regexec and friends
1023 */
1024
1025/*
1026 * Global work variables for vim_regexec().
1027 */
1028
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001029static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001030#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001031static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001032#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001033static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001034static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001035
1036/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001037 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1038 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001039 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001040 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001041static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001042static unsigned reg_tofreelen;
1043
1044/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001045 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001046 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001047 * done:
1048 * single-line multi-line
1049 * reg_match &regmatch_T NULL
1050 * reg_mmatch NULL &regmmatch_T
1051 * reg_startp reg_match->startp <invalid>
1052 * reg_endp reg_match->endp <invalid>
1053 * reg_startpos <invalid> reg_mmatch->startpos
1054 * reg_endpos <invalid> reg_mmatch->endpos
1055 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001056 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001057 * reg_firstlnum <invalid> first line in which to search
1058 * reg_maxline 0 last line nr
1059 * reg_line_lbr FALSE or TRUE FALSE
1060 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001061typedef struct {
1062 regmatch_T *reg_match;
1063 regmmatch_T *reg_mmatch;
1064 char_u **reg_startp;
1065 char_u **reg_endp;
1066 lpos_T *reg_startpos;
1067 lpos_T *reg_endpos;
1068 win_T *reg_win;
1069 buf_T *reg_buf;
1070 linenr_T reg_firstlnum;
1071 linenr_T reg_maxline;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001072 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001073
Bram Moolenaar0270f382018-07-17 05:43:58 +02001074 // The current match-position is stord in these variables:
1075 linenr_T lnum; // line number, relative to first line
1076 char_u *line; // start of current line
Bram Moolenaar64066b92021-11-17 18:22:56 +00001077 char_u *input; // current input, points into "line"
Bram Moolenaar0270f382018-07-17 05:43:58 +02001078
1079 int need_clear_subexpr; // subexpressions still need to be cleared
1080#ifdef FEAT_SYN_HL
1081 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1082 // cleared
1083#endif
1084
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001085 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1086 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1087 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001088 int reg_ic;
1089
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001090 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1091 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001092 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001093
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001094 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1095 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001096 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001097
1098 // State for the NFA engine regexec.
1099 int nfa_has_zend; // NFA regexp \ze operator encountered.
1100 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1101 int nfa_nsubexpr; // Number of sub expressions actually being used
1102 // during execution. 1 if only the whole match
1103 // (subexpr 0) is used.
1104 // listid is global, so that it increases on recursive calls to
1105 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1106 // all the states.
1107 int nfa_listid;
1108 int nfa_alt_listid;
1109
1110#ifdef FEAT_SYN_HL
1111 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1112#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001113} regexec_T;
1114
1115static regexec_T rex;
1116static int rex_in_use = FALSE;
1117
Bram Moolenaar071d4272004-06-13 20:20:40 +00001118/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001119 * Return TRUE if character 'c' is included in 'iskeyword' option for
1120 * "reg_buf" buffer.
1121 */
1122 static int
1123reg_iswordc(int c)
1124{
1125 return vim_iswordc_buf(c, rex.reg_buf);
1126}
1127
1128/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001129 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1130 */
1131 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001132reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001133{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001134 // when looking behind for a match/no-match lnum is negative. But we
1135 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001136 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001137 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001138 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001139 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001140 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001141 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001142}
1143
#ifdef FEAT_SYN_HL
// Workspace for the start and end of \z(...\) matches: pointers first, then
// the same as line/column positions.
static char_u   *reg_startzp[NSUBEXP];
static char_u   *reg_endzp[NSUBEXP];
static lpos_T   reg_startzpos[NSUBEXP];
static lpos_T   reg_endzpos[NSUBEXP];
#endif

// TRUE when a multi-line regexp is used, which is when "rex.reg_match" is
// not set.
#define REG_MULTI       (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001153
Bram Moolenaar071d4272004-06-13 20:20:40 +00001154#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001155/*
1156 * Create a new extmatch and mark it as referenced once.
1157 */
1158 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001159make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001160{
1161 reg_extmatch_T *em;
1162
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001163 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001164 if (em != NULL)
1165 em->refcnt = 1;
1166 return em;
1167}
1168
1169/*
1170 * Add a reference to an extmatch.
1171 */
1172 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001173ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001174{
1175 if (em != NULL)
1176 em->refcnt++;
1177 return em;
1178}
1179
1180/*
1181 * Remove a reference to an extmatch. If there are no references left, free
1182 * the info.
1183 */
1184 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001185unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001186{
1187 int i;
1188
1189 if (em != NULL && --em->refcnt <= 0)
1190 {
1191 for (i = 0; i < NSUBEXP; ++i)
1192 vim_free(em->matches[i]);
1193 vim_free(em);
1194 }
1195}
1196#endif
1197
1198/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001199 * Get class of previous character.
1200 */
1201 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001202reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001203{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001204 if (rex.input > rex.line)
1205 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001206 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001207 return -1;
1208}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001209
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001210/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001211 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001212 */
1213 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001214reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001215{
1216 pos_T top, bot;
1217 linenr_T lnum;
1218 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001219 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001220 int mode;
1221 colnr_T start, end;
1222 colnr_T start2, end2;
1223 colnr_T cols;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001224 colnr_T curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001225
Bram Moolenaar679d66c2022-01-30 16:42:56 +00001226 // Check if the buffer is the current buffer and not using a string.
Bram Moolenaar44a4d942022-01-30 17:17:41 +00001227 if (rex.reg_buf != curbuf || VIsual.lnum == 0 || !REG_MULTI)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001228 return FALSE;
1229
1230 if (VIsual_active)
1231 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001232 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001233 {
1234 top = VIsual;
1235 bot = wp->w_cursor;
1236 }
1237 else
1238 {
1239 top = wp->w_cursor;
1240 bot = VIsual;
1241 }
1242 mode = VIsual_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001243 curswant = wp->w_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001244 }
1245 else
1246 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001247 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001248 {
1249 top = curbuf->b_visual.vi_start;
1250 bot = curbuf->b_visual.vi_end;
1251 }
1252 else
1253 {
1254 top = curbuf->b_visual.vi_end;
1255 bot = curbuf->b_visual.vi_start;
1256 }
1257 mode = curbuf->b_visual.vi_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001258 curswant = curbuf->b_visual.vi_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001259 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001260 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001261 if (lnum < top.lnum || lnum > bot.lnum)
1262 return FALSE;
1263
Bram Moolenaar4c13e5e2021-12-30 14:49:43 +00001264 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001265 if (mode == 'v')
1266 {
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001267 if ((lnum == top.lnum && col < top.col)
1268 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1269 return FALSE;
1270 }
1271 else if (mode == Ctrl_V)
1272 {
1273 getvvcol(wp, &top, &start, NULL, &end);
1274 getvvcol(wp, &bot, &start2, NULL, &end2);
1275 if (start2 < start)
1276 start = start2;
1277 if (end2 > end)
1278 end = end2;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001279 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001280 end = MAXCOL;
Bram Moolenaar4c13e5e2021-12-30 14:49:43 +00001281
1282 // getvvcol() flushes rex.line, need to get it again
1283 rex.line = reg_getline(rex.lnum);
1284 rex.input = rex.line + col;
1285
1286 cols = win_linetabsize(wp, rex.line, col);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001287 if (cols < start || cols > end - (*p_sel == 'e'))
1288 return FALSE;
1289 }
1290 return TRUE;
1291}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001292
Bram Moolenaar071d4272004-06-13 20:20:40 +00001293/*
1294 * Check the regexp program for its magic number.
1295 * Return TRUE if it's wrong.
1296 */
1297 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001298prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001299{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001300 regprog_T *prog;
1301
Bram Moolenaar6100d022016-10-02 16:51:57 +02001302 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001303 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001304 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001305 return FALSE;
1306
1307 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001308 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001309 emsg(_(e_corrupted_regexp_program));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001310 return TRUE;
1311 }
1312 return FALSE;
1313}
1314
1315/*
1316 * Cleanup the subexpressions, if this wasn't done yet.
1317 * This construction is used to clear the subexpressions only when they are
1318 * used (to increase speed).
1319 */
1320 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001321cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001322{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001323 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001324 {
1325 if (REG_MULTI)
1326 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001327 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001328 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1329 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001330 }
1331 else
1332 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001333 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1334 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001335 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001336 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001337 }
1338}
1339
#ifdef FEAT_SYN_HL
/*
 * Clear the \z(...\) subexpressions, unless that was already done, as
 * recorded in "rex.need_clear_zsubexpr".
 */
    static void
cleanup_zsubexpr(void)
{
    if (!rex.need_clear_zsubexpr)
        return;

    if (REG_MULTI)
    {
        // Filling with 0xff bytes sets lnum to -1.
        vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
        vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
    }
    else
    {
        // Zero bytes for the pointer arrays.
        vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
        vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
    }
    rex.need_clear_zsubexpr = FALSE;
}
#endif
1361
1362/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001363 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001364 */
1365 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001366reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001367{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001368 rex.line = reg_getline(++rex.lnum);
1369 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001370 fast_breakcheck();
1371}
1372
1373/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001374 * Check whether a backreference matches.
1375 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001376 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1377 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001378 */
1379 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001380match_with_backref(
1381 linenr_T start_lnum,
1382 colnr_T start_col,
1383 linenr_T end_lnum,
1384 colnr_T end_col,
1385 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001386{
1387 linenr_T clnum = start_lnum;
1388 colnr_T ccol = start_col;
1389 int len;
1390 char_u *p;
1391
1392 if (bytelen != NULL)
1393 *bytelen = 0;
1394 for (;;)
1395 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001396 // Since getting one line may invalidate the other, need to make copy.
1397 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001398 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001399 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001400 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001401 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1402 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001403 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001404 vim_free(reg_tofree);
1405 reg_tofree = alloc(len);
1406 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001407 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001408 reg_tofreelen = len;
1409 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001410 STRCPY(reg_tofree, rex.line);
1411 rex.input = reg_tofree + (rex.input - rex.line);
1412 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001413 }
1414
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001415 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001416 p = reg_getline(clnum);
1417 if (clnum == end_lnum)
1418 len = end_col - ccol;
1419 else
1420 len = (int)STRLEN(p + ccol);
1421
Bram Moolenaar0270f382018-07-17 05:43:58 +02001422 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001423 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001424 if (bytelen != NULL)
1425 *bytelen += len;
1426 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001427 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001428 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001429 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001430
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001431 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001432 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001433 if (bytelen != NULL)
1434 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001435 ++clnum;
1436 ccol = 0;
1437 if (got_int)
1438 return RA_FAIL;
1439 }
1440
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001441 // found a match! Note that rex.line may now point to a copy of the line,
1442 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001443 return RA_MATCH;
1444}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001445
Bram Moolenaarfb031402014-09-09 17:18:49 +02001446/*
1447 * Used in a place where no * or \+ can follow.
1448 */
1449 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001450re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001451{
1452 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001453 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001454 semsg(_(e_nfa_regexp_cannot_repeat_str), what);
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001455 rc_did_emsg = TRUE;
1456 return FAIL;
1457 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001458 return OK;
1459}
1460
// One decomposition: up to three characters, unused positions are zero.
typedef struct
{
    int a, b, c;
} decomp_T;


// Decompositions for the characters 0xfb20 - 0xfb4f, indexed by the
// character minus 0xfb20.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2, 0, 0},              // 0xfb20       alt ayin
    {0x5d0, 0, 0},              // 0xfb21       alt alef
    {0x5d3, 0, 0},              // 0xfb22       alt dalet
    {0x5d4, 0, 0},              // 0xfb23       alt he
    {0x5db, 0, 0},              // 0xfb24       alt kaf
    {0x5dc, 0, 0},              // 0xfb25       alt lamed
    {0x5dd, 0, 0},              // 0xfb26       alt mem-sofit
    {0x5e8, 0, 0},              // 0xfb27       alt resh
    {0x5ea, 0, 0},              // 0xfb28       alt tav
    {'+', 0, 0},                // 0xfb29       alt plus
    {0x5e9, 0x5c1, 0},          // 0xfb2a       shin+shin-dot
    {0x5e9, 0x5c2, 0},          // 0xfb2b       shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},      // 0xfb2c       shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},      // 0xfb2d       shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},          // 0xfb2e       alef+patah
    {0x5d0, 0x5b8, 0},          // 0xfb2f       alef+qamats
    {0x5d0, 0x5b4, 0},          // 0xfb30       alef+hiriq
    {0x5d1, 0x5bc, 0},          // 0xfb31       bet+dagesh
    {0x5d2, 0x5bc, 0},          // 0xfb32       gimel+dagesh
    {0x5d3, 0x5bc, 0},          // 0xfb33       dalet+dagesh
    {0x5d4, 0x5bc, 0},          // 0xfb34       he+dagesh
    {0x5d5, 0x5bc, 0},          // 0xfb35       vav+dagesh
    {0x5d6, 0x5bc, 0},          // 0xfb36       zayin+dagesh
    {0xfb37, 0, 0},             // 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},          // 0xfb38       tet+dagesh
    {0x5d9, 0x5bc, 0},          // 0xfb39       yud+dagesh
    {0x5da, 0x5bc, 0},          // 0xfb3a       kaf sofit+dagesh
    {0x5db, 0x5bc, 0},          // 0xfb3b       kaf+dagesh
    {0x5dc, 0x5bc, 0},          // 0xfb3c       lamed+dagesh
    {0xfb3d, 0, 0},             // 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},          // 0xfb3e       mem+dagesh
    {0xfb3f, 0, 0},             // 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},          // 0xfb40       nun+dagesh
    {0x5e1, 0x5bc, 0},          // 0xfb41       samech+dagesh
    {0xfb42, 0, 0},             // 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},          // 0xfb43       pe sofit+dagesh
    {0x5e4, 0x5bc, 0},          // 0xfb44       pe+dagesh
    {0xfb45, 0, 0},             // 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},          // 0xfb46       tsadi+dagesh
    {0x5e7, 0x5bc, 0},          // 0xfb47       qof+dagesh
    {0x5e8, 0x5bc, 0},          // 0xfb48       resh+dagesh
    {0x5e9, 0x5bc, 0},          // 0xfb49       shin+dagesh
    {0x5ea, 0x5bc, 0},          // 0xfb4a       tav+dagesh
    {0x5d5, 0x5b9, 0},          // 0xfb4b       vav+holam
    {0x5d1, 0x5bf, 0},          // 0xfb4c       bet+rafe
    {0x5db, 0x5bf, 0},          // 0xfb4d       kaf+rafe
    {0x5e4, 0x5bf, 0},          // 0xfb4e       pe+rafe
    {0x5d0, 0x5dc, 0}           // 0xfb4f       alef-lamed
};

/*
 * Decompose character "c" into "*c1", "*c2" and "*c3".
 * For a character in the range 0xfb20 - 0xfb4f the values come from
 * decomp_table[].  For any other character "*c1" is "c" itself and "*c2"
 * and "*c3" are zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    decomp_T    *entry;

    if (c < 0xfb20 || c > 0xfb4f)
    {
        // Not in the table: the character stands for itself.
        *c1 = c;
        *c3 = 0;
        *c2 = 0;
        return;
    }

    entry = &decomp_table[c - 0xfb20];
    *c1 = entry->a;
    *c2 = entry->b;
    *c3 = entry->c;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001538
1539/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001540 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001541 * Return 0 if strings match, non-zero otherwise.
1542 * Correct the length "*n" when composing characters are ignored.
1543 */
1544 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001545cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001546{
1547 int result;
1548
Bram Moolenaar6100d022016-10-02 16:51:57 +02001549 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001550 result = STRNCMP(s1, s2, *n);
1551 else
1552 result = MB_STRNICMP(s1, s2, *n);
1553
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001554 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001555 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001556 {
1557 char_u *str1, *str2;
1558 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001559 int junk;
1560
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001561 // we have to handle the strcmp ourselves, since it is necessary to
1562 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001563 str1 = s1;
1564 str2 = s2;
1565 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001566 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001567 {
1568 c1 = mb_ptr2char_adv(&str1);
1569 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001570
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001571 // Decompose the character if necessary, into 'base' characters.
1572 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001573 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001574 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001575 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001576 mb_decompose(c1, &c11, &junk, &junk);
1577 mb_decompose(c2, &c12, &junk, &junk);
1578 c1 = c11;
1579 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001580 if (c11 != c12
1581 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001582 break;
1583 }
1584 }
1585 result = c2 - c1;
1586 if (result == 0)
1587 *n = (int)(str2 - s2);
1588 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001589
1590 return result;
1591}
1592
1593/*
1594 * cstrchr: This function is used a lot for simple searches, keep it fast!
1595 */
1596 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001597cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001598{
1599 char_u *p;
1600 int cc;
1601
Bram Moolenaara12a1612019-01-24 16:39:02 +01001602 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001603 return vim_strchr(s, c);
1604
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001605 // tolower() and toupper() can be slow, comparing twice should be a lot
1606 // faster (esp. when using MS Visual C++!).
1607 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001608 if (enc_utf8 && c > 0x80)
1609 cc = utf_fold(c);
1610 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001611 if (MB_ISUPPER(c))
1612 cc = MB_TOLOWER(c);
1613 else if (MB_ISLOWER(c))
1614 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001615 else
1616 return vim_strchr(s, c);
1617
Bram Moolenaar071d4272004-06-13 20:20:40 +00001618 if (has_mbyte)
1619 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001620 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001621 {
1622 if (enc_utf8 && c > 0x80)
1623 {
1624 if (utf_fold(utf_ptr2char(p)) == cc)
1625 return p;
1626 }
1627 else if (*p == c || *p == cc)
1628 return p;
1629 }
1630 }
1631 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001632 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001633 for (p = s; *p != NUL; ++p)
1634 if (*p == c || *p == cc)
1635 return p;
1636
1637 return NULL;
1638}
1639
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001640////////////////////////////////////////////////////////////////
1641// regsub stuff //
1642////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001643
/*
 * We should define fptr_T as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * void pointer.  This should work for all compilers.
 */
Bram Moolenaar30d64132020-09-06 17:09:12 +02001650typedef void (*(*fptr_T)(int *, int));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001651
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001652static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001653
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001654 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001655do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001656{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001657 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001658
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001659 return (fptr_T)NULL;
1660}
1661
1662 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001663do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001664{
1665 *d = MB_TOUPPER(c);
1666
1667 return (fptr_T)do_Upper;
1668}
1669
1670 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001671do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001672{
1673 *d = MB_TOLOWER(c);
1674
1675 return (fptr_T)NULL;
1676}
1677
1678 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001679do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001680{
1681 *d = MB_TOLOWER(c);
1682
1683 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001684}
1685
1686/*
1687 * regtilde(): Replace tildes in the pattern by the old pattern.
1688 *
1689 * Short explanation of the tilde: It stands for the previous replacement
1690 * pattern. If that previous pattern also contains a ~ we should go back a
1691 * step further... But we insert the previous pattern into the current one
1692 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001693 * This still does not handle the case where "magic" changes. So require the
1694 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001695 *
1696 * The tildes are parsed once before the first call to vim_regsub().
1697 */
1698 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001699regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001700{
1701 char_u *newsub = source;
1702 char_u *tmpsub;
1703 char_u *p;
1704 int len;
1705 int prevlen;
1706
1707 for (p = newsub; *p; ++p)
1708 {
1709 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1710 {
1711 if (reg_prev_sub != NULL)
1712 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001713 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001714 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001715 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001716 if (tmpsub != NULL)
1717 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001718 // copy prefix
1719 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001720 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001721 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001722 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001723 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001724 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001725 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001726 STRCPY(tmpsub + len + prevlen, p + 1);
1727
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001728 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001729 vim_free(newsub);
1730 newsub = tmpsub;
1731 p = newsub + len + prevlen;
1732 }
1733 }
1734 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001735 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001736 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001737 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001738 --p;
1739 }
1740 else
1741 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001742 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001743 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001744 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001745 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001746 }
1747 }
1748
1749 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001750 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001751 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001752 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001753 reg_prev_sub = vim_strsave(newsub);
1754 return newsub;
1755}
1756
1757#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001758static int can_f_submatch = FALSE; // TRUE when submatch() can be used
Bram Moolenaar071d4272004-06-13 20:20:40 +00001759
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001760// These pointers are used for reg_submatch(). Needed for when the
1761// substitution string is an expression that contains a call to substitute()
1762// and submatch().
// vim_regsub_both() fills the fields from the matching "rex" fields right
// before the expression is evaluated.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001763typedef struct {
1764 regmatch_T *sm_match; // copy of rex.reg_match
1765 regmmatch_T *sm_mmatch; // copy of rex.reg_mmatch
1766 linenr_T sm_firstlnum; // copy of rex.reg_firstlnum
1767 linenr_T sm_maxline; // copy of rex.reg_maxline
1768 int sm_line_lbr; // copy of rex.reg_line_lbr; when set NL is not changed to CR
1769} regsubmatch_T;
1770
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001771static regsubmatch_T rsm; // can only be used when can_f_submatch is TRUE
Bram Moolenaar071d4272004-06-13 20:20:40 +00001772#endif
1773
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001774#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001775
1776/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001777 * Put the submatches in "argv[argskip]" which is a list passed into
1778 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001779 */
1780 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001781fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001782{
1783 listitem_T *li;
1784 int i;
1785 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001786 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001787
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001788 if (argcount == argskip)
1789 // called function doesn't take a submatches argument
1790 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001791
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001792 // Relies on sl_list to be the first item in staticList10_T.
1793 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001794
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001795 // There are always 10 list items in staticList10_T.
1796 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001797 for (i = 0; i < 10; ++i)
1798 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001799 s = rsm.sm_match->startp[i];
1800 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001801 s = NULL;
1802 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02001803 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001804 li->li_tv.v_type = VAR_STRING;
1805 li->li_tv.vval.v_string = s;
1806 li = li->li_next;
1807 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001808 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001809}
1810
1811 static void
1812clear_submatch_list(staticList10_T *sl)
1813{
1814 int i;
1815
1816 for (i = 0; i < 10; ++i)
1817 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1818}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001819#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001820
Bram Moolenaar071d4272004-06-13 20:20:40 +00001821/*
1822 * vim_regsub() - perform substitutions after a vim_regexec() or
1823 * vim_regexec_multi() match.
1824 *
1825 * If "copy" is TRUE really copy into "dest".
1826 * If "copy" is FALSE nothing is copied, this is just to find out the length
1827 * of the result.
1828 *
1829 * If "backslash" is TRUE, a backslash will be removed later, need to double
1830 * them to keep them, and insert a backslash before a CR to avoid it being
1831 * replaced with a line break later.
1832 *
1833 * Note: The matched text must not change between the call of
1834 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1835 * references invalid!
1836 *
1837 * Returns the size of the replacement, including terminating NUL.
1838 */
1839 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001840vim_regsub(
1841 regmatch_T *rmp,
1842 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001843 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001844 char_u *dest,
1845 int copy,
1846 int magic,
1847 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001848{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001849 int result;
1850 regexec_T rex_save;
1851 int rex_in_use_save = rex_in_use;
1852
1853 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001854 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001855 rex_save = rex;
1856 rex_in_use = TRUE;
1857
1858 rex.reg_match = rmp;
1859 rex.reg_mmatch = NULL;
1860 rex.reg_maxline = 0;
1861 rex.reg_buf = curbuf;
1862 rex.reg_line_lbr = TRUE;
1863 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1864
1865 rex_in_use = rex_in_use_save;
1866 if (rex_in_use)
1867 rex = rex_save;
1868
1869 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001870}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001871
1872 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001873vim_regsub_multi(
1874 regmmatch_T *rmp,
1875 linenr_T lnum,
1876 char_u *source,
1877 char_u *dest,
1878 int copy,
1879 int magic,
1880 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001881{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001882 int result;
1883 regexec_T rex_save;
1884 int rex_in_use_save = rex_in_use;
1885
1886 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001887 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001888 rex_save = rex;
1889 rex_in_use = TRUE;
1890
1891 rex.reg_match = NULL;
1892 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001893 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001894 rex.reg_firstlnum = lnum;
1895 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1896 rex.reg_line_lbr = FALSE;
1897 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1898
1899 rex_in_use = rex_in_use_save;
1900 if (rex_in_use)
1901 rex = rex_save;
1902
1903 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001904}
1905
1906 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001907vim_regsub_both(
1908 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001909 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001910 char_u *dest,
1911 int copy,
1912 int magic,
1913 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001914{
1915 char_u *src;
1916 char_u *dst;
1917 char_u *s;
1918 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001919 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001920 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001921 fptr_T func_all = (fptr_T)NULL;
1922 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001923 linenr_T clnum = 0; // init for GCC
1924 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001925#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001926 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001927#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001928
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001929 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001930 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001931 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001932 emsg(_(e_null_argument));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001933 return 0;
1934 }
1935 if (prog_magic_wrong())
1936 return 0;
1937 src = source;
1938 dst = dest;
1939
1940 /*
1941 * When the substitute part starts with "\=" evaluate it as an expression.
1942 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001943 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001944 {
1945#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001946 // To make sure that the length doesn't change between checking the
1947 // length and copying the string, and to speed up things, the
1948 // resulting string is saved from the call with "copy" == FALSE to the
1949 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001950 if (copy)
1951 {
1952 if (eval_result != NULL)
1953 {
1954 STRCPY(dest, eval_result);
1955 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01001956 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001957 }
1958 }
1959 else
1960 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001961 int prev_can_f_submatch = can_f_submatch;
1962 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001963
1964 vim_free(eval_result);
1965
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001966 // The expression may contain substitute(), which calls us
1967 // recursively. Make sure submatch() gets the text from the first
1968 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001969 if (can_f_submatch)
1970 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001971 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001972 rsm.sm_match = rex.reg_match;
1973 rsm.sm_mmatch = rex.reg_mmatch;
1974 rsm.sm_firstlnum = rex.reg_firstlnum;
1975 rsm.sm_maxline = rex.reg_maxline;
1976 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001977
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001978 if (expr != NULL)
1979 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001980 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001981 char_u buf[NUMBUFLEN];
1982 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001983 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02001984 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001985
1986 rettv.v_type = VAR_STRING;
1987 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001988 argv[0].v_type = VAR_LIST;
1989 argv[0].vval.v_list = &matchList.sl_list;
1990 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02001991 CLEAR_FIELD(funcexe);
Bram Moolenaar851f86b2021-12-13 14:26:44 +00001992 funcexe.fe_argv_func = fill_submatch_list;
1993 funcexe.fe_evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001994 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001995 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001996 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02001997 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001998 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02001999 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002000 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002001 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002002
Bram Moolenaar6100d022016-10-02 16:51:57 +02002003 s = partial_name(partial);
Bram Moolenaar851f86b2021-12-13 14:26:44 +00002004 funcexe.fe_partial = partial;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002005 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002006 }
LemonBoyf3b48952022-05-05 13:53:03 +01002007 else if (expr->v_type == VAR_INSTR)
2008 {
2009 exe_typval_instr(expr, &rettv);
2010 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002011 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002012 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002013 clear_submatch_list(&matchList);
2014
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002015 if (rettv.v_type == VAR_UNKNOWN)
2016 // something failed, no need to report another error
2017 eval_result = NULL;
2018 else
2019 {
2020 eval_result = tv_get_string_buf_chk(&rettv, buf);
2021 if (eval_result != NULL)
2022 eval_result = vim_strsave(eval_result);
2023 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002024 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002025 }
Bram Moolenaar4c137212021-04-19 16:48:48 +02002026 else if (substitute_instr != NULL)
2027 // Execute instructions from ISN_SUBSTITUTE.
2028 eval_result = exe_substitute_instr();
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002029 else
Bram Moolenaarb171fb12020-06-24 20:34:03 +02002030 eval_result = eval_to_string(source + 2, TRUE);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002031
Bram Moolenaar071d4272004-06-13 20:20:40 +00002032 if (eval_result != NULL)
2033 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002034 int had_backslash = FALSE;
2035
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002036 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002037 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002038 // Change NL to CR, so that it becomes a line break,
2039 // unless called from vim_regexec_nl().
2040 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002041 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002042 *s = CAR;
2043 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002044 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002045 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002046 /* Change NL to CR here too, so that this works:
2047 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2048 * abc\
2049 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002050 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002051 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002052 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002053 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002054 had_backslash = TRUE;
2055 }
2056 }
2057 if (had_backslash && backslash)
2058 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002059 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002060 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2061 if (s != NULL)
2062 {
2063 vim_free(eval_result);
2064 eval_result = s;
2065 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002066 }
2067
2068 dst += STRLEN(eval_result);
2069 }
2070
Bram Moolenaar6100d022016-10-02 16:51:57 +02002071 can_f_submatch = prev_can_f_submatch;
2072 if (can_f_submatch)
2073 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002074 }
2075#endif
2076 }
2077 else
2078 while ((c = *src++) != NUL)
2079 {
2080 if (c == '&' && magic)
2081 no = 0;
2082 else if (c == '\\' && *src != NUL)
2083 {
2084 if (*src == '&' && !magic)
2085 {
2086 ++src;
2087 no = 0;
2088 }
2089 else if ('0' <= *src && *src <= '9')
2090 {
2091 no = *src++ - '0';
2092 }
2093 else if (vim_strchr((char_u *)"uUlLeE", *src))
2094 {
2095 switch (*src++)
2096 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002097 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002098 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002099 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002100 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002101 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002102 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002103 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002104 continue;
2105 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002106 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002107 continue;
2108 }
2109 }
2110 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002111 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002112 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002113 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2114 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002115 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002116 if (copy)
2117 {
2118 *dst++ = c;
2119 *dst++ = *src++;
2120 *dst++ = *src++;
2121 }
2122 else
2123 {
2124 dst += 3;
2125 src += 2;
2126 }
2127 continue;
2128 }
2129
Bram Moolenaar071d4272004-06-13 20:20:40 +00002130 if (c == '\\' && *src != NUL)
2131 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002132 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002133 switch (*src)
2134 {
2135 case 'r': c = CAR; ++src; break;
2136 case 'n': c = NL; ++src; break;
2137 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002138 // Oh no! \e already has meaning in subst pat :-(
2139 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002140 case 'b': c = Ctrl_H; ++src; break;
2141
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002142 // If "backslash" is TRUE the backslash will be removed
2143 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002144 default: if (backslash)
2145 {
2146 if (copy)
2147 *dst = '\\';
2148 ++dst;
2149 }
2150 c = *src++;
2151 }
2152 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002153 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002154 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002155
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002156 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002157 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002158 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002159 func_one = (fptr_T)(func_one(&cc, c));
2160 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002161 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002162 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002163 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002164 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002165
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002166 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002167 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002168 int totlen = mb_ptr2len(src - 1);
2169
Bram Moolenaar071d4272004-06-13 20:20:40 +00002170 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002171 mb_char2bytes(cc, dst);
2172 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002173 if (enc_utf8)
2174 {
2175 int clen = utf_ptr2len(src - 1);
2176
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002177 // If the character length is shorter than "totlen", there
2178 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002179 if (clen < totlen)
2180 {
2181 if (copy)
2182 mch_memmove(dst + 1, src - 1 + clen,
2183 (size_t)(totlen - clen));
2184 dst += totlen - clen;
2185 }
2186 }
2187 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002188 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002189 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002190 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002191 dst++;
2192 }
2193 else
2194 {
2195 if (REG_MULTI)
2196 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002197 clnum = rex.reg_mmatch->startpos[no].lnum;
2198 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002199 s = NULL;
2200 else
2201 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002202 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2203 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2204 len = rex.reg_mmatch->endpos[no].col
2205 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002206 else
2207 len = (int)STRLEN(s);
2208 }
2209 }
2210 else
2211 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002212 s = rex.reg_match->startp[no];
2213 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002214 s = NULL;
2215 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002216 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002217 }
2218 if (s != NULL)
2219 {
2220 for (;;)
2221 {
2222 if (len == 0)
2223 {
2224 if (REG_MULTI)
2225 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002226 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002227 break;
2228 if (copy)
2229 *dst = CAR;
2230 ++dst;
2231 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002232 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2233 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002234 else
2235 len = (int)STRLEN(s);
2236 }
2237 else
2238 break;
2239 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002240 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002241 {
2242 if (copy)
Bram Moolenaare29a27f2021-07-20 21:07:36 +02002243 iemsg(_(e_damaged_match_string));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002244 goto exit;
2245 }
2246 else
2247 {
2248 if (backslash && (*s == CAR || *s == '\\'))
2249 {
2250 /*
2251 * Insert a backslash in front of a CR, otherwise
2252 * it will be replaced by a line break.
2253 * Number of backslashes will be halved later,
2254 * double them here.
2255 */
2256 if (copy)
2257 {
2258 dst[0] = '\\';
2259 dst[1] = *s;
2260 }
2261 dst += 2;
2262 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002263 else
2264 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002265 if (has_mbyte)
2266 c = mb_ptr2char(s);
2267 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002268 c = *s;
2269
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002270 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002271 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002272 func_one = (fptr_T)(func_one(&cc, c));
2273 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002274 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002275 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002276 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002277 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002278
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002279 if (has_mbyte)
2280 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002281 int l;
2282
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002283 // Copy composing characters separately, one
2284 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002285 if (enc_utf8)
2286 l = utf_ptr2len(s) - 1;
2287 else
2288 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002289
2290 s += l;
2291 len -= l;
2292 if (copy)
2293 mb_char2bytes(cc, dst);
2294 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002295 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002296 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002297 *dst = cc;
2298 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002299 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002300
Bram Moolenaar071d4272004-06-13 20:20:40 +00002301 ++s;
2302 --len;
2303 }
2304 }
2305 }
2306 no = -1;
2307 }
2308 }
2309 if (copy)
2310 *dst = NUL;
2311
2312exit:
2313 return (int)((dst - dest) + 1);
2314}
2315
2316#ifdef FEAT_EVAL
2317/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002318 * Call reg_getline() with the line numbers from the submatch. If a
2319 * substitute() was used the reg_maxline and other values have been
2320 * overwritten.
2321 */
2322 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002323reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002324{
2325 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002326 linenr_T save_first = rex.reg_firstlnum;
2327 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002328
Bram Moolenaar6100d022016-10-02 16:51:57 +02002329 rex.reg_firstlnum = rsm.sm_firstlnum;
2330 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002331
2332 s = reg_getline(lnum);
2333
Bram Moolenaar6100d022016-10-02 16:51:57 +02002334 rex.reg_firstlnum = save_first;
2335 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002336 return s;
2337}
2338
2339/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002340 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002341 * allocated memory.
2342 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2343 */
2344 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002345reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002346{
2347 char_u *retval = NULL;
2348 char_u *s;
2349 int len;
2350 int round;
2351 linenr_T lnum;
2352
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002353 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002354 return NULL;
2355
Bram Moolenaar6100d022016-10-02 16:51:57 +02002356 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002357 {
2358 /*
2359 * First round: compute the length and allocate memory.
2360 * Second round: copy the text.
2361 */
2362 for (round = 1; round <= 2; ++round)
2363 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002364 lnum = rsm.sm_mmatch->startpos[no].lnum;
2365 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002366 return NULL;
2367
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002368 s = reg_getline_submatch(lnum);
2369 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002370 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002371 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002372 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002373 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002374 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002375 len = rsm.sm_mmatch->endpos[no].col
2376 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002377 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002378 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002379 ++len;
2380 }
2381 else
2382 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002383 // Multiple lines: take start line from start col, middle
2384 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002385 len = (int)STRLEN(s);
2386 if (round == 2)
2387 {
2388 STRCPY(retval, s);
2389 retval[len] = '\n';
2390 }
2391 ++len;
2392 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002393 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002394 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002395 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002396 if (round == 2)
2397 STRCPY(retval + len, s);
2398 len += (int)STRLEN(s);
2399 if (round == 2)
2400 retval[len] = '\n';
2401 ++len;
2402 }
2403 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002404 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002405 rsm.sm_mmatch->endpos[no].col);
2406 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002407 if (round == 2)
2408 retval[len] = NUL;
2409 ++len;
2410 }
2411
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002412 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002413 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002414 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002415 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002416 return NULL;
2417 }
2418 }
2419 }
2420 else
2421 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002422 s = rsm.sm_match->startp[no];
2423 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002424 retval = NULL;
2425 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002426 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002427 }
2428
2429 return retval;
2430}
Bram Moolenaar41571762014-04-02 19:00:58 +02002431
2432/*
2433 * Used for the submatch() function with the optional non-zero argument: get
2434 * the list of strings from the n'th submatch in allocated memory with NULs
2435 * represented in NLs.
2436 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2437 * command, for a non-existing submatch and for any error.
2438 */
2439 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002440reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002441{
2442 char_u *s;
2443 linenr_T slnum;
2444 linenr_T elnum;
2445 colnr_T scol;
2446 colnr_T ecol;
2447 int i;
2448 list_T *list;
2449 int error = FALSE;
2450
2451 if (!can_f_submatch || no < 0)
2452 return NULL;
2453
Bram Moolenaar6100d022016-10-02 16:51:57 +02002454 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002455 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002456 slnum = rsm.sm_mmatch->startpos[no].lnum;
2457 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002458 if (slnum < 0 || elnum < 0)
2459 return NULL;
2460
Bram Moolenaar6100d022016-10-02 16:51:57 +02002461 scol = rsm.sm_mmatch->startpos[no].col;
2462 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002463
2464 list = list_alloc();
2465 if (list == NULL)
2466 return NULL;
2467
2468 s = reg_getline_submatch(slnum) + scol;
2469 if (slnum == elnum)
2470 {
2471 if (list_append_string(list, s, ecol - scol) == FAIL)
2472 error = TRUE;
2473 }
2474 else
2475 {
2476 if (list_append_string(list, s, -1) == FAIL)
2477 error = TRUE;
2478 for (i = 1; i < elnum - slnum; i++)
2479 {
2480 s = reg_getline_submatch(slnum + i);
2481 if (list_append_string(list, s, -1) == FAIL)
2482 error = TRUE;
2483 }
2484 s = reg_getline_submatch(elnum);
2485 if (list_append_string(list, s, ecol) == FAIL)
2486 error = TRUE;
2487 }
2488 }
2489 else
2490 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002491 s = rsm.sm_match->startp[no];
2492 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002493 return NULL;
2494 list = list_alloc();
2495 if (list == NULL)
2496 return NULL;
2497 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002498 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002499 error = TRUE;
2500 }
2501
2502 if (error)
2503 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002504 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002505 return NULL;
2506 }
Bram Moolenaar8a0dcf42020-09-06 15:14:45 +02002507 ++list->lv_refcount;
Bram Moolenaar41571762014-04-02 19:00:58 +02002508 return list;
2509}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002510#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002511
Bram Moolenaarf4140482020-02-15 23:06:45 +01002512/*
2513 * Initialize the values used for matching against multiple lines
2514 */
2515 static void
2516init_regexec_multi(
2517 regmmatch_T *rmp,
2518 win_T *win, // window in which to search or NULL
2519 buf_T *buf, // buffer in which to search
2520 linenr_T lnum) // nr of line to start looking for match
2521{
2522 rex.reg_match = NULL;
2523 rex.reg_mmatch = rmp;
2524 rex.reg_buf = buf;
2525 rex.reg_win = win;
2526 rex.reg_firstlnum = lnum;
2527 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2528 rex.reg_line_lbr = FALSE;
2529 rex.reg_ic = rmp->rmm_ic;
2530 rex.reg_icombine = FALSE;
2531 rex.reg_maxcol = rmp->rmm_maxcol;
2532}
2533
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002534#include "regexp_bt.c"
2535
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002536static regengine_T bt_regengine =
2537{
2538 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002539 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002540 bt_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002541 bt_regexec_multi,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002542};
2543
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002544#include "regexp_nfa.c"
2545
2546static regengine_T nfa_regengine =
2547{
2548 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002549 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002550 nfa_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002551 nfa_regexec_multi,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002552};
2553
// The regexp engine that vim_regcomp() uses.  It is set from "p_re" and can
// be overruled by a "\%#=" prefix in the pattern.
// Must match with 'regexpengine'.
static int regexp_engine = 0;

#ifdef DEBUG
// Engine names for the debug message in vim_regcomp(), indexed by the engine
// number.
static char_u regname[][30] = {
		    "AUTOMATIC Regexp Engine",
		    "BACKTRACKING Regexp Engine",
		    "NFA Regexp Engine"
			    };
#endif
2565
2566/*
2567 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002568 * Returns the program in allocated memory.
2569 * Use vim_regfree() to free the memory.
2570 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002571 */
2572 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002573vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002574{
2575 regprog_T *prog = NULL;
2576 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002577 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002578
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002579 regexp_engine = p_re;
2580
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002581 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002582 if (STRNCMP(expr, "\\%#=", 4) == 0)
2583 {
2584 int newengine = expr[4] - '0';
2585
2586 if (newengine == AUTOMATIC_ENGINE
2587 || newengine == BACKTRACKING_ENGINE
2588 || newengine == NFA_ENGINE)
2589 {
2590 regexp_engine = expr[4] - '0';
2591 expr += 5;
2592#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002593 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002594 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002595#endif
2596 }
2597 else
2598 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00002599 emsg(_(e_percent_hash_can_only_be_followed_by_zero_one_two_automatic_engine_will_be_used));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002600 regexp_engine = AUTOMATIC_ENGINE;
2601 }
2602 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002603#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002604 bt_regengine.expr = expr;
2605 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002606#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002607 // reg_iswordc() uses rex.reg_buf
2608 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002609
2610 /*
2611 * First try the NFA engine, unless backtracking was requested.
2612 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002613 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002614 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002615 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002616 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002617 else
2618 prog = bt_regengine.regcomp(expr, re_flags);
2619
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002620 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002621 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002622 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002623#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002624 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002625 {
2626 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002627 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002628 if (f)
2629 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002630 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002631 fclose(f);
2632 }
2633 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002634 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002635 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002636 }
2637#endif
2638 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002639 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002640 * The NFA engine also fails for patterns that it can't handle well
2641 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002642 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002643 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002644 if (regexp_engine == AUTOMATIC_ENGINE
2645 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002646 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002647 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002648#ifdef FEAT_EVAL
2649 report_re_switch(expr);
2650#endif
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002651 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002652 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002653 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002654
Bram Moolenaarfda37292014-11-05 14:27:36 +01002655 if (prog != NULL)
2656 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002657 // Store the info needed to call regcomp() again when the engine turns
2658 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002659 prog->re_engine = regexp_engine;
2660 prog->re_flags = re_flags;
2661 }
2662
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002663 return prog;
2664}
2665
2666/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002667 * Free a compiled regexp program, returned by vim_regcomp().
2668 */
2669 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002670vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002671{
2672 if (prog != NULL)
2673 prog->engine->regfree(prog);
2674}
2675
#if defined(EXITFREE) || defined(PROTO)
/*
 * Free the memory held in the file-level regexp variables: "reg_tofree",
 * "reg_prev_sub" and the growarrays "regstack" and "backpos".
 */
    void
free_regexp_stuff(void)
{
    vim_free(reg_prev_sub);
    vim_free(reg_tofree);
    ga_clear(&backpos);
    ga_clear(&regstack);
}
#endif
2686
#ifdef FEAT_EVAL
/*
 * When "p_verbose" is above zero: give a message that the backtracking
 * engine is going to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
	return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2700
#if defined(FEAT_X11) || defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog": non-zero while "prog" is being
 * executed.  "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    int	    in_use = prog->re_in_use;

    return in_use;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002711
2712/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002713 * Match a regexp against a string.
2714 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002715 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002716 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002717 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002718 *
2719 * Return TRUE if there is a match, FALSE if not.
2720 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002721 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002722vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002723 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002724 char_u *line, // string to match against
2725 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002726 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002727{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002728 int result;
2729 regexec_T rex_save;
2730 int rex_in_use_save = rex_in_use;
2731
Bram Moolenaar0270f382018-07-17 05:43:58 +02002732 // Cannot use the same prog recursively, it contains state.
2733 if (rmp->regprog->re_in_use)
2734 {
Bram Moolenaar677658a2022-01-05 16:09:06 +00002735 emsg(_(e_cannot_use_pattern_recursively));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002736 return FALSE;
2737 }
2738 rmp->regprog->re_in_use = TRUE;
2739
Bram Moolenaar6100d022016-10-02 16:51:57 +02002740 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002741 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002742 rex_save = rex;
2743 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002744
Bram Moolenaar6100d022016-10-02 16:51:57 +02002745 rex.reg_startp = NULL;
2746 rex.reg_endp = NULL;
2747 rex.reg_startpos = NULL;
2748 rex.reg_endpos = NULL;
2749
2750 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002751 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002752
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002753 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002754 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2755 && result == NFA_TOO_EXPENSIVE)
2756 {
2757 int save_p_re = p_re;
2758 int re_flags = rmp->regprog->re_flags;
2759 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2760
2761 p_re = BACKTRACKING_ENGINE;
2762 vim_regfree(rmp->regprog);
2763 if (pat != NULL)
2764 {
2765#ifdef FEAT_EVAL
2766 report_re_switch(pat);
2767#endif
2768 rmp->regprog = vim_regcomp(pat, re_flags);
2769 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002770 {
2771 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002772 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002773 rmp->regprog->re_in_use = FALSE;
2774 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002775 vim_free(pat);
2776 }
2777
2778 p_re = save_p_re;
2779 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002780
2781 rex_in_use = rex_in_use_save;
2782 if (rex_in_use)
2783 rex = rex_save;
2784
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002785 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002786}
2787
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002788/*
2789 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002790 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002791 */
2792 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002793vim_regexec_prog(
2794 regprog_T **prog,
2795 int ignore_case,
2796 char_u *line,
2797 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002798{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002799 int r;
2800 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002801
2802 regmatch.regprog = *prog;
2803 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002804 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002805 *prog = regmatch.regprog;
2806 return r;
2807}
2808
2809/*
2810 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002811 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002812 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002813 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002814vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002815{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002816 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002817}
2818
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002819/*
2820 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002821 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002822 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002823 */
2824 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002825vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002826{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002827 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002828}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002829
2830/*
2831 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002832 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2833 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002834 * Uses curbuf for line count and 'iskeyword'.
2835 *
2836 * Return zero if there is no match. Return number of lines contained in the
2837 * match otherwise.
2838 */
2839 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002840vim_regexec_multi(
2841 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002842 win_T *win, // window in which to search or NULL
2843 buf_T *buf, // buffer in which to search
2844 linenr_T lnum, // nr of line to start looking for match
2845 colnr_T col, // column to start looking for match
2846 proftime_T *tm, // timeout limit or NULL
2847 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002848{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002849 int result;
2850 regexec_T rex_save;
2851 int rex_in_use_save = rex_in_use;
2852
Bram Moolenaar0270f382018-07-17 05:43:58 +02002853 // Cannot use the same prog recursively, it contains state.
2854 if (rmp->regprog->re_in_use)
2855 {
Bram Moolenaar677658a2022-01-05 16:09:06 +00002856 emsg(_(e_cannot_use_pattern_recursively));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002857 return FALSE;
2858 }
2859 rmp->regprog->re_in_use = TRUE;
2860
Bram Moolenaar6100d022016-10-02 16:51:57 +02002861 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002862 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002863 rex_save = rex;
2864 rex_in_use = TRUE;
2865
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002866 result = rmp->regprog->engine->regexec_multi(
2867 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002868 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002869
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002870 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002871 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2872 && result == NFA_TOO_EXPENSIVE)
2873 {
2874 int save_p_re = p_re;
2875 int re_flags = rmp->regprog->re_flags;
2876 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2877
2878 p_re = BACKTRACKING_ENGINE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002879 if (pat != NULL)
2880 {
Bram Moolenaare8a4c0d2022-04-04 18:14:34 +01002881 regprog_T *prev_prog = rmp->regprog;
2882
Bram Moolenaarfda37292014-11-05 14:27:36 +01002883#ifdef FEAT_EVAL
2884 report_re_switch(pat);
2885#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002886#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002887 // checking for \z misuse was already done when compiling for NFA,
2888 // allow all here
2889 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002890#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002891 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002892#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002893 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002894#endif
Bram Moolenaare8a4c0d2022-04-04 18:14:34 +01002895 if (rmp->regprog == NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002896 {
Bram Moolenaare8a4c0d2022-04-04 18:14:34 +01002897 // Somehow compiling the pattern failed now, put back the
2898 // previous one to avoid "regprog" becoming NULL.
2899 rmp->regprog = prev_prog;
2900 }
2901 else
2902 {
2903 vim_regfree(prev_prog);
2904
Bram Moolenaar41499802018-07-18 06:02:09 +02002905 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002906 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002907 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002908 rmp->regprog->re_in_use = FALSE;
2909 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002910 vim_free(pat);
2911 }
2912 p_re = save_p_re;
2913 }
2914
Bram Moolenaar6100d022016-10-02 16:51:57 +02002915 rex_in_use = rex_in_use_save;
2916 if (rex_in_use)
2917 rex = rex_save;
2918
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002919 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002920}