/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
 */
5
// By default: do not create debugging logs or files related to regular
// expressions, even when compiling with -DDEBUG.
// Uncomment the second line to get the regexp debugging.
#undef DEBUG
// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
#ifdef DEBUG
// show/save debugging data when BT engine is used
# define BT_REGEXP_DUMP
// save the debugging data to a file instead of displaying it
# define BT_REGEXP_LOG
# define BT_REGEXP_DEBUG_LOG
# define BT_REGEXP_DEBUG_LOG_NAME       "bt_regexp_debug.log"
#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning, they don't match literally.
 * Magic characters are negative.  This separates them from literal characters
 * (possibly multi-byte).  Only ASCII characters can be Magic.
 */
#define Magic(x)        ((int)(x) - 256)
#define un_Magic(x)     ((x) + 256)
#define is_Magic(x)     ((x) < 0)

/*
 * Return the literal form of "x": when "x" is Magic the magicness is removed,
 * otherwise "x" is returned unchanged.
 */
    static int
no_Magic(int x)
{
    if (is_Magic(x))
        return un_Magic(x);
    return x;
}

/*
 * Flip the magicness of "x": a Magic character becomes its literal
 * counterpart and a literal character becomes Magic.
 */
    static int
toggle_Magic(int x)
{
    if (is_Magic(x))
        return un_Magic(x);
    return Magic(x);
}
47
/*
 * The first byte of the BT regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.  It's used to catch the
 * most severe mutilation of the program by the caller.
 */

#define REGMAGIC        0234
55
/*
 * Utility definitions.
 */
// Read the byte at "p" as an unsigned character and widen it to int.
#define UCHARAT(p)      ((int)*(char_u *)(p))
60
// Used for an error (down from) vim_regcomp(): give the error message, set
// rc_did_emsg and return NULL or FAIL.  Each macro expands to a complete
// "return" statement, so it leaves the calling function.
// For the EMSG2/EMSG3 variants "m" is a format whose first "%s" receives ""
// when "c" is non-zero and "\" otherwise.
#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
Bram Moolenaar071d4272004-06-13 20:20:40 +000070
Bram Moolenaar95f09602016-11-10 20:01:45 +010071
// 32767 placed in the upper 16 bits: 0x7fff0000.
#define MAX_LIMIT       (32767L << 16L)
73
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020074static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
Bram Moolenaar966e58e2017-06-05 16:54:08 +020075static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76static char_u e_large_class[] = N_("E945: Range too large in character class");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020077#ifdef FEAT_SYN_HL
Bram Moolenaar5de820b2013-06-02 15:01:57 +020078static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
Bram Moolenaarbcf94422018-06-23 14:21:42 +020079static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020080#endif
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +020081static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
Bram Moolenaar2976c022013-06-05 21:30:37 +020082static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
Bram Moolenaar0270f382018-07-17 05:43:58 +020083static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
84
// return values for re_multi_type()
#define NOT_MULTI       0
#define MULTI_ONE       1
#define MULTI_MULT      2

// return values for regmatch()
#define RA_FAIL         1       // something failed, abort
#define RA_CONT         2       // continue in inner loop
#define RA_BREAK        3       // break inner loop
#define RA_MATCH        4       // successful match
#define RA_NOMATCH      5       // didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020095
Bram Moolenaar071d4272004-06-13 20:20:40 +000096/*
97 * Return NOT_MULTI if c is not a "multi" operator.
98 * Return MULTI_ONE if c is a single "multi" operator.
99 * Return MULTI_MULT if c is a multi "multi" operator.
100 */
101 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100102re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000103{
104 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
105 return MULTI_ONE;
106 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
107 return MULTI_MULT;
108 return NOT_MULTI;
109}
110
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000111static char_u *reg_prev_sub = NULL;
112
Bram Moolenaar071d4272004-06-13 20:20:40 +0000113/*
114 * REGEXP_INRANGE contains all characters which are always special in a []
115 * range after '\'.
116 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
117 * These are:
118 * \n - New line (NL).
119 * \r - Carriage Return (CR).
120 * \t - Tab (TAB).
121 * \e - Escape (ESC).
122 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000123 * \d - Character code in decimal, eg \d123
124 * \o - Character code in octal, eg \o80
125 * \x - Character code in hex, eg \x4a
126 * \u - Multibyte character code, eg \u20ac
127 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000128 */
129static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000130static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000131
Bram Moolenaar071d4272004-06-13 20:20:40 +0000132/*
133 * Translate '\x' to its control character, except "\n", which is Magic.
134 */
135 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100136backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000137{
138 switch (c)
139 {
140 case 'r': return CAR;
141 case 't': return TAB;
142 case 'e': return ESC;
143 case 'b': return BS;
144 }
145 return c;
146}
147
148/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000149 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000150 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
151 * recognized. Otherwise "pp" is advanced to after the item.
152 */
153 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100154get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000155{
156 static const char *(class_names[]) =
157 {
158 "alnum:]",
159#define CLASS_ALNUM 0
160 "alpha:]",
161#define CLASS_ALPHA 1
162 "blank:]",
163#define CLASS_BLANK 2
164 "cntrl:]",
165#define CLASS_CNTRL 3
166 "digit:]",
167#define CLASS_DIGIT 4
168 "graph:]",
169#define CLASS_GRAPH 5
170 "lower:]",
171#define CLASS_LOWER 6
172 "print:]",
173#define CLASS_PRINT 7
174 "punct:]",
175#define CLASS_PUNCT 8
176 "space:]",
177#define CLASS_SPACE 9
178 "upper:]",
179#define CLASS_UPPER 10
180 "xdigit:]",
181#define CLASS_XDIGIT 11
182 "tab:]",
183#define CLASS_TAB 12
184 "return:]",
185#define CLASS_RETURN 13
186 "backspace:]",
187#define CLASS_BACKSPACE 14
188 "escape:]",
189#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100190 "ident:]",
191#define CLASS_IDENT 16
192 "keyword:]",
193#define CLASS_KEYWORD 17
194 "fname:]",
195#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000196 };
197#define CLASS_NONE 99
198 int i;
199
200 if ((*pp)[1] == ':')
201 {
K.Takataeeec2542021-06-02 13:28:16 +0200202 for (i = 0; i < (int)ARRAY_LENGTH(class_names); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000203 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
204 {
205 *pp += STRLEN(class_names[i]) + 2;
206 return i;
207 }
208 }
209 return CLASS_NONE;
210}
211
/*
 * Specific version of character class functions.
 * Using a table to keep this fast.
 * class_tab[] holds one set of RI_ flags per byte value.
 */
static short    class_tab[256];

#define     RI_DIGIT    0x01
#define     RI_HEX      0x02
#define     RI_OCTAL    0x04
#define     RI_WORD     0x08
#define     RI_HEAD     0x10
#define     RI_ALPHA    0x20
#define     RI_LOWER    0x40
#define     RI_UPPER    0x80
#define     RI_WHITE    0x100
227
228 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100229init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000230{
231 int i;
232 static int done = FALSE;
233
234 if (done)
235 return;
236
237 for (i = 0; i < 256; ++i)
238 {
239 if (i >= '0' && i <= '7')
240 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
241 else if (i >= '8' && i <= '9')
242 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
243 else if (i >= 'a' && i <= 'f')
244 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
245#ifdef EBCDIC
246 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
247 || (i >= 's' && i <= 'z'))
248#else
249 else if (i >= 'g' && i <= 'z')
250#endif
251 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
252 else if (i >= 'A' && i <= 'F')
253 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
254#ifdef EBCDIC
255 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
256 || (i >= 'S' && i <= 'Z'))
257#else
258 else if (i >= 'G' && i <= 'Z')
259#endif
260 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
261 else if (i == '_')
262 class_tab[i] = RI_WORD + RI_HEAD;
263 else
264 class_tab[i] = 0;
265 }
266 class_tab[' '] |= RI_WHITE;
267 class_tab['\t'] |= RI_WHITE;
268 done = TRUE;
269}
270
// Character class tests using class_tab[].  Characters of 0x100 and above are
// never in a class.  "c" must not be negative: it is used as a table index.
// The argument is parenthesized so that an expression (e.g. "x ? a : b") can
// be passed safely; note that "c" is evaluated more than once.
#define ri_digit(c)     ((c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
#define ri_hex(c)       ((c) < 0x100 && (class_tab[(c)] & RI_HEX))
#define ri_octal(c)     ((c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
#define ri_word(c)      ((c) < 0x100 && (class_tab[(c)] & RI_WORD))
#define ri_head(c)      ((c) < 0x100 && (class_tab[(c)] & RI_HEAD))
#define ri_alpha(c)     ((c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
#define ri_lower(c)     ((c) < 0x100 && (class_tab[(c)] & RI_LOWER))
#define ri_upper(c)     ((c) < 0x100 && (class_tab[(c)] & RI_UPPER))
#define ri_white(c)     ((c) < 0x100 && (class_tab[(c)] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000280
// flags for regflags
#define RF_ICASE    1   // ignore case
#define RF_NOICASE  2   // don't ignore case
#define RF_HASNL    4   // can match a NL
#define RF_ICOMBINE 8   // ignore combining characters
#define RF_LOOKBH   16  // uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000287
288/*
289 * Global work variables for vim_regcomp().
290 */
291
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100292static char_u *regparse; // Input-scan pointer.
293static int regnpar; // () count.
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100294static int wants_nfa; // regex should use NFA engine
Bram Moolenaar071d4272004-06-13 20:20:40 +0000295#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100296static int regnzpar; // \z() count.
297static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000298#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100299static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000300#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100301static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000302#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000303
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100304static magic_T reg_magic; // magicness of the pattern
Bram Moolenaar071d4272004-06-13 20:20:40 +0000305
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100306static int reg_string; // matching with a string instead of a buffer
307 // line
308static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000309
310/*
311 * META contains all characters that may be magic, except '^' and '$'.
312 */
313
314#ifdef EBCDIC
315static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
316#else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100317// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000318static char_u META_flags[] = {
319 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
320 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100321// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000322 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100323// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000324 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100325// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000326 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100327// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000328 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100329// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000330 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100331// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000332 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
333};
334#endif
335
static int      curchr;         // currently parsed character
// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, eg in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int      prevchr;
static int      prevprevchr;    // previous-previous character
static int      nextchr;        // used for ungetchr()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000343
// arguments for reg()
#define REG_NOPAREN     0       // toplevel reg()
#define REG_PAREN       1       // \(\)
#define REG_ZPAREN      2       // \z(\)
#define REG_NPAREN      3       // \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000349
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200350typedef struct
351{
352 char_u *regparse;
353 int prevchr_len;
354 int curchr;
355 int prevchr;
356 int prevprevchr;
357 int nextchr;
358 int at_start;
359 int prev_at_start;
360 int regnpar;
361} parse_state_T;
362
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100363static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100364static int getchr(void);
365static void skipchr_keepstart(void);
366static int peekchr(void);
367static void skipchr(void);
368static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100369static long gethexchrs(int maxinputlen);
370static long getoctchrs(void);
371static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100372static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100373static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200374static int cstrncmp(char_u *s1, char_u *s2, int *n);
375static char_u *cstrchr(char_u *, int);
376static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100377static int reg_iswordc(int);
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100378#ifdef FEAT_EVAL
379static void report_re_switch(char_u *pat);
380#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000381
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200382static regengine_T bt_regengine;
383static regengine_T nfa_regengine;
384
Bram Moolenaar071d4272004-06-13 20:20:40 +0000385/*
386 * Return TRUE if compiled regular expression "prog" can match a line break.
387 */
388 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100389re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000390{
391 return (prog->regflags & RF_HASNL);
392}
393
394/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000395 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
396 * Returns a character representing the class. Zero means that no item was
397 * recognized. Otherwise "pp" is advanced to after the item.
398 */
399 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100400get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000401{
402 int c;
403 int l = 1;
404 char_u *p = *pp;
405
Bram Moolenaar985079c2019-02-16 17:07:47 +0100406 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000407 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000408 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000409 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000410 if (p[l + 2] == '=' && p[l + 3] == ']')
411 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000412 if (has_mbyte)
413 c = mb_ptr2char(p + 2);
414 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000415 c = p[2];
416 *pp += l + 4;
417 return c;
418 }
419 }
420 return 0;
421}
422
#ifdef EBCDIC
/*
 * Table for equivalence class "c". (IBM-1047)
 * Each entry starts with the base letter, followed by further byte values
 * that belong to the same class.
 */
static char *EQUIVAL_CLASS_C[16] = {
    "A\x62\x63\x64\x65\x66\x67",
    "C\x68",
    "E\x71\x72\x73\x74",
    "I\x75\x76\x77\x78",
    "N\x69",
    "O\xEB\xEC\xED\xEE\xEF\x80",
    "U\xFB\xFC\xFD\xFE",
    "Y\xBA",
    "a\x42\x43\x44\x45\x46\x47",
    "c\x48",
    "e\x51\x52\x53\x54",
    "i\x55\x56\x57\x58",
    "n\x49",
    "o\xCB\xCC\xCD\xCE\xCF\x70",
    "u\xDB\xDC\xDD\xDE",
    "y\x8D\xDF",
};
#endif
446
Bram Moolenaardf177f62005-02-22 08:39:57 +0000447/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000448 * Check for a collating element "[.a.]". "pp" points to the '['.
449 * Returns a character. Zero means that no item was recognized. Otherwise
450 * "pp" is advanced to after the item.
451 * Currently only single characters are recognized!
452 */
453 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100454get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000455{
456 int c;
457 int l = 1;
458 char_u *p = *pp;
459
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100460 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000461 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000462 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000463 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000464 if (p[l + 2] == '.' && p[l + 3] == ']')
465 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000466 if (has_mbyte)
467 c = mb_ptr2char(p + 2);
468 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000469 c = p[2];
470 *pp += l + 4;
471 return c;
472 }
473 }
474 return 0;
475}
476
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100477static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
478static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200479
480 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100481get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200482{
483 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
484 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
485}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000486
487/*
488 * Skip over a "[]" range.
489 * "p" must point to the character after the '['.
490 * The returned pointer is on the matching ']', or the terminating NUL.
491 */
492 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100493skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000494{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000495 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000496
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100497 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000498 ++p;
499 if (*p == ']' || *p == '-')
500 ++p;
501 while (*p != NUL && *p != ']')
502 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000503 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000504 p += l;
505 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000506 if (*p == '-')
507 {
508 ++p;
509 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100510 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000511 }
512 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200513 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000514 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200515 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000516 p += 2;
517 else if (*p == '[')
518 {
519 if (get_char_class(&p) == CLASS_NONE
520 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200521 && get_coll_element(&p) == 0
522 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100523 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000524 }
525 else
526 ++p;
527 }
528
529 return p;
530}
531
532/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000533 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200534 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000535 * Take care of characters with a backslash in front of it.
536 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000537 */
538 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100539skip_regexp(
540 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200541 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200542 int magic)
543{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100544 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200545}
546
547/*
548 * Call skip_regexp() and when the delimiter does not match give an error and
549 * return NULL.
550 */
551 char_u *
552skip_regexp_err(
553 char_u *startp,
554 int delim,
555 int magic)
556{
557 char_u *p = skip_regexp(startp, delim, magic);
558
559 if (*p != delim)
560 {
561 semsg(_("E654: missing delimiter after search pattern: %s"), startp);
562 return NULL;
563 }
564 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200565}
566
567/*
568 * skip_regexp() with extra arguments:
569 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
570 * expression and change "\?" to "?". If "*newp" is not NULL the expression
571 * is changed in-place.
572 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100573 * If "magic_val" is not NULL, returns the effective magicness of the pattern
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200574 */
575 char_u *
576skip_regexp_ex(
577 char_u *startp,
578 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100579 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200580 char_u **newp,
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100581 int *dropped,
582 magic_T *magic_val)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000583{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100584 magic_T mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000585 char_u *p = startp;
586
587 if (magic)
588 mymagic = MAGIC_ON;
589 else
590 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200591 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000592
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100593 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000594 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100595 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000596 break;
597 if ((p[0] == '[' && mymagic >= MAGIC_ON)
598 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
599 {
600 p = skip_anyof(p + 1);
601 if (p[0] == NUL)
602 break;
603 }
604 else if (p[0] == '\\' && p[1] != NUL)
605 {
606 if (dirc == '?' && newp != NULL && p[1] == '?')
607 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100608 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000609 if (*newp == NULL)
610 {
611 *newp = vim_strsave(startp);
612 if (*newp != NULL)
613 p = *newp + (p - startp);
614 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200615 if (dropped != NULL)
616 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000617 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000618 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000619 else
620 ++p;
621 }
622 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100623 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000624 if (*p == 'v')
625 mymagic = MAGIC_ALL;
626 else if (*p == 'V')
627 mymagic = MAGIC_NONE;
628 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000629 }
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100630 if (magic_val != NULL)
631 *magic_val = mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000632 return p;
633}
634
/*
 * Functions for getting characters from the regexp input.
 */
static int      prevchr_len;    // byte length of previous char
static int      at_start;       // True when on the first character
static int      prev_at_start;  // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100641
Bram Moolenaar071d4272004-06-13 20:20:40 +0000642/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200643 * Start parsing at "str".
644 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000645 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100646initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000647{
648 regparse = str;
649 prevchr_len = 0;
650 curchr = prevprevchr = prevchr = nextchr = -1;
651 at_start = TRUE;
652 prev_at_start = FALSE;
653}
654
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200655/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200656 * Save the current parse state, so that it can be restored and parsing
657 * starts in the same state again.
658 */
659 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100660save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200661{
662 ps->regparse = regparse;
663 ps->prevchr_len = prevchr_len;
664 ps->curchr = curchr;
665 ps->prevchr = prevchr;
666 ps->prevprevchr = prevprevchr;
667 ps->nextchr = nextchr;
668 ps->at_start = at_start;
669 ps->prev_at_start = prev_at_start;
670 ps->regnpar = regnpar;
671}
672
673/*
674 * Restore a previously saved parse state.
675 */
676 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100677restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200678{
679 regparse = ps->regparse;
680 prevchr_len = ps->prevchr_len;
681 curchr = ps->curchr;
682 prevchr = ps->prevchr;
683 prevprevchr = ps->prevprevchr;
684 nextchr = ps->nextchr;
685 at_start = ps->at_start;
686 prev_at_start = ps->prev_at_start;
687 regnpar = ps->regnpar;
688}
689
690
691/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200692 * Get the next character without advancing.
693 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000694 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100695peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000696{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000697 static int after_slash = FALSE;
698
Bram Moolenaar071d4272004-06-13 20:20:40 +0000699 if (curchr == -1)
700 {
701 switch (curchr = regparse[0])
702 {
703 case '.':
704 case '[':
705 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100706 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000707 if (reg_magic >= MAGIC_ON)
708 curchr = Magic(curchr);
709 break;
710 case '(':
711 case ')':
712 case '{':
713 case '%':
714 case '+':
715 case '=':
716 case '?':
717 case '@':
718 case '!':
719 case '&':
720 case '|':
721 case '<':
722 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100723 case '#': // future ext.
724 case '"': // future ext.
725 case '\'': // future ext.
726 case ',': // future ext.
727 case '-': // future ext.
728 case ':': // future ext.
729 case ';': // future ext.
730 case '`': // future ext.
731 case '/': // Can't be used in / command
732 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000733 if (reg_magic == MAGIC_ALL)
734 curchr = Magic(curchr);
735 break;
736 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100737 // * is not magic as the very first character, eg "?*ptr", when
738 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
739 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000740 if (reg_magic >= MAGIC_ON
741 && !at_start
742 && !(prev_at_start && prevchr == Magic('^'))
743 && (after_slash
744 || (prevchr != Magic('(')
745 && prevchr != Magic('&')
746 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000747 curchr = Magic('*');
748 break;
749 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100750 // '^' is only magic as the very first character and if it's after
751 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000752 if (reg_magic >= MAGIC_OFF
753 && (at_start
754 || reg_magic == MAGIC_ALL
755 || prevchr == Magic('(')
756 || prevchr == Magic('|')
757 || prevchr == Magic('&')
758 || prevchr == Magic('n')
759 || (no_Magic(prevchr) == '('
760 && prevprevchr == Magic('%'))))
761 {
762 curchr = Magic('^');
763 at_start = TRUE;
764 prev_at_start = FALSE;
765 }
766 break;
767 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100768 // '$' is only magic as the very last char and if it's in front of
769 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000770 if (reg_magic >= MAGIC_OFF)
771 {
772 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200773 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000774
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100775 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000776 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200777 || p[1] == 'm' || p[1] == 'M'
778 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
779 {
780 if (p[1] == 'v')
781 is_magic_all = TRUE;
782 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
783 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000784 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200785 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000786 if (p[0] == NUL
787 || (p[0] == '\\'
788 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
789 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200790 || (is_magic_all
791 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000792 || reg_magic == MAGIC_ALL)
793 curchr = Magic('$');
794 }
795 break;
796 case '\\':
797 {
798 int c = regparse[1];
799
800 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100801 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000802 else if (
803#ifdef EBCDIC
804 vim_strchr(META, c)
805#else
806 c <= '~' && META_flags[c]
807#endif
808 )
809 {
810 /*
811 * META contains everything that may be magic sometimes,
812 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200813 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000814 * magicness. Therefore, \ is so meta-magic that it is
815 * not in META.
816 */
817 curchr = -1;
818 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100819 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000820 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000821 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000822 peekchr();
823 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000824 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000825 curchr = toggle_Magic(curchr);
826 }
827 else if (vim_strchr(REGEXP_ABBR, c))
828 {
829 /*
830 * Handle abbreviations, like "\t" for TAB -- webb
831 */
832 curchr = backslash_trans(c);
833 }
834 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
835 curchr = toggle_Magic(c);
836 else
837 {
838 /*
839 * Next character can never be (made) magic?
840 * Then backslashing it won't do anything.
841 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000842 if (has_mbyte)
843 curchr = (*mb_ptr2char)(regparse + 1);
844 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000845 curchr = c;
846 }
847 break;
848 }
849
Bram Moolenaar071d4272004-06-13 20:20:40 +0000850 default:
851 if (has_mbyte)
852 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000853 }
854 }
855
856 return curchr;
857}
858
859/*
860 * Eat one lexed character. Do this in a way that we can undo it.
861 */
862 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100863skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000864{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100865 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000866 if (*regparse == '\\')
867 prevchr_len = 1;
868 else
869 prevchr_len = 0;
870 if (regparse[prevchr_len] != NUL)
871 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000872 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100873 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000874 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000875 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000876 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000877 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000878 ++prevchr_len;
879 }
880 regparse += prevchr_len;
881 prev_at_start = at_start;
882 at_start = FALSE;
883 prevprevchr = prevchr;
884 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100885 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000886 nextchr = -1;
887}
888
889/*
890 * Skip a character while keeping the value of prev_at_start for at_start.
891 * prevchr and prevprevchr are also kept.
892 */
893 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100894skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000895{
896 int as = prev_at_start;
897 int pr = prevchr;
898 int prpr = prevprevchr;
899
900 skipchr();
901 at_start = as;
902 prevchr = pr;
903 prevprevchr = prpr;
904}
905
/*
 * Lex the next character of the pattern and consume it.
 * A lexical analyzer is needed because of magic and such: the result is what
 * peekchr() reports for the current position.
 */
    static int
getchr(void)
{
    int lexed = peekchr();

    skipchr();
    return lexed;
}
918
919/*
920 * put character back. Works only once!
921 */
922 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100923ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000924{
925 nextchr = curchr;
926 curchr = prevchr;
927 prevchr = prevprevchr;
928 at_start = prev_at_start;
929 prev_at_start = FALSE;
930
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100931 // Backup regparse, so that it's at the same position as before the
932 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000933 regparse -= prevchr_len;
934}
935
936/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000937 * Get and return the value of the hex string at the current position.
938 * Return -1 if there is no valid hex number.
939 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000940 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000941 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000942 * The parameter controls the maximum number of input characters. This will be
943 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
944 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100945 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100946gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000947{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100948 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000949 int c;
950 int i;
951
952 for (i = 0; i < maxinputlen; ++i)
953 {
954 c = regparse[0];
955 if (!vim_isxdigit(c))
956 break;
957 nr <<= 4;
958 nr |= hex2nr(c);
959 ++regparse;
960 }
961
962 if (i == 0)
963 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100964 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000965}
966
967/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200968 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000969 * current position. Return -1 for invalid. Consumes all digits.
970 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100971 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100972getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000973{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100974 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000975 int c;
976 int i;
977
978 for (i = 0; ; ++i)
979 {
980 c = regparse[0];
981 if (c < '0' || c > '9')
982 break;
983 nr *= 10;
984 nr += c - '0';
985 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100986 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000987 }
988
989 if (i == 0)
990 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100991 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000992}
993
994/*
995 * get and return the value of the octal string immediately after the current
996 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
997 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
998 * treat 8 or 9 as recognised characters. Position is updated:
999 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +00001000 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001001 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001002 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01001003getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001004{
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001005 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001006 int c;
1007 int i;
1008
1009 for (i = 0; i < 3 && nr < 040; ++i)
1010 {
1011 c = regparse[0];
1012 if (c < '0' || c > '7')
1013 break;
1014 nr <<= 3;
1015 nr |= hex2nr(c);
1016 ++regparse;
1017 }
1018
1019 if (i == 0)
1020 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001021 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001022}
1023
1024/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001025 * read_limits - Read two integers to be taken as a minimum and maximum.
1026 * If the first character is '-', then the range is reversed.
1027 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1028 * missing, a very big number is the default.
1029 */
1030 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001031read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001032{
1033 int reverse = FALSE;
1034 char_u *first_char;
1035 long tmp;
1036
1037 if (*regparse == '-')
1038 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001039 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001040 regparse++;
1041 reverse = TRUE;
1042 }
1043 first_char = regparse;
1044 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001045 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001046 {
1047 if (vim_isdigit(*++regparse))
1048 *maxval = getdigits(&regparse);
1049 else
1050 *maxval = MAX_LIMIT;
1051 }
1052 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001053 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001054 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001055 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001056 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001057 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001058 if (*regparse != '}')
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001059 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1060 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001061
1062 /*
1063 * Reverse the range if there was a '-', or make sure it is in the right
1064 * order otherwise.
1065 */
1066 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1067 {
1068 tmp = *minval;
1069 *minval = *maxval;
1070 *maxval = tmp;
1071 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001072 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001073 return OK;
1074}
1075
1076/*
1077 * vim_regexec and friends
1078 */
1079
1080/*
1081 * Global work variables for vim_regexec().
1082 */
1083
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001084static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001085#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001086static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001087#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001088static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001089static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001090
1091/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001092 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1093 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001094 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001095 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001096static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001097static unsigned reg_tofreelen;
1098
1099/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001100 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001101 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001102 * done:
1103 * single-line multi-line
1104 * reg_match &regmatch_T NULL
1105 * reg_mmatch NULL &regmmatch_T
1106 * reg_startp reg_match->startp <invalid>
1107 * reg_endp reg_match->endp <invalid>
1108 * reg_startpos <invalid> reg_mmatch->startpos
1109 * reg_endpos <invalid> reg_mmatch->endpos
1110 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001111 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001112 * reg_firstlnum <invalid> first line in which to search
1113 * reg_maxline 0 last line nr
1114 * reg_line_lbr FALSE or TRUE FALSE
1115 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001116typedef struct {
1117 regmatch_T *reg_match;
1118 regmmatch_T *reg_mmatch;
1119 char_u **reg_startp;
1120 char_u **reg_endp;
1121 lpos_T *reg_startpos;
1122 lpos_T *reg_endpos;
1123 win_T *reg_win;
1124 buf_T *reg_buf;
1125 linenr_T reg_firstlnum;
1126 linenr_T reg_maxline;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001127 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001128
Bram Moolenaar0270f382018-07-17 05:43:58 +02001129 // The current match-position is stord in these variables:
1130 linenr_T lnum; // line number, relative to first line
1131 char_u *line; // start of current line
Bram Moolenaar64066b92021-11-17 18:22:56 +00001132 char_u *input; // current input, points into "line"
Bram Moolenaar0270f382018-07-17 05:43:58 +02001133
1134 int need_clear_subexpr; // subexpressions still need to be cleared
1135#ifdef FEAT_SYN_HL
1136 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1137 // cleared
1138#endif
1139
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001140 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1141 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1142 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001143 int reg_ic;
1144
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001145 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1146 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001147 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001148
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001149 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1150 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001151 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001152
1153 // State for the NFA engine regexec.
1154 int nfa_has_zend; // NFA regexp \ze operator encountered.
1155 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1156 int nfa_nsubexpr; // Number of sub expressions actually being used
1157 // during execution. 1 if only the whole match
1158 // (subexpr 0) is used.
1159 // listid is global, so that it increases on recursive calls to
1160 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1161 // all the states.
1162 int nfa_listid;
1163 int nfa_alt_listid;
1164
1165#ifdef FEAT_SYN_HL
1166 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1167#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001168} regexec_T;
1169
1170static regexec_T rex;
1171static int rex_in_use = FALSE;
1172
Bram Moolenaar071d4272004-06-13 20:20:40 +00001173/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001174 * Return TRUE if character 'c' is included in 'iskeyword' option for
1175 * "reg_buf" buffer.
1176 */
1177 static int
1178reg_iswordc(int c)
1179{
1180 return vim_iswordc_buf(c, rex.reg_buf);
1181}
1182
1183/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001184 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1185 */
1186 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001187reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001188{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001189 // when looking behind for a match/no-match lnum is negative. But we
1190 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001191 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001192 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001193 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001194 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001195 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001196 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001197}
1198
#ifdef FEAT_SYN_HL
// Workspace for the \z(...\) matches.
static char_u   *reg_startzp[NSUBEXP];      // start of each match (pointer)
static char_u   *reg_endzp[NSUBEXP];        // end of each match (pointer)
static lpos_T   reg_startzpos[NSUBEXP];     // idem, start as a position
static lpos_T   reg_endzpos[NSUBEXP];       // idem, end as a position
#endif

// TRUE when a multi-line regexp is being used: rex.reg_match is NULL then.
#define REG_MULTI       (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001208
Bram Moolenaar071d4272004-06-13 20:20:40 +00001209#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001210/*
1211 * Create a new extmatch and mark it as referenced once.
1212 */
1213 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001214make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001215{
1216 reg_extmatch_T *em;
1217
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001218 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001219 if (em != NULL)
1220 em->refcnt = 1;
1221 return em;
1222}
1223
1224/*
1225 * Add a reference to an extmatch.
1226 */
1227 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001228ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001229{
1230 if (em != NULL)
1231 em->refcnt++;
1232 return em;
1233}
1234
1235/*
1236 * Remove a reference to an extmatch. If there are no references left, free
1237 * the info.
1238 */
1239 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001240unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001241{
1242 int i;
1243
1244 if (em != NULL && --em->refcnt <= 0)
1245 {
1246 for (i = 0; i < NSUBEXP; ++i)
1247 vim_free(em->matches[i]);
1248 vim_free(em);
1249 }
1250}
1251#endif
1252
1253/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001254 * Get class of previous character.
1255 */
1256 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001257reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001258{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001259 if (rex.input > rex.line)
1260 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001261 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001262 return -1;
1263}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001264
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001265/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001266 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001267 */
1268 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001269reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001270{
1271 pos_T top, bot;
1272 linenr_T lnum;
1273 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001274 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001275 int mode;
1276 colnr_T start, end;
1277 colnr_T start2, end2;
1278 colnr_T cols;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001279 colnr_T curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001280
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001281 // Check if the buffer is the current buffer.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001282 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001283 return FALSE;
1284
1285 if (VIsual_active)
1286 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001287 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001288 {
1289 top = VIsual;
1290 bot = wp->w_cursor;
1291 }
1292 else
1293 {
1294 top = wp->w_cursor;
1295 bot = VIsual;
1296 }
1297 mode = VIsual_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001298 curswant = wp->w_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001299 }
1300 else
1301 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001302 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001303 {
1304 top = curbuf->b_visual.vi_start;
1305 bot = curbuf->b_visual.vi_end;
1306 }
1307 else
1308 {
1309 top = curbuf->b_visual.vi_end;
1310 bot = curbuf->b_visual.vi_start;
1311 }
1312 mode = curbuf->b_visual.vi_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001313 curswant = curbuf->b_visual.vi_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001314 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001315 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001316 if (lnum < top.lnum || lnum > bot.lnum)
1317 return FALSE;
1318
1319 if (mode == 'v')
1320 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001321 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001322 if ((lnum == top.lnum && col < top.col)
1323 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1324 return FALSE;
1325 }
1326 else if (mode == Ctrl_V)
1327 {
1328 getvvcol(wp, &top, &start, NULL, &end);
1329 getvvcol(wp, &bot, &start2, NULL, &end2);
1330 if (start2 < start)
1331 start = start2;
1332 if (end2 > end)
1333 end = end2;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001334 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001335 end = MAXCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001336 cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001337 if (cols < start || cols > end - (*p_sel == 'e'))
1338 return FALSE;
1339 }
1340 return TRUE;
1341}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001342
Bram Moolenaar071d4272004-06-13 20:20:40 +00001343/*
1344 * Check the regexp program for its magic number.
1345 * Return TRUE if it's wrong.
1346 */
1347 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001348prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001349{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001350 regprog_T *prog;
1351
Bram Moolenaar6100d022016-10-02 16:51:57 +02001352 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001353 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001354 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001355 return FALSE;
1356
1357 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001358 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001359 emsg(_(e_corrupted_regexp_program));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001360 return TRUE;
1361 }
1362 return FALSE;
1363}
1364
1365/*
1366 * Cleanup the subexpressions, if this wasn't done yet.
1367 * This construction is used to clear the subexpressions only when they are
1368 * used (to increase speed).
1369 */
1370 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001371cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001372{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001373 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001374 {
1375 if (REG_MULTI)
1376 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001377 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001378 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1379 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001380 }
1381 else
1382 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001383 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1384 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001385 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001386 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001387 }
1388}
1389
1390#ifdef FEAT_SYN_HL
1391 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001392cleanup_zsubexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001393{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001394 if (rex.need_clear_zsubexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001395 {
1396 if (REG_MULTI)
1397 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001398 // Use 0xff to set lnum to -1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001399 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1400 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1401 }
1402 else
1403 {
1404 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
1405 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
1406 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001407 rex.need_clear_zsubexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001408 }
1409}
1410#endif
1411
1412/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001413 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001414 */
1415 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001416reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001417{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001418 rex.line = reg_getline(++rex.lnum);
1419 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001420 fast_breakcheck();
1421}
1422
1423/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001424 * Check whether a backreference matches.
1425 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001426 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1427 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001428 */
1429 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001430match_with_backref(
1431 linenr_T start_lnum,
1432 colnr_T start_col,
1433 linenr_T end_lnum,
1434 colnr_T end_col,
1435 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001436{
1437 linenr_T clnum = start_lnum;
1438 colnr_T ccol = start_col;
1439 int len;
1440 char_u *p;
1441
1442 if (bytelen != NULL)
1443 *bytelen = 0;
1444 for (;;)
1445 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001446 // Since getting one line may invalidate the other, need to make copy.
1447 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001448 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001449 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001450 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001451 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1452 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001453 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001454 vim_free(reg_tofree);
1455 reg_tofree = alloc(len);
1456 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001457 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001458 reg_tofreelen = len;
1459 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001460 STRCPY(reg_tofree, rex.line);
1461 rex.input = reg_tofree + (rex.input - rex.line);
1462 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001463 }
1464
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001465 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001466 p = reg_getline(clnum);
1467 if (clnum == end_lnum)
1468 len = end_col - ccol;
1469 else
1470 len = (int)STRLEN(p + ccol);
1471
Bram Moolenaar0270f382018-07-17 05:43:58 +02001472 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001473 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001474 if (bytelen != NULL)
1475 *bytelen += len;
1476 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001477 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001478 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001479 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001480
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001481 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001482 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001483 if (bytelen != NULL)
1484 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001485 ++clnum;
1486 ccol = 0;
1487 if (got_int)
1488 return RA_FAIL;
1489 }
1490
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001491 // found a match! Note that rex.line may now point to a copy of the line,
1492 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001493 return RA_MATCH;
1494}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001495
Bram Moolenaarfb031402014-09-09 17:18:49 +02001496/*
1497 * Used in a place where no * or \+ can follow.
1498 */
1499 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001500re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001501{
1502 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001503 {
1504 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1505 rc_did_emsg = TRUE;
1506 return FAIL;
1507 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001508 return OK;
1509}
1510
/*
 * One entry of the decomposition table: a character and up to two characters
 * that follow it.  Unused trailing slots are zero.
 */
typedef struct
{
    int a, b, c;
} decomp_T;

// First and last character covered by decomp_table[].
#define DECOMP_FIRST    0xfb20
#define DECOMP_LAST     0xfb4f

/*
 * Decomposition of the Hebrew presentation forms 0xfb20 - 0xfb4f, indexed by
 * the character minus DECOMP_FIRST.  Unassigned code points map to
 * themselves.
 */
static decomp_T decomp_table[DECOMP_LAST - DECOMP_FIRST + 1] =
{
    {0x5e2, 0, 0},              // 0xfb20       alt ayin
    {0x5d0, 0, 0},              // 0xfb21       alt alef
    {0x5d3, 0, 0},              // 0xfb22       alt dalet
    {0x5d4, 0, 0},              // 0xfb23       alt he
    {0x5db, 0, 0},              // 0xfb24       alt kaf
    {0x5dc, 0, 0},              // 0xfb25       alt lamed
    {0x5dd, 0, 0},              // 0xfb26       alt mem-sofit
    {0x5e8, 0, 0},              // 0xfb27       alt resh
    {0x5ea, 0, 0},              // 0xfb28       alt tav
    {'+', 0, 0},                // 0xfb29       alt plus
    {0x5e9, 0x5c1, 0},          // 0xfb2a       shin+shin-dot
    {0x5e9, 0x5c2, 0},          // 0xfb2b       shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},      // 0xfb2c       shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},      // 0xfb2d       shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},          // 0xfb2e       alef+patah
    {0x5d0, 0x5b8, 0},          // 0xfb2f       alef+qamats
    // U+FB30 is HEBREW LETTER ALEF WITH MAPIQ; the point is 0x5bc, not the
    // hiriq 0x5b4.
    {0x5d0, 0x5bc, 0},          // 0xfb30       alef+mapiq
    {0x5d1, 0x5bc, 0},          // 0xfb31       bet+dagesh
    {0x5d2, 0x5bc, 0},          // 0xfb32       gimel+dagesh
    {0x5d3, 0x5bc, 0},          // 0xfb33       dalet+dagesh
    {0x5d4, 0x5bc, 0},          // 0xfb34       he+dagesh
    {0x5d5, 0x5bc, 0},          // 0xfb35       vav+dagesh
    {0x5d6, 0x5bc, 0},          // 0xfb36       zayin+dagesh
    {0xfb37, 0, 0},             // 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},          // 0xfb38       tet+dagesh
    {0x5d9, 0x5bc, 0},          // 0xfb39       yud+dagesh
    {0x5da, 0x5bc, 0},          // 0xfb3a       kaf sofit+dagesh
    {0x5db, 0x5bc, 0},          // 0xfb3b       kaf+dagesh
    {0x5dc, 0x5bc, 0},          // 0xfb3c       lamed+dagesh
    {0xfb3d, 0, 0},             // 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},          // 0xfb3e       mem+dagesh
    {0xfb3f, 0, 0},             // 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},          // 0xfb40       nun+dagesh
    {0x5e1, 0x5bc, 0},          // 0xfb41       samech+dagesh
    {0xfb42, 0, 0},             // 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},          // 0xfb43       pe sofit+dagesh
    {0x5e4, 0x5bc, 0},          // 0xfb44       pe+dagesh
    {0xfb45, 0, 0},             // 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},          // 0xfb46       tsadi+dagesh
    {0x5e7, 0x5bc, 0},          // 0xfb47       qof+dagesh
    {0x5e8, 0x5bc, 0},          // 0xfb48       resh+dagesh
    {0x5e9, 0x5bc, 0},          // 0xfb49       shin+dagesh
    {0x5ea, 0x5bc, 0},          // 0xfb4a       tav+dagesh
    {0x5d5, 0x5b9, 0},          // 0xfb4b       vav+holam
    {0x5d1, 0x5bf, 0},          // 0xfb4c       bet+rafe
    {0x5db, 0x5bf, 0},          // 0xfb4d       kaf+rafe
    {0x5e4, 0x5bf, 0},          // 0xfb4e       pe+rafe
    {0x5d0, 0x5dc, 0}           // 0xfb4f       alef-lamed
};

/*
 * Decompose character "c" into "*c1", "*c2" and "*c3".
 * For a character in the range of decomp_table[] the three values come from
 * the table.  Any other character is returned in "*c1" with "*c2" and "*c3"
 * set to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    const decomp_T *entry;

    if (c < DECOMP_FIRST || c > DECOMP_LAST)
    {
        // Not in the table: the character stands for itself.
        *c1 = c;
        *c2 = 0;
        *c3 = 0;
        return;
    }

    entry = &decomp_table[c - DECOMP_FIRST];
    *c1 = entry->a;
    *c2 = entry->b;
    *c3 = entry->c;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001588
1589/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001590 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001591 * Return 0 if strings match, non-zero otherwise.
1592 * Correct the length "*n" when composing characters are ignored.
1593 */
1594 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001595cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001596{
1597 int result;
1598
Bram Moolenaar6100d022016-10-02 16:51:57 +02001599 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001600 result = STRNCMP(s1, s2, *n);
1601 else
1602 result = MB_STRNICMP(s1, s2, *n);
1603
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001604 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001605 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001606 {
1607 char_u *str1, *str2;
1608 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001609 int junk;
1610
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001611 // we have to handle the strcmp ourselves, since it is necessary to
1612 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001613 str1 = s1;
1614 str2 = s2;
1615 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001616 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001617 {
1618 c1 = mb_ptr2char_adv(&str1);
1619 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001620
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001621 // Decompose the character if necessary, into 'base' characters.
1622 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001623 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001624 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001625 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001626 mb_decompose(c1, &c11, &junk, &junk);
1627 mb_decompose(c2, &c12, &junk, &junk);
1628 c1 = c11;
1629 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001630 if (c11 != c12
1631 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001632 break;
1633 }
1634 }
1635 result = c2 - c1;
1636 if (result == 0)
1637 *n = (int)(str2 - s2);
1638 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001639
1640 return result;
1641}
1642
1643/*
1644 * cstrchr: This function is used a lot for simple searches, keep it fast!
1645 */
1646 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001647cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001648{
1649 char_u *p;
1650 int cc;
1651
Bram Moolenaara12a1612019-01-24 16:39:02 +01001652 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001653 return vim_strchr(s, c);
1654
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001655 // tolower() and toupper() can be slow, comparing twice should be a lot
1656 // faster (esp. when using MS Visual C++!).
1657 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001658 if (enc_utf8 && c > 0x80)
1659 cc = utf_fold(c);
1660 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001661 if (MB_ISUPPER(c))
1662 cc = MB_TOLOWER(c);
1663 else if (MB_ISLOWER(c))
1664 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001665 else
1666 return vim_strchr(s, c);
1667
Bram Moolenaar071d4272004-06-13 20:20:40 +00001668 if (has_mbyte)
1669 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001670 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001671 {
1672 if (enc_utf8 && c > 0x80)
1673 {
1674 if (utf_fold(utf_ptr2char(p)) == cc)
1675 return p;
1676 }
1677 else if (*p == c || *p == cc)
1678 return p;
1679 }
1680 }
1681 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001682 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001683 for (p = s; *p != NUL; ++p)
1684 if (*p == c || *p == cc)
1685 return p;
1686
1687 return NULL;
1688}
1689
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001690////////////////////////////////////////////////////////////////
1691// regsub stuff //
1692////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001693
Bram Moolenaar071d4272004-06-13 20:20:40 +00001694/*
1695 * We should define ftpr as a pointer to a function returning a pointer to
1696 * a function returning a pointer to a function ...
1697 * This is impossible, so we declare a pointer to a function returning a
Bram Moolenaar30d64132020-09-06 17:09:12 +02001698 * void pointer. This should work for all compilers.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001699 */
Bram Moolenaar30d64132020-09-06 17:09:12 +02001700typedef void (*(*fptr_T)(int *, int));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001701
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001702static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001703
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001704 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001705do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001706{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001707 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001708
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001709 return (fptr_T)NULL;
1710}
1711
1712 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001713do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001714{
1715 *d = MB_TOUPPER(c);
1716
1717 return (fptr_T)do_Upper;
1718}
1719
1720 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001721do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001722{
1723 *d = MB_TOLOWER(c);
1724
1725 return (fptr_T)NULL;
1726}
1727
1728 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001729do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001730{
1731 *d = MB_TOLOWER(c);
1732
1733 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001734}
1735
1736/*
1737 * regtilde(): Replace tildes in the pattern by the old pattern.
1738 *
1739 * Short explanation of the tilde: It stands for the previous replacement
1740 * pattern. If that previous pattern also contains a ~ we should go back a
1741 * step further... But we insert the previous pattern into the current one
1742 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001743 * This still does not handle the case where "magic" changes. So require the
1744 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001745 *
1746 * The tildes are parsed once before the first call to vim_regsub().
1747 */
1748 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001749regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001750{
1751 char_u *newsub = source;
1752 char_u *tmpsub;
1753 char_u *p;
1754 int len;
1755 int prevlen;
1756
1757 for (p = newsub; *p; ++p)
1758 {
1759 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1760 {
1761 if (reg_prev_sub != NULL)
1762 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001763 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001764 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001765 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001766 if (tmpsub != NULL)
1767 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001768 // copy prefix
1769 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001770 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001771 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001772 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001773 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001774 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001775 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001776 STRCPY(tmpsub + len + prevlen, p + 1);
1777
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001778 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001779 vim_free(newsub);
1780 newsub = tmpsub;
1781 p = newsub + len + prevlen;
1782 }
1783 }
1784 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001785 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001786 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001787 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001788 --p;
1789 }
1790 else
1791 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001792 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001793 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001794 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001795 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001796 }
1797 }
1798
1799 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001800 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001801 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001802 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001803 reg_prev_sub = vim_strsave(newsub);
1804 return newsub;
1805}
1806
#ifdef FEAT_EVAL
// TRUE when submatch() can be used
static int can_f_submatch = FALSE;

// These pointers are used for reg_submatch().  Needed for when the
// substitution string is an expression that contains a call to substitute()
// and submatch().
// vim_regsub_both() fills this from the fields of "rex" with the matching
// names before evaluating the expression.
typedef struct
{
    regmatch_T	*sm_match;	// from rex.reg_match
    regmmatch_T	*sm_mmatch;	// from rex.reg_mmatch
    linenr_T	sm_firstlnum;	// from rex.reg_firstlnum
    linenr_T	sm_maxline;	// from rex.reg_maxline
    int		sm_line_lbr;	// from rex.reg_line_lbr
} regsubmatch_T;

// can only be used when can_f_submatch is TRUE
static regsubmatch_T rsm;
#endif
1823
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001824#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001825
1826/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001827 * Put the submatches in "argv[argskip]" which is a list passed into
1828 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001829 */
1830 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001831fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001832{
1833 listitem_T *li;
1834 int i;
1835 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001836 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001837
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001838 if (argcount == argskip)
1839 // called function doesn't take a submatches argument
1840 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001841
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001842 // Relies on sl_list to be the first item in staticList10_T.
1843 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001844
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001845 // There are always 10 list items in staticList10_T.
1846 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001847 for (i = 0; i < 10; ++i)
1848 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001849 s = rsm.sm_match->startp[i];
1850 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001851 s = NULL;
1852 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02001853 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001854 li->li_tv.v_type = VAR_STRING;
1855 li->li_tv.vval.v_string = s;
1856 li = li->li_next;
1857 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001858 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001859}
1860
1861 static void
1862clear_submatch_list(staticList10_T *sl)
1863{
1864 int i;
1865
1866 for (i = 0; i < 10; ++i)
1867 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1868}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001869#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001870
Bram Moolenaar071d4272004-06-13 20:20:40 +00001871/*
1872 * vim_regsub() - perform substitutions after a vim_regexec() or
1873 * vim_regexec_multi() match.
1874 *
1875 * If "copy" is TRUE really copy into "dest".
1876 * If "copy" is FALSE nothing is copied, this is just to find out the length
1877 * of the result.
1878 *
1879 * If "backslash" is TRUE, a backslash will be removed later, need to double
1880 * them to keep them, and insert a backslash before a CR to avoid it being
1881 * replaced with a line break later.
1882 *
1883 * Note: The matched text must not change between the call of
1884 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1885 * references invalid!
1886 *
1887 * Returns the size of the replacement, including terminating NUL.
1888 */
1889 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001890vim_regsub(
1891 regmatch_T *rmp,
1892 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001893 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001894 char_u *dest,
1895 int copy,
1896 int magic,
1897 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001898{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001899 int result;
1900 regexec_T rex_save;
1901 int rex_in_use_save = rex_in_use;
1902
1903 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001904 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001905 rex_save = rex;
1906 rex_in_use = TRUE;
1907
1908 rex.reg_match = rmp;
1909 rex.reg_mmatch = NULL;
1910 rex.reg_maxline = 0;
1911 rex.reg_buf = curbuf;
1912 rex.reg_line_lbr = TRUE;
1913 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1914
1915 rex_in_use = rex_in_use_save;
1916 if (rex_in_use)
1917 rex = rex_save;
1918
1919 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001920}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001921
1922 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001923vim_regsub_multi(
1924 regmmatch_T *rmp,
1925 linenr_T lnum,
1926 char_u *source,
1927 char_u *dest,
1928 int copy,
1929 int magic,
1930 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001931{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001932 int result;
1933 regexec_T rex_save;
1934 int rex_in_use_save = rex_in_use;
1935
1936 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001937 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001938 rex_save = rex;
1939 rex_in_use = TRUE;
1940
1941 rex.reg_match = NULL;
1942 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001943 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001944 rex.reg_firstlnum = lnum;
1945 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1946 rex.reg_line_lbr = FALSE;
1947 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1948
1949 rex_in_use = rex_in_use_save;
1950 if (rex_in_use)
1951 rex = rex_save;
1952
1953 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001954}
1955
1956 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001957vim_regsub_both(
1958 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001959 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001960 char_u *dest,
1961 int copy,
1962 int magic,
1963 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001964{
1965 char_u *src;
1966 char_u *dst;
1967 char_u *s;
1968 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001969 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001970 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001971 fptr_T func_all = (fptr_T)NULL;
1972 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001973 linenr_T clnum = 0; // init for GCC
1974 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001975#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001976 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001977#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001978
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001979 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001980 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001981 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001982 emsg(_(e_null_argument));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001983 return 0;
1984 }
1985 if (prog_magic_wrong())
1986 return 0;
1987 src = source;
1988 dst = dest;
1989
1990 /*
1991 * When the substitute part starts with "\=" evaluate it as an expression.
1992 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001993 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001994 {
1995#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001996 // To make sure that the length doesn't change between checking the
1997 // length and copying the string, and to speed up things, the
1998 // resulting string is saved from the call with "copy" == FALSE to the
1999 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002000 if (copy)
2001 {
2002 if (eval_result != NULL)
2003 {
2004 STRCPY(dest, eval_result);
2005 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01002006 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002007 }
2008 }
2009 else
2010 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002011 int prev_can_f_submatch = can_f_submatch;
2012 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002013
2014 vim_free(eval_result);
2015
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002016 // The expression may contain substitute(), which calls us
2017 // recursively. Make sure submatch() gets the text from the first
2018 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002019 if (can_f_submatch)
2020 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002021 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002022 rsm.sm_match = rex.reg_match;
2023 rsm.sm_mmatch = rex.reg_mmatch;
2024 rsm.sm_firstlnum = rex.reg_firstlnum;
2025 rsm.sm_maxline = rex.reg_maxline;
2026 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002027
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002028 if (expr != NULL)
2029 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002030 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002031 char_u buf[NUMBUFLEN];
2032 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002033 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002034 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002035
2036 rettv.v_type = VAR_STRING;
2037 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002038 argv[0].v_type = VAR_LIST;
2039 argv[0].vval.v_list = &matchList.sl_list;
2040 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002041 CLEAR_FIELD(funcexe);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002042 funcexe.argv_func = fill_submatch_list;
2043 funcexe.evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002044 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002045 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002046 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002047 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002048 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002049 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002050 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002051 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002052
Bram Moolenaar6100d022016-10-02 16:51:57 +02002053 s = partial_name(partial);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002054 funcexe.partial = partial;
2055 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002056 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002057 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002058 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002059 clear_submatch_list(&matchList);
2060
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002061 if (rettv.v_type == VAR_UNKNOWN)
2062 // something failed, no need to report another error
2063 eval_result = NULL;
2064 else
2065 {
2066 eval_result = tv_get_string_buf_chk(&rettv, buf);
2067 if (eval_result != NULL)
2068 eval_result = vim_strsave(eval_result);
2069 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002070 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002071 }
Bram Moolenaar4c137212021-04-19 16:48:48 +02002072 else if (substitute_instr != NULL)
2073 // Execute instructions from ISN_SUBSTITUTE.
2074 eval_result = exe_substitute_instr();
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002075 else
Bram Moolenaarb171fb12020-06-24 20:34:03 +02002076 eval_result = eval_to_string(source + 2, TRUE);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002077
Bram Moolenaar071d4272004-06-13 20:20:40 +00002078 if (eval_result != NULL)
2079 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002080 int had_backslash = FALSE;
2081
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002082 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002083 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002084 // Change NL to CR, so that it becomes a line break,
2085 // unless called from vim_regexec_nl().
2086 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002087 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002088 *s = CAR;
2089 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002090 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002091 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002092 /* Change NL to CR here too, so that this works:
2093 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2094 * abc\
2095 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002096 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002097 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002098 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002099 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002100 had_backslash = TRUE;
2101 }
2102 }
2103 if (had_backslash && backslash)
2104 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002105 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002106 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2107 if (s != NULL)
2108 {
2109 vim_free(eval_result);
2110 eval_result = s;
2111 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002112 }
2113
2114 dst += STRLEN(eval_result);
2115 }
2116
Bram Moolenaar6100d022016-10-02 16:51:57 +02002117 can_f_submatch = prev_can_f_submatch;
2118 if (can_f_submatch)
2119 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002120 }
2121#endif
2122 }
2123 else
2124 while ((c = *src++) != NUL)
2125 {
2126 if (c == '&' && magic)
2127 no = 0;
2128 else if (c == '\\' && *src != NUL)
2129 {
2130 if (*src == '&' && !magic)
2131 {
2132 ++src;
2133 no = 0;
2134 }
2135 else if ('0' <= *src && *src <= '9')
2136 {
2137 no = *src++ - '0';
2138 }
2139 else if (vim_strchr((char_u *)"uUlLeE", *src))
2140 {
2141 switch (*src++)
2142 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002143 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002144 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002145 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002146 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002147 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002148 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002149 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002150 continue;
2151 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002152 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002153 continue;
2154 }
2155 }
2156 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002157 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002158 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002159 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2160 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002161 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002162 if (copy)
2163 {
2164 *dst++ = c;
2165 *dst++ = *src++;
2166 *dst++ = *src++;
2167 }
2168 else
2169 {
2170 dst += 3;
2171 src += 2;
2172 }
2173 continue;
2174 }
2175
Bram Moolenaar071d4272004-06-13 20:20:40 +00002176 if (c == '\\' && *src != NUL)
2177 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002178 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002179 switch (*src)
2180 {
2181 case 'r': c = CAR; ++src; break;
2182 case 'n': c = NL; ++src; break;
2183 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002184 // Oh no! \e already has meaning in subst pat :-(
2185 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002186 case 'b': c = Ctrl_H; ++src; break;
2187
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002188 // If "backslash" is TRUE the backslash will be removed
2189 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002190 default: if (backslash)
2191 {
2192 if (copy)
2193 *dst = '\\';
2194 ++dst;
2195 }
2196 c = *src++;
2197 }
2198 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002199 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002200 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002201
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002202 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002203 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002204 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002205 func_one = (fptr_T)(func_one(&cc, c));
2206 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002207 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002208 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002209 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002210 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002211
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002212 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002213 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002214 int totlen = mb_ptr2len(src - 1);
2215
Bram Moolenaar071d4272004-06-13 20:20:40 +00002216 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002217 mb_char2bytes(cc, dst);
2218 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002219 if (enc_utf8)
2220 {
2221 int clen = utf_ptr2len(src - 1);
2222
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002223 // If the character length is shorter than "totlen", there
2224 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002225 if (clen < totlen)
2226 {
2227 if (copy)
2228 mch_memmove(dst + 1, src - 1 + clen,
2229 (size_t)(totlen - clen));
2230 dst += totlen - clen;
2231 }
2232 }
2233 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002234 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002235 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002236 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002237 dst++;
2238 }
2239 else
2240 {
2241 if (REG_MULTI)
2242 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002243 clnum = rex.reg_mmatch->startpos[no].lnum;
2244 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002245 s = NULL;
2246 else
2247 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002248 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2249 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2250 len = rex.reg_mmatch->endpos[no].col
2251 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002252 else
2253 len = (int)STRLEN(s);
2254 }
2255 }
2256 else
2257 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002258 s = rex.reg_match->startp[no];
2259 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002260 s = NULL;
2261 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002262 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002263 }
2264 if (s != NULL)
2265 {
2266 for (;;)
2267 {
2268 if (len == 0)
2269 {
2270 if (REG_MULTI)
2271 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002272 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002273 break;
2274 if (copy)
2275 *dst = CAR;
2276 ++dst;
2277 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002278 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2279 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002280 else
2281 len = (int)STRLEN(s);
2282 }
2283 else
2284 break;
2285 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002286 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002287 {
2288 if (copy)
Bram Moolenaare29a27f2021-07-20 21:07:36 +02002289 iemsg(_(e_damaged_match_string));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002290 goto exit;
2291 }
2292 else
2293 {
2294 if (backslash && (*s == CAR || *s == '\\'))
2295 {
2296 /*
2297 * Insert a backslash in front of a CR, otherwise
2298 * it will be replaced by a line break.
2299 * Number of backslashes will be halved later,
2300 * double them here.
2301 */
2302 if (copy)
2303 {
2304 dst[0] = '\\';
2305 dst[1] = *s;
2306 }
2307 dst += 2;
2308 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002309 else
2310 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002311 if (has_mbyte)
2312 c = mb_ptr2char(s);
2313 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002314 c = *s;
2315
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002316 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002317 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002318 func_one = (fptr_T)(func_one(&cc, c));
2319 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002320 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002321 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002322 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002323 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002324
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002325 if (has_mbyte)
2326 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002327 int l;
2328
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002329 // Copy composing characters separately, one
2330 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002331 if (enc_utf8)
2332 l = utf_ptr2len(s) - 1;
2333 else
2334 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002335
2336 s += l;
2337 len -= l;
2338 if (copy)
2339 mb_char2bytes(cc, dst);
2340 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002341 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002342 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002343 *dst = cc;
2344 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002345 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002346
Bram Moolenaar071d4272004-06-13 20:20:40 +00002347 ++s;
2348 --len;
2349 }
2350 }
2351 }
2352 no = -1;
2353 }
2354 }
2355 if (copy)
2356 *dst = NUL;
2357
2358exit:
2359 return (int)((dst - dest) + 1);
2360}
2361
2362#ifdef FEAT_EVAL
2363/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002364 * Call reg_getline() with the line numbers from the submatch. If a
2365 * substitute() was used the reg_maxline and other values have been
2366 * overwritten.
2367 */
2368 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002369reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002370{
2371 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002372 linenr_T save_first = rex.reg_firstlnum;
2373 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002374
Bram Moolenaar6100d022016-10-02 16:51:57 +02002375 rex.reg_firstlnum = rsm.sm_firstlnum;
2376 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002377
2378 s = reg_getline(lnum);
2379
Bram Moolenaar6100d022016-10-02 16:51:57 +02002380 rex.reg_firstlnum = save_first;
2381 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002382 return s;
2383}
2384
2385/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002386 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002387 * allocated memory.
2388 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2389 */
2390 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002391reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002392{
2393 char_u *retval = NULL;
2394 char_u *s;
2395 int len;
2396 int round;
2397 linenr_T lnum;
2398
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002399 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002400 return NULL;
2401
Bram Moolenaar6100d022016-10-02 16:51:57 +02002402 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002403 {
2404 /*
2405 * First round: compute the length and allocate memory.
2406 * Second round: copy the text.
2407 */
2408 for (round = 1; round <= 2; ++round)
2409 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002410 lnum = rsm.sm_mmatch->startpos[no].lnum;
2411 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002412 return NULL;
2413
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002414 s = reg_getline_submatch(lnum);
2415 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002416 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002417 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002418 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002419 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002420 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002421 len = rsm.sm_mmatch->endpos[no].col
2422 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002423 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002424 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002425 ++len;
2426 }
2427 else
2428 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002429 // Multiple lines: take start line from start col, middle
2430 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002431 len = (int)STRLEN(s);
2432 if (round == 2)
2433 {
2434 STRCPY(retval, s);
2435 retval[len] = '\n';
2436 }
2437 ++len;
2438 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002439 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002440 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002441 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002442 if (round == 2)
2443 STRCPY(retval + len, s);
2444 len += (int)STRLEN(s);
2445 if (round == 2)
2446 retval[len] = '\n';
2447 ++len;
2448 }
2449 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002450 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002451 rsm.sm_mmatch->endpos[no].col);
2452 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002453 if (round == 2)
2454 retval[len] = NUL;
2455 ++len;
2456 }
2457
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002458 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002459 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002460 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002461 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002462 return NULL;
2463 }
2464 }
2465 }
2466 else
2467 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002468 s = rsm.sm_match->startp[no];
2469 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002470 retval = NULL;
2471 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002472 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002473 }
2474
2475 return retval;
2476}
Bram Moolenaar41571762014-04-02 19:00:58 +02002477
2478/*
2479 * Used for the submatch() function with the optional non-zero argument: get
2480 * the list of strings from the n'th submatch in allocated memory with NULs
2481 * represented in NLs.
2482 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2483 * command, for a non-existing submatch and for any error.
2484 */
2485 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002486reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002487{
2488 char_u *s;
2489 linenr_T slnum;
2490 linenr_T elnum;
2491 colnr_T scol;
2492 colnr_T ecol;
2493 int i;
2494 list_T *list;
2495 int error = FALSE;
2496
2497 if (!can_f_submatch || no < 0)
2498 return NULL;
2499
Bram Moolenaar6100d022016-10-02 16:51:57 +02002500 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002501 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002502 slnum = rsm.sm_mmatch->startpos[no].lnum;
2503 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002504 if (slnum < 0 || elnum < 0)
2505 return NULL;
2506
Bram Moolenaar6100d022016-10-02 16:51:57 +02002507 scol = rsm.sm_mmatch->startpos[no].col;
2508 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002509
2510 list = list_alloc();
2511 if (list == NULL)
2512 return NULL;
2513
2514 s = reg_getline_submatch(slnum) + scol;
2515 if (slnum == elnum)
2516 {
2517 if (list_append_string(list, s, ecol - scol) == FAIL)
2518 error = TRUE;
2519 }
2520 else
2521 {
2522 if (list_append_string(list, s, -1) == FAIL)
2523 error = TRUE;
2524 for (i = 1; i < elnum - slnum; i++)
2525 {
2526 s = reg_getline_submatch(slnum + i);
2527 if (list_append_string(list, s, -1) == FAIL)
2528 error = TRUE;
2529 }
2530 s = reg_getline_submatch(elnum);
2531 if (list_append_string(list, s, ecol) == FAIL)
2532 error = TRUE;
2533 }
2534 }
2535 else
2536 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002537 s = rsm.sm_match->startp[no];
2538 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002539 return NULL;
2540 list = list_alloc();
2541 if (list == NULL)
2542 return NULL;
2543 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002544 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002545 error = TRUE;
2546 }
2547
2548 if (error)
2549 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002550 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002551 return NULL;
2552 }
Bram Moolenaar8a0dcf42020-09-06 15:14:45 +02002553 ++list->lv_refcount;
Bram Moolenaar41571762014-04-02 19:00:58 +02002554 return list;
2555}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002556#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002557
Bram Moolenaarf4140482020-02-15 23:06:45 +01002558/*
2559 * Initialize the values used for matching against multiple lines
2560 */
2561 static void
2562init_regexec_multi(
2563 regmmatch_T *rmp,
2564 win_T *win, // window in which to search or NULL
2565 buf_T *buf, // buffer in which to search
2566 linenr_T lnum) // nr of line to start looking for match
2567{
2568 rex.reg_match = NULL;
2569 rex.reg_mmatch = rmp;
2570 rex.reg_buf = buf;
2571 rex.reg_win = win;
2572 rex.reg_firstlnum = lnum;
2573 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2574 rex.reg_line_lbr = FALSE;
2575 rex.reg_ic = rmp->rmm_ic;
2576 rex.reg_icombine = FALSE;
2577 rex.reg_maxcol = rmp->rmm_maxcol;
2578}
2579
// The source of the backtracking engine is included here, making it part
// of this compilation unit.
#include "regexp_bt.c"

// Functions of the backtracking engine.  vim_regcomp() calls the "regcomp"
// member directly; "regfree", "regexec_nl" and "regexec_multi" are called
// through the "engine" member of a compiled program.
// NOTE(review): the last initializer is presumably the "expr" member that
// vim_regcomp() assigns when DEBUG is defined - confirm against regengine_T.
static regengine_T bt_regengine =
{
    bt_regcomp,
    bt_regfree,
    bt_regexec_nl,
    bt_regexec_multi,
    (char_u *)""
};

// The source of the NFA engine is included here as well.
#include "regexp_nfa.c"

// Functions of the NFA engine, same layout as bt_regengine.
static regengine_T nfa_regengine =
{
    nfa_regcomp,
    nfa_regfree,
    nfa_regexec_nl,
    nfa_regexec_multi,
    (char_u *)""
};
2601
// Which regexp engine to use? Needed for vim_regcomp().
// Must match with 'regexpengine'.
// vim_regcomp() sets this from p_re or from a "\%#=" prefix of the pattern.
static int regexp_engine = 0;

#ifdef DEBUG
// Engine names for the debug message in vim_regcomp(), indexed by the engine
// number.
static char_u regname[][30] = {
    "AUTOMATIC Regexp Engine",
    "BACKTRACKING Regexp Engine",
    "NFA Regexp Engine"
};
#endif
2613
2614/*
2615 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002616 * Returns the program in allocated memory.
2617 * Use vim_regfree() to free the memory.
2618 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002619 */
2620 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002621vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002622{
2623 regprog_T *prog = NULL;
2624 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002625 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002626
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002627 regexp_engine = p_re;
2628
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002629 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002630 if (STRNCMP(expr, "\\%#=", 4) == 0)
2631 {
2632 int newengine = expr[4] - '0';
2633
2634 if (newengine == AUTOMATIC_ENGINE
2635 || newengine == BACKTRACKING_ENGINE
2636 || newengine == NFA_ENGINE)
2637 {
2638 regexp_engine = expr[4] - '0';
2639 expr += 5;
2640#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002641 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002642 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002643#endif
2644 }
2645 else
2646 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002647 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002648 regexp_engine = AUTOMATIC_ENGINE;
2649 }
2650 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002651#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002652 bt_regengine.expr = expr;
2653 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002654#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002655 // reg_iswordc() uses rex.reg_buf
2656 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002657
2658 /*
2659 * First try the NFA engine, unless backtracking was requested.
2660 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002661 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002662 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002663 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002664 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002665 else
2666 prog = bt_regengine.regcomp(expr, re_flags);
2667
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002668 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002669 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002670 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002671#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002672 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002673 {
2674 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002675 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002676 if (f)
2677 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002678 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002679 fclose(f);
2680 }
2681 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002682 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002683 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002684 }
2685#endif
2686 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002687 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002688 * The NFA engine also fails for patterns that it can't handle well
2689 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002690 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002691 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002692 if (regexp_engine == AUTOMATIC_ENGINE
2693 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002694 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002695 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002696#ifdef FEAT_EVAL
2697 report_re_switch(expr);
2698#endif
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002699 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002700 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002701 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002702
Bram Moolenaarfda37292014-11-05 14:27:36 +01002703 if (prog != NULL)
2704 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002705 // Store the info needed to call regcomp() again when the engine turns
2706 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002707 prog->re_engine = regexp_engine;
2708 prog->re_flags = re_flags;
2709 }
2710
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002711 return prog;
2712}
2713
2714/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002715 * Free a compiled regexp program, returned by vim_regcomp().
2716 */
2717 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002718vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002719{
2720 if (prog != NULL)
2721 prog->engine->regfree(prog);
2722}
2723
#if defined(EXITFREE) || defined(PROTO)
/*
 * Release the memory that the regexp code keeps in file-level variables.
 */
    void
free_regexp_stuff(void)
{
    // allocated strings
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);

    // growarrays
    ga_clear(&regstack);
    ga_clear(&backpos);
}
#endif
2734
#ifdef FEAT_EVAL
/*
 * When 'verbose' is above zero: give a message that the backtracking engine
 * is going to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
        return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2748
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
        || defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog": non-zero while the program is being
 * executed.
 */
    int
regprog_in_use(regprog_T *prog)
{
    int busy = prog->re_in_use;

    return busy;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002760
2761/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002762 * Match a regexp against a string.
2763 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002764 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002765 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002766 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002767 *
2768 * Return TRUE if there is a match, FALSE if not.
2769 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002770 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002771vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002772 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002773 char_u *line, // string to match against
2774 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002775 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002776{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002777 int result;
2778 regexec_T rex_save;
2779 int rex_in_use_save = rex_in_use;
2780
Bram Moolenaar0270f382018-07-17 05:43:58 +02002781 // Cannot use the same prog recursively, it contains state.
2782 if (rmp->regprog->re_in_use)
2783 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002784 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002785 return FALSE;
2786 }
2787 rmp->regprog->re_in_use = TRUE;
2788
Bram Moolenaar6100d022016-10-02 16:51:57 +02002789 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002790 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002791 rex_save = rex;
2792 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002793
Bram Moolenaar6100d022016-10-02 16:51:57 +02002794 rex.reg_startp = NULL;
2795 rex.reg_endp = NULL;
2796 rex.reg_startpos = NULL;
2797 rex.reg_endpos = NULL;
2798
2799 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002800 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002801
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002802 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002803 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2804 && result == NFA_TOO_EXPENSIVE)
2805 {
2806 int save_p_re = p_re;
2807 int re_flags = rmp->regprog->re_flags;
2808 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2809
2810 p_re = BACKTRACKING_ENGINE;
2811 vim_regfree(rmp->regprog);
2812 if (pat != NULL)
2813 {
2814#ifdef FEAT_EVAL
2815 report_re_switch(pat);
2816#endif
2817 rmp->regprog = vim_regcomp(pat, re_flags);
2818 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002819 {
2820 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002821 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002822 rmp->regprog->re_in_use = FALSE;
2823 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002824 vim_free(pat);
2825 }
2826
2827 p_re = save_p_re;
2828 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002829
2830 rex_in_use = rex_in_use_save;
2831 if (rex_in_use)
2832 rex = rex_save;
2833
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002834 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002835}
2836
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002837/*
2838 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002839 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002840 */
2841 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002842vim_regexec_prog(
2843 regprog_T **prog,
2844 int ignore_case,
2845 char_u *line,
2846 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002847{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002848 int r;
2849 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002850
2851 regmatch.regprog = *prog;
2852 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002853 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002854 *prog = regmatch.regprog;
2855 return r;
2856}
2857
2858/*
2859 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002860 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002861 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002862 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002863vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002864{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002865 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002866}
2867
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002868/*
2869 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002870 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002871 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002872 */
2873 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002874vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002875{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002876 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002877}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002878
2879/*
2880 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002881 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2882 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002883 * Uses curbuf for line count and 'iskeyword'.
2884 *
2885 * Return zero if there is no match. Return number of lines contained in the
2886 * match otherwise.
2887 */
2888 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002889vim_regexec_multi(
2890 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002891 win_T *win, // window in which to search or NULL
2892 buf_T *buf, // buffer in which to search
2893 linenr_T lnum, // nr of line to start looking for match
2894 colnr_T col, // column to start looking for match
2895 proftime_T *tm, // timeout limit or NULL
2896 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002897{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002898 int result;
2899 regexec_T rex_save;
2900 int rex_in_use_save = rex_in_use;
2901
Bram Moolenaar0270f382018-07-17 05:43:58 +02002902 // Cannot use the same prog recursively, it contains state.
2903 if (rmp->regprog->re_in_use)
2904 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002905 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002906 return FALSE;
2907 }
2908 rmp->regprog->re_in_use = TRUE;
2909
Bram Moolenaar6100d022016-10-02 16:51:57 +02002910 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002911 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002912 rex_save = rex;
2913 rex_in_use = TRUE;
2914
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002915 result = rmp->regprog->engine->regexec_multi(
2916 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002917 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002918
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002919 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002920 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2921 && result == NFA_TOO_EXPENSIVE)
2922 {
2923 int save_p_re = p_re;
2924 int re_flags = rmp->regprog->re_flags;
2925 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2926
2927 p_re = BACKTRACKING_ENGINE;
2928 vim_regfree(rmp->regprog);
2929 if (pat != NULL)
2930 {
2931#ifdef FEAT_EVAL
2932 report_re_switch(pat);
2933#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002934#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002935 // checking for \z misuse was already done when compiling for NFA,
2936 // allow all here
2937 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002938#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002939 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002940#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002941 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002942#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002943
Bram Moolenaarfda37292014-11-05 14:27:36 +01002944 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002945 {
2946 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002947 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002948 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002949 rmp->regprog->re_in_use = FALSE;
2950 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002951 vim_free(pat);
2952 }
2953 p_re = save_p_re;
2954 }
2955
Bram Moolenaar6100d022016-10-02 16:51:57 +02002956 rex_in_use = rex_in_use_save;
2957 if (rex_in_use)
2958 rex = rex_save;
2959
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002960 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002961}