/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
 */
5
// Regexp debugging is off by default: no debugging logs or files related to
// regular expressions are created, even when compiling with -DDEBUG.
// To get the regexp debugging, uncomment the "#define DEBUG" line below.
#undef DEBUG
// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
#ifdef DEBUG
// Show/save debugging data when the BT engine is used.
# define BT_REGEXP_DUMP
// Save the debugging data to a file instead of displaying it.
# define BT_REGEXP_LOG
# define BT_REGEXP_DEBUG_LOG
# define BT_REGEXP_DEBUG_LOG_NAME	"bt_regexp_debug.log"
#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning, they do not match literally.
 * A Magic character is stored as a negative number (the character value
 * minus 256), which keeps it apart from literal characters (possibly
 * multi-byte).  Only ASCII characters can be Magic.
 */
#define Magic(x)	((int)(x) - 256)
#define un_Magic(x)	((x) + 256)
#define is_Magic(x)	((x) < 0)

/*
 * Return "x" with its magicness removed: a Magic character is turned back
 * into the plain character, anything else is returned unchanged.
 */
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}

/*
 * Return "x" with its magicness flipped: a Magic character becomes the plain
 * character and a plain character becomes Magic.
 */
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47
/*
 * Magic number stored in the first byte of the BT regexp internal "program";
 * the start node begins in the second byte.  It is used to catch the most
 * severe mutilation of the program by the caller.
 */
#define REGMAGIC	0234

/*
 * Utility definitions.
 */

// Read the byte that "p" points to as an unsigned value, widened to int.
#define UCHARAT(p)	((int)*(char_u *)(p))
60
// Helpers for reporting an error (down from) vim_regcomp().  Each macro
// expands to a complete "return" statement that gives the error message, sets
// rc_did_emsg to TRUE and returns NULL (the _NULL variants) or FAIL (the
// _FAIL variants).
// In the EMSG2/EMSG3 variants "m" is a format whose first "%s" receives ""
// when "c" is non-zero and "\" otherwise; EMSG3 passes "a" as an extra
// argument.
#define EMSG_RET_NULL(m) \
	return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) \
	return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) \
	return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) \
	return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) \
	return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) \
	return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
#define EMSG_ONE_RET_NULL \
	EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
Bram Moolenaar071d4272004-06-13 20:20:40 +000070
Bram Moolenaar95f09602016-11-10 20:01:45 +010071
Bram Moolenaar071d4272004-06-13 20:20:40 +000072#define MAX_LIMIT (32767L << 16L)
73
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020074static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
Bram Moolenaar966e58e2017-06-05 16:54:08 +020075static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76static char_u e_large_class[] = N_("E945: Range too large in character class");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020077static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
78static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
79static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020080#ifdef FEAT_SYN_HL
Bram Moolenaar5de820b2013-06-02 15:01:57 +020081static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
Bram Moolenaarbcf94422018-06-23 14:21:42 +020082static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020083#endif
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +020084static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
Bram Moolenaar2976c022013-06-05 21:30:37 +020085static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
Bram Moolenaar0270f382018-07-17 05:43:58 +020086static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
87
// return values for re_multi_type()
#define NOT_MULTI	0	// not a "multi" operator
#define MULTI_ONE	1	// single "multi" operator
#define MULTI_MULT	2	// multi "multi" operator

// return values for regmatch()
#define RA_FAIL		1	// something failed, abort
#define RA_CONT		2	// continue in inner loop
#define RA_BREAK	3	// break inner loop
#define RA_MATCH	4	// successful match
#define RA_NOMATCH	5	// didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020098
Bram Moolenaar071d4272004-06-13 20:20:40 +000099/*
100 * Return NOT_MULTI if c is not a "multi" operator.
101 * Return MULTI_ONE if c is a single "multi" operator.
102 * Return MULTI_MULT if c is a multi "multi" operator.
103 */
104 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100105re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000106{
107 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
108 return MULTI_ONE;
109 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
110 return MULTI_MULT;
111 return NOT_MULTI;
112}
113
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000114static char_u *reg_prev_sub = NULL;
115
Bram Moolenaar071d4272004-06-13 20:20:40 +0000116/*
117 * REGEXP_INRANGE contains all characters which are always special in a []
118 * range after '\'.
119 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
120 * These are:
121 * \n - New line (NL).
122 * \r - Carriage Return (CR).
123 * \t - Tab (TAB).
124 * \e - Escape (ESC).
125 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000126 * \d - Character code in decimal, eg \d123
127 * \o - Character code in octal, eg \o80
128 * \x - Character code in hex, eg \x4a
129 * \u - Multibyte character code, eg \u20ac
130 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000131 */
132static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000133static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000134
Bram Moolenaar071d4272004-06-13 20:20:40 +0000135/*
136 * Translate '\x' to its control character, except "\n", which is Magic.
137 */
138 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100139backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000140{
141 switch (c)
142 {
143 case 'r': return CAR;
144 case 't': return TAB;
145 case 'e': return ESC;
146 case 'b': return BS;
147 }
148 return c;
149}
150
151/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000152 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000153 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
154 * recognized. Otherwise "pp" is advanced to after the item.
155 */
156 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100157get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000158{
159 static const char *(class_names[]) =
160 {
161 "alnum:]",
162#define CLASS_ALNUM 0
163 "alpha:]",
164#define CLASS_ALPHA 1
165 "blank:]",
166#define CLASS_BLANK 2
167 "cntrl:]",
168#define CLASS_CNTRL 3
169 "digit:]",
170#define CLASS_DIGIT 4
171 "graph:]",
172#define CLASS_GRAPH 5
173 "lower:]",
174#define CLASS_LOWER 6
175 "print:]",
176#define CLASS_PRINT 7
177 "punct:]",
178#define CLASS_PUNCT 8
179 "space:]",
180#define CLASS_SPACE 9
181 "upper:]",
182#define CLASS_UPPER 10
183 "xdigit:]",
184#define CLASS_XDIGIT 11
185 "tab:]",
186#define CLASS_TAB 12
187 "return:]",
188#define CLASS_RETURN 13
189 "backspace:]",
190#define CLASS_BACKSPACE 14
191 "escape:]",
192#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100193 "ident:]",
194#define CLASS_IDENT 16
195 "keyword:]",
196#define CLASS_KEYWORD 17
197 "fname:]",
198#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000199 };
200#define CLASS_NONE 99
201 int i;
202
203 if ((*pp)[1] == ':')
204 {
K.Takataeeec2542021-06-02 13:28:16 +0200205 for (i = 0; i < (int)ARRAY_LENGTH(class_names); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000206 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
207 {
208 *pp += STRLEN(class_names[i]) + 2;
209 return i;
210 }
211 }
212 return CLASS_NONE;
213}
214
/*
 * Specific version of character class functions.
 * A table with one entry per byte value is used to keep this fast; each
 * entry is a combination of the RI_ flags below.
 */
static short	class_tab[256];

#define RI_DIGIT	0x001
#define RI_HEX		0x002
#define RI_OCTAL	0x004
#define RI_WORD		0x008
#define RI_HEAD		0x010
#define RI_ALPHA	0x020
#define RI_LOWER	0x040
#define RI_UPPER	0x080
#define RI_WHITE	0x100
230
231 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100232init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000233{
234 int i;
235 static int done = FALSE;
236
237 if (done)
238 return;
239
240 for (i = 0; i < 256; ++i)
241 {
242 if (i >= '0' && i <= '7')
243 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
244 else if (i >= '8' && i <= '9')
245 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
246 else if (i >= 'a' && i <= 'f')
247 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
248#ifdef EBCDIC
249 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
250 || (i >= 's' && i <= 'z'))
251#else
252 else if (i >= 'g' && i <= 'z')
253#endif
254 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
255 else if (i >= 'A' && i <= 'F')
256 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
257#ifdef EBCDIC
258 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
259 || (i >= 'S' && i <= 'Z'))
260#else
261 else if (i >= 'G' && i <= 'Z')
262#endif
263 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
264 else if (i == '_')
265 class_tab[i] = RI_WORD + RI_HEAD;
266 else
267 class_tab[i] = 0;
268 }
269 class_tab[' '] |= RI_WHITE;
270 class_tab['\t'] |= RI_WHITE;
271 done = TRUE;
272}
273
// Character class tests using class_tab[]; init_class_tab() must have been
// called before they are used.
// Each macro is non-zero when "c" is in the range 0 - 255 and has the RI_
// flag set.  Values outside that range, including negative values such as
// Magic characters, never index the table and give zero.
// "c" is put in parentheses so that an expression argument such as
// "x & 0xff" is tested as a whole.  Note that "c" is evaluated more than
// once: do not pass an argument with side effects.
#define ri_digit(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
#define ri_hex(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEX))
#define ri_octal(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
#define ri_word(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WORD))
#define ri_head(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEAD))
#define ri_alpha(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
#define ri_lower(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_LOWER))
#define ri_upper(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_UPPER))
#define ri_white(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000283
// Flags for regflags; each one is a separate bit.
#define RF_ICASE	1	// ignore case
#define RF_NOICASE	2	// don't ignore case
#define RF_HASNL	4	// can match a NL
#define RF_ICOMBINE	8	// ignore combining characters
#define RF_LOOKBH	16	// uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000290
291/*
292 * Global work variables for vim_regcomp().
293 */
294
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295static char_u *regparse; // Input-scan pointer.
296static int regnpar; // () count.
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100297static int wants_nfa; // regex should use NFA engine
Bram Moolenaar071d4272004-06-13 20:20:40 +0000298#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100299static int regnzpar; // \z() count.
300static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000301#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000303#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100304static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000305#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000306
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100307static magic_T reg_magic; // magicness of the pattern
Bram Moolenaar071d4272004-06-13 20:20:40 +0000308
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100309static int reg_string; // matching with a string instead of a buffer
310 // line
311static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000312
313/*
314 * META contains all characters that may be magic, except '^' and '$'.
315 */
316
317#ifdef EBCDIC
318static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
319#else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100320// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000321static char_u META_flags[] = {
322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100324// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000325 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100326// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000327 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100328// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000329 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100330// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000331 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100332// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000333 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100334// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000335 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
336};
337#endif
338
static int	curchr;		// currently parsed character
// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, eg in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int	prevchr;
static int	prevprevchr;	// previous-previous character
static int	nextchr;	// used for ungetchr()

// arguments for reg()
#define REG_NOPAREN	0	// toplevel reg()
#define REG_PAREN	1	// \(\)
#define REG_ZPAREN	2	// \z(\)
#define REG_NPAREN	3	// \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000352
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200353typedef struct
354{
355 char_u *regparse;
356 int prevchr_len;
357 int curchr;
358 int prevchr;
359 int prevprevchr;
360 int nextchr;
361 int at_start;
362 int prev_at_start;
363 int regnpar;
364} parse_state_T;
365
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100366static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100367static int getchr(void);
368static void skipchr_keepstart(void);
369static int peekchr(void);
370static void skipchr(void);
371static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100372static long gethexchrs(int maxinputlen);
373static long getoctchrs(void);
374static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100375static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100376static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200377static int cstrncmp(char_u *s1, char_u *s2, int *n);
378static char_u *cstrchr(char_u *, int);
379static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100380static int reg_iswordc(int);
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100381#ifdef FEAT_EVAL
382static void report_re_switch(char_u *pat);
383#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200385static regengine_T bt_regengine;
386static regengine_T nfa_regengine;
387
Bram Moolenaar071d4272004-06-13 20:20:40 +0000388/*
389 * Return TRUE if compiled regular expression "prog" can match a line break.
390 */
391 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100392re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000393{
394 return (prog->regflags & RF_HASNL);
395}
396
397/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000398 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
399 * Returns a character representing the class. Zero means that no item was
400 * recognized. Otherwise "pp" is advanced to after the item.
401 */
402 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100403get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000404{
405 int c;
406 int l = 1;
407 char_u *p = *pp;
408
Bram Moolenaar985079c2019-02-16 17:07:47 +0100409 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000410 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000411 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000412 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000413 if (p[l + 2] == '=' && p[l + 3] == ']')
414 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000415 if (has_mbyte)
416 c = mb_ptr2char(p + 2);
417 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000418 c = p[2];
419 *pp += l + 4;
420 return c;
421 }
422 }
423 return 0;
424}
425
#ifdef EBCDIC
/*
 * Table for equivalence class "c". (IBM-1047)
 * Each entry starts with an ASCII letter, followed by the byte values that
 * are grouped with it.
 */
static char *EQUIVAL_CLASS_C[16] = {
    "A\x62\x63\x64\x65\x66\x67",
    "C\x68",
    "E\x71\x72\x73\x74",
    "I\x75\x76\x77\x78",
    "N\x69",
    "O\xEB\xEC\xED\xEE\xEF\x80",
    "U\xFB\xFC\xFD\xFE",
    "Y\xBA",
    "a\x42\x43\x44\x45\x46\x47",
    "c\x48",
    "e\x51\x52\x53\x54",
    "i\x55\x56\x57\x58",
    "n\x49",
    "o\xCB\xCC\xCD\xCE\xCF\x70",
    "u\xDB\xDC\xDD\xDE",
    "y\x8D\xDF",
};
#endif
449
Bram Moolenaardf177f62005-02-22 08:39:57 +0000450/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000451 * Check for a collating element "[.a.]". "pp" points to the '['.
452 * Returns a character. Zero means that no item was recognized. Otherwise
453 * "pp" is advanced to after the item.
454 * Currently only single characters are recognized!
455 */
456 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100457get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000458{
459 int c;
460 int l = 1;
461 char_u *p = *pp;
462
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100463 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000464 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000465 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000466 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000467 if (p[l + 2] == '.' && p[l + 3] == ']')
468 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000469 if (has_mbyte)
470 c = mb_ptr2char(p + 2);
471 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000472 c = p[2];
473 *pp += l + 4;
474 return c;
475 }
476 }
477 return 0;
478}
479
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100480static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
481static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200482
483 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100484get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200485{
486 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
487 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
488}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000489
490/*
491 * Skip over a "[]" range.
492 * "p" must point to the character after the '['.
493 * The returned pointer is on the matching ']', or the terminating NUL.
494 */
495 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100496skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000497{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000498 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000499
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100500 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000501 ++p;
502 if (*p == ']' || *p == '-')
503 ++p;
504 while (*p != NUL && *p != ']')
505 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000506 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000507 p += l;
508 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000509 if (*p == '-')
510 {
511 ++p;
512 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100513 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000514 }
515 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200516 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000517 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200518 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000519 p += 2;
520 else if (*p == '[')
521 {
522 if (get_char_class(&p) == CLASS_NONE
523 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200524 && get_coll_element(&p) == 0
525 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100526 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000527 }
528 else
529 ++p;
530 }
531
532 return p;
533}
534
535/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000536 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200537 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000538 * Take care of characters with a backslash in front of it.
539 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000540 */
541 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100542skip_regexp(
543 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200544 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200545 int magic)
546{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100547 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200548}
549
550/*
551 * Call skip_regexp() and when the delimiter does not match give an error and
552 * return NULL.
553 */
554 char_u *
555skip_regexp_err(
556 char_u *startp,
557 int delim,
558 int magic)
559{
560 char_u *p = skip_regexp(startp, delim, magic);
561
562 if (*p != delim)
563 {
564 semsg(_("E654: missing delimiter after search pattern: %s"), startp);
565 return NULL;
566 }
567 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200568}
569
570/*
571 * skip_regexp() with extra arguments:
572 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
573 * expression and change "\?" to "?". If "*newp" is not NULL the expression
574 * is changed in-place.
575 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100576 * If "magic_val" is not NULL, returns the effective magicness of the pattern
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200577 */
578 char_u *
579skip_regexp_ex(
580 char_u *startp,
581 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100582 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200583 char_u **newp,
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100584 int *dropped,
585 magic_T *magic_val)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000586{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100587 magic_T mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000588 char_u *p = startp;
589
590 if (magic)
591 mymagic = MAGIC_ON;
592 else
593 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200594 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000595
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100596 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000597 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100598 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000599 break;
600 if ((p[0] == '[' && mymagic >= MAGIC_ON)
601 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
602 {
603 p = skip_anyof(p + 1);
604 if (p[0] == NUL)
605 break;
606 }
607 else if (p[0] == '\\' && p[1] != NUL)
608 {
609 if (dirc == '?' && newp != NULL && p[1] == '?')
610 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100611 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000612 if (*newp == NULL)
613 {
614 *newp = vim_strsave(startp);
615 if (*newp != NULL)
616 p = *newp + (p - startp);
617 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200618 if (dropped != NULL)
619 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000620 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000621 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000622 else
623 ++p;
624 }
625 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100626 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000627 if (*p == 'v')
628 mymagic = MAGIC_ALL;
629 else if (*p == 'V')
630 mymagic = MAGIC_NONE;
631 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000632 }
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100633 if (magic_val != NULL)
634 *magic_val = mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000635 return p;
636}
637
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +0200638/*
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200639 * Functions for getting characters from the regexp input.
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +0200640 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100641static int prevchr_len; // byte length of previous char
Bram Moolenaar0270f382018-07-17 05:43:58 +0200642static int at_start; // True when on the first character
643static int prev_at_start; // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100644
Bram Moolenaar071d4272004-06-13 20:20:40 +0000645/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200646 * Start parsing at "str".
647 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000648 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100649initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000650{
651 regparse = str;
652 prevchr_len = 0;
653 curchr = prevprevchr = prevchr = nextchr = -1;
654 at_start = TRUE;
655 prev_at_start = FALSE;
656}
657
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200658/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200659 * Save the current parse state, so that it can be restored and parsing
660 * starts in the same state again.
661 */
662 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100663save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200664{
665 ps->regparse = regparse;
666 ps->prevchr_len = prevchr_len;
667 ps->curchr = curchr;
668 ps->prevchr = prevchr;
669 ps->prevprevchr = prevprevchr;
670 ps->nextchr = nextchr;
671 ps->at_start = at_start;
672 ps->prev_at_start = prev_at_start;
673 ps->regnpar = regnpar;
674}
675
676/*
677 * Restore a previously saved parse state.
678 */
679 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100680restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200681{
682 regparse = ps->regparse;
683 prevchr_len = ps->prevchr_len;
684 curchr = ps->curchr;
685 prevchr = ps->prevchr;
686 prevprevchr = ps->prevprevchr;
687 nextchr = ps->nextchr;
688 at_start = ps->at_start;
689 prev_at_start = ps->prev_at_start;
690 regnpar = ps->regnpar;
691}
692
693
694/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695 * Get the next character without advancing.
696 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000697 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100698peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000699{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000700 static int after_slash = FALSE;
701
Bram Moolenaar071d4272004-06-13 20:20:40 +0000702 if (curchr == -1)
703 {
704 switch (curchr = regparse[0])
705 {
706 case '.':
707 case '[':
708 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100709 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000710 if (reg_magic >= MAGIC_ON)
711 curchr = Magic(curchr);
712 break;
713 case '(':
714 case ')':
715 case '{':
716 case '%':
717 case '+':
718 case '=':
719 case '?':
720 case '@':
721 case '!':
722 case '&':
723 case '|':
724 case '<':
725 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100726 case '#': // future ext.
727 case '"': // future ext.
728 case '\'': // future ext.
729 case ',': // future ext.
730 case '-': // future ext.
731 case ':': // future ext.
732 case ';': // future ext.
733 case '`': // future ext.
734 case '/': // Can't be used in / command
735 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000736 if (reg_magic == MAGIC_ALL)
737 curchr = Magic(curchr);
738 break;
739 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100740 // * is not magic as the very first character, eg "?*ptr", when
741 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
742 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000743 if (reg_magic >= MAGIC_ON
744 && !at_start
745 && !(prev_at_start && prevchr == Magic('^'))
746 && (after_slash
747 || (prevchr != Magic('(')
748 && prevchr != Magic('&')
749 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000750 curchr = Magic('*');
751 break;
752 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100753 // '^' is only magic as the very first character and if it's after
754 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000755 if (reg_magic >= MAGIC_OFF
756 && (at_start
757 || reg_magic == MAGIC_ALL
758 || prevchr == Magic('(')
759 || prevchr == Magic('|')
760 || prevchr == Magic('&')
761 || prevchr == Magic('n')
762 || (no_Magic(prevchr) == '('
763 && prevprevchr == Magic('%'))))
764 {
765 curchr = Magic('^');
766 at_start = TRUE;
767 prev_at_start = FALSE;
768 }
769 break;
770 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100771 // '$' is only magic as the very last char and if it's in front of
772 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000773 if (reg_magic >= MAGIC_OFF)
774 {
775 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200776 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000777
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100778 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000779 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200780 || p[1] == 'm' || p[1] == 'M'
781 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
782 {
783 if (p[1] == 'v')
784 is_magic_all = TRUE;
785 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
786 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000787 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200788 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000789 if (p[0] == NUL
790 || (p[0] == '\\'
791 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
792 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200793 || (is_magic_all
794 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000795 || reg_magic == MAGIC_ALL)
796 curchr = Magic('$');
797 }
798 break;
799 case '\\':
800 {
801 int c = regparse[1];
802
803 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100804 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000805 else if (
806#ifdef EBCDIC
807 vim_strchr(META, c)
808#else
809 c <= '~' && META_flags[c]
810#endif
811 )
812 {
813 /*
814 * META contains everything that may be magic sometimes,
815 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200816 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000817 * magicness. Therefore, \ is so meta-magic that it is
818 * not in META.
819 */
820 curchr = -1;
821 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100822 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000823 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000824 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000825 peekchr();
826 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000827 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000828 curchr = toggle_Magic(curchr);
829 }
830 else if (vim_strchr(REGEXP_ABBR, c))
831 {
832 /*
833 * Handle abbreviations, like "\t" for TAB -- webb
834 */
835 curchr = backslash_trans(c);
836 }
837 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
838 curchr = toggle_Magic(c);
839 else
840 {
841 /*
842 * Next character can never be (made) magic?
843 * Then backslashing it won't do anything.
844 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000845 if (has_mbyte)
846 curchr = (*mb_ptr2char)(regparse + 1);
847 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000848 curchr = c;
849 }
850 break;
851 }
852
Bram Moolenaar071d4272004-06-13 20:20:40 +0000853 default:
854 if (has_mbyte)
855 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000856 }
857 }
858
859 return curchr;
860}
861
862/*
863 * Eat one lexed character. Do this in a way that we can undo it.
864 */
865 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100866skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000867{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100868 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000869 if (*regparse == '\\')
870 prevchr_len = 1;
871 else
872 prevchr_len = 0;
873 if (regparse[prevchr_len] != NUL)
874 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000875 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100876 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000877 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000878 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000879 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000880 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000881 ++prevchr_len;
882 }
883 regparse += prevchr_len;
884 prev_at_start = at_start;
885 at_start = FALSE;
886 prevprevchr = prevchr;
887 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100888 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000889 nextchr = -1;
890}
891
892/*
893 * Skip a character while keeping the value of prev_at_start for at_start.
894 * prevchr and prevprevchr are also kept.
895 */
896 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100897skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000898{
899 int as = prev_at_start;
900 int pr = prevchr;
901 int prpr = prevprevchr;
902
903 skipchr();
904 at_start = as;
905 prevchr = pr;
906 prevprevchr = prpr;
907}
908
/*
 * Return the next lexed character of the pattern and step over it.
 * This is peekchr() followed by skipchr(); the lexer is needed because of
 * magic-ness and the like.
 */
    static int
getchr(void)
{
    int     c;

    c = peekchr();
    skipchr();
    return c;
}
921
922/*
923 * put character back. Works only once!
924 */
925 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100926ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000927{
928 nextchr = curchr;
929 curchr = prevchr;
930 prevchr = prevprevchr;
931 at_start = prev_at_start;
932 prev_at_start = FALSE;
933
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100934 // Backup regparse, so that it's at the same position as before the
935 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000936 regparse -= prevchr_len;
937}
938
939/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000940 * Get and return the value of the hex string at the current position.
941 * Return -1 if there is no valid hex number.
942 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000943 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000944 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000945 * The parameter controls the maximum number of input characters. This will be
946 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
947 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100948 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100949gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000950{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100951 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000952 int c;
953 int i;
954
955 for (i = 0; i < maxinputlen; ++i)
956 {
957 c = regparse[0];
958 if (!vim_isxdigit(c))
959 break;
960 nr <<= 4;
961 nr |= hex2nr(c);
962 ++regparse;
963 }
964
965 if (i == 0)
966 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100967 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000968}
969
970/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200971 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000972 * current position. Return -1 for invalid. Consumes all digits.
973 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100974 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100975getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000976{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100977 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000978 int c;
979 int i;
980
981 for (i = 0; ; ++i)
982 {
983 c = regparse[0];
984 if (c < '0' || c > '9')
985 break;
986 nr *= 10;
987 nr += c - '0';
988 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100989 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000990 }
991
992 if (i == 0)
993 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100994 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000995}
996
997/*
998 * get and return the value of the octal string immediately after the current
999 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
1000 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
1001 * treat 8 or 9 as recognised characters. Position is updated:
1002 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +00001003 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001004 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001005 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01001006getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001007{
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001008 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001009 int c;
1010 int i;
1011
1012 for (i = 0; i < 3 && nr < 040; ++i)
1013 {
1014 c = regparse[0];
1015 if (c < '0' || c > '7')
1016 break;
1017 nr <<= 3;
1018 nr |= hex2nr(c);
1019 ++regparse;
1020 }
1021
1022 if (i == 0)
1023 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001024 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001025}
1026
1027/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001028 * read_limits - Read two integers to be taken as a minimum and maximum.
1029 * If the first character is '-', then the range is reversed.
1030 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1031 * missing, a very big number is the default.
1032 */
1033 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001034read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001035{
1036 int reverse = FALSE;
1037 char_u *first_char;
1038 long tmp;
1039
1040 if (*regparse == '-')
1041 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001042 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001043 regparse++;
1044 reverse = TRUE;
1045 }
1046 first_char = regparse;
1047 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001048 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001049 {
1050 if (vim_isdigit(*++regparse))
1051 *maxval = getdigits(&regparse);
1052 else
1053 *maxval = MAX_LIMIT;
1054 }
1055 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001056 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001057 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001058 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001059 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001060 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001061 if (*regparse != '}')
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001062 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1063 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001064
1065 /*
1066 * Reverse the range if there was a '-', or make sure it is in the right
1067 * order otherwise.
1068 */
1069 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1070 {
1071 tmp = *minval;
1072 *minval = *maxval;
1073 *maxval = tmp;
1074 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001075 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001076 return OK;
1077}
1078
1079/*
1080 * vim_regexec and friends
1081 */
1082
1083/*
1084 * Global work variables for vim_regexec().
1085 */
1086
// Forward declarations of helpers that are defined further down in this file.
static void cleanup_subexpr(void);
#ifdef FEAT_SYN_HL
static void cleanup_zsubexpr(void);
#endif
static void reg_nextline(void);
static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001093
/*
 * Sometimes need to save a copy of a line.  Since alloc()/free() is very
 * slow, we keep one allocated piece of memory and only re-allocate it when
 * it's too small.  It's freed in bt_regexec_both() when finished.
 */
static char_u *reg_tofree = NULL;   // the buffer holding the copy, or NULL
static unsigned reg_tofreelen;      // number of bytes allocated for
                                    // "reg_tofree"
1101
/*
 * Structure used to store the execution state of the regex engine.
 * Which ones are set depends on whether a single-line or multi-line match is
 * done:
 *                      single-line             multi-line
 * reg_match            &regmatch_T             NULL
 * reg_mmatch           NULL                    &regmmatch_T
 * reg_startp           reg_match->startp       <invalid>
 * reg_endp             reg_match->endp         <invalid>
 * reg_startpos         <invalid>               reg_mmatch->startpos
 * reg_endpos           <invalid>               reg_mmatch->endpos
 * reg_win              NULL                    window in which to search
 * reg_buf              curbuf                  buffer in which to search
 * reg_firstlnum        <invalid>               first line in which to search
 * reg_maxline          0                       last line nr
 * reg_line_lbr         FALSE or TRUE           FALSE
 */
typedef struct {
    regmatch_T  *reg_match;
    regmmatch_T *reg_mmatch;
    char_u      **reg_startp;
    char_u      **reg_endp;
    lpos_T      *reg_startpos;
    lpos_T      *reg_endpos;
    win_T       *reg_win;
    buf_T       *reg_buf;
    linenr_T    reg_firstlnum;
    linenr_T    reg_maxline;
    int         reg_line_lbr;       // "\n" in string is line break

    // The current match-position is stored in these variables:
    linenr_T    lnum;               // line number, relative to first line
    char_u      *line;              // start of current line
    char_u      *input;             // current input, points into "line"

    int         need_clear_subexpr; // subexpressions still need to be cleared
#ifdef FEAT_SYN_HL
    int         need_clear_zsubexpr; // extmatch subexpressions still need to
                                     // be cleared
#endif

    // Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
    // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
    // contains '\c' or '\C' the value is overruled.
    int         reg_ic;

    // Similar to "reg_ic", but only for 'combining' characters.  Set with \Z
    // flag in the regexp.  Defaults to false, always.
    int         reg_icombine;

    // Copy of "rmm_maxcol": maximum column to search for a match.  Zero when
    // there is no maximum.
    colnr_T     reg_maxcol;

    // State for the NFA engine regexec.
    int         nfa_has_zend;       // NFA regexp \ze operator encountered.
    int         nfa_has_backref;    // NFA regexp \1 .. \9 encountered.
    int         nfa_nsubexpr;       // Number of sub expressions actually being
                                    // used during execution.  1 if only the
                                    // whole match (subexpr 0) is used.
    // listid is global, so that it increases on recursive calls to
    // nfa_regmatch(), which means we don't have to clear the lastlist field of
    // all the states.
    int         nfa_listid;
    int         nfa_alt_listid;

#ifdef FEAT_SYN_HL
    int         nfa_has_zsubexpr;   // NFA regexp has \z( ), set zsubexpr.
#endif
} regexec_T;

// The one execution state that the functions in this file work on.
static regexec_T rex;
// NOTE(review): presumably TRUE while "rex" holds the state of a match that
// is still in progress; its users are not visible here, confirm there.
static int rex_in_use = FALSE;
1175
/*
 * Return TRUE if character 'c' is included in 'iskeyword' option for
 * "reg_buf" buffer.
 * This only forwards to vim_iswordc_buf() with the buffer of the current
 * execution state, rex.reg_buf.
 */
    static int
reg_iswordc(int c)
{
    return vim_iswordc_buf(c, rex.reg_buf);
}
1185
1186/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001187 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1188 */
1189 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001190reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001191{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001192 // when looking behind for a match/no-match lnum is negative. But we
1193 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001194 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001195 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001196 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001197 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001198 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001199 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001200}
1201
#ifdef FEAT_SYN_HL
// Workspace for \z(...\) matches.  The pointer arrays and the position
// arrays are cleared in alternative branches of cleanup_zsubexpr(),
// depending on REG_MULTI.
static char_u *reg_startzp[NSUBEXP];    // Workspace to mark beginning
static char_u *reg_endzp[NSUBEXP];      //   and end of \z(...\) matches
static lpos_T reg_startzpos[NSUBEXP];   // idem, beginning pos
static lpos_T reg_endzpos[NSUBEXP];     // idem, end pos
#endif

// TRUE if using multi-line regexp.  rex.reg_match is only set for a
// single-line match.
#define REG_MULTI (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001211
Bram Moolenaar071d4272004-06-13 20:20:40 +00001212#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001213/*
1214 * Create a new extmatch and mark it as referenced once.
1215 */
1216 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001217make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001218{
1219 reg_extmatch_T *em;
1220
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001221 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001222 if (em != NULL)
1223 em->refcnt = 1;
1224 return em;
1225}
1226
1227/*
1228 * Add a reference to an extmatch.
1229 */
1230 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001231ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001232{
1233 if (em != NULL)
1234 em->refcnt++;
1235 return em;
1236}
1237
1238/*
1239 * Remove a reference to an extmatch. If there are no references left, free
1240 * the info.
1241 */
1242 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001243unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001244{
1245 int i;
1246
1247 if (em != NULL && --em->refcnt <= 0)
1248 {
1249 for (i = 0; i < NSUBEXP; ++i)
1250 vim_free(em->matches[i]);
1251 vim_free(em);
1252 }
1253}
1254#endif
1255
1256/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001257 * Get class of previous character.
1258 */
1259 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001260reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001261{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001262 if (rex.input > rex.line)
1263 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001264 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001265 return -1;
1266}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001267
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001268/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001269 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001270 */
1271 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001272reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001273{
1274 pos_T top, bot;
1275 linenr_T lnum;
1276 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001277 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001278 int mode;
1279 colnr_T start, end;
1280 colnr_T start2, end2;
1281 colnr_T cols;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001282 colnr_T curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001283
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001284 // Check if the buffer is the current buffer.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001285 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001286 return FALSE;
1287
1288 if (VIsual_active)
1289 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001290 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001291 {
1292 top = VIsual;
1293 bot = wp->w_cursor;
1294 }
1295 else
1296 {
1297 top = wp->w_cursor;
1298 bot = VIsual;
1299 }
1300 mode = VIsual_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001301 curswant = wp->w_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001302 }
1303 else
1304 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001305 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001306 {
1307 top = curbuf->b_visual.vi_start;
1308 bot = curbuf->b_visual.vi_end;
1309 }
1310 else
1311 {
1312 top = curbuf->b_visual.vi_end;
1313 bot = curbuf->b_visual.vi_start;
1314 }
1315 mode = curbuf->b_visual.vi_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001316 curswant = curbuf->b_visual.vi_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001317 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001318 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001319 if (lnum < top.lnum || lnum > bot.lnum)
1320 return FALSE;
1321
1322 if (mode == 'v')
1323 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001324 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001325 if ((lnum == top.lnum && col < top.col)
1326 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1327 return FALSE;
1328 }
1329 else if (mode == Ctrl_V)
1330 {
1331 getvvcol(wp, &top, &start, NULL, &end);
1332 getvvcol(wp, &bot, &start2, NULL, &end2);
1333 if (start2 < start)
1334 start = start2;
1335 if (end2 > end)
1336 end = end2;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001337 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001338 end = MAXCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001339 cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001340 if (cols < start || cols > end - (*p_sel == 'e'))
1341 return FALSE;
1342 }
1343 return TRUE;
1344}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001345
Bram Moolenaar071d4272004-06-13 20:20:40 +00001346/*
1347 * Check the regexp program for its magic number.
1348 * Return TRUE if it's wrong.
1349 */
1350 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001351prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001352{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001353 regprog_T *prog;
1354
Bram Moolenaar6100d022016-10-02 16:51:57 +02001355 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001356 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001357 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001358 return FALSE;
1359
1360 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001361 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001362 emsg(_(e_re_corr));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001363 return TRUE;
1364 }
1365 return FALSE;
1366}
1367
1368/*
1369 * Cleanup the subexpressions, if this wasn't done yet.
1370 * This construction is used to clear the subexpressions only when they are
1371 * used (to increase speed).
1372 */
1373 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001374cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001375{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001376 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001377 {
1378 if (REG_MULTI)
1379 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001380 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001381 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1382 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001383 }
1384 else
1385 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001386 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1387 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001388 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001389 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001390 }
1391}
1392
1393#ifdef FEAT_SYN_HL
1394 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001395cleanup_zsubexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001396{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001397 if (rex.need_clear_zsubexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001398 {
1399 if (REG_MULTI)
1400 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001401 // Use 0xff to set lnum to -1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001402 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1403 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1404 }
1405 else
1406 {
1407 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
1408 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
1409 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001410 rex.need_clear_zsubexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001411 }
1412}
1413#endif
1414
1415/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001416 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001417 */
1418 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001419reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001420{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001421 rex.line = reg_getline(++rex.lnum);
1422 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001423 fast_breakcheck();
1424}
1425
/*
 * Check whether a backreference matches.
 * The referenced text runs from column "start_col" in line "start_lnum" to
 * column "end_col" in line "end_lnum"; the lines are fetched with
 * reg_getline().  It is compared with the text at rex.input, and
 * reg_nextline() is used to move on when the referenced text spans more
 * than one line.
 * Returns RA_FAIL (out of memory or interrupted), RA_NOMATCH or RA_MATCH.
 * If "bytelen" is not NULL, it is set to the byte length of the match in the
 * last line.
 */
    static int
match_with_backref(
    linenr_T start_lnum,
    colnr_T  start_col,
    linenr_T end_lnum,
    colnr_T  end_col,
    int      *bytelen)
{
    linenr_T    clnum = start_lnum;     // line currently compared with
    colnr_T     ccol = start_col;       // column in that line
    int         len;
    char_u      *p;

    if (bytelen != NULL)
        *bytelen = 0;
    for (;;)
    {
        // Since getting one line may invalidate the other, need to make copy.
        // Slow!
        // The copy lives in "reg_tofree", which is re-used between calls and
        // only re-allocated when the line does not fit.
        if (rex.line != reg_tofree)
        {
            len = (int)STRLEN(rex.line);
            if (reg_tofree == NULL || len >= (int)reg_tofreelen)
            {
                len += 50;      // get some extra
                vim_free(reg_tofree);
                reg_tofree = alloc(len);
                if (reg_tofree == NULL)
                    return RA_FAIL;     // out of memory!
                reg_tofreelen = len;
            }
            STRCPY(reg_tofree, rex.line);
            // Keep rex.input at the same offset, now in the copy.
            rex.input = reg_tofree + (rex.input - rex.line);
            rex.line = reg_tofree;
        }

        // Get the line to compare with.
        p = reg_getline(clnum);
        if (clnum == end_lnum)
            len = end_col - ccol;       // last line: up to the end column
        else
            len = (int)STRLEN(p + ccol); // other lines: the rest of the line

        // "len" is passed by address; presumably cstrncmp() may adjust it,
        // verify there.
        if (cstrncmp(p + ccol, rex.input, &len) != 0)
            return RA_NOMATCH;  // doesn't match
        if (bytelen != NULL)
            *bytelen += len;
        if (clnum == end_lnum)
            break;              // match and at end!
        if (rex.lnum >= rex.reg_maxline)
            return RA_NOMATCH;  // text too short

        // Advance to next line.
        reg_nextline();
        // Only the length in the last line is reported, start counting again.
        if (bytelen != NULL)
            *bytelen = 0;
        ++clnum;
        ccol = 0;
        if (got_int)
            return RA_FAIL;
    }

    // found a match!  Note that rex.line may now point to a copy of the line,
    // that should not matter.
    return RA_MATCH;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001498
Bram Moolenaarfb031402014-09-09 17:18:49 +02001499/*
1500 * Used in a place where no * or \+ can follow.
1501 */
1502 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001503re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001504{
1505 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001506 {
1507 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1508 rc_did_emsg = TRUE;
1509 return FAIL;
1510 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001511 return OK;
1512}
1513
// One entry of the decomposition table: up to three characters, unused
// positions are zero.
typedef struct
{
    int a, b, c;
} decomp_T;


// Decompositions for the characters 0xfb20 - 0xfb4f, indexed by the
// character minus 0xfb20.  An entry marked UNUSED holds the character itself.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2, 0, 0},          // 0xfb20       alt ayin
    {0x5d0, 0, 0},          // 0xfb21       alt alef
    {0x5d3, 0, 0},          // 0xfb22       alt dalet
    {0x5d4, 0, 0},          // 0xfb23       alt he
    {0x5db, 0, 0},          // 0xfb24       alt kaf
    {0x5dc, 0, 0},          // 0xfb25       alt lamed
    {0x5dd, 0, 0},          // 0xfb26       alt mem-sofit
    {0x5e8, 0, 0},          // 0xfb27       alt resh
    {0x5ea, 0, 0},          // 0xfb28       alt tav
    {'+', 0, 0},            // 0xfb29       alt plus
    {0x5e9, 0x5c1, 0},      // 0xfb2a       shin+shin-dot
    {0x5e9, 0x5c2, 0},      // 0xfb2b       shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},  // 0xfb2c       shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},  // 0xfb2d       shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},      // 0xfb2e       alef+patah
    {0x5d0, 0x5b8, 0},      // 0xfb2f       alef+qamats
    {0x5d0, 0x5b4, 0},      // 0xfb30       alef+hiriq
    {0x5d1, 0x5bc, 0},      // 0xfb31       bet+dagesh
    {0x5d2, 0x5bc, 0},      // 0xfb32       gimel+dagesh
    {0x5d3, 0x5bc, 0},      // 0xfb33       dalet+dagesh
    {0x5d4, 0x5bc, 0},      // 0xfb34       he+dagesh
    {0x5d5, 0x5bc, 0},      // 0xfb35       vav+dagesh
    {0x5d6, 0x5bc, 0},      // 0xfb36       zayin+dagesh
    {0xfb37, 0, 0},         // 0xfb37       -- UNUSED
    {0x5d8, 0x5bc, 0},      // 0xfb38       tet+dagesh
    {0x5d9, 0x5bc, 0},      // 0xfb39       yud+dagesh
    {0x5da, 0x5bc, 0},      // 0xfb3a       kaf sofit+dagesh
    {0x5db, 0x5bc, 0},      // 0xfb3b       kaf+dagesh
    {0x5dc, 0x5bc, 0},      // 0xfb3c       lamed+dagesh
    {0xfb3d, 0, 0},         // 0xfb3d       -- UNUSED
    {0x5de, 0x5bc, 0},      // 0xfb3e       mem+dagesh
    {0xfb3f, 0, 0},         // 0xfb3f       -- UNUSED
    {0x5e0, 0x5bc, 0},      // 0xfb40       nun+dagesh
    {0x5e1, 0x5bc, 0},      // 0xfb41       samech+dagesh
    {0xfb42, 0, 0},         // 0xfb42       -- UNUSED
    {0x5e3, 0x5bc, 0},      // 0xfb43       pe sofit+dagesh
    {0x5e4, 0x5bc, 0},      // 0xfb44       pe+dagesh
    {0xfb45, 0, 0},         // 0xfb45       -- UNUSED
    {0x5e6, 0x5bc, 0},      // 0xfb46       tsadi+dagesh
    {0x5e7, 0x5bc, 0},      // 0xfb47       qof+dagesh
    {0x5e8, 0x5bc, 0},      // 0xfb48       resh+dagesh
    {0x5e9, 0x5bc, 0},      // 0xfb49       shin+dagesh
    {0x5ea, 0x5bc, 0},      // 0xfb4a       tav+dagesh
    {0x5d5, 0x5b9, 0},      // 0xfb4b       vav+holam
    {0x5d1, 0x5bf, 0},      // 0xfb4c       bet+rafe
    {0x5db, 0x5bf, 0},      // 0xfb4d       kaf+rafe
    {0x5e4, 0x5bf, 0},      // 0xfb4e       pe+rafe
    {0x5d0, 0x5dc, 0}       // 0xfb4f       alef-lamed
};

/*
 * Decompose character "c" into up to three characters, which are stored in
 * "*c1", "*c2" and "*c3".
 * Only the characters 0xfb20 - 0xfb4f have an entry in decomp_table.  Any
 * other character is returned in "*c1" with "*c2" and "*c3" set to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    decomp_T    *entry;

    if (c < 0xfb20 || c > 0xfb4f)
    {
        // Not in the table: the character stands for itself.
        *c1 = c;
        *c2 = 0;
        *c3 = 0;
        return;
    }

    entry = &decomp_table[c - 0xfb20];
    *c1 = entry->a;
    *c2 = entry->b;
    *c3 = entry->c;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001591
1592/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001593 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001594 * Return 0 if strings match, non-zero otherwise.
1595 * Correct the length "*n" when composing characters are ignored.
1596 */
1597 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001598cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001599{
1600 int result;
1601
Bram Moolenaar6100d022016-10-02 16:51:57 +02001602 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001603 result = STRNCMP(s1, s2, *n);
1604 else
1605 result = MB_STRNICMP(s1, s2, *n);
1606
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001607 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001608 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001609 {
1610 char_u *str1, *str2;
1611 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001612 int junk;
1613
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001614 // we have to handle the strcmp ourselves, since it is necessary to
1615 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001616 str1 = s1;
1617 str2 = s2;
1618 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001619 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001620 {
1621 c1 = mb_ptr2char_adv(&str1);
1622 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001623
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001624 // Decompose the character if necessary, into 'base' characters.
1625 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001626 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001627 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001628 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001629 mb_decompose(c1, &c11, &junk, &junk);
1630 mb_decompose(c2, &c12, &junk, &junk);
1631 c1 = c11;
1632 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001633 if (c11 != c12
1634 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001635 break;
1636 }
1637 }
1638 result = c2 - c1;
1639 if (result == 0)
1640 *n = (int)(str2 - s2);
1641 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001642
1643 return result;
1644}
1645
1646/*
1647 * cstrchr: This function is used a lot for simple searches, keep it fast!
1648 */
1649 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001650cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001651{
1652 char_u *p;
1653 int cc;
1654
Bram Moolenaara12a1612019-01-24 16:39:02 +01001655 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001656 return vim_strchr(s, c);
1657
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001658 // tolower() and toupper() can be slow, comparing twice should be a lot
1659 // faster (esp. when using MS Visual C++!).
1660 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001661 if (enc_utf8 && c > 0x80)
1662 cc = utf_fold(c);
1663 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001664 if (MB_ISUPPER(c))
1665 cc = MB_TOLOWER(c);
1666 else if (MB_ISLOWER(c))
1667 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001668 else
1669 return vim_strchr(s, c);
1670
Bram Moolenaar071d4272004-06-13 20:20:40 +00001671 if (has_mbyte)
1672 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001673 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001674 {
1675 if (enc_utf8 && c > 0x80)
1676 {
1677 if (utf_fold(utf_ptr2char(p)) == cc)
1678 return p;
1679 }
1680 else if (*p == c || *p == cc)
1681 return p;
1682 }
1683 }
1684 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001685 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001686 for (p = s; *p != NUL; ++p)
1687 if (*p == c || *p == cc)
1688 return p;
1689
1690 return NULL;
1691}
1692
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001693////////////////////////////////////////////////////////////////
1694// regsub stuff //
1695////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001696
Bram Moolenaar071d4272004-06-13 20:20:40 +00001697/*
 * We should define fptr_T as a pointer to a function returning a pointer to
1699 * a function returning a pointer to a function ...
1700 * This is impossible, so we declare a pointer to a function returning a
Bram Moolenaar30d64132020-09-06 17:09:12 +02001701 * void pointer. This should work for all compilers.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001702 */
Bram Moolenaar30d64132020-09-06 17:09:12 +02001703typedef void (*(*fptr_T)(int *, int));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001704
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001705static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001706
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001707 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001708do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001709{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001710 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001711
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001712 return (fptr_T)NULL;
1713}
1714
1715 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001716do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001717{
1718 *d = MB_TOUPPER(c);
1719
1720 return (fptr_T)do_Upper;
1721}
1722
1723 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001724do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001725{
1726 *d = MB_TOLOWER(c);
1727
1728 return (fptr_T)NULL;
1729}
1730
1731 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001732do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001733{
1734 *d = MB_TOLOWER(c);
1735
1736 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001737}
1738
1739/*
1740 * regtilde(): Replace tildes in the pattern by the old pattern.
1741 *
1742 * Short explanation of the tilde: It stands for the previous replacement
1743 * pattern. If that previous pattern also contains a ~ we should go back a
1744 * step further... But we insert the previous pattern into the current one
1745 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001746 * This still does not handle the case where "magic" changes. So require the
1747 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001748 *
1749 * The tildes are parsed once before the first call to vim_regsub().
1750 */
1751 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001752regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001753{
1754 char_u *newsub = source;
1755 char_u *tmpsub;
1756 char_u *p;
1757 int len;
1758 int prevlen;
1759
1760 for (p = newsub; *p; ++p)
1761 {
1762 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1763 {
1764 if (reg_prev_sub != NULL)
1765 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001766 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001767 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001768 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001769 if (tmpsub != NULL)
1770 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001771 // copy prefix
1772 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001773 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001774 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001775 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001776 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001777 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001778 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001779 STRCPY(tmpsub + len + prevlen, p + 1);
1780
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001781 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001782 vim_free(newsub);
1783 newsub = tmpsub;
1784 p = newsub + len + prevlen;
1785 }
1786 }
1787 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001788 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001789 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001790 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001791 --p;
1792 }
1793 else
1794 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001795 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001796 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001797 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001798 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001799 }
1800 }
1801
1802 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001803 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001804 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001805 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001806 reg_prev_sub = vim_strsave(newsub);
1807 return newsub;
1808}
1809
1810#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001811static int can_f_submatch = FALSE; // TRUE when submatch() can be used
Bram Moolenaar071d4272004-06-13 20:20:40 +00001812
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001813// These pointers are used for reg_submatch(). Needed for when the
1814// substitution string is an expression that contains a call to substitute()
1815// and submatch().
Bram Moolenaar6100d022016-10-02 16:51:57 +02001816typedef struct {
1817 regmatch_T *sm_match;
1818 regmmatch_T *sm_mmatch;
1819 linenr_T sm_firstlnum;
1820 linenr_T sm_maxline;
1821 int sm_line_lbr;
1822} regsubmatch_T;
1823
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001824static regsubmatch_T rsm; // can only be used when can_f_submatch is TRUE
Bram Moolenaar071d4272004-06-13 20:20:40 +00001825#endif
1826
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001827#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001828
1829/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001830 * Put the submatches in "argv[argskip]" which is a list passed into
1831 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001832 */
1833 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001834fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001835{
1836 listitem_T *li;
1837 int i;
1838 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001839 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001840
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001841 if (argcount == argskip)
1842 // called function doesn't take a submatches argument
1843 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001844
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001845 // Relies on sl_list to be the first item in staticList10_T.
1846 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001847
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001848 // There are always 10 list items in staticList10_T.
1849 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001850 for (i = 0; i < 10; ++i)
1851 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001852 s = rsm.sm_match->startp[i];
1853 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001854 s = NULL;
1855 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02001856 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001857 li->li_tv.v_type = VAR_STRING;
1858 li->li_tv.vval.v_string = s;
1859 li = li->li_next;
1860 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001861 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001862}
1863
1864 static void
1865clear_submatch_list(staticList10_T *sl)
1866{
1867 int i;
1868
1869 for (i = 0; i < 10; ++i)
1870 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1871}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001872#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001873
Bram Moolenaar071d4272004-06-13 20:20:40 +00001874/*
1875 * vim_regsub() - perform substitutions after a vim_regexec() or
1876 * vim_regexec_multi() match.
1877 *
1878 * If "copy" is TRUE really copy into "dest".
1879 * If "copy" is FALSE nothing is copied, this is just to find out the length
1880 * of the result.
1881 *
1882 * If "backslash" is TRUE, a backslash will be removed later, need to double
1883 * them to keep them, and insert a backslash before a CR to avoid it being
1884 * replaced with a line break later.
1885 *
1886 * Note: The matched text must not change between the call of
1887 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1888 * references invalid!
1889 *
1890 * Returns the size of the replacement, including terminating NUL.
1891 */
1892 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001893vim_regsub(
1894 regmatch_T *rmp,
1895 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001896 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001897 char_u *dest,
1898 int copy,
1899 int magic,
1900 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001901{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001902 int result;
1903 regexec_T rex_save;
1904 int rex_in_use_save = rex_in_use;
1905
1906 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001907 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001908 rex_save = rex;
1909 rex_in_use = TRUE;
1910
1911 rex.reg_match = rmp;
1912 rex.reg_mmatch = NULL;
1913 rex.reg_maxline = 0;
1914 rex.reg_buf = curbuf;
1915 rex.reg_line_lbr = TRUE;
1916 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1917
1918 rex_in_use = rex_in_use_save;
1919 if (rex_in_use)
1920 rex = rex_save;
1921
1922 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001923}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001924
1925 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001926vim_regsub_multi(
1927 regmmatch_T *rmp,
1928 linenr_T lnum,
1929 char_u *source,
1930 char_u *dest,
1931 int copy,
1932 int magic,
1933 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001934{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001935 int result;
1936 regexec_T rex_save;
1937 int rex_in_use_save = rex_in_use;
1938
1939 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001940 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001941 rex_save = rex;
1942 rex_in_use = TRUE;
1943
1944 rex.reg_match = NULL;
1945 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001946 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001947 rex.reg_firstlnum = lnum;
1948 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1949 rex.reg_line_lbr = FALSE;
1950 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1951
1952 rex_in_use = rex_in_use_save;
1953 if (rex_in_use)
1954 rex = rex_save;
1955
1956 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001957}
1958
1959 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001960vim_regsub_both(
1961 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001962 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001963 char_u *dest,
1964 int copy,
1965 int magic,
1966 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001967{
1968 char_u *src;
1969 char_u *dst;
1970 char_u *s;
1971 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001972 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001973 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001974 fptr_T func_all = (fptr_T)NULL;
1975 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001976 linenr_T clnum = 0; // init for GCC
1977 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001978#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001979 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001980#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001981
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001982 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001983 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001984 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001985 emsg(_(e_null));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001986 return 0;
1987 }
1988 if (prog_magic_wrong())
1989 return 0;
1990 src = source;
1991 dst = dest;
1992
1993 /*
1994 * When the substitute part starts with "\=" evaluate it as an expression.
1995 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001996 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001997 {
1998#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001999 // To make sure that the length doesn't change between checking the
2000 // length and copying the string, and to speed up things, the
2001 // resulting string is saved from the call with "copy" == FALSE to the
2002 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002003 if (copy)
2004 {
2005 if (eval_result != NULL)
2006 {
2007 STRCPY(dest, eval_result);
2008 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01002009 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002010 }
2011 }
2012 else
2013 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002014 int prev_can_f_submatch = can_f_submatch;
2015 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002016
2017 vim_free(eval_result);
2018
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002019 // The expression may contain substitute(), which calls us
2020 // recursively. Make sure submatch() gets the text from the first
2021 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002022 if (can_f_submatch)
2023 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002024 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002025 rsm.sm_match = rex.reg_match;
2026 rsm.sm_mmatch = rex.reg_mmatch;
2027 rsm.sm_firstlnum = rex.reg_firstlnum;
2028 rsm.sm_maxline = rex.reg_maxline;
2029 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002030
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002031 if (expr != NULL)
2032 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002033 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002034 char_u buf[NUMBUFLEN];
2035 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002036 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002037 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002038
2039 rettv.v_type = VAR_STRING;
2040 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002041 argv[0].v_type = VAR_LIST;
2042 argv[0].vval.v_list = &matchList.sl_list;
2043 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002044 CLEAR_FIELD(funcexe);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002045 funcexe.argv_func = fill_submatch_list;
2046 funcexe.evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002047 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002048 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002049 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002050 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002051 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002052 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002053 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002054 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002055
Bram Moolenaar6100d022016-10-02 16:51:57 +02002056 s = partial_name(partial);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002057 funcexe.partial = partial;
2058 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002059 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002060 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002061 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002062 clear_submatch_list(&matchList);
2063
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002064 if (rettv.v_type == VAR_UNKNOWN)
2065 // something failed, no need to report another error
2066 eval_result = NULL;
2067 else
2068 {
2069 eval_result = tv_get_string_buf_chk(&rettv, buf);
2070 if (eval_result != NULL)
2071 eval_result = vim_strsave(eval_result);
2072 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002073 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002074 }
Bram Moolenaar4c137212021-04-19 16:48:48 +02002075 else if (substitute_instr != NULL)
2076 // Execute instructions from ISN_SUBSTITUTE.
2077 eval_result = exe_substitute_instr();
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002078 else
Bram Moolenaarb171fb12020-06-24 20:34:03 +02002079 eval_result = eval_to_string(source + 2, TRUE);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002080
Bram Moolenaar071d4272004-06-13 20:20:40 +00002081 if (eval_result != NULL)
2082 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002083 int had_backslash = FALSE;
2084
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002085 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002086 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002087 // Change NL to CR, so that it becomes a line break,
2088 // unless called from vim_regexec_nl().
2089 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002090 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002091 *s = CAR;
2092 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002093 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002094 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002095 /* Change NL to CR here too, so that this works:
2096 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2097 * abc\
2098 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002099 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002100 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002101 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002102 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002103 had_backslash = TRUE;
2104 }
2105 }
2106 if (had_backslash && backslash)
2107 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002108 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002109 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2110 if (s != NULL)
2111 {
2112 vim_free(eval_result);
2113 eval_result = s;
2114 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002115 }
2116
2117 dst += STRLEN(eval_result);
2118 }
2119
Bram Moolenaar6100d022016-10-02 16:51:57 +02002120 can_f_submatch = prev_can_f_submatch;
2121 if (can_f_submatch)
2122 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002123 }
2124#endif
2125 }
2126 else
2127 while ((c = *src++) != NUL)
2128 {
2129 if (c == '&' && magic)
2130 no = 0;
2131 else if (c == '\\' && *src != NUL)
2132 {
2133 if (*src == '&' && !magic)
2134 {
2135 ++src;
2136 no = 0;
2137 }
2138 else if ('0' <= *src && *src <= '9')
2139 {
2140 no = *src++ - '0';
2141 }
2142 else if (vim_strchr((char_u *)"uUlLeE", *src))
2143 {
2144 switch (*src++)
2145 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002146 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002147 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002148 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002149 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002150 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002151 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002152 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002153 continue;
2154 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002155 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002156 continue;
2157 }
2158 }
2159 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002160 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002161 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002162 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2163 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002164 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002165 if (copy)
2166 {
2167 *dst++ = c;
2168 *dst++ = *src++;
2169 *dst++ = *src++;
2170 }
2171 else
2172 {
2173 dst += 3;
2174 src += 2;
2175 }
2176 continue;
2177 }
2178
Bram Moolenaar071d4272004-06-13 20:20:40 +00002179 if (c == '\\' && *src != NUL)
2180 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002181 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002182 switch (*src)
2183 {
2184 case 'r': c = CAR; ++src; break;
2185 case 'n': c = NL; ++src; break;
2186 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002187 // Oh no! \e already has meaning in subst pat :-(
2188 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002189 case 'b': c = Ctrl_H; ++src; break;
2190
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002191 // If "backslash" is TRUE the backslash will be removed
2192 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002193 default: if (backslash)
2194 {
2195 if (copy)
2196 *dst = '\\';
2197 ++dst;
2198 }
2199 c = *src++;
2200 }
2201 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002202 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002203 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002204
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002205 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002206 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002207 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002208 func_one = (fptr_T)(func_one(&cc, c));
2209 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002210 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002211 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002212 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002213 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002214
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002215 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002216 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002217 int totlen = mb_ptr2len(src - 1);
2218
Bram Moolenaar071d4272004-06-13 20:20:40 +00002219 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002220 mb_char2bytes(cc, dst);
2221 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002222 if (enc_utf8)
2223 {
2224 int clen = utf_ptr2len(src - 1);
2225
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002226 // If the character length is shorter than "totlen", there
2227 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002228 if (clen < totlen)
2229 {
2230 if (copy)
2231 mch_memmove(dst + 1, src - 1 + clen,
2232 (size_t)(totlen - clen));
2233 dst += totlen - clen;
2234 }
2235 }
2236 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002237 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002238 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002239 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002240 dst++;
2241 }
2242 else
2243 {
2244 if (REG_MULTI)
2245 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002246 clnum = rex.reg_mmatch->startpos[no].lnum;
2247 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002248 s = NULL;
2249 else
2250 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002251 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2252 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2253 len = rex.reg_mmatch->endpos[no].col
2254 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002255 else
2256 len = (int)STRLEN(s);
2257 }
2258 }
2259 else
2260 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002261 s = rex.reg_match->startp[no];
2262 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002263 s = NULL;
2264 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002265 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002266 }
2267 if (s != NULL)
2268 {
2269 for (;;)
2270 {
2271 if (len == 0)
2272 {
2273 if (REG_MULTI)
2274 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002275 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002276 break;
2277 if (copy)
2278 *dst = CAR;
2279 ++dst;
2280 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002281 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2282 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002283 else
2284 len = (int)STRLEN(s);
2285 }
2286 else
2287 break;
2288 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002289 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002290 {
2291 if (copy)
Bram Moolenaare83cca22020-09-07 18:53:21 +02002292 iemsg(_(e_re_damg));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002293 goto exit;
2294 }
2295 else
2296 {
2297 if (backslash && (*s == CAR || *s == '\\'))
2298 {
2299 /*
2300 * Insert a backslash in front of a CR, otherwise
2301 * it will be replaced by a line break.
2302 * Number of backslashes will be halved later,
2303 * double them here.
2304 */
2305 if (copy)
2306 {
2307 dst[0] = '\\';
2308 dst[1] = *s;
2309 }
2310 dst += 2;
2311 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002312 else
2313 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002314 if (has_mbyte)
2315 c = mb_ptr2char(s);
2316 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002317 c = *s;
2318
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002319 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002320 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002321 func_one = (fptr_T)(func_one(&cc, c));
2322 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002323 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002324 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002325 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002326 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002327
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002328 if (has_mbyte)
2329 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002330 int l;
2331
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002332 // Copy composing characters separately, one
2333 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002334 if (enc_utf8)
2335 l = utf_ptr2len(s) - 1;
2336 else
2337 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002338
2339 s += l;
2340 len -= l;
2341 if (copy)
2342 mb_char2bytes(cc, dst);
2343 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002344 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002345 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002346 *dst = cc;
2347 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002348 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002349
Bram Moolenaar071d4272004-06-13 20:20:40 +00002350 ++s;
2351 --len;
2352 }
2353 }
2354 }
2355 no = -1;
2356 }
2357 }
2358 if (copy)
2359 *dst = NUL;
2360
2361exit:
2362 return (int)((dst - dest) + 1);
2363}
2364
2365#ifdef FEAT_EVAL
2366/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002367 * Call reg_getline() with the line numbers from the submatch. If a
2368 * substitute() was used the reg_maxline and other values have been
2369 * overwritten.
2370 */
2371 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002372reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002373{
2374 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002375 linenr_T save_first = rex.reg_firstlnum;
2376 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002377
Bram Moolenaar6100d022016-10-02 16:51:57 +02002378 rex.reg_firstlnum = rsm.sm_firstlnum;
2379 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002380
2381 s = reg_getline(lnum);
2382
Bram Moolenaar6100d022016-10-02 16:51:57 +02002383 rex.reg_firstlnum = save_first;
2384 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002385 return s;
2386}
2387
2388/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002389 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002390 * allocated memory.
2391 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2392 */
2393 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002394reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002395{
2396 char_u *retval = NULL;
2397 char_u *s;
2398 int len;
2399 int round;
2400 linenr_T lnum;
2401
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002402 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002403 return NULL;
2404
Bram Moolenaar6100d022016-10-02 16:51:57 +02002405 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002406 {
2407 /*
2408 * First round: compute the length and allocate memory.
2409 * Second round: copy the text.
2410 */
2411 for (round = 1; round <= 2; ++round)
2412 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002413 lnum = rsm.sm_mmatch->startpos[no].lnum;
2414 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002415 return NULL;
2416
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002417 s = reg_getline_submatch(lnum);
2418 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002419 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002420 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002421 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002422 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002423 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002424 len = rsm.sm_mmatch->endpos[no].col
2425 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002426 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002427 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002428 ++len;
2429 }
2430 else
2431 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002432 // Multiple lines: take start line from start col, middle
2433 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002434 len = (int)STRLEN(s);
2435 if (round == 2)
2436 {
2437 STRCPY(retval, s);
2438 retval[len] = '\n';
2439 }
2440 ++len;
2441 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002442 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002443 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002444 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002445 if (round == 2)
2446 STRCPY(retval + len, s);
2447 len += (int)STRLEN(s);
2448 if (round == 2)
2449 retval[len] = '\n';
2450 ++len;
2451 }
2452 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002453 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002454 rsm.sm_mmatch->endpos[no].col);
2455 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002456 if (round == 2)
2457 retval[len] = NUL;
2458 ++len;
2459 }
2460
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002461 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002462 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002463 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002464 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002465 return NULL;
2466 }
2467 }
2468 }
2469 else
2470 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002471 s = rsm.sm_match->startp[no];
2472 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002473 retval = NULL;
2474 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002475 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002476 }
2477
2478 return retval;
2479}
Bram Moolenaar41571762014-04-02 19:00:58 +02002480
2481/*
2482 * Used for the submatch() function with the optional non-zero argument: get
2483 * the list of strings from the n'th submatch in allocated memory with NULs
2484 * represented in NLs.
2485 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2486 * command, for a non-existing submatch and for any error.
2487 */
2488 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002489reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002490{
2491 char_u *s;
2492 linenr_T slnum;
2493 linenr_T elnum;
2494 colnr_T scol;
2495 colnr_T ecol;
2496 int i;
2497 list_T *list;
2498 int error = FALSE;
2499
2500 if (!can_f_submatch || no < 0)
2501 return NULL;
2502
Bram Moolenaar6100d022016-10-02 16:51:57 +02002503 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002504 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002505 slnum = rsm.sm_mmatch->startpos[no].lnum;
2506 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002507 if (slnum < 0 || elnum < 0)
2508 return NULL;
2509
Bram Moolenaar6100d022016-10-02 16:51:57 +02002510 scol = rsm.sm_mmatch->startpos[no].col;
2511 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002512
2513 list = list_alloc();
2514 if (list == NULL)
2515 return NULL;
2516
2517 s = reg_getline_submatch(slnum) + scol;
2518 if (slnum == elnum)
2519 {
2520 if (list_append_string(list, s, ecol - scol) == FAIL)
2521 error = TRUE;
2522 }
2523 else
2524 {
2525 if (list_append_string(list, s, -1) == FAIL)
2526 error = TRUE;
2527 for (i = 1; i < elnum - slnum; i++)
2528 {
2529 s = reg_getline_submatch(slnum + i);
2530 if (list_append_string(list, s, -1) == FAIL)
2531 error = TRUE;
2532 }
2533 s = reg_getline_submatch(elnum);
2534 if (list_append_string(list, s, ecol) == FAIL)
2535 error = TRUE;
2536 }
2537 }
2538 else
2539 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002540 s = rsm.sm_match->startp[no];
2541 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002542 return NULL;
2543 list = list_alloc();
2544 if (list == NULL)
2545 return NULL;
2546 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002547 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002548 error = TRUE;
2549 }
2550
2551 if (error)
2552 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002553 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002554 return NULL;
2555 }
Bram Moolenaar8a0dcf42020-09-06 15:14:45 +02002556 ++list->lv_refcount;
Bram Moolenaar41571762014-04-02 19:00:58 +02002557 return list;
2558}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002559#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002560
Bram Moolenaarf4140482020-02-15 23:06:45 +01002561/*
2562 * Initialize the values used for matching against multiple lines
2563 */
2564 static void
2565init_regexec_multi(
2566 regmmatch_T *rmp,
2567 win_T *win, // window in which to search or NULL
2568 buf_T *buf, // buffer in which to search
2569 linenr_T lnum) // nr of line to start looking for match
2570{
2571 rex.reg_match = NULL;
2572 rex.reg_mmatch = rmp;
2573 rex.reg_buf = buf;
2574 rex.reg_win = win;
2575 rex.reg_firstlnum = lnum;
2576 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2577 rex.reg_line_lbr = FALSE;
2578 rex.reg_ic = rmp->rmm_ic;
2579 rex.reg_icombine = FALSE;
2580 rex.reg_maxcol = rmp->rmm_maxcol;
2581}
2582
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002583#include "regexp_bt.c"
2584
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002585static regengine_T bt_regengine =
2586{
2587 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002588 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002589 bt_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002590 bt_regexec_multi,
2591 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002592};
2593
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002594#include "regexp_nfa.c"
2595
2596static regengine_T nfa_regengine =
2597{
2598 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002599 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002600 nfa_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002601 nfa_regexec_multi,
2602 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002603};
2604
// The regexp engine to use.  Set by vim_regcomp() from p_re or from a "\%#="
// prefix in the pattern.
// Must match with 'regexpengine'.
static int regexp_engine = 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002608
#ifdef DEBUG
// Names of the regexp engines for debugging messages, indexed by the engine
// number (see the smsg() call in vim_regcomp()).
static char_u regname[][30] = {
    "AUTOMATIC Regexp Engine",
    "BACKTRACKING Regexp Engine",
    "NFA Regexp Engine"
};
#endif
2616
2617/*
2618 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002619 * Returns the program in allocated memory.
2620 * Use vim_regfree() to free the memory.
2621 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002622 */
2623 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002624vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002625{
2626 regprog_T *prog = NULL;
2627 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002628 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002629
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002630 regexp_engine = p_re;
2631
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002632 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002633 if (STRNCMP(expr, "\\%#=", 4) == 0)
2634 {
2635 int newengine = expr[4] - '0';
2636
2637 if (newengine == AUTOMATIC_ENGINE
2638 || newengine == BACKTRACKING_ENGINE
2639 || newengine == NFA_ENGINE)
2640 {
2641 regexp_engine = expr[4] - '0';
2642 expr += 5;
2643#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002644 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002645 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002646#endif
2647 }
2648 else
2649 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002650 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002651 regexp_engine = AUTOMATIC_ENGINE;
2652 }
2653 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002654#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002655 bt_regengine.expr = expr;
2656 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002657#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002658 // reg_iswordc() uses rex.reg_buf
2659 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002660
2661 /*
2662 * First try the NFA engine, unless backtracking was requested.
2663 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002664 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002665 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002666 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002667 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002668 else
2669 prog = bt_regengine.regcomp(expr, re_flags);
2670
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002671 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002672 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002673 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002674#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002675 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002676 {
2677 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002678 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002679 if (f)
2680 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002681 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002682 fclose(f);
2683 }
2684 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002685 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002686 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002687 }
2688#endif
2689 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002690 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002691 * The NFA engine also fails for patterns that it can't handle well
2692 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002693 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002694 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002695 if (regexp_engine == AUTOMATIC_ENGINE
2696 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002697 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002698 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002699#ifdef FEAT_EVAL
2700 report_re_switch(expr);
2701#endif
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002702 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002703 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002704 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002705
Bram Moolenaarfda37292014-11-05 14:27:36 +01002706 if (prog != NULL)
2707 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002708 // Store the info needed to call regcomp() again when the engine turns
2709 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002710 prog->re_engine = regexp_engine;
2711 prog->re_flags = re_flags;
2712 }
2713
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002714 return prog;
2715}
2716
2717/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002718 * Free a compiled regexp program, returned by vim_regcomp().
2719 */
2720 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002721vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002722{
2723 if (prog != NULL)
2724 prog->engine->regfree(prog);
2725}
2726
#if defined(EXITFREE) || defined(PROTO)
/*
 * Release the memory held by the file-level regexp state: the "regstack" and
 * "backpos" growarrays and the "reg_tofree" and "reg_prev_sub" buffers.
 */
    void
free_regexp_stuff(void)
{
    // growarrays
    ga_clear(&regstack);
    ga_clear(&backpos);

    // allocated buffers
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);
}
#endif
2737
#ifdef FEAT_EVAL
/*
 * When p_verbose is above zero: give a message that the backtracking engine
 * is going to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
	return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2751
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
	|| defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog": whether "prog" is currently being
 * executed.  "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    int	    in_use = prog->re_in_use;

    return in_use;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002763
2764/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002765 * Match a regexp against a string.
2766 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002767 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002768 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002769 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002770 *
2771 * Return TRUE if there is a match, FALSE if not.
2772 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002773 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002774vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002775 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002776 char_u *line, // string to match against
2777 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002778 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002779{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002780 int result;
2781 regexec_T rex_save;
2782 int rex_in_use_save = rex_in_use;
2783
Bram Moolenaar0270f382018-07-17 05:43:58 +02002784 // Cannot use the same prog recursively, it contains state.
2785 if (rmp->regprog->re_in_use)
2786 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002787 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002788 return FALSE;
2789 }
2790 rmp->regprog->re_in_use = TRUE;
2791
Bram Moolenaar6100d022016-10-02 16:51:57 +02002792 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002793 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002794 rex_save = rex;
2795 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002796
Bram Moolenaar6100d022016-10-02 16:51:57 +02002797 rex.reg_startp = NULL;
2798 rex.reg_endp = NULL;
2799 rex.reg_startpos = NULL;
2800 rex.reg_endpos = NULL;
2801
2802 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002803 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002804
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002805 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002806 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2807 && result == NFA_TOO_EXPENSIVE)
2808 {
2809 int save_p_re = p_re;
2810 int re_flags = rmp->regprog->re_flags;
2811 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2812
2813 p_re = BACKTRACKING_ENGINE;
2814 vim_regfree(rmp->regprog);
2815 if (pat != NULL)
2816 {
2817#ifdef FEAT_EVAL
2818 report_re_switch(pat);
2819#endif
2820 rmp->regprog = vim_regcomp(pat, re_flags);
2821 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002822 {
2823 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002824 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002825 rmp->regprog->re_in_use = FALSE;
2826 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002827 vim_free(pat);
2828 }
2829
2830 p_re = save_p_re;
2831 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002832
2833 rex_in_use = rex_in_use_save;
2834 if (rex_in_use)
2835 rex = rex_save;
2836
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002837 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002838}
2839
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002840/*
2841 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002842 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002843 */
2844 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002845vim_regexec_prog(
2846 regprog_T **prog,
2847 int ignore_case,
2848 char_u *line,
2849 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002850{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002851 int r;
2852 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002853
2854 regmatch.regprog = *prog;
2855 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002856 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002857 *prog = regmatch.regprog;
2858 return r;
2859}
2860
2861/*
2862 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002863 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002864 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002865 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002866vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002867{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002868 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002869}
2870
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002871/*
2872 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002873 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002874 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002875 */
2876 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002877vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002878{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002879 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002880}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002881
2882/*
2883 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002884 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2885 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002886 * Uses curbuf for line count and 'iskeyword'.
2887 *
2888 * Return zero if there is no match. Return number of lines contained in the
2889 * match otherwise.
2890 */
2891 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002892vim_regexec_multi(
2893 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002894 win_T *win, // window in which to search or NULL
2895 buf_T *buf, // buffer in which to search
2896 linenr_T lnum, // nr of line to start looking for match
2897 colnr_T col, // column to start looking for match
2898 proftime_T *tm, // timeout limit or NULL
2899 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002900{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002901 int result;
2902 regexec_T rex_save;
2903 int rex_in_use_save = rex_in_use;
2904
Bram Moolenaar0270f382018-07-17 05:43:58 +02002905 // Cannot use the same prog recursively, it contains state.
2906 if (rmp->regprog->re_in_use)
2907 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002908 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002909 return FALSE;
2910 }
2911 rmp->regprog->re_in_use = TRUE;
2912
Bram Moolenaar6100d022016-10-02 16:51:57 +02002913 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002914 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002915 rex_save = rex;
2916 rex_in_use = TRUE;
2917
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002918 result = rmp->regprog->engine->regexec_multi(
2919 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002920 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002921
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002922 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002923 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2924 && result == NFA_TOO_EXPENSIVE)
2925 {
2926 int save_p_re = p_re;
2927 int re_flags = rmp->regprog->re_flags;
2928 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2929
2930 p_re = BACKTRACKING_ENGINE;
2931 vim_regfree(rmp->regprog);
2932 if (pat != NULL)
2933 {
2934#ifdef FEAT_EVAL
2935 report_re_switch(pat);
2936#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002937#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002938 // checking for \z misuse was already done when compiling for NFA,
2939 // allow all here
2940 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002941#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002942 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002943#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002944 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002945#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002946
Bram Moolenaarfda37292014-11-05 14:27:36 +01002947 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002948 {
2949 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002950 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002951 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002952 rmp->regprog->re_in_use = FALSE;
2953 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002954 vim_free(pat);
2955 }
2956 p_re = save_p_re;
2957 }
2958
Bram Moolenaar6100d022016-10-02 16:51:57 +02002959 rex_in_use = rex_in_use_save;
2960 if (rex_in_use)
2961 rex = rex_save;
2962
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002963 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002964}