blob: e372dd44dc339687976946348624c05cdbbe597b [file] [log] [blame]
Bram Moolenaaredf3f972016-08-29 22:49:24 +02001/* vi:set ts=8 sts=4 sw=4 noet:
Bram Moolenaar071d4272004-06-13 20:20:40 +00002 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
Bram Moolenaar071d4272004-06-13 20:20:40 +00004 */
5
// Regexp debugging is off by default: no debugging logs or files related to
// regular expressions are created, even when compiling with -DDEBUG.
// To get the regexp debugging, uncomment the "#define DEBUG" line below.
#undef DEBUG
// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
#if defined(DEBUG)
// Show/save debugging data when the BT engine is used.
# define BT_REGEXP_DUMP
// Save the debugging data to a file instead of displaying it.
# define BT_REGEXP_LOG
# define BT_REGEXP_DEBUG_LOG
# define BT_REGEXP_DEBUG_LOG_NAME	"bt_regexp_debug.log"
#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning: they do not match literally.
 * A Magic character is stored as a negative number, which separates it from
 * literal characters (possibly multi-byte).  Only ASCII characters can be
 * Magic.
 */
#define Magic(x)	((int)(x) - 256)	// make "x" Magic
#define un_Magic(x)	((x) + 256)		// undo Magic()
#define is_Magic(x)	((x) < 0)		// non-zero when "x" is Magic
31
// Return "x" without magicness: a Magic value is converted back with
// un_Magic(), any other value is returned unchanged.
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}
39
// Flip the magicness of "x": a Magic value becomes the plain character and a
// plain character becomes its Magic counterpart.
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47
/*
 * The first byte of the BT regexp internal "program" is this magic number;
 * the start node begins in the second byte.  It is used to catch the most
 * severe mutilation of the program by the caller.
 */
#define REGMAGIC	0234

/*
 * Utility definitions.
 */
// Read the item at "p" through a char_u pointer and return it as an int.
#define UCHARAT(p)	((int)*(char_u *)(p))
60
// Used for an error (down from) vim_regcomp(): give the error message, set
// rc_did_emsg and return NULL or FAIL from the function using the macro.
// In the variants that take "c", the first "%s" of the message is given an
// empty string when "c" is non-zero and a backslash when it is zero.
#define EMSG_RET_NULL(m) \
	return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) \
	return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) \
	return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) \
	return (semsg((const char *)(m), (c) ? "" : "\\"), \
		rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) \
	return (semsg((const char *)(m), (c) ? "" : "\\", (a)), \
		rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) \
	return (semsg((const char *)(m), (c) ? "" : "\\"), \
		rc_did_emsg = TRUE, FAIL)
#define EMSG_ONE_RET_NULL \
	EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), \
		reg_magic == MAGIC_ALL)
Bram Moolenaar071d4272004-06-13 20:20:40 +000070
Bram Moolenaar95f09602016-11-10 20:01:45 +010071
Bram Moolenaar071d4272004-06-13 20:20:40 +000072#define MAX_LIMIT (32767L << 16L)
73
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020074static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
Bram Moolenaar966e58e2017-06-05 16:54:08 +020075static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76static char_u e_large_class[] = N_("E945: Range too large in character class");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020077static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
78static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
79static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020080#ifdef FEAT_SYN_HL
Bram Moolenaar5de820b2013-06-02 15:01:57 +020081static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
Bram Moolenaarbcf94422018-06-23 14:21:42 +020082static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020083#endif
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +020084static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
Bram Moolenaar2976c022013-06-05 21:30:37 +020085static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
Bram Moolenaar0270f382018-07-17 05:43:58 +020086static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
87
// return values for re_multi_type()
#define NOT_MULTI	0	// not a "multi" operator
#define MULTI_ONE	1	// single "multi" operator
#define MULTI_MULT	2	// multi "multi" operator

// return values for regmatch()
#define RA_FAIL		1	// something failed, abort
#define RA_CONT		2	// continue in inner loop
#define RA_BREAK	3	// break inner loop
#define RA_MATCH	4	// successful match
#define RA_NOMATCH	5	// didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020098
Bram Moolenaar071d4272004-06-13 20:20:40 +000099/*
100 * Return NOT_MULTI if c is not a "multi" operator.
101 * Return MULTI_ONE if c is a single "multi" operator.
102 * Return MULTI_MULT if c is a multi "multi" operator.
103 */
104 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100105re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000106{
107 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
108 return MULTI_ONE;
109 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
110 return MULTI_MULT;
111 return NOT_MULTI;
112}
113
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000114static char_u *reg_prev_sub = NULL;
115
Bram Moolenaar071d4272004-06-13 20:20:40 +0000116/*
117 * REGEXP_INRANGE contains all characters which are always special in a []
118 * range after '\'.
119 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
120 * These are:
121 * \n - New line (NL).
122 * \r - Carriage Return (CR).
123 * \t - Tab (TAB).
124 * \e - Escape (ESC).
125 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000126 * \d - Character code in decimal, eg \d123
 *	\o	- Character code in octal, eg \o40
128 * \x - Character code in hex, eg \x4a
129 * \u - Multibyte character code, eg \u20ac
130 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000131 */
132static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000133static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000134
Bram Moolenaar071d4272004-06-13 20:20:40 +0000135/*
136 * Translate '\x' to its control character, except "\n", which is Magic.
137 */
138 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100139backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000140{
141 switch (c)
142 {
143 case 'r': return CAR;
144 case 't': return TAB;
145 case 'e': return ESC;
146 case 'b': return BS;
147 }
148 return c;
149}
150
151/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000152 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000153 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
154 * recognized. Otherwise "pp" is advanced to after the item.
155 */
156 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100157get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000158{
159 static const char *(class_names[]) =
160 {
161 "alnum:]",
162#define CLASS_ALNUM 0
163 "alpha:]",
164#define CLASS_ALPHA 1
165 "blank:]",
166#define CLASS_BLANK 2
167 "cntrl:]",
168#define CLASS_CNTRL 3
169 "digit:]",
170#define CLASS_DIGIT 4
171 "graph:]",
172#define CLASS_GRAPH 5
173 "lower:]",
174#define CLASS_LOWER 6
175 "print:]",
176#define CLASS_PRINT 7
177 "punct:]",
178#define CLASS_PUNCT 8
179 "space:]",
180#define CLASS_SPACE 9
181 "upper:]",
182#define CLASS_UPPER 10
183 "xdigit:]",
184#define CLASS_XDIGIT 11
185 "tab:]",
186#define CLASS_TAB 12
187 "return:]",
188#define CLASS_RETURN 13
189 "backspace:]",
190#define CLASS_BACKSPACE 14
191 "escape:]",
192#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100193 "ident:]",
194#define CLASS_IDENT 16
195 "keyword:]",
196#define CLASS_KEYWORD 17
197 "fname:]",
198#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000199 };
200#define CLASS_NONE 99
201 int i;
202
203 if ((*pp)[1] == ':')
204 {
Bram Moolenaar78a15312009-05-15 19:33:18 +0000205 for (i = 0; i < (int)(sizeof(class_names) / sizeof(*class_names)); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000206 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
207 {
208 *pp += STRLEN(class_names[i]) + 2;
209 return i;
210 }
211 }
212 return CLASS_NONE;
213}
214
/*
 * Specific version of character class functions.
 * A table indexed by the character value is used to keep this fast; each
 * entry is a combination of the RI_ flags below.
 */
static short	class_tab[256];

#define RI_DIGIT	0x01
#define RI_HEX		0x02
#define RI_OCTAL	0x04
#define RI_WORD		0x08
#define RI_HEAD		0x10
#define RI_ALPHA	0x20
#define RI_LOWER	0x40
#define RI_UPPER	0x80
#define RI_WHITE	0x100
230
231 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100232init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000233{
234 int i;
235 static int done = FALSE;
236
237 if (done)
238 return;
239
240 for (i = 0; i < 256; ++i)
241 {
242 if (i >= '0' && i <= '7')
243 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
244 else if (i >= '8' && i <= '9')
245 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
246 else if (i >= 'a' && i <= 'f')
247 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
248#ifdef EBCDIC
249 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
250 || (i >= 's' && i <= 'z'))
251#else
252 else if (i >= 'g' && i <= 'z')
253#endif
254 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
255 else if (i >= 'A' && i <= 'F')
256 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
257#ifdef EBCDIC
258 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
259 || (i >= 'S' && i <= 'Z'))
260#else
261 else if (i >= 'G' && i <= 'Z')
262#endif
263 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
264 else if (i == '_')
265 class_tab[i] = RI_WORD + RI_HEAD;
266 else
267 class_tab[i] = 0;
268 }
269 class_tab[' '] |= RI_WHITE;
270 class_tab['\t'] |= RI_WHITE;
271 done = TRUE;
272}
273
// Character class tests using class_tab[].  Each one evaluates to non-zero
// when "c" is in the range 0 - 255 and has the RI_ flag set in class_tab[].
// The argument is parenthesized, so that an expression such as "x & 0xff" is
// handled as one value, and a negative value (e.g. a Magic character) never
// indexes class_tab[].
// Note that "c" is evaluated more than once: do not pass an expression with
// side effects.
#define ri_digit(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
#define ri_hex(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEX))
#define ri_octal(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
#define ri_word(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WORD))
#define ri_head(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEAD))
#define ri_alpha(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
#define ri_lower(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_LOWER))
#define ri_upper(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_UPPER))
#define ri_white(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000283
// flags for regflags, each one is a separate bit
#define RF_ICASE	0x01	// ignore case
#define RF_NOICASE	0x02	// don't ignore case
#define RF_HASNL	0x04	// can match a NL
#define RF_ICOMBINE	0x08	// ignore combining characters
#define RF_LOOKBH	0x10	// uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000290
291/*
292 * Global work variables for vim_regcomp().
293 */
294
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295static char_u *regparse; // Input-scan pointer.
296static int regnpar; // () count.
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100297static int wants_nfa; // regex should use NFA engine
Bram Moolenaar071d4272004-06-13 20:20:40 +0000298#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100299static int regnzpar; // \z() count.
300static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000301#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000303#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100304static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000305#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000306
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100307static magic_T reg_magic; // magicness of the pattern
Bram Moolenaar071d4272004-06-13 20:20:40 +0000308
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100309static int reg_string; // matching with a string instead of a buffer
310 // line
311static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000312
313/*
314 * META contains all characters that may be magic, except '^' and '$'.
315 */
316
317#ifdef EBCDIC
318static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
319#else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100320// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000321static char_u META_flags[] = {
322 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
323 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100324// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000325 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100326// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000327 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100328// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000329 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100330// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000331 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100332// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000333 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100334// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000335 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
336};
337#endif
338
static int	curchr;		// currently parsed character

// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, eg in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int	prevchr;

static int	prevprevchr;	// previous-previous character
static int	nextchr;	// used for ungetchr()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000346
// arguments for reg(): the kind of parenthesis being parsed
#define REG_NOPAREN	0	// toplevel reg()
#define REG_PAREN	1	// \(\)
#define REG_ZPAREN	2	// \z(\)
#define REG_NPAREN	3	// \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000352
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200353typedef struct
354{
355 char_u *regparse;
356 int prevchr_len;
357 int curchr;
358 int prevchr;
359 int prevprevchr;
360 int nextchr;
361 int at_start;
362 int prev_at_start;
363 int regnpar;
364} parse_state_T;
365
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100366static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100367static int getchr(void);
368static void skipchr_keepstart(void);
369static int peekchr(void);
370static void skipchr(void);
371static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100372static long gethexchrs(int maxinputlen);
373static long getoctchrs(void);
374static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100375static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100376static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200377static int cstrncmp(char_u *s1, char_u *s2, int *n);
378static char_u *cstrchr(char_u *, int);
379static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100380static int reg_iswordc(int);
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100381#ifdef FEAT_EVAL
382static void report_re_switch(char_u *pat);
383#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200385static regengine_T bt_regengine;
386static regengine_T nfa_regengine;
387
Bram Moolenaar071d4272004-06-13 20:20:40 +0000388/*
389 * Return TRUE if compiled regular expression "prog" can match a line break.
390 */
391 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100392re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000393{
394 return (prog->regflags & RF_HASNL);
395}
396
397/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000398 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
399 * Returns a character representing the class. Zero means that no item was
400 * recognized. Otherwise "pp" is advanced to after the item.
401 */
402 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100403get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000404{
405 int c;
406 int l = 1;
407 char_u *p = *pp;
408
Bram Moolenaar985079c2019-02-16 17:07:47 +0100409 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000410 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000411 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000412 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000413 if (p[l + 2] == '=' && p[l + 3] == ']')
414 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000415 if (has_mbyte)
416 c = mb_ptr2char(p + 2);
417 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000418 c = p[2];
419 *pp += l + 4;
420 return c;
421 }
422 }
423 return 0;
424}
425
#ifdef EBCDIC
/*
 * Table for equivalence class "c". (IBM-1047)
 * Each entry starts with an unaccented letter, the bytes that follow are
 * given as hex codes.
 * NOTE(review): the hex codes are presumably the accented variants of that
 * letter in IBM-1047; verify against the code page.
 */
static char *EQUIVAL_CLASS_C[16] = {
    "A\x62\x63\x64\x65\x66\x67",
    "C\x68",
    "E\x71\x72\x73\x74",
    "I\x75\x76\x77\x78",
    "N\x69",
    "O\xEB\xEC\xED\xEE\xEF\x80",
    "U\xFB\xFC\xFD\xFE",
    "Y\xBA",
    "a\x42\x43\x44\x45\x46\x47",
    "c\x48",
    "e\x51\x52\x53\x54",
    "i\x55\x56\x57\x58",
    "n\x49",
    "o\xCB\xCC\xCD\xCE\xCF\x70",
    "u\xDB\xDC\xDD\xDE",
    "y\x8D\xDF",
};
#endif
449
Bram Moolenaardf177f62005-02-22 08:39:57 +0000450/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000451 * Check for a collating element "[.a.]". "pp" points to the '['.
452 * Returns a character. Zero means that no item was recognized. Otherwise
453 * "pp" is advanced to after the item.
454 * Currently only single characters are recognized!
455 */
456 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100457get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000458{
459 int c;
460 int l = 1;
461 char_u *p = *pp;
462
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100463 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000464 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000465 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000466 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000467 if (p[l + 2] == '.' && p[l + 3] == ']')
468 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000469 if (has_mbyte)
470 c = mb_ptr2char(p + 2);
471 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000472 c = p[2];
473 *pp += l + 4;
474 return c;
475 }
476 }
477 return 0;
478}
479
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100480static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
481static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200482
483 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100484get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200485{
486 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
487 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
488}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000489
490/*
491 * Skip over a "[]" range.
492 * "p" must point to the character after the '['.
493 * The returned pointer is on the matching ']', or the terminating NUL.
494 */
495 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100496skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000497{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000498 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000499
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100500 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000501 ++p;
502 if (*p == ']' || *p == '-')
503 ++p;
504 while (*p != NUL && *p != ']')
505 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000506 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000507 p += l;
508 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000509 if (*p == '-')
510 {
511 ++p;
512 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100513 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000514 }
515 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200516 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000517 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200518 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000519 p += 2;
520 else if (*p == '[')
521 {
522 if (get_char_class(&p) == CLASS_NONE
523 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200524 && get_coll_element(&p) == 0
525 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100526 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000527 }
528 else
529 ++p;
530 }
531
532 return p;
533}
534
535/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000536 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200537 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000538 * Take care of characters with a backslash in front of it.
539 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000540 */
541 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100542skip_regexp(
543 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200544 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200545 int magic)
546{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100547 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200548}
549
550/*
551 * Call skip_regexp() and when the delimiter does not match give an error and
552 * return NULL.
553 */
554 char_u *
555skip_regexp_err(
556 char_u *startp,
557 int delim,
558 int magic)
559{
560 char_u *p = skip_regexp(startp, delim, magic);
561
562 if (*p != delim)
563 {
564 semsg(_("E654: missing delimiter after search pattern: %s"), startp);
565 return NULL;
566 }
567 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200568}
569
570/*
571 * skip_regexp() with extra arguments:
572 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
573 * expression and change "\?" to "?". If "*newp" is not NULL the expression
574 * is changed in-place.
575 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100576 * If "magic_val" is not NULL, returns the effective magicness of the pattern
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200577 */
578 char_u *
579skip_regexp_ex(
580 char_u *startp,
581 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100582 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200583 char_u **newp,
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100584 int *dropped,
585 magic_T *magic_val)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000586{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100587 magic_T mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000588 char_u *p = startp;
589
590 if (magic)
591 mymagic = MAGIC_ON;
592 else
593 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200594 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000595
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100596 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000597 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100598 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000599 break;
600 if ((p[0] == '[' && mymagic >= MAGIC_ON)
601 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
602 {
603 p = skip_anyof(p + 1);
604 if (p[0] == NUL)
605 break;
606 }
607 else if (p[0] == '\\' && p[1] != NUL)
608 {
609 if (dirc == '?' && newp != NULL && p[1] == '?')
610 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100611 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000612 if (*newp == NULL)
613 {
614 *newp = vim_strsave(startp);
615 if (*newp != NULL)
616 p = *newp + (p - startp);
617 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200618 if (dropped != NULL)
619 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000620 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000621 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000622 else
623 ++p;
624 }
625 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100626 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000627 if (*p == 'v')
628 mymagic = MAGIC_ALL;
629 else if (*p == 'V')
630 mymagic = MAGIC_NONE;
631 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000632 }
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100633 if (magic_val != NULL)
634 *magic_val = mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000635 return p;
636}
637
/*
 * Functions for getting characters from the regexp input.
 */
static int	prevchr_len;	// byte length of previous char
static int	at_start;	// True when on the first character
static int	prev_at_start;	// True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100644
Bram Moolenaar071d4272004-06-13 20:20:40 +0000645/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200646 * Start parsing at "str".
647 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000648 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100649initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000650{
651 regparse = str;
652 prevchr_len = 0;
653 curchr = prevprevchr = prevchr = nextchr = -1;
654 at_start = TRUE;
655 prev_at_start = FALSE;
656}
657
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200658/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200659 * Save the current parse state, so that it can be restored and parsing
660 * starts in the same state again.
661 */
662 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100663save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200664{
665 ps->regparse = regparse;
666 ps->prevchr_len = prevchr_len;
667 ps->curchr = curchr;
668 ps->prevchr = prevchr;
669 ps->prevprevchr = prevprevchr;
670 ps->nextchr = nextchr;
671 ps->at_start = at_start;
672 ps->prev_at_start = prev_at_start;
673 ps->regnpar = regnpar;
674}
675
676/*
677 * Restore a previously saved parse state.
678 */
679 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100680restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200681{
682 regparse = ps->regparse;
683 prevchr_len = ps->prevchr_len;
684 curchr = ps->curchr;
685 prevchr = ps->prevchr;
686 prevprevchr = ps->prevprevchr;
687 nextchr = ps->nextchr;
688 at_start = ps->at_start;
689 prev_at_start = ps->prev_at_start;
690 regnpar = ps->regnpar;
691}
692
693
694/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695 * Get the next character without advancing.
696 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000697 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100698peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000699{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000700 static int after_slash = FALSE;
701
Bram Moolenaar071d4272004-06-13 20:20:40 +0000702 if (curchr == -1)
703 {
704 switch (curchr = regparse[0])
705 {
706 case '.':
707 case '[':
708 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100709 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000710 if (reg_magic >= MAGIC_ON)
711 curchr = Magic(curchr);
712 break;
713 case '(':
714 case ')':
715 case '{':
716 case '%':
717 case '+':
718 case '=':
719 case '?':
720 case '@':
721 case '!':
722 case '&':
723 case '|':
724 case '<':
725 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100726 case '#': // future ext.
727 case '"': // future ext.
728 case '\'': // future ext.
729 case ',': // future ext.
730 case '-': // future ext.
731 case ':': // future ext.
732 case ';': // future ext.
733 case '`': // future ext.
734 case '/': // Can't be used in / command
735 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000736 if (reg_magic == MAGIC_ALL)
737 curchr = Magic(curchr);
738 break;
739 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100740 // * is not magic as the very first character, eg "?*ptr", when
741 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
742 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000743 if (reg_magic >= MAGIC_ON
744 && !at_start
745 && !(prev_at_start && prevchr == Magic('^'))
746 && (after_slash
747 || (prevchr != Magic('(')
748 && prevchr != Magic('&')
749 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000750 curchr = Magic('*');
751 break;
752 case '^':
	    // '^' is only magic as the very first character, after "\v", or
	    // when it comes after "\(", "\|", "\&" or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000755 if (reg_magic >= MAGIC_OFF
756 && (at_start
757 || reg_magic == MAGIC_ALL
758 || prevchr == Magic('(')
759 || prevchr == Magic('|')
760 || prevchr == Magic('&')
761 || prevchr == Magic('n')
762 || (no_Magic(prevchr) == '('
763 && prevprevchr == Magic('%'))))
764 {
765 curchr = Magic('^');
766 at_start = TRUE;
767 prev_at_start = FALSE;
768 }
769 break;
770 case '$':
	    // '$' is only magic as the very last char, after "\v", or when it
	    // is in front of either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000773 if (reg_magic >= MAGIC_OFF)
774 {
775 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200776 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000777
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100778 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000779 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200780 || p[1] == 'm' || p[1] == 'M'
781 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
782 {
783 if (p[1] == 'v')
784 is_magic_all = TRUE;
785 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
786 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000787 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200788 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000789 if (p[0] == NUL
790 || (p[0] == '\\'
791 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
792 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200793 || (is_magic_all
794 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000795 || reg_magic == MAGIC_ALL)
796 curchr = Magic('$');
797 }
798 break;
799 case '\\':
800 {
801 int c = regparse[1];
802
803 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100804 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000805 else if (
806#ifdef EBCDIC
807 vim_strchr(META, c)
808#else
809 c <= '~' && META_flags[c]
810#endif
811 )
812 {
813 /*
814 * META contains everything that may be magic sometimes,
815 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200816 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000817 * magicness. Therefore, \ is so meta-magic that it is
818 * not in META.
819 */
820 curchr = -1;
821 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100822 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000823 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000824 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000825 peekchr();
826 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000827 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000828 curchr = toggle_Magic(curchr);
829 }
830 else if (vim_strchr(REGEXP_ABBR, c))
831 {
832 /*
833 * Handle abbreviations, like "\t" for TAB -- webb
834 */
835 curchr = backslash_trans(c);
836 }
837 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
838 curchr = toggle_Magic(c);
839 else
840 {
841 /*
842 * Next character can never be (made) magic?
843 * Then backslashing it won't do anything.
844 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000845 if (has_mbyte)
846 curchr = (*mb_ptr2char)(regparse + 1);
847 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000848 curchr = c;
849 }
850 break;
851 }
852
Bram Moolenaar071d4272004-06-13 20:20:40 +0000853 default:
854 if (has_mbyte)
855 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000856 }
857 }
858
859 return curchr;
860}
861
862/*
863 * Eat one lexed character. Do this in a way that we can undo it.
864 */
865 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100866skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000867{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100868 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000869 if (*regparse == '\\')
870 prevchr_len = 1;
871 else
872 prevchr_len = 0;
873 if (regparse[prevchr_len] != NUL)
874 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000875 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100876 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000877 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000878 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000879 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000880 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000881 ++prevchr_len;
882 }
883 regparse += prevchr_len;
884 prev_at_start = at_start;
885 at_start = FALSE;
886 prevprevchr = prevchr;
887 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100888 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000889 nextchr = -1;
890}
891
892/*
893 * Skip a character while keeping the value of prev_at_start for at_start.
894 * prevchr and prevprevchr are also kept.
895 */
896 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100897skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000898{
899 int as = prev_at_start;
900 int pr = prevchr;
901 int prpr = prevprevchr;
902
903 skipchr();
904 at_start = as;
905 prevchr = pr;
906 prevprevchr = prpr;
907}
908
/*
 * Return the next lexed character of the pattern and consume it.
 * This is peekchr(), which knows about magicness, followed by skipchr().
 */
    static int
getchr(void)
{
    int	    c;

    c = peekchr();
    skipchr();
    return c;
}
921
922/*
923 * put character back. Works only once!
924 */
925 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100926ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000927{
928 nextchr = curchr;
929 curchr = prevchr;
930 prevchr = prevprevchr;
931 at_start = prev_at_start;
932 prev_at_start = FALSE;
933
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100934 // Backup regparse, so that it's at the same position as before the
935 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000936 regparse -= prevchr_len;
937}
938
939/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000940 * Get and return the value of the hex string at the current position.
941 * Return -1 if there is no valid hex number.
942 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000943 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000944 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000945 * The parameter controls the maximum number of input characters. This will be
946 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
947 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100948 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100949gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000950{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100951 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000952 int c;
953 int i;
954
955 for (i = 0; i < maxinputlen; ++i)
956 {
957 c = regparse[0];
958 if (!vim_isxdigit(c))
959 break;
960 nr <<= 4;
961 nr |= hex2nr(c);
962 ++regparse;
963 }
964
965 if (i == 0)
966 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100967 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000968}
969
970/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200971 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000972 * current position. Return -1 for invalid. Consumes all digits.
973 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100974 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100975getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000976{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100977 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000978 int c;
979 int i;
980
981 for (i = 0; ; ++i)
982 {
983 c = regparse[0];
984 if (c < '0' || c > '9')
985 break;
986 nr *= 10;
987 nr += c - '0';
988 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100989 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000990 }
991
992 if (i == 0)
993 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100994 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000995}
996
997/*
998 * get and return the value of the octal string immediately after the current
999 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
1000 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
1001 * treat 8 or 9 as recognised characters. Position is updated:
1002 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +00001003 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001004 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001005 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01001006getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001007{
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001008 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001009 int c;
1010 int i;
1011
1012 for (i = 0; i < 3 && nr < 040; ++i)
1013 {
1014 c = regparse[0];
1015 if (c < '0' || c > '7')
1016 break;
1017 nr <<= 3;
1018 nr |= hex2nr(c);
1019 ++regparse;
1020 }
1021
1022 if (i == 0)
1023 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001024 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001025}
1026
1027/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001028 * read_limits - Read two integers to be taken as a minimum and maximum.
1029 * If the first character is '-', then the range is reversed.
1030 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1031 * missing, a very big number is the default.
1032 */
1033 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001034read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001035{
1036 int reverse = FALSE;
1037 char_u *first_char;
1038 long tmp;
1039
1040 if (*regparse == '-')
1041 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001042 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001043 regparse++;
1044 reverse = TRUE;
1045 }
1046 first_char = regparse;
1047 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001048 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001049 {
1050 if (vim_isdigit(*++regparse))
1051 *maxval = getdigits(&regparse);
1052 else
1053 *maxval = MAX_LIMIT;
1054 }
1055 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001056 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001057 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001058 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001059 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001060 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001061 if (*regparse != '}')
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001062 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1063 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001064
1065 /*
1066 * Reverse the range if there was a '-', or make sure it is in the right
1067 * order otherwise.
1068 */
1069 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1070 {
1071 tmp = *minval;
1072 *minval = *maxval;
1073 *maxval = tmp;
1074 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001075 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001076 return OK;
1077}
1078
1079/*
1080 * vim_regexec and friends
1081 */
1082
1083/*
1084 * Global work variables for vim_regexec().
1085 */
1086
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001087static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001088#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001089static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001090#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001091static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001092static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001093
1094/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001095 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1096 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001097 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001098 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001099static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001100static unsigned reg_tofreelen;
1101
/*
 * Structure used to store the execution state of the regex engine.
 * Which ones are set depends on whether a single-line or multi-line match is
 * done:
 *			single-line		multi-line
 * reg_match		&regmatch_T		NULL
 * reg_mmatch		NULL			&regmmatch_T
 * reg_startp		reg_match->startp	<invalid>
 * reg_endp		reg_match->endp		<invalid>
 * reg_startpos		<invalid>		reg_mmatch->startpos
 * reg_endpos		<invalid>		reg_mmatch->endpos
 * reg_win		NULL			window in which to search
 * reg_buf		curbuf			buffer in which to search
 * reg_firstlnum	<invalid>		first line in which to search
 * reg_maxline		0			last line nr
 * reg_line_lbr		FALSE or TRUE		FALSE
 */
typedef struct {
    regmatch_T		*reg_match;
    regmmatch_T		*reg_mmatch;
    char_u		**reg_startp;
    char_u		**reg_endp;
    lpos_T		*reg_startpos;
    lpos_T		*reg_endpos;
    win_T		*reg_win;
    buf_T		*reg_buf;
    linenr_T		reg_firstlnum;
    linenr_T		reg_maxline;
    int			reg_line_lbr;	// "\n" in string is line break

    // The current match-position is stored in these variables:
    linenr_T	lnum;		// line number, relative to first line
    char_u	*line;		// start of current line
    char_u	*input;		// current input, points into "line"

    int	need_clear_subexpr;	// subexpressions still need to be cleared
#ifdef FEAT_SYN_HL
    int	need_clear_zsubexpr;	// extmatch subexpressions still need to be
				// cleared
#endif

    // Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
    // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
    // contains '\c' or '\C' the value is overruled.
    int			reg_ic;

    // Similar to "reg_ic", but only for 'combining' characters.  Set with \Z
    // flag in the regexp.  Defaults to false, always.
    int			reg_icombine;

    // Copy of "rmm_maxcol": maximum column to search for a match.  Zero when
    // there is no maximum.
    colnr_T		reg_maxcol;

    // State for the NFA engine regexec.
    int nfa_has_zend;	    // NFA regexp \ze operator encountered.
    int nfa_has_backref;    // NFA regexp \1 .. \9 encountered.
    int nfa_nsubexpr;	    // Number of sub expressions actually being used
			    // during execution. 1 if only the whole match
			    // (subexpr 0) is used.
    // listid is global, so that it increases on recursive calls to
    // nfa_regmatch(), which means we don't have to clear the lastlist field of
    // all the states.
    int nfa_listid;
    int nfa_alt_listid;

#ifdef FEAT_SYN_HL
    int nfa_has_zsubexpr;   // NFA regexp has \z( ), set zsubexpr.
#endif
} regexec_T;
1172
1173static regexec_T rex;
1174static int rex_in_use = FALSE;
1175
Bram Moolenaar071d4272004-06-13 20:20:40 +00001176/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001177 * Return TRUE if character 'c' is included in 'iskeyword' option for
1178 * "reg_buf" buffer.
1179 */
1180 static int
1181reg_iswordc(int c)
1182{
1183 return vim_iswordc_buf(c, rex.reg_buf);
1184}
1185
1186/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001187 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1188 */
1189 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001190reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001191{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001192 // when looking behind for a match/no-match lnum is negative. But we
1193 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001194 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001195 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001196 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001197 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001198 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001199 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001200}
1201
Bram Moolenaar071d4272004-06-13 20:20:40 +00001202#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001203static char_u *reg_startzp[NSUBEXP]; // Workspace to mark beginning
1204static char_u *reg_endzp[NSUBEXP]; // and end of \z(...\) matches
1205static lpos_T reg_startzpos[NSUBEXP]; // idem, beginning pos
1206static lpos_T reg_endzpos[NSUBEXP]; // idem, end pos
Bram Moolenaar071d4272004-06-13 20:20:40 +00001207#endif
1208
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001209// TRUE if using multi-line regexp.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001210#define REG_MULTI (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001211
Bram Moolenaar071d4272004-06-13 20:20:40 +00001212#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001213/*
1214 * Create a new extmatch and mark it as referenced once.
1215 */
1216 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001217make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001218{
1219 reg_extmatch_T *em;
1220
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001221 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001222 if (em != NULL)
1223 em->refcnt = 1;
1224 return em;
1225}
1226
1227/*
1228 * Add a reference to an extmatch.
1229 */
1230 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001231ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001232{
1233 if (em != NULL)
1234 em->refcnt++;
1235 return em;
1236}
1237
1238/*
1239 * Remove a reference to an extmatch. If there are no references left, free
1240 * the info.
1241 */
1242 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001243unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001244{
1245 int i;
1246
1247 if (em != NULL && --em->refcnt <= 0)
1248 {
1249 for (i = 0; i < NSUBEXP; ++i)
1250 vim_free(em->matches[i]);
1251 vim_free(em);
1252 }
1253}
1254#endif
1255
1256/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001257 * Get class of previous character.
1258 */
1259 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001260reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001261{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001262 if (rex.input > rex.line)
1263 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001264 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001265 return -1;
1266}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001267
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001268/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001269 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001270 */
1271 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001272reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001273{
1274 pos_T top, bot;
1275 linenr_T lnum;
1276 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001277 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001278 int mode;
1279 colnr_T start, end;
1280 colnr_T start2, end2;
1281 colnr_T cols;
1282
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001283 // Check if the buffer is the current buffer.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001284 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001285 return FALSE;
1286
1287 if (VIsual_active)
1288 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001289 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001290 {
1291 top = VIsual;
1292 bot = wp->w_cursor;
1293 }
1294 else
1295 {
1296 top = wp->w_cursor;
1297 bot = VIsual;
1298 }
1299 mode = VIsual_mode;
1300 }
1301 else
1302 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001303 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001304 {
1305 top = curbuf->b_visual.vi_start;
1306 bot = curbuf->b_visual.vi_end;
1307 }
1308 else
1309 {
1310 top = curbuf->b_visual.vi_end;
1311 bot = curbuf->b_visual.vi_start;
1312 }
1313 mode = curbuf->b_visual.vi_mode;
1314 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001315 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001316 if (lnum < top.lnum || lnum > bot.lnum)
1317 return FALSE;
1318
1319 if (mode == 'v')
1320 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001321 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001322 if ((lnum == top.lnum && col < top.col)
1323 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1324 return FALSE;
1325 }
1326 else if (mode == Ctrl_V)
1327 {
1328 getvvcol(wp, &top, &start, NULL, &end);
1329 getvvcol(wp, &bot, &start2, NULL, &end2);
1330 if (start2 < start)
1331 start = start2;
1332 if (end2 > end)
1333 end = end2;
1334 if (top.col == MAXCOL || bot.col == MAXCOL)
1335 end = MAXCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001336 cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001337 if (cols < start || cols > end - (*p_sel == 'e'))
1338 return FALSE;
1339 }
1340 return TRUE;
1341}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001342
Bram Moolenaar071d4272004-06-13 20:20:40 +00001343/*
1344 * Check the regexp program for its magic number.
1345 * Return TRUE if it's wrong.
1346 */
1347 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001348prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001349{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001350 regprog_T *prog;
1351
Bram Moolenaar6100d022016-10-02 16:51:57 +02001352 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001353 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001354 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001355 return FALSE;
1356
1357 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001358 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001359 emsg(_(e_re_corr));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001360 return TRUE;
1361 }
1362 return FALSE;
1363}
1364
1365/*
1366 * Cleanup the subexpressions, if this wasn't done yet.
1367 * This construction is used to clear the subexpressions only when they are
1368 * used (to increase speed).
1369 */
1370 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001371cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001372{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001373 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001374 {
1375 if (REG_MULTI)
1376 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001377 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001378 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1379 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001380 }
1381 else
1382 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001383 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1384 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001385 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001386 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001387 }
1388}
1389
1390#ifdef FEAT_SYN_HL
1391 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001392cleanup_zsubexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001393{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001394 if (rex.need_clear_zsubexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001395 {
1396 if (REG_MULTI)
1397 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001398 // Use 0xff to set lnum to -1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001399 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1400 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1401 }
1402 else
1403 {
1404 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
1405 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
1406 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001407 rex.need_clear_zsubexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001408 }
1409}
1410#endif
1411
1412/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001413 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001414 */
1415 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001416reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001417{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001418 rex.line = reg_getline(++rex.lnum);
1419 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001420 fast_breakcheck();
1421}
1422
1423/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001424 * Check whether a backreference matches.
1425 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001426 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1427 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001428 */
1429 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001430match_with_backref(
1431 linenr_T start_lnum,
1432 colnr_T start_col,
1433 linenr_T end_lnum,
1434 colnr_T end_col,
1435 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001436{
1437 linenr_T clnum = start_lnum;
1438 colnr_T ccol = start_col;
1439 int len;
1440 char_u *p;
1441
1442 if (bytelen != NULL)
1443 *bytelen = 0;
1444 for (;;)
1445 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001446 // Since getting one line may invalidate the other, need to make copy.
1447 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001448 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001449 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001450 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001451 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1452 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001453 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001454 vim_free(reg_tofree);
1455 reg_tofree = alloc(len);
1456 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001457 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001458 reg_tofreelen = len;
1459 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001460 STRCPY(reg_tofree, rex.line);
1461 rex.input = reg_tofree + (rex.input - rex.line);
1462 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001463 }
1464
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001465 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001466 p = reg_getline(clnum);
1467 if (clnum == end_lnum)
1468 len = end_col - ccol;
1469 else
1470 len = (int)STRLEN(p + ccol);
1471
Bram Moolenaar0270f382018-07-17 05:43:58 +02001472 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001473 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001474 if (bytelen != NULL)
1475 *bytelen += len;
1476 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001477 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001478 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001479 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001480
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001481 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001482 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001483 if (bytelen != NULL)
1484 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001485 ++clnum;
1486 ccol = 0;
1487 if (got_int)
1488 return RA_FAIL;
1489 }
1490
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001491 // found a match! Note that rex.line may now point to a copy of the line,
1492 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001493 return RA_MATCH;
1494}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001495
Bram Moolenaarfb031402014-09-09 17:18:49 +02001496/*
1497 * Used in a place where no * or \+ can follow.
1498 */
1499 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001500re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001501{
1502 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001503 {
1504 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1505 rc_did_emsg = TRUE;
1506 return FAIL;
1507 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001508 return OK;
1509}
1510
// One decomposition: up to three characters, unused ones are zero.
typedef struct
{
    int a, b, c;
} decomp_T;


// Decompositions of the characters 0xfb20 - 0xfb4f, indexed by the character
// minus 0xfb20.  An entry marked UNUSED decomposes into the character itself.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2, 0, 0},		// 0xfb20	alt ayin
    {0x5d0, 0, 0},		// 0xfb21	alt alef
    {0x5d3, 0, 0},		// 0xfb22	alt dalet
    {0x5d4, 0, 0},		// 0xfb23	alt he
    {0x5db, 0, 0},		// 0xfb24	alt kaf
    {0x5dc, 0, 0},		// 0xfb25	alt lamed
    {0x5dd, 0, 0},		// 0xfb26	alt mem-sofit
    {0x5e8, 0, 0},		// 0xfb27	alt resh
    {0x5ea, 0, 0},		// 0xfb28	alt tav
    {'+', 0, 0},		// 0xfb29	alt plus
    {0x5e9, 0x5c1, 0},		// 0xfb2a	shin+shin-dot
    {0x5e9, 0x5c2, 0},		// 0xfb2b	shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},	// 0xfb2c	shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},	// 0xfb2d	shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},		// 0xfb2e	alef+patah
    {0x5d0, 0x5b8, 0},		// 0xfb2f	alef+qamats
    // U+FB30 is ALEF WITH MAPIQ; the mapiq has the same code point as the
    // dagesh (0x5bc).  This entry used to have the hiriq (0x5b4).
    {0x5d0, 0x5bc, 0},		// 0xfb30	alef+mapiq
    {0x5d1, 0x5bc, 0},		// 0xfb31	bet+dagesh
    {0x5d2, 0x5bc, 0},		// 0xfb32	gimel+dagesh
    {0x5d3, 0x5bc, 0},		// 0xfb33	dalet+dagesh
    {0x5d4, 0x5bc, 0},		// 0xfb34	he+dagesh
    {0x5d5, 0x5bc, 0},		// 0xfb35	vav+dagesh
    {0x5d6, 0x5bc, 0},		// 0xfb36	zayin+dagesh
    {0xfb37, 0, 0},		// 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},		// 0xfb38	tet+dagesh
    {0x5d9, 0x5bc, 0},		// 0xfb39	yud+dagesh
    {0x5da, 0x5bc, 0},		// 0xfb3a	kaf sofit+dagesh
    {0x5db, 0x5bc, 0},		// 0xfb3b	kaf+dagesh
    {0x5dc, 0x5bc, 0},		// 0xfb3c	lamed+dagesh
    {0xfb3d, 0, 0},		// 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},		// 0xfb3e	mem+dagesh
    {0xfb3f, 0, 0},		// 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},		// 0xfb40	nun+dagesh
    {0x5e1, 0x5bc, 0},		// 0xfb41	samech+dagesh
    {0xfb42, 0, 0},		// 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},		// 0xfb43	pe sofit+dagesh
    {0x5e4, 0x5bc, 0},		// 0xfb44	pe+dagesh
    {0xfb45, 0, 0},		// 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},		// 0xfb46	tsadi+dagesh
    {0x5e7, 0x5bc, 0},		// 0xfb47	qof+dagesh
    {0x5e8, 0x5bc, 0},		// 0xfb48	resh+dagesh
    {0x5e9, 0x5bc, 0},		// 0xfb49	shin+dagesh
    {0x5ea, 0x5bc, 0},		// 0xfb4a	tav+dagesh
    {0x5d5, 0x5b9, 0},		// 0xfb4b	vav+holam
    {0x5d1, 0x5bf, 0},		// 0xfb4c	bet+rafe
    {0x5db, 0x5bf, 0},		// 0xfb4d	kaf+rafe
    {0x5e4, 0x5bf, 0},		// 0xfb4e	pe+rafe
    {0x5d0, 0x5dc, 0}		// 0xfb4f	alef-lamed
};

/*
 * Decompose character "c" into up to three characters, stored in "*c1",
 * "*c2" and "*c3".  Unused results are set to zero.
 * Only 0xfb20 - 0xfb4f are looked up in decomp_table[]; any other character
 * is returned unchanged in "*c1".
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    const decomp_T	*d;

    if (c < 0xfb20 || c > 0xfb4f)
    {
	// not in the table: the character is its own decomposition
	*c1 = c;
	*c2 = 0;
	*c3 = 0;
	return;
    }

    d = &decomp_table[c - 0xfb20];
    *c1 = d->a;
    *c2 = d->b;
    *c3 = d->c;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001588
1589/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001590 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001591 * Return 0 if strings match, non-zero otherwise.
1592 * Correct the length "*n" when composing characters are ignored.
1593 */
1594 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001595cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001596{
1597 int result;
1598
Bram Moolenaar6100d022016-10-02 16:51:57 +02001599 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001600 result = STRNCMP(s1, s2, *n);
1601 else
1602 result = MB_STRNICMP(s1, s2, *n);
1603
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001604 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001605 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001606 {
1607 char_u *str1, *str2;
1608 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001609 int junk;
1610
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001611 // we have to handle the strcmp ourselves, since it is necessary to
1612 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001613 str1 = s1;
1614 str2 = s2;
1615 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001616 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001617 {
1618 c1 = mb_ptr2char_adv(&str1);
1619 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001620
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001621 // Decompose the character if necessary, into 'base' characters.
1622 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001623 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001624 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001625 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001626 mb_decompose(c1, &c11, &junk, &junk);
1627 mb_decompose(c2, &c12, &junk, &junk);
1628 c1 = c11;
1629 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001630 if (c11 != c12
1631 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001632 break;
1633 }
1634 }
1635 result = c2 - c1;
1636 if (result == 0)
1637 *n = (int)(str2 - s2);
1638 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001639
1640 return result;
1641}
1642
1643/*
1644 * cstrchr: This function is used a lot for simple searches, keep it fast!
1645 */
1646 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001647cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001648{
1649 char_u *p;
1650 int cc;
1651
Bram Moolenaara12a1612019-01-24 16:39:02 +01001652 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001653 return vim_strchr(s, c);
1654
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001655 // tolower() and toupper() can be slow, comparing twice should be a lot
1656 // faster (esp. when using MS Visual C++!).
1657 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001658 if (enc_utf8 && c > 0x80)
1659 cc = utf_fold(c);
1660 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001661 if (MB_ISUPPER(c))
1662 cc = MB_TOLOWER(c);
1663 else if (MB_ISLOWER(c))
1664 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001665 else
1666 return vim_strchr(s, c);
1667
Bram Moolenaar071d4272004-06-13 20:20:40 +00001668 if (has_mbyte)
1669 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001670 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001671 {
1672 if (enc_utf8 && c > 0x80)
1673 {
1674 if (utf_fold(utf_ptr2char(p)) == cc)
1675 return p;
1676 }
1677 else if (*p == c || *p == cc)
1678 return p;
1679 }
1680 }
1681 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001682 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001683 for (p = s; *p != NUL; ++p)
1684 if (*p == c || *p == cc)
1685 return p;
1686
1687 return NULL;
1688}
1689
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001690////////////////////////////////////////////////////////////////
1691// regsub stuff //
1692////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001693
/*
 * We should define fptr_T as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * void pointer.  This should work for all compilers.
 * The returned value is cast back to fptr_T by the caller.
 */
typedef void *(*fptr_T)(int *, int);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001701
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001702static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001703
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001704 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001705do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001706{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001707 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001708
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001709 return (fptr_T)NULL;
1710}
1711
1712 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001713do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001714{
1715 *d = MB_TOUPPER(c);
1716
1717 return (fptr_T)do_Upper;
1718}
1719
1720 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001721do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001722{
1723 *d = MB_TOLOWER(c);
1724
1725 return (fptr_T)NULL;
1726}
1727
1728 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001729do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001730{
1731 *d = MB_TOLOWER(c);
1732
1733 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001734}
1735
1736/*
1737 * regtilde(): Replace tildes in the pattern by the old pattern.
1738 *
1739 * Short explanation of the tilde: It stands for the previous replacement
1740 * pattern. If that previous pattern also contains a ~ we should go back a
1741 * step further... But we insert the previous pattern into the current one
1742 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001743 * This still does not handle the case where "magic" changes. So require the
1744 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001745 *
1746 * The tildes are parsed once before the first call to vim_regsub().
1747 */
1748 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001749regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001750{
1751 char_u *newsub = source;
1752 char_u *tmpsub;
1753 char_u *p;
1754 int len;
1755 int prevlen;
1756
1757 for (p = newsub; *p; ++p)
1758 {
1759 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1760 {
1761 if (reg_prev_sub != NULL)
1762 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001763 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001764 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001765 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001766 if (tmpsub != NULL)
1767 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001768 // copy prefix
1769 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001770 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001771 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001772 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001773 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001774 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001775 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001776 STRCPY(tmpsub + len + prevlen, p + 1);
1777
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001778 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001779 vim_free(newsub);
1780 newsub = tmpsub;
1781 p = newsub + len + prevlen;
1782 }
1783 }
1784 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001785 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001786 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001787 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001788 --p;
1789 }
1790 else
1791 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001792 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001793 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001794 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001795 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001796 }
1797 }
1798
1799 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001800 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001801 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001802 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001803 reg_prev_sub = vim_strsave(newsub);
1804 return newsub;
1805}
1806
#ifdef FEAT_EVAL
// TRUE when submatch() can be used
static int can_f_submatch = FALSE;

// These pointers are used for reg_submatch().  Needed for when the
// substitution string is an expression that contains a call to substitute()
// and submatch().
// vim_regsub_both() fills the fields from the corresponding "rex" fields
// before evaluating the expression.
typedef struct {
    regmatch_T  *sm_match;      // from rex.reg_match
    regmmatch_T *sm_mmatch;     // from rex.reg_mmatch
    linenr_T    sm_firstlnum;   // from rex.reg_firstlnum
    linenr_T    sm_maxline;     // from rex.reg_maxline
    int         sm_line_lbr;    // from rex.reg_line_lbr
} regsubmatch_T;

// can only be used when can_f_submatch is TRUE
static regsubmatch_T rsm;
#endif
1823
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001824#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001825
1826/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001827 * Put the submatches in "argv[argskip]" which is a list passed into
1828 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001829 */
1830 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001831fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001832{
1833 listitem_T *li;
1834 int i;
1835 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001836 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001837
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001838 if (argcount == argskip)
1839 // called function doesn't take a submatches argument
1840 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001841
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001842 // Relies on sl_list to be the first item in staticList10_T.
1843 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001844
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001845 // There are always 10 list items in staticList10_T.
1846 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001847 for (i = 0; i < 10; ++i)
1848 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001849 s = rsm.sm_match->startp[i];
1850 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001851 s = NULL;
1852 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02001853 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001854 li->li_tv.v_type = VAR_STRING;
1855 li->li_tv.vval.v_string = s;
1856 li = li->li_next;
1857 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001858 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001859}
1860
1861 static void
1862clear_submatch_list(staticList10_T *sl)
1863{
1864 int i;
1865
1866 for (i = 0; i < 10; ++i)
1867 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1868}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001869#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001870
Bram Moolenaar071d4272004-06-13 20:20:40 +00001871/*
1872 * vim_regsub() - perform substitutions after a vim_regexec() or
1873 * vim_regexec_multi() match.
1874 *
1875 * If "copy" is TRUE really copy into "dest".
1876 * If "copy" is FALSE nothing is copied, this is just to find out the length
1877 * of the result.
1878 *
1879 * If "backslash" is TRUE, a backslash will be removed later, need to double
1880 * them to keep them, and insert a backslash before a CR to avoid it being
1881 * replaced with a line break later.
1882 *
1883 * Note: The matched text must not change between the call of
1884 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1885 * references invalid!
1886 *
1887 * Returns the size of the replacement, including terminating NUL.
1888 */
1889 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001890vim_regsub(
1891 regmatch_T *rmp,
1892 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001893 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001894 char_u *dest,
1895 int copy,
1896 int magic,
1897 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001898{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001899 int result;
1900 regexec_T rex_save;
1901 int rex_in_use_save = rex_in_use;
1902
1903 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001904 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001905 rex_save = rex;
1906 rex_in_use = TRUE;
1907
1908 rex.reg_match = rmp;
1909 rex.reg_mmatch = NULL;
1910 rex.reg_maxline = 0;
1911 rex.reg_buf = curbuf;
1912 rex.reg_line_lbr = TRUE;
1913 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1914
1915 rex_in_use = rex_in_use_save;
1916 if (rex_in_use)
1917 rex = rex_save;
1918
1919 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001920}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001921
1922 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001923vim_regsub_multi(
1924 regmmatch_T *rmp,
1925 linenr_T lnum,
1926 char_u *source,
1927 char_u *dest,
1928 int copy,
1929 int magic,
1930 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001931{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001932 int result;
1933 regexec_T rex_save;
1934 int rex_in_use_save = rex_in_use;
1935
1936 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001937 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001938 rex_save = rex;
1939 rex_in_use = TRUE;
1940
1941 rex.reg_match = NULL;
1942 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001943 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001944 rex.reg_firstlnum = lnum;
1945 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1946 rex.reg_line_lbr = FALSE;
1947 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1948
1949 rex_in_use = rex_in_use_save;
1950 if (rex_in_use)
1951 rex = rex_save;
1952
1953 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001954}
1955
1956 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001957vim_regsub_both(
1958 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001959 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001960 char_u *dest,
1961 int copy,
1962 int magic,
1963 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001964{
1965 char_u *src;
1966 char_u *dst;
1967 char_u *s;
1968 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001969 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001970 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001971 fptr_T func_all = (fptr_T)NULL;
1972 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001973 linenr_T clnum = 0; // init for GCC
1974 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001975#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001976 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001977#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001978
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001979 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001980 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001981 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001982 emsg(_(e_null));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001983 return 0;
1984 }
1985 if (prog_magic_wrong())
1986 return 0;
1987 src = source;
1988 dst = dest;
1989
1990 /*
1991 * When the substitute part starts with "\=" evaluate it as an expression.
1992 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001993 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001994 {
1995#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001996 // To make sure that the length doesn't change between checking the
1997 // length and copying the string, and to speed up things, the
1998 // resulting string is saved from the call with "copy" == FALSE to the
1999 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002000 if (copy)
2001 {
2002 if (eval_result != NULL)
2003 {
2004 STRCPY(dest, eval_result);
2005 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01002006 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002007 }
2008 }
2009 else
2010 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002011 int prev_can_f_submatch = can_f_submatch;
2012 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002013
2014 vim_free(eval_result);
2015
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002016 // The expression may contain substitute(), which calls us
2017 // recursively. Make sure submatch() gets the text from the first
2018 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002019 if (can_f_submatch)
2020 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002021 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002022 rsm.sm_match = rex.reg_match;
2023 rsm.sm_mmatch = rex.reg_mmatch;
2024 rsm.sm_firstlnum = rex.reg_firstlnum;
2025 rsm.sm_maxline = rex.reg_maxline;
2026 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002027
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002028 if (expr != NULL)
2029 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002030 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002031 char_u buf[NUMBUFLEN];
2032 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002033 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002034 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002035
2036 rettv.v_type = VAR_STRING;
2037 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002038 argv[0].v_type = VAR_LIST;
2039 argv[0].vval.v_list = &matchList.sl_list;
2040 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002041 CLEAR_FIELD(funcexe);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002042 funcexe.argv_func = fill_submatch_list;
2043 funcexe.evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002044 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002045 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002046 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002047 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002048 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002049 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002050 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002051 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002052
Bram Moolenaar6100d022016-10-02 16:51:57 +02002053 s = partial_name(partial);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002054 funcexe.partial = partial;
2055 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002056 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002057 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002058 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002059 clear_submatch_list(&matchList);
2060
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002061 if (rettv.v_type == VAR_UNKNOWN)
2062 // something failed, no need to report another error
2063 eval_result = NULL;
2064 else
2065 {
2066 eval_result = tv_get_string_buf_chk(&rettv, buf);
2067 if (eval_result != NULL)
2068 eval_result = vim_strsave(eval_result);
2069 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002070 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002071 }
Bram Moolenaar4c137212021-04-19 16:48:48 +02002072 else if (substitute_instr != NULL)
2073 // Execute instructions from ISN_SUBSTITUTE.
2074 eval_result = exe_substitute_instr();
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002075 else
Bram Moolenaarb171fb12020-06-24 20:34:03 +02002076 eval_result = eval_to_string(source + 2, TRUE);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002077
Bram Moolenaar071d4272004-06-13 20:20:40 +00002078 if (eval_result != NULL)
2079 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002080 int had_backslash = FALSE;
2081
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002082 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002083 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002084 // Change NL to CR, so that it becomes a line break,
2085 // unless called from vim_regexec_nl().
2086 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002087 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002088 *s = CAR;
2089 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002090 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002091 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002092 /* Change NL to CR here too, so that this works:
2093 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2094 * abc\
2095 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002096 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002097 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002098 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002099 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002100 had_backslash = TRUE;
2101 }
2102 }
2103 if (had_backslash && backslash)
2104 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002105 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002106 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2107 if (s != NULL)
2108 {
2109 vim_free(eval_result);
2110 eval_result = s;
2111 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002112 }
2113
2114 dst += STRLEN(eval_result);
2115 }
2116
Bram Moolenaar6100d022016-10-02 16:51:57 +02002117 can_f_submatch = prev_can_f_submatch;
2118 if (can_f_submatch)
2119 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002120 }
2121#endif
2122 }
2123 else
2124 while ((c = *src++) != NUL)
2125 {
2126 if (c == '&' && magic)
2127 no = 0;
2128 else if (c == '\\' && *src != NUL)
2129 {
2130 if (*src == '&' && !magic)
2131 {
2132 ++src;
2133 no = 0;
2134 }
2135 else if ('0' <= *src && *src <= '9')
2136 {
2137 no = *src++ - '0';
2138 }
2139 else if (vim_strchr((char_u *)"uUlLeE", *src))
2140 {
2141 switch (*src++)
2142 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002143 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002144 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002145 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002146 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002147 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002148 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002149 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002150 continue;
2151 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002152 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002153 continue;
2154 }
2155 }
2156 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002157 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002158 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002159 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2160 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002161 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002162 if (copy)
2163 {
2164 *dst++ = c;
2165 *dst++ = *src++;
2166 *dst++ = *src++;
2167 }
2168 else
2169 {
2170 dst += 3;
2171 src += 2;
2172 }
2173 continue;
2174 }
2175
Bram Moolenaar071d4272004-06-13 20:20:40 +00002176 if (c == '\\' && *src != NUL)
2177 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002178 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002179 switch (*src)
2180 {
2181 case 'r': c = CAR; ++src; break;
2182 case 'n': c = NL; ++src; break;
2183 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002184 // Oh no! \e already has meaning in subst pat :-(
2185 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002186 case 'b': c = Ctrl_H; ++src; break;
2187
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002188 // If "backslash" is TRUE the backslash will be removed
2189 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002190 default: if (backslash)
2191 {
2192 if (copy)
2193 *dst = '\\';
2194 ++dst;
2195 }
2196 c = *src++;
2197 }
2198 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002199 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002200 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002201
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002202 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002203 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002204 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002205 func_one = (fptr_T)(func_one(&cc, c));
2206 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002207 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002208 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002209 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002210 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002211
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002212 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002213 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002214 int totlen = mb_ptr2len(src - 1);
2215
Bram Moolenaar071d4272004-06-13 20:20:40 +00002216 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002217 mb_char2bytes(cc, dst);
2218 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002219 if (enc_utf8)
2220 {
2221 int clen = utf_ptr2len(src - 1);
2222
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002223 // If the character length is shorter than "totlen", there
2224 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002225 if (clen < totlen)
2226 {
2227 if (copy)
2228 mch_memmove(dst + 1, src - 1 + clen,
2229 (size_t)(totlen - clen));
2230 dst += totlen - clen;
2231 }
2232 }
2233 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002234 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002235 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002236 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002237 dst++;
2238 }
2239 else
2240 {
2241 if (REG_MULTI)
2242 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002243 clnum = rex.reg_mmatch->startpos[no].lnum;
2244 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002245 s = NULL;
2246 else
2247 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002248 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2249 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2250 len = rex.reg_mmatch->endpos[no].col
2251 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002252 else
2253 len = (int)STRLEN(s);
2254 }
2255 }
2256 else
2257 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002258 s = rex.reg_match->startp[no];
2259 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002260 s = NULL;
2261 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002262 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002263 }
2264 if (s != NULL)
2265 {
2266 for (;;)
2267 {
2268 if (len == 0)
2269 {
2270 if (REG_MULTI)
2271 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002272 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002273 break;
2274 if (copy)
2275 *dst = CAR;
2276 ++dst;
2277 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002278 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2279 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002280 else
2281 len = (int)STRLEN(s);
2282 }
2283 else
2284 break;
2285 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002286 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002287 {
2288 if (copy)
Bram Moolenaare83cca22020-09-07 18:53:21 +02002289 iemsg(_(e_re_damg));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002290 goto exit;
2291 }
2292 else
2293 {
2294 if (backslash && (*s == CAR || *s == '\\'))
2295 {
2296 /*
2297 * Insert a backslash in front of a CR, otherwise
2298 * it will be replaced by a line break.
2299 * Number of backslashes will be halved later,
2300 * double them here.
2301 */
2302 if (copy)
2303 {
2304 dst[0] = '\\';
2305 dst[1] = *s;
2306 }
2307 dst += 2;
2308 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002309 else
2310 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002311 if (has_mbyte)
2312 c = mb_ptr2char(s);
2313 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002314 c = *s;
2315
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002316 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002317 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002318 func_one = (fptr_T)(func_one(&cc, c));
2319 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002320 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002321 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002322 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002323 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002324
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002325 if (has_mbyte)
2326 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002327 int l;
2328
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002329 // Copy composing characters separately, one
2330 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002331 if (enc_utf8)
2332 l = utf_ptr2len(s) - 1;
2333 else
2334 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002335
2336 s += l;
2337 len -= l;
2338 if (copy)
2339 mb_char2bytes(cc, dst);
2340 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002341 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002342 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002343 *dst = cc;
2344 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002345 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002346
Bram Moolenaar071d4272004-06-13 20:20:40 +00002347 ++s;
2348 --len;
2349 }
2350 }
2351 }
2352 no = -1;
2353 }
2354 }
2355 if (copy)
2356 *dst = NUL;
2357
2358exit:
2359 return (int)((dst - dest) + 1);
2360}
2361
2362#ifdef FEAT_EVAL
2363/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002364 * Call reg_getline() with the line numbers from the submatch. If a
2365 * substitute() was used the reg_maxline and other values have been
2366 * overwritten.
2367 */
2368 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002369reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002370{
2371 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002372 linenr_T save_first = rex.reg_firstlnum;
2373 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002374
Bram Moolenaar6100d022016-10-02 16:51:57 +02002375 rex.reg_firstlnum = rsm.sm_firstlnum;
2376 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002377
2378 s = reg_getline(lnum);
2379
Bram Moolenaar6100d022016-10-02 16:51:57 +02002380 rex.reg_firstlnum = save_first;
2381 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002382 return s;
2383}
2384
2385/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002386 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002387 * allocated memory.
2388 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2389 */
2390 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002391reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002392{
2393 char_u *retval = NULL;
2394 char_u *s;
2395 int len;
2396 int round;
2397 linenr_T lnum;
2398
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002399 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002400 return NULL;
2401
Bram Moolenaar6100d022016-10-02 16:51:57 +02002402 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002403 {
2404 /*
2405 * First round: compute the length and allocate memory.
2406 * Second round: copy the text.
2407 */
2408 for (round = 1; round <= 2; ++round)
2409 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002410 lnum = rsm.sm_mmatch->startpos[no].lnum;
2411 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002412 return NULL;
2413
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002414 s = reg_getline_submatch(lnum);
2415 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002416 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002417 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002418 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002419 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002420 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002421 len = rsm.sm_mmatch->endpos[no].col
2422 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002423 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002424 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002425 ++len;
2426 }
2427 else
2428 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002429 // Multiple lines: take start line from start col, middle
2430 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002431 len = (int)STRLEN(s);
2432 if (round == 2)
2433 {
2434 STRCPY(retval, s);
2435 retval[len] = '\n';
2436 }
2437 ++len;
2438 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002439 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002440 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002441 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002442 if (round == 2)
2443 STRCPY(retval + len, s);
2444 len += (int)STRLEN(s);
2445 if (round == 2)
2446 retval[len] = '\n';
2447 ++len;
2448 }
2449 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002450 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002451 rsm.sm_mmatch->endpos[no].col);
2452 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002453 if (round == 2)
2454 retval[len] = NUL;
2455 ++len;
2456 }
2457
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002458 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002459 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002460 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002461 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002462 return NULL;
2463 }
2464 }
2465 }
2466 else
2467 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002468 s = rsm.sm_match->startp[no];
2469 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002470 retval = NULL;
2471 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002472 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002473 }
2474
2475 return retval;
2476}
Bram Moolenaar41571762014-04-02 19:00:58 +02002477
2478/*
2479 * Used for the submatch() function with the optional non-zero argument: get
2480 * the list of strings from the n'th submatch in allocated memory with NULs
2481 * represented in NLs.
2482 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2483 * command, for a non-existing submatch and for any error.
2484 */
2485 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002486reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002487{
2488 char_u *s;
2489 linenr_T slnum;
2490 linenr_T elnum;
2491 colnr_T scol;
2492 colnr_T ecol;
2493 int i;
2494 list_T *list;
2495 int error = FALSE;
2496
2497 if (!can_f_submatch || no < 0)
2498 return NULL;
2499
Bram Moolenaar6100d022016-10-02 16:51:57 +02002500 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002501 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002502 slnum = rsm.sm_mmatch->startpos[no].lnum;
2503 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002504 if (slnum < 0 || elnum < 0)
2505 return NULL;
2506
Bram Moolenaar6100d022016-10-02 16:51:57 +02002507 scol = rsm.sm_mmatch->startpos[no].col;
2508 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002509
2510 list = list_alloc();
2511 if (list == NULL)
2512 return NULL;
2513
2514 s = reg_getline_submatch(slnum) + scol;
2515 if (slnum == elnum)
2516 {
2517 if (list_append_string(list, s, ecol - scol) == FAIL)
2518 error = TRUE;
2519 }
2520 else
2521 {
2522 if (list_append_string(list, s, -1) == FAIL)
2523 error = TRUE;
2524 for (i = 1; i < elnum - slnum; i++)
2525 {
2526 s = reg_getline_submatch(slnum + i);
2527 if (list_append_string(list, s, -1) == FAIL)
2528 error = TRUE;
2529 }
2530 s = reg_getline_submatch(elnum);
2531 if (list_append_string(list, s, ecol) == FAIL)
2532 error = TRUE;
2533 }
2534 }
2535 else
2536 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002537 s = rsm.sm_match->startp[no];
2538 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002539 return NULL;
2540 list = list_alloc();
2541 if (list == NULL)
2542 return NULL;
2543 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002544 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002545 error = TRUE;
2546 }
2547
2548 if (error)
2549 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002550 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002551 return NULL;
2552 }
Bram Moolenaar8a0dcf42020-09-06 15:14:45 +02002553 ++list->lv_refcount;
Bram Moolenaar41571762014-04-02 19:00:58 +02002554 return list;
2555}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002556#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002557
Bram Moolenaarf4140482020-02-15 23:06:45 +01002558/*
2559 * Initialize the values used for matching against multiple lines
2560 */
2561 static void
2562init_regexec_multi(
2563 regmmatch_T *rmp,
2564 win_T *win, // window in which to search or NULL
2565 buf_T *buf, // buffer in which to search
2566 linenr_T lnum) // nr of line to start looking for match
2567{
2568 rex.reg_match = NULL;
2569 rex.reg_mmatch = rmp;
2570 rex.reg_buf = buf;
2571 rex.reg_win = win;
2572 rex.reg_firstlnum = lnum;
2573 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2574 rex.reg_line_lbr = FALSE;
2575 rex.reg_ic = rmp->rmm_ic;
2576 rex.reg_icombine = FALSE;
2577 rex.reg_maxcol = rmp->rmm_maxcol;
2578}
2579
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002580#include "regexp_bt.c"
2581
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002582static regengine_T bt_regengine =
2583{
2584 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002585 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002586 bt_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002587 bt_regexec_multi,
2588 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002589};
2590
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002591#include "regexp_nfa.c"
2592
2593static regengine_T nfa_regengine =
2594{
2595 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002596 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002597 nfa_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002598 nfa_regexec_multi,
2599 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002600};
2601
// The regexp engine that vim_regcomp() selected for the pattern being
// compiled: taken from 'regexpengine' or from a "\%#=" prefix.
// The values must match with 'regexpengine'.
static int regexp_engine = 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002605
#ifdef DEBUG
// Engine names for the debug message in vim_regcomp(), indexed by the engine
// number.
static char_u regname[][30] = {
    "AUTOMATIC Regexp Engine",
    "BACKTRACKING Regexp Engine",
    "NFA Regexp Engine"
};
#endif
2613
2614/*
2615 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002616 * Returns the program in allocated memory.
2617 * Use vim_regfree() to free the memory.
2618 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002619 */
2620 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002621vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002622{
2623 regprog_T *prog = NULL;
2624 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002625 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002626
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002627 regexp_engine = p_re;
2628
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002629 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002630 if (STRNCMP(expr, "\\%#=", 4) == 0)
2631 {
2632 int newengine = expr[4] - '0';
2633
2634 if (newengine == AUTOMATIC_ENGINE
2635 || newengine == BACKTRACKING_ENGINE
2636 || newengine == NFA_ENGINE)
2637 {
2638 regexp_engine = expr[4] - '0';
2639 expr += 5;
2640#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002641 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002642 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002643#endif
2644 }
2645 else
2646 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002647 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002648 regexp_engine = AUTOMATIC_ENGINE;
2649 }
2650 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002651#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002652 bt_regengine.expr = expr;
2653 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002654#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002655 // reg_iswordc() uses rex.reg_buf
2656 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002657
2658 /*
2659 * First try the NFA engine, unless backtracking was requested.
2660 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002661 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002662 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002663 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002664 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002665 else
2666 prog = bt_regengine.regcomp(expr, re_flags);
2667
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002668 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002669 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002670 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002671#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002672 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002673 {
2674 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002675 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002676 if (f)
2677 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002678 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002679 fclose(f);
2680 }
2681 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002682 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002683 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002684 }
2685#endif
2686 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002687 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002688 * The NFA engine also fails for patterns that it can't handle well
2689 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002690 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002691 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002692 if (regexp_engine == AUTOMATIC_ENGINE
2693 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002694 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002695 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002696#ifdef FEAT_EVAL
2697 report_re_switch(expr);
2698#endif
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002699 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002700 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002701 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002702
Bram Moolenaarfda37292014-11-05 14:27:36 +01002703 if (prog != NULL)
2704 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002705 // Store the info needed to call regcomp() again when the engine turns
2706 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002707 prog->re_engine = regexp_engine;
2708 prog->re_flags = re_flags;
2709 }
2710
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002711 return prog;
2712}
2713
2714/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002715 * Free a compiled regexp program, returned by vim_regcomp().
2716 */
2717 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002718vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002719{
2720 if (prog != NULL)
2721 prog->engine->regfree(prog);
2722}
2723
#if defined(EXITFREE) || defined(PROTO)
/*
 * Free the memory held by the file-level regexp variables: the "reg_tofree"
 * and "reg_prev_sub" allocations and the "regstack" and "backpos" growarrays.
 */
    void
free_regexp_stuff(void)
{
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);
    ga_clear(&regstack);
    ga_clear(&backpos);
}
#endif
2734
#ifdef FEAT_EVAL
/*
 * When 'verbose' is above zero: give a message that the backtracking engine
 * is going to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
	return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2748
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
	|| defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog", which is set while the program is
 * being executed.
 */
    int
regprog_in_use(regprog_T *prog)
{
    int	    busy = prog->re_in_use;

    return busy;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002760
2761/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002762 * Match a regexp against a string.
2763 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002764 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002765 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002766 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002767 *
2768 * Return TRUE if there is a match, FALSE if not.
2769 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002770 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002771vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002772 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002773 char_u *line, // string to match against
2774 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002775 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002776{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002777 int result;
2778 regexec_T rex_save;
2779 int rex_in_use_save = rex_in_use;
2780
Bram Moolenaar0270f382018-07-17 05:43:58 +02002781 // Cannot use the same prog recursively, it contains state.
2782 if (rmp->regprog->re_in_use)
2783 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002784 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002785 return FALSE;
2786 }
2787 rmp->regprog->re_in_use = TRUE;
2788
Bram Moolenaar6100d022016-10-02 16:51:57 +02002789 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002790 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002791 rex_save = rex;
2792 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002793
Bram Moolenaar6100d022016-10-02 16:51:57 +02002794 rex.reg_startp = NULL;
2795 rex.reg_endp = NULL;
2796 rex.reg_startpos = NULL;
2797 rex.reg_endpos = NULL;
2798
2799 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002800 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002801
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002802 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002803 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2804 && result == NFA_TOO_EXPENSIVE)
2805 {
2806 int save_p_re = p_re;
2807 int re_flags = rmp->regprog->re_flags;
2808 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2809
2810 p_re = BACKTRACKING_ENGINE;
2811 vim_regfree(rmp->regprog);
2812 if (pat != NULL)
2813 {
2814#ifdef FEAT_EVAL
2815 report_re_switch(pat);
2816#endif
2817 rmp->regprog = vim_regcomp(pat, re_flags);
2818 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002819 {
2820 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002821 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002822 rmp->regprog->re_in_use = FALSE;
2823 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002824 vim_free(pat);
2825 }
2826
2827 p_re = save_p_re;
2828 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002829
2830 rex_in_use = rex_in_use_save;
2831 if (rex_in_use)
2832 rex = rex_save;
2833
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002834 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002835}
2836
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002837/*
2838 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002839 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002840 */
2841 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002842vim_regexec_prog(
2843 regprog_T **prog,
2844 int ignore_case,
2845 char_u *line,
2846 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002847{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002848 int r;
2849 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002850
2851 regmatch.regprog = *prog;
2852 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002853 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002854 *prog = regmatch.regprog;
2855 return r;
2856}
2857
2858/*
2859 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002860 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002861 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002862 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002863vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002864{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002865 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002866}
2867
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002868/*
2869 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002870 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002871 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002872 */
2873 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002874vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002875{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002876 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002877}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002878
2879/*
2880 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002881 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2882 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002883 * Uses curbuf for line count and 'iskeyword'.
2884 *
2885 * Return zero if there is no match. Return number of lines contained in the
2886 * match otherwise.
2887 */
2888 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002889vim_regexec_multi(
2890 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002891 win_T *win, // window in which to search or NULL
2892 buf_T *buf, // buffer in which to search
2893 linenr_T lnum, // nr of line to start looking for match
2894 colnr_T col, // column to start looking for match
2895 proftime_T *tm, // timeout limit or NULL
2896 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002897{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002898 int result;
2899 regexec_T rex_save;
2900 int rex_in_use_save = rex_in_use;
2901
Bram Moolenaar0270f382018-07-17 05:43:58 +02002902 // Cannot use the same prog recursively, it contains state.
2903 if (rmp->regprog->re_in_use)
2904 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002905 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002906 return FALSE;
2907 }
2908 rmp->regprog->re_in_use = TRUE;
2909
Bram Moolenaar6100d022016-10-02 16:51:57 +02002910 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002911 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002912 rex_save = rex;
2913 rex_in_use = TRUE;
2914
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002915 result = rmp->regprog->engine->regexec_multi(
2916 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002917 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002918
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002919 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002920 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2921 && result == NFA_TOO_EXPENSIVE)
2922 {
2923 int save_p_re = p_re;
2924 int re_flags = rmp->regprog->re_flags;
2925 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2926
2927 p_re = BACKTRACKING_ENGINE;
2928 vim_regfree(rmp->regprog);
2929 if (pat != NULL)
2930 {
2931#ifdef FEAT_EVAL
2932 report_re_switch(pat);
2933#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002934#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002935 // checking for \z misuse was already done when compiling for NFA,
2936 // allow all here
2937 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002938#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002939 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002940#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002941 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002942#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002943
Bram Moolenaarfda37292014-11-05 14:27:36 +01002944 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002945 {
2946 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002947 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002948 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002949 rmp->regprog->re_in_use = FALSE;
2950 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002951 vim_free(pat);
2952 }
2953 p_re = save_p_re;
2954 }
2955
Bram Moolenaar6100d022016-10-02 16:51:57 +02002956 rex_in_use = rex_in_use_save;
2957 if (rex_in_use)
2958 rex = rex_save;
2959
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002960 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002961}