blob: de0b0fad43700241a7588d58a6a4a345307dddf8 [file] [log] [blame]
/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
 */
5
// By default: do not create debugging logs or files related to regular
// expressions, even when compiling with -DDEBUG.
// Uncomment the second line to get the regexp debugging.
#undef DEBUG
// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
#ifdef DEBUG
// show/save debugging data when BT engine is used
# define BT_REGEXP_DUMP
// save the debugging data to a file instead of displaying it
# define BT_REGEXP_LOG
# define BT_REGEXP_DEBUG_LOG
# define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log"
#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning, they don't match literally.
 * Magic characters are negative.  This separates them from literal characters
 * (possibly multi-byte).  Only ASCII characters can be Magic.
 *
 * Magic(x)	make the Magic (negative) value for character "x"
 * un_Magic(x)	turn a Magic value back into the plain character
 * is_Magic(x)	non-zero when "x" is a Magic value
 */
#define Magic(x)	((int)(x) - 256)
#define un_Magic(x)	((x) + 256)
#define is_Magic(x)	((x) < 0)

/*
 * Return the plain character for "x": a Magic value is converted back with
 * un_Magic(), any other value is returned unchanged.
 */
    static int
no_Magic(int x)
{
    if (is_Magic(x))
        return un_Magic(x);
    return x;
}

/*
 * Flip the magicness of "x": a Magic value becomes the plain character, a
 * plain character becomes its Magic value.
 */
    static int
toggle_Magic(int x)
{
    if (is_Magic(x))
        return un_Magic(x);
    return Magic(x);
}
47
/*
 * The first byte of the BT regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.  It's used to catch the
 * most severe mutilation of the program by the caller.
 */

#define REGMAGIC	0234

/*
 * Utility definitions.
 */
// Value of the byte at "p", read as a char_u and widened to int.
#define UCHARAT(p)	((int)*(char_u *)(p))
60
// Used for an error (down from) vim_regcomp(): give the error message, set
// rc_did_emsg and return NULL
// The *_RET_FAIL variants return FAIL instead of NULL.  The EMSG2/EMSG3
// variants pass "" (when "c" is non-zero) or "\\" as the first "%s" argument
// of the message; EMSG3 additionally passes "a".
#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
Bram Moolenaar071d4272004-06-13 20:20:40 +000070
Bram Moolenaar95f09602016-11-10 20:01:45 +010071
// 32767 shifted left by 16 bits, i.e. 0x7fff0000.
#define MAX_LIMIT	(32767L << 16L)
73
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020074static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
Bram Moolenaar966e58e2017-06-05 16:54:08 +020075static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76static char_u e_large_class[] = N_("E945: Range too large in character class");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020077static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
78static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
79static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020080#ifdef FEAT_SYN_HL
Bram Moolenaar5de820b2013-06-02 15:01:57 +020081static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
Bram Moolenaarbcf94422018-06-23 14:21:42 +020082static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020083#endif
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +020084static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
Bram Moolenaar2976c022013-06-05 21:30:37 +020085static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
Bram Moolenaar0270f382018-07-17 05:43:58 +020086static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
87
// return values for re_multi_type()
#define NOT_MULTI	0
#define MULTI_ONE	1
#define MULTI_MULT	2

// return values for regmatch()
#define RA_FAIL		1	// something failed, abort
#define RA_CONT		2	// continue in inner loop
#define RA_BREAK	3	// break inner loop
#define RA_MATCH	4	// successful match
#define RA_NOMATCH	5	// didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020098
Bram Moolenaar071d4272004-06-13 20:20:40 +000099/*
100 * Return NOT_MULTI if c is not a "multi" operator.
101 * Return MULTI_ONE if c is a single "multi" operator.
102 * Return MULTI_MULT if c is a multi "multi" operator.
103 */
104 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100105re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000106{
107 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
108 return MULTI_ONE;
109 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
110 return MULTI_MULT;
111 return NOT_MULTI;
112}
113
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000114static char_u *reg_prev_sub = NULL;
115
Bram Moolenaar071d4272004-06-13 20:20:40 +0000116/*
117 * REGEXP_INRANGE contains all characters which are always special in a []
118 * range after '\'.
119 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
120 * These are:
121 * \n - New line (NL).
122 * \r - Carriage Return (CR).
123 * \t - Tab (TAB).
124 * \e - Escape (ESC).
125 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000126 * \d - Character code in decimal, eg \d123
127 * \o - Character code in octal, eg \o80
128 * \x - Character code in hex, eg \x4a
129 * \u - Multibyte character code, eg \u20ac
130 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000131 */
132static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000133static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000134
Bram Moolenaar071d4272004-06-13 20:20:40 +0000135/*
136 * Translate '\x' to its control character, except "\n", which is Magic.
137 */
138 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100139backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000140{
141 switch (c)
142 {
143 case 'r': return CAR;
144 case 't': return TAB;
145 case 'e': return ESC;
146 case 'b': return BS;
147 }
148 return c;
149}
150
151/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000152 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000153 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
154 * recognized. Otherwise "pp" is advanced to after the item.
155 */
156 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100157get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000158{
159 static const char *(class_names[]) =
160 {
161 "alnum:]",
162#define CLASS_ALNUM 0
163 "alpha:]",
164#define CLASS_ALPHA 1
165 "blank:]",
166#define CLASS_BLANK 2
167 "cntrl:]",
168#define CLASS_CNTRL 3
169 "digit:]",
170#define CLASS_DIGIT 4
171 "graph:]",
172#define CLASS_GRAPH 5
173 "lower:]",
174#define CLASS_LOWER 6
175 "print:]",
176#define CLASS_PRINT 7
177 "punct:]",
178#define CLASS_PUNCT 8
179 "space:]",
180#define CLASS_SPACE 9
181 "upper:]",
182#define CLASS_UPPER 10
183 "xdigit:]",
184#define CLASS_XDIGIT 11
185 "tab:]",
186#define CLASS_TAB 12
187 "return:]",
188#define CLASS_RETURN 13
189 "backspace:]",
190#define CLASS_BACKSPACE 14
191 "escape:]",
192#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100193 "ident:]",
194#define CLASS_IDENT 16
195 "keyword:]",
196#define CLASS_KEYWORD 17
197 "fname:]",
198#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000199 };
200#define CLASS_NONE 99
201 int i;
202
203 if ((*pp)[1] == ':')
204 {
Bram Moolenaar78a15312009-05-15 19:33:18 +0000205 for (i = 0; i < (int)(sizeof(class_names) / sizeof(*class_names)); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000206 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
207 {
208 *pp += STRLEN(class_names[i]) + 2;
209 return i;
210 }
211 }
212 return CLASS_NONE;
213}
214
/*
 * Specific version of character class functions.
 * Using a table to keep this fast.
 * class_tab[] has one entry per byte value; each entry is a combination of
 * the RI_ bits below.  It is filled by init_class_tab().
 */
static short	class_tab[256];

#define	    RI_DIGIT	0x01
#define	    RI_HEX	0x02
#define	    RI_OCTAL	0x04
#define	    RI_WORD	0x08
#define	    RI_HEAD	0x10
#define	    RI_ALPHA	0x20
#define	    RI_LOWER	0x40
#define	    RI_UPPER	0x80
#define	    RI_WHITE	0x100
230
231 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100232init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000233{
234 int i;
235 static int done = FALSE;
236
237 if (done)
238 return;
239
240 for (i = 0; i < 256; ++i)
241 {
242 if (i >= '0' && i <= '7')
243 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
244 else if (i >= '8' && i <= '9')
245 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
246 else if (i >= 'a' && i <= 'f')
247 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
248#ifdef EBCDIC
249 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
250 || (i >= 's' && i <= 'z'))
251#else
252 else if (i >= 'g' && i <= 'z')
253#endif
254 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
255 else if (i >= 'A' && i <= 'F')
256 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
257#ifdef EBCDIC
258 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
259 || (i >= 'S' && i <= 'Z'))
260#else
261 else if (i >= 'G' && i <= 'Z')
262#endif
263 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
264 else if (i == '_')
265 class_tab[i] = RI_WORD + RI_HEAD;
266 else
267 class_tab[i] = 0;
268 }
269 class_tab[' '] |= RI_WHITE;
270 class_tab['\t'] |= RI_WHITE;
271 done = TRUE;
272}
273
// Character class tests using class_tab[].  Values of 0x100 and above are
// never in a class.  The argument is parenthesized so that an expression such
// as "a ? b : c" or "x | y" can be passed safely.  "c" is evaluated more than
// once and must not be negative.
#define ri_digit(c)	((c) < 0x100 && (class_tab[c] & RI_DIGIT))
#define ri_hex(c)	((c) < 0x100 && (class_tab[c] & RI_HEX))
#define ri_octal(c)	((c) < 0x100 && (class_tab[c] & RI_OCTAL))
#define ri_word(c)	((c) < 0x100 && (class_tab[c] & RI_WORD))
#define ri_head(c)	((c) < 0x100 && (class_tab[c] & RI_HEAD))
#define ri_alpha(c)	((c) < 0x100 && (class_tab[c] & RI_ALPHA))
#define ri_lower(c)	((c) < 0x100 && (class_tab[c] & RI_LOWER))
#define ri_upper(c)	((c) < 0x100 && (class_tab[c] & RI_UPPER))
#define ri_white(c)	((c) < 0x100 && (class_tab[c] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000283
// flags for regflags
#define RF_ICASE    1	// ignore case
#define RF_NOICASE  2	// don't ignore case
#define RF_HASNL    4	// can match a NL
#define RF_ICOMBINE 8	// ignore combining characters
#define RF_LOOKBH   16	// uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000290
291/*
292 * Global work variables for vim_regcomp().
293 */
294
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295static char_u *regparse; // Input-scan pointer.
296static int regnpar; // () count.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000297#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100298static int regnzpar; // \z() count.
299static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000300#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100301static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000302#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100303static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000304#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000305
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100306static int reg_magic; // magicness of the pattern:
307#define MAGIC_NONE 1 // "\V" very unmagic
308#define MAGIC_OFF 2 // "\M" or 'magic' off
309#define MAGIC_ON 3 // "\m" or 'magic'
310#define MAGIC_ALL 4 // "\v" very magic
Bram Moolenaar071d4272004-06-13 20:20:40 +0000311
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100312static int reg_string; // matching with a string instead of a buffer
313 // line
314static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000315
316/*
317 * META contains all characters that may be magic, except '^' and '$'.
318 */
319
320#ifdef EBCDIC
321static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
322#else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100323// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000324static char_u META_flags[] = {
325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100327// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000328 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100329// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000330 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100331// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000332 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100333// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000334 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100335// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000336 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100337// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000338 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
339};
340#endif
341
static int	curchr;		// currently parsed character
// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, eg in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int	prevchr;
static int	prevprevchr;	// previous-previous character
static int	nextchr;	// used for ungetchr()

// arguments for reg()
#define REG_NOPAREN	0	// toplevel reg()
#define REG_PAREN	1	// \(\)
#define REG_ZPAREN	2	// \z(\)
#define REG_NPAREN	3	// \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000355
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200356typedef struct
357{
358 char_u *regparse;
359 int prevchr_len;
360 int curchr;
361 int prevchr;
362 int prevprevchr;
363 int nextchr;
364 int at_start;
365 int prev_at_start;
366 int regnpar;
367} parse_state_T;
368
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100369static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100370static int getchr(void);
371static void skipchr_keepstart(void);
372static int peekchr(void);
373static void skipchr(void);
374static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100375static long gethexchrs(int maxinputlen);
376static long getoctchrs(void);
377static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100378static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100379static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200380static int cstrncmp(char_u *s1, char_u *s2, int *n);
381static char_u *cstrchr(char_u *, int);
382static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100383static int reg_iswordc(int);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200385static regengine_T bt_regengine;
386static regengine_T nfa_regengine;
387
Bram Moolenaar071d4272004-06-13 20:20:40 +0000388/*
389 * Return TRUE if compiled regular expression "prog" can match a line break.
390 */
391 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100392re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000393{
394 return (prog->regflags & RF_HASNL);
395}
396
397/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000398 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
399 * Returns a character representing the class. Zero means that no item was
400 * recognized. Otherwise "pp" is advanced to after the item.
401 */
402 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100403get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000404{
405 int c;
406 int l = 1;
407 char_u *p = *pp;
408
Bram Moolenaar985079c2019-02-16 17:07:47 +0100409 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000410 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000411 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000412 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000413 if (p[l + 2] == '=' && p[l + 3] == ']')
414 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000415 if (has_mbyte)
416 c = mb_ptr2char(p + 2);
417 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000418 c = p[2];
419 *pp += l + 4;
420 return c;
421 }
422 }
423 return 0;
424}
425
#ifdef EBCDIC
/*
 * Table for equivalence class "c". (IBM-1047)
 * Every entry starts with an ASCII letter followed by other byte values;
 * presumably those are the characters equivalent to that letter -- the code
 * using the table is outside this part of the file.
 */
static char *EQUIVAL_CLASS_C[16] = {
    "A\x62\x63\x64\x65\x66\x67",
    "C\x68",
    "E\x71\x72\x73\x74",
    "I\x75\x76\x77\x78",
    "N\x69",
    "O\xEB\xEC\xED\xEE\xEF\x80",
    "U\xFB\xFC\xFD\xFE",
    "Y\xBA",
    "a\x42\x43\x44\x45\x46\x47",
    "c\x48",
    "e\x51\x52\x53\x54",
    "i\x55\x56\x57\x58",
    "n\x49",
    "o\xCB\xCC\xCD\xCE\xCF\x70",
    "u\xDB\xDC\xDD\xDE",
    "y\x8D\xDF",
};
#endif
449
Bram Moolenaardf177f62005-02-22 08:39:57 +0000450/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000451 * Check for a collating element "[.a.]". "pp" points to the '['.
452 * Returns a character. Zero means that no item was recognized. Otherwise
453 * "pp" is advanced to after the item.
454 * Currently only single characters are recognized!
455 */
456 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100457get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000458{
459 int c;
460 int l = 1;
461 char_u *p = *pp;
462
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100463 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000464 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000465 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000466 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000467 if (p[l + 2] == '.' && p[l + 3] == ']')
468 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000469 if (has_mbyte)
470 c = mb_ptr2char(p + 2);
471 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000472 c = p[2];
473 *pp += l + 4;
474 return c;
475 }
476 }
477 return 0;
478}
479
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100480static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
481static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200482
483 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100484get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200485{
486 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
487 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
488}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000489
490/*
491 * Skip over a "[]" range.
492 * "p" must point to the character after the '['.
493 * The returned pointer is on the matching ']', or the terminating NUL.
494 */
495 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100496skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000497{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000498 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000499
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100500 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000501 ++p;
502 if (*p == ']' || *p == '-')
503 ++p;
504 while (*p != NUL && *p != ']')
505 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000506 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000507 p += l;
508 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000509 if (*p == '-')
510 {
511 ++p;
512 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100513 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000514 }
515 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200516 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000517 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200518 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000519 p += 2;
520 else if (*p == '[')
521 {
522 if (get_char_class(&p) == CLASS_NONE
523 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200524 && get_coll_element(&p) == 0
525 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100526 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000527 }
528 else
529 ++p;
530 }
531
532 return p;
533}
534
535/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000536 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200537 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000538 * Take care of characters with a backslash in front of it.
539 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000540 */
541 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100542skip_regexp(
543 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200544 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200545 int magic)
546{
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200547 return skip_regexp_ex(startp, delim, magic, NULL, NULL);
548}
549
550/*
551 * Call skip_regexp() and when the delimiter does not match give an error and
552 * return NULL.
553 */
554 char_u *
555skip_regexp_err(
556 char_u *startp,
557 int delim,
558 int magic)
559{
560 char_u *p = skip_regexp(startp, delim, magic);
561
562 if (*p != delim)
563 {
564 semsg(_("E654: missing delimiter after search pattern: %s"), startp);
565 return NULL;
566 }
567 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200568}
569
570/*
571 * skip_regexp() with extra arguments:
572 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
573 * expression and change "\?" to "?". If "*newp" is not NULL the expression
574 * is changed in-place.
575 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
576 */
577 char_u *
578skip_regexp_ex(
579 char_u *startp,
580 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100581 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200582 char_u **newp,
583 int *dropped)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000584{
585 int mymagic;
586 char_u *p = startp;
587
588 if (magic)
589 mymagic = MAGIC_ON;
590 else
591 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200592 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000593
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100594 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000595 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100596 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000597 break;
598 if ((p[0] == '[' && mymagic >= MAGIC_ON)
599 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
600 {
601 p = skip_anyof(p + 1);
602 if (p[0] == NUL)
603 break;
604 }
605 else if (p[0] == '\\' && p[1] != NUL)
606 {
607 if (dirc == '?' && newp != NULL && p[1] == '?')
608 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100609 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000610 if (*newp == NULL)
611 {
612 *newp = vim_strsave(startp);
613 if (*newp != NULL)
614 p = *newp + (p - startp);
615 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200616 if (dropped != NULL)
617 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000618 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000619 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000620 else
621 ++p;
622 }
623 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100624 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000625 if (*p == 'v')
626 mymagic = MAGIC_ALL;
627 else if (*p == 'V')
628 mymagic = MAGIC_NONE;
629 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000630 }
631 return p;
632}
633
/*
 * Functions for getting characters from the regexp input.
 */
static int	prevchr_len;	// byte length of previous char
static int	at_start;	// True when on the first character
static int	prev_at_start;  // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100640
Bram Moolenaar071d4272004-06-13 20:20:40 +0000641/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200642 * Start parsing at "str".
643 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000644 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100645initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000646{
647 regparse = str;
648 prevchr_len = 0;
649 curchr = prevprevchr = prevchr = nextchr = -1;
650 at_start = TRUE;
651 prev_at_start = FALSE;
652}
653
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200654/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200655 * Save the current parse state, so that it can be restored and parsing
656 * starts in the same state again.
657 */
658 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100659save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200660{
661 ps->regparse = regparse;
662 ps->prevchr_len = prevchr_len;
663 ps->curchr = curchr;
664 ps->prevchr = prevchr;
665 ps->prevprevchr = prevprevchr;
666 ps->nextchr = nextchr;
667 ps->at_start = at_start;
668 ps->prev_at_start = prev_at_start;
669 ps->regnpar = regnpar;
670}
671
672/*
673 * Restore a previously saved parse state.
674 */
675 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100676restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200677{
678 regparse = ps->regparse;
679 prevchr_len = ps->prevchr_len;
680 curchr = ps->curchr;
681 prevchr = ps->prevchr;
682 prevprevchr = ps->prevprevchr;
683 nextchr = ps->nextchr;
684 at_start = ps->at_start;
685 prev_at_start = ps->prev_at_start;
686 regnpar = ps->regnpar;
687}
688
689
690/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200691 * Get the next character without advancing.
692 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000693 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100694peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000695{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000696 static int after_slash = FALSE;
697
Bram Moolenaar071d4272004-06-13 20:20:40 +0000698 if (curchr == -1)
699 {
700 switch (curchr = regparse[0])
701 {
702 case '.':
703 case '[':
704 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100705 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000706 if (reg_magic >= MAGIC_ON)
707 curchr = Magic(curchr);
708 break;
709 case '(':
710 case ')':
711 case '{':
712 case '%':
713 case '+':
714 case '=':
715 case '?':
716 case '@':
717 case '!':
718 case '&':
719 case '|':
720 case '<':
721 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100722 case '#': // future ext.
723 case '"': // future ext.
724 case '\'': // future ext.
725 case ',': // future ext.
726 case '-': // future ext.
727 case ':': // future ext.
728 case ';': // future ext.
729 case '`': // future ext.
730 case '/': // Can't be used in / command
731 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000732 if (reg_magic == MAGIC_ALL)
733 curchr = Magic(curchr);
734 break;
735 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100736 // * is not magic as the very first character, eg "?*ptr", when
737 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
738 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000739 if (reg_magic >= MAGIC_ON
740 && !at_start
741 && !(prev_at_start && prevchr == Magic('^'))
742 && (after_slash
743 || (prevchr != Magic('(')
744 && prevchr != Magic('&')
745 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000746 curchr = Magic('*');
747 break;
748 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100749 // '^' is only magic as the very first character and if it's after
750 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000751 if (reg_magic >= MAGIC_OFF
752 && (at_start
753 || reg_magic == MAGIC_ALL
754 || prevchr == Magic('(')
755 || prevchr == Magic('|')
756 || prevchr == Magic('&')
757 || prevchr == Magic('n')
758 || (no_Magic(prevchr) == '('
759 && prevprevchr == Magic('%'))))
760 {
761 curchr = Magic('^');
762 at_start = TRUE;
763 prev_at_start = FALSE;
764 }
765 break;
766 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100767 // '$' is only magic as the very last char and if it's in front of
768 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000769 if (reg_magic >= MAGIC_OFF)
770 {
771 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200772 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000773
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100774 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000775 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200776 || p[1] == 'm' || p[1] == 'M'
777 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
778 {
779 if (p[1] == 'v')
780 is_magic_all = TRUE;
781 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
782 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000783 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200784 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000785 if (p[0] == NUL
786 || (p[0] == '\\'
787 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
788 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200789 || (is_magic_all
790 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000791 || reg_magic == MAGIC_ALL)
792 curchr = Magic('$');
793 }
794 break;
795 case '\\':
796 {
797 int c = regparse[1];
798
799 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100800 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000801 else if (
802#ifdef EBCDIC
803 vim_strchr(META, c)
804#else
805 c <= '~' && META_flags[c]
806#endif
807 )
808 {
809 /*
810 * META contains everything that may be magic sometimes,
811 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200812 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000813 * magicness. Therefore, \ is so meta-magic that it is
814 * not in META.
815 */
816 curchr = -1;
817 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100818 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000819 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000820 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000821 peekchr();
822 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000823 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000824 curchr = toggle_Magic(curchr);
825 }
826 else if (vim_strchr(REGEXP_ABBR, c))
827 {
828 /*
829 * Handle abbreviations, like "\t" for TAB -- webb
830 */
831 curchr = backslash_trans(c);
832 }
833 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
834 curchr = toggle_Magic(c);
835 else
836 {
837 /*
838 * Next character can never be (made) magic?
839 * Then backslashing it won't do anything.
840 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000841 if (has_mbyte)
842 curchr = (*mb_ptr2char)(regparse + 1);
843 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000844 curchr = c;
845 }
846 break;
847 }
848
Bram Moolenaar071d4272004-06-13 20:20:40 +0000849 default:
850 if (has_mbyte)
851 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000852 }
853 }
854
855 return curchr;
856}
857
858/*
859 * Eat one lexed character. Do this in a way that we can undo it.
860 */
861 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100862skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000863{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100864 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000865 if (*regparse == '\\')
866 prevchr_len = 1;
867 else
868 prevchr_len = 0;
869 if (regparse[prevchr_len] != NUL)
870 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000871 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100872 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000873 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000874 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000875 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000876 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000877 ++prevchr_len;
878 }
879 regparse += prevchr_len;
880 prev_at_start = at_start;
881 at_start = FALSE;
882 prevprevchr = prevchr;
883 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100884 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000885 nextchr = -1;
886}
887
888/*
889 * Skip a character while keeping the value of prev_at_start for at_start.
890 * prevchr and prevprevchr are also kept.
891 */
892 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100893skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000894{
895 int as = prev_at_start;
896 int pr = prevchr;
897 int prpr = prevprevchr;
898
899 skipchr();
900 at_start = as;
901 prevchr = pr;
902 prevprevchr = prpr;
903}
904
/*
 * Lexical analyzer: return the next character of the pattern, as seen by
 * peekchr() (which knows about magic), and consume it with skipchr().
 */
    static int
getchr(void)
{
    int		c;

    c = peekchr();
    skipchr();
    return c;
}
917
918/*
919 * put character back. Works only once!
920 */
921 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100922ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000923{
924 nextchr = curchr;
925 curchr = prevchr;
926 prevchr = prevprevchr;
927 at_start = prev_at_start;
928 prev_at_start = FALSE;
929
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100930 // Backup regparse, so that it's at the same position as before the
931 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000932 regparse -= prevchr_len;
933}
934
935/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000936 * Get and return the value of the hex string at the current position.
937 * Return -1 if there is no valid hex number.
938 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000939 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000940 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000941 * The parameter controls the maximum number of input characters. This will be
942 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
943 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100944 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100945gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000946{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100947 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000948 int c;
949 int i;
950
951 for (i = 0; i < maxinputlen; ++i)
952 {
953 c = regparse[0];
954 if (!vim_isxdigit(c))
955 break;
956 nr <<= 4;
957 nr |= hex2nr(c);
958 ++regparse;
959 }
960
961 if (i == 0)
962 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100963 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000964}
965
966/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200967 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000968 * current position. Return -1 for invalid. Consumes all digits.
969 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100970 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100971getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000972{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100973 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000974 int c;
975 int i;
976
977 for (i = 0; ; ++i)
978 {
979 c = regparse[0];
980 if (c < '0' || c > '9')
981 break;
982 nr *= 10;
983 nr += c - '0';
984 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100985 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000986 }
987
988 if (i == 0)
989 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100990 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000991}
992
993/*
994 * get and return the value of the octal string immediately after the current
995 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
996 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
997 * treat 8 or 9 as recognised characters. Position is updated:
998 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000999 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001000 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001001 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01001002getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001003{
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001004 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001005 int c;
1006 int i;
1007
1008 for (i = 0; i < 3 && nr < 040; ++i)
1009 {
1010 c = regparse[0];
1011 if (c < '0' || c > '7')
1012 break;
1013 nr <<= 3;
1014 nr |= hex2nr(c);
1015 ++regparse;
1016 }
1017
1018 if (i == 0)
1019 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001020 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001021}
1022
1023/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001024 * read_limits - Read two integers to be taken as a minimum and maximum.
1025 * If the first character is '-', then the range is reversed.
1026 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1027 * missing, a very big number is the default.
1028 */
1029 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001030read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001031{
1032 int reverse = FALSE;
1033 char_u *first_char;
1034 long tmp;
1035
1036 if (*regparse == '-')
1037 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001038 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001039 regparse++;
1040 reverse = TRUE;
1041 }
1042 first_char = regparse;
1043 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001044 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001045 {
1046 if (vim_isdigit(*++regparse))
1047 *maxval = getdigits(&regparse);
1048 else
1049 *maxval = MAX_LIMIT;
1050 }
1051 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001052 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001053 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001054 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001055 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001056 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001057 if (*regparse != '}')
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001058 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1059 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001060
1061 /*
1062 * Reverse the range if there was a '-', or make sure it is in the right
1063 * order otherwise.
1064 */
1065 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1066 {
1067 tmp = *minval;
1068 *minval = *maxval;
1069 *maxval = tmp;
1070 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001071 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001072 return OK;
1073}
1074
1075/*
1076 * vim_regexec and friends
1077 */
1078
1079/*
1080 * Global work variables for vim_regexec().
1081 */
1082
// Forward declarations of helpers that are defined further down in this file.
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001083static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001084#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001085static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001086#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001087static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001088static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001089
1090/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001091 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1092 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001093 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001094 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001095static char_u *reg_tofree = NULL; // the copy of the line, NULL when not allocated
Bram Moolenaar071d4272004-06-13 20:20:40 +00001096static unsigned reg_tofreelen; // number of bytes allocated for "reg_tofree"
1097
1098/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001099 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001100 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001101 * done:
1102 * single-line multi-line
1103 * reg_match &regmatch_T NULL
1104 * reg_mmatch NULL &regmmatch_T
1105 * reg_startp reg_match->startp <invalid>
1106 * reg_endp reg_match->endp <invalid>
1107 * reg_startpos <invalid> reg_mmatch->startpos
1108 * reg_endpos <invalid> reg_mmatch->endpos
1109 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001110 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001111 * reg_firstlnum <invalid> first line in which to search
1112 * reg_maxline 0 last line nr
1113 * reg_line_lbr FALSE or TRUE FALSE
1114 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001115typedef struct {
1116 regmatch_T *reg_match;
1117 regmmatch_T *reg_mmatch;
1118 char_u **reg_startp;
1119 char_u **reg_endp;
1120 lpos_T *reg_startpos;
1121 lpos_T *reg_endpos;
1122 win_T *reg_win;
1123 buf_T *reg_buf;
1124 linenr_T reg_firstlnum;
1125 linenr_T reg_maxline;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001126 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001127
Bram Moolenaar0270f382018-07-17 05:43:58 +02001128 // The current match-position is stored in these variables:
1129 linenr_T lnum; // line number, relative to first line
1130 char_u *line; // start of current line
1131 char_u *input; // current input, points into "line"
1132
1133 int need_clear_subexpr; // subexpressions still need to be cleared
1134#ifdef FEAT_SYN_HL
1135 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1136 // cleared
1137#endif
1138
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001139 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1140 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1141 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001142 int reg_ic;
1143
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001144 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1145 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001146 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001147
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001148 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1149 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001150 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001151
1152 // State for the NFA engine regexec.
1153 int nfa_has_zend; // NFA regexp \ze operator encountered.
1154 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1155 int nfa_nsubexpr; // Number of sub expressions actually being used
1156 // during execution. 1 if only the whole match
1157 // (subexpr 0) is used.
1158 // listid is global, so that it increases on recursive calls to
1159 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1160 // all the states.
1161 int nfa_listid;
1162 int nfa_alt_listid;
1163
1164#ifdef FEAT_SYN_HL
1165 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1166#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001167} regexec_T;
1168
1169static regexec_T rex; // the execution state used by the functions in this file
1170static int rex_in_use = FALSE; // NOTE(review): presumably TRUE while "rex" holds the state of a match in progress; it is set outside this part of the file - confirm
1171
Bram Moolenaar071d4272004-06-13 20:20:40 +00001172/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001173 * Return TRUE if character 'c' is included in 'iskeyword' option for
1174 * "reg_buf" buffer.
1175 */
1176 static int
1177reg_iswordc(int c)
1178{
1179 return vim_iswordc_buf(c, rex.reg_buf);
1180}
1181
1182/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001183 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1184 */
1185 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001186reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001187{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001188 // when looking behind for a match/no-match lnum is negative. But we
1189 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001190 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001191 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001192 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001193 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001194 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001195 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001196}
1197
Bram Moolenaar071d4272004-06-13 20:20:40 +00001198#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001199static char_u *reg_startzp[NSUBEXP]; // Workspace to mark beginning
1200static char_u *reg_endzp[NSUBEXP]; // and end of \z(...\) matches (pointers; reset by cleanup_zsubexpr() when not REG_MULTI)
1201static lpos_T reg_startzpos[NSUBEXP]; // same, beginning as a position (reset by cleanup_zsubexpr() when REG_MULTI)
1202static lpos_T reg_endzpos[NSUBEXP]; // same, end as a position
Bram Moolenaar071d4272004-06-13 20:20:40 +00001203#endif
1204
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001205// TRUE if using multi-line regexp: "rex.reg_match" is only set for a
// single-line match and is NULL for a multi-line match.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001206#define REG_MULTI (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001207
Bram Moolenaar071d4272004-06-13 20:20:40 +00001208#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001209/*
1210 * Create a new extmatch and mark it as referenced once.
1211 */
1212 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001213make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001214{
1215 reg_extmatch_T *em;
1216
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001217 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001218 if (em != NULL)
1219 em->refcnt = 1;
1220 return em;
1221}
1222
1223/*
1224 * Add a reference to an extmatch.
1225 */
1226 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001227ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001228{
1229 if (em != NULL)
1230 em->refcnt++;
1231 return em;
1232}
1233
1234/*
1235 * Remove a reference to an extmatch. If there are no references left, free
1236 * the info.
1237 */
1238 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001239unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001240{
1241 int i;
1242
1243 if (em != NULL && --em->refcnt <= 0)
1244 {
1245 for (i = 0; i < NSUBEXP; ++i)
1246 vim_free(em->matches[i]);
1247 vim_free(em);
1248 }
1249}
1250#endif
1251
1252/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001253 * Get class of previous character.
1254 */
1255 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001256reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001257{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001258 if (rex.input > rex.line)
1259 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001260 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001261 return -1;
1262}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001263
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001264/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001265 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001266 */
1267 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001268reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001269{
1270 pos_T top, bot;
1271 linenr_T lnum;
1272 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001273 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001274 int mode;
1275 colnr_T start, end;
1276 colnr_T start2, end2;
1277 colnr_T cols;
1278
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001279 // Check if the buffer is the current buffer.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001280 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001281 return FALSE;
1282
1283 if (VIsual_active)
1284 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001285 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001286 {
1287 top = VIsual;
1288 bot = wp->w_cursor;
1289 }
1290 else
1291 {
1292 top = wp->w_cursor;
1293 bot = VIsual;
1294 }
1295 mode = VIsual_mode;
1296 }
1297 else
1298 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001299 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001300 {
1301 top = curbuf->b_visual.vi_start;
1302 bot = curbuf->b_visual.vi_end;
1303 }
1304 else
1305 {
1306 top = curbuf->b_visual.vi_end;
1307 bot = curbuf->b_visual.vi_start;
1308 }
1309 mode = curbuf->b_visual.vi_mode;
1310 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001311 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001312 if (lnum < top.lnum || lnum > bot.lnum)
1313 return FALSE;
1314
1315 if (mode == 'v')
1316 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001317 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001318 if ((lnum == top.lnum && col < top.col)
1319 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1320 return FALSE;
1321 }
1322 else if (mode == Ctrl_V)
1323 {
1324 getvvcol(wp, &top, &start, NULL, &end);
1325 getvvcol(wp, &bot, &start2, NULL, &end2);
1326 if (start2 < start)
1327 start = start2;
1328 if (end2 > end)
1329 end = end2;
1330 if (top.col == MAXCOL || bot.col == MAXCOL)
1331 end = MAXCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001332 cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001333 if (cols < start || cols > end - (*p_sel == 'e'))
1334 return FALSE;
1335 }
1336 return TRUE;
1337}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001338
Bram Moolenaar071d4272004-06-13 20:20:40 +00001339/*
1340 * Check the regexp program for its magic number.
1341 * Return TRUE if it's wrong.
1342 */
1343 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001344prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001345{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001346 regprog_T *prog;
1347
Bram Moolenaar6100d022016-10-02 16:51:57 +02001348 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001349 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001350 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001351 return FALSE;
1352
1353 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001354 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001355 emsg(_(e_re_corr));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001356 return TRUE;
1357 }
1358 return FALSE;
1359}
1360
1361/*
1362 * Cleanup the subexpressions, if this wasn't done yet.
1363 * This construction is used to clear the subexpressions only when they are
1364 * used (to increase speed).
1365 */
1366 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001367cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001368{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001369 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001370 {
1371 if (REG_MULTI)
1372 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001373 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001374 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1375 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001376 }
1377 else
1378 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001379 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1380 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001381 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001382 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001383 }
1384}
1385
#ifdef FEAT_SYN_HL
/*
 * Clear the start and end of all \z(...\) subexpressions, unless this was
 * already done.  The counterpart of cleanup_subexpr() for the extmatch
 * workspace.
 */
    static void
cleanup_zsubexpr(void)
{
    if (!rex.need_clear_zsubexpr)
	return;

    if (REG_MULTI)
    {
	// Filling with 0xff bytes sets lnum to -1
	vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
	vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
    }
    else
    {
	vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
	vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
    }
    rex.need_clear_zsubexpr = FALSE;
}
#endif
1407
1408/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001409 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001410 */
1411 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001412reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001413{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001414 rex.line = reg_getline(++rex.lnum);
1415 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001416 fast_breakcheck();
1417}
1418
1419/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001420 * Check whether a backreference matches.
1421 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001422 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1423 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001424 */
1425 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001426match_with_backref(
1427 linenr_T start_lnum,
1428 colnr_T start_col,
1429 linenr_T end_lnum,
1430 colnr_T end_col,
1431 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001432{
1433 linenr_T clnum = start_lnum;
1434 colnr_T ccol = start_col;
1435 int len;
1436 char_u *p;
1437
1438 if (bytelen != NULL)
1439 *bytelen = 0;
1440 for (;;)
1441 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001442 // Since getting one line may invalidate the other, need to make copy.
1443 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001444 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001445 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001446 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001447 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1448 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001449 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001450 vim_free(reg_tofree);
1451 reg_tofree = alloc(len);
1452 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001453 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001454 reg_tofreelen = len;
1455 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001456 STRCPY(reg_tofree, rex.line);
1457 rex.input = reg_tofree + (rex.input - rex.line);
1458 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001459 }
1460
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001461 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001462 p = reg_getline(clnum);
1463 if (clnum == end_lnum)
1464 len = end_col - ccol;
1465 else
1466 len = (int)STRLEN(p + ccol);
1467
Bram Moolenaar0270f382018-07-17 05:43:58 +02001468 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001469 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001470 if (bytelen != NULL)
1471 *bytelen += len;
1472 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001473 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001474 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001475 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001476
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001477 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001478 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001479 if (bytelen != NULL)
1480 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001481 ++clnum;
1482 ccol = 0;
1483 if (got_int)
1484 return RA_FAIL;
1485 }
1486
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001487 // found a match! Note that rex.line may now point to a copy of the line,
1488 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001489 return RA_MATCH;
1490}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001491
Bram Moolenaarfb031402014-09-09 17:18:49 +02001492/*
1493 * Used in a place where no * or \+ can follow.
1494 */
1495 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001496re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001497{
1498 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001499 {
1500 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1501 rc_did_emsg = TRUE;
1502 return FAIL;
1503 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001504 return OK;
1505}
1506
/*
 * The up to three characters that one character decomposes into.  Unused
 * trailing members are zero.
 */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decomposition of the characters 0xfb20 - 0xfb4f, indexed by the character
 * minus 0xfb20.  An entry marked UNUSED maps the character to itself.
 * The table is only read, thus it is const.
 */
static const decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2, 0, 0},		// 0xfb20	alt ayin
    {0x5d0, 0, 0},		// 0xfb21	alt alef
    {0x5d3, 0, 0},		// 0xfb22	alt dalet
    {0x5d4, 0, 0},		// 0xfb23	alt he
    {0x5db, 0, 0},		// 0xfb24	alt kaf
    {0x5dc, 0, 0},		// 0xfb25	alt lamed
    {0x5dd, 0, 0},		// 0xfb26	alt mem-sofit
    {0x5e8, 0, 0},		// 0xfb27	alt resh
    {0x5ea, 0, 0},		// 0xfb28	alt tav
    {'+', 0, 0},		// 0xfb29	alt plus
    {0x5e9, 0x5c1, 0},		// 0xfb2a	shin+shin-dot
    {0x5e9, 0x5c2, 0},		// 0xfb2b	shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},	// 0xfb2c	shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},	// 0xfb2d	shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},		// 0xfb2e	alef+patah
    {0x5d0, 0x5b8, 0},		// 0xfb2f	alef+qamats
    {0x5d0, 0x5b4, 0},		// 0xfb30	alef+hiriq
    {0x5d1, 0x5bc, 0},		// 0xfb31	bet+dagesh
    {0x5d2, 0x5bc, 0},		// 0xfb32	gimel+dagesh
    {0x5d3, 0x5bc, 0},		// 0xfb33	dalet+dagesh
    {0x5d4, 0x5bc, 0},		// 0xfb34	he+dagesh
    {0x5d5, 0x5bc, 0},		// 0xfb35	vav+dagesh
    {0x5d6, 0x5bc, 0},		// 0xfb36	zayin+dagesh
    {0xfb37, 0, 0},		// 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},		// 0xfb38	tet+dagesh
    {0x5d9, 0x5bc, 0},		// 0xfb39	yud+dagesh
    {0x5da, 0x5bc, 0},		// 0xfb3a	kaf sofit+dagesh
    {0x5db, 0x5bc, 0},		// 0xfb3b	kaf+dagesh
    {0x5dc, 0x5bc, 0},		// 0xfb3c	lamed+dagesh
    {0xfb3d, 0, 0},		// 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},		// 0xfb3e	mem+dagesh
    {0xfb3f, 0, 0},		// 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},		// 0xfb40	nun+dagesh
    {0x5e1, 0x5bc, 0},		// 0xfb41	samech+dagesh
    {0xfb42, 0, 0},		// 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},		// 0xfb43	pe sofit+dagesh
    {0x5e4, 0x5bc, 0},		// 0xfb44	pe+dagesh
    {0xfb45, 0, 0},		// 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},		// 0xfb46	tsadi+dagesh
    {0x5e7, 0x5bc, 0},		// 0xfb47	qof+dagesh
    {0x5e8, 0x5bc, 0},		// 0xfb48	resh+dagesh
    {0x5e9, 0x5bc, 0},		// 0xfb49	shin+dagesh
    {0x5ea, 0x5bc, 0},		// 0xfb4a	tav+dagesh
    {0x5d5, 0x5b9, 0},		// 0xfb4b	vav+holam
    {0x5d1, 0x5bf, 0},		// 0xfb4c	bet+rafe
    {0x5db, 0x5bf, 0},		// 0xfb4d	kaf+rafe
    {0x5e4, 0x5bf, 0},		// 0xfb4e	pe+rafe
    {0x5d0, 0x5dc, 0}		// 0xfb4f	alef-lamed
};

/*
 * Decompose character "c" into "*c1", "*c2" and "*c3".
 * For a character in the range 0xfb20 - 0xfb4f the values come from
 * decomp_table[].  Any other character is returned unchanged in "*c1", with
 * "*c2" and "*c3" set to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    const decomp_T *d;

    if (c < 0xfb20 || c > 0xfb4f)
    {
	// Not in the table: the character stands for itself.
	*c1 = c;
	*c2 = *c3 = 0;
	return;
    }

    d = &decomp_table[c - 0xfb20];
    *c1 = d->a;
    *c2 = d->b;
    *c3 = d->c;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001584
1585/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001586 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001587 * Return 0 if strings match, non-zero otherwise.
1588 * Correct the length "*n" when composing characters are ignored.
1589 */
1590 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001591cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001592{
1593 int result;
1594
Bram Moolenaar6100d022016-10-02 16:51:57 +02001595 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001596 result = STRNCMP(s1, s2, *n);
1597 else
1598 result = MB_STRNICMP(s1, s2, *n);
1599
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001600 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001601 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001602 {
1603 char_u *str1, *str2;
1604 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001605 int junk;
1606
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001607 // we have to handle the strcmp ourselves, since it is necessary to
1608 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001609 str1 = s1;
1610 str2 = s2;
1611 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001612 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001613 {
1614 c1 = mb_ptr2char_adv(&str1);
1615 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001616
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001617 // Decompose the character if necessary, into 'base' characters.
1618 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001619 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001620 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001621 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001622 mb_decompose(c1, &c11, &junk, &junk);
1623 mb_decompose(c2, &c12, &junk, &junk);
1624 c1 = c11;
1625 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001626 if (c11 != c12
1627 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001628 break;
1629 }
1630 }
1631 result = c2 - c1;
1632 if (result == 0)
1633 *n = (int)(str2 - s2);
1634 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001635
1636 return result;
1637}
1638
1639/*
1640 * cstrchr: This function is used a lot for simple searches, keep it fast!
1641 */
1642 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001643cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001644{
1645 char_u *p;
1646 int cc;
1647
Bram Moolenaara12a1612019-01-24 16:39:02 +01001648 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001649 return vim_strchr(s, c);
1650
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001651 // tolower() and toupper() can be slow, comparing twice should be a lot
1652 // faster (esp. when using MS Visual C++!).
1653 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001654 if (enc_utf8 && c > 0x80)
1655 cc = utf_fold(c);
1656 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001657 if (MB_ISUPPER(c))
1658 cc = MB_TOLOWER(c);
1659 else if (MB_ISLOWER(c))
1660 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001661 else
1662 return vim_strchr(s, c);
1663
Bram Moolenaar071d4272004-06-13 20:20:40 +00001664 if (has_mbyte)
1665 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001666 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001667 {
1668 if (enc_utf8 && c > 0x80)
1669 {
1670 if (utf_fold(utf_ptr2char(p)) == cc)
1671 return p;
1672 }
1673 else if (*p == c || *p == cc)
1674 return p;
1675 }
1676 }
1677 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001678 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001679 for (p = s; *p != NUL; ++p)
1680 if (*p == c || *p == cc)
1681 return p;
1682
1683 return NULL;
1684}
1685
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001686////////////////////////////////////////////////////////////////
1687// regsub stuff //
1688////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001689
Bram Moolenaar071d4272004-06-13 20:20:40 +00001690/*
 * We should define fptr_T as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * pointer to a function returning void. This should work for all compilers.
 *
 * The function pointed to takes an "int *" (where the converted character is
 * stored) and an "int" (the character to convert).  The case conversion
 * handlers do_upper(), do_Upper(), do_lower() and do_Lower() are cast to this
 * type; their return value is the handler to use for the next character.
 */
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001696typedef void (*(*fptr_T)(int *, int))();
Bram Moolenaar071d4272004-06-13 20:20:40 +00001697
// Forward declaration: used by vim_regsub() and vim_regsub_multi(), defined
// further down.
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001698static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001699
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001700 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001701do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001702{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001703 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001704
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001705 return (fptr_T)NULL;
1706}
1707
1708 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001709do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001710{
1711 *d = MB_TOUPPER(c);
1712
1713 return (fptr_T)do_Upper;
1714}
1715
1716 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001717do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001718{
1719 *d = MB_TOLOWER(c);
1720
1721 return (fptr_T)NULL;
1722}
1723
1724 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001725do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001726{
1727 *d = MB_TOLOWER(c);
1728
1729 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001730}
1731
1732/*
1733 * regtilde(): Replace tildes in the pattern by the old pattern.
1734 *
1735 * Short explanation of the tilde: It stands for the previous replacement
1736 * pattern. If that previous pattern also contains a ~ we should go back a
1737 * step further... But we insert the previous pattern into the current one
1738 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001739 * This still does not handle the case where "magic" changes. So require the
1740 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001741 *
1742 * The tildes are parsed once before the first call to vim_regsub().
1743 */
1744 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001745regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001746{
1747 char_u *newsub = source;
1748 char_u *tmpsub;
1749 char_u *p;
1750 int len;
1751 int prevlen;
1752
1753 for (p = newsub; *p; ++p)
1754 {
1755 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1756 {
1757 if (reg_prev_sub != NULL)
1758 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001759 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001760 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001761 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001762 if (tmpsub != NULL)
1763 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001764 // copy prefix
1765 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001766 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001767 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001768 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001769 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001770 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001771 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001772 STRCPY(tmpsub + len + prevlen, p + 1);
1773
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001774 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001775 vim_free(newsub);
1776 newsub = tmpsub;
1777 p = newsub + len + prevlen;
1778 }
1779 }
1780 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001781 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001782 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001783 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001784 --p;
1785 }
1786 else
1787 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001788 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001789 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001790 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001791 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001792 }
1793 }
1794
1795 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001796 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001797 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001798 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001799 reg_prev_sub = vim_strsave(newsub);
1800 return newsub;
1801}
1802
1803#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001804static int can_f_submatch = FALSE; // TRUE when submatch() can be used
Bram Moolenaar071d4272004-06-13 20:20:40 +00001805
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001806// These pointers are used for reg_submatch(). Needed for when the
1807// substitution string is an expression that contains a call to substitute()
1808// and submatch().
// vim_regsub_both() fills these in from the matching fields of "rex" before
// it evaluates the expression.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001809typedef struct {
1810 regmatch_T *sm_match; // copy of rex.reg_match
1811 regmmatch_T *sm_mmatch; // copy of rex.reg_mmatch
1812 linenr_T sm_firstlnum; // copy of rex.reg_firstlnum
1813 linenr_T sm_maxline; // copy of rex.reg_maxline
1814 int sm_line_lbr; // copy of rex.reg_line_lbr
1815} regsubmatch_T;
1816
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001817static regsubmatch_T rsm; // can only be used when can_f_submatch is TRUE
Bram Moolenaar071d4272004-06-13 20:20:40 +00001818#endif
1819
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001820#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001821
1822/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001823 * Put the submatches in "argv[argskip]" which is a list passed into
1824 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001825 */
1826 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001827fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001828{
1829 listitem_T *li;
1830 int i;
1831 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001832 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001833
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001834 if (argcount == argskip)
1835 // called function doesn't take a submatches argument
1836 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001837
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001838 // Relies on sl_list to be the first item in staticList10_T.
1839 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001840
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001841 // There are always 10 list items in staticList10_T.
1842 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001843 for (i = 0; i < 10; ++i)
1844 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001845 s = rsm.sm_match->startp[i];
1846 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001847 s = NULL;
1848 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02001849 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001850 li->li_tv.v_type = VAR_STRING;
1851 li->li_tv.vval.v_string = s;
1852 li = li->li_next;
1853 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001854 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001855}
1856
1857 static void
1858clear_submatch_list(staticList10_T *sl)
1859{
1860 int i;
1861
1862 for (i = 0; i < 10; ++i)
1863 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1864}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001865#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001866
Bram Moolenaar071d4272004-06-13 20:20:40 +00001867/*
1868 * vim_regsub() - perform substitutions after a vim_regexec() or
1869 * vim_regexec_multi() match.
1870 *
1871 * If "copy" is TRUE really copy into "dest".
1872 * If "copy" is FALSE nothing is copied, this is just to find out the length
1873 * of the result.
1874 *
1875 * If "backslash" is TRUE, a backslash will be removed later, need to double
1876 * them to keep them, and insert a backslash before a CR to avoid it being
1877 * replaced with a line break later.
1878 *
1879 * Note: The matched text must not change between the call of
1880 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1881 * references invalid!
1882 *
1883 * Returns the size of the replacement, including terminating NUL.
1884 */
1885 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001886vim_regsub(
1887 regmatch_T *rmp,
1888 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001889 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001890 char_u *dest,
1891 int copy,
1892 int magic,
1893 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001894{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001895 int result;
1896 regexec_T rex_save;
1897 int rex_in_use_save = rex_in_use;
1898
1899 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001900 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001901 rex_save = rex;
1902 rex_in_use = TRUE;
1903
1904 rex.reg_match = rmp;
1905 rex.reg_mmatch = NULL;
1906 rex.reg_maxline = 0;
1907 rex.reg_buf = curbuf;
1908 rex.reg_line_lbr = TRUE;
1909 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1910
1911 rex_in_use = rex_in_use_save;
1912 if (rex_in_use)
1913 rex = rex_save;
1914
1915 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001916}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001917
1918 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001919vim_regsub_multi(
1920 regmmatch_T *rmp,
1921 linenr_T lnum,
1922 char_u *source,
1923 char_u *dest,
1924 int copy,
1925 int magic,
1926 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001927{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001928 int result;
1929 regexec_T rex_save;
1930 int rex_in_use_save = rex_in_use;
1931
1932 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001933 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001934 rex_save = rex;
1935 rex_in_use = TRUE;
1936
1937 rex.reg_match = NULL;
1938 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001939 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001940 rex.reg_firstlnum = lnum;
1941 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1942 rex.reg_line_lbr = FALSE;
1943 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1944
1945 rex_in_use = rex_in_use_save;
1946 if (rex_in_use)
1947 rex = rex_save;
1948
1949 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001950}
1951
1952 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001953vim_regsub_both(
1954 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001955 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001956 char_u *dest,
1957 int copy,
1958 int magic,
1959 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001960{
1961 char_u *src;
1962 char_u *dst;
1963 char_u *s;
1964 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001965 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001966 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001967 fptr_T func_all = (fptr_T)NULL;
1968 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001969 linenr_T clnum = 0; // init for GCC
1970 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001971#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001972 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001973#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001974
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001975 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001976 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001977 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001978 emsg(_(e_null));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001979 return 0;
1980 }
1981 if (prog_magic_wrong())
1982 return 0;
1983 src = source;
1984 dst = dest;
1985
1986 /*
1987 * When the substitute part starts with "\=" evaluate it as an expression.
1988 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001989 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001990 {
1991#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001992 // To make sure that the length doesn't change between checking the
1993 // length and copying the string, and to speed up things, the
1994 // resulting string is saved from the call with "copy" == FALSE to the
1995 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001996 if (copy)
1997 {
1998 if (eval_result != NULL)
1999 {
2000 STRCPY(dest, eval_result);
2001 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01002002 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002003 }
2004 }
2005 else
2006 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002007 int prev_can_f_submatch = can_f_submatch;
2008 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002009
2010 vim_free(eval_result);
2011
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002012 // The expression may contain substitute(), which calls us
2013 // recursively. Make sure submatch() gets the text from the first
2014 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002015 if (can_f_submatch)
2016 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002017 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002018 rsm.sm_match = rex.reg_match;
2019 rsm.sm_mmatch = rex.reg_mmatch;
2020 rsm.sm_firstlnum = rex.reg_firstlnum;
2021 rsm.sm_maxline = rex.reg_maxline;
2022 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002023
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002024 if (expr != NULL)
2025 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002026 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002027 char_u buf[NUMBUFLEN];
2028 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002029 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002030 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002031
2032 rettv.v_type = VAR_STRING;
2033 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002034 argv[0].v_type = VAR_LIST;
2035 argv[0].vval.v_list = &matchList.sl_list;
2036 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002037 CLEAR_FIELD(funcexe);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002038 funcexe.argv_func = fill_submatch_list;
2039 funcexe.evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002040 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002041 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002042 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002043 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002044 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002045 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002046 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002047 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002048
Bram Moolenaar6100d022016-10-02 16:51:57 +02002049 s = partial_name(partial);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002050 funcexe.partial = partial;
2051 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002052 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002053 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002054 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002055 clear_submatch_list(&matchList);
2056
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002057 if (rettv.v_type == VAR_UNKNOWN)
2058 // something failed, no need to report another error
2059 eval_result = NULL;
2060 else
2061 {
2062 eval_result = tv_get_string_buf_chk(&rettv, buf);
2063 if (eval_result != NULL)
2064 eval_result = vim_strsave(eval_result);
2065 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002066 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002067 }
2068 else
2069 eval_result = eval_to_string(source + 2, NULL, TRUE);
2070
Bram Moolenaar071d4272004-06-13 20:20:40 +00002071 if (eval_result != NULL)
2072 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002073 int had_backslash = FALSE;
2074
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002075 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002076 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002077 // Change NL to CR, so that it becomes a line break,
2078 // unless called from vim_regexec_nl().
2079 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002080 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002081 *s = CAR;
2082 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002083 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002084 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002085 /* Change NL to CR here too, so that this works:
2086 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2087 * abc\
2088 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002089 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002090 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002091 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002092 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002093 had_backslash = TRUE;
2094 }
2095 }
2096 if (had_backslash && backslash)
2097 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002098 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002099 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2100 if (s != NULL)
2101 {
2102 vim_free(eval_result);
2103 eval_result = s;
2104 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002105 }
2106
2107 dst += STRLEN(eval_result);
2108 }
2109
Bram Moolenaar6100d022016-10-02 16:51:57 +02002110 can_f_submatch = prev_can_f_submatch;
2111 if (can_f_submatch)
2112 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002113 }
2114#endif
2115 }
2116 else
2117 while ((c = *src++) != NUL)
2118 {
2119 if (c == '&' && magic)
2120 no = 0;
2121 else if (c == '\\' && *src != NUL)
2122 {
2123 if (*src == '&' && !magic)
2124 {
2125 ++src;
2126 no = 0;
2127 }
2128 else if ('0' <= *src && *src <= '9')
2129 {
2130 no = *src++ - '0';
2131 }
2132 else if (vim_strchr((char_u *)"uUlLeE", *src))
2133 {
2134 switch (*src++)
2135 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002136 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002137 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002138 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002139 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002140 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002141 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002142 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002143 continue;
2144 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002145 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002146 continue;
2147 }
2148 }
2149 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002150 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002151 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002152 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2153 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002154 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002155 if (copy)
2156 {
2157 *dst++ = c;
2158 *dst++ = *src++;
2159 *dst++ = *src++;
2160 }
2161 else
2162 {
2163 dst += 3;
2164 src += 2;
2165 }
2166 continue;
2167 }
2168
Bram Moolenaar071d4272004-06-13 20:20:40 +00002169 if (c == '\\' && *src != NUL)
2170 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002171 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002172 switch (*src)
2173 {
2174 case 'r': c = CAR; ++src; break;
2175 case 'n': c = NL; ++src; break;
2176 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002177 // Oh no! \e already has meaning in subst pat :-(
2178 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002179 case 'b': c = Ctrl_H; ++src; break;
2180
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002181 // If "backslash" is TRUE the backslash will be removed
2182 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002183 default: if (backslash)
2184 {
2185 if (copy)
2186 *dst = '\\';
2187 ++dst;
2188 }
2189 c = *src++;
2190 }
2191 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002192 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002193 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002194
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002195 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002196 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002197 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002198 func_one = (fptr_T)(func_one(&cc, c));
2199 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002200 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002201 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002202 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002203 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002204
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002205 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002206 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002207 int totlen = mb_ptr2len(src - 1);
2208
Bram Moolenaar071d4272004-06-13 20:20:40 +00002209 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002210 mb_char2bytes(cc, dst);
2211 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002212 if (enc_utf8)
2213 {
2214 int clen = utf_ptr2len(src - 1);
2215
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002216 // If the character length is shorter than "totlen", there
2217 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002218 if (clen < totlen)
2219 {
2220 if (copy)
2221 mch_memmove(dst + 1, src - 1 + clen,
2222 (size_t)(totlen - clen));
2223 dst += totlen - clen;
2224 }
2225 }
2226 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002227 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002228 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002229 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002230 dst++;
2231 }
2232 else
2233 {
2234 if (REG_MULTI)
2235 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002236 clnum = rex.reg_mmatch->startpos[no].lnum;
2237 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002238 s = NULL;
2239 else
2240 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002241 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2242 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2243 len = rex.reg_mmatch->endpos[no].col
2244 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002245 else
2246 len = (int)STRLEN(s);
2247 }
2248 }
2249 else
2250 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002251 s = rex.reg_match->startp[no];
2252 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002253 s = NULL;
2254 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002255 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002256 }
2257 if (s != NULL)
2258 {
2259 for (;;)
2260 {
2261 if (len == 0)
2262 {
2263 if (REG_MULTI)
2264 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002265 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002266 break;
2267 if (copy)
2268 *dst = CAR;
2269 ++dst;
2270 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002271 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2272 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002273 else
2274 len = (int)STRLEN(s);
2275 }
2276 else
2277 break;
2278 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002279 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002280 {
2281 if (copy)
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002282 emsg(_(e_re_damg));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002283 goto exit;
2284 }
2285 else
2286 {
2287 if (backslash && (*s == CAR || *s == '\\'))
2288 {
2289 /*
2290 * Insert a backslash in front of a CR, otherwise
2291 * it will be replaced by a line break.
2292 * Number of backslashes will be halved later,
2293 * double them here.
2294 */
2295 if (copy)
2296 {
2297 dst[0] = '\\';
2298 dst[1] = *s;
2299 }
2300 dst += 2;
2301 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002302 else
2303 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002304 if (has_mbyte)
2305 c = mb_ptr2char(s);
2306 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002307 c = *s;
2308
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002309 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002310 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002311 func_one = (fptr_T)(func_one(&cc, c));
2312 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002313 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002314 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002315 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002316 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002317
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002318 if (has_mbyte)
2319 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002320 int l;
2321
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002322 // Copy composing characters separately, one
2323 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002324 if (enc_utf8)
2325 l = utf_ptr2len(s) - 1;
2326 else
2327 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002328
2329 s += l;
2330 len -= l;
2331 if (copy)
2332 mb_char2bytes(cc, dst);
2333 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002334 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002335 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002336 *dst = cc;
2337 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002338 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002339
Bram Moolenaar071d4272004-06-13 20:20:40 +00002340 ++s;
2341 --len;
2342 }
2343 }
2344 }
2345 no = -1;
2346 }
2347 }
2348 if (copy)
2349 *dst = NUL;
2350
2351exit:
2352 return (int)((dst - dest) + 1);
2353}
2354
2355#ifdef FEAT_EVAL
2356/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002357 * Call reg_getline() with the line numbers from the submatch. If a
2358 * substitute() was used the reg_maxline and other values have been
2359 * overwritten.
2360 */
2361 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002362reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002363{
2364 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002365 linenr_T save_first = rex.reg_firstlnum;
2366 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002367
Bram Moolenaar6100d022016-10-02 16:51:57 +02002368 rex.reg_firstlnum = rsm.sm_firstlnum;
2369 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002370
2371 s = reg_getline(lnum);
2372
Bram Moolenaar6100d022016-10-02 16:51:57 +02002373 rex.reg_firstlnum = save_first;
2374 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002375 return s;
2376}
2377
2378/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002379 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002380 * allocated memory.
2381 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2382 */
2383 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002384reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002385{
2386 char_u *retval = NULL;
2387 char_u *s;
2388 int len;
2389 int round;
2390 linenr_T lnum;
2391
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002392 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002393 return NULL;
2394
Bram Moolenaar6100d022016-10-02 16:51:57 +02002395 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002396 {
2397 /*
2398 * First round: compute the length and allocate memory.
2399 * Second round: copy the text.
2400 */
2401 for (round = 1; round <= 2; ++round)
2402 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002403 lnum = rsm.sm_mmatch->startpos[no].lnum;
2404 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002405 return NULL;
2406
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002407 s = reg_getline_submatch(lnum);
2408 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002409 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002410 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002411 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002412 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002413 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002414 len = rsm.sm_mmatch->endpos[no].col
2415 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002416 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002417 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002418 ++len;
2419 }
2420 else
2421 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002422 // Multiple lines: take start line from start col, middle
2423 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002424 len = (int)STRLEN(s);
2425 if (round == 2)
2426 {
2427 STRCPY(retval, s);
2428 retval[len] = '\n';
2429 }
2430 ++len;
2431 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002432 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002433 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002434 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002435 if (round == 2)
2436 STRCPY(retval + len, s);
2437 len += (int)STRLEN(s);
2438 if (round == 2)
2439 retval[len] = '\n';
2440 ++len;
2441 }
2442 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002443 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002444 rsm.sm_mmatch->endpos[no].col);
2445 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002446 if (round == 2)
2447 retval[len] = NUL;
2448 ++len;
2449 }
2450
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002451 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002452 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002453 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002454 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002455 return NULL;
2456 }
2457 }
2458 }
2459 else
2460 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002461 s = rsm.sm_match->startp[no];
2462 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002463 retval = NULL;
2464 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002465 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002466 }
2467
2468 return retval;
2469}
Bram Moolenaar41571762014-04-02 19:00:58 +02002470
2471/*
2472 * Used for the submatch() function with the optional non-zero argument: get
2473 * the list of strings from the n'th submatch in allocated memory with NULs
2474 * represented in NLs.
2475 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2476 * command, for a non-existing submatch and for any error.
2477 */
2478 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002479reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002480{
2481 char_u *s;
2482 linenr_T slnum;
2483 linenr_T elnum;
2484 colnr_T scol;
2485 colnr_T ecol;
2486 int i;
2487 list_T *list;
2488 int error = FALSE;
2489
2490 if (!can_f_submatch || no < 0)
2491 return NULL;
2492
Bram Moolenaar6100d022016-10-02 16:51:57 +02002493 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002494 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002495 slnum = rsm.sm_mmatch->startpos[no].lnum;
2496 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002497 if (slnum < 0 || elnum < 0)
2498 return NULL;
2499
Bram Moolenaar6100d022016-10-02 16:51:57 +02002500 scol = rsm.sm_mmatch->startpos[no].col;
2501 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002502
2503 list = list_alloc();
2504 if (list == NULL)
2505 return NULL;
2506
2507 s = reg_getline_submatch(slnum) + scol;
2508 if (slnum == elnum)
2509 {
2510 if (list_append_string(list, s, ecol - scol) == FAIL)
2511 error = TRUE;
2512 }
2513 else
2514 {
2515 if (list_append_string(list, s, -1) == FAIL)
2516 error = TRUE;
2517 for (i = 1; i < elnum - slnum; i++)
2518 {
2519 s = reg_getline_submatch(slnum + i);
2520 if (list_append_string(list, s, -1) == FAIL)
2521 error = TRUE;
2522 }
2523 s = reg_getline_submatch(elnum);
2524 if (list_append_string(list, s, ecol) == FAIL)
2525 error = TRUE;
2526 }
2527 }
2528 else
2529 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002530 s = rsm.sm_match->startp[no];
2531 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002532 return NULL;
2533 list = list_alloc();
2534 if (list == NULL)
2535 return NULL;
2536 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002537 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002538 error = TRUE;
2539 }
2540
2541 if (error)
2542 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002543 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002544 return NULL;
2545 }
2546 return list;
2547}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002548#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002549
Bram Moolenaarf4140482020-02-15 23:06:45 +01002550/*
2551 * Initialize the values used for matching against multiple lines
2552 */
2553 static void
2554init_regexec_multi(
2555 regmmatch_T *rmp,
2556 win_T *win, // window in which to search or NULL
2557 buf_T *buf, // buffer in which to search
2558 linenr_T lnum) // nr of line to start looking for match
2559{
2560 rex.reg_match = NULL;
2561 rex.reg_mmatch = rmp;
2562 rex.reg_buf = buf;
2563 rex.reg_win = win;
2564 rex.reg_firstlnum = lnum;
2565 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2566 rex.reg_line_lbr = FALSE;
2567 rex.reg_ic = rmp->rmm_ic;
2568 rex.reg_icombine = FALSE;
2569 rex.reg_maxcol = rmp->rmm_maxcol;
2570}
2571
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002572#include "regexp_bt.c"
2573
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002574static regengine_T bt_regengine =
2575{
2576 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002577 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002578 bt_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002579 bt_regexec_multi,
2580 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002581};
2582
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002583#include "regexp_nfa.c"
2584
2585static regengine_T nfa_regengine =
2586{
2587 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002588 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002589 nfa_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002590 nfa_regexec_multi,
2591 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002592};
2593
// The regexp engine to use, set by vim_regcomp() from "p_re" or from a
// "\%#=" prefix in the pattern.
// The values must match with 'regexpengine'.
static int regexp_engine = 0;

#ifdef DEBUG
// Engine names for debugging messages, indexed by the engine number.
static char_u regname[][30] = {
    "AUTOMATIC Regexp Engine",
    "BACKTRACKING Regexp Engine",
    "NFA Regexp Engine"
};
#endif
2605
2606/*
2607 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002608 * Returns the program in allocated memory.
2609 * Use vim_regfree() to free the memory.
2610 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002611 */
2612 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002613vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002614{
2615 regprog_T *prog = NULL;
2616 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002617 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002618
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002619 regexp_engine = p_re;
2620
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002621 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002622 if (STRNCMP(expr, "\\%#=", 4) == 0)
2623 {
2624 int newengine = expr[4] - '0';
2625
2626 if (newengine == AUTOMATIC_ENGINE
2627 || newengine == BACKTRACKING_ENGINE
2628 || newengine == NFA_ENGINE)
2629 {
2630 regexp_engine = expr[4] - '0';
2631 expr += 5;
2632#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002633 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002634 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002635#endif
2636 }
2637 else
2638 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002639 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002640 regexp_engine = AUTOMATIC_ENGINE;
2641 }
2642 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002643#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002644 bt_regengine.expr = expr;
2645 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002646#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002647 // reg_iswordc() uses rex.reg_buf
2648 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002649
2650 /*
2651 * First try the NFA engine, unless backtracking was requested.
2652 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002653 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002654 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002655 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002656 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002657 else
2658 prog = bt_regengine.regcomp(expr, re_flags);
2659
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002660 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002661 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002662 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002663#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002664 if (regexp_engine != BACKTRACKING_ENGINE) // debugging log for NFA
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002665 {
2666 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002667 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002668 if (f)
2669 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002670 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002671 fclose(f);
2672 }
2673 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002674 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002675 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002676 }
2677#endif
2678 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002679 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002680 * The NFA engine also fails for patterns that it can't handle well
2681 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002682 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002683 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002684 if (regexp_engine == AUTOMATIC_ENGINE
2685 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002686 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002687 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002688 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002689 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002690 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002691
Bram Moolenaarfda37292014-11-05 14:27:36 +01002692 if (prog != NULL)
2693 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002694 // Store the info needed to call regcomp() again when the engine turns
2695 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002696 prog->re_engine = regexp_engine;
2697 prog->re_flags = re_flags;
2698 }
2699
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002700 return prog;
2701}
2702
2703/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002704 * Free a compiled regexp program, returned by vim_regcomp().
2705 */
2706 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002707vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002708{
2709 if (prog != NULL)
2710 prog->engine->regfree(prog);
2711}
2712
#if defined(EXITFREE) || defined(PROTO)
/*
 * Free the memory that is kept in the file-level variables of the regexp
 * code: the "regstack" and "backpos" growarrays and the "reg_tofree" and
 * "reg_prev_sub" buffers.
 */
    void
free_regexp_stuff(void)
{
    // growarrays
    ga_clear(&regstack);
    ga_clear(&backpos);

    // allocated buffers
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);
}
#endif
2723
#ifdef FEAT_EVAL
/*
 * When "p_verbose" is above zero: give a message that the backtracking
 * engine is going to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
        return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2737
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
        || defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog", which is set while the program is
 * being executed.
 * "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    int     busy = prog->re_in_use;

    return busy;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002749
2750/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002751 * Match a regexp against a string.
2752 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002753 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002754 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002755 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002756 *
2757 * Return TRUE if there is a match, FALSE if not.
2758 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002759 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002760vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002761 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002762 char_u *line, // string to match against
2763 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002764 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002765{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002766 int result;
2767 regexec_T rex_save;
2768 int rex_in_use_save = rex_in_use;
2769
Bram Moolenaar0270f382018-07-17 05:43:58 +02002770 // Cannot use the same prog recursively, it contains state.
2771 if (rmp->regprog->re_in_use)
2772 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002773 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002774 return FALSE;
2775 }
2776 rmp->regprog->re_in_use = TRUE;
2777
Bram Moolenaar6100d022016-10-02 16:51:57 +02002778 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002779 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002780 rex_save = rex;
2781 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002782
Bram Moolenaar6100d022016-10-02 16:51:57 +02002783 rex.reg_startp = NULL;
2784 rex.reg_endp = NULL;
2785 rex.reg_startpos = NULL;
2786 rex.reg_endpos = NULL;
2787
2788 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002789 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002790
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002791 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002792 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2793 && result == NFA_TOO_EXPENSIVE)
2794 {
2795 int save_p_re = p_re;
2796 int re_flags = rmp->regprog->re_flags;
2797 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2798
2799 p_re = BACKTRACKING_ENGINE;
2800 vim_regfree(rmp->regprog);
2801 if (pat != NULL)
2802 {
2803#ifdef FEAT_EVAL
2804 report_re_switch(pat);
2805#endif
2806 rmp->regprog = vim_regcomp(pat, re_flags);
2807 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002808 {
2809 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002810 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002811 rmp->regprog->re_in_use = FALSE;
2812 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002813 vim_free(pat);
2814 }
2815
2816 p_re = save_p_re;
2817 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002818
2819 rex_in_use = rex_in_use_save;
2820 if (rex_in_use)
2821 rex = rex_save;
2822
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002823 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002824}
2825
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002826/*
2827 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002828 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002829 */
2830 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002831vim_regexec_prog(
2832 regprog_T **prog,
2833 int ignore_case,
2834 char_u *line,
2835 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002836{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002837 int r;
2838 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002839
2840 regmatch.regprog = *prog;
2841 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002842 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002843 *prog = regmatch.regprog;
2844 return r;
2845}
2846
2847/*
2848 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002849 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002850 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002851 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002852vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002853{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002854 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002855}
2856
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002857/*
2858 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002859 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002860 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002861 */
2862 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002863vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002864{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002865 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002866}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002867
2868/*
2869 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002870 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2871 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002872 * Uses curbuf for line count and 'iskeyword'.
2873 *
2874 * Return zero if there is no match. Return number of lines contained in the
2875 * match otherwise.
2876 */
2877 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002878vim_regexec_multi(
2879 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002880 win_T *win, // window in which to search or NULL
2881 buf_T *buf, // buffer in which to search
2882 linenr_T lnum, // nr of line to start looking for match
2883 colnr_T col, // column to start looking for match
2884 proftime_T *tm, // timeout limit or NULL
2885 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002886{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002887 int result;
2888 regexec_T rex_save;
2889 int rex_in_use_save = rex_in_use;
2890
Bram Moolenaar0270f382018-07-17 05:43:58 +02002891 // Cannot use the same prog recursively, it contains state.
2892 if (rmp->regprog->re_in_use)
2893 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002894 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002895 return FALSE;
2896 }
2897 rmp->regprog->re_in_use = TRUE;
2898
Bram Moolenaar6100d022016-10-02 16:51:57 +02002899 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002900 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002901 rex_save = rex;
2902 rex_in_use = TRUE;
2903
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002904 result = rmp->regprog->engine->regexec_multi(
2905 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002906 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002907
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002908 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002909 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2910 && result == NFA_TOO_EXPENSIVE)
2911 {
2912 int save_p_re = p_re;
2913 int re_flags = rmp->regprog->re_flags;
2914 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2915
2916 p_re = BACKTRACKING_ENGINE;
2917 vim_regfree(rmp->regprog);
2918 if (pat != NULL)
2919 {
2920#ifdef FEAT_EVAL
2921 report_re_switch(pat);
2922#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002923#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002924 // checking for \z misuse was already done when compiling for NFA,
2925 // allow all here
2926 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002927#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002928 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002929#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002930 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002931#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002932
Bram Moolenaarfda37292014-11-05 14:27:36 +01002933 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002934 {
2935 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002936 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002937 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002938 rmp->regprog->re_in_use = FALSE;
2939 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002940 vim_free(pat);
2941 }
2942 p_re = save_p_re;
2943 }
2944
Bram Moolenaar6100d022016-10-02 16:51:57 +02002945 rex_in_use = rex_in_use_save;
2946 if (rex_in_use)
2947 rex = rex_save;
2948
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002949 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002950}