blob: 3911c906165f1f1ae0139fb45bd499672285f6fc [file] [log] [blame]
/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
 */
5
// By default: do not create debugging logs or files related to regular
// expressions, even when compiling with -DDEBUG.
// Uncomment the second line to get the regexp debugging.
#undef DEBUG
// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
#ifdef DEBUG
// show/save debugging data when BT engine is used
# define BT_REGEXP_DUMP
// save the debugging data to a file instead of displaying it
# define BT_REGEXP_LOG
# define BT_REGEXP_DEBUG_LOG
# define BT_REGEXP_DEBUG_LOG_NAME	"bt_regexp_debug.log"
#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning, they don't match literally.
 * Magic characters are negative.  This separates them from literal characters
 * (possibly multi-byte).  Only ASCII characters can be Magic.
 *
 * Magic(x)	make character "x" Magic by subtracting 256
 * un_Magic(x)	undo Magic() by adding 256 again
 * is_Magic(x)	non-zero when "x" is negative, thus Magic
 */
#define Magic(x)	((int)(x) - 256)
#define un_Magic(x)	((x) + 256)
#define is_Magic(x)	((x) < 0)
31
/*
 * Return the non-Magic value of "x": when "x" is Magic it is converted back
 * with un_Magic(), otherwise "x" is returned unchanged.
 */
    static int
no_Magic(int x)
{
    if (is_Magic(x))
	return un_Magic(x);
    return x;
}
39
/*
 * Flip the magicness of "x": a Magic character becomes its literal
 * counterpart and a literal character becomes Magic.
 */
    static int
toggle_Magic(int x)
{
    if (is_Magic(x))
	return un_Magic(x);
    return Magic(x);
}
47
/*
 * The first byte of the BT regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.  It's used to catch the
 * most severe mutilation of the program by the caller.
 */

#define REGMAGIC	0234

/*
 * Utility definitions.
 */
// Value of the byte at "p", read through a "char_u" pointer and widened to int.
#define UCHARAT(p)	((int)*(char_u *)(p))
60
// Used for an error (down from) vim_regcomp(): give the error message, set
// rc_did_emsg and return NULL or FAIL.
// The EMSG2/EMSG3 variants pass "" for the first "%s" of the message when "c"
// is non-zero and "\\" otherwise.
#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
Bram Moolenaar071d4272004-06-13 20:20:40 +000070
Bram Moolenaar95f09602016-11-10 20:01:45 +010071
Bram Moolenaar071d4272004-06-13 20:20:40 +000072#define MAX_LIMIT (32767L << 16L)
73
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020074static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
Bram Moolenaar966e58e2017-06-05 16:54:08 +020075static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76static char_u e_large_class[] = N_("E945: Range too large in character class");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020077static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
78static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
79static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020080#ifdef FEAT_SYN_HL
Bram Moolenaar5de820b2013-06-02 15:01:57 +020081static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
Bram Moolenaarbcf94422018-06-23 14:21:42 +020082static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020083#endif
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +020084static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
Bram Moolenaar2976c022013-06-05 21:30:37 +020085static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
Bram Moolenaar0270f382018-07-17 05:43:58 +020086static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
87
// return values for re_multi_type()
#define NOT_MULTI	0
#define MULTI_ONE	1
#define MULTI_MULT	2

// return values for regmatch()
#define RA_FAIL		1	// something failed, abort
#define RA_CONT		2	// continue in inner loop
#define RA_BREAK	3	// break inner loop
#define RA_MATCH	4	// successful match
#define RA_NOMATCH	5	// didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020098
Bram Moolenaar071d4272004-06-13 20:20:40 +000099/*
100 * Return NOT_MULTI if c is not a "multi" operator.
101 * Return MULTI_ONE if c is a single "multi" operator.
102 * Return MULTI_MULT if c is a multi "multi" operator.
103 */
104 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100105re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000106{
107 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
108 return MULTI_ONE;
109 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
110 return MULTI_MULT;
111 return NOT_MULTI;
112}
113
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000114static char_u *reg_prev_sub = NULL;
115
Bram Moolenaar071d4272004-06-13 20:20:40 +0000116/*
117 * REGEXP_INRANGE contains all characters which are always special in a []
118 * range after '\'.
119 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
120 * These are:
121 * \n - New line (NL).
122 * \r - Carriage Return (CR).
123 * \t - Tab (TAB).
124 * \e - Escape (ESC).
125 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000126 * \d - Character code in decimal, eg \d123
127 * \o - Character code in octal, eg \o80
128 * \x - Character code in hex, eg \x4a
129 * \u - Multibyte character code, eg \u20ac
130 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000131 */
132static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000133static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000134
Bram Moolenaar071d4272004-06-13 20:20:40 +0000135/*
136 * Translate '\x' to its control character, except "\n", which is Magic.
137 */
138 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100139backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000140{
141 switch (c)
142 {
143 case 'r': return CAR;
144 case 't': return TAB;
145 case 'e': return ESC;
146 case 'b': return BS;
147 }
148 return c;
149}
150
151/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000152 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000153 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
154 * recognized. Otherwise "pp" is advanced to after the item.
155 */
156 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100157get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000158{
159 static const char *(class_names[]) =
160 {
161 "alnum:]",
162#define CLASS_ALNUM 0
163 "alpha:]",
164#define CLASS_ALPHA 1
165 "blank:]",
166#define CLASS_BLANK 2
167 "cntrl:]",
168#define CLASS_CNTRL 3
169 "digit:]",
170#define CLASS_DIGIT 4
171 "graph:]",
172#define CLASS_GRAPH 5
173 "lower:]",
174#define CLASS_LOWER 6
175 "print:]",
176#define CLASS_PRINT 7
177 "punct:]",
178#define CLASS_PUNCT 8
179 "space:]",
180#define CLASS_SPACE 9
181 "upper:]",
182#define CLASS_UPPER 10
183 "xdigit:]",
184#define CLASS_XDIGIT 11
185 "tab:]",
186#define CLASS_TAB 12
187 "return:]",
188#define CLASS_RETURN 13
189 "backspace:]",
190#define CLASS_BACKSPACE 14
191 "escape:]",
192#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100193 "ident:]",
194#define CLASS_IDENT 16
195 "keyword:]",
196#define CLASS_KEYWORD 17
197 "fname:]",
198#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000199 };
200#define CLASS_NONE 99
201 int i;
202
203 if ((*pp)[1] == ':')
204 {
Bram Moolenaar78a15312009-05-15 19:33:18 +0000205 for (i = 0; i < (int)(sizeof(class_names) / sizeof(*class_names)); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000206 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
207 {
208 *pp += STRLEN(class_names[i]) + 2;
209 return i;
210 }
211 }
212 return CLASS_NONE;
213}
214
/*
 * Specific version of character class functions.
 * Using a table to keep this fast.
 * class_tab[] holds the RI_ flags for each byte value; it is filled by
 * init_class_tab().
 */
static short	class_tab[256];

#define	    RI_DIGIT	0x01
#define	    RI_HEX	0x02
#define	    RI_OCTAL	0x04
#define	    RI_WORD	0x08
#define	    RI_HEAD	0x10
#define	    RI_ALPHA	0x20
#define	    RI_LOWER	0x40
#define	    RI_UPPER	0x80
#define	    RI_WHITE	0x100
230
231 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100232init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000233{
234 int i;
235 static int done = FALSE;
236
237 if (done)
238 return;
239
240 for (i = 0; i < 256; ++i)
241 {
242 if (i >= '0' && i <= '7')
243 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
244 else if (i >= '8' && i <= '9')
245 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
246 else if (i >= 'a' && i <= 'f')
247 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
248#ifdef EBCDIC
249 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
250 || (i >= 's' && i <= 'z'))
251#else
252 else if (i >= 'g' && i <= 'z')
253#endif
254 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
255 else if (i >= 'A' && i <= 'F')
256 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
257#ifdef EBCDIC
258 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
259 || (i >= 'S' && i <= 'Z'))
260#else
261 else if (i >= 'G' && i <= 'Z')
262#endif
263 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
264 else if (i == '_')
265 class_tab[i] = RI_WORD + RI_HEAD;
266 else
267 class_tab[i] = 0;
268 }
269 class_tab[' '] |= RI_WHITE;
270 class_tab['\t'] |= RI_WHITE;
271 done = TRUE;
272}
273
// Table based character class tests.  "c" must be an expression without side
// effects, it is evaluated more than once.
// Values outside of 0 - 255 never match.  The lower bound matters because
// Magic characters are negative and must not be used to index class_tab[].
#define ri_digit(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
#define ri_hex(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEX))
#define ri_octal(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
#define ri_word(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WORD))
#define ri_head(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEAD))
#define ri_alpha(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
#define ri_lower(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_LOWER))
#define ri_upper(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_UPPER))
#define ri_white(c)	((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000283
// flags for regflags
#define RF_ICASE    1	// ignore case
#define RF_NOICASE  2	// don't ignore case
#define RF_HASNL    4	// can match a NL
#define RF_ICOMBINE 8	// ignore combining characters
#define RF_LOOKBH   16	// uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000290
291/*
292 * Global work variables for vim_regcomp().
293 */
294
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295static char_u *regparse; // Input-scan pointer.
296static int regnpar; // () count.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000297#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100298static int regnzpar; // \z() count.
299static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000300#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100301static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000302#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100303static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000304#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000305
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100306static int reg_magic; // magicness of the pattern:
307#define MAGIC_NONE 1 // "\V" very unmagic
308#define MAGIC_OFF 2 // "\M" or 'magic' off
309#define MAGIC_ON 3 // "\m" or 'magic'
310#define MAGIC_ALL 4 // "\v" very magic
Bram Moolenaar071d4272004-06-13 20:20:40 +0000311
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100312static int reg_string; // matching with a string instead of a buffer
313 // line
314static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000315
316/*
317 * META contains all characters that may be magic, except '^' and '$'.
318 */
319
320#ifdef EBCDIC
321static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
322#else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100323// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000324static char_u META_flags[] = {
325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100327// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000328 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100329// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000330 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100331// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000332 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100333// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000334 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100335// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000336 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100337// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000338 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
339};
340#endif
341
static int	curchr;		// currently parsed character
// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, eg in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int	prevchr;
static int	prevprevchr;	// previous-previous character
static int	nextchr;	// used for ungetchr()

// arguments for reg()
#define REG_NOPAREN	0	// toplevel reg()
#define REG_PAREN	1	// \(\)
#define REG_ZPAREN	2	// \z(\)
#define REG_NPAREN	3	// \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000355
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200356typedef struct
357{
358 char_u *regparse;
359 int prevchr_len;
360 int curchr;
361 int prevchr;
362 int prevprevchr;
363 int nextchr;
364 int at_start;
365 int prev_at_start;
366 int regnpar;
367} parse_state_T;
368
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100369static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100370static int getchr(void);
371static void skipchr_keepstart(void);
372static int peekchr(void);
373static void skipchr(void);
374static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100375static long gethexchrs(int maxinputlen);
376static long getoctchrs(void);
377static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100378static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100379static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200380static int cstrncmp(char_u *s1, char_u *s2, int *n);
381static char_u *cstrchr(char_u *, int);
382static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100383static int reg_iswordc(int);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200385static regengine_T bt_regengine;
386static regengine_T nfa_regengine;
387
Bram Moolenaar071d4272004-06-13 20:20:40 +0000388/*
389 * Return TRUE if compiled regular expression "prog" can match a line break.
390 */
391 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100392re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000393{
394 return (prog->regflags & RF_HASNL);
395}
396
397/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000398 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
399 * Returns a character representing the class. Zero means that no item was
400 * recognized. Otherwise "pp" is advanced to after the item.
401 */
402 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100403get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000404{
405 int c;
406 int l = 1;
407 char_u *p = *pp;
408
Bram Moolenaar985079c2019-02-16 17:07:47 +0100409 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000410 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000411 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000412 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000413 if (p[l + 2] == '=' && p[l + 3] == ']')
414 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000415 if (has_mbyte)
416 c = mb_ptr2char(p + 2);
417 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000418 c = p[2];
419 *pp += l + 4;
420 return c;
421 }
422 }
423 return 0;
424}
425
#ifdef EBCDIC
/*
 * Table for equivalence class "c". (IBM-1047)
 * Each entry starts with the plain ASCII letter, followed by the byte values
 * listed for that letter.
 */
static char *EQUIVAL_CLASS_C[16] = {
    "A\x62\x63\x64\x65\x66\x67",
    "C\x68",
    "E\x71\x72\x73\x74",
    "I\x75\x76\x77\x78",
    "N\x69",
    "O\xEB\xEC\xED\xEE\xEF\x80",
    "U\xFB\xFC\xFD\xFE",
    "Y\xBA",
    "a\x42\x43\x44\x45\x46\x47",
    "c\x48",
    "e\x51\x52\x53\x54",
    "i\x55\x56\x57\x58",
    "n\x49",
    "o\xCB\xCC\xCD\xCE\xCF\x70",
    "u\xDB\xDC\xDD\xDE",
    "y\x8D\xDF",
};
#endif
449
Bram Moolenaardf177f62005-02-22 08:39:57 +0000450/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000451 * Check for a collating element "[.a.]". "pp" points to the '['.
452 * Returns a character. Zero means that no item was recognized. Otherwise
453 * "pp" is advanced to after the item.
454 * Currently only single characters are recognized!
455 */
456 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100457get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000458{
459 int c;
460 int l = 1;
461 char_u *p = *pp;
462
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100463 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000464 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000465 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000466 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000467 if (p[l + 2] == '.' && p[l + 3] == ']')
468 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000469 if (has_mbyte)
470 c = mb_ptr2char(p + 2);
471 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000472 c = p[2];
473 *pp += l + 4;
474 return c;
475 }
476 }
477 return 0;
478}
479
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100480static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
481static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200482
483 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100484get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200485{
486 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
487 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
488}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000489
490/*
491 * Skip over a "[]" range.
492 * "p" must point to the character after the '['.
493 * The returned pointer is on the matching ']', or the terminating NUL.
494 */
495 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100496skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000497{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000498 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000499
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100500 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000501 ++p;
502 if (*p == ']' || *p == '-')
503 ++p;
504 while (*p != NUL && *p != ']')
505 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000506 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000507 p += l;
508 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000509 if (*p == '-')
510 {
511 ++p;
512 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100513 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000514 }
515 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200516 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000517 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200518 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000519 p += 2;
520 else if (*p == '[')
521 {
522 if (get_char_class(&p) == CLASS_NONE
523 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200524 && get_coll_element(&p) == 0
525 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100526 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000527 }
528 else
529 ++p;
530 }
531
532 return p;
533}
534
535/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000536 * Skip past regular expression.
Bram Moolenaar748bf032005-02-02 23:04:36 +0000537 * Stop at end of "startp" or where "dirc" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000538 * Take care of characters with a backslash in front of it.
539 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000540 */
541 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100542skip_regexp(
543 char_u *startp,
544 int dirc,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200545 int magic)
546{
547 return skip_regexp_ex(startp, dirc, magic, NULL, NULL);
548}
549
550/*
551 * skip_regexp() with extra arguments:
552 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
553 * expression and change "\?" to "?". If "*newp" is not NULL the expression
554 * is changed in-place.
555 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
556 */
557 char_u *
558skip_regexp_ex(
559 char_u *startp,
560 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100561 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200562 char_u **newp,
563 int *dropped)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000564{
565 int mymagic;
566 char_u *p = startp;
567
568 if (magic)
569 mymagic = MAGIC_ON;
570 else
571 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200572 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000573
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100574 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000575 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100576 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000577 break;
578 if ((p[0] == '[' && mymagic >= MAGIC_ON)
579 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
580 {
581 p = skip_anyof(p + 1);
582 if (p[0] == NUL)
583 break;
584 }
585 else if (p[0] == '\\' && p[1] != NUL)
586 {
587 if (dirc == '?' && newp != NULL && p[1] == '?')
588 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100589 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000590 if (*newp == NULL)
591 {
592 *newp = vim_strsave(startp);
593 if (*newp != NULL)
594 p = *newp + (p - startp);
595 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200596 if (dropped != NULL)
597 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000598 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000599 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000600 else
601 ++p;
602 }
603 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100604 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000605 if (*p == 'v')
606 mymagic = MAGIC_ALL;
607 else if (*p == 'V')
608 mymagic = MAGIC_NONE;
609 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000610 }
611 return p;
612}
613
/*
 * Functions for getting characters from the regexp input.
 */
static int	prevchr_len;	// byte length of previous char
static int	at_start;	// True when on the first character
static int	prev_at_start;  // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100620
Bram Moolenaar071d4272004-06-13 20:20:40 +0000621/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200622 * Start parsing at "str".
623 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000624 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100625initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000626{
627 regparse = str;
628 prevchr_len = 0;
629 curchr = prevprevchr = prevchr = nextchr = -1;
630 at_start = TRUE;
631 prev_at_start = FALSE;
632}
633
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200634/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200635 * Save the current parse state, so that it can be restored and parsing
636 * starts in the same state again.
637 */
638 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100639save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200640{
641 ps->regparse = regparse;
642 ps->prevchr_len = prevchr_len;
643 ps->curchr = curchr;
644 ps->prevchr = prevchr;
645 ps->prevprevchr = prevprevchr;
646 ps->nextchr = nextchr;
647 ps->at_start = at_start;
648 ps->prev_at_start = prev_at_start;
649 ps->regnpar = regnpar;
650}
651
652/*
653 * Restore a previously saved parse state.
654 */
655 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100656restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200657{
658 regparse = ps->regparse;
659 prevchr_len = ps->prevchr_len;
660 curchr = ps->curchr;
661 prevchr = ps->prevchr;
662 prevprevchr = ps->prevprevchr;
663 nextchr = ps->nextchr;
664 at_start = ps->at_start;
665 prev_at_start = ps->prev_at_start;
666 regnpar = ps->regnpar;
667}
668
669
670/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200671 * Get the next character without advancing.
672 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000673 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100674peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000675{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000676 static int after_slash = FALSE;
677
Bram Moolenaar071d4272004-06-13 20:20:40 +0000678 if (curchr == -1)
679 {
680 switch (curchr = regparse[0])
681 {
682 case '.':
683 case '[':
684 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100685 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000686 if (reg_magic >= MAGIC_ON)
687 curchr = Magic(curchr);
688 break;
689 case '(':
690 case ')':
691 case '{':
692 case '%':
693 case '+':
694 case '=':
695 case '?':
696 case '@':
697 case '!':
698 case '&':
699 case '|':
700 case '<':
701 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100702 case '#': // future ext.
703 case '"': // future ext.
704 case '\'': // future ext.
705 case ',': // future ext.
706 case '-': // future ext.
707 case ':': // future ext.
708 case ';': // future ext.
709 case '`': // future ext.
710 case '/': // Can't be used in / command
711 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000712 if (reg_magic == MAGIC_ALL)
713 curchr = Magic(curchr);
714 break;
715 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100716 // * is not magic as the very first character, eg "?*ptr", when
717 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
718 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000719 if (reg_magic >= MAGIC_ON
720 && !at_start
721 && !(prev_at_start && prevchr == Magic('^'))
722 && (after_slash
723 || (prevchr != Magic('(')
724 && prevchr != Magic('&')
725 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000726 curchr = Magic('*');
727 break;
728 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100729 // '^' is only magic as the very first character and if it's after
730 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000731 if (reg_magic >= MAGIC_OFF
732 && (at_start
733 || reg_magic == MAGIC_ALL
734 || prevchr == Magic('(')
735 || prevchr == Magic('|')
736 || prevchr == Magic('&')
737 || prevchr == Magic('n')
738 || (no_Magic(prevchr) == '('
739 && prevprevchr == Magic('%'))))
740 {
741 curchr = Magic('^');
742 at_start = TRUE;
743 prev_at_start = FALSE;
744 }
745 break;
746 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100747 // '$' is only magic as the very last char and if it's in front of
748 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000749 if (reg_magic >= MAGIC_OFF)
750 {
751 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200752 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000753
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100754 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000755 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200756 || p[1] == 'm' || p[1] == 'M'
757 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
758 {
759 if (p[1] == 'v')
760 is_magic_all = TRUE;
761 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
762 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000763 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200764 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000765 if (p[0] == NUL
766 || (p[0] == '\\'
767 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
768 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200769 || (is_magic_all
770 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000771 || reg_magic == MAGIC_ALL)
772 curchr = Magic('$');
773 }
774 break;
775 case '\\':
776 {
777 int c = regparse[1];
778
779 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100780 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000781 else if (
782#ifdef EBCDIC
783 vim_strchr(META, c)
784#else
785 c <= '~' && META_flags[c]
786#endif
787 )
788 {
789 /*
790 * META contains everything that may be magic sometimes,
791 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200792 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000793 * magicness. Therefore, \ is so meta-magic that it is
794 * not in META.
795 */
796 curchr = -1;
797 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100798 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000799 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000800 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000801 peekchr();
802 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000803 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000804 curchr = toggle_Magic(curchr);
805 }
806 else if (vim_strchr(REGEXP_ABBR, c))
807 {
808 /*
809 * Handle abbreviations, like "\t" for TAB -- webb
810 */
811 curchr = backslash_trans(c);
812 }
813 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
814 curchr = toggle_Magic(c);
815 else
816 {
817 /*
818 * Next character can never be (made) magic?
819 * Then backslashing it won't do anything.
820 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000821 if (has_mbyte)
822 curchr = (*mb_ptr2char)(regparse + 1);
823 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000824 curchr = c;
825 }
826 break;
827 }
828
Bram Moolenaar071d4272004-06-13 20:20:40 +0000829 default:
830 if (has_mbyte)
831 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000832 }
833 }
834
835 return curchr;
836}
837
838/*
839 * Eat one lexed character. Do this in a way that we can undo it.
840 */
841 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100842skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000843{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100844 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000845 if (*regparse == '\\')
846 prevchr_len = 1;
847 else
848 prevchr_len = 0;
849 if (regparse[prevchr_len] != NUL)
850 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000851 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100852 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000853 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000854 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000855 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000856 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000857 ++prevchr_len;
858 }
859 regparse += prevchr_len;
860 prev_at_start = at_start;
861 at_start = FALSE;
862 prevprevchr = prevchr;
863 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100864 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000865 nextchr = -1;
866}
867
868/*
869 * Skip a character while keeping the value of prev_at_start for at_start.
870 * prevchr and prevprevchr are also kept.
871 */
872 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100873skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000874{
875 int as = prev_at_start;
876 int pr = prevchr;
877 int prpr = prevprevchr;
878
879 skipchr();
880 at_start = as;
881 prevchr = pr;
882 prevprevchr = prpr;
883}
884
/*
 * Lexical analyzer entry point: return the next character of the pattern as
 * seen by peekchr() (which knows about magic) and advance past it.
 */
    static int
getchr(void)
{
    int lexed = peekchr();

    skipchr();	    // consume what was just peeked at
    return lexed;
}
897
898/*
899 * put character back. Works only once!
900 */
901 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100902ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000903{
904 nextchr = curchr;
905 curchr = prevchr;
906 prevchr = prevprevchr;
907 at_start = prev_at_start;
908 prev_at_start = FALSE;
909
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100910 // Backup regparse, so that it's at the same position as before the
911 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000912 regparse -= prevchr_len;
913}
914
915/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000916 * Get and return the value of the hex string at the current position.
917 * Return -1 if there is no valid hex number.
918 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000919 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000920 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000921 * The parameter controls the maximum number of input characters. This will be
922 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
923 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100924 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100925gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000926{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100927 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000928 int c;
929 int i;
930
931 for (i = 0; i < maxinputlen; ++i)
932 {
933 c = regparse[0];
934 if (!vim_isxdigit(c))
935 break;
936 nr <<= 4;
937 nr |= hex2nr(c);
938 ++regparse;
939 }
940
941 if (i == 0)
942 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100943 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000944}
945
946/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200947 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000948 * current position. Return -1 for invalid. Consumes all digits.
949 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100950 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100951getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000952{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100953 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000954 int c;
955 int i;
956
957 for (i = 0; ; ++i)
958 {
959 c = regparse[0];
960 if (c < '0' || c > '9')
961 break;
962 nr *= 10;
963 nr += c - '0';
964 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100965 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000966 }
967
968 if (i == 0)
969 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100970 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000971}
972
973/*
974 * get and return the value of the octal string immediately after the current
975 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
976 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
977 * treat 8 or 9 as recognised characters. Position is updated:
978 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000979 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000980 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100981 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100982getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000983{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100984 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000985 int c;
986 int i;
987
988 for (i = 0; i < 3 && nr < 040; ++i)
989 {
990 c = regparse[0];
991 if (c < '0' || c > '7')
992 break;
993 nr <<= 3;
994 nr |= hex2nr(c);
995 ++regparse;
996 }
997
998 if (i == 0)
999 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001000 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001001}
1002
1003/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001004 * read_limits - Read two integers to be taken as a minimum and maximum.
1005 * If the first character is '-', then the range is reversed.
1006 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1007 * missing, a very big number is the default.
1008 */
1009 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001010read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001011{
1012 int reverse = FALSE;
1013 char_u *first_char;
1014 long tmp;
1015
1016 if (*regparse == '-')
1017 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001018 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001019 regparse++;
1020 reverse = TRUE;
1021 }
1022 first_char = regparse;
1023 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001024 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001025 {
1026 if (vim_isdigit(*++regparse))
1027 *maxval = getdigits(&regparse);
1028 else
1029 *maxval = MAX_LIMIT;
1030 }
1031 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001032 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001033 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001034 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001035 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001036 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001037 if (*regparse != '}')
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001038 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1039 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001040
1041 /*
1042 * Reverse the range if there was a '-', or make sure it is in the right
1043 * order otherwise.
1044 */
1045 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1046 {
1047 tmp = *minval;
1048 *minval = *maxval;
1049 *maxval = tmp;
1050 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001051 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001052 return OK;
1053}
1054
1055/*
1056 * vim_regexec and friends
1057 */
1058
1059/*
1060 * Global work variables for vim_regexec().
1061 */
1062
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001063static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001064#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001065static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001066#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001067static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001068static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001069
1070/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001071 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1072 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001073 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001074 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001075static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001076static unsigned reg_tofreelen;
1077
1078/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001079 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001080 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001081 * done:
1082 * single-line multi-line
1083 * reg_match &regmatch_T NULL
1084 * reg_mmatch NULL &regmmatch_T
1085 * reg_startp reg_match->startp <invalid>
1086 * reg_endp reg_match->endp <invalid>
1087 * reg_startpos <invalid> reg_mmatch->startpos
1088 * reg_endpos <invalid> reg_mmatch->endpos
1089 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001090 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001091 * reg_firstlnum <invalid> first line in which to search
1092 * reg_maxline 0 last line nr
1093 * reg_line_lbr FALSE or TRUE FALSE
1094 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001095typedef struct {
1096 regmatch_T *reg_match;
1097 regmmatch_T *reg_mmatch;
1098 char_u **reg_startp;
1099 char_u **reg_endp;
1100 lpos_T *reg_startpos;
1101 lpos_T *reg_endpos;
1102 win_T *reg_win;
1103 buf_T *reg_buf;
1104 linenr_T reg_firstlnum;
1105 linenr_T reg_maxline;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001106 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001107
Bram Moolenaar0270f382018-07-17 05:43:58 +02001108 // The current match-position is stord in these variables:
1109 linenr_T lnum; // line number, relative to first line
1110 char_u *line; // start of current line
1111 char_u *input; // current input, points into "regline"
1112
1113 int need_clear_subexpr; // subexpressions still need to be cleared
1114#ifdef FEAT_SYN_HL
1115 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1116 // cleared
1117#endif
1118
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001119 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1120 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1121 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001122 int reg_ic;
1123
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001124 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1125 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001126 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001127
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001128 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1129 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001130 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001131
1132 // State for the NFA engine regexec.
1133 int nfa_has_zend; // NFA regexp \ze operator encountered.
1134 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1135 int nfa_nsubexpr; // Number of sub expressions actually being used
1136 // during execution. 1 if only the whole match
1137 // (subexpr 0) is used.
1138 // listid is global, so that it increases on recursive calls to
1139 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1140 // all the states.
1141 int nfa_listid;
1142 int nfa_alt_listid;
1143
1144#ifdef FEAT_SYN_HL
1145 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1146#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001147} regexec_T;
1148
1149static regexec_T rex;
1150static int rex_in_use = FALSE;
1151
Bram Moolenaar071d4272004-06-13 20:20:40 +00001152/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001153 * Return TRUE if character 'c' is included in 'iskeyword' option for
1154 * "reg_buf" buffer.
1155 */
1156 static int
1157reg_iswordc(int c)
1158{
1159 return vim_iswordc_buf(c, rex.reg_buf);
1160}
1161
1162/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001163 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1164 */
1165 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001166reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001167{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001168 // when looking behind for a match/no-match lnum is negative. But we
1169 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001170 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001171 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001172 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001173 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001174 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001175 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001176}
1177
#ifdef FEAT_SYN_HL
// Workspace for the \z(...\) matches, one entry per subexpression.
static char_u	*reg_startzp[NSUBEXP];	    // start of each match (pointer)
static char_u	*reg_endzp[NSUBEXP];	    // end of each match (pointer)
static lpos_T	reg_startzpos[NSUBEXP];	    // idem, start as a position
static lpos_T	reg_endzpos[NSUBEXP];	    // idem, end as a position
#endif
1184
/*
 * TRUE when a multi-line regexp is being used: "rex.reg_match" is only set
 * for a single-line match.
 */
#define REG_MULTI	(rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001187
Bram Moolenaar071d4272004-06-13 20:20:40 +00001188#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001189/*
1190 * Create a new extmatch and mark it as referenced once.
1191 */
1192 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001193make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001194{
1195 reg_extmatch_T *em;
1196
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001197 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001198 if (em != NULL)
1199 em->refcnt = 1;
1200 return em;
1201}
1202
1203/*
1204 * Add a reference to an extmatch.
1205 */
1206 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001207ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001208{
1209 if (em != NULL)
1210 em->refcnt++;
1211 return em;
1212}
1213
1214/*
1215 * Remove a reference to an extmatch. If there are no references left, free
1216 * the info.
1217 */
1218 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001219unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001220{
1221 int i;
1222
1223 if (em != NULL && --em->refcnt <= 0)
1224 {
1225 for (i = 0; i < NSUBEXP; ++i)
1226 vim_free(em->matches[i]);
1227 vim_free(em);
1228 }
1229}
1230#endif
1231
1232/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001233 * Get class of previous character.
1234 */
1235 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001236reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001237{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001238 if (rex.input > rex.line)
1239 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001240 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001241 return -1;
1242}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001243
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001244/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001245 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001246 */
1247 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001248reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001249{
1250 pos_T top, bot;
1251 linenr_T lnum;
1252 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001253 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001254 int mode;
1255 colnr_T start, end;
1256 colnr_T start2, end2;
1257 colnr_T cols;
1258
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001259 // Check if the buffer is the current buffer.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001260 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001261 return FALSE;
1262
1263 if (VIsual_active)
1264 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001265 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001266 {
1267 top = VIsual;
1268 bot = wp->w_cursor;
1269 }
1270 else
1271 {
1272 top = wp->w_cursor;
1273 bot = VIsual;
1274 }
1275 mode = VIsual_mode;
1276 }
1277 else
1278 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001279 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001280 {
1281 top = curbuf->b_visual.vi_start;
1282 bot = curbuf->b_visual.vi_end;
1283 }
1284 else
1285 {
1286 top = curbuf->b_visual.vi_end;
1287 bot = curbuf->b_visual.vi_start;
1288 }
1289 mode = curbuf->b_visual.vi_mode;
1290 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001291 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001292 if (lnum < top.lnum || lnum > bot.lnum)
1293 return FALSE;
1294
1295 if (mode == 'v')
1296 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001297 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001298 if ((lnum == top.lnum && col < top.col)
1299 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1300 return FALSE;
1301 }
1302 else if (mode == Ctrl_V)
1303 {
1304 getvvcol(wp, &top, &start, NULL, &end);
1305 getvvcol(wp, &bot, &start2, NULL, &end2);
1306 if (start2 < start)
1307 start = start2;
1308 if (end2 > end)
1309 end = end2;
1310 if (top.col == MAXCOL || bot.col == MAXCOL)
1311 end = MAXCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001312 cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001313 if (cols < start || cols > end - (*p_sel == 'e'))
1314 return FALSE;
1315 }
1316 return TRUE;
1317}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001318
Bram Moolenaar071d4272004-06-13 20:20:40 +00001319/*
1320 * Check the regexp program for its magic number.
1321 * Return TRUE if it's wrong.
1322 */
1323 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001324prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001325{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001326 regprog_T *prog;
1327
Bram Moolenaar6100d022016-10-02 16:51:57 +02001328 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001329 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001330 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001331 return FALSE;
1332
1333 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001334 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001335 emsg(_(e_re_corr));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001336 return TRUE;
1337 }
1338 return FALSE;
1339}
1340
1341/*
1342 * Cleanup the subexpressions, if this wasn't done yet.
1343 * This construction is used to clear the subexpressions only when they are
1344 * used (to increase speed).
1345 */
1346 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001347cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001348{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001349 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001350 {
1351 if (REG_MULTI)
1352 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001353 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001354 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1355 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001356 }
1357 else
1358 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001359 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1360 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001361 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001362 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001363 }
1364}
1365
#ifdef FEAT_SYN_HL
/*
 * Clear the \z(...\) subexpressions if "rex.need_clear_zsubexpr" says this
 * is still to be done.
 */
    static void
cleanup_zsubexpr(void)
{
    if (!rex.need_clear_zsubexpr)
	return;

    if (REG_MULTI)
    {
	// Filling with 0xff sets lnum to -1.
	vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
	vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
    }
    else
    {
	vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
	vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
    }

    rex.need_clear_zsubexpr = FALSE;
}
#endif
1387
1388/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001389 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001390 */
1391 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001392reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001393{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001394 rex.line = reg_getline(++rex.lnum);
1395 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001396 fast_breakcheck();
1397}
1398
1399/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001400 * Check whether a backreference matches.
1401 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001402 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1403 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001404 */
1405 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001406match_with_backref(
1407 linenr_T start_lnum,
1408 colnr_T start_col,
1409 linenr_T end_lnum,
1410 colnr_T end_col,
1411 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001412{
1413 linenr_T clnum = start_lnum;
1414 colnr_T ccol = start_col;
1415 int len;
1416 char_u *p;
1417
1418 if (bytelen != NULL)
1419 *bytelen = 0;
1420 for (;;)
1421 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001422 // Since getting one line may invalidate the other, need to make copy.
1423 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001424 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001425 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001426 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001427 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1428 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001429 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001430 vim_free(reg_tofree);
1431 reg_tofree = alloc(len);
1432 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001433 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001434 reg_tofreelen = len;
1435 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001436 STRCPY(reg_tofree, rex.line);
1437 rex.input = reg_tofree + (rex.input - rex.line);
1438 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001439 }
1440
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001441 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001442 p = reg_getline(clnum);
1443 if (clnum == end_lnum)
1444 len = end_col - ccol;
1445 else
1446 len = (int)STRLEN(p + ccol);
1447
Bram Moolenaar0270f382018-07-17 05:43:58 +02001448 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001449 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001450 if (bytelen != NULL)
1451 *bytelen += len;
1452 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001453 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001454 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001455 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001456
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001457 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001458 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001459 if (bytelen != NULL)
1460 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001461 ++clnum;
1462 ccol = 0;
1463 if (got_int)
1464 return RA_FAIL;
1465 }
1466
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001467 // found a match! Note that rex.line may now point to a copy of the line,
1468 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001469 return RA_MATCH;
1470}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001471
Bram Moolenaarfb031402014-09-09 17:18:49 +02001472/*
1473 * Used in a place where no * or \+ can follow.
1474 */
1475 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001476re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001477{
1478 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001479 {
1480 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1481 rc_did_emsg = TRUE;
1482 return FAIL;
1483 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001484 return OK;
1485}
1486
// One decomposition: up to three characters, unused slots are zero.
typedef struct
{
    int a;
    int b;
    int c;
} decomp_T;

// Decompositions for the characters 0xfb20 - 0xfb4f, indexed by the
// character minus 0xfb20.  An UNUSED entry holds the character itself.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2, 0,     0    },	// 0xfb20	alt ayin
    {0x5d0, 0,     0    },	// 0xfb21	alt alef
    {0x5d3, 0,     0    },	// 0xfb22	alt dalet
    {0x5d4, 0,     0    },	// 0xfb23	alt he
    {0x5db, 0,     0    },	// 0xfb24	alt kaf
    {0x5dc, 0,     0    },	// 0xfb25	alt lamed
    {0x5dd, 0,     0    },	// 0xfb26	alt mem-sofit
    {0x5e8, 0,     0    },	// 0xfb27	alt resh
    {0x5ea, 0,     0    },	// 0xfb28	alt tav
    {'+',   0,     0    },	// 0xfb29	alt plus
    {0x5e9, 0x5c1, 0    },	// 0xfb2a	shin+shin-dot
    {0x5e9, 0x5c2, 0    },	// 0xfb2b	shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},	// 0xfb2c	shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},	// 0xfb2d	shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0    },	// 0xfb2e	alef+patah
    {0x5d0, 0x5b8, 0    },	// 0xfb2f	alef+qamats
    {0x5d0, 0x5b4, 0    },	// 0xfb30	alef+hiriq
    {0x5d1, 0x5bc, 0    },	// 0xfb31	bet+dagesh
    {0x5d2, 0x5bc, 0    },	// 0xfb32	gimel+dagesh
    {0x5d3, 0x5bc, 0    },	// 0xfb33	dalet+dagesh
    {0x5d4, 0x5bc, 0    },	// 0xfb34	he+dagesh
    {0x5d5, 0x5bc, 0    },	// 0xfb35	vav+dagesh
    {0x5d6, 0x5bc, 0    },	// 0xfb36	zayin+dagesh
    {0xfb37, 0,    0    },	// 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0    },	// 0xfb38	tet+dagesh
    {0x5d9, 0x5bc, 0    },	// 0xfb39	yud+dagesh
    {0x5da, 0x5bc, 0    },	// 0xfb3a	kaf sofit+dagesh
    {0x5db, 0x5bc, 0    },	// 0xfb3b	kaf+dagesh
    {0x5dc, 0x5bc, 0    },	// 0xfb3c	lamed+dagesh
    {0xfb3d, 0,    0    },	// 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0    },	// 0xfb3e	mem+dagesh
    {0xfb3f, 0,    0    },	// 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0    },	// 0xfb40	nun+dagesh
    {0x5e1, 0x5bc, 0    },	// 0xfb41	samech+dagesh
    {0xfb42, 0,    0    },	// 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0    },	// 0xfb43	pe sofit+dagesh
    {0x5e4, 0x5bc, 0    },	// 0xfb44	pe+dagesh
    {0xfb45, 0,    0    },	// 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0    },	// 0xfb46	tsadi+dagesh
    {0x5e7, 0x5bc, 0    },	// 0xfb47	qof+dagesh
    {0x5e8, 0x5bc, 0    },	// 0xfb48	resh+dagesh
    {0x5e9, 0x5bc, 0    },	// 0xfb49	shin+dagesh
    {0x5ea, 0x5bc, 0    },	// 0xfb4a	tav+dagesh
    {0x5d5, 0x5b9, 0    },	// 0xfb4b	vav+holam
    {0x5d1, 0x5bf, 0    },	// 0xfb4c	bet+rafe
    {0x5db, 0x5bf, 0    },	// 0xfb4d	kaf+rafe
    {0x5e4, 0x5bf, 0    },	// 0xfb4e	pe+rafe
    {0x5d0, 0x5dc, 0    }	// 0xfb4f	alef-lamed
};

/*
 * Decompose character "c" into "*c1", "*c2" and "*c3".
 * For a character in the range 0xfb20 - 0xfb4f the values come from
 * decomp_table[].  Any other character is returned unchanged in "*c1", with
 * "*c2" and "*c3" set to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    if (c < 0xfb20 || c > 0xfb4f)
    {
	// Not in the table: the character is its own decomposition.
	*c1 = c;
	*c2 = 0;
	*c3 = 0;
	return;
    }

    {
	decomp_T *entry = &decomp_table[c - 0xfb20];

	*c1 = entry->a;
	*c2 = entry->b;
	*c3 = entry->c;
    }
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001564
1565/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001566 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001567 * Return 0 if strings match, non-zero otherwise.
1568 * Correct the length "*n" when composing characters are ignored.
1569 */
1570 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001571cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001572{
1573 int result;
1574
Bram Moolenaar6100d022016-10-02 16:51:57 +02001575 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001576 result = STRNCMP(s1, s2, *n);
1577 else
1578 result = MB_STRNICMP(s1, s2, *n);
1579
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001580 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001581 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001582 {
1583 char_u *str1, *str2;
1584 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001585 int junk;
1586
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001587 // we have to handle the strcmp ourselves, since it is necessary to
1588 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001589 str1 = s1;
1590 str2 = s2;
1591 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001592 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001593 {
1594 c1 = mb_ptr2char_adv(&str1);
1595 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001596
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001597 // Decompose the character if necessary, into 'base' characters.
1598 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001599 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001600 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001601 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001602 mb_decompose(c1, &c11, &junk, &junk);
1603 mb_decompose(c2, &c12, &junk, &junk);
1604 c1 = c11;
1605 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001606 if (c11 != c12
1607 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001608 break;
1609 }
1610 }
1611 result = c2 - c1;
1612 if (result == 0)
1613 *n = (int)(str2 - s2);
1614 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001615
1616 return result;
1617}
1618
1619/*
1620 * cstrchr: This function is used a lot for simple searches, keep it fast!
1621 */
1622 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001623cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001624{
1625 char_u *p;
1626 int cc;
1627
Bram Moolenaara12a1612019-01-24 16:39:02 +01001628 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001629 return vim_strchr(s, c);
1630
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001631 // tolower() and toupper() can be slow, comparing twice should be a lot
1632 // faster (esp. when using MS Visual C++!).
1633 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001634 if (enc_utf8 && c > 0x80)
1635 cc = utf_fold(c);
1636 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001637 if (MB_ISUPPER(c))
1638 cc = MB_TOLOWER(c);
1639 else if (MB_ISLOWER(c))
1640 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001641 else
1642 return vim_strchr(s, c);
1643
Bram Moolenaar071d4272004-06-13 20:20:40 +00001644 if (has_mbyte)
1645 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001646 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001647 {
1648 if (enc_utf8 && c > 0x80)
1649 {
1650 if (utf_fold(utf_ptr2char(p)) == cc)
1651 return p;
1652 }
1653 else if (*p == c || *p == cc)
1654 return p;
1655 }
1656 }
1657 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001658 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001659 for (p = s; *p != NUL; ++p)
1660 if (*p == c || *p == cc)
1661 return p;
1662
1663 return NULL;
1664}
1665
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001666////////////////////////////////////////////////////////////////
1667// regsub stuff //
1668////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001669
Bram Moolenaar071d4272004-06-13 20:20:40 +00001670/*
1671 * We should define ftpr as a pointer to a function returning a pointer to
1672 * a function returning a pointer to a function ...
1673 * This is impossible, so we declare a pointer to a function returning a
1674 * pointer to a function returning void. This should work for all compilers.
1675 */
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001676typedef void (*(*fptr_T)(int *, int))();
Bram Moolenaar071d4272004-06-13 20:20:40 +00001677
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001678static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001679
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001680 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001681do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001682{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001683 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001684
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001685 return (fptr_T)NULL;
1686}
1687
1688 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001689do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001690{
1691 *d = MB_TOUPPER(c);
1692
1693 return (fptr_T)do_Upper;
1694}
1695
1696 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001697do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001698{
1699 *d = MB_TOLOWER(c);
1700
1701 return (fptr_T)NULL;
1702}
1703
1704 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001705do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001706{
1707 *d = MB_TOLOWER(c);
1708
1709 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001710}
1711
1712/*
1713 * regtilde(): Replace tildes in the pattern by the old pattern.
1714 *
1715 * Short explanation of the tilde: It stands for the previous replacement
1716 * pattern. If that previous pattern also contains a ~ we should go back a
1717 * step further... But we insert the previous pattern into the current one
1718 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001719 * This still does not handle the case where "magic" changes. So require the
1720 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001721 *
1722 * The tildes are parsed once before the first call to vim_regsub().
1723 */
1724 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001725regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001726{
1727 char_u *newsub = source;
1728 char_u *tmpsub;
1729 char_u *p;
1730 int len;
1731 int prevlen;
1732
1733 for (p = newsub; *p; ++p)
1734 {
1735 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1736 {
1737 if (reg_prev_sub != NULL)
1738 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001739 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001740 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001741 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001742 if (tmpsub != NULL)
1743 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001744 // copy prefix
1745 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001746 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001747 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001748 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001749 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001750 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001751 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001752 STRCPY(tmpsub + len + prevlen, p + 1);
1753
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001754 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001755 vim_free(newsub);
1756 newsub = tmpsub;
1757 p = newsub + len + prevlen;
1758 }
1759 }
1760 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001761 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001762 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001763 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001764 --p;
1765 }
1766 else
1767 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001768 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001769 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001770 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001771 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001772 }
1773 }
1774
1775 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001776 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001777 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001778 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001779 reg_prev_sub = vim_strsave(newsub);
1780 return newsub;
1781}
1782
#ifdef FEAT_EVAL
// TRUE when submatch() can be used
static int can_f_submatch = FALSE;

/*
 * Match information used for reg_submatch().  Needed for when the
 * substitution string is an expression that contains a call to substitute()
 * and submatch().
 * vim_regsub_both() fills these fields from the members of "rex" with the
 * corresponding names before it evaluates the expression.
 */
typedef struct {
    regmatch_T  *sm_match;      // from rex.reg_match
    regmmatch_T *sm_mmatch;     // from rex.reg_mmatch
    linenr_T    sm_firstlnum;   // from rex.reg_firstlnum
    linenr_T    sm_maxline;     // from rex.reg_maxline
    int         sm_line_lbr;    // from rex.reg_line_lbr
} regsubmatch_T;

// can only be used when can_f_submatch is TRUE
static regsubmatch_T rsm;
#endif
1799
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001800#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001801
1802/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001803 * Put the submatches in "argv[argskip]" which is a list passed into
1804 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001805 */
1806 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001807fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001808{
1809 listitem_T *li;
1810 int i;
1811 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001812 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001813
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001814 if (argcount == argskip)
1815 // called function doesn't take a submatches argument
1816 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001817
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001818 // Relies on sl_list to be the first item in staticList10_T.
1819 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001820
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001821 // There are always 10 list items in staticList10_T.
1822 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001823 for (i = 0; i < 10; ++i)
1824 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001825 s = rsm.sm_match->startp[i];
1826 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001827 s = NULL;
1828 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02001829 s = vim_strnsave(s, (int)(rsm.sm_match->endp[i] - s));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001830 li->li_tv.v_type = VAR_STRING;
1831 li->li_tv.vval.v_string = s;
1832 li = li->li_next;
1833 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001834 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001835}
1836
1837 static void
1838clear_submatch_list(staticList10_T *sl)
1839{
1840 int i;
1841
1842 for (i = 0; i < 10; ++i)
1843 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1844}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001845#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001846
Bram Moolenaar071d4272004-06-13 20:20:40 +00001847/*
1848 * vim_regsub() - perform substitutions after a vim_regexec() or
1849 * vim_regexec_multi() match.
1850 *
1851 * If "copy" is TRUE really copy into "dest".
1852 * If "copy" is FALSE nothing is copied, this is just to find out the length
1853 * of the result.
1854 *
1855 * If "backslash" is TRUE, a backslash will be removed later, need to double
1856 * them to keep them, and insert a backslash before a CR to avoid it being
1857 * replaced with a line break later.
1858 *
1859 * Note: The matched text must not change between the call of
1860 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1861 * references invalid!
1862 *
1863 * Returns the size of the replacement, including terminating NUL.
1864 */
1865 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001866vim_regsub(
1867 regmatch_T *rmp,
1868 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001869 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001870 char_u *dest,
1871 int copy,
1872 int magic,
1873 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001874{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001875 int result;
1876 regexec_T rex_save;
1877 int rex_in_use_save = rex_in_use;
1878
1879 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001880 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001881 rex_save = rex;
1882 rex_in_use = TRUE;
1883
1884 rex.reg_match = rmp;
1885 rex.reg_mmatch = NULL;
1886 rex.reg_maxline = 0;
1887 rex.reg_buf = curbuf;
1888 rex.reg_line_lbr = TRUE;
1889 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1890
1891 rex_in_use = rex_in_use_save;
1892 if (rex_in_use)
1893 rex = rex_save;
1894
1895 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001896}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001897
1898 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001899vim_regsub_multi(
1900 regmmatch_T *rmp,
1901 linenr_T lnum,
1902 char_u *source,
1903 char_u *dest,
1904 int copy,
1905 int magic,
1906 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001907{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001908 int result;
1909 regexec_T rex_save;
1910 int rex_in_use_save = rex_in_use;
1911
1912 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001913 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001914 rex_save = rex;
1915 rex_in_use = TRUE;
1916
1917 rex.reg_match = NULL;
1918 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001919 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001920 rex.reg_firstlnum = lnum;
1921 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1922 rex.reg_line_lbr = FALSE;
1923 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1924
1925 rex_in_use = rex_in_use_save;
1926 if (rex_in_use)
1927 rex = rex_save;
1928
1929 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001930}
1931
1932 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001933vim_regsub_both(
1934 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001935 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001936 char_u *dest,
1937 int copy,
1938 int magic,
1939 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001940{
1941 char_u *src;
1942 char_u *dst;
1943 char_u *s;
1944 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001945 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001946 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001947 fptr_T func_all = (fptr_T)NULL;
1948 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001949 linenr_T clnum = 0; // init for GCC
1950 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001951#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001952 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001953#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001954
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001955 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001956 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001957 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001958 emsg(_(e_null));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001959 return 0;
1960 }
1961 if (prog_magic_wrong())
1962 return 0;
1963 src = source;
1964 dst = dest;
1965
1966 /*
1967 * When the substitute part starts with "\=" evaluate it as an expression.
1968 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001969 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001970 {
1971#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001972 // To make sure that the length doesn't change between checking the
1973 // length and copying the string, and to speed up things, the
1974 // resulting string is saved from the call with "copy" == FALSE to the
1975 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001976 if (copy)
1977 {
1978 if (eval_result != NULL)
1979 {
1980 STRCPY(dest, eval_result);
1981 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01001982 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001983 }
1984 }
1985 else
1986 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001987 int prev_can_f_submatch = can_f_submatch;
1988 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001989
1990 vim_free(eval_result);
1991
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001992 // The expression may contain substitute(), which calls us
1993 // recursively. Make sure submatch() gets the text from the first
1994 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001995 if (can_f_submatch)
1996 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001997 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001998 rsm.sm_match = rex.reg_match;
1999 rsm.sm_mmatch = rex.reg_mmatch;
2000 rsm.sm_firstlnum = rex.reg_firstlnum;
2001 rsm.sm_maxline = rex.reg_maxline;
2002 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002003
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002004 if (expr != NULL)
2005 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002006 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002007 char_u buf[NUMBUFLEN];
2008 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002009 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002010 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002011
2012 rettv.v_type = VAR_STRING;
2013 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002014 argv[0].v_type = VAR_LIST;
2015 argv[0].vval.v_list = &matchList.sl_list;
2016 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002017 CLEAR_FIELD(funcexe);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002018 funcexe.argv_func = fill_submatch_list;
2019 funcexe.evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002020 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002021 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002022 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002023 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002024 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002025 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002026 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002027 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002028
Bram Moolenaar6100d022016-10-02 16:51:57 +02002029 s = partial_name(partial);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002030 funcexe.partial = partial;
2031 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002032 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002033 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002034 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002035 clear_submatch_list(&matchList);
2036
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002037 if (rettv.v_type == VAR_UNKNOWN)
2038 // something failed, no need to report another error
2039 eval_result = NULL;
2040 else
2041 {
2042 eval_result = tv_get_string_buf_chk(&rettv, buf);
2043 if (eval_result != NULL)
2044 eval_result = vim_strsave(eval_result);
2045 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002046 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002047 }
2048 else
2049 eval_result = eval_to_string(source + 2, NULL, TRUE);
2050
Bram Moolenaar071d4272004-06-13 20:20:40 +00002051 if (eval_result != NULL)
2052 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002053 int had_backslash = FALSE;
2054
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002055 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002056 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002057 // Change NL to CR, so that it becomes a line break,
2058 // unless called from vim_regexec_nl().
2059 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002060 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002061 *s = CAR;
2062 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002063 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002064 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002065 /* Change NL to CR here too, so that this works:
2066 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2067 * abc\
2068 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002069 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002070 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002071 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002072 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002073 had_backslash = TRUE;
2074 }
2075 }
2076 if (had_backslash && backslash)
2077 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002078 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002079 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2080 if (s != NULL)
2081 {
2082 vim_free(eval_result);
2083 eval_result = s;
2084 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002085 }
2086
2087 dst += STRLEN(eval_result);
2088 }
2089
Bram Moolenaar6100d022016-10-02 16:51:57 +02002090 can_f_submatch = prev_can_f_submatch;
2091 if (can_f_submatch)
2092 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002093 }
2094#endif
2095 }
2096 else
2097 while ((c = *src++) != NUL)
2098 {
2099 if (c == '&' && magic)
2100 no = 0;
2101 else if (c == '\\' && *src != NUL)
2102 {
2103 if (*src == '&' && !magic)
2104 {
2105 ++src;
2106 no = 0;
2107 }
2108 else if ('0' <= *src && *src <= '9')
2109 {
2110 no = *src++ - '0';
2111 }
2112 else if (vim_strchr((char_u *)"uUlLeE", *src))
2113 {
2114 switch (*src++)
2115 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002116 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002117 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002118 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002119 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002120 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002121 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002122 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002123 continue;
2124 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002125 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002126 continue;
2127 }
2128 }
2129 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002130 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002131 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002132 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2133 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002134 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002135 if (copy)
2136 {
2137 *dst++ = c;
2138 *dst++ = *src++;
2139 *dst++ = *src++;
2140 }
2141 else
2142 {
2143 dst += 3;
2144 src += 2;
2145 }
2146 continue;
2147 }
2148
Bram Moolenaar071d4272004-06-13 20:20:40 +00002149 if (c == '\\' && *src != NUL)
2150 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002151 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002152 switch (*src)
2153 {
2154 case 'r': c = CAR; ++src; break;
2155 case 'n': c = NL; ++src; break;
2156 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002157 // Oh no! \e already has meaning in subst pat :-(
2158 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002159 case 'b': c = Ctrl_H; ++src; break;
2160
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002161 // If "backslash" is TRUE the backslash will be removed
2162 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002163 default: if (backslash)
2164 {
2165 if (copy)
2166 *dst = '\\';
2167 ++dst;
2168 }
2169 c = *src++;
2170 }
2171 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002172 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002173 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002174
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002175 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002176 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002177 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002178 func_one = (fptr_T)(func_one(&cc, c));
2179 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002180 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002181 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002182 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002183 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002184
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002185 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002186 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002187 int totlen = mb_ptr2len(src - 1);
2188
Bram Moolenaar071d4272004-06-13 20:20:40 +00002189 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002190 mb_char2bytes(cc, dst);
2191 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002192 if (enc_utf8)
2193 {
2194 int clen = utf_ptr2len(src - 1);
2195
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002196 // If the character length is shorter than "totlen", there
2197 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002198 if (clen < totlen)
2199 {
2200 if (copy)
2201 mch_memmove(dst + 1, src - 1 + clen,
2202 (size_t)(totlen - clen));
2203 dst += totlen - clen;
2204 }
2205 }
2206 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002207 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002208 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002209 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002210 dst++;
2211 }
2212 else
2213 {
2214 if (REG_MULTI)
2215 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002216 clnum = rex.reg_mmatch->startpos[no].lnum;
2217 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002218 s = NULL;
2219 else
2220 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002221 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2222 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2223 len = rex.reg_mmatch->endpos[no].col
2224 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002225 else
2226 len = (int)STRLEN(s);
2227 }
2228 }
2229 else
2230 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002231 s = rex.reg_match->startp[no];
2232 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002233 s = NULL;
2234 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002235 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002236 }
2237 if (s != NULL)
2238 {
2239 for (;;)
2240 {
2241 if (len == 0)
2242 {
2243 if (REG_MULTI)
2244 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002245 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002246 break;
2247 if (copy)
2248 *dst = CAR;
2249 ++dst;
2250 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002251 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2252 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002253 else
2254 len = (int)STRLEN(s);
2255 }
2256 else
2257 break;
2258 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002259 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002260 {
2261 if (copy)
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002262 emsg(_(e_re_damg));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002263 goto exit;
2264 }
2265 else
2266 {
2267 if (backslash && (*s == CAR || *s == '\\'))
2268 {
2269 /*
2270 * Insert a backslash in front of a CR, otherwise
2271 * it will be replaced by a line break.
2272 * Number of backslashes will be halved later,
2273 * double them here.
2274 */
2275 if (copy)
2276 {
2277 dst[0] = '\\';
2278 dst[1] = *s;
2279 }
2280 dst += 2;
2281 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002282 else
2283 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002284 if (has_mbyte)
2285 c = mb_ptr2char(s);
2286 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002287 c = *s;
2288
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002289 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002290 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002291 func_one = (fptr_T)(func_one(&cc, c));
2292 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002293 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002294 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002295 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002296 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002297
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002298 if (has_mbyte)
2299 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002300 int l;
2301
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002302 // Copy composing characters separately, one
2303 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002304 if (enc_utf8)
2305 l = utf_ptr2len(s) - 1;
2306 else
2307 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002308
2309 s += l;
2310 len -= l;
2311 if (copy)
2312 mb_char2bytes(cc, dst);
2313 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002314 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002315 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002316 *dst = cc;
2317 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002318 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002319
Bram Moolenaar071d4272004-06-13 20:20:40 +00002320 ++s;
2321 --len;
2322 }
2323 }
2324 }
2325 no = -1;
2326 }
2327 }
2328 if (copy)
2329 *dst = NUL;
2330
2331exit:
2332 return (int)((dst - dest) + 1);
2333}
2334
2335#ifdef FEAT_EVAL
2336/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002337 * Call reg_getline() with the line numbers from the submatch. If a
2338 * substitute() was used the reg_maxline and other values have been
2339 * overwritten.
2340 */
2341 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002342reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002343{
2344 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002345 linenr_T save_first = rex.reg_firstlnum;
2346 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002347
Bram Moolenaar6100d022016-10-02 16:51:57 +02002348 rex.reg_firstlnum = rsm.sm_firstlnum;
2349 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002350
2351 s = reg_getline(lnum);
2352
Bram Moolenaar6100d022016-10-02 16:51:57 +02002353 rex.reg_firstlnum = save_first;
2354 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002355 return s;
2356}
2357
2358/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002359 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002360 * allocated memory.
2361 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2362 */
2363 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002364reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002365{
2366 char_u *retval = NULL;
2367 char_u *s;
2368 int len;
2369 int round;
2370 linenr_T lnum;
2371
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002372 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002373 return NULL;
2374
Bram Moolenaar6100d022016-10-02 16:51:57 +02002375 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002376 {
2377 /*
2378 * First round: compute the length and allocate memory.
2379 * Second round: copy the text.
2380 */
2381 for (round = 1; round <= 2; ++round)
2382 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002383 lnum = rsm.sm_mmatch->startpos[no].lnum;
2384 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002385 return NULL;
2386
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002387 s = reg_getline_submatch(lnum);
2388 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002389 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002390 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002391 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002392 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002393 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002394 len = rsm.sm_mmatch->endpos[no].col
2395 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002396 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002397 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002398 ++len;
2399 }
2400 else
2401 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002402 // Multiple lines: take start line from start col, middle
2403 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002404 len = (int)STRLEN(s);
2405 if (round == 2)
2406 {
2407 STRCPY(retval, s);
2408 retval[len] = '\n';
2409 }
2410 ++len;
2411 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002412 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002413 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002414 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002415 if (round == 2)
2416 STRCPY(retval + len, s);
2417 len += (int)STRLEN(s);
2418 if (round == 2)
2419 retval[len] = '\n';
2420 ++len;
2421 }
2422 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002423 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002424 rsm.sm_mmatch->endpos[no].col);
2425 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002426 if (round == 2)
2427 retval[len] = NUL;
2428 ++len;
2429 }
2430
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002431 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002432 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002433 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002434 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002435 return NULL;
2436 }
2437 }
2438 }
2439 else
2440 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002441 s = rsm.sm_match->startp[no];
2442 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002443 retval = NULL;
2444 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002445 retval = vim_strnsave(s, (int)(rsm.sm_match->endp[no] - s));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002446 }
2447
2448 return retval;
2449}
Bram Moolenaar41571762014-04-02 19:00:58 +02002450
2451/*
2452 * Used for the submatch() function with the optional non-zero argument: get
2453 * the list of strings from the n'th submatch in allocated memory with NULs
2454 * represented in NLs.
2455 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2456 * command, for a non-existing submatch and for any error.
2457 */
2458 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002459reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002460{
2461 char_u *s;
2462 linenr_T slnum;
2463 linenr_T elnum;
2464 colnr_T scol;
2465 colnr_T ecol;
2466 int i;
2467 list_T *list;
2468 int error = FALSE;
2469
2470 if (!can_f_submatch || no < 0)
2471 return NULL;
2472
Bram Moolenaar6100d022016-10-02 16:51:57 +02002473 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002474 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002475 slnum = rsm.sm_mmatch->startpos[no].lnum;
2476 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002477 if (slnum < 0 || elnum < 0)
2478 return NULL;
2479
Bram Moolenaar6100d022016-10-02 16:51:57 +02002480 scol = rsm.sm_mmatch->startpos[no].col;
2481 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002482
2483 list = list_alloc();
2484 if (list == NULL)
2485 return NULL;
2486
2487 s = reg_getline_submatch(slnum) + scol;
2488 if (slnum == elnum)
2489 {
2490 if (list_append_string(list, s, ecol - scol) == FAIL)
2491 error = TRUE;
2492 }
2493 else
2494 {
2495 if (list_append_string(list, s, -1) == FAIL)
2496 error = TRUE;
2497 for (i = 1; i < elnum - slnum; i++)
2498 {
2499 s = reg_getline_submatch(slnum + i);
2500 if (list_append_string(list, s, -1) == FAIL)
2501 error = TRUE;
2502 }
2503 s = reg_getline_submatch(elnum);
2504 if (list_append_string(list, s, ecol) == FAIL)
2505 error = TRUE;
2506 }
2507 }
2508 else
2509 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002510 s = rsm.sm_match->startp[no];
2511 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002512 return NULL;
2513 list = list_alloc();
2514 if (list == NULL)
2515 return NULL;
2516 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002517 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002518 error = TRUE;
2519 }
2520
2521 if (error)
2522 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002523 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002524 return NULL;
2525 }
2526 return list;
2527}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002528#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002529
Bram Moolenaarf4140482020-02-15 23:06:45 +01002530/*
2531 * Initialize the values used for matching against multiple lines
2532 */
2533 static void
2534init_regexec_multi(
2535 regmmatch_T *rmp,
2536 win_T *win, // window in which to search or NULL
2537 buf_T *buf, // buffer in which to search
2538 linenr_T lnum) // nr of line to start looking for match
2539{
2540 rex.reg_match = NULL;
2541 rex.reg_mmatch = rmp;
2542 rex.reg_buf = buf;
2543 rex.reg_win = win;
2544 rex.reg_firstlnum = lnum;
2545 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2546 rex.reg_line_lbr = FALSE;
2547 rex.reg_ic = rmp->rmm_ic;
2548 rex.reg_icombine = FALSE;
2549 rex.reg_maxcol = rmp->rmm_maxcol;
2550}
2551
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002552#include "regexp_bt.c"
2553
// The backtracking regexp engine: the functions that vim_regcomp(),
// vim_regfree() and the vim_regexec*() functions call through "engine".
// The last item is the "expr" member, which vim_regcomp() only sets when
// DEBUG is defined.
static regengine_T bt_regengine =
{
    bt_regcomp,
    bt_regfree,
    bt_regexec_nl,
    bt_regexec_multi,
    (char_u *)""
};
2562
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002563#include "regexp_nfa.c"
2564
// The NFA regexp engine: the functions that vim_regcomp(), vim_regfree() and
// the vim_regexec*() functions call through "engine".
// The last item is the "expr" member, which vim_regcomp() only sets when
// DEBUG is defined.
static regengine_T nfa_regengine =
{
    nfa_regcomp,
    nfa_regfree,
    nfa_regexec_nl,
    nfa_regexec_multi,
    (char_u *)""
};
2573
// Which regexp engine to use? Needed for vim_regcomp().
// Must match with 'regexpengine'.
// Set by vim_regcomp() from "p_re" or from a "\%#=" prefix in the pattern.
static int regexp_engine = 0;

#ifdef DEBUG
// Engine names for the debug message in vim_regcomp(), indexed by the engine
// number.
static char_u regname[][30] = {
		    "AUTOMATIC Regexp Engine",
		    "BACKTRACKING Regexp Engine",
		    "NFA Regexp Engine"
			    };
#endif
2585
2586/*
2587 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002588 * Returns the program in allocated memory.
2589 * Use vim_regfree() to free the memory.
2590 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002591 */
2592 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002593vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002594{
2595 regprog_T *prog = NULL;
2596 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002597 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002598
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002599 regexp_engine = p_re;
2600
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002601 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002602 if (STRNCMP(expr, "\\%#=", 4) == 0)
2603 {
2604 int newengine = expr[4] - '0';
2605
2606 if (newengine == AUTOMATIC_ENGINE
2607 || newengine == BACKTRACKING_ENGINE
2608 || newengine == NFA_ENGINE)
2609 {
2610 regexp_engine = expr[4] - '0';
2611 expr += 5;
2612#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002613 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002614 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002615#endif
2616 }
2617 else
2618 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002619 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002620 regexp_engine = AUTOMATIC_ENGINE;
2621 }
2622 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002623#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002624 bt_regengine.expr = expr;
2625 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002626#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002627 // reg_iswordc() uses rex.reg_buf
2628 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002629
2630 /*
2631 * First try the NFA engine, unless backtracking was requested.
2632 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002633 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002634 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002635 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002636 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002637 else
2638 prog = bt_regengine.regcomp(expr, re_flags);
2639
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002640 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002641 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002642 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002643#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002644 if (regexp_engine != BACKTRACKING_ENGINE) // debugging log for NFA
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002645 {
2646 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002647 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002648 if (f)
2649 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002650 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002651 fclose(f);
2652 }
2653 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002654 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002655 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002656 }
2657#endif
2658 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002659 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002660 * The NFA engine also fails for patterns that it can't handle well
2661 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002662 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002663 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002664 if (regexp_engine == AUTOMATIC_ENGINE
2665 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002666 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002667 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002668 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002669 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002670 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002671
Bram Moolenaarfda37292014-11-05 14:27:36 +01002672 if (prog != NULL)
2673 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002674 // Store the info needed to call regcomp() again when the engine turns
2675 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002676 prog->re_engine = regexp_engine;
2677 prog->re_flags = re_flags;
2678 }
2679
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002680 return prog;
2681}
2682
2683/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002684 * Free a compiled regexp program, returned by vim_regcomp().
2685 */
2686 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002687vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002688{
2689 if (prog != NULL)
2690 prog->engine->regfree(prog);
2691}
2692
#if defined(EXITFREE) || defined(PROTO)
/*
 * Free the memory held by the file-level regexp variables: the "regstack"
 * and "backpos" growarrays and the "reg_tofree" and "reg_prev_sub" strings.
 */
    void
free_regexp_stuff(void)
{
    ga_clear(&regstack);
    ga_clear(&backpos);
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);
}
#endif
2703
#ifdef FEAT_EVAL
/*
 * When 'verbose' is set, give a message that the backtracking engine is going
 * to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
	return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2717
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
	|| defined(PROTO)
/*
 * Return whether "prog" is currently being executed.
 * This is the "re_in_use" flag, which vim_regexec_string() and
 * vim_regexec_multi() set while the engine is running.
 * "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    return prog->re_in_use;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002729
2730/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002731 * Match a regexp against a string.
2732 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002733 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002734 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002735 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002736 *
2737 * Return TRUE if there is a match, FALSE if not.
2738 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002739 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002740vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002741 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002742 char_u *line, // string to match against
2743 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002744 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002745{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002746 int result;
2747 regexec_T rex_save;
2748 int rex_in_use_save = rex_in_use;
2749
Bram Moolenaar0270f382018-07-17 05:43:58 +02002750 // Cannot use the same prog recursively, it contains state.
2751 if (rmp->regprog->re_in_use)
2752 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002753 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002754 return FALSE;
2755 }
2756 rmp->regprog->re_in_use = TRUE;
2757
Bram Moolenaar6100d022016-10-02 16:51:57 +02002758 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002759 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002760 rex_save = rex;
2761 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002762
Bram Moolenaar6100d022016-10-02 16:51:57 +02002763 rex.reg_startp = NULL;
2764 rex.reg_endp = NULL;
2765 rex.reg_startpos = NULL;
2766 rex.reg_endpos = NULL;
2767
2768 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002769 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002770
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002771 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002772 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2773 && result == NFA_TOO_EXPENSIVE)
2774 {
2775 int save_p_re = p_re;
2776 int re_flags = rmp->regprog->re_flags;
2777 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2778
2779 p_re = BACKTRACKING_ENGINE;
2780 vim_regfree(rmp->regprog);
2781 if (pat != NULL)
2782 {
2783#ifdef FEAT_EVAL
2784 report_re_switch(pat);
2785#endif
2786 rmp->regprog = vim_regcomp(pat, re_flags);
2787 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002788 {
2789 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002790 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002791 rmp->regprog->re_in_use = FALSE;
2792 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002793 vim_free(pat);
2794 }
2795
2796 p_re = save_p_re;
2797 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002798
2799 rex_in_use = rex_in_use_save;
2800 if (rex_in_use)
2801 rex = rex_save;
2802
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002803 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002804}
2805
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002806/*
2807 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002808 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002809 */
2810 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002811vim_regexec_prog(
2812 regprog_T **prog,
2813 int ignore_case,
2814 char_u *line,
2815 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002816{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002817 int r;
2818 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002819
2820 regmatch.regprog = *prog;
2821 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002822 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002823 *prog = regmatch.regprog;
2824 return r;
2825}
2826
/*
 * Match "rmp->regprog" against "line", starting at column "col".
 * A "\n" in "line" is not considered to be a line break.
 * Note: "rmp->regprog" may be freed and changed.
 * Return TRUE if there is a match, FALSE if not.
 */
    int
vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
{
    return vim_regexec_string(rmp, line, col, FALSE);
}
2836
/*
 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
 * Note: "rmp->regprog" may be freed and changed.
 * Return TRUE if there is a match, FALSE if not.
 */
    int
vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
{
    return vim_regexec_string(rmp, line, col, TRUE);
}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002847
2848/*
2849 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002850 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2851 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002852 * Uses curbuf for line count and 'iskeyword'.
2853 *
2854 * Return zero if there is no match. Return number of lines contained in the
2855 * match otherwise.
2856 */
2857 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002858vim_regexec_multi(
2859 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002860 win_T *win, // window in which to search or NULL
2861 buf_T *buf, // buffer in which to search
2862 linenr_T lnum, // nr of line to start looking for match
2863 colnr_T col, // column to start looking for match
2864 proftime_T *tm, // timeout limit or NULL
2865 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002866{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002867 int result;
2868 regexec_T rex_save;
2869 int rex_in_use_save = rex_in_use;
2870
Bram Moolenaar0270f382018-07-17 05:43:58 +02002871 // Cannot use the same prog recursively, it contains state.
2872 if (rmp->regprog->re_in_use)
2873 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002874 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002875 return FALSE;
2876 }
2877 rmp->regprog->re_in_use = TRUE;
2878
Bram Moolenaar6100d022016-10-02 16:51:57 +02002879 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002880 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002881 rex_save = rex;
2882 rex_in_use = TRUE;
2883
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002884 result = rmp->regprog->engine->regexec_multi(
2885 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002886 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002887
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002888 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002889 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2890 && result == NFA_TOO_EXPENSIVE)
2891 {
2892 int save_p_re = p_re;
2893 int re_flags = rmp->regprog->re_flags;
2894 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2895
2896 p_re = BACKTRACKING_ENGINE;
2897 vim_regfree(rmp->regprog);
2898 if (pat != NULL)
2899 {
2900#ifdef FEAT_EVAL
2901 report_re_switch(pat);
2902#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002903#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002904 // checking for \z misuse was already done when compiling for NFA,
2905 // allow all here
2906 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002907#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002908 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002909#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002910 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002911#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002912
Bram Moolenaarfda37292014-11-05 14:27:36 +01002913 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002914 {
2915 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002916 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002917 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002918 rmp->regprog->re_in_use = FALSE;
2919 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002920 vim_free(pat);
2921 }
2922 p_re = save_p_re;
2923 }
2924
Bram Moolenaar6100d022016-10-02 16:51:57 +02002925 rex_in_use = rex_in_use_save;
2926 if (rex_in_use)
2927 rex = rex_save;
2928
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002929 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002930}