blob: 0fd6de61ec4d1a85a2ddd4889401d6f7501e70e5 [file] [log] [blame]
/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
 */
5
// Regexp debugging is disabled by default: no debugging logs or files are
// created for regular expressions, even when compiling with -DDEBUG.
// To enable regexp debugging, uncomment the "#define DEBUG" line below.
#undef DEBUG
// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
#ifdef DEBUG
// Show/save debugging data when the BT engine is used.
# define BT_REGEXP_DUMP
// Save the debugging data to a file instead of displaying it.
# define BT_REGEXP_LOG
# define BT_REGEXP_DEBUG_LOG
# define BT_REGEXP_DEBUG_LOG_NAME	"bt_regexp_debug.log"
#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning: they do not match literally.
 * A Magic character is stored as a negative number (the character value
 * minus 256).  This separates them from literal characters (possibly
 * multi-byte).  Only ASCII characters can be Magic.
 */
#define Magic(x)	((int)(x) - 256)
#define un_Magic(x)	((x) + 256)
#define is_Magic(x)	((x) < 0)
31
/*
 * Return the literal character for "x": a Magic value is converted back with
 * un_Magic(), any other value is returned unchanged.
 */
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}
39
/*
 * Flip the magicness of "x": a Magic value becomes the literal character and
 * a literal character becomes its Magic counterpart.
 */
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47
/*
 * The first byte of the BT regexp internal "program" is this magic number;
 * the start node begins in the second byte.  It is used to catch the most
 * severe mutilation of the program by the caller.
 */
#define REGMAGIC	0234

/*
 * Utility definitions.
 */
// Value of the char_u that "p" points to, converted to int.
#define UCHARAT(p)	((int)*(char_u *)(p))
60
// Helpers for an error (down from) vim_regcomp().  Each macro expands to a
// complete "return" statement that gives the error message, sets rc_did_emsg
// to TRUE and returns NULL (the *_RET_NULL forms) or FAIL (the *_RET_FAIL
// forms).
// In the EMSG2/EMSG3 forms "c" selects the text passed for the first "%s" of
// the message: "" when "c" is non-zero, a backslash otherwise.
#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
Bram Moolenaar071d4272004-06-13 20:20:40 +000070
Bram Moolenaar95f09602016-11-10 20:01:45 +010071
Bram Moolenaar071d4272004-06-13 20:20:40 +000072#define MAX_LIMIT (32767L << 16L)
73
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020074static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
Bram Moolenaar966e58e2017-06-05 16:54:08 +020075static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76static char_u e_large_class[] = N_("E945: Range too large in character class");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020077static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
78static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
79static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020080#ifdef FEAT_SYN_HL
Bram Moolenaar5de820b2013-06-02 15:01:57 +020081static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
Bram Moolenaarbcf94422018-06-23 14:21:42 +020082static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
Bram Moolenaar01d89dd2013-06-03 19:41:06 +020083#endif
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +020084static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
Bram Moolenaar2976c022013-06-05 21:30:37 +020085static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
Bram Moolenaar0270f382018-07-17 05:43:58 +020086static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
87
// return values for re_multi_type()
#define NOT_MULTI	0	// not a "multi" operator
#define MULTI_ONE	1	// single "multi" operator
#define MULTI_MULT	2	// multi "multi" operator

// return values for regmatch()
#define RA_FAIL		1	// something failed, abort
#define RA_CONT		2	// continue in inner loop
#define RA_BREAK	3	// break inner loop
#define RA_MATCH	4	// successful match
#define RA_NOMATCH	5	// didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020098
Bram Moolenaar071d4272004-06-13 20:20:40 +000099/*
100 * Return NOT_MULTI if c is not a "multi" operator.
101 * Return MULTI_ONE if c is a single "multi" operator.
102 * Return MULTI_MULT if c is a multi "multi" operator.
103 */
104 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100105re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000106{
107 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
108 return MULTI_ONE;
109 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
110 return MULTI_MULT;
111 return NOT_MULTI;
112}
113
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000114static char_u *reg_prev_sub = NULL;
115
Bram Moolenaar071d4272004-06-13 20:20:40 +0000116/*
117 * REGEXP_INRANGE contains all characters which are always special in a []
118 * range after '\'.
119 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
120 * These are:
121 * \n - New line (NL).
122 * \r - Carriage Return (CR).
123 * \t - Tab (TAB).
124 * \e - Escape (ESC).
125 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000126 * \d - Character code in decimal, eg \d123
127 * \o - Character code in octal, eg \o80
128 * \x - Character code in hex, eg \x4a
129 * \u - Multibyte character code, eg \u20ac
130 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000131 */
132static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000133static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000134
Bram Moolenaar071d4272004-06-13 20:20:40 +0000135/*
136 * Translate '\x' to its control character, except "\n", which is Magic.
137 */
138 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100139backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000140{
141 switch (c)
142 {
143 case 'r': return CAR;
144 case 't': return TAB;
145 case 'e': return ESC;
146 case 'b': return BS;
147 }
148 return c;
149}
150
151/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000152 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000153 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
154 * recognized. Otherwise "pp" is advanced to after the item.
155 */
156 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100157get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000158{
159 static const char *(class_names[]) =
160 {
161 "alnum:]",
162#define CLASS_ALNUM 0
163 "alpha:]",
164#define CLASS_ALPHA 1
165 "blank:]",
166#define CLASS_BLANK 2
167 "cntrl:]",
168#define CLASS_CNTRL 3
169 "digit:]",
170#define CLASS_DIGIT 4
171 "graph:]",
172#define CLASS_GRAPH 5
173 "lower:]",
174#define CLASS_LOWER 6
175 "print:]",
176#define CLASS_PRINT 7
177 "punct:]",
178#define CLASS_PUNCT 8
179 "space:]",
180#define CLASS_SPACE 9
181 "upper:]",
182#define CLASS_UPPER 10
183 "xdigit:]",
184#define CLASS_XDIGIT 11
185 "tab:]",
186#define CLASS_TAB 12
187 "return:]",
188#define CLASS_RETURN 13
189 "backspace:]",
190#define CLASS_BACKSPACE 14
191 "escape:]",
192#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100193 "ident:]",
194#define CLASS_IDENT 16
195 "keyword:]",
196#define CLASS_KEYWORD 17
197 "fname:]",
198#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000199 };
200#define CLASS_NONE 99
201 int i;
202
203 if ((*pp)[1] == ':')
204 {
Bram Moolenaar78a15312009-05-15 19:33:18 +0000205 for (i = 0; i < (int)(sizeof(class_names) / sizeof(*class_names)); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000206 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
207 {
208 *pp += STRLEN(class_names[i]) + 2;
209 return i;
210 }
211 }
212 return CLASS_NONE;
213}
214
/*
 * Specific version of character class functions.
 * Using a table to keep this fast: class_tab[c] holds the RI_ flags of the
 * character with value "c".  The table is filled by init_class_tab().
 */
static short	class_tab[256];

#define RI_DIGIT    0x01
#define RI_HEX	    0x02
#define RI_OCTAL    0x04
#define RI_WORD	    0x08
#define RI_HEAD	    0x10
#define RI_ALPHA    0x20
#define RI_LOWER    0x40
#define RI_UPPER    0x80
#define RI_WHITE    0x100
230
231 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100232init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000233{
234 int i;
235 static int done = FALSE;
236
237 if (done)
238 return;
239
240 for (i = 0; i < 256; ++i)
241 {
242 if (i >= '0' && i <= '7')
243 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
244 else if (i >= '8' && i <= '9')
245 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
246 else if (i >= 'a' && i <= 'f')
247 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
248#ifdef EBCDIC
249 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
250 || (i >= 's' && i <= 'z'))
251#else
252 else if (i >= 'g' && i <= 'z')
253#endif
254 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
255 else if (i >= 'A' && i <= 'F')
256 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
257#ifdef EBCDIC
258 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
259 || (i >= 'S' && i <= 'Z'))
260#else
261 else if (i >= 'G' && i <= 'Z')
262#endif
263 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
264 else if (i == '_')
265 class_tab[i] = RI_WORD + RI_HEAD;
266 else
267 class_tab[i] = 0;
268 }
269 class_tab[' '] |= RI_WHITE;
270 class_tab['\t'] |= RI_WHITE;
271 done = TRUE;
272}
273
// Test whether character "c" has the given RI_ flag set in class_tab[].
// Only the values 0..255 have an entry in the table; for any other value (a
// negative number or a character of 0x100 and above) the result is zero and
// the table is not accessed.
// The argument is parenthesized so that an expression can be passed safely.
// NOTE: "c" is evaluated more than once, do not pass an expression with side
// effects.
#define ri_digit(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_DIGIT))
#define ri_hex(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_HEX))
#define ri_octal(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_OCTAL))
#define ri_word(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_WORD))
#define ri_head(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_HEAD))
#define ri_alpha(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_ALPHA))
#define ri_lower(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_LOWER))
#define ri_upper(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_UPPER))
#define ri_white(c)	((c) >= 0 && (c) < 0x100 && (class_tab[c] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000283
// flags for regflags, each one is a separate bit
#define RF_ICASE    1	// ignore case
#define RF_NOICASE  2	// don't ignore case
#define RF_HASNL    4	// can match a NL
#define RF_ICOMBINE 8	// ignore combining characters
#define RF_LOOKBH   16	// uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000290
291/*
292 * Global work variables for vim_regcomp().
293 */
294
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295static char_u *regparse; // Input-scan pointer.
296static int regnpar; // () count.
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100297static int wants_nfa; // regex should use NFA engine
Bram Moolenaar071d4272004-06-13 20:20:40 +0000298#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100299static int regnzpar; // \z() count.
300static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000301#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100302static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000303#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100304static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000305#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000306
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100307static int reg_magic; // magicness of the pattern:
308#define MAGIC_NONE 1 // "\V" very unmagic
309#define MAGIC_OFF 2 // "\M" or 'magic' off
310#define MAGIC_ON 3 // "\m" or 'magic'
311#define MAGIC_ALL 4 // "\v" very magic
Bram Moolenaar071d4272004-06-13 20:20:40 +0000312
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100313static int reg_string; // matching with a string instead of a buffer
314 // line
315static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000316
317/*
318 * META contains all characters that may be magic, except '^' and '$'.
319 */
320
321#ifdef EBCDIC
322static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
323#else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100324// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000325static char_u META_flags[] = {
326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100328// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000329 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100330// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000331 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100332// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000333 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100334// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000335 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100336// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000337 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100338// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000339 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
340};
341#endif
342
static int	curchr;		// currently parsed character
// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, eg in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int	prevchr;
static int	prevprevchr;	// previous-previous character
static int	nextchr;	// used for ungetchr()

// arguments for reg()
#define REG_NOPAREN	0	// toplevel reg()
#define REG_PAREN	1	// \(\)
#define REG_ZPAREN	2	// \z(\)
#define REG_NPAREN	3	// \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000356
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200357typedef struct
358{
359 char_u *regparse;
360 int prevchr_len;
361 int curchr;
362 int prevchr;
363 int prevprevchr;
364 int nextchr;
365 int at_start;
366 int prev_at_start;
367 int regnpar;
368} parse_state_T;
369
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100370static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100371static int getchr(void);
372static void skipchr_keepstart(void);
373static int peekchr(void);
374static void skipchr(void);
375static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100376static long gethexchrs(int maxinputlen);
377static long getoctchrs(void);
378static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100379static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100380static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200381static int cstrncmp(char_u *s1, char_u *s2, int *n);
382static char_u *cstrchr(char_u *, int);
383static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100384static int reg_iswordc(int);
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100385#ifdef FEAT_EVAL
386static void report_re_switch(char_u *pat);
387#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000388
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200389static regengine_T bt_regengine;
390static regengine_T nfa_regengine;
391
Bram Moolenaar071d4272004-06-13 20:20:40 +0000392/*
393 * Return TRUE if compiled regular expression "prog" can match a line break.
394 */
395 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100396re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000397{
398 return (prog->regflags & RF_HASNL);
399}
400
401/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000402 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
403 * Returns a character representing the class. Zero means that no item was
404 * recognized. Otherwise "pp" is advanced to after the item.
405 */
406 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100407get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000408{
409 int c;
410 int l = 1;
411 char_u *p = *pp;
412
Bram Moolenaar985079c2019-02-16 17:07:47 +0100413 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000414 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000415 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000416 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000417 if (p[l + 2] == '=' && p[l + 3] == ']')
418 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000419 if (has_mbyte)
420 c = mb_ptr2char(p + 2);
421 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000422 c = p[2];
423 *pp += l + 4;
424 return c;
425 }
426 }
427 return 0;
428}
429
#ifdef EBCDIC
/*
 * Table for equivalence class "c". (IBM-1047)
 * Each entry starts with an unaccented letter, followed by further byte
 * values that belong to the same entry.
 */
static char *EQUIVAL_CLASS_C[16] = {
    "A\x62\x63\x64\x65\x66\x67",
    "C\x68",
    "E\x71\x72\x73\x74",
    "I\x75\x76\x77\x78",
    "N\x69",
    "O\xEB\xEC\xED\xEE\xEF\x80",
    "U\xFB\xFC\xFD\xFE",
    "Y\xBA",
    "a\x42\x43\x44\x45\x46\x47",
    "c\x48",
    "e\x51\x52\x53\x54",
    "i\x55\x56\x57\x58",
    "n\x49",
    "o\xCB\xCC\xCD\xCE\xCF\x70",
    "u\xDB\xDC\xDD\xDE",
    "y\x8D\xDF",
};
#endif
453
Bram Moolenaardf177f62005-02-22 08:39:57 +0000454/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000455 * Check for a collating element "[.a.]". "pp" points to the '['.
456 * Returns a character. Zero means that no item was recognized. Otherwise
457 * "pp" is advanced to after the item.
458 * Currently only single characters are recognized!
459 */
460 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100461get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000462{
463 int c;
464 int l = 1;
465 char_u *p = *pp;
466
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100467 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000468 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000469 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000470 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000471 if (p[l + 2] == '.' && p[l + 3] == ']')
472 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000473 if (has_mbyte)
474 c = mb_ptr2char(p + 2);
475 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000476 c = p[2];
477 *pp += l + 4;
478 return c;
479 }
480 }
481 return 0;
482}
483
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100484static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
485static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200486
487 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100488get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200489{
490 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
491 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
492}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000493
494/*
495 * Skip over a "[]" range.
496 * "p" must point to the character after the '['.
497 * The returned pointer is on the matching ']', or the terminating NUL.
498 */
499 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100500skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000501{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000502 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000503
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100504 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000505 ++p;
506 if (*p == ']' || *p == '-')
507 ++p;
508 while (*p != NUL && *p != ']')
509 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000510 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000511 p += l;
512 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000513 if (*p == '-')
514 {
515 ++p;
516 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100517 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000518 }
519 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200520 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000521 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200522 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000523 p += 2;
524 else if (*p == '[')
525 {
526 if (get_char_class(&p) == CLASS_NONE
527 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200528 && get_coll_element(&p) == 0
529 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100530 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000531 }
532 else
533 ++p;
534 }
535
536 return p;
537}
538
539/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000540 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200541 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000542 * Take care of characters with a backslash in front of it.
543 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000544 */
545 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100546skip_regexp(
547 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200548 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200549 int magic)
550{
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200551 return skip_regexp_ex(startp, delim, magic, NULL, NULL);
552}
553
554/*
555 * Call skip_regexp() and when the delimiter does not match give an error and
556 * return NULL.
557 */
558 char_u *
559skip_regexp_err(
560 char_u *startp,
561 int delim,
562 int magic)
563{
564 char_u *p = skip_regexp(startp, delim, magic);
565
566 if (*p != delim)
567 {
568 semsg(_("E654: missing delimiter after search pattern: %s"), startp);
569 return NULL;
570 }
571 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200572}
573
574/*
575 * skip_regexp() with extra arguments:
576 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
577 * expression and change "\?" to "?". If "*newp" is not NULL the expression
578 * is changed in-place.
579 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
580 */
581 char_u *
582skip_regexp_ex(
583 char_u *startp,
584 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100585 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200586 char_u **newp,
587 int *dropped)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000588{
589 int mymagic;
590 char_u *p = startp;
591
592 if (magic)
593 mymagic = MAGIC_ON;
594 else
595 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200596 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000597
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100598 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000599 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100600 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000601 break;
602 if ((p[0] == '[' && mymagic >= MAGIC_ON)
603 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
604 {
605 p = skip_anyof(p + 1);
606 if (p[0] == NUL)
607 break;
608 }
609 else if (p[0] == '\\' && p[1] != NUL)
610 {
611 if (dirc == '?' && newp != NULL && p[1] == '?')
612 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100613 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000614 if (*newp == NULL)
615 {
616 *newp = vim_strsave(startp);
617 if (*newp != NULL)
618 p = *newp + (p - startp);
619 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200620 if (dropped != NULL)
621 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000622 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000623 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000624 else
625 ++p;
626 }
627 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100628 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000629 if (*p == 'v')
630 mymagic = MAGIC_ALL;
631 else if (*p == 'V')
632 mymagic = MAGIC_NONE;
633 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000634 }
635 return p;
636}
637
/*
 * Functions for getting characters from the regexp input.
 * These variables are reset by initchr().
 */
static int	prevchr_len;	// byte length of previous char
static int	at_start;	// True when on the first character
static int	prev_at_start;  // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100644
Bram Moolenaar071d4272004-06-13 20:20:40 +0000645/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200646 * Start parsing at "str".
647 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000648 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100649initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000650{
651 regparse = str;
652 prevchr_len = 0;
653 curchr = prevprevchr = prevchr = nextchr = -1;
654 at_start = TRUE;
655 prev_at_start = FALSE;
656}
657
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200658/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200659 * Save the current parse state, so that it can be restored and parsing
660 * starts in the same state again.
661 */
662 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100663save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200664{
665 ps->regparse = regparse;
666 ps->prevchr_len = prevchr_len;
667 ps->curchr = curchr;
668 ps->prevchr = prevchr;
669 ps->prevprevchr = prevprevchr;
670 ps->nextchr = nextchr;
671 ps->at_start = at_start;
672 ps->prev_at_start = prev_at_start;
673 ps->regnpar = regnpar;
674}
675
676/*
677 * Restore a previously saved parse state.
678 */
679 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100680restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200681{
682 regparse = ps->regparse;
683 prevchr_len = ps->prevchr_len;
684 curchr = ps->curchr;
685 prevchr = ps->prevchr;
686 prevprevchr = ps->prevprevchr;
687 nextchr = ps->nextchr;
688 at_start = ps->at_start;
689 prev_at_start = ps->prev_at_start;
690 regnpar = ps->regnpar;
691}
692
693
694/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200695 * Get the next character without advancing.
696 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000697 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100698peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000699{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000700 static int after_slash = FALSE;
701
Bram Moolenaar071d4272004-06-13 20:20:40 +0000702 if (curchr == -1)
703 {
704 switch (curchr = regparse[0])
705 {
706 case '.':
707 case '[':
708 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100709 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000710 if (reg_magic >= MAGIC_ON)
711 curchr = Magic(curchr);
712 break;
713 case '(':
714 case ')':
715 case '{':
716 case '%':
717 case '+':
718 case '=':
719 case '?':
720 case '@':
721 case '!':
722 case '&':
723 case '|':
724 case '<':
725 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100726 case '#': // future ext.
727 case '"': // future ext.
728 case '\'': // future ext.
729 case ',': // future ext.
730 case '-': // future ext.
731 case ':': // future ext.
732 case ';': // future ext.
733 case '`': // future ext.
734 case '/': // Can't be used in / command
735 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000736 if (reg_magic == MAGIC_ALL)
737 curchr = Magic(curchr);
738 break;
739 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100740 // * is not magic as the very first character, eg "?*ptr", when
741 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
742 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000743 if (reg_magic >= MAGIC_ON
744 && !at_start
745 && !(prev_at_start && prevchr == Magic('^'))
746 && (after_slash
747 || (prevchr != Magic('(')
748 && prevchr != Magic('&')
749 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000750 curchr = Magic('*');
751 break;
752 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100753 // '^' is only magic as the very first character and if it's after
754 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000755 if (reg_magic >= MAGIC_OFF
756 && (at_start
757 || reg_magic == MAGIC_ALL
758 || prevchr == Magic('(')
759 || prevchr == Magic('|')
760 || prevchr == Magic('&')
761 || prevchr == Magic('n')
762 || (no_Magic(prevchr) == '('
763 && prevprevchr == Magic('%'))))
764 {
765 curchr = Magic('^');
766 at_start = TRUE;
767 prev_at_start = FALSE;
768 }
769 break;
770 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100771 // '$' is only magic as the very last char and if it's in front of
772 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000773 if (reg_magic >= MAGIC_OFF)
774 {
775 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200776 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000777
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100778 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000779 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200780 || p[1] == 'm' || p[1] == 'M'
781 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
782 {
783 if (p[1] == 'v')
784 is_magic_all = TRUE;
785 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
786 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000787 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200788 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000789 if (p[0] == NUL
790 || (p[0] == '\\'
791 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
792 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200793 || (is_magic_all
794 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000795 || reg_magic == MAGIC_ALL)
796 curchr = Magic('$');
797 }
798 break;
799 case '\\':
800 {
801 int c = regparse[1];
802
803 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100804 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000805 else if (
806#ifdef EBCDIC
807 vim_strchr(META, c)
808#else
809 c <= '~' && META_flags[c]
810#endif
811 )
812 {
813 /*
814 * META contains everything that may be magic sometimes,
815 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200816 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000817 * magicness. Therefore, \ is so meta-magic that it is
818 * not in META.
819 */
820 curchr = -1;
821 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100822 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000823 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000824 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000825 peekchr();
826 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000827 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000828 curchr = toggle_Magic(curchr);
829 }
830 else if (vim_strchr(REGEXP_ABBR, c))
831 {
832 /*
833 * Handle abbreviations, like "\t" for TAB -- webb
834 */
835 curchr = backslash_trans(c);
836 }
837 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
838 curchr = toggle_Magic(c);
839 else
840 {
841 /*
842 * Next character can never be (made) magic?
843 * Then backslashing it won't do anything.
844 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000845 if (has_mbyte)
846 curchr = (*mb_ptr2char)(regparse + 1);
847 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000848 curchr = c;
849 }
850 break;
851 }
852
Bram Moolenaar071d4272004-06-13 20:20:40 +0000853 default:
854 if (has_mbyte)
855 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000856 }
857 }
858
859 return curchr;
860}
861
862/*
863 * Eat one lexed character. Do this in a way that we can undo it.
864 */
865 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100866skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000867{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100868 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000869 if (*regparse == '\\')
870 prevchr_len = 1;
871 else
872 prevchr_len = 0;
873 if (regparse[prevchr_len] != NUL)
874 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000875 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100876 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000877 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000878 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000879 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000880 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000881 ++prevchr_len;
882 }
883 regparse += prevchr_len;
884 prev_at_start = at_start;
885 at_start = FALSE;
886 prevprevchr = prevchr;
887 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100888 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000889 nextchr = -1;
890}
891
892/*
893 * Skip a character while keeping the value of prev_at_start for at_start.
894 * prevchr and prevprevchr are also kept.
895 */
896 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100897skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000898{
899 int as = prev_at_start;
900 int pr = prevchr;
901 int prpr = prevprevchr;
902
903 skipchr();
904 at_start = as;
905 prevchr = pr;
906 prevprevchr = prpr;
907}
908
/*
 * Lex the next character of the pattern with peekchr(), consume it with
 * skipchr() and return it.  The lexer is needed because of magic and such.
 */
    static int
getchr(void)
{
    int lexed = peekchr();

    skipchr();
    return lexed;
}
921
922/*
923 * put character back. Works only once!
924 */
925 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100926ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000927{
928 nextchr = curchr;
929 curchr = prevchr;
930 prevchr = prevprevchr;
931 at_start = prev_at_start;
932 prev_at_start = FALSE;
933
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100934 // Backup regparse, so that it's at the same position as before the
935 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000936 regparse -= prevchr_len;
937}
938
939/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000940 * Get and return the value of the hex string at the current position.
941 * Return -1 if there is no valid hex number.
942 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000943 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000944 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000945 * The parameter controls the maximum number of input characters. This will be
946 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
947 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100948 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100949gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000950{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100951 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000952 int c;
953 int i;
954
955 for (i = 0; i < maxinputlen; ++i)
956 {
957 c = regparse[0];
958 if (!vim_isxdigit(c))
959 break;
960 nr <<= 4;
961 nr |= hex2nr(c);
962 ++regparse;
963 }
964
965 if (i == 0)
966 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100967 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000968}
969
970/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200971 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000972 * current position. Return -1 for invalid. Consumes all digits.
973 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100974 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100975getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000976{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100977 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000978 int c;
979 int i;
980
981 for (i = 0; ; ++i)
982 {
983 c = regparse[0];
984 if (c < '0' || c > '9')
985 break;
986 nr *= 10;
987 nr += c - '0';
988 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100989 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000990 }
991
992 if (i == 0)
993 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100994 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000995}
996
997/*
998 * get and return the value of the octal string immediately after the current
999 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
1000 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
1001 * treat 8 or 9 as recognised characters. Position is updated:
1002 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +00001003 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001004 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001005 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01001006getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001007{
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001008 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001009 int c;
1010 int i;
1011
1012 for (i = 0; i < 3 && nr < 040; ++i)
1013 {
1014 c = regparse[0];
1015 if (c < '0' || c > '7')
1016 break;
1017 nr <<= 3;
1018 nr |= hex2nr(c);
1019 ++regparse;
1020 }
1021
1022 if (i == 0)
1023 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001024 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001025}
1026
1027/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001028 * read_limits - Read two integers to be taken as a minimum and maximum.
1029 * If the first character is '-', then the range is reversed.
1030 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1031 * missing, a very big number is the default.
1032 */
1033 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001034read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001035{
1036 int reverse = FALSE;
1037 char_u *first_char;
1038 long tmp;
1039
1040 if (*regparse == '-')
1041 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001042 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001043 regparse++;
1044 reverse = TRUE;
1045 }
1046 first_char = regparse;
1047 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001048 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001049 {
1050 if (vim_isdigit(*++regparse))
1051 *maxval = getdigits(&regparse);
1052 else
1053 *maxval = MAX_LIMIT;
1054 }
1055 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001056 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001057 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001058 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001059 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001060 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001061 if (*regparse != '}')
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001062 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1063 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001064
1065 /*
1066 * Reverse the range if there was a '-', or make sure it is in the right
1067 * order otherwise.
1068 */
1069 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1070 {
1071 tmp = *minval;
1072 *minval = *maxval;
1073 *maxval = tmp;
1074 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001075 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001076 return OK;
1077}
1078
1079/*
1080 * vim_regexec and friends
1081 */
1082
1083/*
1084 * Global work variables for vim_regexec().
1085 */
1086
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001087static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001088#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001089static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001090#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001091static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001092static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001093
1094/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001095 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1096 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001097 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001098 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001099static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001100static unsigned reg_tofreelen;
1101
1102/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001103 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001104 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001105 * done:
1106 * single-line multi-line
1107 * reg_match &regmatch_T NULL
1108 * reg_mmatch NULL &regmmatch_T
1109 * reg_startp reg_match->startp <invalid>
1110 * reg_endp reg_match->endp <invalid>
1111 * reg_startpos <invalid> reg_mmatch->startpos
1112 * reg_endpos <invalid> reg_mmatch->endpos
1113 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001114 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001115 * reg_firstlnum <invalid> first line in which to search
1116 * reg_maxline 0 last line nr
1117 * reg_line_lbr FALSE or TRUE FALSE
1118 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001119typedef struct {
1120 regmatch_T *reg_match;
1121 regmmatch_T *reg_mmatch;
1122 char_u **reg_startp;
1123 char_u **reg_endp;
1124 lpos_T *reg_startpos;
1125 lpos_T *reg_endpos;
1126 win_T *reg_win;
1127 buf_T *reg_buf;
1128 linenr_T reg_firstlnum;
1129 linenr_T reg_maxline;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001130 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001131
Bram Moolenaar0270f382018-07-17 05:43:58 +02001132 // The current match-position is stord in these variables:
1133 linenr_T lnum; // line number, relative to first line
1134 char_u *line; // start of current line
1135 char_u *input; // current input, points into "regline"
1136
1137 int need_clear_subexpr; // subexpressions still need to be cleared
1138#ifdef FEAT_SYN_HL
1139 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1140 // cleared
1141#endif
1142
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001143 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1144 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1145 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001146 int reg_ic;
1147
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001148 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1149 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001150 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001151
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001152 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1153 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001154 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001155
1156 // State for the NFA engine regexec.
1157 int nfa_has_zend; // NFA regexp \ze operator encountered.
1158 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1159 int nfa_nsubexpr; // Number of sub expressions actually being used
1160 // during execution. 1 if only the whole match
1161 // (subexpr 0) is used.
1162 // listid is global, so that it increases on recursive calls to
1163 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1164 // all the states.
1165 int nfa_listid;
1166 int nfa_alt_listid;
1167
1168#ifdef FEAT_SYN_HL
1169 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1170#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001171} regexec_T;
1172
1173static regexec_T rex;
1174static int rex_in_use = FALSE;
1175
Bram Moolenaar071d4272004-06-13 20:20:40 +00001176/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001177 * Return TRUE if character 'c' is included in 'iskeyword' option for
1178 * "reg_buf" buffer.
1179 */
1180 static int
1181reg_iswordc(int c)
1182{
1183 return vim_iswordc_buf(c, rex.reg_buf);
1184}
1185
1186/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001187 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1188 */
1189 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001190reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001191{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001192 // when looking behind for a match/no-match lnum is negative. But we
1193 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001194 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001195 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001196 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001197 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001198 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001199 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001200}
1201
#ifdef FEAT_SYN_HL
// Workspace to mark the beginning and end of \z(...\) matches.
static char_u   *reg_startzp[NSUBEXP];      // beginning, as a pointer
static char_u   *reg_endzp[NSUBEXP];        // end, as a pointer
static lpos_T   reg_startzpos[NSUBEXP];     // idem, beginning pos
static lpos_T   reg_endzpos[NSUBEXP];       // idem, end pos
#endif

// TRUE if using multi-line regexp: no single-line match is set in "rex".
#define REG_MULTI       (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001211
Bram Moolenaar071d4272004-06-13 20:20:40 +00001212#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001213/*
1214 * Create a new extmatch and mark it as referenced once.
1215 */
1216 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001217make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001218{
1219 reg_extmatch_T *em;
1220
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001221 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001222 if (em != NULL)
1223 em->refcnt = 1;
1224 return em;
1225}
1226
1227/*
1228 * Add a reference to an extmatch.
1229 */
1230 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001231ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001232{
1233 if (em != NULL)
1234 em->refcnt++;
1235 return em;
1236}
1237
1238/*
1239 * Remove a reference to an extmatch. If there are no references left, free
1240 * the info.
1241 */
1242 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001243unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001244{
1245 int i;
1246
1247 if (em != NULL && --em->refcnt <= 0)
1248 {
1249 for (i = 0; i < NSUBEXP; ++i)
1250 vim_free(em->matches[i]);
1251 vim_free(em);
1252 }
1253}
1254#endif
1255
1256/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001257 * Get class of previous character.
1258 */
1259 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001260reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001261{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001262 if (rex.input > rex.line)
1263 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001264 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001265 return -1;
1266}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001267
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001268/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001269 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001270 */
1271 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001272reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001273{
1274 pos_T top, bot;
1275 linenr_T lnum;
1276 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001277 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001278 int mode;
1279 colnr_T start, end;
1280 colnr_T start2, end2;
1281 colnr_T cols;
1282
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001283 // Check if the buffer is the current buffer.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001284 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001285 return FALSE;
1286
1287 if (VIsual_active)
1288 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001289 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001290 {
1291 top = VIsual;
1292 bot = wp->w_cursor;
1293 }
1294 else
1295 {
1296 top = wp->w_cursor;
1297 bot = VIsual;
1298 }
1299 mode = VIsual_mode;
1300 }
1301 else
1302 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001303 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001304 {
1305 top = curbuf->b_visual.vi_start;
1306 bot = curbuf->b_visual.vi_end;
1307 }
1308 else
1309 {
1310 top = curbuf->b_visual.vi_end;
1311 bot = curbuf->b_visual.vi_start;
1312 }
1313 mode = curbuf->b_visual.vi_mode;
1314 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001315 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001316 if (lnum < top.lnum || lnum > bot.lnum)
1317 return FALSE;
1318
1319 if (mode == 'v')
1320 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001321 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001322 if ((lnum == top.lnum && col < top.col)
1323 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1324 return FALSE;
1325 }
1326 else if (mode == Ctrl_V)
1327 {
1328 getvvcol(wp, &top, &start, NULL, &end);
1329 getvvcol(wp, &bot, &start2, NULL, &end2);
1330 if (start2 < start)
1331 start = start2;
1332 if (end2 > end)
1333 end = end2;
1334 if (top.col == MAXCOL || bot.col == MAXCOL)
1335 end = MAXCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001336 cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001337 if (cols < start || cols > end - (*p_sel == 'e'))
1338 return FALSE;
1339 }
1340 return TRUE;
1341}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001342
Bram Moolenaar071d4272004-06-13 20:20:40 +00001343/*
1344 * Check the regexp program for its magic number.
1345 * Return TRUE if it's wrong.
1346 */
1347 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001348prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001349{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001350 regprog_T *prog;
1351
Bram Moolenaar6100d022016-10-02 16:51:57 +02001352 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001353 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001354 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001355 return FALSE;
1356
1357 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001358 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001359 emsg(_(e_re_corr));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001360 return TRUE;
1361 }
1362 return FALSE;
1363}
1364
1365/*
1366 * Cleanup the subexpressions, if this wasn't done yet.
1367 * This construction is used to clear the subexpressions only when they are
1368 * used (to increase speed).
1369 */
1370 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001371cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001372{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001373 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001374 {
1375 if (REG_MULTI)
1376 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001377 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001378 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1379 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001380 }
1381 else
1382 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001383 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1384 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001385 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001386 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001387 }
1388}
1389
#ifdef FEAT_SYN_HL
/*
 * Cleanup the \z(...\) subexpressions, if this wasn't done yet.
 */
    static void
cleanup_zsubexpr(void)
{
    if (!rex.need_clear_zsubexpr)
        return;

    if (REG_MULTI)
    {
        // Use 0xff to set lnum to -1
        vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
        vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
    }
    else
    {
        // Single-line match: the positions are pointers, make them NULL.
        vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
        vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
    }
    rex.need_clear_zsubexpr = FALSE;
}
#endif
1411
1412/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001413 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001414 */
1415 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001416reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001417{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001418 rex.line = reg_getline(++rex.lnum);
1419 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001420 fast_breakcheck();
1421}
1422
1423/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001424 * Check whether a backreference matches.
1425 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001426 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1427 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001428 */
1429 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001430match_with_backref(
1431 linenr_T start_lnum,
1432 colnr_T start_col,
1433 linenr_T end_lnum,
1434 colnr_T end_col,
1435 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001436{
1437 linenr_T clnum = start_lnum;
1438 colnr_T ccol = start_col;
1439 int len;
1440 char_u *p;
1441
1442 if (bytelen != NULL)
1443 *bytelen = 0;
1444 for (;;)
1445 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001446 // Since getting one line may invalidate the other, need to make copy.
1447 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001448 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001449 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001450 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001451 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1452 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001453 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001454 vim_free(reg_tofree);
1455 reg_tofree = alloc(len);
1456 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001457 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001458 reg_tofreelen = len;
1459 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001460 STRCPY(reg_tofree, rex.line);
1461 rex.input = reg_tofree + (rex.input - rex.line);
1462 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001463 }
1464
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001465 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001466 p = reg_getline(clnum);
1467 if (clnum == end_lnum)
1468 len = end_col - ccol;
1469 else
1470 len = (int)STRLEN(p + ccol);
1471
Bram Moolenaar0270f382018-07-17 05:43:58 +02001472 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001473 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001474 if (bytelen != NULL)
1475 *bytelen += len;
1476 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001477 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001478 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001479 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001480
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001481 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001482 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001483 if (bytelen != NULL)
1484 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001485 ++clnum;
1486 ccol = 0;
1487 if (got_int)
1488 return RA_FAIL;
1489 }
1490
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001491 // found a match! Note that rex.line may now point to a copy of the line,
1492 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001493 return RA_MATCH;
1494}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001495
Bram Moolenaarfb031402014-09-09 17:18:49 +02001496/*
1497 * Used in a place where no * or \+ can follow.
1498 */
1499 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001500re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001501{
1502 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001503 {
1504 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1505 rc_did_emsg = TRUE;
1506 return FAIL;
1507 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001508 return OK;
1509}
1510
// The decomposition of one character: up to three characters, unused ones
// are zero.
typedef struct
{
    int a, b, c;
} decomp_T;


// Decomposition of the characters 0xfb20 - 0xfb4f, indexed by the character
// minus 0xfb20.  Unused code points decompose to themselves.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},                // 0xfb20       alt ayin
    {0x5d0,0,0},                // 0xfb21       alt alef
    {0x5d3,0,0},                // 0xfb22       alt dalet
    {0x5d4,0,0},                // 0xfb23       alt he
    {0x5db,0,0},                // 0xfb24       alt kaf
    {0x5dc,0,0},                // 0xfb25       alt lamed
    {0x5dd,0,0},                // 0xfb26       alt mem-sofit
    {0x5e8,0,0},                // 0xfb27       alt resh
    {0x5ea,0,0},                // 0xfb28       alt tav
    {'+', 0, 0},                // 0xfb29       alt plus
    {0x5e9, 0x5c1, 0},          // 0xfb2a       shin+shin-dot
    {0x5e9, 0x5c2, 0},          // 0xfb2b       shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},      // 0xfb2c       shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},      // 0xfb2d       shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},          // 0xfb2e       alef+patah
    {0x5d0, 0x5b8, 0},          // 0xfb2f       alef+qamats
    {0x5d0, 0x5bc, 0},          // 0xfb30       alef+mapiq (0x5bc is used for
                                //              both dagesh and mapiq)
    {0x5d1, 0x5bc, 0},          // 0xfb31       bet+dagesh
    {0x5d2, 0x5bc, 0},          // 0xfb32       gimel+dagesh
    {0x5d3, 0x5bc, 0},          // 0xfb33       dalet+dagesh
    {0x5d4, 0x5bc, 0},          // 0xfb34       he+dagesh
    {0x5d5, 0x5bc, 0},          // 0xfb35       vav+dagesh
    {0x5d6, 0x5bc, 0},          // 0xfb36       zayin+dagesh
    {0xfb37, 0, 0},             // 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},          // 0xfb38       tet+dagesh
    {0x5d9, 0x5bc, 0},          // 0xfb39       yud+dagesh
    {0x5da, 0x5bc, 0},          // 0xfb3a       kaf sofit+dagesh
    {0x5db, 0x5bc, 0},          // 0xfb3b       kaf+dagesh
    {0x5dc, 0x5bc, 0},          // 0xfb3c       lamed+dagesh
    {0xfb3d, 0, 0},             // 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},          // 0xfb3e       mem+dagesh
    {0xfb3f, 0, 0},             // 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},          // 0xfb40       nun+dagesh
    {0x5e1, 0x5bc, 0},          // 0xfb41       samech+dagesh
    {0xfb42, 0, 0},             // 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},          // 0xfb43       pe sofit+dagesh
    {0x5e4, 0x5bc,0},           // 0xfb44       pe+dagesh
    {0xfb45, 0, 0},             // 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},          // 0xfb46       tsadi+dagesh
    {0x5e7, 0x5bc, 0},          // 0xfb47       qof+dagesh
    {0x5e8, 0x5bc, 0},          // 0xfb48       resh+dagesh
    {0x5e9, 0x5bc, 0},          // 0xfb49       shin+dagesh
    {0x5ea, 0x5bc, 0},          // 0xfb4a       tav+dagesh
    {0x5d5, 0x5b9, 0},          // 0xfb4b       vav+holam
    {0x5d1, 0x5bf, 0},          // 0xfb4c       bet+rafe
    {0x5db, 0x5bf, 0},          // 0xfb4d       kaf+rafe
    {0x5e4, 0x5bf, 0},          // 0xfb4e       pe+rafe
    {0x5d0, 0x5dc, 0}           // 0xfb4f       alef-lamed
};

/*
 * Decompose character "c" into up to three characters, stored in "*c1",
 * "*c2" and "*c3".  Unused positions are set to zero.
 * Only the characters 0xfb20 - 0xfb4f are decomposed, for any other
 * character "*c1" is set to "c" and the others to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    decomp_T    *entry;

    if (c < 0xfb20 || c > 0xfb4f)
    {
        // Not in the table: the character is its own decomposition.
        *c1 = c;
        *c2 = *c3 = 0;
        return;
    }

    entry = &decomp_table[c - 0xfb20];
    *c1 = entry->a;
    *c2 = entry->b;
    *c3 = entry->c;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001588
1589/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001590 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001591 * Return 0 if strings match, non-zero otherwise.
1592 * Correct the length "*n" when composing characters are ignored.
1593 */
1594 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001595cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001596{
1597 int result;
1598
Bram Moolenaar6100d022016-10-02 16:51:57 +02001599 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001600 result = STRNCMP(s1, s2, *n);
1601 else
1602 result = MB_STRNICMP(s1, s2, *n);
1603
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001604 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001605 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001606 {
1607 char_u *str1, *str2;
1608 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001609 int junk;
1610
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001611 // we have to handle the strcmp ourselves, since it is necessary to
1612 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001613 str1 = s1;
1614 str2 = s2;
1615 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001616 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001617 {
1618 c1 = mb_ptr2char_adv(&str1);
1619 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001620
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001621 // Decompose the character if necessary, into 'base' characters.
1622 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001623 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001624 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001625 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001626 mb_decompose(c1, &c11, &junk, &junk);
1627 mb_decompose(c2, &c12, &junk, &junk);
1628 c1 = c11;
1629 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001630 if (c11 != c12
1631 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001632 break;
1633 }
1634 }
1635 result = c2 - c1;
1636 if (result == 0)
1637 *n = (int)(str2 - s2);
1638 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001639
1640 return result;
1641}
1642
1643/*
1644 * cstrchr: This function is used a lot for simple searches, keep it fast!
1645 */
1646 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001647cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001648{
1649 char_u *p;
1650 int cc;
1651
Bram Moolenaara12a1612019-01-24 16:39:02 +01001652 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001653 return vim_strchr(s, c);
1654
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001655 // tolower() and toupper() can be slow, comparing twice should be a lot
1656 // faster (esp. when using MS Visual C++!).
1657 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001658 if (enc_utf8 && c > 0x80)
1659 cc = utf_fold(c);
1660 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001661 if (MB_ISUPPER(c))
1662 cc = MB_TOLOWER(c);
1663 else if (MB_ISLOWER(c))
1664 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001665 else
1666 return vim_strchr(s, c);
1667
Bram Moolenaar071d4272004-06-13 20:20:40 +00001668 if (has_mbyte)
1669 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001670 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001671 {
1672 if (enc_utf8 && c > 0x80)
1673 {
1674 if (utf_fold(utf_ptr2char(p)) == cc)
1675 return p;
1676 }
1677 else if (*p == c || *p == cc)
1678 return p;
1679 }
1680 }
1681 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001682 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001683 for (p = s; *p != NUL; ++p)
1684 if (*p == c || *p == cc)
1685 return p;
1686
1687 return NULL;
1688}
1689
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001690////////////////////////////////////////////////////////////////
1691// regsub stuff //
1692////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001693
/*
 * We should define fptr_T as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * void pointer.  This should work for all compilers.
 */
Bram Moolenaar30d64132020-09-06 17:09:12 +02001700typedef void (*(*fptr_T)(int *, int));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001701
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001702static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001703
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001704 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001705do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001706{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001707 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001708
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001709 return (fptr_T)NULL;
1710}
1711
1712 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001713do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001714{
1715 *d = MB_TOUPPER(c);
1716
1717 return (fptr_T)do_Upper;
1718}
1719
1720 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001721do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001722{
1723 *d = MB_TOLOWER(c);
1724
1725 return (fptr_T)NULL;
1726}
1727
1728 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001729do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001730{
1731 *d = MB_TOLOWER(c);
1732
1733 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001734}
1735
1736/*
1737 * regtilde(): Replace tildes in the pattern by the old pattern.
1738 *
1739 * Short explanation of the tilde: It stands for the previous replacement
1740 * pattern. If that previous pattern also contains a ~ we should go back a
1741 * step further... But we insert the previous pattern into the current one
1742 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001743 * This still does not handle the case where "magic" changes. So require the
1744 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001745 *
1746 * The tildes are parsed once before the first call to vim_regsub().
1747 */
1748 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001749regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001750{
1751 char_u *newsub = source;
1752 char_u *tmpsub;
1753 char_u *p;
1754 int len;
1755 int prevlen;
1756
1757 for (p = newsub; *p; ++p)
1758 {
1759 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1760 {
1761 if (reg_prev_sub != NULL)
1762 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001763 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001764 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001765 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001766 if (tmpsub != NULL)
1767 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001768 // copy prefix
1769 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001770 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001771 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001772 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001773 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001774 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001775 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001776 STRCPY(tmpsub + len + prevlen, p + 1);
1777
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001778 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001779 vim_free(newsub);
1780 newsub = tmpsub;
1781 p = newsub + len + prevlen;
1782 }
1783 }
1784 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001785 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001786 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001787 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001788 --p;
1789 }
1790 else
1791 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001792 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001793 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001794 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001795 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001796 }
1797 }
1798
1799 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001800 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001801 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001802 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001803 reg_prev_sub = vim_strsave(newsub);
1804 return newsub;
1805}
1806
#ifdef FEAT_EVAL
// TRUE when submatch() can be used; set by vim_regsub_both() while it
// evaluates a replacement expression.
static int can_f_submatch = FALSE;

// These pointers are used for reg_submatch().  Needed for when the
// substitution string is an expression that contains a call to substitute()
// and submatch().
// vim_regsub_both() fills each field from the "rex" field named in its
// comment.
typedef struct {
    regmatch_T  *sm_match;      // rex.reg_match
    regmmatch_T *sm_mmatch;     // rex.reg_mmatch
    linenr_T    sm_firstlnum;   // rex.reg_firstlnum
    linenr_T    sm_maxline;     // rex.reg_maxline
    int         sm_line_lbr;    // rex.reg_line_lbr
} regsubmatch_T;

// can only be used when can_f_submatch is TRUE
static regsubmatch_T rsm;
#endif
1823
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001824#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001825
1826/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001827 * Put the submatches in "argv[argskip]" which is a list passed into
1828 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001829 */
1830 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001831fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001832{
1833 listitem_T *li;
1834 int i;
1835 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001836 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001837
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001838 if (argcount == argskip)
1839 // called function doesn't take a submatches argument
1840 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001841
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001842 // Relies on sl_list to be the first item in staticList10_T.
1843 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001844
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001845 // There are always 10 list items in staticList10_T.
1846 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001847 for (i = 0; i < 10; ++i)
1848 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001849 s = rsm.sm_match->startp[i];
1850 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001851 s = NULL;
1852 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02001853 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001854 li->li_tv.v_type = VAR_STRING;
1855 li->li_tv.vval.v_string = s;
1856 li = li->li_next;
1857 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001858 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001859}
1860
1861 static void
1862clear_submatch_list(staticList10_T *sl)
1863{
1864 int i;
1865
1866 for (i = 0; i < 10; ++i)
1867 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1868}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001869#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001870
Bram Moolenaar071d4272004-06-13 20:20:40 +00001871/*
1872 * vim_regsub() - perform substitutions after a vim_regexec() or
1873 * vim_regexec_multi() match.
1874 *
1875 * If "copy" is TRUE really copy into "dest".
1876 * If "copy" is FALSE nothing is copied, this is just to find out the length
1877 * of the result.
1878 *
1879 * If "backslash" is TRUE, a backslash will be removed later, need to double
1880 * them to keep them, and insert a backslash before a CR to avoid it being
1881 * replaced with a line break later.
1882 *
1883 * Note: The matched text must not change between the call of
1884 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1885 * references invalid!
1886 *
1887 * Returns the size of the replacement, including terminating NUL.
1888 */
1889 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001890vim_regsub(
1891 regmatch_T *rmp,
1892 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001893 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001894 char_u *dest,
1895 int copy,
1896 int magic,
1897 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001898{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001899 int result;
1900 regexec_T rex_save;
1901 int rex_in_use_save = rex_in_use;
1902
1903 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001904 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001905 rex_save = rex;
1906 rex_in_use = TRUE;
1907
1908 rex.reg_match = rmp;
1909 rex.reg_mmatch = NULL;
1910 rex.reg_maxline = 0;
1911 rex.reg_buf = curbuf;
1912 rex.reg_line_lbr = TRUE;
1913 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1914
1915 rex_in_use = rex_in_use_save;
1916 if (rex_in_use)
1917 rex = rex_save;
1918
1919 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001920}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001921
1922 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001923vim_regsub_multi(
1924 regmmatch_T *rmp,
1925 linenr_T lnum,
1926 char_u *source,
1927 char_u *dest,
1928 int copy,
1929 int magic,
1930 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001931{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001932 int result;
1933 regexec_T rex_save;
1934 int rex_in_use_save = rex_in_use;
1935
1936 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001937 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001938 rex_save = rex;
1939 rex_in_use = TRUE;
1940
1941 rex.reg_match = NULL;
1942 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001943 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001944 rex.reg_firstlnum = lnum;
1945 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1946 rex.reg_line_lbr = FALSE;
1947 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1948
1949 rex_in_use = rex_in_use_save;
1950 if (rex_in_use)
1951 rex = rex_save;
1952
1953 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001954}
1955
1956 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001957vim_regsub_both(
1958 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001959 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001960 char_u *dest,
1961 int copy,
1962 int magic,
1963 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001964{
1965 char_u *src;
1966 char_u *dst;
1967 char_u *s;
1968 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001969 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001970 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001971 fptr_T func_all = (fptr_T)NULL;
1972 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001973 linenr_T clnum = 0; // init for GCC
1974 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001975#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001976 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001977#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001978
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001979 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001980 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001981 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001982 emsg(_(e_null));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001983 return 0;
1984 }
1985 if (prog_magic_wrong())
1986 return 0;
1987 src = source;
1988 dst = dest;
1989
1990 /*
1991 * When the substitute part starts with "\=" evaluate it as an expression.
1992 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001993 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001994 {
1995#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001996 // To make sure that the length doesn't change between checking the
1997 // length and copying the string, and to speed up things, the
1998 // resulting string is saved from the call with "copy" == FALSE to the
1999 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002000 if (copy)
2001 {
2002 if (eval_result != NULL)
2003 {
2004 STRCPY(dest, eval_result);
2005 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01002006 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002007 }
2008 }
2009 else
2010 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002011 int prev_can_f_submatch = can_f_submatch;
2012 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002013
2014 vim_free(eval_result);
2015
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002016 // The expression may contain substitute(), which calls us
2017 // recursively. Make sure submatch() gets the text from the first
2018 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002019 if (can_f_submatch)
2020 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002021 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002022 rsm.sm_match = rex.reg_match;
2023 rsm.sm_mmatch = rex.reg_mmatch;
2024 rsm.sm_firstlnum = rex.reg_firstlnum;
2025 rsm.sm_maxline = rex.reg_maxline;
2026 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002027
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002028 if (expr != NULL)
2029 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002030 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002031 char_u buf[NUMBUFLEN];
2032 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002033 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002034 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002035
2036 rettv.v_type = VAR_STRING;
2037 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002038 argv[0].v_type = VAR_LIST;
2039 argv[0].vval.v_list = &matchList.sl_list;
2040 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002041 CLEAR_FIELD(funcexe);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002042 funcexe.argv_func = fill_submatch_list;
2043 funcexe.evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002044 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002045 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002046 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002047 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002048 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002049 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002050 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002051 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002052
Bram Moolenaar6100d022016-10-02 16:51:57 +02002053 s = partial_name(partial);
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002054 funcexe.partial = partial;
2055 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002056 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002057 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002058 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002059 clear_submatch_list(&matchList);
2060
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002061 if (rettv.v_type == VAR_UNKNOWN)
2062 // something failed, no need to report another error
2063 eval_result = NULL;
2064 else
2065 {
2066 eval_result = tv_get_string_buf_chk(&rettv, buf);
2067 if (eval_result != NULL)
2068 eval_result = vim_strsave(eval_result);
2069 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002070 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002071 }
2072 else
Bram Moolenaarb171fb12020-06-24 20:34:03 +02002073 eval_result = eval_to_string(source + 2, TRUE);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002074
Bram Moolenaar071d4272004-06-13 20:20:40 +00002075 if (eval_result != NULL)
2076 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002077 int had_backslash = FALSE;
2078
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002079 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002080 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002081 // Change NL to CR, so that it becomes a line break,
2082 // unless called from vim_regexec_nl().
2083 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002084 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002085 *s = CAR;
2086 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002087 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002088 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002089 /* Change NL to CR here too, so that this works:
2090 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2091 * abc\
2092 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002093 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002094 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002095 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002096 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002097 had_backslash = TRUE;
2098 }
2099 }
2100 if (had_backslash && backslash)
2101 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002102 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002103 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2104 if (s != NULL)
2105 {
2106 vim_free(eval_result);
2107 eval_result = s;
2108 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002109 }
2110
2111 dst += STRLEN(eval_result);
2112 }
2113
Bram Moolenaar6100d022016-10-02 16:51:57 +02002114 can_f_submatch = prev_can_f_submatch;
2115 if (can_f_submatch)
2116 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002117 }
2118#endif
2119 }
2120 else
2121 while ((c = *src++) != NUL)
2122 {
2123 if (c == '&' && magic)
2124 no = 0;
2125 else if (c == '\\' && *src != NUL)
2126 {
2127 if (*src == '&' && !magic)
2128 {
2129 ++src;
2130 no = 0;
2131 }
2132 else if ('0' <= *src && *src <= '9')
2133 {
2134 no = *src++ - '0';
2135 }
2136 else if (vim_strchr((char_u *)"uUlLeE", *src))
2137 {
2138 switch (*src++)
2139 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002140 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002141 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002142 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002143 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002144 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002145 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002146 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002147 continue;
2148 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002149 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002150 continue;
2151 }
2152 }
2153 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002154 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002155 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002156 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2157 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002158 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002159 if (copy)
2160 {
2161 *dst++ = c;
2162 *dst++ = *src++;
2163 *dst++ = *src++;
2164 }
2165 else
2166 {
2167 dst += 3;
2168 src += 2;
2169 }
2170 continue;
2171 }
2172
Bram Moolenaar071d4272004-06-13 20:20:40 +00002173 if (c == '\\' && *src != NUL)
2174 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002175 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002176 switch (*src)
2177 {
2178 case 'r': c = CAR; ++src; break;
2179 case 'n': c = NL; ++src; break;
2180 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002181 // Oh no! \e already has meaning in subst pat :-(
2182 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002183 case 'b': c = Ctrl_H; ++src; break;
2184
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002185 // If "backslash" is TRUE the backslash will be removed
2186 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002187 default: if (backslash)
2188 {
2189 if (copy)
2190 *dst = '\\';
2191 ++dst;
2192 }
2193 c = *src++;
2194 }
2195 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002196 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002197 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002198
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002199 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002200 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002201 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002202 func_one = (fptr_T)(func_one(&cc, c));
2203 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002204 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002205 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002206 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002207 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002208
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002209 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002210 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002211 int totlen = mb_ptr2len(src - 1);
2212
Bram Moolenaar071d4272004-06-13 20:20:40 +00002213 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002214 mb_char2bytes(cc, dst);
2215 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002216 if (enc_utf8)
2217 {
2218 int clen = utf_ptr2len(src - 1);
2219
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002220 // If the character length is shorter than "totlen", there
2221 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002222 if (clen < totlen)
2223 {
2224 if (copy)
2225 mch_memmove(dst + 1, src - 1 + clen,
2226 (size_t)(totlen - clen));
2227 dst += totlen - clen;
2228 }
2229 }
2230 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002231 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002232 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002233 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002234 dst++;
2235 }
2236 else
2237 {
2238 if (REG_MULTI)
2239 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002240 clnum = rex.reg_mmatch->startpos[no].lnum;
2241 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002242 s = NULL;
2243 else
2244 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002245 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2246 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2247 len = rex.reg_mmatch->endpos[no].col
2248 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002249 else
2250 len = (int)STRLEN(s);
2251 }
2252 }
2253 else
2254 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002255 s = rex.reg_match->startp[no];
2256 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002257 s = NULL;
2258 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002259 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002260 }
2261 if (s != NULL)
2262 {
2263 for (;;)
2264 {
2265 if (len == 0)
2266 {
2267 if (REG_MULTI)
2268 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002269 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002270 break;
2271 if (copy)
2272 *dst = CAR;
2273 ++dst;
2274 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002275 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2276 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002277 else
2278 len = (int)STRLEN(s);
2279 }
2280 else
2281 break;
2282 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002283 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002284 {
2285 if (copy)
Bram Moolenaare83cca22020-09-07 18:53:21 +02002286 iemsg(_(e_re_damg));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002287 goto exit;
2288 }
2289 else
2290 {
2291 if (backslash && (*s == CAR || *s == '\\'))
2292 {
2293 /*
2294 * Insert a backslash in front of a CR, otherwise
2295 * it will be replaced by a line break.
2296 * Number of backslashes will be halved later,
2297 * double them here.
2298 */
2299 if (copy)
2300 {
2301 dst[0] = '\\';
2302 dst[1] = *s;
2303 }
2304 dst += 2;
2305 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002306 else
2307 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002308 if (has_mbyte)
2309 c = mb_ptr2char(s);
2310 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002311 c = *s;
2312
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002313 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002314 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002315 func_one = (fptr_T)(func_one(&cc, c));
2316 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002317 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002318 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002319 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002320 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002321
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002322 if (has_mbyte)
2323 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002324 int l;
2325
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002326 // Copy composing characters separately, one
2327 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002328 if (enc_utf8)
2329 l = utf_ptr2len(s) - 1;
2330 else
2331 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002332
2333 s += l;
2334 len -= l;
2335 if (copy)
2336 mb_char2bytes(cc, dst);
2337 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002338 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002339 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002340 *dst = cc;
2341 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002342 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002343
Bram Moolenaar071d4272004-06-13 20:20:40 +00002344 ++s;
2345 --len;
2346 }
2347 }
2348 }
2349 no = -1;
2350 }
2351 }
2352 if (copy)
2353 *dst = NUL;
2354
2355exit:
2356 return (int)((dst - dest) + 1);
2357}
2358
2359#ifdef FEAT_EVAL
2360/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002361 * Call reg_getline() with the line numbers from the submatch. If a
2362 * substitute() was used the reg_maxline and other values have been
2363 * overwritten.
2364 */
2365 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002366reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002367{
2368 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002369 linenr_T save_first = rex.reg_firstlnum;
2370 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002371
Bram Moolenaar6100d022016-10-02 16:51:57 +02002372 rex.reg_firstlnum = rsm.sm_firstlnum;
2373 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002374
2375 s = reg_getline(lnum);
2376
Bram Moolenaar6100d022016-10-02 16:51:57 +02002377 rex.reg_firstlnum = save_first;
2378 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002379 return s;
2380}
2381
2382/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002383 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002384 * allocated memory.
2385 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2386 */
2387 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002388reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002389{
2390 char_u *retval = NULL;
2391 char_u *s;
2392 int len;
2393 int round;
2394 linenr_T lnum;
2395
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002396 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002397 return NULL;
2398
Bram Moolenaar6100d022016-10-02 16:51:57 +02002399 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002400 {
2401 /*
2402 * First round: compute the length and allocate memory.
2403 * Second round: copy the text.
2404 */
2405 for (round = 1; round <= 2; ++round)
2406 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002407 lnum = rsm.sm_mmatch->startpos[no].lnum;
2408 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002409 return NULL;
2410
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002411 s = reg_getline_submatch(lnum);
2412 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002413 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002414 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002415 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002416 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002417 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002418 len = rsm.sm_mmatch->endpos[no].col
2419 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002420 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002421 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002422 ++len;
2423 }
2424 else
2425 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002426 // Multiple lines: take start line from start col, middle
2427 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002428 len = (int)STRLEN(s);
2429 if (round == 2)
2430 {
2431 STRCPY(retval, s);
2432 retval[len] = '\n';
2433 }
2434 ++len;
2435 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002436 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002437 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002438 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002439 if (round == 2)
2440 STRCPY(retval + len, s);
2441 len += (int)STRLEN(s);
2442 if (round == 2)
2443 retval[len] = '\n';
2444 ++len;
2445 }
2446 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002447 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002448 rsm.sm_mmatch->endpos[no].col);
2449 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002450 if (round == 2)
2451 retval[len] = NUL;
2452 ++len;
2453 }
2454
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002455 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002456 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002457 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002458 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002459 return NULL;
2460 }
2461 }
2462 }
2463 else
2464 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002465 s = rsm.sm_match->startp[no];
2466 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002467 retval = NULL;
2468 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002469 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002470 }
2471
2472 return retval;
2473}
Bram Moolenaar41571762014-04-02 19:00:58 +02002474
2475/*
2476 * Used for the submatch() function with the optional non-zero argument: get
2477 * the list of strings from the n'th submatch in allocated memory with NULs
2478 * represented in NLs.
2479 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2480 * command, for a non-existing submatch and for any error.
2481 */
2482 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002483reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002484{
2485 char_u *s;
2486 linenr_T slnum;
2487 linenr_T elnum;
2488 colnr_T scol;
2489 colnr_T ecol;
2490 int i;
2491 list_T *list;
2492 int error = FALSE;
2493
2494 if (!can_f_submatch || no < 0)
2495 return NULL;
2496
Bram Moolenaar6100d022016-10-02 16:51:57 +02002497 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002498 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002499 slnum = rsm.sm_mmatch->startpos[no].lnum;
2500 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002501 if (slnum < 0 || elnum < 0)
2502 return NULL;
2503
Bram Moolenaar6100d022016-10-02 16:51:57 +02002504 scol = rsm.sm_mmatch->startpos[no].col;
2505 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002506
2507 list = list_alloc();
2508 if (list == NULL)
2509 return NULL;
2510
2511 s = reg_getline_submatch(slnum) + scol;
2512 if (slnum == elnum)
2513 {
2514 if (list_append_string(list, s, ecol - scol) == FAIL)
2515 error = TRUE;
2516 }
2517 else
2518 {
2519 if (list_append_string(list, s, -1) == FAIL)
2520 error = TRUE;
2521 for (i = 1; i < elnum - slnum; i++)
2522 {
2523 s = reg_getline_submatch(slnum + i);
2524 if (list_append_string(list, s, -1) == FAIL)
2525 error = TRUE;
2526 }
2527 s = reg_getline_submatch(elnum);
2528 if (list_append_string(list, s, ecol) == FAIL)
2529 error = TRUE;
2530 }
2531 }
2532 else
2533 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002534 s = rsm.sm_match->startp[no];
2535 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002536 return NULL;
2537 list = list_alloc();
2538 if (list == NULL)
2539 return NULL;
2540 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002541 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002542 error = TRUE;
2543 }
2544
2545 if (error)
2546 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002547 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002548 return NULL;
2549 }
Bram Moolenaar8a0dcf42020-09-06 15:14:45 +02002550 ++list->lv_refcount;
Bram Moolenaar41571762014-04-02 19:00:58 +02002551 return list;
2552}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002553#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002554
Bram Moolenaarf4140482020-02-15 23:06:45 +01002555/*
2556 * Initialize the values used for matching against multiple lines
2557 */
2558 static void
2559init_regexec_multi(
2560 regmmatch_T *rmp,
2561 win_T *win, // window in which to search or NULL
2562 buf_T *buf, // buffer in which to search
2563 linenr_T lnum) // nr of line to start looking for match
2564{
2565 rex.reg_match = NULL;
2566 rex.reg_mmatch = rmp;
2567 rex.reg_buf = buf;
2568 rex.reg_win = win;
2569 rex.reg_firstlnum = lnum;
2570 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2571 rex.reg_line_lbr = FALSE;
2572 rex.reg_ic = rmp->rmm_ic;
2573 rex.reg_icombine = FALSE;
2574 rex.reg_maxcol = rmp->rmm_maxcol;
2575}
2576
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002577#include "regexp_bt.c"
2578
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002579static regengine_T bt_regengine =
2580{
2581 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002582 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002583 bt_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002584 bt_regexec_multi,
2585 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002586};
2587
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002588#include "regexp_nfa.c"
2589
2590static regengine_T nfa_regengine =
2591{
2592 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002593 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002594 nfa_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002595 nfa_regexec_multi,
2596 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002597};
2598
// The regexp engine to use, needed for vim_regcomp(), which sets it from
// "p_re" and from a "\%#=" prefix in the pattern.
// Must match with 'regexpengine'.
static int regexp_engine = 0;

#ifdef DEBUG
// Engine names for the debug message in vim_regcomp(), indexed by the
// engine number.
static char_u regname[][30] = {
    "AUTOMATIC Regexp Engine",
    "BACKTRACKING Regexp Engine",
    "NFA Regexp Engine"
};
#endif
2610
2611/*
2612 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002613 * Returns the program in allocated memory.
2614 * Use vim_regfree() to free the memory.
2615 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002616 */
2617 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002618vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002619{
2620 regprog_T *prog = NULL;
2621 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002622 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002623
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002624 regexp_engine = p_re;
2625
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002626 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002627 if (STRNCMP(expr, "\\%#=", 4) == 0)
2628 {
2629 int newengine = expr[4] - '0';
2630
2631 if (newengine == AUTOMATIC_ENGINE
2632 || newengine == BACKTRACKING_ENGINE
2633 || newengine == NFA_ENGINE)
2634 {
2635 regexp_engine = expr[4] - '0';
2636 expr += 5;
2637#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002638 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002639 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002640#endif
2641 }
2642 else
2643 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002644 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002645 regexp_engine = AUTOMATIC_ENGINE;
2646 }
2647 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002648#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002649 bt_regengine.expr = expr;
2650 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002651#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002652 // reg_iswordc() uses rex.reg_buf
2653 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002654
2655 /*
2656 * First try the NFA engine, unless backtracking was requested.
2657 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002658 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002659 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002660 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002661 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002662 else
2663 prog = bt_regengine.regcomp(expr, re_flags);
2664
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002665 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002666 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002667 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002668#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002669 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002670 {
2671 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002672 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002673 if (f)
2674 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002675 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002676 fclose(f);
2677 }
2678 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002679 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002680 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002681 }
2682#endif
2683 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002684 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002685 * The NFA engine also fails for patterns that it can't handle well
2686 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002687 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002688 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002689 if (regexp_engine == AUTOMATIC_ENGINE
2690 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002691 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002692 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002693#ifdef FEAT_EVAL
2694 report_re_switch(expr);
2695#endif
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002696 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002697 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002698 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002699
Bram Moolenaarfda37292014-11-05 14:27:36 +01002700 if (prog != NULL)
2701 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002702 // Store the info needed to call regcomp() again when the engine turns
2703 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002704 prog->re_engine = regexp_engine;
2705 prog->re_flags = re_flags;
2706 }
2707
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002708 return prog;
2709}
2710
2711/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002712 * Free a compiled regexp program, returned by vim_regcomp().
2713 */
2714 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002715vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002716{
2717 if (prog != NULL)
2718 prog->engine->regfree(prog);
2719}
2720
#if defined(EXITFREE) || defined(PROTO)
/*
 * Release the memory held by the file-level regexp variables: the
 * "regstack" and "backpos" growarrays, "reg_tofree" and "reg_prev_sub".
 */
    void
free_regexp_stuff(void)
{
    // the growarrays
    ga_clear(&regstack);
    ga_clear(&backpos);

    // the allocated strings
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);
}
#endif
2731
#ifdef FEAT_EVAL
/*
 * When 'verbose' is set ("p_verbose" is above zero): give a message that
 * the backtracking engine is going to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
        return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2745
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
        || defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog", which the vim_regexec functions
 * set to TRUE for as long as they are executing the program.
 * "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    return prog->re_in_use;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002757
2758/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002759 * Match a regexp against a string.
2760 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002761 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002762 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002763 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002764 *
2765 * Return TRUE if there is a match, FALSE if not.
2766 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002767 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002768vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002769 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002770 char_u *line, // string to match against
2771 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002772 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002773{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002774 int result;
2775 regexec_T rex_save;
2776 int rex_in_use_save = rex_in_use;
2777
Bram Moolenaar0270f382018-07-17 05:43:58 +02002778 // Cannot use the same prog recursively, it contains state.
2779 if (rmp->regprog->re_in_use)
2780 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002781 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002782 return FALSE;
2783 }
2784 rmp->regprog->re_in_use = TRUE;
2785
Bram Moolenaar6100d022016-10-02 16:51:57 +02002786 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002787 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002788 rex_save = rex;
2789 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002790
Bram Moolenaar6100d022016-10-02 16:51:57 +02002791 rex.reg_startp = NULL;
2792 rex.reg_endp = NULL;
2793 rex.reg_startpos = NULL;
2794 rex.reg_endpos = NULL;
2795
2796 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002797 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002798
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002799 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002800 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2801 && result == NFA_TOO_EXPENSIVE)
2802 {
2803 int save_p_re = p_re;
2804 int re_flags = rmp->regprog->re_flags;
2805 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2806
2807 p_re = BACKTRACKING_ENGINE;
2808 vim_regfree(rmp->regprog);
2809 if (pat != NULL)
2810 {
2811#ifdef FEAT_EVAL
2812 report_re_switch(pat);
2813#endif
2814 rmp->regprog = vim_regcomp(pat, re_flags);
2815 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002816 {
2817 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002818 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002819 rmp->regprog->re_in_use = FALSE;
2820 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002821 vim_free(pat);
2822 }
2823
2824 p_re = save_p_re;
2825 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002826
2827 rex_in_use = rex_in_use_save;
2828 if (rex_in_use)
2829 rex = rex_save;
2830
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002831 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002832}
2833
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002834/*
2835 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002836 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002837 */
2838 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002839vim_regexec_prog(
2840 regprog_T **prog,
2841 int ignore_case,
2842 char_u *line,
2843 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002844{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002845 int r;
2846 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002847
2848 regmatch.regprog = *prog;
2849 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002850 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002851 *prog = regmatch.regprog;
2852 return r;
2853}
2854
2855/*
2856 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002857 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002858 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002859 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002860vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002861{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002862 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002863}
2864
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002865/*
2866 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002867 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002868 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002869 */
2870 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002871vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002872{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002873 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002874}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002875
2876/*
2877 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002878 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2879 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002880 * Uses curbuf for line count and 'iskeyword'.
2881 *
2882 * Return zero if there is no match. Return number of lines contained in the
2883 * match otherwise.
2884 */
2885 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002886vim_regexec_multi(
2887 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002888 win_T *win, // window in which to search or NULL
2889 buf_T *buf, // buffer in which to search
2890 linenr_T lnum, // nr of line to start looking for match
2891 colnr_T col, // column to start looking for match
2892 proftime_T *tm, // timeout limit or NULL
2893 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002894{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002895 int result;
2896 regexec_T rex_save;
2897 int rex_in_use_save = rex_in_use;
2898
Bram Moolenaar0270f382018-07-17 05:43:58 +02002899 // Cannot use the same prog recursively, it contains state.
2900 if (rmp->regprog->re_in_use)
2901 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002902 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002903 return FALSE;
2904 }
2905 rmp->regprog->re_in_use = TRUE;
2906
Bram Moolenaar6100d022016-10-02 16:51:57 +02002907 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002908 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002909 rex_save = rex;
2910 rex_in_use = TRUE;
2911
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002912 result = rmp->regprog->engine->regexec_multi(
2913 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002914 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002915
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002916 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002917 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2918 && result == NFA_TOO_EXPENSIVE)
2919 {
2920 int save_p_re = p_re;
2921 int re_flags = rmp->regprog->re_flags;
2922 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2923
2924 p_re = BACKTRACKING_ENGINE;
2925 vim_regfree(rmp->regprog);
2926 if (pat != NULL)
2927 {
2928#ifdef FEAT_EVAL
2929 report_re_switch(pat);
2930#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002931#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002932 // checking for \z misuse was already done when compiling for NFA,
2933 // allow all here
2934 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002935#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002936 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002937#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002938 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002939#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002940
Bram Moolenaarfda37292014-11-05 14:27:36 +01002941 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002942 {
2943 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002944 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002945 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002946 rmp->regprog->re_in_use = FALSE;
2947 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002948 vim_free(pat);
2949 }
2950 p_re = save_p_re;
2951 }
2952
Bram Moolenaar6100d022016-10-02 16:51:57 +02002953 rex_in_use = rex_in_use_save;
2954 if (rex_in_use)
2955 rex = rex_save;
2956
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002957 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002958}