blob: 112f75369edef2573d90b7906d0fd5b162e27ad9 [file] [log] [blame]
Bram Moolenaaredf3f972016-08-29 22:49:24 +02001/* vi:set ts=8 sts=4 sw=4 noet:
Bram Moolenaar071d4272004-06-13 20:20:40 +00002 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
Bram Moolenaar071d4272004-06-13 20:20:40 +00004 */
5
Bram Moolenaarc2d09c92019-04-25 20:07:51 +02006// By default: do not create debugging logs or files related to regular
7// expressions, even when compiling with -DDEBUG.
8// Uncomment the second line to get the regexp debugging.
9#undef DEBUG
10// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020014#ifdef DEBUG
Bram Moolenaar63d9e732019-12-05 21:10:38 +010015// show/save debugging data when BT engine is used
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020016# define BT_REGEXP_DUMP
Bram Moolenaar63d9e732019-12-05 21:10:38 +010017// save the debugging data to a file instead of displaying it
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020018# define BT_REGEXP_LOG
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +020019# define BT_REGEXP_DEBUG_LOG
20# define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020021#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning, they don't match literally.
 * A Magic character is stored as its character code minus 256, thus it is
 * always negative.  This separates Magic characters from literal characters
 * (possibly multi-byte).  Only ASCII characters can be Magic.
 */
#define Magic(x)        ((int)(x) - 256)
#define un_Magic(x)     ((x) + 256)
#define is_Magic(x)     ((x) < 0)

/*
 * Return the literal character for "x": a Magic value is turned back into
 * the plain character, any other value is returned unchanged.
 */
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}

/*
 * Flip the magicness of "x": a Magic value becomes the literal character, a
 * literal character becomes its Magic value.
 */
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47
/*
 * The first byte of the BT regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.  It's used to catch the
 * most severe mutilation of the program by the caller.
 */

#define REGMAGIC        0234

/*
 * Utility definitions.
 */
// Fetch the byte at "p" through a char_u pointer and convert it to int.
#define UCHARAT(p)      ((int)*(char_u *)(p))

// Used for an error (down from) vim_regcomp(): give the error message, set
// rc_did_emsg and return NULL
// Each of these macros expands to a complete "return" statement, so they can
// only be used inside a function.  The _NULL variants return a NULL pointer,
// the _FAIL variants return FAIL.
// For the EMSG2/EMSG3 variants "m" is a format with a "%s" item; it is given
// "" when "c" is non-zero and "\" otherwise.  EMSG3 passes "a" as one more
// argument for the format.
#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
// E369 is given without a backslash only when the pattern is "very magic".
#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)


// 32767 * 65536 (0x7fff0000)
#define MAX_LIMIT       (32767L << 16L)

// Error messages.  NOTE(review): the "%s" in several of them is presumably
// filled in by the EMSG2_ macros above ("" or "\"); the callers are not in
// this part of the file.
static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
static char_u e_large_class[] = N_("E945: Range too large in character class");
static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
#ifdef FEAT_SYN_HL
static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
#endif
static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");

// return values for re_multi_type()
#define NOT_MULTI       0
#define MULTI_ONE       1
#define MULTI_MULT      2

// return values for regmatch()
#define RA_FAIL         1       // something failed, abort
#define RA_CONT         2       // continue in inner loop
#define RA_BREAK        3       // break inner loop
#define RA_MATCH        4       // successful match
#define RA_NOMATCH      5       // didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020098
Bram Moolenaar071d4272004-06-13 20:20:40 +000099/*
100 * Return NOT_MULTI if c is not a "multi" operator.
101 * Return MULTI_ONE if c is a single "multi" operator.
102 * Return MULTI_MULT if c is a multi "multi" operator.
103 */
104 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100105re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000106{
107 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
108 return MULTI_ONE;
109 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
110 return MULTI_MULT;
111 return NOT_MULTI;
112}
113
// NOTE(review): presumably the previously used substitute string; it is not
// used in this part of the file - confirm against vim_regsub().
static char_u           *reg_prev_sub = NULL;

/*
 * REGEXP_INRANGE contains all characters which are always special in a []
 * range after '\'.
 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
 * These are:
 *  \n  - New line (NL).
 *  \r  - Carriage Return (CR).
 *  \t  - Tab (TAB).
 *  \e  - Escape (ESC).
 *  \b  - Backspace (Ctrl_H).
 *  \d  - Character code in decimal, eg \d123
 *  \o  - Character code in octal, eg \o40
 *  \x  - Character code in hex, eg \x4a
 *  \u  - Multibyte character code, eg \u20ac
 *  \U  - Long multibyte character code, eg \U12345678
 */
static char_u REGEXP_INRANGE[] = "]^-n\\";
static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000134
Bram Moolenaar071d4272004-06-13 20:20:40 +0000135/*
136 * Translate '\x' to its control character, except "\n", which is Magic.
137 */
138 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100139backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000140{
141 switch (c)
142 {
143 case 'r': return CAR;
144 case 't': return TAB;
145 case 'e': return ESC;
146 case 'b': return BS;
147 }
148 return c;
149}
150
151/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000152 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000153 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
154 * recognized. Otherwise "pp" is advanced to after the item.
155 */
156 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100157get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000158{
159 static const char *(class_names[]) =
160 {
161 "alnum:]",
162#define CLASS_ALNUM 0
163 "alpha:]",
164#define CLASS_ALPHA 1
165 "blank:]",
166#define CLASS_BLANK 2
167 "cntrl:]",
168#define CLASS_CNTRL 3
169 "digit:]",
170#define CLASS_DIGIT 4
171 "graph:]",
172#define CLASS_GRAPH 5
173 "lower:]",
174#define CLASS_LOWER 6
175 "print:]",
176#define CLASS_PRINT 7
177 "punct:]",
178#define CLASS_PUNCT 8
179 "space:]",
180#define CLASS_SPACE 9
181 "upper:]",
182#define CLASS_UPPER 10
183 "xdigit:]",
184#define CLASS_XDIGIT 11
185 "tab:]",
186#define CLASS_TAB 12
187 "return:]",
188#define CLASS_RETURN 13
189 "backspace:]",
190#define CLASS_BACKSPACE 14
191 "escape:]",
192#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100193 "ident:]",
194#define CLASS_IDENT 16
195 "keyword:]",
196#define CLASS_KEYWORD 17
197 "fname:]",
198#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000199 };
200#define CLASS_NONE 99
201 int i;
202
203 if ((*pp)[1] == ':')
204 {
Bram Moolenaar78a15312009-05-15 19:33:18 +0000205 for (i = 0; i < (int)(sizeof(class_names) / sizeof(*class_names)); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000206 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
207 {
208 *pp += STRLEN(class_names[i]) + 2;
209 return i;
210 }
211 }
212 return CLASS_NONE;
213}
214
215/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000216 * Specific version of character class functions.
217 * Using a table to keep this fast.
218 */
219static short class_tab[256];
220
221#define RI_DIGIT 0x01
222#define RI_HEX 0x02
223#define RI_OCTAL 0x04
224#define RI_WORD 0x08
225#define RI_HEAD 0x10
226#define RI_ALPHA 0x20
227#define RI_LOWER 0x40
228#define RI_UPPER 0x80
229#define RI_WHITE 0x100
230
231 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100232init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000233{
234 int i;
235 static int done = FALSE;
236
237 if (done)
238 return;
239
240 for (i = 0; i < 256; ++i)
241 {
242 if (i >= '0' && i <= '7')
243 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
244 else if (i >= '8' && i <= '9')
245 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
246 else if (i >= 'a' && i <= 'f')
247 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
248#ifdef EBCDIC
249 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
250 || (i >= 's' && i <= 'z'))
251#else
252 else if (i >= 'g' && i <= 'z')
253#endif
254 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
255 else if (i >= 'A' && i <= 'F')
256 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
257#ifdef EBCDIC
258 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
259 || (i >= 'S' && i <= 'Z'))
260#else
261 else if (i >= 'G' && i <= 'Z')
262#endif
263 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
264 else if (i == '_')
265 class_tab[i] = RI_WORD + RI_HEAD;
266 else
267 class_tab[i] = 0;
268 }
269 class_tab[' '] |= RI_WHITE;
270 class_tab['\t'] |= RI_WHITE;
271 done = TRUE;
272}
273
// Character class tests that use class_tab[], which is filled by
// init_class_tab().
// A value of 0x100 or above is never in a class.  Negative values are not
// checked and must not be passed.
// "c" is put in parentheses for the comparison, so that an expression such as
// "x ? a : b" can be passed.  "c" is evaluated twice, don't pass an
// expression with side effects.
#define ri_digit(c)     ((c) < 0x100 && (class_tab[c] & RI_DIGIT))
#define ri_hex(c)       ((c) < 0x100 && (class_tab[c] & RI_HEX))
#define ri_octal(c)     ((c) < 0x100 && (class_tab[c] & RI_OCTAL))
#define ri_word(c)      ((c) < 0x100 && (class_tab[c] & RI_WORD))
#define ri_head(c)      ((c) < 0x100 && (class_tab[c] & RI_HEAD))
#define ri_alpha(c)     ((c) < 0x100 && (class_tab[c] & RI_ALPHA))
#define ri_lower(c)     ((c) < 0x100 && (class_tab[c] & RI_LOWER))
#define ri_upper(c)     ((c) < 0x100 && (class_tab[c] & RI_UPPER))
#define ri_white(c)     ((c) < 0x100 && (class_tab[c] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000283
// flags for regflags
// Each flag is a separate bit, the flags can be combined and tested with "&".
#define RF_ICASE    1   // ignore case
#define RF_NOICASE  2   // don't ignore case
#define RF_HASNL    4   // can match a NL
#define RF_ICOMBINE 8   // ignore combining characters
#define RF_LOOKBH   16  // uses "\@<=" or "\@<!"

/*
 * Global work variables for vim_regcomp().
 */

static char_u   *regparse;      // Input-scan pointer.
static int      regnpar;        // () count.
#ifdef FEAT_SYN_HL
static int      regnzpar;       // \z() count.
static int      re_has_z;       // \z item detected
#endif
static unsigned regflags;       // RF_ flags for prog
#if defined(FEAT_SYN_HL) || defined(PROTO)
static int      had_eol;        // TRUE when EOL found by vim_regcomp()
#endif

// The MAGIC_ values are ordered from least to most magic, the code compares
// reg_magic against them with ">=" and "<=".
static int      reg_magic;      // magicness of the pattern:
#define MAGIC_NONE      1       // "\V" very unmagic
#define MAGIC_OFF       2       // "\M" or 'magic' off
#define MAGIC_ON        3       // "\m" or 'magic'
#define MAGIC_ALL       4       // "\v" very magic

static int      reg_string;     // matching with a string instead of a buffer
                                // line
static int      reg_strict;     // "[abc" is illegal
/*
 * META contains all characters that may be magic, except '^' and '$'.
 */

#ifdef EBCDIC
static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
#else
// META[] is used often enough to justify turning it into a table.
// The table is indexed with the ASCII code of the character, 16 entries per
// row.  The last entry is for '~', check that the character is not above '~'
// before using it as an index.  The comment above each row has the
// characters that are flagged in that row.
static char_u META_flags[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
//                 %  &     (  )  *  +        .
    0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
//     1  2  3  4  5  6  7  8  9        <  =  >  ?
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
//  @  A     C  D     F     H  I     K  L  M     O
    1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
//  P        S     U  V  W  X     Z  [           _
    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
//     a     c  d     f     h  i     k  l  m  n  o
    0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
//  p        s     u  v  w  x     z  {  |     ~
    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
};
#endif
341
// State of the pattern lexer, see initchr() and peekchr().
static int      curchr;         // currently parsed character; -1 when the
                                // next character was not looked at yet
// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, eg in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int      prevchr;
static int      prevprevchr;    // previous-previous character
static int      nextchr;        // used for ungetchr()

// arguments for reg()
#define REG_NOPAREN     0       // toplevel reg()
#define REG_PAREN       1       // \(\)
#define REG_ZPAREN      2       // \z(\)
#define REG_NPAREN      3       // \%(\)

// Snapshot of the parser state.  Every field is a copy of the global
// variable with the same name; filled by save_parse_state() and put back by
// restore_parse_state().
typedef struct
{
     char_u     *regparse;
     int        prevchr_len;
     int        curchr;
     int        prevchr;
     int        prevprevchr;
     int        nextchr;
     int        at_start;
     int        prev_at_start;
     int        regnpar;
} parse_state_T;

// Forward declarations for functions that are used before they are defined.
static void     initchr(char_u *);
static int      getchr(void);
static void     skipchr_keepstart(void);
static int      peekchr(void);
static void     skipchr(void);
static void     ungetchr(void);
static long     gethexchrs(int maxinputlen);
static long     getoctchrs(void);
static long     getdecchrs(void);
static int      coll_get_char(void);
static int      prog_magic_wrong(void);
static int      cstrncmp(char_u *s1, char_u *s2, int *n);
static char_u   *cstrchr(char_u *, int);
static int      re_mult_next(char *what);
static int      reg_iswordc(int);

// NOTE(review): presumably the backtracking (BT) and the NFA regexp engine;
// they are not initialized in this part of the file.
static regengine_T bt_regengine;
static regengine_T nfa_regengine;
387
Bram Moolenaar071d4272004-06-13 20:20:40 +0000388/*
389 * Return TRUE if compiled regular expression "prog" can match a line break.
390 */
391 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100392re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000393{
394 return (prog->regflags & RF_HASNL);
395}
396
397/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000398 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
399 * Returns a character representing the class. Zero means that no item was
400 * recognized. Otherwise "pp" is advanced to after the item.
401 */
402 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100403get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000404{
405 int c;
406 int l = 1;
407 char_u *p = *pp;
408
Bram Moolenaar985079c2019-02-16 17:07:47 +0100409 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000410 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000411 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000412 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000413 if (p[l + 2] == '=' && p[l + 3] == ']')
414 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000415 if (has_mbyte)
416 c = mb_ptr2char(p + 2);
417 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000418 c = p[2];
419 *pp += l + 4;
420 return c;
421 }
422 }
423 return 0;
424}
425
#ifdef EBCDIC
/*
 * Table for equivalence class "c". (IBM-1047)
 * Each entry starts with an ASCII letter, followed by byte values.
 * NOTE(review): the byte values are presumably the IBM-1047 codes of the
 * accented variants of that letter - confirm against the code page.
 */
static char *EQUIVAL_CLASS_C[16] = {
    "A\x62\x63\x64\x65\x66\x67",
    "C\x68",
    "E\x71\x72\x73\x74",
    "I\x75\x76\x77\x78",
    "N\x69",
    "O\xEB\xEC\xED\xEE\xEF\x80",
    "U\xFB\xFC\xFD\xFE",
    "Y\xBA",
    "a\x42\x43\x44\x45\x46\x47",
    "c\x48",
    "e\x51\x52\x53\x54",
    "i\x55\x56\x57\x58",
    "n\x49",
    "o\xCB\xCC\xCD\xCE\xCF\x70",
    "u\xDB\xDC\xDD\xDE",
    "y\x8D\xDF",
};
#endif
449
Bram Moolenaardf177f62005-02-22 08:39:57 +0000450/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000451 * Check for a collating element "[.a.]". "pp" points to the '['.
452 * Returns a character. Zero means that no item was recognized. Otherwise
453 * "pp" is advanced to after the item.
454 * Currently only single characters are recognized!
455 */
456 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100457get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000458{
459 int c;
460 int l = 1;
461 char_u *p = *pp;
462
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100463 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000464 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000465 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000466 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000467 if (p[l + 2] == '.' && p[l + 3] == ']')
468 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000469 if (has_mbyte)
470 c = mb_ptr2char(p + 2);
471 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000472 c = p[2];
473 *pp += l + 4;
474 return c;
475 }
476 }
477 return 0;
478}
479
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100480static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
481static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200482
483 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100484get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200485{
486 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
487 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
488}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000489
490/*
491 * Skip over a "[]" range.
492 * "p" must point to the character after the '['.
493 * The returned pointer is on the matching ']', or the terminating NUL.
494 */
495 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100496skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000497{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000498 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000499
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100500 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000501 ++p;
502 if (*p == ']' || *p == '-')
503 ++p;
504 while (*p != NUL && *p != ']')
505 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000506 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000507 p += l;
508 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000509 if (*p == '-')
510 {
511 ++p;
512 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100513 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000514 }
515 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200516 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000517 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200518 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000519 p += 2;
520 else if (*p == '[')
521 {
522 if (get_char_class(&p) == CLASS_NONE
523 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200524 && get_coll_element(&p) == 0
525 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100526 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000527 }
528 else
529 ++p;
530 }
531
532 return p;
533}
534
535/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000536 * Skip past regular expression.
Bram Moolenaar748bf032005-02-02 23:04:36 +0000537 * Stop at end of "startp" or where "dirc" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000538 * Take care of characters with a backslash in front of it.
539 * Skip strings inside [ and ].
540 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
541 * expression and change "\?" to "?". If "*newp" is not NULL the expression
542 * is changed in-place.
543 */
544 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100545skip_regexp(
546 char_u *startp,
547 int dirc,
548 int magic,
549 char_u **newp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000550{
551 int mymagic;
552 char_u *p = startp;
553
554 if (magic)
555 mymagic = MAGIC_ON;
556 else
557 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200558 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000559
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100560 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000561 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100562 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000563 break;
564 if ((p[0] == '[' && mymagic >= MAGIC_ON)
565 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
566 {
567 p = skip_anyof(p + 1);
568 if (p[0] == NUL)
569 break;
570 }
571 else if (p[0] == '\\' && p[1] != NUL)
572 {
573 if (dirc == '?' && newp != NULL && p[1] == '?')
574 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100575 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000576 if (*newp == NULL)
577 {
578 *newp = vim_strsave(startp);
579 if (*newp != NULL)
580 p = *newp + (p - startp);
581 }
582 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000583 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000584 else
585 ++p;
586 }
587 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100588 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000589 if (*p == 'v')
590 mymagic = MAGIC_ALL;
591 else if (*p == 'V')
592 mymagic = MAGIC_NONE;
593 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000594 }
595 return p;
596}
597
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +0200598/*
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200599 * Functions for getting characters from the regexp input.
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +0200600 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100601static int prevchr_len; // byte length of previous char
Bram Moolenaar0270f382018-07-17 05:43:58 +0200602static int at_start; // True when on the first character
603static int prev_at_start; // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100604
Bram Moolenaar071d4272004-06-13 20:20:40 +0000605/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200606 * Start parsing at "str".
607 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000608 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100609initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000610{
611 regparse = str;
612 prevchr_len = 0;
613 curchr = prevprevchr = prevchr = nextchr = -1;
614 at_start = TRUE;
615 prev_at_start = FALSE;
616}
617
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200618/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200619 * Save the current parse state, so that it can be restored and parsing
620 * starts in the same state again.
621 */
622 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100623save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200624{
625 ps->regparse = regparse;
626 ps->prevchr_len = prevchr_len;
627 ps->curchr = curchr;
628 ps->prevchr = prevchr;
629 ps->prevprevchr = prevprevchr;
630 ps->nextchr = nextchr;
631 ps->at_start = at_start;
632 ps->prev_at_start = prev_at_start;
633 ps->regnpar = regnpar;
634}
635
636/*
637 * Restore a previously saved parse state.
638 */
639 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100640restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200641{
642 regparse = ps->regparse;
643 prevchr_len = ps->prevchr_len;
644 curchr = ps->curchr;
645 prevchr = ps->prevchr;
646 prevprevchr = ps->prevprevchr;
647 nextchr = ps->nextchr;
648 at_start = ps->at_start;
649 prev_at_start = ps->prev_at_start;
650 regnpar = ps->regnpar;
651}
652
653
654/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200655 * Get the next character without advancing.
656 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000657 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100658peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000659{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000660 static int after_slash = FALSE;
661
Bram Moolenaar071d4272004-06-13 20:20:40 +0000662 if (curchr == -1)
663 {
664 switch (curchr = regparse[0])
665 {
666 case '.':
667 case '[':
668 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100669 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000670 if (reg_magic >= MAGIC_ON)
671 curchr = Magic(curchr);
672 break;
673 case '(':
674 case ')':
675 case '{':
676 case '%':
677 case '+':
678 case '=':
679 case '?':
680 case '@':
681 case '!':
682 case '&':
683 case '|':
684 case '<':
685 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100686 case '#': // future ext.
687 case '"': // future ext.
688 case '\'': // future ext.
689 case ',': // future ext.
690 case '-': // future ext.
691 case ':': // future ext.
692 case ';': // future ext.
693 case '`': // future ext.
694 case '/': // Can't be used in / command
695 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000696 if (reg_magic == MAGIC_ALL)
697 curchr = Magic(curchr);
698 break;
699 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100700 // * is not magic as the very first character, eg "?*ptr", when
701 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
702 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000703 if (reg_magic >= MAGIC_ON
704 && !at_start
705 && !(prev_at_start && prevchr == Magic('^'))
706 && (after_slash
707 || (prevchr != Magic('(')
708 && prevchr != Magic('&')
709 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000710 curchr = Magic('*');
711 break;
712 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100713 // '^' is only magic as the very first character and if it's after
714 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000715 if (reg_magic >= MAGIC_OFF
716 && (at_start
717 || reg_magic == MAGIC_ALL
718 || prevchr == Magic('(')
719 || prevchr == Magic('|')
720 || prevchr == Magic('&')
721 || prevchr == Magic('n')
722 || (no_Magic(prevchr) == '('
723 && prevprevchr == Magic('%'))))
724 {
725 curchr = Magic('^');
726 at_start = TRUE;
727 prev_at_start = FALSE;
728 }
729 break;
730 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100731 // '$' is only magic as the very last char and if it's in front of
732 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000733 if (reg_magic >= MAGIC_OFF)
734 {
735 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200736 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000737
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100738 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000739 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200740 || p[1] == 'm' || p[1] == 'M'
741 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
742 {
743 if (p[1] == 'v')
744 is_magic_all = TRUE;
745 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
746 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000747 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200748 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000749 if (p[0] == NUL
750 || (p[0] == '\\'
751 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
752 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200753 || (is_magic_all
754 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000755 || reg_magic == MAGIC_ALL)
756 curchr = Magic('$');
757 }
758 break;
759 case '\\':
760 {
761 int c = regparse[1];
762
763 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100764 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000765 else if (
766#ifdef EBCDIC
767 vim_strchr(META, c)
768#else
769 c <= '~' && META_flags[c]
770#endif
771 )
772 {
773 /*
774 * META contains everything that may be magic sometimes,
775 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200776 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000777 * magicness. Therefore, \ is so meta-magic that it is
778 * not in META.
779 */
780 curchr = -1;
781 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100782 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000783 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000784 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000785 peekchr();
786 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000787 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000788 curchr = toggle_Magic(curchr);
789 }
790 else if (vim_strchr(REGEXP_ABBR, c))
791 {
792 /*
793 * Handle abbreviations, like "\t" for TAB -- webb
794 */
795 curchr = backslash_trans(c);
796 }
797 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
798 curchr = toggle_Magic(c);
799 else
800 {
801 /*
802 * Next character can never be (made) magic?
803 * Then backslashing it won't do anything.
804 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000805 if (has_mbyte)
806 curchr = (*mb_ptr2char)(regparse + 1);
807 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000808 curchr = c;
809 }
810 break;
811 }
812
Bram Moolenaar071d4272004-06-13 20:20:40 +0000813 default:
814 if (has_mbyte)
815 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000816 }
817 }
818
819 return curchr;
820}
821
822/*
823 * Eat one lexed character. Do this in a way that we can undo it.
824 */
825 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100826skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000827{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100828 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000829 if (*regparse == '\\')
830 prevchr_len = 1;
831 else
832 prevchr_len = 0;
833 if (regparse[prevchr_len] != NUL)
834 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000835 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100836 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000837 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000838 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000839 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000840 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000841 ++prevchr_len;
842 }
843 regparse += prevchr_len;
844 prev_at_start = at_start;
845 at_start = FALSE;
846 prevprevchr = prevchr;
847 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100848 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000849 nextchr = -1;
850}
851
852/*
853 * Skip a character while keeping the value of prev_at_start for at_start.
854 * prevchr and prevprevchr are also kept.
855 */
856 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100857skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000858{
859 int as = prev_at_start;
860 int pr = prevchr;
861 int prpr = prevprevchr;
862
863 skipchr();
864 at_start = as;
865 prevchr = pr;
866 prevprevchr = prpr;
867}
868
/*
 * Lex the next character of the pattern and consume it.
 * Magic-ness is taken into account by peekchr(); this function only combines
 * the peek with the skip and returns the peeked value.
 */
    static int
getchr(void)
{
    int lexed = peekchr();

    skipchr();
    return lexed;
}
881
882/*
883 * put character back. Works only once!
884 */
885 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100886ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000887{
888 nextchr = curchr;
889 curchr = prevchr;
890 prevchr = prevprevchr;
891 at_start = prev_at_start;
892 prev_at_start = FALSE;
893
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100894 // Backup regparse, so that it's at the same position as before the
895 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000896 regparse -= prevchr_len;
897}
898
899/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000900 * Get and return the value of the hex string at the current position.
901 * Return -1 if there is no valid hex number.
902 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000903 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000904 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000905 * The parameter controls the maximum number of input characters. This will be
906 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
907 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100908 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100909gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000910{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100911 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000912 int c;
913 int i;
914
915 for (i = 0; i < maxinputlen; ++i)
916 {
917 c = regparse[0];
918 if (!vim_isxdigit(c))
919 break;
920 nr <<= 4;
921 nr |= hex2nr(c);
922 ++regparse;
923 }
924
925 if (i == 0)
926 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100927 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000928}
929
930/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200931 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000932 * current position. Return -1 for invalid. Consumes all digits.
933 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100934 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100935getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000936{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100937 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000938 int c;
939 int i;
940
941 for (i = 0; ; ++i)
942 {
943 c = regparse[0];
944 if (c < '0' || c > '9')
945 break;
946 nr *= 10;
947 nr += c - '0';
948 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100949 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000950 }
951
952 if (i == 0)
953 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100954 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000955}
956
957/*
958 * get and return the value of the octal string immediately after the current
959 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
960 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
961 * treat 8 or 9 as recognised characters. Position is updated:
962 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000963 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000964 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100965 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100966getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000967{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100968 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000969 int c;
970 int i;
971
972 for (i = 0; i < 3 && nr < 040; ++i)
973 {
974 c = regparse[0];
975 if (c < '0' || c > '7')
976 break;
977 nr <<= 3;
978 nr |= hex2nr(c);
979 ++regparse;
980 }
981
982 if (i == 0)
983 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100984 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000985}
986
987/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000988 * read_limits - Read two integers to be taken as a minimum and maximum.
989 * If the first character is '-', then the range is reversed.
990 * Should end with 'end'. If minval is missing, zero is default, if maxval is
991 * missing, a very big number is the default.
992 */
993 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100994read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000995{
996 int reverse = FALSE;
997 char_u *first_char;
998 long tmp;
999
1000 if (*regparse == '-')
1001 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001002 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001003 regparse++;
1004 reverse = TRUE;
1005 }
1006 first_char = regparse;
1007 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001008 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001009 {
1010 if (vim_isdigit(*++regparse))
1011 *maxval = getdigits(&regparse);
1012 else
1013 *maxval = MAX_LIMIT;
1014 }
1015 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001016 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001017 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001018 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001019 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001020 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001021 if (*regparse != '}')
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001022 EMSG2_RET_FAIL(_("E554: Syntax error in %s{...}"),
1023 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001024
1025 /*
1026 * Reverse the range if there was a '-', or make sure it is in the right
1027 * order otherwise.
1028 */
1029 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1030 {
1031 tmp = *minval;
1032 *minval = *maxval;
1033 *maxval = tmp;
1034 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001035 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001036 return OK;
1037}
1038
1039/*
1040 * vim_regexec and friends
1041 */
1042
1043/*
1044 * Global work variables for vim_regexec().
1045 */
1046
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001047static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001048#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001049static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001050#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001051static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001052static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001053
1054/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001055 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1056 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001057 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001058 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001059static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001060static unsigned reg_tofreelen;
1061
1062/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001063 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001064 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001065 * done:
1066 * single-line multi-line
1067 * reg_match &regmatch_T NULL
1068 * reg_mmatch NULL &regmmatch_T
1069 * reg_startp reg_match->startp <invalid>
1070 * reg_endp reg_match->endp <invalid>
1071 * reg_startpos <invalid> reg_mmatch->startpos
1072 * reg_endpos <invalid> reg_mmatch->endpos
1073 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001074 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001075 * reg_firstlnum <invalid> first line in which to search
1076 * reg_maxline 0 last line nr
1077 * reg_line_lbr FALSE or TRUE FALSE
1078 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001079typedef struct {
1080 regmatch_T *reg_match; // NULL when matching multi-line, see REG_MULTI
1081 regmmatch_T *reg_mmatch;
1082 char_u **reg_startp;
1083 char_u **reg_endp;
1084 lpos_T *reg_startpos;
1085 lpos_T *reg_endpos;
1086 win_T *reg_win;
1087 buf_T *reg_buf; // buffer that reg_getline() fetches lines from
1088 linenr_T reg_firstlnum; // added to a relative line number by reg_getline()
1089 linenr_T reg_maxline; // reg_getline() returns "" for lines beyond this
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001090 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001091
Bram Moolenaar0270f382018-07-17 05:43:58 +02001092 // The current match-position is stored in these variables:
1093 linenr_T lnum; // line number, relative to first line
1094 char_u *line; // start of current line
1095 char_u *input; // current input, points into "line"
1096
1097 int need_clear_subexpr; // subexpressions still need to be cleared
1098#ifdef FEAT_SYN_HL
1099 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1100 // cleared
1101#endif
1102
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001103 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1104 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1105 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001106 int reg_ic;
1107
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001108 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1109 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001110 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001111
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001112 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1113 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001114 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001115
1116 // State for the NFA engine regexec.
1117 int nfa_has_zend; // NFA regexp \ze operator encountered.
1118 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1119 int nfa_nsubexpr; // Number of sub expressions actually being used
1120 // during execution. 1 if only the whole match
1121 // (subexpr 0) is used.
1122 // listid is global, so that it increases on recursive calls to
1123 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1124 // all the states.
1125 int nfa_listid;
1126 int nfa_alt_listid;
1127
1128#ifdef FEAT_SYN_HL
1129 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1130#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001131} regexec_T;
1132
1133static regexec_T rex;
1134static int rex_in_use = FALSE;
1135
Bram Moolenaar071d4272004-06-13 20:20:40 +00001136/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001137 * Return TRUE if character 'c' is included in 'iskeyword' option for
1138 * "reg_buf" buffer.
1139 */
1140 static int
1141reg_iswordc(int c)
1142{
1143 return vim_iswordc_buf(c, rex.reg_buf);
1144}
1145
1146/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001147 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1148 */
1149 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001150reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001151{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001152 // when looking behind for a match/no-match lnum is negative. But we
1153 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001154 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001155 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001156 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001157 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001158 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001159 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001160}
1161
Bram Moolenaar071d4272004-06-13 20:20:40 +00001162#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001163static char_u *reg_startzp[NSUBEXP]; // Workspace to mark beginning
1164static char_u *reg_endzp[NSUBEXP]; // and end of \z(...\) matches
1165static lpos_T reg_startzpos[NSUBEXP]; // idem, beginning pos
1166static lpos_T reg_endzpos[NSUBEXP]; // idem, end pos
Bram Moolenaar071d4272004-06-13 20:20:40 +00001167#endif
1168
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001169// TRUE if using multi-line regexp.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001170#define REG_MULTI (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001171
Bram Moolenaar071d4272004-06-13 20:20:40 +00001172#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001173/*
1174 * Create a new extmatch and mark it as referenced once.
1175 */
1176 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001177make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001178{
1179 reg_extmatch_T *em;
1180
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001181 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001182 if (em != NULL)
1183 em->refcnt = 1;
1184 return em;
1185}
1186
1187/*
1188 * Add a reference to an extmatch.
1189 */
1190 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001191ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001192{
1193 if (em != NULL)
1194 em->refcnt++;
1195 return em;
1196}
1197
1198/*
1199 * Remove a reference to an extmatch. If there are no references left, free
1200 * the info.
1201 */
1202 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001203unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001204{
1205 int i;
1206
1207 if (em != NULL && --em->refcnt <= 0)
1208 {
1209 for (i = 0; i < NSUBEXP; ++i)
1210 vim_free(em->matches[i]);
1211 vim_free(em);
1212 }
1213}
1214#endif
1215
1216/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001217 * Get class of previous character.
1218 */
1219 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001220reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001221{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001222 if (rex.input > rex.line)
1223 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001224 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001225 return -1;
1226}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001227
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001228/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001229 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001230 */
1231 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001232reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001233{
1234 pos_T top, bot;
1235 linenr_T lnum;
1236 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001237 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001238 int mode;
1239 colnr_T start, end;
1240 colnr_T start2, end2;
1241 colnr_T cols;
1242
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001243 // Check if the buffer is the current buffer.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001244 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001245 return FALSE;
1246
1247 if (VIsual_active)
1248 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001249 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001250 {
1251 top = VIsual;
1252 bot = wp->w_cursor;
1253 }
1254 else
1255 {
1256 top = wp->w_cursor;
1257 bot = VIsual;
1258 }
1259 mode = VIsual_mode;
1260 }
1261 else
1262 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001263 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001264 {
1265 top = curbuf->b_visual.vi_start;
1266 bot = curbuf->b_visual.vi_end;
1267 }
1268 else
1269 {
1270 top = curbuf->b_visual.vi_end;
1271 bot = curbuf->b_visual.vi_start;
1272 }
1273 mode = curbuf->b_visual.vi_mode;
1274 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001275 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001276 if (lnum < top.lnum || lnum > bot.lnum)
1277 return FALSE;
1278
1279 if (mode == 'v')
1280 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001281 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001282 if ((lnum == top.lnum && col < top.col)
1283 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1284 return FALSE;
1285 }
1286 else if (mode == Ctrl_V)
1287 {
1288 getvvcol(wp, &top, &start, NULL, &end);
1289 getvvcol(wp, &bot, &start2, NULL, &end2);
1290 if (start2 < start)
1291 start = start2;
1292 if (end2 > end)
1293 end = end2;
1294 if (top.col == MAXCOL || bot.col == MAXCOL)
1295 end = MAXCOL;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001296 cols = win_linetabsize(wp, rex.line, (colnr_T)(rex.input - rex.line));
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001297 if (cols < start || cols > end - (*p_sel == 'e'))
1298 return FALSE;
1299 }
1300 return TRUE;
1301}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001302
Bram Moolenaar071d4272004-06-13 20:20:40 +00001303/*
1304 * Check the regexp program for its magic number.
1305 * Return TRUE if it's wrong.
1306 */
1307 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001308prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001309{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001310 regprog_T *prog;
1311
Bram Moolenaar6100d022016-10-02 16:51:57 +02001312 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001313 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001314 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001315 return FALSE;
1316
1317 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001318 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01001319 emsg(_(e_re_corr));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001320 return TRUE;
1321 }
1322 return FALSE;
1323}
1324
1325/*
1326 * Cleanup the subexpressions, if this wasn't done yet.
1327 * This construction is used to clear the subexpressions only when they are
1328 * used (to increase speed).
1329 */
1330 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001331cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001332{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001333 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001334 {
1335 if (REG_MULTI)
1336 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001337 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001338 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1339 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001340 }
1341 else
1342 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001343 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1344 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001345 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001346 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001347 }
1348}
1349
1350#ifdef FEAT_SYN_HL
1351 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001352cleanup_zsubexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001353{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001354 if (rex.need_clear_zsubexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001355 {
1356 if (REG_MULTI)
1357 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001358 // Use 0xff to set lnum to -1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001359 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1360 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1361 }
1362 else
1363 {
1364 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
1365 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
1366 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001367 rex.need_clear_zsubexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001368 }
1369}
1370#endif
1371
1372/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001373 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001374 */
1375 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001376reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001377{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001378 rex.line = reg_getline(++rex.lnum);
1379 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001380 fast_breakcheck();
1381}
1382
1383/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001384 * Check whether a backreference matches.
1385 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001386 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1387 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001388 */
1389 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001390match_with_backref(
1391 linenr_T start_lnum,
1392 colnr_T start_col,
1393 linenr_T end_lnum,
1394 colnr_T end_col,
1395 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001396{
1397 linenr_T clnum = start_lnum;
1398 colnr_T ccol = start_col;
1399 int len;
1400 char_u *p;
1401
1402 if (bytelen != NULL)
1403 *bytelen = 0;
1404 for (;;)
1405 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001406 // Since getting one line may invalidate the other, need to make copy.
1407 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001408 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001409 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001410 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001411 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1412 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001413 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001414 vim_free(reg_tofree);
1415 reg_tofree = alloc(len);
1416 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001417 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001418 reg_tofreelen = len;
1419 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001420 STRCPY(reg_tofree, rex.line);
1421 rex.input = reg_tofree + (rex.input - rex.line);
1422 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001423 }
1424
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001425 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001426 p = reg_getline(clnum);
1427 if (clnum == end_lnum)
1428 len = end_col - ccol;
1429 else
1430 len = (int)STRLEN(p + ccol);
1431
Bram Moolenaar0270f382018-07-17 05:43:58 +02001432 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001433 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001434 if (bytelen != NULL)
1435 *bytelen += len;
1436 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001437 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001438 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001439 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001440
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001441 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001442 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001443 if (bytelen != NULL)
1444 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001445 ++clnum;
1446 ccol = 0;
1447 if (got_int)
1448 return RA_FAIL;
1449 }
1450
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001451 // found a match! Note that rex.line may now point to a copy of the line,
1452 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001453 return RA_MATCH;
1454}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001455
Bram Moolenaarfb031402014-09-09 17:18:49 +02001456/*
1457 * Used in a place where no * or \+ can follow.
1458 */
1459 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001460re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001461{
1462 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001463 {
1464 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1465 rc_did_emsg = TRUE;
1466 return FAIL;
1467 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001468 return OK;
1469}
1470
// One decomposition: up to three code points, unused slots are zero.
typedef struct
{
    int a, b, c;
} decomp_T;

// Decompositions for the code points 0xfb20 - 0xfb4f, indexed by
// (code point - 0xfb20).  An unused code point maps to itself.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2, 0, 0},              // 0xfb20       alt ayin
    {0x5d0, 0, 0},              // 0xfb21       alt alef
    {0x5d3, 0, 0},              // 0xfb22       alt dalet
    {0x5d4, 0, 0},              // 0xfb23       alt he
    {0x5db, 0, 0},              // 0xfb24       alt kaf
    {0x5dc, 0, 0},              // 0xfb25       alt lamed
    {0x5dd, 0, 0},              // 0xfb26       alt mem-sofit
    {0x5e8, 0, 0},              // 0xfb27       alt resh
    {0x5ea, 0, 0},              // 0xfb28       alt tav
    {'+', 0, 0},                // 0xfb29       alt plus
    {0x5e9, 0x5c1, 0},          // 0xfb2a       shin+shin-dot
    {0x5e9, 0x5c2, 0},          // 0xfb2b       shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},      // 0xfb2c       shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},      // 0xfb2d       shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},          // 0xfb2e       alef+patah
    {0x5d0, 0x5b8, 0},          // 0xfb2f       alef+qamats
    {0x5d0, 0x5b4, 0},          // 0xfb30       alef+hiriq
    {0x5d1, 0x5bc, 0},          // 0xfb31       bet+dagesh
    {0x5d2, 0x5bc, 0},          // 0xfb32       gimel+dagesh
    {0x5d3, 0x5bc, 0},          // 0xfb33       dalet+dagesh
    {0x5d4, 0x5bc, 0},          // 0xfb34       he+dagesh
    {0x5d5, 0x5bc, 0},          // 0xfb35       vav+dagesh
    {0x5d6, 0x5bc, 0},          // 0xfb36       zayin+dagesh
    {0xfb37, 0, 0},             // 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},          // 0xfb38       tet+dagesh
    {0x5d9, 0x5bc, 0},          // 0xfb39       yud+dagesh
    {0x5da, 0x5bc, 0},          // 0xfb3a       kaf sofit+dagesh
    {0x5db, 0x5bc, 0},          // 0xfb3b       kaf+dagesh
    {0x5dc, 0x5bc, 0},          // 0xfb3c       lamed+dagesh
    {0xfb3d, 0, 0},             // 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},          // 0xfb3e       mem+dagesh
    {0xfb3f, 0, 0},             // 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},          // 0xfb40       nun+dagesh
    {0x5e1, 0x5bc, 0},          // 0xfb41       samech+dagesh
    {0xfb42, 0, 0},             // 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},          // 0xfb43       pe sofit+dagesh
    {0x5e4, 0x5bc, 0},          // 0xfb44       pe+dagesh
    {0xfb45, 0, 0},             // 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},          // 0xfb46       tsadi+dagesh
    {0x5e7, 0x5bc, 0},          // 0xfb47       qof+dagesh
    {0x5e8, 0x5bc, 0},          // 0xfb48       resh+dagesh
    {0x5e9, 0x5bc, 0},          // 0xfb49       shin+dagesh
    {0x5ea, 0x5bc, 0},          // 0xfb4a       tav+dagesh
    {0x5d5, 0x5b9, 0},          // 0xfb4b       vav+holam
    {0x5d1, 0x5bf, 0},          // 0xfb4c       bet+rafe
    {0x5db, 0x5bf, 0},          // 0xfb4d       kaf+rafe
    {0x5e4, 0x5bf, 0},          // 0xfb4e       pe+rafe
    {0x5d0, 0x5dc, 0}           // 0xfb4f       alef-lamed
};

/*
 * Decompose character "c" into up to three characters "*c1", "*c2" and
 * "*c3".  Only 0xfb20 - 0xfb4f have an entry in the table; any other
 * character is returned unchanged in "*c1" with "*c2" and "*c3" set to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    decomp_T entry;

    if (c < 0xfb20 || c > 0xfb4f)
    {
        // Not in the table: the character is its own decomposition.
        *c1 = c;
        *c3 = 0;
        *c2 = 0;
        return;
    }

    entry = decomp_table[c - 0xfb20];
    *c1 = entry.a;
    *c2 = entry.b;
    *c3 = entry.c;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001548
1549/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001550 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001551 * Return 0 if strings match, non-zero otherwise.
1552 * Correct the length "*n" when composing characters are ignored.
1553 */
1554 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001555cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001556{
1557 int result;
1558
Bram Moolenaar6100d022016-10-02 16:51:57 +02001559 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001560 result = STRNCMP(s1, s2, *n);
1561 else
1562 result = MB_STRNICMP(s1, s2, *n);
1563
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001564 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001565 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001566 {
1567 char_u *str1, *str2;
1568 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001569 int junk;
1570
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001571 // we have to handle the strcmp ourselves, since it is necessary to
1572 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001573 str1 = s1;
1574 str2 = s2;
1575 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001576 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001577 {
1578 c1 = mb_ptr2char_adv(&str1);
1579 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001580
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001581 // Decompose the character if necessary, into 'base' characters.
1582 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001583 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001584 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001585 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001586 mb_decompose(c1, &c11, &junk, &junk);
1587 mb_decompose(c2, &c12, &junk, &junk);
1588 c1 = c11;
1589 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001590 if (c11 != c12
1591 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001592 break;
1593 }
1594 }
1595 result = c2 - c1;
1596 if (result == 0)
1597 *n = (int)(str2 - s2);
1598 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001599
1600 return result;
1601}
1602
1603/*
1604 * cstrchr: This function is used a lot for simple searches, keep it fast!
1605 */
1606 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001607cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001608{
1609 char_u *p;
1610 int cc;
1611
Bram Moolenaara12a1612019-01-24 16:39:02 +01001612 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001613 return vim_strchr(s, c);
1614
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001615 // tolower() and toupper() can be slow, comparing twice should be a lot
1616 // faster (esp. when using MS Visual C++!).
1617 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001618 if (enc_utf8 && c > 0x80)
1619 cc = utf_fold(c);
1620 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001621 if (MB_ISUPPER(c))
1622 cc = MB_TOLOWER(c);
1623 else if (MB_ISLOWER(c))
1624 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001625 else
1626 return vim_strchr(s, c);
1627
Bram Moolenaar071d4272004-06-13 20:20:40 +00001628 if (has_mbyte)
1629 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001630 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001631 {
1632 if (enc_utf8 && c > 0x80)
1633 {
1634 if (utf_fold(utf_ptr2char(p)) == cc)
1635 return p;
1636 }
1637 else if (*p == c || *p == cc)
1638 return p;
1639 }
1640 }
1641 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001642 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001643 for (p = s; *p != NUL; ++p)
1644 if (*p == c || *p == cc)
1645 return p;
1646
1647 return NULL;
1648}
1649
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001650////////////////////////////////////////////////////////////////
1651// regsub stuff //
1652////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001653
/*
 * We should define fptr_T as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * pointer to a function returning void.  This should work for all compilers.
 *
 * The functions of this type (do_upper() and friends) take a pointer where
 * the converted character is stored and the character to convert.  Their
 * result is cast back to fptr_T by the caller.
 */
typedef void (*(*fptr_T)(int *, int))();

// Forward declaration: vim_regsub() and vim_regsub_multi() call this function
// before it is defined.
static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001663
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001664 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001665do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001666{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001667 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001668
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001669 return (fptr_T)NULL;
1670}
1671
1672 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001673do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001674{
1675 *d = MB_TOUPPER(c);
1676
1677 return (fptr_T)do_Upper;
1678}
1679
1680 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001681do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001682{
1683 *d = MB_TOLOWER(c);
1684
1685 return (fptr_T)NULL;
1686}
1687
1688 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001689do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001690{
1691 *d = MB_TOLOWER(c);
1692
1693 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001694}
1695
1696/*
1697 * regtilde(): Replace tildes in the pattern by the old pattern.
1698 *
1699 * Short explanation of the tilde: It stands for the previous replacement
1700 * pattern. If that previous pattern also contains a ~ we should go back a
1701 * step further... But we insert the previous pattern into the current one
1702 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001703 * This still does not handle the case where "magic" changes. So require the
1704 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001705 *
1706 * The tildes are parsed once before the first call to vim_regsub().
1707 */
1708 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001709regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001710{
1711 char_u *newsub = source;
1712 char_u *tmpsub;
1713 char_u *p;
1714 int len;
1715 int prevlen;
1716
1717 for (p = newsub; *p; ++p)
1718 {
1719 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1720 {
1721 if (reg_prev_sub != NULL)
1722 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001723 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001724 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001725 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001726 if (tmpsub != NULL)
1727 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001728 // copy prefix
1729 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001730 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001731 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001732 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001733 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001734 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001735 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001736 STRCPY(tmpsub + len + prevlen, p + 1);
1737
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001738 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001739 vim_free(newsub);
1740 newsub = tmpsub;
1741 p = newsub + len + prevlen;
1742 }
1743 }
1744 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001745 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001746 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001747 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001748 --p;
1749 }
1750 else
1751 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001752 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001753 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001754 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001755 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001756 }
1757 }
1758
1759 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001760 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001761 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001762 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001763 reg_prev_sub = vim_strsave(newsub);
1764 return newsub;
1765}
1766
1767#ifdef FEAT_EVAL
static int can_f_submatch = FALSE;	// TRUE when submatch() can be used

// These pointers are used for reg_submatch().  Needed for when the
// substitution string is an expression that contains a call to substitute()
// and submatch().
// The fields are filled by vim_regsub_both() from the "rex" fields with the
// corresponding name before the expression is evaluated.
typedef struct {
    regmatch_T	*sm_match;	// copy of rex.reg_match
    regmmatch_T	*sm_mmatch;	// copy of rex.reg_mmatch
    linenr_T	sm_firstlnum;	// copy of rex.reg_firstlnum
    linenr_T	sm_maxline;	// copy of rex.reg_maxline
    int		sm_line_lbr;	// copy of rex.reg_line_lbr
} regsubmatch_T;

static regsubmatch_T rsm;  // can only be used when can_f_submatch is TRUE
Bram Moolenaar071d4272004-06-13 20:20:40 +00001782#endif
1783
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001784#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001785
1786/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001787 * Put the submatches in "argv[argskip]" which is a list passed into
1788 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001789 */
1790 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001791fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001792{
1793 listitem_T *li;
1794 int i;
1795 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001796 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001797
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001798 if (argcount == argskip)
1799 // called function doesn't take a submatches argument
1800 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001801
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001802 // Relies on sl_list to be the first item in staticList10_T.
1803 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001804
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001805 // There are always 10 list items in staticList10_T.
1806 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001807 for (i = 0; i < 10; ++i)
1808 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001809 s = rsm.sm_match->startp[i];
1810 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001811 s = NULL;
1812 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02001813 s = vim_strnsave(s, (int)(rsm.sm_match->endp[i] - s));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001814 li->li_tv.v_type = VAR_STRING;
1815 li->li_tv.vval.v_string = s;
1816 li = li->li_next;
1817 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001818 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001819}
1820
1821 static void
1822clear_submatch_list(staticList10_T *sl)
1823{
1824 int i;
1825
1826 for (i = 0; i < 10; ++i)
1827 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1828}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001829#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001830
Bram Moolenaar071d4272004-06-13 20:20:40 +00001831/*
1832 * vim_regsub() - perform substitutions after a vim_regexec() or
1833 * vim_regexec_multi() match.
1834 *
1835 * If "copy" is TRUE really copy into "dest".
1836 * If "copy" is FALSE nothing is copied, this is just to find out the length
1837 * of the result.
1838 *
1839 * If "backslash" is TRUE, a backslash will be removed later, need to double
1840 * them to keep them, and insert a backslash before a CR to avoid it being
1841 * replaced with a line break later.
1842 *
1843 * Note: The matched text must not change between the call of
1844 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1845 * references invalid!
1846 *
1847 * Returns the size of the replacement, including terminating NUL.
1848 */
1849 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001850vim_regsub(
1851 regmatch_T *rmp,
1852 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001853 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001854 char_u *dest,
1855 int copy,
1856 int magic,
1857 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001858{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001859 int result;
1860 regexec_T rex_save;
1861 int rex_in_use_save = rex_in_use;
1862
1863 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001864 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001865 rex_save = rex;
1866 rex_in_use = TRUE;
1867
1868 rex.reg_match = rmp;
1869 rex.reg_mmatch = NULL;
1870 rex.reg_maxline = 0;
1871 rex.reg_buf = curbuf;
1872 rex.reg_line_lbr = TRUE;
1873 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1874
1875 rex_in_use = rex_in_use_save;
1876 if (rex_in_use)
1877 rex = rex_save;
1878
1879 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001880}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001881
1882 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001883vim_regsub_multi(
1884 regmmatch_T *rmp,
1885 linenr_T lnum,
1886 char_u *source,
1887 char_u *dest,
1888 int copy,
1889 int magic,
1890 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001891{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001892 int result;
1893 regexec_T rex_save;
1894 int rex_in_use_save = rex_in_use;
1895
1896 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001897 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001898 rex_save = rex;
1899 rex_in_use = TRUE;
1900
1901 rex.reg_match = NULL;
1902 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001903 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001904 rex.reg_firstlnum = lnum;
1905 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1906 rex.reg_line_lbr = FALSE;
1907 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1908
1909 rex_in_use = rex_in_use_save;
1910 if (rex_in_use)
1911 rex = rex_save;
1912
1913 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001914}
1915
/*
 * Common code of vim_regsub() and vim_regsub_multi(): build the replacement
 * text for the match stored in "rex.reg_match" or "rex.reg_mmatch".
 *
 * "source" is the substitute string; it may only be NULL when "expr" is not
 * NULL.  When "expr" is not NULL, or "source" starts with "\=", the
 * replacement is the result of evaluating an expression (only with
 * FEAT_EVAL).  Otherwise "source" is scanned for "&" (or "\&" when "magic" is
 * FALSE), "\0" - "\9", "\u", "\U", "\l", "\L", "\e", "\E" and a few backslash
 * abbreviations.
 *
 * "dest" must not be NULL, also when "copy" is FALSE.
 * If "copy" is TRUE the result is written to "dest", otherwise only the
 * length is computed.
 * If "backslash" is TRUE backslashes are doubled and a backslash is inserted
 * before a CR, see vim_regsub().
 *
 * Returns the size of the replacement, including the terminating NUL.
 * Returns zero when an argument is NULL or prog_magic_wrong() fails.
 */
    static int
vim_regsub_both(
    char_u	*source,
    typval_T	*expr,
    char_u	*dest,
    int		copy,
    int		magic,
    int		backslash)
{
    char_u	*src;
    char_u	*dst;
    char_u	*s;
    int		c;
    int		cc;
    int		no = -1;	// number of back reference, -1 for none
    fptr_T	func_all = (fptr_T)NULL;    // conversion set by \U or \L
    fptr_T	func_one = (fptr_T)NULL;    // conversion set by \u or \l
    linenr_T	clnum = 0;	// init for GCC
    int		len = 0;	// init for GCC
#ifdef FEAT_EVAL
    // Result of the expression, kept from the call with "copy" FALSE to the
    // call with "copy" TRUE.
    static char_u   *eval_result = NULL;
#endif

    // Be paranoid...
    if ((source == NULL && expr == NULL) || dest == NULL)
    {
	emsg(_(e_null));
	return 0;
    }
    if (prog_magic_wrong())
	return 0;
    src = source;
    dst = dest;

    /*
     * When the substitute part starts with "\=" evaluate it as an expression.
     */
    if (expr != NULL || (source[0] == '\\' && source[1] == '='))
    {
#ifdef FEAT_EVAL
	// To make sure that the length doesn't change between checking the
	// length and copying the string, and to speed up things, the
	// resulting string is saved from the call with "copy" == FALSE to the
	// call with "copy" == TRUE.
	if (copy)
	{
	    if (eval_result != NULL)
	    {
		STRCPY(dest, eval_result);
		dst += STRLEN(eval_result);
		VIM_CLEAR(eval_result);
	    }
	}
	else
	{
	    int		    prev_can_f_submatch = can_f_submatch;
	    regsubmatch_T   rsm_save;

	    vim_free(eval_result);

	    // The expression may contain substitute(), which calls us
	    // recursively.  Make sure submatch() gets the text from the first
	    // level.
	    if (can_f_submatch)
		rsm_save = rsm;
	    can_f_submatch = TRUE;
	    rsm.sm_match = rex.reg_match;
	    rsm.sm_mmatch = rex.reg_mmatch;
	    rsm.sm_firstlnum = rex.reg_firstlnum;
	    rsm.sm_maxline = rex.reg_maxline;
	    rsm.sm_line_lbr = rex.reg_line_lbr;

	    if (expr != NULL)
	    {
		typval_T	argv[2];
		char_u		buf[NUMBUFLEN];
		typval_T	rettv;
		staticList10_T	matchList;
		funcexe_T	funcexe;

		// Call the function or partial in "expr".  The list argument
		// is only filled by fill_submatch_list() when it is called
		// back; lv_len is set to zero here and checked below to find
		// out whether that happened.
		rettv.v_type = VAR_STRING;
		rettv.vval.v_string = NULL;
		argv[0].v_type = VAR_LIST;
		argv[0].vval.v_list = &matchList.sl_list;
		matchList.sl_list.lv_len = 0;
		vim_memset(&funcexe, 0, sizeof(funcexe));
		funcexe.argv_func = fill_submatch_list;
		funcexe.evaluate = TRUE;
		if (expr->v_type == VAR_FUNC)
		{
		    s = expr->vval.v_string;
		    call_func(s, -1, &rettv, 1, argv, &funcexe);
		}
		else if (expr->v_type == VAR_PARTIAL)
		{
		    partial_T   *partial = expr->vval.v_partial;

		    s = partial_name(partial);
		    funcexe.partial = partial;
		    call_func(s, -1, &rettv, 1, argv, &funcexe);
		}
		if (matchList.sl_list.lv_len > 0)
		    // fill_submatch_list() was called
		    clear_submatch_list(&matchList);

		if (rettv.v_type == VAR_UNKNOWN)
		    // something failed, no need to report another error
		    eval_result = NULL;
		else
		{
		    // "buf" is on the stack, keep a copy of the string.
		    eval_result = tv_get_string_buf_chk(&rettv, buf);
		    if (eval_result != NULL)
			eval_result = vim_strsave(eval_result);
		}
		clear_tv(&rettv);
	    }
	    else
		// skip over the "\=" at the start of "source"
		eval_result = eval_to_string(source + 2, NULL, TRUE);

	    if (eval_result != NULL)
	    {
		int had_backslash = FALSE;

		for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
		{
		    // Change NL to CR, so that it becomes a line break,
		    // unless called from vim_regexec_nl().
		    // Skip over a backslashed character.
		    if (*s == NL && !rsm.sm_line_lbr)
			*s = CAR;
		    else if (*s == '\\' && s[1] != NUL)
		    {
			++s;
			/* Change NL to CR here too, so that this works:
			 * :s/abc\\\ndef/\="aaa\\\nbbb"/  on text:
			 *   abc\
			 *   def
			 * Not when called from vim_regexec_nl().
			 */
			if (*s == NL && !rsm.sm_line_lbr)
			    *s = CAR;
			had_backslash = TRUE;
		    }
		}
		if (had_backslash && backslash)
		{
		    // Backslashes will be consumed, need to double them.
		    // When this fails the unescaped result is kept.
		    s = vim_strsave_escaped(eval_result, (char_u *)"\\");
		    if (s != NULL)
		    {
			vim_free(eval_result);
			eval_result = s;
		    }
		}

		dst += STRLEN(eval_result);
	    }

	    // Restore the submatch() state of the caller.
	    can_f_submatch = prev_can_f_submatch;
	    if (can_f_submatch)
		rsm = rsm_save;
	}
#endif
    }
    else
      while ((c = *src++) != NUL)
      {
	// First find out if this is a back reference ("no" is set to its
	// number) or a case conversion item.
	if (c == '&' && magic)
	    no = 0;
	else if (c == '\\' && *src != NUL)
	{
	    if (*src == '&' && !magic)
	    {
		++src;
		no = 0;
	    }
	    else if ('0' <= *src && *src <= '9')
	    {
		no = *src++ - '0';
	    }
	    else if (vim_strchr((char_u *)"uUlLeE", *src))
	    {
		// Only remember the conversion function, it is applied to
		// the characters that follow.
		switch (*src++)
		{
		case 'u':   func_one = (fptr_T)do_upper;
			    continue;
		case 'U':   func_all = (fptr_T)do_Upper;
			    continue;
		case 'l':   func_one = (fptr_T)do_lower;
			    continue;
		case 'L':   func_all = (fptr_T)do_Lower;
			    continue;
		case 'e':
		case 'E':   func_one = func_all = (fptr_T)NULL;
			    continue;
		}
	    }
	}
	if (no < 0)	      // Ordinary character.
	{
	    if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
	    {
		// Copy a special key as-is.
		if (copy)
		{
		    *dst++ = c;
		    *dst++ = *src++;
		    *dst++ = *src++;
		}
		else
		{
		    dst += 3;
		    src += 2;
		}
		continue;
	    }

	    if (c == '\\' && *src != NUL)
	    {
		// Check for abbreviations -- webb
		switch (*src)
		{
		    case 'r':	c = CAR;	++src;	break;
		    case 'n':	c = NL;		++src;	break;
		    case 't':	c = TAB;	++src;	break;
		 // Oh no!  \e already has meaning in subst pat :-(
		 // case 'e':   c = ESC;	++src;	break;
		    case 'b':	c = Ctrl_H;	++src;	break;

		    // If "backslash" is TRUE the backslash will be removed
		    // later.  Used to insert a literal CR.
		    default:	if (backslash)
				{
				    if (copy)
					*dst = '\\';
				    ++dst;
				}
				c = *src++;
		}
	    }
	    else if (has_mbyte)
		// "src" was already advanced past the first byte
		c = mb_ptr2char(src - 1);

	    // Write to buffer, if copy is set.
	    // A conversion function returns the function to use for the
	    // next character: NULL for a one-time conversion.
	    if (func_one != (fptr_T)NULL)
		// Turbo C complains without the typecast
		func_one = (fptr_T)(func_one(&cc, c));
	    else if (func_all != (fptr_T)NULL)
		// Turbo C complains without the typecast
		func_all = (fptr_T)(func_all(&cc, c));
	    else // just copy
		cc = c;

	    if (has_mbyte)
	    {
		int totlen = mb_ptr2len(src - 1);

		// "dst" is advanced by one less than the length here, the
		// "dst++" below adds the last one.
		if (copy)
		    mb_char2bytes(cc, dst);
		dst += mb_char2len(cc) - 1;
		if (enc_utf8)
		{
		    int clen = utf_ptr2len(src - 1);

		    // If the character length is shorter than "totlen", there
		    // are composing characters; copy them as-is.
		    if (clen < totlen)
		    {
			if (copy)
			    mch_memmove(dst + 1, src - 1 + clen,
						     (size_t)(totlen - clen));
			dst += totlen - clen;
		    }
		}
		src += totlen - 1;
	    }
	    else if (copy)
		*dst = cc;
	    dst++;
	}
	else
	{
	    // Back reference "no": find the start "s" and length "len" of
	    // the matched text; "s" is NULL when the submatch is not set.
	    if (REG_MULTI)
	    {
		clnum = rex.reg_mmatch->startpos[no].lnum;
		if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
		    s = NULL;
		else
		{
		    s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
		    if (rex.reg_mmatch->endpos[no].lnum == clnum)
			len = rex.reg_mmatch->endpos[no].col
					    - rex.reg_mmatch->startpos[no].col;
		    else
			// match continues in a following line
			len = (int)STRLEN(s);
		}
	    }
	    else
	    {
		s = rex.reg_match->startp[no];
		if (rex.reg_match->endp[no] == NULL)
		    s = NULL;
		else
		    len = (int)(rex.reg_match->endp[no] - s);
	    }
	    if (s != NULL)
	    {
		for (;;)
		{
		    if (len == 0)
		    {
			if (REG_MULTI)
			{
			    // Done when this was the last line of the
			    // submatch, otherwise add a CR and go on with
			    // the next line.
			    if (rex.reg_mmatch->endpos[no].lnum == clnum)
				break;
			    if (copy)
				*dst = CAR;
			    ++dst;
			    s = reg_getline(++clnum);
			    if (rex.reg_mmatch->endpos[no].lnum == clnum)
				len = rex.reg_mmatch->endpos[no].col;
			    else
				len = (int)STRLEN(s);
			}
			else
			    break;
		    }
		    else if (*s == NUL) // we hit NUL.
		    {
			// The text ends before "len" bytes were used.  Only
			// give the error in the copy pass.  Note that "dest"
			// is not NUL terminated on this path.
			if (copy)
			    emsg(_(e_re_damg));
			goto exit;
		    }
		    else
		    {
			if (backslash && (*s == CAR || *s == '\\'))
			{
			    /*
			     * Insert a backslash in front of a CR, otherwise
			     * it will be replaced by a line break.
			     * Number of backslashes will be halved later,
			     * double them here.
			     */
			    if (copy)
			    {
				dst[0] = '\\';
				dst[1] = *s;
			    }
			    dst += 2;
			}
			else
			{
			    if (has_mbyte)
				c = mb_ptr2char(s);
			    else
				c = *s;

			    if (func_one != (fptr_T)NULL)
				// Turbo C complains without the typecast
				func_one = (fptr_T)(func_one(&cc, c));
			    else if (func_all != (fptr_T)NULL)
				// Turbo C complains without the typecast
				func_all = (fptr_T)(func_all(&cc, c));
			    else // just copy
				cc = c;

			    if (has_mbyte)
			    {
				int l;

				// Copy composing characters separately, one
				// at a time.
				if (enc_utf8)
				    l = utf_ptr2len(s) - 1;
				else
				    l = mb_ptr2len(s) - 1;

				// The last byte is handled by the "++s",
				// "--len" and "dst++" below.
				s += l;
				len -= l;
				if (copy)
				    mb_char2bytes(cc, dst);
				dst += mb_char2len(cc) - 1;
			    }
			    else if (copy)
				*dst = cc;
			    dst++;
			}

			++s;
			--len;
		    }
		}
	    }
	    // back reference done, following characters are ordinary again
	    no = -1;
	}
      }
    if (copy)
	*dst = NUL;

exit:
    // "+ 1" for the terminating NUL
    return (int)((dst - dest) + 1);
}
2318
2319#ifdef FEAT_EVAL
2320/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002321 * Call reg_getline() with the line numbers from the submatch. If a
2322 * substitute() was used the reg_maxline and other values have been
2323 * overwritten.
2324 */
2325 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002326reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002327{
2328 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002329 linenr_T save_first = rex.reg_firstlnum;
2330 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002331
Bram Moolenaar6100d022016-10-02 16:51:57 +02002332 rex.reg_firstlnum = rsm.sm_firstlnum;
2333 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002334
2335 s = reg_getline(lnum);
2336
Bram Moolenaar6100d022016-10-02 16:51:57 +02002337 rex.reg_firstlnum = save_first;
2338 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002339 return s;
2340}
2341
2342/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002343 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002344 * allocated memory.
2345 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2346 */
2347 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002348reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002349{
2350 char_u *retval = NULL;
2351 char_u *s;
2352 int len;
2353 int round;
2354 linenr_T lnum;
2355
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002356 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002357 return NULL;
2358
Bram Moolenaar6100d022016-10-02 16:51:57 +02002359 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002360 {
2361 /*
2362 * First round: compute the length and allocate memory.
2363 * Second round: copy the text.
2364 */
2365 for (round = 1; round <= 2; ++round)
2366 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002367 lnum = rsm.sm_mmatch->startpos[no].lnum;
2368 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002369 return NULL;
2370
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002371 s = reg_getline_submatch(lnum);
2372 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002373 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002374 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002375 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002376 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002377 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002378 len = rsm.sm_mmatch->endpos[no].col
2379 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002380 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002381 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002382 ++len;
2383 }
2384 else
2385 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002386 // Multiple lines: take start line from start col, middle
2387 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002388 len = (int)STRLEN(s);
2389 if (round == 2)
2390 {
2391 STRCPY(retval, s);
2392 retval[len] = '\n';
2393 }
2394 ++len;
2395 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002396 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002397 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002398 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002399 if (round == 2)
2400 STRCPY(retval + len, s);
2401 len += (int)STRLEN(s);
2402 if (round == 2)
2403 retval[len] = '\n';
2404 ++len;
2405 }
2406 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002407 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002408 rsm.sm_mmatch->endpos[no].col);
2409 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002410 if (round == 2)
2411 retval[len] = NUL;
2412 ++len;
2413 }
2414
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002415 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002416 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002417 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002418 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002419 return NULL;
2420 }
2421 }
2422 }
2423 else
2424 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002425 s = rsm.sm_match->startp[no];
2426 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002427 retval = NULL;
2428 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002429 retval = vim_strnsave(s, (int)(rsm.sm_match->endp[no] - s));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002430 }
2431
2432 return retval;
2433}
Bram Moolenaar41571762014-04-02 19:00:58 +02002434
2435/*
2436 * Used for the submatch() function with the optional non-zero argument: get
2437 * the list of strings from the n'th submatch in allocated memory with NULs
2438 * represented in NLs.
2439 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2440 * command, for a non-existing submatch and for any error.
2441 */
2442 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002443reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002444{
2445 char_u *s;
2446 linenr_T slnum;
2447 linenr_T elnum;
2448 colnr_T scol;
2449 colnr_T ecol;
2450 int i;
2451 list_T *list;
2452 int error = FALSE;
2453
2454 if (!can_f_submatch || no < 0)
2455 return NULL;
2456
Bram Moolenaar6100d022016-10-02 16:51:57 +02002457 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002458 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002459 slnum = rsm.sm_mmatch->startpos[no].lnum;
2460 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002461 if (slnum < 0 || elnum < 0)
2462 return NULL;
2463
Bram Moolenaar6100d022016-10-02 16:51:57 +02002464 scol = rsm.sm_mmatch->startpos[no].col;
2465 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002466
2467 list = list_alloc();
2468 if (list == NULL)
2469 return NULL;
2470
2471 s = reg_getline_submatch(slnum) + scol;
2472 if (slnum == elnum)
2473 {
2474 if (list_append_string(list, s, ecol - scol) == FAIL)
2475 error = TRUE;
2476 }
2477 else
2478 {
2479 if (list_append_string(list, s, -1) == FAIL)
2480 error = TRUE;
2481 for (i = 1; i < elnum - slnum; i++)
2482 {
2483 s = reg_getline_submatch(slnum + i);
2484 if (list_append_string(list, s, -1) == FAIL)
2485 error = TRUE;
2486 }
2487 s = reg_getline_submatch(elnum);
2488 if (list_append_string(list, s, ecol) == FAIL)
2489 error = TRUE;
2490 }
2491 }
2492 else
2493 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002494 s = rsm.sm_match->startp[no];
2495 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002496 return NULL;
2497 list = list_alloc();
2498 if (list == NULL)
2499 return NULL;
2500 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002501 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002502 error = TRUE;
2503 }
2504
2505 if (error)
2506 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002507 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002508 return NULL;
2509 }
2510 return list;
2511}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002512#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002513
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002514#include "regexp_bt.c"
2515
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002516static regengine_T bt_regengine =
2517{
2518 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002519 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002520 bt_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002521 bt_regexec_multi,
2522 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002523};
2524
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002525#include "regexp_nfa.c"
2526
2527static regengine_T nfa_regengine =
2528{
2529 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002530 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002531 nfa_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002532 nfa_regexec_multi,
2533 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002534};
2535
// The regexp engine to use for the pattern being compiled; vim_regcomp() sets
// it from "p_re" or from a "\%#=" prefix in the pattern.
// The numbers must match with 'regexpengine'.
static int regexp_engine = 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002539
#ifdef DEBUG
// Names of the regexp engines for the debug message in vim_regcomp(), indexed
// by the engine number.
static char_u regname[][30] = {
                    "AUTOMATIC Regexp Engine",
                    "BACKTRACKING Regexp Engine",
                    "NFA Regexp Engine"
                            };
#endif
2547
2548/*
2549 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002550 * Returns the program in allocated memory.
2551 * Use vim_regfree() to free the memory.
2552 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002553 */
2554 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002555vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002556{
2557 regprog_T *prog = NULL;
2558 char_u *expr = expr_arg;
Bram Moolenaarcd625122019-02-22 17:29:43 +01002559 int save_called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002560
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002561 regexp_engine = p_re;
2562
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002563 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002564 if (STRNCMP(expr, "\\%#=", 4) == 0)
2565 {
2566 int newengine = expr[4] - '0';
2567
2568 if (newengine == AUTOMATIC_ENGINE
2569 || newengine == BACKTRACKING_ENGINE
2570 || newengine == NFA_ENGINE)
2571 {
2572 regexp_engine = expr[4] - '0';
2573 expr += 5;
2574#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002575 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002576 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002577#endif
2578 }
2579 else
2580 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002581 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002582 regexp_engine = AUTOMATIC_ENGINE;
2583 }
2584 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002585#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002586 bt_regengine.expr = expr;
2587 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002588#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002589 // reg_iswordc() uses rex.reg_buf
2590 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002591
2592 /*
2593 * First try the NFA engine, unless backtracking was requested.
2594 */
Bram Moolenaarcd625122019-02-22 17:29:43 +01002595 save_called_emsg = called_emsg;
2596 called_emsg = FALSE;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002597 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002598 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002599 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002600 else
2601 prog = bt_regengine.regcomp(expr, re_flags);
2602
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002603 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002604 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002605 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002606#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002607 if (regexp_engine != BACKTRACKING_ENGINE) // debugging log for NFA
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002608 {
2609 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002610 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002611 if (f)
2612 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002613 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002614 fclose(f);
2615 }
2616 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002617 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002618 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002619 }
2620#endif
2621 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002622 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002623 * The NFA engine also fails for patterns that it can't handle well
2624 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002625 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002626 */
Bram Moolenaarcd625122019-02-22 17:29:43 +01002627 if (regexp_engine == AUTOMATIC_ENGINE && !called_emsg)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002628 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002629 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002630 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002631 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002632 }
Bram Moolenaarcd625122019-02-22 17:29:43 +01002633 called_emsg |= save_called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002634
Bram Moolenaarfda37292014-11-05 14:27:36 +01002635 if (prog != NULL)
2636 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002637 // Store the info needed to call regcomp() again when the engine turns
2638 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002639 prog->re_engine = regexp_engine;
2640 prog->re_flags = re_flags;
2641 }
2642
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002643 return prog;
2644}
2645
2646/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002647 * Free a compiled regexp program, returned by vim_regcomp().
2648 */
2649 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002650vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002651{
2652 if (prog != NULL)
2653 prog->engine->regfree(prog);
2654}
2655
#if defined(EXITFREE) || defined(PROTO)
/*
 * Release the memory kept by the regexp code: the "regstack" and "backpos"
 * growarrays and the "reg_tofree" and "reg_prev_sub" allocations.
 */
    void
free_regexp_stuff(void)
{
    // allocated strings
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);

    // growarrays
    ga_clear(&regstack);
    ga_clear(&backpos);
}
#endif
2666
#ifdef FEAT_EVAL
/*
 * When "p_verbose" is positive: give a message that the backtracking engine
 * is going to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
        return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2680
#if (defined(FEAT_X11) && (defined(FEAT_TITLE) || defined(FEAT_XCLIPBOARD))) \
        || defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog": non-zero while "prog" is being
 * executed.  "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    return prog->re_in_use;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002692
2693/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002694 * Match a regexp against a string.
2695 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002696 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002697 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002698 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002699 *
2700 * Return TRUE if there is a match, FALSE if not.
2701 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002702 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002703vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002704 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002705 char_u *line, // string to match against
2706 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002707 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002708{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002709 int result;
2710 regexec_T rex_save;
2711 int rex_in_use_save = rex_in_use;
2712
Bram Moolenaar0270f382018-07-17 05:43:58 +02002713 // Cannot use the same prog recursively, it contains state.
2714 if (rmp->regprog->re_in_use)
2715 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002716 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002717 return FALSE;
2718 }
2719 rmp->regprog->re_in_use = TRUE;
2720
Bram Moolenaar6100d022016-10-02 16:51:57 +02002721 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002722 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002723 rex_save = rex;
2724 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002725
Bram Moolenaar6100d022016-10-02 16:51:57 +02002726 rex.reg_startp = NULL;
2727 rex.reg_endp = NULL;
2728 rex.reg_startpos = NULL;
2729 rex.reg_endpos = NULL;
2730
2731 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002732 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002733
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002734 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002735 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2736 && result == NFA_TOO_EXPENSIVE)
2737 {
2738 int save_p_re = p_re;
2739 int re_flags = rmp->regprog->re_flags;
2740 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2741
2742 p_re = BACKTRACKING_ENGINE;
2743 vim_regfree(rmp->regprog);
2744 if (pat != NULL)
2745 {
2746#ifdef FEAT_EVAL
2747 report_re_switch(pat);
2748#endif
2749 rmp->regprog = vim_regcomp(pat, re_flags);
2750 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002751 {
2752 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002753 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002754 rmp->regprog->re_in_use = FALSE;
2755 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002756 vim_free(pat);
2757 }
2758
2759 p_re = save_p_re;
2760 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002761
2762 rex_in_use = rex_in_use_save;
2763 if (rex_in_use)
2764 rex = rex_save;
2765
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002766 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002767}
2768
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002769/*
2770 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002771 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002772 */
2773 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002774vim_regexec_prog(
2775 regprog_T **prog,
2776 int ignore_case,
2777 char_u *line,
2778 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002779{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002780 int r;
2781 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002782
2783 regmatch.regprog = *prog;
2784 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002785 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002786 *prog = regmatch.regprog;
2787 return r;
2788}
2789
2790/*
2791 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002792 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002793 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002794 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002795vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002796{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002797 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002798}
2799
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002800/*
2801 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002802 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002803 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002804 */
2805 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002806vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002807{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002808 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002809}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002810
2811/*
2812 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002813 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2814 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002815 * Uses curbuf for line count and 'iskeyword'.
2816 *
2817 * Return zero if there is no match. Return number of lines contained in the
2818 * match otherwise.
2819 */
2820 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002821vim_regexec_multi(
2822 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002823 win_T *win, // window in which to search or NULL
2824 buf_T *buf, // buffer in which to search
2825 linenr_T lnum, // nr of line to start looking for match
2826 colnr_T col, // column to start looking for match
2827 proftime_T *tm, // timeout limit or NULL
2828 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002829{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002830 int result;
2831 regexec_T rex_save;
2832 int rex_in_use_save = rex_in_use;
2833
Bram Moolenaar0270f382018-07-17 05:43:58 +02002834 // Cannot use the same prog recursively, it contains state.
2835 if (rmp->regprog->re_in_use)
2836 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002837 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002838 return FALSE;
2839 }
2840 rmp->regprog->re_in_use = TRUE;
2841
Bram Moolenaar6100d022016-10-02 16:51:57 +02002842 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002843 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002844 rex_save = rex;
2845 rex_in_use = TRUE;
2846
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002847 result = rmp->regprog->engine->regexec_multi(
2848 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002849 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002850
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002851 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002852 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2853 && result == NFA_TOO_EXPENSIVE)
2854 {
2855 int save_p_re = p_re;
2856 int re_flags = rmp->regprog->re_flags;
2857 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2858
2859 p_re = BACKTRACKING_ENGINE;
2860 vim_regfree(rmp->regprog);
2861 if (pat != NULL)
2862 {
2863#ifdef FEAT_EVAL
2864 report_re_switch(pat);
2865#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002866#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002867 // checking for \z misuse was already done when compiling for NFA,
2868 // allow all here
2869 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002870#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002871 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002872#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002873 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002874#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002875
Bram Moolenaarfda37292014-11-05 14:27:36 +01002876 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002877 {
2878 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002879 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002880 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002881 rmp->regprog->re_in_use = FALSE;
2882 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002883 vim_free(pat);
2884 }
2885 p_re = save_p_re;
2886 }
2887
Bram Moolenaar6100d022016-10-02 16:51:57 +02002888 rex_in_use = rex_in_use_save;
2889 if (rex_in_use)
2890 rex = rex_save;
2891
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002892 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002893}