blob: 40b21d3f41807ca9bd97538818849b11f702b7c9 [file] [log] [blame]
Bram Moolenaaredf3f972016-08-29 22:49:24 +02001/* vi:set ts=8 sts=4 sw=4 noet:
Bram Moolenaar071d4272004-06-13 20:20:40 +00002 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
Bram Moolenaar071d4272004-06-13 20:20:40 +00004 */
5
Bram Moolenaarc2d09c92019-04-25 20:07:51 +02006// By default: do not create debugging logs or files related to regular
7// expressions, even when compiling with -DDEBUG.
8// Uncomment the second line to get the regexp debugging.
9#undef DEBUG
10// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020014#ifdef DEBUG
Bram Moolenaar63d9e732019-12-05 21:10:38 +010015// show/save debugging data when BT engine is used
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020016# define BT_REGEXP_DUMP
Bram Moolenaar63d9e732019-12-05 21:10:38 +010017// save the debugging data to a file instead of displaying it
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020018# define BT_REGEXP_LOG
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +020019# define BT_REGEXP_DEBUG_LOG
20# define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020021#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning, they don't match literally.
 * A Magic character is stored as a negative number, which keeps it apart
 * from literal characters (possibly multi-byte).  Only ASCII characters can
 * be Magic.
 */
#define Magic(x)    ((int)(x) - 256)
#define un_Magic(x) ((x) + 256)
#define is_Magic(x) ((x) < 0)

/*
 * Return "x" as a literal character: a Magic value is converted back with
 * un_Magic(), anything else is returned unchanged.
 */
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}

/*
 * Flip the magicness of "x": a Magic value becomes the literal character,
 * a literal character becomes its Magic value.
 */
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47
48/*
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020049 * The first byte of the BT regexp internal "program" is actually this magic
Bram Moolenaar071d4272004-06-13 20:20:40 +000050 * number; the start node begins in the second byte. It's used to catch the
51 * most severe mutilation of the program by the caller.
52 */
53
54#define REGMAGIC 0234
55
56/*
Bram Moolenaar071d4272004-06-13 20:20:40 +000057 * Utility definitions.
58 */
59#define UCHARAT(p) ((int)*(char_u *)(p))
60
Bram Moolenaar63d9e732019-12-05 21:10:38 +010061// Used for an error (down from) vim_regcomp(): give the error message, set
62// rc_did_emsg and return NULL
Bram Moolenaarf9e3e092019-01-13 23:38:42 +010063#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
64#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
65#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
66#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
Bram Moolenaar1be45b22019-01-14 22:46:15 +010067#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
Bram Moolenaarf9e3e092019-01-13 23:38:42 +010068#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
Bram Moolenaarac78dd42022-01-02 19:25:26 +000069#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_(e_invalid_item_in_str_brackets), reg_magic == MAGIC_ALL)
Bram Moolenaar071d4272004-06-13 20:20:40 +000070
Bram Moolenaar95f09602016-11-10 20:01:45 +010071
Bram Moolenaar071d4272004-06-13 20:20:40 +000072#define MAX_LIMIT (32767L << 16L)
73
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020074static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
Bram Moolenaar966e58e2017-06-05 16:54:08 +020075static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
76static char_u e_large_class[] = N_("E945: Range too large in character class");
Bram Moolenaar0270f382018-07-17 05:43:58 +020077static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
78
Bram Moolenaar071d4272004-06-13 20:20:40 +000079#define NOT_MULTI 0
80#define MULTI_ONE 1
81#define MULTI_MULT 2
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020082
83// return values for regmatch()
Bram Moolenaar63d9e732019-12-05 21:10:38 +010084#define RA_FAIL 1 // something failed, abort
85#define RA_CONT 2 // continue in inner loop
86#define RA_BREAK 3 // break inner loop
87#define RA_MATCH 4 // successful match
88#define RA_NOMATCH 5 // didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020089
Bram Moolenaar071d4272004-06-13 20:20:40 +000090/*
91 * Return NOT_MULTI if c is not a "multi" operator.
92 * Return MULTI_ONE if c is a single "multi" operator.
93 * Return MULTI_MULT if c is a multi "multi" operator.
94 */
95 static int
Bram Moolenaar05540972016-01-30 20:31:25 +010096re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +000097{
98 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
99 return MULTI_ONE;
100 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
101 return MULTI_MULT;
102 return NOT_MULTI;
103}
104
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000105static char_u *reg_prev_sub = NULL;
106
Bram Moolenaar071d4272004-06-13 20:20:40 +0000107/*
108 * REGEXP_INRANGE contains all characters which are always special in a []
109 * range after '\'.
110 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
111 * These are:
112 * \n - New line (NL).
113 * \r - Carriage Return (CR).
114 * \t - Tab (TAB).
115 * \e - Escape (ESC).
116 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000117 * \d - Character code in decimal, eg \d123
118 * \o - Character code in octal, eg \o80
119 * \x - Character code in hex, eg \x4a
120 * \u - Multibyte character code, eg \u20ac
121 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000122 */
123static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000124static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000125
Bram Moolenaar071d4272004-06-13 20:20:40 +0000126/*
127 * Translate '\x' to its control character, except "\n", which is Magic.
128 */
129 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100130backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000131{
132 switch (c)
133 {
134 case 'r': return CAR;
135 case 't': return TAB;
136 case 'e': return ESC;
137 case 'b': return BS;
138 }
139 return c;
140}
141
142/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000143 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000144 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
145 * recognized. Otherwise "pp" is advanced to after the item.
146 */
147 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100148get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000149{
150 static const char *(class_names[]) =
151 {
152 "alnum:]",
153#define CLASS_ALNUM 0
154 "alpha:]",
155#define CLASS_ALPHA 1
156 "blank:]",
157#define CLASS_BLANK 2
158 "cntrl:]",
159#define CLASS_CNTRL 3
160 "digit:]",
161#define CLASS_DIGIT 4
162 "graph:]",
163#define CLASS_GRAPH 5
164 "lower:]",
165#define CLASS_LOWER 6
166 "print:]",
167#define CLASS_PRINT 7
168 "punct:]",
169#define CLASS_PUNCT 8
170 "space:]",
171#define CLASS_SPACE 9
172 "upper:]",
173#define CLASS_UPPER 10
174 "xdigit:]",
175#define CLASS_XDIGIT 11
176 "tab:]",
177#define CLASS_TAB 12
178 "return:]",
179#define CLASS_RETURN 13
180 "backspace:]",
181#define CLASS_BACKSPACE 14
182 "escape:]",
183#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100184 "ident:]",
185#define CLASS_IDENT 16
186 "keyword:]",
187#define CLASS_KEYWORD 17
188 "fname:]",
189#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000190 };
191#define CLASS_NONE 99
192 int i;
193
194 if ((*pp)[1] == ':')
195 {
K.Takataeeec2542021-06-02 13:28:16 +0200196 for (i = 0; i < (int)ARRAY_LENGTH(class_names); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000197 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
198 {
199 *pp += STRLEN(class_names[i]) + 2;
200 return i;
201 }
202 }
203 return CLASS_NONE;
204}
205
206/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000207 * Specific version of character class functions.
208 * Using a table to keep this fast.
209 */
210static short class_tab[256];
211
212#define RI_DIGIT 0x01
213#define RI_HEX 0x02
214#define RI_OCTAL 0x04
215#define RI_WORD 0x08
216#define RI_HEAD 0x10
217#define RI_ALPHA 0x20
218#define RI_LOWER 0x40
219#define RI_UPPER 0x80
220#define RI_WHITE 0x100
221
222 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100223init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000224{
225 int i;
226 static int done = FALSE;
227
228 if (done)
229 return;
230
231 for (i = 0; i < 256; ++i)
232 {
233 if (i >= '0' && i <= '7')
234 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
235 else if (i >= '8' && i <= '9')
236 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
237 else if (i >= 'a' && i <= 'f')
238 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
239#ifdef EBCDIC
240 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
241 || (i >= 's' && i <= 'z'))
242#else
243 else if (i >= 'g' && i <= 'z')
244#endif
245 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
246 else if (i >= 'A' && i <= 'F')
247 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
248#ifdef EBCDIC
249 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
250 || (i >= 'S' && i <= 'Z'))
251#else
252 else if (i >= 'G' && i <= 'Z')
253#endif
254 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
255 else if (i == '_')
256 class_tab[i] = RI_WORD + RI_HEAD;
257 else
258 class_tab[i] = 0;
259 }
260 class_tab[' '] |= RI_WHITE;
261 class_tab['\t'] |= RI_WHITE;
262 done = TRUE;
263}
264
/*
 * Test whether character "c" has the given RI_ flag in class_tab[].
 * Only 0 .. 0xff index the table.  Anything outside that range yields FALSE:
 * multi-byte characters above 0xff, and negative values such as Magic
 * characters, which would otherwise index before the start of the table.
 * The argument is parenthesized so that an expression can be passed safely.
 * Note that "c" is evaluated more than once, so it must not have side
 * effects.
 */
#define ri_digit(c) ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
#define ri_hex(c)   ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEX))
#define ri_octal(c) ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
#define ri_word(c)  ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WORD))
#define ri_head(c)  ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEAD))
#define ri_alpha(c) ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
#define ri_lower(c) ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_LOWER))
#define ri_upper(c) ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_UPPER))
#define ri_white(c) ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000274
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100275// flags for regflags
276#define RF_ICASE 1 // ignore case
277#define RF_NOICASE 2 // don't ignore case
278#define RF_HASNL 4 // can match a NL
279#define RF_ICOMBINE 8 // ignore combining characters
280#define RF_LOOKBH 16 // uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000281
282/*
283 * Global work variables for vim_regcomp().
284 */
285
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100286static char_u *regparse; // Input-scan pointer.
287static int regnpar; // () count.
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100288static int wants_nfa; // regex should use NFA engine
Bram Moolenaar071d4272004-06-13 20:20:40 +0000289#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100290static int regnzpar; // \z() count.
291static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000292#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100293static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000294#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000296#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000297
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100298static magic_T reg_magic; // magicness of the pattern
Bram Moolenaar071d4272004-06-13 20:20:40 +0000299
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100300static int reg_string; // matching with a string instead of a buffer
301 // line
302static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000303
304/*
305 * META contains all characters that may be magic, except '^' and '$'.
306 */
307
308#ifdef EBCDIC
309static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
310#else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100311// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000312static char_u META_flags[] = {
313 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
314 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100315// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000316 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100317// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000318 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100319// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000320 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100321// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000322 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100323// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000324 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100325// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000326 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
327};
328#endif
329
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100330static int curchr; // currently parsed character
331// Previous character. Note: prevchr is sometimes -1 when we are not at the
332// start, eg in /[ ^I]^ the pattern was never found even if it existed,
333// because ^ was taken to be magic -- webb
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200334static int prevchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100335static int prevprevchr; // previous-previous character
336static int nextchr; // used for ungetchr()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000337
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100338// arguments for reg()
339#define REG_NOPAREN 0 // toplevel reg()
340#define REG_PAREN 1 // \(\)
341#define REG_ZPAREN 2 // \z(\)
342#define REG_NPAREN 3 // \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000343
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200344typedef struct
345{
346 char_u *regparse;
347 int prevchr_len;
348 int curchr;
349 int prevchr;
350 int prevprevchr;
351 int nextchr;
352 int at_start;
353 int prev_at_start;
354 int regnpar;
355} parse_state_T;
356
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100357static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100358static int getchr(void);
359static void skipchr_keepstart(void);
360static int peekchr(void);
361static void skipchr(void);
362static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100363static long gethexchrs(int maxinputlen);
364static long getoctchrs(void);
365static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100366static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100367static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200368static int cstrncmp(char_u *s1, char_u *s2, int *n);
369static char_u *cstrchr(char_u *, int);
370static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100371static int reg_iswordc(int);
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100372#ifdef FEAT_EVAL
373static void report_re_switch(char_u *pat);
374#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000375
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200376static regengine_T bt_regengine;
377static regengine_T nfa_regengine;
378
Bram Moolenaar071d4272004-06-13 20:20:40 +0000379/*
380 * Return TRUE if compiled regular expression "prog" can match a line break.
381 */
382 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100383re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384{
385 return (prog->regflags & RF_HASNL);
386}
387
388/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000389 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
390 * Returns a character representing the class. Zero means that no item was
391 * recognized. Otherwise "pp" is advanced to after the item.
392 */
393 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100394get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000395{
396 int c;
397 int l = 1;
398 char_u *p = *pp;
399
Bram Moolenaar985079c2019-02-16 17:07:47 +0100400 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000401 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000402 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000403 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000404 if (p[l + 2] == '=' && p[l + 3] == ']')
405 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000406 if (has_mbyte)
407 c = mb_ptr2char(p + 2);
408 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000409 c = p[2];
410 *pp += l + 4;
411 return c;
412 }
413 }
414 return 0;
415}
416
Bram Moolenaar2c704a72010-06-03 21:17:25 +0200417#ifdef EBCDIC
418/*
419 * Table for equivalence class "c". (IBM-1047)
420 */
Bram Moolenaar5843f5f2019-08-20 20:13:45 +0200421static char *EQUIVAL_CLASS_C[16] = {
Bram Moolenaar2c704a72010-06-03 21:17:25 +0200422 "A\x62\x63\x64\x65\x66\x67",
423 "C\x68",
424 "E\x71\x72\x73\x74",
425 "I\x75\x76\x77\x78",
426 "N\x69",
Bram Moolenaar22e42152016-04-03 14:02:02 +0200427 "O\xEB\xEC\xED\xEE\xEF\x80",
Bram Moolenaar2c704a72010-06-03 21:17:25 +0200428 "U\xFB\xFC\xFD\xFE",
429 "Y\xBA",
430 "a\x42\x43\x44\x45\x46\x47",
431 "c\x48",
432 "e\x51\x52\x53\x54",
433 "i\x55\x56\x57\x58",
434 "n\x49",
Bram Moolenaar22e42152016-04-03 14:02:02 +0200435 "o\xCB\xCC\xCD\xCE\xCF\x70",
Bram Moolenaar2c704a72010-06-03 21:17:25 +0200436 "u\xDB\xDC\xDD\xDE",
437 "y\x8D\xDF",
438};
439#endif
440
Bram Moolenaardf177f62005-02-22 08:39:57 +0000441/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000442 * Check for a collating element "[.a.]". "pp" points to the '['.
443 * Returns a character. Zero means that no item was recognized. Otherwise
444 * "pp" is advanced to after the item.
445 * Currently only single characters are recognized!
446 */
447 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100448get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000449{
450 int c;
451 int l = 1;
452 char_u *p = *pp;
453
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100454 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000455 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000456 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000457 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000458 if (p[l + 2] == '.' && p[l + 3] == ']')
459 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000460 if (has_mbyte)
461 c = mb_ptr2char(p + 2);
462 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000463 c = p[2];
464 *pp += l + 4;
465 return c;
466 }
467 }
468 return 0;
469}
470
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100471static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
472static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200473
474 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100475get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200476{
477 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
478 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
479}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000480
481/*
482 * Skip over a "[]" range.
483 * "p" must point to the character after the '['.
484 * The returned pointer is on the matching ']', or the terminating NUL.
485 */
486 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100487skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000488{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000489 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000490
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100491 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000492 ++p;
493 if (*p == ']' || *p == '-')
494 ++p;
495 while (*p != NUL && *p != ']')
496 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000497 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000498 p += l;
499 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000500 if (*p == '-')
501 {
502 ++p;
503 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100504 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000505 }
506 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200507 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000508 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200509 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000510 p += 2;
511 else if (*p == '[')
512 {
513 if (get_char_class(&p) == CLASS_NONE
514 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200515 && get_coll_element(&p) == 0
516 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100517 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000518 }
519 else
520 ++p;
521 }
522
523 return p;
524}
525
526/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000527 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200528 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000529 * Take care of characters with a backslash in front of it.
530 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000531 */
532 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100533skip_regexp(
534 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200535 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200536 int magic)
537{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100538 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200539}
540
541/*
542 * Call skip_regexp() and when the delimiter does not match give an error and
543 * return NULL.
544 */
545 char_u *
546skip_regexp_err(
547 char_u *startp,
548 int delim,
549 int magic)
550{
551 char_u *p = skip_regexp(startp, delim, magic);
552
553 if (*p != delim)
554 {
Bram Moolenaara6f79292022-01-04 21:30:47 +0000555 semsg(_(e_missing_delimiter_after_search_pattern_str), startp);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200556 return NULL;
557 }
558 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200559}
560
561/*
562 * skip_regexp() with extra arguments:
563 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
564 * expression and change "\?" to "?". If "*newp" is not NULL the expression
565 * is changed in-place.
566 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100567 * If "magic_val" is not NULL, returns the effective magicness of the pattern
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200568 */
569 char_u *
570skip_regexp_ex(
571 char_u *startp,
572 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100573 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200574 char_u **newp,
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100575 int *dropped,
576 magic_T *magic_val)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000577{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100578 magic_T mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000579 char_u *p = startp;
580
581 if (magic)
582 mymagic = MAGIC_ON;
583 else
584 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200585 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000586
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100587 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000588 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100589 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000590 break;
591 if ((p[0] == '[' && mymagic >= MAGIC_ON)
592 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
593 {
594 p = skip_anyof(p + 1);
595 if (p[0] == NUL)
596 break;
597 }
598 else if (p[0] == '\\' && p[1] != NUL)
599 {
600 if (dirc == '?' && newp != NULL && p[1] == '?')
601 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100602 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000603 if (*newp == NULL)
604 {
605 *newp = vim_strsave(startp);
606 if (*newp != NULL)
607 p = *newp + (p - startp);
608 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200609 if (dropped != NULL)
610 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000611 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000612 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000613 else
614 ++p;
615 }
616 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100617 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000618 if (*p == 'v')
619 mymagic = MAGIC_ALL;
620 else if (*p == 'V')
621 mymagic = MAGIC_NONE;
622 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000623 }
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100624 if (magic_val != NULL)
625 *magic_val = mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000626 return p;
627}
628
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +0200629/*
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200630 * Functions for getting characters from the regexp input.
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +0200631 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100632static int prevchr_len; // byte length of previous char
Bram Moolenaar0270f382018-07-17 05:43:58 +0200633static int at_start; // True when on the first character
634static int prev_at_start; // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100635
Bram Moolenaar071d4272004-06-13 20:20:40 +0000636/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200637 * Start parsing at "str".
638 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000639 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100640initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000641{
642 regparse = str;
643 prevchr_len = 0;
644 curchr = prevprevchr = prevchr = nextchr = -1;
645 at_start = TRUE;
646 prev_at_start = FALSE;
647}
648
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200649/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200650 * Save the current parse state, so that it can be restored and parsing
651 * starts in the same state again.
652 */
653 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100654save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200655{
656 ps->regparse = regparse;
657 ps->prevchr_len = prevchr_len;
658 ps->curchr = curchr;
659 ps->prevchr = prevchr;
660 ps->prevprevchr = prevprevchr;
661 ps->nextchr = nextchr;
662 ps->at_start = at_start;
663 ps->prev_at_start = prev_at_start;
664 ps->regnpar = regnpar;
665}
666
667/*
668 * Restore a previously saved parse state.
669 */
670 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100671restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200672{
673 regparse = ps->regparse;
674 prevchr_len = ps->prevchr_len;
675 curchr = ps->curchr;
676 prevchr = ps->prevchr;
677 prevprevchr = ps->prevprevchr;
678 nextchr = ps->nextchr;
679 at_start = ps->at_start;
680 prev_at_start = ps->prev_at_start;
681 regnpar = ps->regnpar;
682}
683
684
685/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200686 * Get the next character without advancing.
687 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000688 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100689peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000690{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000691 static int after_slash = FALSE;
692
Bram Moolenaar071d4272004-06-13 20:20:40 +0000693 if (curchr == -1)
694 {
695 switch (curchr = regparse[0])
696 {
697 case '.':
698 case '[':
699 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100700 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000701 if (reg_magic >= MAGIC_ON)
702 curchr = Magic(curchr);
703 break;
704 case '(':
705 case ')':
706 case '{':
707 case '%':
708 case '+':
709 case '=':
710 case '?':
711 case '@':
712 case '!':
713 case '&':
714 case '|':
715 case '<':
716 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100717 case '#': // future ext.
718 case '"': // future ext.
719 case '\'': // future ext.
720 case ',': // future ext.
721 case '-': // future ext.
722 case ':': // future ext.
723 case ';': // future ext.
724 case '`': // future ext.
725 case '/': // Can't be used in / command
726 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000727 if (reg_magic == MAGIC_ALL)
728 curchr = Magic(curchr);
729 break;
730 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100731 // * is not magic as the very first character, eg "?*ptr", when
732 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
733 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000734 if (reg_magic >= MAGIC_ON
735 && !at_start
736 && !(prev_at_start && prevchr == Magic('^'))
737 && (after_slash
738 || (prevchr != Magic('(')
739 && prevchr != Magic('&')
740 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000741 curchr = Magic('*');
742 break;
743 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100744 // '^' is only magic as the very first character and if it's after
745 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000746 if (reg_magic >= MAGIC_OFF
747 && (at_start
748 || reg_magic == MAGIC_ALL
749 || prevchr == Magic('(')
750 || prevchr == Magic('|')
751 || prevchr == Magic('&')
752 || prevchr == Magic('n')
753 || (no_Magic(prevchr) == '('
754 && prevprevchr == Magic('%'))))
755 {
756 curchr = Magic('^');
757 at_start = TRUE;
758 prev_at_start = FALSE;
759 }
760 break;
761 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100762 // '$' is only magic as the very last char and if it's in front of
763 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000764 if (reg_magic >= MAGIC_OFF)
765 {
766 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200767 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000768
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100769 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000770 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200771 || p[1] == 'm' || p[1] == 'M'
772 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
773 {
774 if (p[1] == 'v')
775 is_magic_all = TRUE;
776 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
777 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000778 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200779 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000780 if (p[0] == NUL
781 || (p[0] == '\\'
782 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
783 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200784 || (is_magic_all
785 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000786 || reg_magic == MAGIC_ALL)
787 curchr = Magic('$');
788 }
789 break;
790 case '\\':
791 {
792 int c = regparse[1];
793
794 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100795 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000796 else if (
797#ifdef EBCDIC
798 vim_strchr(META, c)
799#else
800 c <= '~' && META_flags[c]
801#endif
802 )
803 {
804 /*
805 * META contains everything that may be magic sometimes,
806 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200807 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000808 * magicness. Therefore, \ is so meta-magic that it is
809 * not in META.
810 */
811 curchr = -1;
812 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100813 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000814 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000815 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000816 peekchr();
817 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000818 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000819 curchr = toggle_Magic(curchr);
820 }
821 else if (vim_strchr(REGEXP_ABBR, c))
822 {
823 /*
824 * Handle abbreviations, like "\t" for TAB -- webb
825 */
826 curchr = backslash_trans(c);
827 }
828 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
829 curchr = toggle_Magic(c);
830 else
831 {
832 /*
833 * Next character can never be (made) magic?
834 * Then backslashing it won't do anything.
835 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000836 if (has_mbyte)
837 curchr = (*mb_ptr2char)(regparse + 1);
838 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000839 curchr = c;
840 }
841 break;
842 }
843
Bram Moolenaar071d4272004-06-13 20:20:40 +0000844 default:
845 if (has_mbyte)
846 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000847 }
848 }
849
850 return curchr;
851}
852
853/*
854 * Eat one lexed character. Do this in a way that we can undo it.
855 */
856 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100857skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000858{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100859 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000860 if (*regparse == '\\')
861 prevchr_len = 1;
862 else
863 prevchr_len = 0;
864 if (regparse[prevchr_len] != NUL)
865 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000866 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100867 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000868 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000869 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000870 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000871 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000872 ++prevchr_len;
873 }
874 regparse += prevchr_len;
875 prev_at_start = at_start;
876 at_start = FALSE;
877 prevprevchr = prevchr;
878 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100879 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000880 nextchr = -1;
881}
882
883/*
884 * Skip a character while keeping the value of prev_at_start for at_start.
885 * prevchr and prevprevchr are also kept.
886 */
887 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100888skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000889{
890 int as = prev_at_start;
891 int pr = prevchr;
892 int prpr = prevprevchr;
893
894 skipchr();
895 at_start = as;
896 prevchr = pr;
897 prevprevchr = prpr;
898}
899
/*
 * Lexical analyzer entry point: return the next character of the pattern as
 * seen by peekchr() (which knows about magic and such) and consume it.
 */
    static int
getchr(void)
{
    int     lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
912
913/*
914 * put character back. Works only once!
915 */
916 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100917ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000918{
919 nextchr = curchr;
920 curchr = prevchr;
921 prevchr = prevprevchr;
922 at_start = prev_at_start;
923 prev_at_start = FALSE;
924
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100925 // Backup regparse, so that it's at the same position as before the
926 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000927 regparse -= prevchr_len;
928}
929
930/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000931 * Get and return the value of the hex string at the current position.
932 * Return -1 if there is no valid hex number.
933 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000934 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000935 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000936 * The parameter controls the maximum number of input characters. This will be
937 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
938 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100939 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100940gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000941{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100942 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000943 int c;
944 int i;
945
946 for (i = 0; i < maxinputlen; ++i)
947 {
948 c = regparse[0];
949 if (!vim_isxdigit(c))
950 break;
951 nr <<= 4;
952 nr |= hex2nr(c);
953 ++regparse;
954 }
955
956 if (i == 0)
957 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100958 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000959}
960
961/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200962 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000963 * current position. Return -1 for invalid. Consumes all digits.
964 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100965 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100966getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000967{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100968 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000969 int c;
970 int i;
971
972 for (i = 0; ; ++i)
973 {
974 c = regparse[0];
975 if (c < '0' || c > '9')
976 break;
977 nr *= 10;
978 nr += c - '0';
979 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100980 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000981 }
982
983 if (i == 0)
984 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100985 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000986}
987
988/*
989 * get and return the value of the octal string immediately after the current
990 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
991 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
992 * treat 8 or 9 as recognised characters. Position is updated:
993 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000994 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000995 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100996 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100997getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000998{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100999 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001000 int c;
1001 int i;
1002
1003 for (i = 0; i < 3 && nr < 040; ++i)
1004 {
1005 c = regparse[0];
1006 if (c < '0' || c > '7')
1007 break;
1008 nr <<= 3;
1009 nr |= hex2nr(c);
1010 ++regparse;
1011 }
1012
1013 if (i == 0)
1014 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001015 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001016}
1017
1018/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001019 * read_limits - Read two integers to be taken as a minimum and maximum.
1020 * If the first character is '-', then the range is reversed.
1021 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1022 * missing, a very big number is the default.
1023 */
1024 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001025read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001026{
1027 int reverse = FALSE;
1028 char_u *first_char;
1029 long tmp;
1030
1031 if (*regparse == '-')
1032 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001033 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001034 regparse++;
1035 reverse = TRUE;
1036 }
1037 first_char = regparse;
1038 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001039 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001040 {
1041 if (vim_isdigit(*++regparse))
1042 *maxval = getdigits(&regparse);
1043 else
1044 *maxval = MAX_LIMIT;
1045 }
1046 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001047 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001048 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001049 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001050 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001051 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001052 if (*regparse != '}')
Bram Moolenaar1d423ef2022-01-02 21:26:16 +00001053 EMSG2_RET_FAIL(_(e_syntax_error_in_str_curlies),
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001054 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001055
1056 /*
1057 * Reverse the range if there was a '-', or make sure it is in the right
1058 * order otherwise.
1059 */
1060 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1061 {
1062 tmp = *minval;
1063 *minval = *maxval;
1064 *maxval = tmp;
1065 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001066 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001067 return OK;
1068}
1069
1070/*
1071 * vim_regexec and friends
1072 */
1073
1074/*
1075 * Global work variables for vim_regexec().
1076 */
1077
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001078static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001079#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001080static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001081#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001082static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001083static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001084
1085/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001086 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1087 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001088 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001089 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001090static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001091static unsigned reg_tofreelen;
1092
1093/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001094 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001095 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001096 * done:
1097 * single-line multi-line
1098 * reg_match &regmatch_T NULL
1099 * reg_mmatch NULL &regmmatch_T
1100 * reg_startp reg_match->startp <invalid>
1101 * reg_endp reg_match->endp <invalid>
1102 * reg_startpos <invalid> reg_mmatch->startpos
1103 * reg_endpos <invalid> reg_mmatch->endpos
1104 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001105 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001106 * reg_firstlnum <invalid> first line in which to search
1107 * reg_maxline 0 last line nr
1108 * reg_line_lbr FALSE or TRUE FALSE
1109 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001110typedef struct {
1111 regmatch_T *reg_match;
1112 regmmatch_T *reg_mmatch;
1113 char_u **reg_startp;
1114 char_u **reg_endp;
1115 lpos_T *reg_startpos;
1116 lpos_T *reg_endpos;
1117 win_T *reg_win;
1118 buf_T *reg_buf;
1119 linenr_T reg_firstlnum;
1120 linenr_T reg_maxline;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001121 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001122
Bram Moolenaar0270f382018-07-17 05:43:58 +02001123 // The current match-position is stord in these variables:
1124 linenr_T lnum; // line number, relative to first line
1125 char_u *line; // start of current line
Bram Moolenaar64066b92021-11-17 18:22:56 +00001126 char_u *input; // current input, points into "line"
Bram Moolenaar0270f382018-07-17 05:43:58 +02001127
1128 int need_clear_subexpr; // subexpressions still need to be cleared
1129#ifdef FEAT_SYN_HL
1130 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1131 // cleared
1132#endif
1133
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001134 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1135 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1136 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001137 int reg_ic;
1138
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001139 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1140 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001141 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001142
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001143 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1144 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001145 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001146
1147 // State for the NFA engine regexec.
1148 int nfa_has_zend; // NFA regexp \ze operator encountered.
1149 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1150 int nfa_nsubexpr; // Number of sub expressions actually being used
1151 // during execution. 1 if only the whole match
1152 // (subexpr 0) is used.
1153 // listid is global, so that it increases on recursive calls to
1154 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1155 // all the states.
1156 int nfa_listid;
1157 int nfa_alt_listid;
1158
1159#ifdef FEAT_SYN_HL
1160 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1161#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001162} regexec_T;
1163
1164static regexec_T rex;
1165static int rex_in_use = FALSE;
1166
Bram Moolenaar071d4272004-06-13 20:20:40 +00001167/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001168 * Return TRUE if character 'c' is included in 'iskeyword' option for
1169 * "reg_buf" buffer.
1170 */
1171 static int
1172reg_iswordc(int c)
1173{
1174 return vim_iswordc_buf(c, rex.reg_buf);
1175}
1176
1177/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001178 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1179 */
1180 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001181reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001182{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001183 // when looking behind for a match/no-match lnum is negative. But we
1184 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001185 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001186 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001187 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001188 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001189 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001190 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001191}
1192
#ifdef FEAT_SYN_HL
// Workspace to mark the beginning and end of \z(...\) matches, both as
// pointers and as positions.
static char_u   *reg_startzp[NSUBEXP];      // beginning, as pointer
static char_u   *reg_endzp[NSUBEXP];        // end, as pointer
static lpos_T   reg_startzpos[NSUBEXP];     // idem, beginning pos
static lpos_T   reg_endzpos[NSUBEXP];       // idem, end pos
#endif
1199
// TRUE if using multi-line regexp: "rex.reg_match" is NULL then.
#define REG_MULTI       (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001202
Bram Moolenaar071d4272004-06-13 20:20:40 +00001203#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001204/*
1205 * Create a new extmatch and mark it as referenced once.
1206 */
1207 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001208make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001209{
1210 reg_extmatch_T *em;
1211
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001212 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001213 if (em != NULL)
1214 em->refcnt = 1;
1215 return em;
1216}
1217
1218/*
1219 * Add a reference to an extmatch.
1220 */
1221 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001222ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001223{
1224 if (em != NULL)
1225 em->refcnt++;
1226 return em;
1227}
1228
1229/*
1230 * Remove a reference to an extmatch. If there are no references left, free
1231 * the info.
1232 */
1233 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001234unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001235{
1236 int i;
1237
1238 if (em != NULL && --em->refcnt <= 0)
1239 {
1240 for (i = 0; i < NSUBEXP; ++i)
1241 vim_free(em->matches[i]);
1242 vim_free(em);
1243 }
1244}
1245#endif
1246
1247/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001248 * Get class of previous character.
1249 */
1250 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001251reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001252{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001253 if (rex.input > rex.line)
1254 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001255 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001256 return -1;
1257}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001258
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001259/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001260 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001261 */
1262 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001263reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001264{
1265 pos_T top, bot;
1266 linenr_T lnum;
1267 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001268 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001269 int mode;
1270 colnr_T start, end;
1271 colnr_T start2, end2;
1272 colnr_T cols;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001273 colnr_T curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001274
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001275 // Check if the buffer is the current buffer.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001276 if (rex.reg_buf != curbuf || VIsual.lnum == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001277 return FALSE;
1278
1279 if (VIsual_active)
1280 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001281 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001282 {
1283 top = VIsual;
1284 bot = wp->w_cursor;
1285 }
1286 else
1287 {
1288 top = wp->w_cursor;
1289 bot = VIsual;
1290 }
1291 mode = VIsual_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001292 curswant = wp->w_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001293 }
1294 else
1295 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001296 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001297 {
1298 top = curbuf->b_visual.vi_start;
1299 bot = curbuf->b_visual.vi_end;
1300 }
1301 else
1302 {
1303 top = curbuf->b_visual.vi_end;
1304 bot = curbuf->b_visual.vi_start;
1305 }
1306 mode = curbuf->b_visual.vi_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001307 curswant = curbuf->b_visual.vi_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001308 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001309 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001310 if (lnum < top.lnum || lnum > bot.lnum)
1311 return FALSE;
1312
Bram Moolenaar4c13e5e2021-12-30 14:49:43 +00001313 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001314 if (mode == 'v')
1315 {
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001316 if ((lnum == top.lnum && col < top.col)
1317 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1318 return FALSE;
1319 }
1320 else if (mode == Ctrl_V)
1321 {
1322 getvvcol(wp, &top, &start, NULL, &end);
1323 getvvcol(wp, &bot, &start2, NULL, &end2);
1324 if (start2 < start)
1325 start = start2;
1326 if (end2 > end)
1327 end = end2;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001328 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001329 end = MAXCOL;
Bram Moolenaar4c13e5e2021-12-30 14:49:43 +00001330
1331 // getvvcol() flushes rex.line, need to get it again
1332 rex.line = reg_getline(rex.lnum);
1333 rex.input = rex.line + col;
1334
1335 cols = win_linetabsize(wp, rex.line, col);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001336 if (cols < start || cols > end - (*p_sel == 'e'))
1337 return FALSE;
1338 }
1339 return TRUE;
1340}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001341
Bram Moolenaar071d4272004-06-13 20:20:40 +00001342/*
1343 * Check the regexp program for its magic number.
1344 * Return TRUE if it's wrong.
1345 */
1346 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001347prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001348{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001349 regprog_T *prog;
1350
Bram Moolenaar6100d022016-10-02 16:51:57 +02001351 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001352 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001353 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001354 return FALSE;
1355
1356 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001357 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001358 emsg(_(e_corrupted_regexp_program));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001359 return TRUE;
1360 }
1361 return FALSE;
1362}
1363
1364/*
1365 * Cleanup the subexpressions, if this wasn't done yet.
1366 * This construction is used to clear the subexpressions only when they are
1367 * used (to increase speed).
1368 */
1369 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001370cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001371{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001372 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001373 {
1374 if (REG_MULTI)
1375 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001376 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001377 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1378 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001379 }
1380 else
1381 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001382 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1383 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001384 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001385 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001386 }
1387}
1388
1389#ifdef FEAT_SYN_HL
1390 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001391cleanup_zsubexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001392{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001393 if (rex.need_clear_zsubexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001394 {
1395 if (REG_MULTI)
1396 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001397 // Use 0xff to set lnum to -1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001398 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1399 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1400 }
1401 else
1402 {
1403 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
1404 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
1405 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001406 rex.need_clear_zsubexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001407 }
1408}
1409#endif
1410
1411/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001412 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001413 */
1414 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001415reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001416{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001417 rex.line = reg_getline(++rex.lnum);
1418 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001419 fast_breakcheck();
1420}
1421
1422/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001423 * Check whether a backreference matches.
1424 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001425 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1426 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001427 */
1428 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001429match_with_backref(
1430 linenr_T start_lnum,
1431 colnr_T start_col,
1432 linenr_T end_lnum,
1433 colnr_T end_col,
1434 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001435{
1436 linenr_T clnum = start_lnum;
1437 colnr_T ccol = start_col;
1438 int len;
1439 char_u *p;
1440
1441 if (bytelen != NULL)
1442 *bytelen = 0;
1443 for (;;)
1444 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001445 // Since getting one line may invalidate the other, need to make copy.
1446 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001447 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001448 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001449 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001450 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1451 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001452 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001453 vim_free(reg_tofree);
1454 reg_tofree = alloc(len);
1455 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001456 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001457 reg_tofreelen = len;
1458 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001459 STRCPY(reg_tofree, rex.line);
1460 rex.input = reg_tofree + (rex.input - rex.line);
1461 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001462 }
1463
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001464 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001465 p = reg_getline(clnum);
1466 if (clnum == end_lnum)
1467 len = end_col - ccol;
1468 else
1469 len = (int)STRLEN(p + ccol);
1470
Bram Moolenaar0270f382018-07-17 05:43:58 +02001471 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001472 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001473 if (bytelen != NULL)
1474 *bytelen += len;
1475 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001476 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001477 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001478 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001479
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001480 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001481 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001482 if (bytelen != NULL)
1483 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001484 ++clnum;
1485 ccol = 0;
1486 if (got_int)
1487 return RA_FAIL;
1488 }
1489
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001490 // found a match! Note that rex.line may now point to a copy of the line,
1491 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001492 return RA_MATCH;
1493}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001494
Bram Moolenaarfb031402014-09-09 17:18:49 +02001495/*
1496 * Used in a place where no * or \+ can follow.
1497 */
1498 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001499re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001500{
1501 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001502 {
1503 semsg(_("E888: (NFA regexp) cannot repeat %s"), what);
1504 rc_did_emsg = TRUE;
1505 return FAIL;
1506 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001507 return OK;
1508}
1509
// The up to three characters that one character decomposes into.  Unused
// trailing members are zero.
typedef struct
{
    int a, b, c;
} decomp_T;

// First and last character that have an entry in decomp_table.
enum
{
    DECOMP_FIRST = 0xfb20,
    DECOMP_LAST = 0xfb4f
};

// Decomposition of the characters 0xfb20 - 0xfb4f.  The table is only read,
// thus it is const.
static const decomp_T decomp_table[DECOMP_LAST - DECOMP_FIRST + 1] =
{
    {0x5e2,0,0},                // 0xfb20       alt ayin
    {0x5d0,0,0},                // 0xfb21       alt alef
    {0x5d3,0,0},                // 0xfb22       alt dalet
    {0x5d4,0,0},                // 0xfb23       alt he
    {0x5db,0,0},                // 0xfb24       alt kaf
    {0x5dc,0,0},                // 0xfb25       alt lamed
    {0x5dd,0,0},                // 0xfb26       alt mem-sofit
    {0x5e8,0,0},                // 0xfb27       alt resh
    {0x5ea,0,0},                // 0xfb28       alt tav
    {'+', 0, 0},                // 0xfb29       alt plus
    {0x5e9, 0x5c1, 0},          // 0xfb2a       shin+shin-dot
    {0x5e9, 0x5c2, 0},          // 0xfb2b       shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},      // 0xfb2c       shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},      // 0xfb2d       shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},          // 0xfb2e       alef+patah
    {0x5d0, 0x5b8, 0},          // 0xfb2f       alef+qamats
    {0x5d0, 0x5b4, 0},          // 0xfb30       alef+hiriq
    {0x5d1, 0x5bc, 0},          // 0xfb31       bet+dagesh
    {0x5d2, 0x5bc, 0},          // 0xfb32       gimel+dagesh
    {0x5d3, 0x5bc, 0},          // 0xfb33       dalet+dagesh
    {0x5d4, 0x5bc, 0},          // 0xfb34       he+dagesh
    {0x5d5, 0x5bc, 0},          // 0xfb35       vav+dagesh
    {0x5d6, 0x5bc, 0},          // 0xfb36       zayin+dagesh
    {0xfb37, 0, 0},             // 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},          // 0xfb38       tet+dagesh
    {0x5d9, 0x5bc, 0},          // 0xfb39       yud+dagesh
    {0x5da, 0x5bc, 0},          // 0xfb3a       kaf sofit+dagesh
    {0x5db, 0x5bc, 0},          // 0xfb3b       kaf+dagesh
    {0x5dc, 0x5bc, 0},          // 0xfb3c       lamed+dagesh
    {0xfb3d, 0, 0},             // 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},          // 0xfb3e       mem+dagesh
    {0xfb3f, 0, 0},             // 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},          // 0xfb40       nun+dagesh
    {0x5e1, 0x5bc, 0},          // 0xfb41       samech+dagesh
    {0xfb42, 0, 0},             // 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},          // 0xfb43       pe sofit+dagesh
    {0x5e4, 0x5bc,0},           // 0xfb44       pe+dagesh
    {0xfb45, 0, 0},             // 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},          // 0xfb46       tsadi+dagesh
    {0x5e7, 0x5bc, 0},          // 0xfb47       qof+dagesh
    {0x5e8, 0x5bc, 0},          // 0xfb48       resh+dagesh
    {0x5e9, 0x5bc, 0},          // 0xfb49       shin+dagesh
    {0x5ea, 0x5bc, 0},          // 0xfb4a       tav+dagesh
    {0x5d5, 0x5b9, 0},          // 0xfb4b       vav+holam
    {0x5d1, 0x5bf, 0},          // 0xfb4c       bet+rafe
    {0x5db, 0x5bf, 0},          // 0xfb4d       kaf+rafe
    {0x5e4, 0x5bf, 0},          // 0xfb4e       pe+rafe
    {0x5d0, 0x5dc, 0}           // 0xfb4f       alef-lamed
};

/*
 * Decompose character "c" into up to three characters, which are stored in
 * "*c1", "*c2" and "*c3".  Parts that are not used are set to zero.
 * A character outside of the range of decomp_table is returned unchanged in
 * "*c1".  That also happens for the entries marked UNUSED, these hold the
 * character itself.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    if (c >= DECOMP_FIRST && c <= DECOMP_LAST)
    {
        const decomp_T  *d = &decomp_table[c - DECOMP_FIRST];

        *c1 = d->a;
        *c2 = d->b;
        *c3 = d->c;
    }
    else
    {
        *c1 = c;
        *c2 = 0;
        *c3 = 0;
    }
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001587
1588/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001589 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001590 * Return 0 if strings match, non-zero otherwise.
1591 * Correct the length "*n" when composing characters are ignored.
1592 */
1593 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001594cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001595{
1596 int result;
1597
Bram Moolenaar6100d022016-10-02 16:51:57 +02001598 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001599 result = STRNCMP(s1, s2, *n);
1600 else
1601 result = MB_STRNICMP(s1, s2, *n);
1602
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001603 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001604 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001605 {
1606 char_u *str1, *str2;
1607 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001608 int junk;
1609
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001610 // we have to handle the strcmp ourselves, since it is necessary to
1611 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001612 str1 = s1;
1613 str2 = s2;
1614 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001615 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001616 {
1617 c1 = mb_ptr2char_adv(&str1);
1618 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001619
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001620 // Decompose the character if necessary, into 'base' characters.
1621 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001622 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001623 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001624 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001625 mb_decompose(c1, &c11, &junk, &junk);
1626 mb_decompose(c2, &c12, &junk, &junk);
1627 c1 = c11;
1628 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001629 if (c11 != c12
1630 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001631 break;
1632 }
1633 }
1634 result = c2 - c1;
1635 if (result == 0)
1636 *n = (int)(str2 - s2);
1637 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001638
1639 return result;
1640}
1641
1642/*
1643 * cstrchr: This function is used a lot for simple searches, keep it fast!
1644 */
1645 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001646cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001647{
1648 char_u *p;
1649 int cc;
1650
Bram Moolenaara12a1612019-01-24 16:39:02 +01001651 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001652 return vim_strchr(s, c);
1653
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001654 // tolower() and toupper() can be slow, comparing twice should be a lot
1655 // faster (esp. when using MS Visual C++!).
1656 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001657 if (enc_utf8 && c > 0x80)
1658 cc = utf_fold(c);
1659 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001660 if (MB_ISUPPER(c))
1661 cc = MB_TOLOWER(c);
1662 else if (MB_ISLOWER(c))
1663 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001664 else
1665 return vim_strchr(s, c);
1666
Bram Moolenaar071d4272004-06-13 20:20:40 +00001667 if (has_mbyte)
1668 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001669 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001670 {
1671 if (enc_utf8 && c > 0x80)
1672 {
1673 if (utf_fold(utf_ptr2char(p)) == cc)
1674 return p;
1675 }
1676 else if (*p == c || *p == cc)
1677 return p;
1678 }
1679 }
1680 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001681 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001682 for (p = s; *p != NUL; ++p)
1683 if (*p == c || *p == cc)
1684 return p;
1685
1686 return NULL;
1687}
1688
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001689////////////////////////////////////////////////////////////////
1690// regsub stuff //
1691////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001692
/*
 * Ideally fptr_T would be a pointer to a function returning a pointer to a
 * function returning a pointer to a function ...
 * Such a self-referential type cannot be written in C, so fptr_T is declared
 * as a pointer to a function returning a void pointer instead.  This should
 * work for all compilers.
 */
typedef void *(*fptr_T)(int *, int);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001700
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001701static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001702
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001703 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001704do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001705{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001706 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001707
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001708 return (fptr_T)NULL;
1709}
1710
1711 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001712do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001713{
1714 *d = MB_TOUPPER(c);
1715
1716 return (fptr_T)do_Upper;
1717}
1718
1719 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001720do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001721{
1722 *d = MB_TOLOWER(c);
1723
1724 return (fptr_T)NULL;
1725}
1726
1727 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001728do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001729{
1730 *d = MB_TOLOWER(c);
1731
1732 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001733}
1734
1735/*
1736 * regtilde(): Replace tildes in the pattern by the old pattern.
1737 *
1738 * Short explanation of the tilde: It stands for the previous replacement
1739 * pattern. If that previous pattern also contains a ~ we should go back a
1740 * step further... But we insert the previous pattern into the current one
1741 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001742 * This still does not handle the case where "magic" changes. So require the
1743 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001744 *
1745 * The tildes are parsed once before the first call to vim_regsub().
1746 */
1747 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001748regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001749{
1750 char_u *newsub = source;
1751 char_u *tmpsub;
1752 char_u *p;
1753 int len;
1754 int prevlen;
1755
1756 for (p = newsub; *p; ++p)
1757 {
1758 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1759 {
1760 if (reg_prev_sub != NULL)
1761 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001762 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001763 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001764 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001765 if (tmpsub != NULL)
1766 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001767 // copy prefix
1768 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001769 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001770 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001771 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001772 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001773 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001774 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001775 STRCPY(tmpsub + len + prevlen, p + 1);
1776
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001777 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001778 vim_free(newsub);
1779 newsub = tmpsub;
1780 p = newsub + len + prevlen;
1781 }
1782 }
1783 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001784 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001785 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001786 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001787 --p;
1788 }
1789 else
1790 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001791 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001792 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001793 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001794 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001795 }
1796 }
1797
1798 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001799 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001800 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001801 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001802 reg_prev_sub = vim_strsave(newsub);
1803 return newsub;
1804}
1805
#ifdef FEAT_EVAL
// TRUE when submatch() can be used
static int can_f_submatch = FALSE;

// Match information kept for reg_submatch().  Needed for when the
// substitution string is an expression that contains a call to substitute()
// and submatch().
// The fields are filled from the corresponding "rex" members before the
// expression is evaluated.
typedef struct
{
    regmatch_T  *sm_match;      // from rex.reg_match
    regmmatch_T *sm_mmatch;     // from rex.reg_mmatch
    linenr_T    sm_firstlnum;   // from rex.reg_firstlnum
    linenr_T    sm_maxline;     // from rex.reg_maxline
    int         sm_line_lbr;    // from rex.reg_line_lbr
} regsubmatch_T;

// can only be used when can_f_submatch is TRUE
static regsubmatch_T rsm;
#endif
1822
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001823#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001824
1825/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001826 * Put the submatches in "argv[argskip]" which is a list passed into
1827 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001828 */
1829 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001830fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001831{
1832 listitem_T *li;
1833 int i;
1834 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001835 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001836
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001837 if (argcount == argskip)
1838 // called function doesn't take a submatches argument
1839 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001840
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001841 // Relies on sl_list to be the first item in staticList10_T.
1842 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001843
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001844 // There are always 10 list items in staticList10_T.
1845 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001846 for (i = 0; i < 10; ++i)
1847 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001848 s = rsm.sm_match->startp[i];
1849 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001850 s = NULL;
1851 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02001852 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001853 li->li_tv.v_type = VAR_STRING;
1854 li->li_tv.vval.v_string = s;
1855 li = li->li_next;
1856 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001857 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001858}
1859
1860 static void
1861clear_submatch_list(staticList10_T *sl)
1862{
1863 int i;
1864
1865 for (i = 0; i < 10; ++i)
1866 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1867}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001868#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001869
Bram Moolenaar071d4272004-06-13 20:20:40 +00001870/*
1871 * vim_regsub() - perform substitutions after a vim_regexec() or
1872 * vim_regexec_multi() match.
1873 *
1874 * If "copy" is TRUE really copy into "dest".
1875 * If "copy" is FALSE nothing is copied, this is just to find out the length
1876 * of the result.
1877 *
1878 * If "backslash" is TRUE, a backslash will be removed later, need to double
1879 * them to keep them, and insert a backslash before a CR to avoid it being
1880 * replaced with a line break later.
1881 *
1882 * Note: The matched text must not change between the call of
1883 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1884 * references invalid!
1885 *
1886 * Returns the size of the replacement, including terminating NUL.
1887 */
1888 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001889vim_regsub(
1890 regmatch_T *rmp,
1891 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001892 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001893 char_u *dest,
1894 int copy,
1895 int magic,
1896 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001897{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001898 int result;
1899 regexec_T rex_save;
1900 int rex_in_use_save = rex_in_use;
1901
1902 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001903 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001904 rex_save = rex;
1905 rex_in_use = TRUE;
1906
1907 rex.reg_match = rmp;
1908 rex.reg_mmatch = NULL;
1909 rex.reg_maxline = 0;
1910 rex.reg_buf = curbuf;
1911 rex.reg_line_lbr = TRUE;
1912 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1913
1914 rex_in_use = rex_in_use_save;
1915 if (rex_in_use)
1916 rex = rex_save;
1917
1918 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001919}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001920
1921 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001922vim_regsub_multi(
1923 regmmatch_T *rmp,
1924 linenr_T lnum,
1925 char_u *source,
1926 char_u *dest,
1927 int copy,
1928 int magic,
1929 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001930{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001931 int result;
1932 regexec_T rex_save;
1933 int rex_in_use_save = rex_in_use;
1934
1935 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001936 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001937 rex_save = rex;
1938 rex_in_use = TRUE;
1939
1940 rex.reg_match = NULL;
1941 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001942 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001943 rex.reg_firstlnum = lnum;
1944 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1945 rex.reg_line_lbr = FALSE;
1946 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1947
1948 rex_in_use = rex_in_use_save;
1949 if (rex_in_use)
1950 rex = rex_save;
1951
1952 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001953}
1954
1955 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001956vim_regsub_both(
1957 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001958 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001959 char_u *dest,
1960 int copy,
1961 int magic,
1962 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001963{
1964 char_u *src;
1965 char_u *dst;
1966 char_u *s;
1967 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001968 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001969 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001970 fptr_T func_all = (fptr_T)NULL;
1971 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001972 linenr_T clnum = 0; // init for GCC
1973 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001974#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001975 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001976#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001977
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001978 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001979 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001980 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001981 emsg(_(e_null_argument));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001982 return 0;
1983 }
1984 if (prog_magic_wrong())
1985 return 0;
1986 src = source;
1987 dst = dest;
1988
1989 /*
1990 * When the substitute part starts with "\=" evaluate it as an expression.
1991 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001992 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001993 {
1994#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001995 // To make sure that the length doesn't change between checking the
1996 // length and copying the string, and to speed up things, the
1997 // resulting string is saved from the call with "copy" == FALSE to the
1998 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001999 if (copy)
2000 {
2001 if (eval_result != NULL)
2002 {
2003 STRCPY(dest, eval_result);
2004 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01002005 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002006 }
2007 }
2008 else
2009 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002010 int prev_can_f_submatch = can_f_submatch;
2011 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002012
2013 vim_free(eval_result);
2014
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002015 // The expression may contain substitute(), which calls us
2016 // recursively. Make sure submatch() gets the text from the first
2017 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002018 if (can_f_submatch)
2019 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002020 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002021 rsm.sm_match = rex.reg_match;
2022 rsm.sm_mmatch = rex.reg_mmatch;
2023 rsm.sm_firstlnum = rex.reg_firstlnum;
2024 rsm.sm_maxline = rex.reg_maxline;
2025 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002026
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002027 if (expr != NULL)
2028 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002029 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002030 char_u buf[NUMBUFLEN];
2031 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002032 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002033 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002034
2035 rettv.v_type = VAR_STRING;
2036 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002037 argv[0].v_type = VAR_LIST;
2038 argv[0].vval.v_list = &matchList.sl_list;
2039 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002040 CLEAR_FIELD(funcexe);
Bram Moolenaar851f86b2021-12-13 14:26:44 +00002041 funcexe.fe_argv_func = fill_submatch_list;
2042 funcexe.fe_evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002043 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002044 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002045 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002046 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002047 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002048 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002049 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002050 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002051
Bram Moolenaar6100d022016-10-02 16:51:57 +02002052 s = partial_name(partial);
Bram Moolenaar851f86b2021-12-13 14:26:44 +00002053 funcexe.fe_partial = partial;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002054 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002055 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002056 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002057 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002058 clear_submatch_list(&matchList);
2059
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002060 if (rettv.v_type == VAR_UNKNOWN)
2061 // something failed, no need to report another error
2062 eval_result = NULL;
2063 else
2064 {
2065 eval_result = tv_get_string_buf_chk(&rettv, buf);
2066 if (eval_result != NULL)
2067 eval_result = vim_strsave(eval_result);
2068 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002069 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002070 }
Bram Moolenaar4c137212021-04-19 16:48:48 +02002071 else if (substitute_instr != NULL)
2072 // Execute instructions from ISN_SUBSTITUTE.
2073 eval_result = exe_substitute_instr();
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002074 else
Bram Moolenaarb171fb12020-06-24 20:34:03 +02002075 eval_result = eval_to_string(source + 2, TRUE);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002076
Bram Moolenaar071d4272004-06-13 20:20:40 +00002077 if (eval_result != NULL)
2078 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002079 int had_backslash = FALSE;
2080
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002081 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002082 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002083 // Change NL to CR, so that it becomes a line break,
2084 // unless called from vim_regexec_nl().
2085 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002086 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002087 *s = CAR;
2088 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002089 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002090 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002091 /* Change NL to CR here too, so that this works:
2092 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2093 * abc\
2094 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002095 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002096 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002097 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002098 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002099 had_backslash = TRUE;
2100 }
2101 }
2102 if (had_backslash && backslash)
2103 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002104 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002105 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2106 if (s != NULL)
2107 {
2108 vim_free(eval_result);
2109 eval_result = s;
2110 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002111 }
2112
2113 dst += STRLEN(eval_result);
2114 }
2115
Bram Moolenaar6100d022016-10-02 16:51:57 +02002116 can_f_submatch = prev_can_f_submatch;
2117 if (can_f_submatch)
2118 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002119 }
2120#endif
2121 }
2122 else
2123 while ((c = *src++) != NUL)
2124 {
2125 if (c == '&' && magic)
2126 no = 0;
2127 else if (c == '\\' && *src != NUL)
2128 {
2129 if (*src == '&' && !magic)
2130 {
2131 ++src;
2132 no = 0;
2133 }
2134 else if ('0' <= *src && *src <= '9')
2135 {
2136 no = *src++ - '0';
2137 }
2138 else if (vim_strchr((char_u *)"uUlLeE", *src))
2139 {
2140 switch (*src++)
2141 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002142 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002143 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002144 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002145 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002146 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002147 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002148 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002149 continue;
2150 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002151 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002152 continue;
2153 }
2154 }
2155 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002156 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002157 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002158 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2159 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002160 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002161 if (copy)
2162 {
2163 *dst++ = c;
2164 *dst++ = *src++;
2165 *dst++ = *src++;
2166 }
2167 else
2168 {
2169 dst += 3;
2170 src += 2;
2171 }
2172 continue;
2173 }
2174
Bram Moolenaar071d4272004-06-13 20:20:40 +00002175 if (c == '\\' && *src != NUL)
2176 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002177 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002178 switch (*src)
2179 {
2180 case 'r': c = CAR; ++src; break;
2181 case 'n': c = NL; ++src; break;
2182 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002183 // Oh no! \e already has meaning in subst pat :-(
2184 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002185 case 'b': c = Ctrl_H; ++src; break;
2186
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002187 // If "backslash" is TRUE the backslash will be removed
2188 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002189 default: if (backslash)
2190 {
2191 if (copy)
2192 *dst = '\\';
2193 ++dst;
2194 }
2195 c = *src++;
2196 }
2197 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002198 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002199 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002200
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002201 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002202 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002203 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002204 func_one = (fptr_T)(func_one(&cc, c));
2205 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002206 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002207 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002208 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002209 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002210
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002211 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002212 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002213 int totlen = mb_ptr2len(src - 1);
2214
Bram Moolenaar071d4272004-06-13 20:20:40 +00002215 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002216 mb_char2bytes(cc, dst);
2217 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002218 if (enc_utf8)
2219 {
2220 int clen = utf_ptr2len(src - 1);
2221
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002222 // If the character length is shorter than "totlen", there
2223 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002224 if (clen < totlen)
2225 {
2226 if (copy)
2227 mch_memmove(dst + 1, src - 1 + clen,
2228 (size_t)(totlen - clen));
2229 dst += totlen - clen;
2230 }
2231 }
2232 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002233 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002234 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002235 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002236 dst++;
2237 }
2238 else
2239 {
2240 if (REG_MULTI)
2241 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002242 clnum = rex.reg_mmatch->startpos[no].lnum;
2243 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002244 s = NULL;
2245 else
2246 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002247 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2248 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2249 len = rex.reg_mmatch->endpos[no].col
2250 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002251 else
2252 len = (int)STRLEN(s);
2253 }
2254 }
2255 else
2256 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002257 s = rex.reg_match->startp[no];
2258 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002259 s = NULL;
2260 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002261 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002262 }
2263 if (s != NULL)
2264 {
2265 for (;;)
2266 {
2267 if (len == 0)
2268 {
2269 if (REG_MULTI)
2270 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002271 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002272 break;
2273 if (copy)
2274 *dst = CAR;
2275 ++dst;
2276 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002277 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2278 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002279 else
2280 len = (int)STRLEN(s);
2281 }
2282 else
2283 break;
2284 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002285 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002286 {
2287 if (copy)
Bram Moolenaare29a27f2021-07-20 21:07:36 +02002288 iemsg(_(e_damaged_match_string));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002289 goto exit;
2290 }
2291 else
2292 {
2293 if (backslash && (*s == CAR || *s == '\\'))
2294 {
2295 /*
2296 * Insert a backslash in front of a CR, otherwise
2297 * it will be replaced by a line break.
2298 * Number of backslashes will be halved later,
2299 * double them here.
2300 */
2301 if (copy)
2302 {
2303 dst[0] = '\\';
2304 dst[1] = *s;
2305 }
2306 dst += 2;
2307 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002308 else
2309 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002310 if (has_mbyte)
2311 c = mb_ptr2char(s);
2312 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002313 c = *s;
2314
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002315 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002316 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002317 func_one = (fptr_T)(func_one(&cc, c));
2318 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002319 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002320 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002321 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002322 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002323
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002324 if (has_mbyte)
2325 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002326 int l;
2327
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002328 // Copy composing characters separately, one
2329 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002330 if (enc_utf8)
2331 l = utf_ptr2len(s) - 1;
2332 else
2333 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002334
2335 s += l;
2336 len -= l;
2337 if (copy)
2338 mb_char2bytes(cc, dst);
2339 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002340 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002341 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002342 *dst = cc;
2343 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002344 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002345
Bram Moolenaar071d4272004-06-13 20:20:40 +00002346 ++s;
2347 --len;
2348 }
2349 }
2350 }
2351 no = -1;
2352 }
2353 }
2354 if (copy)
2355 *dst = NUL;
2356
2357exit:
2358 return (int)((dst - dest) + 1);
2359}
2360
2361#ifdef FEAT_EVAL
2362/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002363 * Call reg_getline() with the line numbers from the submatch. If a
2364 * substitute() was used the reg_maxline and other values have been
2365 * overwritten.
2366 */
2367 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002368reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002369{
2370 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002371 linenr_T save_first = rex.reg_firstlnum;
2372 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002373
Bram Moolenaar6100d022016-10-02 16:51:57 +02002374 rex.reg_firstlnum = rsm.sm_firstlnum;
2375 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002376
2377 s = reg_getline(lnum);
2378
Bram Moolenaar6100d022016-10-02 16:51:57 +02002379 rex.reg_firstlnum = save_first;
2380 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002381 return s;
2382}
2383
2384/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002385 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002386 * allocated memory.
2387 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2388 */
2389 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002390reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002391{
2392 char_u *retval = NULL;
2393 char_u *s;
2394 int len;
2395 int round;
2396 linenr_T lnum;
2397
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002398 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002399 return NULL;
2400
Bram Moolenaar6100d022016-10-02 16:51:57 +02002401 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002402 {
2403 /*
2404 * First round: compute the length and allocate memory.
2405 * Second round: copy the text.
2406 */
2407 for (round = 1; round <= 2; ++round)
2408 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002409 lnum = rsm.sm_mmatch->startpos[no].lnum;
2410 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002411 return NULL;
2412
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002413 s = reg_getline_submatch(lnum);
2414 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002415 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002416 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002417 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002418 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002419 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002420 len = rsm.sm_mmatch->endpos[no].col
2421 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002422 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002423 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002424 ++len;
2425 }
2426 else
2427 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002428 // Multiple lines: take start line from start col, middle
2429 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002430 len = (int)STRLEN(s);
2431 if (round == 2)
2432 {
2433 STRCPY(retval, s);
2434 retval[len] = '\n';
2435 }
2436 ++len;
2437 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002438 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002439 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002440 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002441 if (round == 2)
2442 STRCPY(retval + len, s);
2443 len += (int)STRLEN(s);
2444 if (round == 2)
2445 retval[len] = '\n';
2446 ++len;
2447 }
2448 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002449 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002450 rsm.sm_mmatch->endpos[no].col);
2451 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002452 if (round == 2)
2453 retval[len] = NUL;
2454 ++len;
2455 }
2456
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002457 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002458 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002459 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002460 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002461 return NULL;
2462 }
2463 }
2464 }
2465 else
2466 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002467 s = rsm.sm_match->startp[no];
2468 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002469 retval = NULL;
2470 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002471 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002472 }
2473
2474 return retval;
2475}
Bram Moolenaar41571762014-04-02 19:00:58 +02002476
2477/*
2478 * Used for the submatch() function with the optional non-zero argument: get
2479 * the list of strings from the n'th submatch in allocated memory with NULs
2480 * represented in NLs.
2481 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2482 * command, for a non-existing submatch and for any error.
2483 */
2484 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002485reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002486{
2487 char_u *s;
2488 linenr_T slnum;
2489 linenr_T elnum;
2490 colnr_T scol;
2491 colnr_T ecol;
2492 int i;
2493 list_T *list;
2494 int error = FALSE;
2495
2496 if (!can_f_submatch || no < 0)
2497 return NULL;
2498
Bram Moolenaar6100d022016-10-02 16:51:57 +02002499 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002500 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002501 slnum = rsm.sm_mmatch->startpos[no].lnum;
2502 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002503 if (slnum < 0 || elnum < 0)
2504 return NULL;
2505
Bram Moolenaar6100d022016-10-02 16:51:57 +02002506 scol = rsm.sm_mmatch->startpos[no].col;
2507 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002508
2509 list = list_alloc();
2510 if (list == NULL)
2511 return NULL;
2512
2513 s = reg_getline_submatch(slnum) + scol;
2514 if (slnum == elnum)
2515 {
2516 if (list_append_string(list, s, ecol - scol) == FAIL)
2517 error = TRUE;
2518 }
2519 else
2520 {
2521 if (list_append_string(list, s, -1) == FAIL)
2522 error = TRUE;
2523 for (i = 1; i < elnum - slnum; i++)
2524 {
2525 s = reg_getline_submatch(slnum + i);
2526 if (list_append_string(list, s, -1) == FAIL)
2527 error = TRUE;
2528 }
2529 s = reg_getline_submatch(elnum);
2530 if (list_append_string(list, s, ecol) == FAIL)
2531 error = TRUE;
2532 }
2533 }
2534 else
2535 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002536 s = rsm.sm_match->startp[no];
2537 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002538 return NULL;
2539 list = list_alloc();
2540 if (list == NULL)
2541 return NULL;
2542 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002543 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002544 error = TRUE;
2545 }
2546
2547 if (error)
2548 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002549 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002550 return NULL;
2551 }
Bram Moolenaar8a0dcf42020-09-06 15:14:45 +02002552 ++list->lv_refcount;
Bram Moolenaar41571762014-04-02 19:00:58 +02002553 return list;
2554}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002555#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002556
Bram Moolenaarf4140482020-02-15 23:06:45 +01002557/*
2558 * Initialize the values used for matching against multiple lines
2559 */
2560 static void
2561init_regexec_multi(
2562 regmmatch_T *rmp,
2563 win_T *win, // window in which to search or NULL
2564 buf_T *buf, // buffer in which to search
2565 linenr_T lnum) // nr of line to start looking for match
2566{
2567 rex.reg_match = NULL;
2568 rex.reg_mmatch = rmp;
2569 rex.reg_buf = buf;
2570 rex.reg_win = win;
2571 rex.reg_firstlnum = lnum;
2572 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2573 rex.reg_line_lbr = FALSE;
2574 rex.reg_ic = rmp->rmm_ic;
2575 rex.reg_icombine = FALSE;
2576 rex.reg_maxcol = rmp->rmm_maxcol;
2577}
2578
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002579#include "regexp_bt.c"
2580
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002581static regengine_T bt_regengine =
2582{
2583 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002584 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002585 bt_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002586 bt_regexec_multi,
2587 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002588};
2589
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002590#include "regexp_nfa.c"
2591
2592static regengine_T nfa_regengine =
2593{
2594 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002595 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002596 nfa_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002597 nfa_regexec_multi,
2598 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002599};
2600
// The regexp engine selected for the pattern being compiled; set by
// vim_regcomp() from 'regexpengine' or from a "\%#=" prefix in the pattern.
// Must match with 'regexpengine'.
static int regexp_engine = 0;

#ifdef DEBUG
// Engine names for the debugging message, indexed by engine number.
static char_u regname[][30] = {
                    "AUTOMATIC Regexp Engine",
                    "BACKTRACKING Regexp Engine",
                    "NFA Regexp Engine"
                };
#endif
2612
2613/*
2614 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002615 * Returns the program in allocated memory.
2616 * Use vim_regfree() to free the memory.
2617 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002618 */
2619 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002620vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002621{
2622 regprog_T *prog = NULL;
2623 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002624 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002625
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002626 regexp_engine = p_re;
2627
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002628 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002629 if (STRNCMP(expr, "\\%#=", 4) == 0)
2630 {
2631 int newengine = expr[4] - '0';
2632
2633 if (newengine == AUTOMATIC_ENGINE
2634 || newengine == BACKTRACKING_ENGINE
2635 || newengine == NFA_ENGINE)
2636 {
2637 regexp_engine = expr[4] - '0';
2638 expr += 5;
2639#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002640 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002641 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002642#endif
2643 }
2644 else
2645 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002646 emsg(_("E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002647 regexp_engine = AUTOMATIC_ENGINE;
2648 }
2649 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002650#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002651 bt_regengine.expr = expr;
2652 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002653#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002654 // reg_iswordc() uses rex.reg_buf
2655 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002656
2657 /*
2658 * First try the NFA engine, unless backtracking was requested.
2659 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002660 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002661 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002662 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002663 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002664 else
2665 prog = bt_regengine.regcomp(expr, re_flags);
2666
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002667 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002668 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002669 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002670#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002671 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002672 {
2673 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002674 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002675 if (f)
2676 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002677 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002678 fclose(f);
2679 }
2680 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002681 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002682 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002683 }
2684#endif
2685 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002686 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002687 * The NFA engine also fails for patterns that it can't handle well
2688 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002689 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002690 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002691 if (regexp_engine == AUTOMATIC_ENGINE
2692 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002693 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002694 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002695#ifdef FEAT_EVAL
2696 report_re_switch(expr);
2697#endif
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002698 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002699 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002700 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002701
Bram Moolenaarfda37292014-11-05 14:27:36 +01002702 if (prog != NULL)
2703 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002704 // Store the info needed to call regcomp() again when the engine turns
2705 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002706 prog->re_engine = regexp_engine;
2707 prog->re_flags = re_flags;
2708 }
2709
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002710 return prog;
2711}
2712
2713/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002714 * Free a compiled regexp program, returned by vim_regcomp().
2715 */
2716 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002717vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002718{
2719 if (prog != NULL)
2720 prog->engine->regfree(prog);
2721}
2722
#if defined(EXITFREE) || defined(PROTO)
/*
 * Free the memory held by the file-level regexp variables: the "regstack"
 * and "backpos" growarrays and the "reg_tofree" and "reg_prev_sub" strings.
 */
    void
free_regexp_stuff(void)
{
    // allocated strings
    vim_free(reg_tofree);
    vim_free(reg_prev_sub);

    // growarrays
    ga_clear(&regstack);
    ga_clear(&backpos);
}
#endif
2733
#ifdef FEAT_EVAL
/*
 * When 'verbose' is above zero, give a message that the backtracking regexp
 * engine is going to be used for pattern "pat".
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
        return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
2747
#if defined(FEAT_X11) || defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog": non-zero while "prog" is being
 * executed.
 * "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    int busy = prog->re_in_use;

    return busy;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002758
2759/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002760 * Match a regexp against a string.
2761 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002762 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002763 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002764 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002765 *
2766 * Return TRUE if there is a match, FALSE if not.
2767 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002768 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002769vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002770 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002771 char_u *line, // string to match against
2772 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002773 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002774{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002775 int result;
2776 regexec_T rex_save;
2777 int rex_in_use_save = rex_in_use;
2778
Bram Moolenaar0270f382018-07-17 05:43:58 +02002779 // Cannot use the same prog recursively, it contains state.
2780 if (rmp->regprog->re_in_use)
2781 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002782 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002783 return FALSE;
2784 }
2785 rmp->regprog->re_in_use = TRUE;
2786
Bram Moolenaar6100d022016-10-02 16:51:57 +02002787 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002788 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002789 rex_save = rex;
2790 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002791
Bram Moolenaar6100d022016-10-02 16:51:57 +02002792 rex.reg_startp = NULL;
2793 rex.reg_endp = NULL;
2794 rex.reg_startpos = NULL;
2795 rex.reg_endpos = NULL;
2796
2797 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002798 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002799
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002800 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002801 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2802 && result == NFA_TOO_EXPENSIVE)
2803 {
2804 int save_p_re = p_re;
2805 int re_flags = rmp->regprog->re_flags;
2806 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2807
2808 p_re = BACKTRACKING_ENGINE;
2809 vim_regfree(rmp->regprog);
2810 if (pat != NULL)
2811 {
2812#ifdef FEAT_EVAL
2813 report_re_switch(pat);
2814#endif
2815 rmp->regprog = vim_regcomp(pat, re_flags);
2816 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002817 {
2818 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002819 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002820 rmp->regprog->re_in_use = FALSE;
2821 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002822 vim_free(pat);
2823 }
2824
2825 p_re = save_p_re;
2826 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002827
2828 rex_in_use = rex_in_use_save;
2829 if (rex_in_use)
2830 rex = rex_save;
2831
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002832 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002833}
2834
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002835/*
2836 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002837 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002838 */
2839 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002840vim_regexec_prog(
2841 regprog_T **prog,
2842 int ignore_case,
2843 char_u *line,
2844 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002845{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002846 int r;
2847 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002848
2849 regmatch.regprog = *prog;
2850 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002851 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002852 *prog = regmatch.regprog;
2853 return r;
2854}
2855
2856/*
2857 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002858 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002859 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002860 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002861vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002862{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002863 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002864}
2865
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002866/*
2867 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002868 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002869 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002870 */
2871 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002872vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002873{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002874 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002875}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002876
2877/*
2878 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002879 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2880 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002881 * Uses curbuf for line count and 'iskeyword'.
2882 *
2883 * Return zero if there is no match. Return number of lines contained in the
2884 * match otherwise.
2885 */
2886 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002887vim_regexec_multi(
2888 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002889 win_T *win, // window in which to search or NULL
2890 buf_T *buf, // buffer in which to search
2891 linenr_T lnum, // nr of line to start looking for match
2892 colnr_T col, // column to start looking for match
2893 proftime_T *tm, // timeout limit or NULL
2894 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002895{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002896 int result;
2897 regexec_T rex_save;
2898 int rex_in_use_save = rex_in_use;
2899
Bram Moolenaar0270f382018-07-17 05:43:58 +02002900 // Cannot use the same prog recursively, it contains state.
2901 if (rmp->regprog->re_in_use)
2902 {
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002903 emsg(_(e_recursive));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002904 return FALSE;
2905 }
2906 rmp->regprog->re_in_use = TRUE;
2907
Bram Moolenaar6100d022016-10-02 16:51:57 +02002908 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002909 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002910 rex_save = rex;
2911 rex_in_use = TRUE;
2912
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002913 result = rmp->regprog->engine->regexec_multi(
2914 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002915 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002916
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002917 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002918 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2919 && result == NFA_TOO_EXPENSIVE)
2920 {
2921 int save_p_re = p_re;
2922 int re_flags = rmp->regprog->re_flags;
2923 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2924
2925 p_re = BACKTRACKING_ENGINE;
2926 vim_regfree(rmp->regprog);
2927 if (pat != NULL)
2928 {
2929#ifdef FEAT_EVAL
2930 report_re_switch(pat);
2931#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002932#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002933 // checking for \z misuse was already done when compiling for NFA,
2934 // allow all here
2935 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002936#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002937 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002938#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002939 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002940#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002941
Bram Moolenaarfda37292014-11-05 14:27:36 +01002942 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002943 {
2944 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002945 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002946 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002947 rmp->regprog->re_in_use = FALSE;
2948 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002949 vim_free(pat);
2950 }
2951 p_re = save_p_re;
2952 }
2953
Bram Moolenaar6100d022016-10-02 16:51:57 +02002954 rex_in_use = rex_in_use_save;
2955 if (rex_in_use)
2956 rex = rex_save;
2957
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002958 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002959}