blob: 95057655533e04dc12262a364e5263083603e46a [file] [log] [blame]
/* vi:set ts=8 sts=4 sw=4 noet:
 *
 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
 */
5
Bram Moolenaarc2d09c92019-04-25 20:07:51 +02006// By default: do not create debugging logs or files related to regular
7// expressions, even when compiling with -DDEBUG.
8// Uncomment the second line to get the regexp debugging.
9#undef DEBUG
10// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020014#ifdef DEBUG
Bram Moolenaar63d9e732019-12-05 21:10:38 +010015// show/save debugging data when BT engine is used
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020016# define BT_REGEXP_DUMP
Bram Moolenaar63d9e732019-12-05 21:10:38 +010017// save the debugging data to a file instead of displaying it
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020018# define BT_REGEXP_LOG
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +020019# define BT_REGEXP_DEBUG_LOG
20# define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log"
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020021#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
/*
 * Magic characters have a special meaning, they don't match literally.
 * A Magic character is stored as a negative number (the character minus
 * 256), which keeps it apart from literal characters (possibly multi-byte).
 * Only ASCII characters can be Magic.
 */
#define Magic(x)	((int)(x) - 256)
#define un_Magic(x)	((x) + 256)
#define is_Magic(x)	((x) < 0)

/*
 * Return "x" without magicness: a Magic character is turned into its
 * literal counterpart, a literal character is returned unchanged.
 */
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}

/*
 * Return "x" with its magicness flipped: Magic becomes literal and literal
 * becomes Magic.
 */
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
47
48/*
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020049 * The first byte of the BT regexp internal "program" is actually this magic
Bram Moolenaar071d4272004-06-13 20:20:40 +000050 * number; the start node begins in the second byte. It's used to catch the
51 * most severe mutilation of the program by the caller.
52 */
53
54#define REGMAGIC 0234
55
56/*
Bram Moolenaar071d4272004-06-13 20:20:40 +000057 * Utility definitions.
58 */
59#define UCHARAT(p) ((int)*(char_u *)(p))
60
Bram Moolenaar63d9e732019-12-05 21:10:38 +010061// Used for an error (down from) vim_regcomp(): give the error message, set
62// rc_did_emsg and return NULL
Bram Moolenaarf9e3e092019-01-13 23:38:42 +010063#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
64#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
65#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
66#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
Bram Moolenaar1be45b22019-01-14 22:46:15 +010067#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
Bram Moolenaarf9e3e092019-01-13 23:38:42 +010068#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
Bram Moolenaarac78dd42022-01-02 19:25:26 +000069#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_(e_invalid_item_in_str_brackets), reg_magic == MAGIC_ALL)
Bram Moolenaar071d4272004-06-13 20:20:40 +000070
Bram Moolenaar95f09602016-11-10 20:01:45 +010071
Bram Moolenaar071d4272004-06-13 20:20:40 +000072#define MAX_LIMIT (32767L << 16L)
73
Bram Moolenaar071d4272004-06-13 20:20:40 +000074#define NOT_MULTI 0
75#define MULTI_ONE 1
76#define MULTI_MULT 2
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020077
78// return values for regmatch()
Bram Moolenaar63d9e732019-12-05 21:10:38 +010079#define RA_FAIL 1 // something failed, abort
80#define RA_CONT 2 // continue in inner loop
81#define RA_BREAK 3 // break inner loop
82#define RA_MATCH 4 // successful match
83#define RA_NOMATCH 5 // didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +020084
Bram Moolenaar071d4272004-06-13 20:20:40 +000085/*
86 * Return NOT_MULTI if c is not a "multi" operator.
87 * Return MULTI_ONE if c is a single "multi" operator.
88 * Return MULTI_MULT if c is a multi "multi" operator.
89 */
90 static int
Bram Moolenaar05540972016-01-30 20:31:25 +010091re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +000092{
93 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
94 return MULTI_ONE;
95 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
96 return MULTI_MULT;
97 return NOT_MULTI;
98}
99
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000100static char_u *reg_prev_sub = NULL;
101
/*
 * REGEXP_INRANGE contains all characters which are always special in a []
 * range after '\'.
 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
 * These are:
 *  \n	- New line (NL).
 *  \r	- Carriage Return (CR).
 *  \t	- Tab (TAB).
 *  \e	- Escape (ESC).
 *  \b	- Backspace (Ctrl_H).
 *  \d  - Character code in decimal, eg \d123
 *  \o	- Character code in octal, eg \o40
 *  \x	- Character code in hex, eg \x4a
 *  \u	- Multibyte character code, eg \u20ac
 *  \U	- Long multibyte character code, eg \U12345678
 */
118static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000119static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000120
Bram Moolenaar071d4272004-06-13 20:20:40 +0000121/*
122 * Translate '\x' to its control character, except "\n", which is Magic.
123 */
124 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100125backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000126{
127 switch (c)
128 {
129 case 'r': return CAR;
130 case 't': return TAB;
131 case 'e': return ESC;
132 case 'b': return BS;
133 }
134 return c;
135}
136
137/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000138 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000139 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
140 * recognized. Otherwise "pp" is advanced to after the item.
141 */
142 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100143get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000144{
145 static const char *(class_names[]) =
146 {
147 "alnum:]",
148#define CLASS_ALNUM 0
149 "alpha:]",
150#define CLASS_ALPHA 1
151 "blank:]",
152#define CLASS_BLANK 2
153 "cntrl:]",
154#define CLASS_CNTRL 3
155 "digit:]",
156#define CLASS_DIGIT 4
157 "graph:]",
158#define CLASS_GRAPH 5
159 "lower:]",
160#define CLASS_LOWER 6
161 "print:]",
162#define CLASS_PRINT 7
163 "punct:]",
164#define CLASS_PUNCT 8
165 "space:]",
166#define CLASS_SPACE 9
167 "upper:]",
168#define CLASS_UPPER 10
169 "xdigit:]",
170#define CLASS_XDIGIT 11
171 "tab:]",
172#define CLASS_TAB 12
173 "return:]",
174#define CLASS_RETURN 13
175 "backspace:]",
176#define CLASS_BACKSPACE 14
177 "escape:]",
178#define CLASS_ESCAPE 15
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100179 "ident:]",
180#define CLASS_IDENT 16
181 "keyword:]",
182#define CLASS_KEYWORD 17
183 "fname:]",
184#define CLASS_FNAME 18
Bram Moolenaar071d4272004-06-13 20:20:40 +0000185 };
186#define CLASS_NONE 99
187 int i;
188
189 if ((*pp)[1] == ':')
190 {
K.Takataeeec2542021-06-02 13:28:16 +0200191 for (i = 0; i < (int)ARRAY_LENGTH(class_names); ++i)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000192 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
193 {
194 *pp += STRLEN(class_names[i]) + 2;
195 return i;
196 }
197 }
198 return CLASS_NONE;
199}
200
201/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000202 * Specific version of character class functions.
203 * Using a table to keep this fast.
204 */
205static short class_tab[256];
206
207#define RI_DIGIT 0x01
208#define RI_HEX 0x02
209#define RI_OCTAL 0x04
210#define RI_WORD 0x08
211#define RI_HEAD 0x10
212#define RI_ALPHA 0x20
213#define RI_LOWER 0x40
214#define RI_UPPER 0x80
215#define RI_WHITE 0x100
216
217 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100218init_class_tab(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000219{
220 int i;
221 static int done = FALSE;
222
223 if (done)
224 return;
225
226 for (i = 0; i < 256; ++i)
227 {
228 if (i >= '0' && i <= '7')
229 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
230 else if (i >= '8' && i <= '9')
231 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
232 else if (i >= 'a' && i <= 'f')
233 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
234#ifdef EBCDIC
235 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
236 || (i >= 's' && i <= 'z'))
237#else
238 else if (i >= 'g' && i <= 'z')
239#endif
240 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
241 else if (i >= 'A' && i <= 'F')
242 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
243#ifdef EBCDIC
244 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
245 || (i >= 'S' && i <= 'Z'))
246#else
247 else if (i >= 'G' && i <= 'Z')
248#endif
249 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
250 else if (i == '_')
251 class_tab[i] = RI_WORD + RI_HEAD;
252 else
253 class_tab[i] = 0;
254 }
255 class_tab[' '] |= RI_WHITE;
256 class_tab['\t'] |= RI_WHITE;
257 done = TRUE;
258}
259
// Character class tests that use class_tab[].  A character of 0x100 or above
// is never in one of these classes.
// The argument is parenthesized in the comparison, so that an expression such
// as "x ? a : b" can be passed safely.  "c" is evaluated twice: do not pass
// an expression with side effects.  "c" is used as the table index without a
// lower bound check, thus it must not be negative.
#define ri_digit(c)	((c) < 0x100 && (class_tab[c] & RI_DIGIT))
#define ri_hex(c)	((c) < 0x100 && (class_tab[c] & RI_HEX))
#define ri_octal(c)	((c) < 0x100 && (class_tab[c] & RI_OCTAL))
#define ri_word(c)	((c) < 0x100 && (class_tab[c] & RI_WORD))
#define ri_head(c)	((c) < 0x100 && (class_tab[c] & RI_HEAD))
#define ri_alpha(c)	((c) < 0x100 && (class_tab[c] & RI_ALPHA))
#define ri_lower(c)	((c) < 0x100 && (class_tab[c] & RI_LOWER))
#define ri_upper(c)	((c) < 0x100 && (class_tab[c] & RI_UPPER))
#define ri_white(c)	((c) < 0x100 && (class_tab[c] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000269
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100270// flags for regflags
271#define RF_ICASE 1 // ignore case
272#define RF_NOICASE 2 // don't ignore case
273#define RF_HASNL 4 // can match a NL
274#define RF_ICOMBINE 8 // ignore combining characters
275#define RF_LOOKBH 16 // uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000276
277/*
278 * Global work variables for vim_regcomp().
279 */
280
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100281static char_u *regparse; // Input-scan pointer.
282static int regnpar; // () count.
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100283static int wants_nfa; // regex should use NFA engine
Bram Moolenaar071d4272004-06-13 20:20:40 +0000284#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100285static int regnzpar; // \z() count.
286static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000287#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100288static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000289#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100290static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000291#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000292
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100293static magic_T reg_magic; // magicness of the pattern
Bram Moolenaar071d4272004-06-13 20:20:40 +0000294
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100295static int reg_string; // matching with a string instead of a buffer
296 // line
297static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000298
299/*
300 * META contains all characters that may be magic, except '^' and '$'.
301 */
302
303#ifdef EBCDIC
304static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
305#else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100306// META[] is used often enough to justify turning it into a table.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000307static char_u META_flags[] = {
308 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
309 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100310// % & ( ) * + .
Bram Moolenaar071d4272004-06-13 20:20:40 +0000311 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100312// 1 2 3 4 5 6 7 8 9 < = > ?
Bram Moolenaar071d4272004-06-13 20:20:40 +0000313 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100314// @ A C D F H I K L M O
Bram Moolenaar071d4272004-06-13 20:20:40 +0000315 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100316// P S U V W X Z [ _
Bram Moolenaar071d4272004-06-13 20:20:40 +0000317 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100318// a c d f h i k l m n o
Bram Moolenaar071d4272004-06-13 20:20:40 +0000319 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100320// p s u v w x z { | ~
Bram Moolenaar071d4272004-06-13 20:20:40 +0000321 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
322};
323#endif
324
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100325static int curchr; // currently parsed character
326// Previous character. Note: prevchr is sometimes -1 when we are not at the
327// start, eg in /[ ^I]^ the pattern was never found even if it existed,
328// because ^ was taken to be magic -- webb
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200329static int prevchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100330static int prevprevchr; // previous-previous character
331static int nextchr; // used for ungetchr()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000332
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100333// arguments for reg()
334#define REG_NOPAREN 0 // toplevel reg()
335#define REG_PAREN 1 // \(\)
336#define REG_ZPAREN 2 // \z(\)
337#define REG_NPAREN 3 // \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000338
// Snapshot of the parser state: filled in by save_parse_state() and put back
// by restore_parse_state().  Each member holds the value of the file-level
// variable with the same name.
typedef struct
{
    char_u	*regparse;	// input-scan pointer
    int		prevchr_len;	// byte length of previous char
    int		curchr;		// currently parsed character
    int		prevchr;	// previous character
    int		prevprevchr;	// previous-previous character
    int		nextchr;	// used for ungetchr()
    int		at_start;	// True when on the first character
    int		prev_at_start;	// True when on the second character
    int		regnpar;	// () count
} parse_state_T;
351
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100352static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100353static int getchr(void);
354static void skipchr_keepstart(void);
355static int peekchr(void);
356static void skipchr(void);
357static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100358static long gethexchrs(int maxinputlen);
359static long getoctchrs(void);
360static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100361static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100362static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200363static int cstrncmp(char_u *s1, char_u *s2, int *n);
364static char_u *cstrchr(char_u *, int);
365static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100366static int reg_iswordc(int);
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100367#ifdef FEAT_EVAL
368static void report_re_switch(char_u *pat);
369#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000370
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200371static regengine_T bt_regengine;
372static regengine_T nfa_regengine;
373
Bram Moolenaar071d4272004-06-13 20:20:40 +0000374/*
375 * Return TRUE if compiled regular expression "prog" can match a line break.
376 */
377 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100378re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000379{
380 return (prog->regflags & RF_HASNL);
381}
382
383/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000384 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
385 * Returns a character representing the class. Zero means that no item was
386 * recognized. Otherwise "pp" is advanced to after the item.
387 */
388 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100389get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000390{
391 int c;
392 int l = 1;
393 char_u *p = *pp;
394
Bram Moolenaar985079c2019-02-16 17:07:47 +0100395 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000396 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000397 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000398 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000399 if (p[l + 2] == '=' && p[l + 3] == ']')
400 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000401 if (has_mbyte)
402 c = mb_ptr2char(p + 2);
403 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000404 c = p[2];
405 *pp += l + 4;
406 return c;
407 }
408 }
409 return 0;
410}
411
Bram Moolenaar2c704a72010-06-03 21:17:25 +0200412#ifdef EBCDIC
413/*
414 * Table for equivalence class "c". (IBM-1047)
415 */
Bram Moolenaar5843f5f2019-08-20 20:13:45 +0200416static char *EQUIVAL_CLASS_C[16] = {
Bram Moolenaar2c704a72010-06-03 21:17:25 +0200417 "A\x62\x63\x64\x65\x66\x67",
418 "C\x68",
419 "E\x71\x72\x73\x74",
420 "I\x75\x76\x77\x78",
421 "N\x69",
Bram Moolenaar22e42152016-04-03 14:02:02 +0200422 "O\xEB\xEC\xED\xEE\xEF\x80",
Bram Moolenaar2c704a72010-06-03 21:17:25 +0200423 "U\xFB\xFC\xFD\xFE",
424 "Y\xBA",
425 "a\x42\x43\x44\x45\x46\x47",
426 "c\x48",
427 "e\x51\x52\x53\x54",
428 "i\x55\x56\x57\x58",
429 "n\x49",
Bram Moolenaar22e42152016-04-03 14:02:02 +0200430 "o\xCB\xCC\xCD\xCE\xCF\x70",
Bram Moolenaar2c704a72010-06-03 21:17:25 +0200431 "u\xDB\xDC\xDD\xDE",
432 "y\x8D\xDF",
433};
434#endif
435
Bram Moolenaardf177f62005-02-22 08:39:57 +0000436/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000437 * Check for a collating element "[.a.]". "pp" points to the '['.
438 * Returns a character. Zero means that no item was recognized. Otherwise
439 * "pp" is advanced to after the item.
440 * Currently only single characters are recognized!
441 */
442 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100443get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000444{
445 int c;
446 int l = 1;
447 char_u *p = *pp;
448
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100449 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000450 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000451 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000452 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000453 if (p[l + 2] == '.' && p[l + 3] == ']')
454 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000455 if (has_mbyte)
456 c = mb_ptr2char(p + 2);
457 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000458 c = p[2];
459 *pp += l + 4;
460 return c;
461 }
462 }
463 return 0;
464}
465
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100466static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
467static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200468
469 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100470get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200471{
472 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
473 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
474}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000475
476/*
477 * Skip over a "[]" range.
478 * "p" must point to the character after the '['.
479 * The returned pointer is on the matching ']', or the terminating NUL.
480 */
481 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100482skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000483{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000484 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000485
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100486 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000487 ++p;
488 if (*p == ']' || *p == '-')
489 ++p;
490 while (*p != NUL && *p != ']')
491 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000492 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000493 p += l;
494 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000495 if (*p == '-')
496 {
497 ++p;
498 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100499 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000500 }
501 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200502 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000503 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200504 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000505 p += 2;
506 else if (*p == '[')
507 {
508 if (get_char_class(&p) == CLASS_NONE
509 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200510 && get_coll_element(&p) == 0
511 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100512 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000513 }
514 else
515 ++p;
516 }
517
518 return p;
519}
520
521/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000522 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200523 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000524 * Take care of characters with a backslash in front of it.
525 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000526 */
527 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100528skip_regexp(
529 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200530 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200531 int magic)
532{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100533 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200534}
535
536/*
537 * Call skip_regexp() and when the delimiter does not match give an error and
538 * return NULL.
539 */
540 char_u *
541skip_regexp_err(
542 char_u *startp,
543 int delim,
544 int magic)
545{
546 char_u *p = skip_regexp(startp, delim, magic);
547
548 if (*p != delim)
549 {
Bram Moolenaara6f79292022-01-04 21:30:47 +0000550 semsg(_(e_missing_delimiter_after_search_pattern_str), startp);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200551 return NULL;
552 }
553 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200554}
555
556/*
557 * skip_regexp() with extra arguments:
558 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
559 * expression and change "\?" to "?". If "*newp" is not NULL the expression
560 * is changed in-place.
561 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100562 * If "magic_val" is not NULL, returns the effective magicness of the pattern
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200563 */
564 char_u *
565skip_regexp_ex(
566 char_u *startp,
567 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100568 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200569 char_u **newp,
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100570 int *dropped,
571 magic_T *magic_val)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000572{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100573 magic_T mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000574 char_u *p = startp;
575
576 if (magic)
577 mymagic = MAGIC_ON;
578 else
579 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200580 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000581
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100582 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000583 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100584 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000585 break;
586 if ((p[0] == '[' && mymagic >= MAGIC_ON)
587 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
588 {
589 p = skip_anyof(p + 1);
590 if (p[0] == NUL)
591 break;
592 }
593 else if (p[0] == '\\' && p[1] != NUL)
594 {
595 if (dirc == '?' && newp != NULL && p[1] == '?')
596 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100597 // change "\?" to "?", make a copy first.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000598 if (*newp == NULL)
599 {
600 *newp = vim_strsave(startp);
601 if (*newp != NULL)
602 p = *newp + (p - startp);
603 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200604 if (dropped != NULL)
605 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000606 if (*newp != NULL)
Bram Moolenaar446cb832008-06-24 21:56:24 +0000607 STRMOVE(p, p + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000608 else
609 ++p;
610 }
611 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100612 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000613 if (*p == 'v')
614 mymagic = MAGIC_ALL;
615 else if (*p == 'V')
616 mymagic = MAGIC_NONE;
617 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000618 }
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100619 if (magic_val != NULL)
620 *magic_val = mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000621 return p;
622}
623
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +0200624/*
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200625 * Functions for getting characters from the regexp input.
Bram Moolenaar1ef9bbe2017-06-17 20:08:20 +0200626 */
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100627static int prevchr_len; // byte length of previous char
Bram Moolenaar0270f382018-07-17 05:43:58 +0200628static int at_start; // True when on the first character
629static int prev_at_start; // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100630
Bram Moolenaar071d4272004-06-13 20:20:40 +0000631/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200632 * Start parsing at "str".
633 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000634 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100635initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000636{
637 regparse = str;
638 prevchr_len = 0;
639 curchr = prevprevchr = prevchr = nextchr = -1;
640 at_start = TRUE;
641 prev_at_start = FALSE;
642}
643
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200644/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200645 * Save the current parse state, so that it can be restored and parsing
646 * starts in the same state again.
647 */
648 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100649save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200650{
651 ps->regparse = regparse;
652 ps->prevchr_len = prevchr_len;
653 ps->curchr = curchr;
654 ps->prevchr = prevchr;
655 ps->prevprevchr = prevprevchr;
656 ps->nextchr = nextchr;
657 ps->at_start = at_start;
658 ps->prev_at_start = prev_at_start;
659 ps->regnpar = regnpar;
660}
661
662/*
663 * Restore a previously saved parse state.
664 */
665 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100666restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200667{
668 regparse = ps->regparse;
669 prevchr_len = ps->prevchr_len;
670 curchr = ps->curchr;
671 prevchr = ps->prevchr;
672 prevprevchr = ps->prevprevchr;
673 nextchr = ps->nextchr;
674 at_start = ps->at_start;
675 prev_at_start = ps->prev_at_start;
676 regnpar = ps->regnpar;
677}
678
679
680/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200681 * Get the next character without advancing.
682 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000683 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100684peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000685{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000686 static int after_slash = FALSE;
687
Bram Moolenaar071d4272004-06-13 20:20:40 +0000688 if (curchr == -1)
689 {
690 switch (curchr = regparse[0])
691 {
692 case '.':
693 case '[':
694 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100695 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000696 if (reg_magic >= MAGIC_ON)
697 curchr = Magic(curchr);
698 break;
699 case '(':
700 case ')':
701 case '{':
702 case '%':
703 case '+':
704 case '=':
705 case '?':
706 case '@':
707 case '!':
708 case '&':
709 case '|':
710 case '<':
711 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100712 case '#': // future ext.
713 case '"': // future ext.
714 case '\'': // future ext.
715 case ',': // future ext.
716 case '-': // future ext.
717 case ':': // future ext.
718 case ';': // future ext.
719 case '`': // future ext.
720 case '/': // Can't be used in / command
721 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000722 if (reg_magic == MAGIC_ALL)
723 curchr = Magic(curchr);
724 break;
725 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100726 // * is not magic as the very first character, eg "?*ptr", when
727 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
728 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000729 if (reg_magic >= MAGIC_ON
730 && !at_start
731 && !(prev_at_start && prevchr == Magic('^'))
732 && (after_slash
733 || (prevchr != Magic('(')
734 && prevchr != Magic('&')
735 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000736 curchr = Magic('*');
737 break;
738 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100739 // '^' is only magic as the very first character and if it's after
740 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000741 if (reg_magic >= MAGIC_OFF
742 && (at_start
743 || reg_magic == MAGIC_ALL
744 || prevchr == Magic('(')
745 || prevchr == Magic('|')
746 || prevchr == Magic('&')
747 || prevchr == Magic('n')
748 || (no_Magic(prevchr) == '('
749 && prevprevchr == Magic('%'))))
750 {
751 curchr = Magic('^');
752 at_start = TRUE;
753 prev_at_start = FALSE;
754 }
755 break;
756 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100757 // '$' is only magic as the very last char and if it's in front of
758 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000759 if (reg_magic >= MAGIC_OFF)
760 {
761 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200762 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000763
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100764 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000765 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200766 || p[1] == 'm' || p[1] == 'M'
767 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
768 {
769 if (p[1] == 'v')
770 is_magic_all = TRUE;
771 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
772 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000773 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200774 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000775 if (p[0] == NUL
776 || (p[0] == '\\'
777 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
778 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200779 || (is_magic_all
780 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000781 || reg_magic == MAGIC_ALL)
782 curchr = Magic('$');
783 }
784 break;
785 case '\\':
786 {
787 int c = regparse[1];
788
789 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100790 curchr = '\\'; // trailing '\'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000791 else if (
792#ifdef EBCDIC
793 vim_strchr(META, c)
794#else
795 c <= '~' && META_flags[c]
796#endif
797 )
798 {
799 /*
800 * META contains everything that may be magic sometimes,
801 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200802 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000803 * magicness. Therefore, \ is so meta-magic that it is
804 * not in META.
805 */
806 curchr = -1;
807 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100808 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000809 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000810 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000811 peekchr();
812 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000813 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000814 curchr = toggle_Magic(curchr);
815 }
816 else if (vim_strchr(REGEXP_ABBR, c))
817 {
818 /*
819 * Handle abbreviations, like "\t" for TAB -- webb
820 */
821 curchr = backslash_trans(c);
822 }
823 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
824 curchr = toggle_Magic(c);
825 else
826 {
827 /*
828 * Next character can never be (made) magic?
829 * Then backslashing it won't do anything.
830 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000831 if (has_mbyte)
832 curchr = (*mb_ptr2char)(regparse + 1);
833 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000834 curchr = c;
835 }
836 break;
837 }
838
Bram Moolenaar071d4272004-06-13 20:20:40 +0000839 default:
840 if (has_mbyte)
841 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000842 }
843 }
844
845 return curchr;
846}
847
848/*
849 * Eat one lexed character. Do this in a way that we can undo it.
850 */
851 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100852skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000853{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100854 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000855 if (*regparse == '\\')
856 prevchr_len = 1;
857 else
858 prevchr_len = 0;
859 if (regparse[prevchr_len] != NUL)
860 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000861 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100862 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000863 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000864 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000865 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000866 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000867 ++prevchr_len;
868 }
869 regparse += prevchr_len;
870 prev_at_start = at_start;
871 at_start = FALSE;
872 prevprevchr = prevchr;
873 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100874 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000875 nextchr = -1;
876}
877
878/*
879 * Skip a character while keeping the value of prev_at_start for at_start.
880 * prevchr and prevprevchr are also kept.
881 */
882 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100883skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000884{
885 int as = prev_at_start;
886 int pr = prevchr;
887 int prpr = prevprevchr;
888
889 skipchr();
890 at_start = as;
891 prevchr = pr;
892 prevprevchr = prpr;
893}
894
/*
 * Get the next character from the pattern: the result of peekchr(), after
 * which the character is consumed with skipchr().
 * We know about magic and such, therefore we need a lexical analyzer.
 */
    static int
getchr(void)
{
    int     lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
907
908/*
909 * put character back. Works only once!
910 */
911 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100912ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000913{
914 nextchr = curchr;
915 curchr = prevchr;
916 prevchr = prevprevchr;
917 at_start = prev_at_start;
918 prev_at_start = FALSE;
919
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100920 // Backup regparse, so that it's at the same position as before the
921 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000922 regparse -= prevchr_len;
923}
924
925/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000926 * Get and return the value of the hex string at the current position.
927 * Return -1 if there is no valid hex number.
928 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000929 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000930 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000931 * The parameter controls the maximum number of input characters. This will be
932 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
933 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100934 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100935gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000936{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100937 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000938 int c;
939 int i;
940
941 for (i = 0; i < maxinputlen; ++i)
942 {
943 c = regparse[0];
944 if (!vim_isxdigit(c))
945 break;
946 nr <<= 4;
947 nr |= hex2nr(c);
948 ++regparse;
949 }
950
951 if (i == 0)
952 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100953 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000954}
955
956/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +0200957 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000958 * current position. Return -1 for invalid. Consumes all digits.
959 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100960 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100961getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000962{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100963 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000964 int c;
965 int i;
966
967 for (i = 0; ; ++i)
968 {
969 c = regparse[0];
970 if (c < '0' || c > '9')
971 break;
972 nr *= 10;
973 nr += c - '0';
974 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100975 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000976 }
977
978 if (i == 0)
979 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100980 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000981}
982
983/*
984 * get and return the value of the octal string immediately after the current
985 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
986 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
987 * treat 8 or 9 as recognised characters. Position is updated:
988 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000989 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000990 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100991 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100992getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000993{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100994 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000995 int c;
996 int i;
997
998 for (i = 0; i < 3 && nr < 040; ++i)
999 {
1000 c = regparse[0];
1001 if (c < '0' || c > '7')
1002 break;
1003 nr <<= 3;
1004 nr |= hex2nr(c);
1005 ++regparse;
1006 }
1007
1008 if (i == 0)
1009 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001010 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001011}
1012
1013/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001014 * read_limits - Read two integers to be taken as a minimum and maximum.
1015 * If the first character is '-', then the range is reversed.
1016 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1017 * missing, a very big number is the default.
1018 */
1019 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001020read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001021{
1022 int reverse = FALSE;
1023 char_u *first_char;
1024 long tmp;
1025
1026 if (*regparse == '-')
1027 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001028 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001029 regparse++;
1030 reverse = TRUE;
1031 }
1032 first_char = regparse;
1033 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001034 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001035 {
1036 if (vim_isdigit(*++regparse))
1037 *maxval = getdigits(&regparse);
1038 else
1039 *maxval = MAX_LIMIT;
1040 }
1041 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001042 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001043 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001044 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001045 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001046 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001047 if (*regparse != '}')
Bram Moolenaar1d423ef2022-01-02 21:26:16 +00001048 EMSG2_RET_FAIL(_(e_syntax_error_in_str_curlies),
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001049 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001050
1051 /*
1052 * Reverse the range if there was a '-', or make sure it is in the right
1053 * order otherwise.
1054 */
1055 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1056 {
1057 tmp = *minval;
1058 *minval = *maxval;
1059 *maxval = tmp;
1060 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001061 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001062 return OK;
1063}
1064
1065/*
1066 * vim_regexec and friends
1067 */
1068
1069/*
1070 * Global work variables for vim_regexec().
1071 */
1072
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001073static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001074#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001075static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001076#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001077static void reg_nextline(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001078static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001079
1080/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001081 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1082 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001083 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001084 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001085static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001086static unsigned reg_tofreelen;
1087
1088/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001089 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001090 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001091 * done:
1092 * single-line multi-line
1093 * reg_match &regmatch_T NULL
1094 * reg_mmatch NULL &regmmatch_T
1095 * reg_startp reg_match->startp <invalid>
1096 * reg_endp reg_match->endp <invalid>
1097 * reg_startpos <invalid> reg_mmatch->startpos
1098 * reg_endpos <invalid> reg_mmatch->endpos
1099 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001100 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001101 * reg_firstlnum <invalid> first line in which to search
1102 * reg_maxline 0 last line nr
1103 * reg_line_lbr FALSE or TRUE FALSE
1104 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001105typedef struct {
1106 regmatch_T *reg_match;
1107 regmmatch_T *reg_mmatch;
1108 char_u **reg_startp;
1109 char_u **reg_endp;
1110 lpos_T *reg_startpos;
1111 lpos_T *reg_endpos;
1112 win_T *reg_win;
1113 buf_T *reg_buf;
1114 linenr_T reg_firstlnum;
1115 linenr_T reg_maxline;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001116 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001117
Bram Moolenaar0270f382018-07-17 05:43:58 +02001118 // The current match-position is stord in these variables:
1119 linenr_T lnum; // line number, relative to first line
1120 char_u *line; // start of current line
Bram Moolenaar64066b92021-11-17 18:22:56 +00001121 char_u *input; // current input, points into "line"
Bram Moolenaar0270f382018-07-17 05:43:58 +02001122
1123 int need_clear_subexpr; // subexpressions still need to be cleared
1124#ifdef FEAT_SYN_HL
1125 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1126 // cleared
1127#endif
1128
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001129 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1130 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1131 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001132 int reg_ic;
1133
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001134 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1135 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001136 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001137
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001138 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1139 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001140 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001141
1142 // State for the NFA engine regexec.
1143 int nfa_has_zend; // NFA regexp \ze operator encountered.
1144 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1145 int nfa_nsubexpr; // Number of sub expressions actually being used
1146 // during execution. 1 if only the whole match
1147 // (subexpr 0) is used.
1148 // listid is global, so that it increases on recursive calls to
1149 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1150 // all the states.
1151 int nfa_listid;
1152 int nfa_alt_listid;
1153
1154#ifdef FEAT_SYN_HL
1155 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1156#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001157} regexec_T;
1158
1159static regexec_T rex;
1160static int rex_in_use = FALSE;
1161
Bram Moolenaar071d4272004-06-13 20:20:40 +00001162/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001163 * Return TRUE if character 'c' is included in 'iskeyword' option for
1164 * "reg_buf" buffer.
1165 */
1166 static int
1167reg_iswordc(int c)
1168{
1169 return vim_iswordc_buf(c, rex.reg_buf);
1170}
1171
1172/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001173 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1174 */
1175 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001176reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001177{
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001178 // when looking behind for a match/no-match lnum is negative. But we
1179 // can't go before line 1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001180 if (rex.reg_firstlnum + lnum < 1)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001181 return NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001182 if (lnum > rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001183 // Must have matched the "\n" in the last line.
Bram Moolenaarae5bce12005-08-15 21:41:48 +00001184 return (char_u *)"";
Bram Moolenaar6100d022016-10-02 16:51:57 +02001185 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, FALSE);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001186}
1187
#ifdef FEAT_SYN_HL
// Workspace to mark the beginning and end of \z(...\) matches.  The pointer
// arrays are cleared for a single-line match, the position arrays for a
// multi-line match, see cleanup_zsubexpr().
static char_u   *reg_startzp[NSUBEXP];      // beginning of match
static char_u   *reg_endzp[NSUBEXP];        // end of match
static lpos_T   reg_startzpos[NSUBEXP];     // idem, beginning pos
static lpos_T   reg_endzpos[NSUBEXP];       // idem, end pos
#endif

// TRUE if using multi-line regexp.
#define REG_MULTI       (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001197
Bram Moolenaar071d4272004-06-13 20:20:40 +00001198#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001199/*
1200 * Create a new extmatch and mark it as referenced once.
1201 */
1202 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001203make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001204{
1205 reg_extmatch_T *em;
1206
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001207 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001208 if (em != NULL)
1209 em->refcnt = 1;
1210 return em;
1211}
1212
1213/*
1214 * Add a reference to an extmatch.
1215 */
1216 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001217ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001218{
1219 if (em != NULL)
1220 em->refcnt++;
1221 return em;
1222}
1223
1224/*
1225 * Remove a reference to an extmatch. If there are no references left, free
1226 * the info.
1227 */
1228 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001229unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001230{
1231 int i;
1232
1233 if (em != NULL && --em->refcnt <= 0)
1234 {
1235 for (i = 0; i < NSUBEXP; ++i)
1236 vim_free(em->matches[i]);
1237 vim_free(em);
1238 }
1239}
1240#endif
1241
1242/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001243 * Get class of previous character.
1244 */
1245 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001246reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001247{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001248 if (rex.input > rex.line)
1249 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001250 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001251 return -1;
1252}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001253
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001254/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001255 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001256 */
1257 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001258reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001259{
1260 pos_T top, bot;
1261 linenr_T lnum;
1262 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001263 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001264 int mode;
1265 colnr_T start, end;
1266 colnr_T start2, end2;
1267 colnr_T cols;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001268 colnr_T curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001269
Bram Moolenaar679d66c2022-01-30 16:42:56 +00001270 // Check if the buffer is the current buffer and not using a string.
1271 if (rex.reg_buf != curbuf || VIsual.lnum == 0 || rex.reg_maxline == 0)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001272 return FALSE;
1273
1274 if (VIsual_active)
1275 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001276 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001277 {
1278 top = VIsual;
1279 bot = wp->w_cursor;
1280 }
1281 else
1282 {
1283 top = wp->w_cursor;
1284 bot = VIsual;
1285 }
1286 mode = VIsual_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001287 curswant = wp->w_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001288 }
1289 else
1290 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001291 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001292 {
1293 top = curbuf->b_visual.vi_start;
1294 bot = curbuf->b_visual.vi_end;
1295 }
1296 else
1297 {
1298 top = curbuf->b_visual.vi_end;
1299 bot = curbuf->b_visual.vi_start;
1300 }
1301 mode = curbuf->b_visual.vi_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001302 curswant = curbuf->b_visual.vi_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001303 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001304 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001305 if (lnum < top.lnum || lnum > bot.lnum)
1306 return FALSE;
1307
Bram Moolenaar4c13e5e2021-12-30 14:49:43 +00001308 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001309 if (mode == 'v')
1310 {
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001311 if ((lnum == top.lnum && col < top.col)
1312 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1313 return FALSE;
1314 }
1315 else if (mode == Ctrl_V)
1316 {
1317 getvvcol(wp, &top, &start, NULL, &end);
1318 getvvcol(wp, &bot, &start2, NULL, &end2);
1319 if (start2 < start)
1320 start = start2;
1321 if (end2 > end)
1322 end = end2;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001323 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001324 end = MAXCOL;
Bram Moolenaar4c13e5e2021-12-30 14:49:43 +00001325
1326 // getvvcol() flushes rex.line, need to get it again
1327 rex.line = reg_getline(rex.lnum);
1328 rex.input = rex.line + col;
1329
1330 cols = win_linetabsize(wp, rex.line, col);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001331 if (cols < start || cols > end - (*p_sel == 'e'))
1332 return FALSE;
1333 }
1334 return TRUE;
1335}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001336
Bram Moolenaar071d4272004-06-13 20:20:40 +00001337/*
1338 * Check the regexp program for its magic number.
1339 * Return TRUE if it's wrong.
1340 */
1341 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001342prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001343{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001344 regprog_T *prog;
1345
Bram Moolenaar6100d022016-10-02 16:51:57 +02001346 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001347 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001348 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001349 return FALSE;
1350
1351 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001352 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001353 emsg(_(e_corrupted_regexp_program));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001354 return TRUE;
1355 }
1356 return FALSE;
1357}
1358
1359/*
1360 * Cleanup the subexpressions, if this wasn't done yet.
1361 * This construction is used to clear the subexpressions only when they are
1362 * used (to increase speed).
1363 */
1364 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001365cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001366{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001367 if (rex.need_clear_subexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001368 {
1369 if (REG_MULTI)
1370 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001371 // Use 0xff to set lnum to -1
Bram Moolenaar6100d022016-10-02 16:51:57 +02001372 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1373 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001374 }
1375 else
1376 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001377 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1378 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001379 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001380 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001381 }
1382}
1383
1384#ifdef FEAT_SYN_HL
1385 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001386cleanup_zsubexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001387{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001388 if (rex.need_clear_zsubexpr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001389 {
1390 if (REG_MULTI)
1391 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001392 // Use 0xff to set lnum to -1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001393 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1394 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1395 }
1396 else
1397 {
1398 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
1399 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
1400 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001401 rex.need_clear_zsubexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001402 }
1403}
1404#endif
1405
1406/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001407 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001408 */
1409 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001410reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001411{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001412 rex.line = reg_getline(++rex.lnum);
1413 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001414 fast_breakcheck();
1415}
1416
1417/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001418 * Check whether a backreference matches.
1419 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001420 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1421 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001422 */
1423 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001424match_with_backref(
1425 linenr_T start_lnum,
1426 colnr_T start_col,
1427 linenr_T end_lnum,
1428 colnr_T end_col,
1429 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001430{
1431 linenr_T clnum = start_lnum;
1432 colnr_T ccol = start_col;
1433 int len;
1434 char_u *p;
1435
1436 if (bytelen != NULL)
1437 *bytelen = 0;
1438 for (;;)
1439 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001440 // Since getting one line may invalidate the other, need to make copy.
1441 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001442 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001443 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001444 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001445 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1446 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001447 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001448 vim_free(reg_tofree);
1449 reg_tofree = alloc(len);
1450 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001451 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001452 reg_tofreelen = len;
1453 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001454 STRCPY(reg_tofree, rex.line);
1455 rex.input = reg_tofree + (rex.input - rex.line);
1456 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001457 }
1458
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001459 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001460 p = reg_getline(clnum);
1461 if (clnum == end_lnum)
1462 len = end_col - ccol;
1463 else
1464 len = (int)STRLEN(p + ccol);
1465
Bram Moolenaar0270f382018-07-17 05:43:58 +02001466 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001467 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001468 if (bytelen != NULL)
1469 *bytelen += len;
1470 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001471 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001472 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001473 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001474
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001475 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001476 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001477 if (bytelen != NULL)
1478 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001479 ++clnum;
1480 ccol = 0;
1481 if (got_int)
1482 return RA_FAIL;
1483 }
1484
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001485 // found a match! Note that rex.line may now point to a copy of the line,
1486 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001487 return RA_MATCH;
1488}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001489
Bram Moolenaarfb031402014-09-09 17:18:49 +02001490/*
1491 * Used in a place where no * or \+ can follow.
1492 */
1493 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001494re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001495{
1496 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001497 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001498 semsg(_(e_nfa_regexp_cannot_repeat_str), what);
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001499 rc_did_emsg = TRUE;
1500 return FAIL;
1501 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001502 return OK;
1503}
1504
// Decomposition of one character: "a" is the base character, "b" and "c" are
// the characters that follow it, zero when not used.
typedef struct
{
    int a, b, c;
} decomp_T;


// Decomposition of the characters 0xfb20 - 0xfb4f, indexed by the character
// minus 0xfb20.  An unused code point decomposes to itself.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},                // 0xfb20       alt ayin
    {0x5d0,0,0},                // 0xfb21       alt alef
    {0x5d3,0,0},                // 0xfb22       alt dalet
    {0x5d4,0,0},                // 0xfb23       alt he
    {0x5db,0,0},                // 0xfb24       alt kaf
    {0x5dc,0,0},                // 0xfb25       alt lamed
    {0x5dd,0,0},                // 0xfb26       alt mem-sofit
    {0x5e8,0,0},                // 0xfb27       alt resh
    {0x5ea,0,0},                // 0xfb28       alt tav
    {'+', 0, 0},                // 0xfb29       alt plus
    {0x5e9, 0x5c1, 0},          // 0xfb2a       shin+shin-dot
    {0x5e9, 0x5c2, 0},          // 0xfb2b       shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},      // 0xfb2c       shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},      // 0xfb2d       shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},          // 0xfb2e       alef+patah
    {0x5d0, 0x5b8, 0},          // 0xfb2f       alef+qamats
    {0x5d0, 0x5bc, 0},          // 0xfb30       alef+mapiq
    {0x5d1, 0x5bc, 0},          // 0xfb31       bet+dagesh
    {0x5d2, 0x5bc, 0},          // 0xfb32       gimel+dagesh
    {0x5d3, 0x5bc, 0},          // 0xfb33       dalet+dagesh
    {0x5d4, 0x5bc, 0},          // 0xfb34       he+dagesh
    {0x5d5, 0x5bc, 0},          // 0xfb35       vav+dagesh
    {0x5d6, 0x5bc, 0},          // 0xfb36       zayin+dagesh
    {0xfb37, 0, 0},             // 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},          // 0xfb38       tet+dagesh
    {0x5d9, 0x5bc, 0},          // 0xfb39       yud+dagesh
    {0x5da, 0x5bc, 0},          // 0xfb3a       kaf sofit+dagesh
    {0x5db, 0x5bc, 0},          // 0xfb3b       kaf+dagesh
    {0x5dc, 0x5bc, 0},          // 0xfb3c       lamed+dagesh
    {0xfb3d, 0, 0},             // 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},          // 0xfb3e       mem+dagesh
    {0xfb3f, 0, 0},             // 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},          // 0xfb40       nun+dagesh
    {0x5e1, 0x5bc, 0},          // 0xfb41       samech+dagesh
    {0xfb42, 0, 0},             // 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},          // 0xfb43       pe sofit+dagesh
    {0x5e4, 0x5bc,0},           // 0xfb44       pe+dagesh
    {0xfb45, 0, 0},             // 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},          // 0xfb46       tsadi+dagesh
    {0x5e7, 0x5bc, 0},          // 0xfb47       qof+dagesh
    {0x5e8, 0x5bc, 0},          // 0xfb48       resh+dagesh
    {0x5e9, 0x5bc, 0},          // 0xfb49       shin+dagesh
    {0x5ea, 0x5bc, 0},          // 0xfb4a       tav+dagesh
    {0x5d5, 0x5b9, 0},          // 0xfb4b       vav+holam
    {0x5d1, 0x5bf, 0},          // 0xfb4c       bet+rafe
    {0x5db, 0x5bf, 0},          // 0xfb4d       kaf+rafe
    {0x5e4, 0x5bf, 0},          // 0xfb4e       pe+rafe
    {0x5d0, 0x5dc, 0}           // 0xfb4f       alef-lamed
};

/*
 * Decompose the character "c" into up to three characters, which are stored
 * in "*c1", "*c2" and "*c3".
 * A character in the range 0xfb20 - 0xfb4f is looked up in decomp_table[].
 * Any other character is stored in "*c1" unmodified, "*c2" and "*c3" are set
 * to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
        decomp_T    *d = &decomp_table[c - 0xfb20];

        *c1 = d->a;
        *c2 = d->b;
        *c3 = d->c;
    }
    else
    {
        *c1 = c;
        *c2 = *c3 = 0;
    }
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001582
1583/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001584 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001585 * Return 0 if strings match, non-zero otherwise.
1586 * Correct the length "*n" when composing characters are ignored.
1587 */
1588 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001589cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001590{
1591 int result;
1592
Bram Moolenaar6100d022016-10-02 16:51:57 +02001593 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001594 result = STRNCMP(s1, s2, *n);
1595 else
1596 result = MB_STRNICMP(s1, s2, *n);
1597
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001598 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001599 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001600 {
1601 char_u *str1, *str2;
1602 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001603 int junk;
1604
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001605 // we have to handle the strcmp ourselves, since it is necessary to
1606 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001607 str1 = s1;
1608 str2 = s2;
1609 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001610 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001611 {
1612 c1 = mb_ptr2char_adv(&str1);
1613 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001614
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001615 // Decompose the character if necessary, into 'base' characters.
1616 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001617 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001618 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001619 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001620 mb_decompose(c1, &c11, &junk, &junk);
1621 mb_decompose(c2, &c12, &junk, &junk);
1622 c1 = c11;
1623 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001624 if (c11 != c12
1625 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001626 break;
1627 }
1628 }
1629 result = c2 - c1;
1630 if (result == 0)
1631 *n = (int)(str2 - s2);
1632 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001633
1634 return result;
1635}
1636
1637/*
1638 * cstrchr: This function is used a lot for simple searches, keep it fast!
1639 */
1640 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001641cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001642{
1643 char_u *p;
1644 int cc;
1645
Bram Moolenaara12a1612019-01-24 16:39:02 +01001646 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001647 return vim_strchr(s, c);
1648
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001649 // tolower() and toupper() can be slow, comparing twice should be a lot
1650 // faster (esp. when using MS Visual C++!).
1651 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001652 if (enc_utf8 && c > 0x80)
1653 cc = utf_fold(c);
1654 else
Bram Moolenaara245a5b2007-08-11 11:58:23 +00001655 if (MB_ISUPPER(c))
1656 cc = MB_TOLOWER(c);
1657 else if (MB_ISLOWER(c))
1658 cc = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001659 else
1660 return vim_strchr(s, c);
1661
Bram Moolenaar071d4272004-06-13 20:20:40 +00001662 if (has_mbyte)
1663 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001664 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001665 {
1666 if (enc_utf8 && c > 0x80)
1667 {
1668 if (utf_fold(utf_ptr2char(p)) == cc)
1669 return p;
1670 }
1671 else if (*p == c || *p == cc)
1672 return p;
1673 }
1674 }
1675 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001676 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001677 for (p = s; *p != NUL; ++p)
1678 if (*p == c || *p == cc)
1679 return p;
1680
1681 return NULL;
1682}
1683
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001684////////////////////////////////////////////////////////////////
1685// regsub stuff //
1686////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001687
/*
 * The case conversion functions below return the function to use for the
 * next character.  Ideally fptr_T would be a pointer to a function
 * returning a pointer to a function returning a pointer to a function ...
 * A type cannot refer to itself like that, so fptr_T is declared as a
 * pointer to a function returning a void pointer instead.  This should work
 * for all compilers.
 */
typedef void *(*fptr_T)(int *, int);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001695
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001696static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int copy, int magic, int backslash);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001697
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001698 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001699do_upper(int *d, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001700{
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001701 *d = MB_TOUPPER(c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001702
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001703 return (fptr_T)NULL;
1704}
1705
1706 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001707do_Upper(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001708{
1709 *d = MB_TOUPPER(c);
1710
1711 return (fptr_T)do_Upper;
1712}
1713
1714 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001715do_lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001716{
1717 *d = MB_TOLOWER(c);
1718
1719 return (fptr_T)NULL;
1720}
1721
1722 static fptr_T
Bram Moolenaar05540972016-01-30 20:31:25 +01001723do_Lower(int *d, int c)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001724{
1725 *d = MB_TOLOWER(c);
1726
1727 return (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001728}
1729
1730/*
1731 * regtilde(): Replace tildes in the pattern by the old pattern.
1732 *
1733 * Short explanation of the tilde: It stands for the previous replacement
1734 * pattern. If that previous pattern also contains a ~ we should go back a
1735 * step further... But we insert the previous pattern into the current one
1736 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001737 * This still does not handle the case where "magic" changes. So require the
1738 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001739 *
1740 * The tildes are parsed once before the first call to vim_regsub().
1741 */
1742 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001743regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001744{
1745 char_u *newsub = source;
1746 char_u *tmpsub;
1747 char_u *p;
1748 int len;
1749 int prevlen;
1750
1751 for (p = newsub; *p; ++p)
1752 {
1753 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
1754 {
1755 if (reg_prev_sub != NULL)
1756 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001757 // length = len(newsub) - 1 + len(prev_sub) + 1
Bram Moolenaar071d4272004-06-13 20:20:40 +00001758 prevlen = (int)STRLEN(reg_prev_sub);
Bram Moolenaar964b3742019-05-24 18:54:09 +02001759 tmpsub = alloc(STRLEN(newsub) + prevlen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001760 if (tmpsub != NULL)
1761 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001762 // copy prefix
1763 len = (int)(p - newsub); // not including ~
Bram Moolenaar071d4272004-06-13 20:20:40 +00001764 mch_memmove(tmpsub, newsub, (size_t)len);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001765 // interpret tilde
Bram Moolenaar071d4272004-06-13 20:20:40 +00001766 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001767 // copy postfix
Bram Moolenaar071d4272004-06-13 20:20:40 +00001768 if (!magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001769 ++p; // back off backslash
Bram Moolenaar071d4272004-06-13 20:20:40 +00001770 STRCPY(tmpsub + len + prevlen, p + 1);
1771
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001772 if (newsub != source) // already allocated newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001773 vim_free(newsub);
1774 newsub = tmpsub;
1775 p = newsub + len + prevlen;
1776 }
1777 }
1778 else if (magic)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001779 STRMOVE(p, p + 1); // remove '~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001780 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001781 STRMOVE(p, p + 2); // remove '\~'
Bram Moolenaar071d4272004-06-13 20:20:40 +00001782 --p;
1783 }
1784 else
1785 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001786 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001787 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001788 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001789 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001790 }
1791 }
1792
1793 vim_free(reg_prev_sub);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001794 if (newsub != source) // newsub was allocated, just keep it
Bram Moolenaar071d4272004-06-13 20:20:40 +00001795 reg_prev_sub = newsub;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001796 else // no ~ found, need to save newsub
Bram Moolenaar071d4272004-06-13 20:20:40 +00001797 reg_prev_sub = vim_strsave(newsub);
1798 return newsub;
1799}
1800
#ifdef FEAT_EVAL
// TRUE when submatch() can be used
static int can_f_submatch = FALSE;

// The match information used for reg_submatch().  It is copied from "rex"
// before the substitute expression is evaluated.  Needed for when the
// substitution string is an expression that contains a call to substitute()
// and submatch().
typedef struct {
    regmatch_T  *sm_match;      // copy of rex.reg_match
    regmmatch_T *sm_mmatch;     // copy of rex.reg_mmatch
    linenr_T    sm_firstlnum;   // copy of rex.reg_firstlnum
    linenr_T    sm_maxline;     // copy of rex.reg_maxline
    int         sm_line_lbr;    // copy of rex.reg_line_lbr
} regsubmatch_T;

// can only be used when can_f_submatch is TRUE
static regsubmatch_T rsm;
#endif
1817
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001818#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001819
1820/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001821 * Put the submatches in "argv[argskip]" which is a list passed into
1822 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001823 */
1824 static int
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001825fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, int argcount)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001826{
1827 listitem_T *li;
1828 int i;
1829 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001830 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001831
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001832 if (argcount == argskip)
1833 // called function doesn't take a submatches argument
1834 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001835
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001836 // Relies on sl_list to be the first item in staticList10_T.
1837 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001838
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001839 // There are always 10 list items in staticList10_T.
1840 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001841 for (i = 0; i < 10; ++i)
1842 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02001843 s = rsm.sm_match->startp[i];
1844 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001845 s = NULL;
1846 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02001847 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001848 li->li_tv.v_type = VAR_STRING;
1849 li->li_tv.vval.v_string = s;
1850 li = li->li_next;
1851 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01001852 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001853}
1854
1855 static void
1856clear_submatch_list(staticList10_T *sl)
1857{
1858 int i;
1859
1860 for (i = 0; i < 10; ++i)
1861 vim_free(sl->sl_items[i].li_tv.vval.v_string);
1862}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02001863#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02001864
Bram Moolenaar071d4272004-06-13 20:20:40 +00001865/*
1866 * vim_regsub() - perform substitutions after a vim_regexec() or
1867 * vim_regexec_multi() match.
1868 *
1869 * If "copy" is TRUE really copy into "dest".
1870 * If "copy" is FALSE nothing is copied, this is just to find out the length
1871 * of the result.
1872 *
1873 * If "backslash" is TRUE, a backslash will be removed later, need to double
1874 * them to keep them, and insert a backslash before a CR to avoid it being
1875 * replaced with a line break later.
1876 *
1877 * Note: The matched text must not change between the call of
1878 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
1879 * references invalid!
1880 *
1881 * Returns the size of the replacement, including terminating NUL.
1882 */
1883 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001884vim_regsub(
1885 regmatch_T *rmp,
1886 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001887 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001888 char_u *dest,
1889 int copy,
1890 int magic,
1891 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001892{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001893 int result;
1894 regexec_T rex_save;
1895 int rex_in_use_save = rex_in_use;
1896
1897 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001898 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001899 rex_save = rex;
1900 rex_in_use = TRUE;
1901
1902 rex.reg_match = rmp;
1903 rex.reg_mmatch = NULL;
1904 rex.reg_maxline = 0;
1905 rex.reg_buf = curbuf;
1906 rex.reg_line_lbr = TRUE;
1907 result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
1908
1909 rex_in_use = rex_in_use_save;
1910 if (rex_in_use)
1911 rex = rex_save;
1912
1913 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001914}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001915
1916 int
Bram Moolenaar05540972016-01-30 20:31:25 +01001917vim_regsub_multi(
1918 regmmatch_T *rmp,
1919 linenr_T lnum,
1920 char_u *source,
1921 char_u *dest,
1922 int copy,
1923 int magic,
1924 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001925{
Bram Moolenaar6100d022016-10-02 16:51:57 +02001926 int result;
1927 regexec_T rex_save;
1928 int rex_in_use_save = rex_in_use;
1929
1930 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001931 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001932 rex_save = rex;
1933 rex_in_use = TRUE;
1934
1935 rex.reg_match = NULL;
1936 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001937 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02001938 rex.reg_firstlnum = lnum;
1939 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
1940 rex.reg_line_lbr = FALSE;
1941 result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
1942
1943 rex_in_use = rex_in_use_save;
1944 if (rex_in_use)
1945 rex = rex_save;
1946
1947 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001948}
1949
1950 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001951vim_regsub_both(
1952 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001953 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01001954 char_u *dest,
1955 int copy,
1956 int magic,
1957 int backslash)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001958{
1959 char_u *src;
1960 char_u *dst;
1961 char_u *s;
1962 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001963 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001964 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01001965 fptr_T func_all = (fptr_T)NULL;
1966 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001967 linenr_T clnum = 0; // init for GCC
1968 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00001969#ifdef FEAT_EVAL
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001970 static char_u *eval_result = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001971#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00001972
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001973 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02001974 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001975 {
Bram Moolenaare29a27f2021-07-20 21:07:36 +02001976 emsg(_(e_null_argument));
Bram Moolenaar071d4272004-06-13 20:20:40 +00001977 return 0;
1978 }
1979 if (prog_magic_wrong())
1980 return 0;
1981 src = source;
1982 dst = dest;
1983
1984 /*
1985 * When the substitute part starts with "\=" evaluate it as an expression.
1986 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001987 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001988 {
1989#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001990 // To make sure that the length doesn't change between checking the
1991 // length and copying the string, and to speed up things, the
1992 // resulting string is saved from the call with "copy" == FALSE to the
1993 // call with "copy" == TRUE.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001994 if (copy)
1995 {
1996 if (eval_result != NULL)
1997 {
1998 STRCPY(dest, eval_result);
1999 dst += STRLEN(eval_result);
Bram Moolenaard23a8232018-02-10 18:45:26 +01002000 VIM_CLEAR(eval_result);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002001 }
2002 }
2003 else
2004 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002005 int prev_can_f_submatch = can_f_submatch;
2006 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002007
2008 vim_free(eval_result);
2009
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002010 // The expression may contain substitute(), which calls us
2011 // recursively. Make sure submatch() gets the text from the first
2012 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002013 if (can_f_submatch)
2014 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002015 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002016 rsm.sm_match = rex.reg_match;
2017 rsm.sm_mmatch = rex.reg_mmatch;
2018 rsm.sm_firstlnum = rex.reg_firstlnum;
2019 rsm.sm_maxline = rex.reg_maxline;
2020 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002021
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002022 if (expr != NULL)
2023 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002024 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002025 char_u buf[NUMBUFLEN];
2026 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002027 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002028 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002029
2030 rettv.v_type = VAR_STRING;
2031 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002032 argv[0].v_type = VAR_LIST;
2033 argv[0].vval.v_list = &matchList.sl_list;
2034 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002035 CLEAR_FIELD(funcexe);
Bram Moolenaar851f86b2021-12-13 14:26:44 +00002036 funcexe.fe_argv_func = fill_submatch_list;
2037 funcexe.fe_evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002038 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002039 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002040 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002041 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002042 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002043 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002044 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002045 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002046
Bram Moolenaar6100d022016-10-02 16:51:57 +02002047 s = partial_name(partial);
Bram Moolenaar851f86b2021-12-13 14:26:44 +00002048 funcexe.fe_partial = partial;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002049 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002050 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002051 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002052 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002053 clear_submatch_list(&matchList);
2054
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002055 if (rettv.v_type == VAR_UNKNOWN)
2056 // something failed, no need to report another error
2057 eval_result = NULL;
2058 else
2059 {
2060 eval_result = tv_get_string_buf_chk(&rettv, buf);
2061 if (eval_result != NULL)
2062 eval_result = vim_strsave(eval_result);
2063 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002064 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002065 }
Bram Moolenaar4c137212021-04-19 16:48:48 +02002066 else if (substitute_instr != NULL)
2067 // Execute instructions from ISN_SUBSTITUTE.
2068 eval_result = exe_substitute_instr();
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002069 else
Bram Moolenaarb171fb12020-06-24 20:34:03 +02002070 eval_result = eval_to_string(source + 2, TRUE);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002071
Bram Moolenaar071d4272004-06-13 20:20:40 +00002072 if (eval_result != NULL)
2073 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002074 int had_backslash = FALSE;
2075
Bram Moolenaar91acfff2017-03-12 19:22:36 +01002076 for (s = eval_result; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002077 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002078 // Change NL to CR, so that it becomes a line break,
2079 // unless called from vim_regexec_nl().
2080 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002081 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002082 *s = CAR;
2083 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002084 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002085 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002086 /* Change NL to CR here too, so that this works:
2087 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2088 * abc\
2089 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002090 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002091 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002092 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002093 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002094 had_backslash = TRUE;
2095 }
2096 }
2097 if (had_backslash && backslash)
2098 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002099 // Backslashes will be consumed, need to double them.
Bram Moolenaar06975a42010-03-23 16:27:22 +01002100 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
2101 if (s != NULL)
2102 {
2103 vim_free(eval_result);
2104 eval_result = s;
2105 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002106 }
2107
2108 dst += STRLEN(eval_result);
2109 }
2110
Bram Moolenaar6100d022016-10-02 16:51:57 +02002111 can_f_submatch = prev_can_f_submatch;
2112 if (can_f_submatch)
2113 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002114 }
2115#endif
2116 }
2117 else
2118 while ((c = *src++) != NUL)
2119 {
2120 if (c == '&' && magic)
2121 no = 0;
2122 else if (c == '\\' && *src != NUL)
2123 {
2124 if (*src == '&' && !magic)
2125 {
2126 ++src;
2127 no = 0;
2128 }
2129 else if ('0' <= *src && *src <= '9')
2130 {
2131 no = *src++ - '0';
2132 }
2133 else if (vim_strchr((char_u *)"uUlLeE", *src))
2134 {
2135 switch (*src++)
2136 {
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002137 case 'u': func_one = (fptr_T)do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002138 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002139 case 'U': func_all = (fptr_T)do_Upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002140 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002141 case 'l': func_one = (fptr_T)do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002142 continue;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002143 case 'L': func_all = (fptr_T)do_Lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002144 continue;
2145 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002146 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002147 continue;
2148 }
2149 }
2150 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002151 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002152 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002153 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2154 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002155 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002156 if (copy)
2157 {
2158 *dst++ = c;
2159 *dst++ = *src++;
2160 *dst++ = *src++;
2161 }
2162 else
2163 {
2164 dst += 3;
2165 src += 2;
2166 }
2167 continue;
2168 }
2169
Bram Moolenaar071d4272004-06-13 20:20:40 +00002170 if (c == '\\' && *src != NUL)
2171 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002172 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002173 switch (*src)
2174 {
2175 case 'r': c = CAR; ++src; break;
2176 case 'n': c = NL; ++src; break;
2177 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002178 // Oh no! \e already has meaning in subst pat :-(
2179 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002180 case 'b': c = Ctrl_H; ++src; break;
2181
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002182 // If "backslash" is TRUE the backslash will be removed
2183 // later. Used to insert a literal CR.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002184 default: if (backslash)
2185 {
2186 if (copy)
2187 *dst = '\\';
2188 ++dst;
2189 }
2190 c = *src++;
2191 }
2192 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002193 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002194 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002195
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002196 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002197 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002198 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002199 func_one = (fptr_T)(func_one(&cc, c));
2200 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002201 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002202 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002203 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002204 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002205
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002206 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002207 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002208 int totlen = mb_ptr2len(src - 1);
2209
Bram Moolenaar071d4272004-06-13 20:20:40 +00002210 if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002211 mb_char2bytes(cc, dst);
2212 dst += mb_char2len(cc) - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002213 if (enc_utf8)
2214 {
2215 int clen = utf_ptr2len(src - 1);
2216
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002217 // If the character length is shorter than "totlen", there
2218 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002219 if (clen < totlen)
2220 {
2221 if (copy)
2222 mch_memmove(dst + 1, src - 1 + clen,
2223 (size_t)(totlen - clen));
2224 dst += totlen - clen;
2225 }
2226 }
2227 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002228 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002229 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002230 *dst = cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002231 dst++;
2232 }
2233 else
2234 {
2235 if (REG_MULTI)
2236 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002237 clnum = rex.reg_mmatch->startpos[no].lnum;
2238 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002239 s = NULL;
2240 else
2241 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002242 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2243 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2244 len = rex.reg_mmatch->endpos[no].col
2245 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002246 else
2247 len = (int)STRLEN(s);
2248 }
2249 }
2250 else
2251 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002252 s = rex.reg_match->startp[no];
2253 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002254 s = NULL;
2255 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002256 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002257 }
2258 if (s != NULL)
2259 {
2260 for (;;)
2261 {
2262 if (len == 0)
2263 {
2264 if (REG_MULTI)
2265 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002266 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002267 break;
2268 if (copy)
2269 *dst = CAR;
2270 ++dst;
2271 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002272 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2273 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002274 else
2275 len = (int)STRLEN(s);
2276 }
2277 else
2278 break;
2279 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002280 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002281 {
2282 if (copy)
Bram Moolenaare29a27f2021-07-20 21:07:36 +02002283 iemsg(_(e_damaged_match_string));
Bram Moolenaar071d4272004-06-13 20:20:40 +00002284 goto exit;
2285 }
2286 else
2287 {
2288 if (backslash && (*s == CAR || *s == '\\'))
2289 {
2290 /*
2291 * Insert a backslash in front of a CR, otherwise
2292 * it will be replaced by a line break.
2293 * Number of backslashes will be halved later,
2294 * double them here.
2295 */
2296 if (copy)
2297 {
2298 dst[0] = '\\';
2299 dst[1] = *s;
2300 }
2301 dst += 2;
2302 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002303 else
2304 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002305 if (has_mbyte)
2306 c = mb_ptr2char(s);
2307 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002308 c = *s;
2309
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002310 if (func_one != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002311 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002312 func_one = (fptr_T)(func_one(&cc, c));
2313 else if (func_all != (fptr_T)NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002314 // Turbo C complains without the typecast
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002315 func_all = (fptr_T)(func_all(&cc, c));
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002316 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002317 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002318
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002319 if (has_mbyte)
2320 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002321 int l;
2322
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002323 // Copy composing characters separately, one
2324 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002325 if (enc_utf8)
2326 l = utf_ptr2len(s) - 1;
2327 else
2328 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002329
2330 s += l;
2331 len -= l;
2332 if (copy)
2333 mb_char2bytes(cc, dst);
2334 dst += mb_char2len(cc) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002335 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002336 else if (copy)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002337 *dst = cc;
2338 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002339 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002340
Bram Moolenaar071d4272004-06-13 20:20:40 +00002341 ++s;
2342 --len;
2343 }
2344 }
2345 }
2346 no = -1;
2347 }
2348 }
2349 if (copy)
2350 *dst = NUL;
2351
2352exit:
2353 return (int)((dst - dest) + 1);
2354}
2355
2356#ifdef FEAT_EVAL
2357/*
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002358 * Call reg_getline() with the line numbers from the submatch. If a
2359 * substitute() was used the reg_maxline and other values have been
2360 * overwritten.
2361 */
2362 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002363reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002364{
2365 char_u *s;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002366 linenr_T save_first = rex.reg_firstlnum;
2367 linenr_T save_max = rex.reg_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002368
Bram Moolenaar6100d022016-10-02 16:51:57 +02002369 rex.reg_firstlnum = rsm.sm_firstlnum;
2370 rex.reg_maxline = rsm.sm_maxline;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002371
2372 s = reg_getline(lnum);
2373
Bram Moolenaar6100d022016-10-02 16:51:57 +02002374 rex.reg_firstlnum = save_first;
2375 rex.reg_maxline = save_max;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002376 return s;
2377}
2378
2379/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002380 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002381 * allocated memory.
2382 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2383 */
2384 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002385reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002386{
2387 char_u *retval = NULL;
2388 char_u *s;
2389 int len;
2390 int round;
2391 linenr_T lnum;
2392
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002393 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002394 return NULL;
2395
Bram Moolenaar6100d022016-10-02 16:51:57 +02002396 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002397 {
2398 /*
2399 * First round: compute the length and allocate memory.
2400 * Second round: copy the text.
2401 */
2402 for (round = 1; round <= 2; ++round)
2403 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002404 lnum = rsm.sm_mmatch->startpos[no].lnum;
2405 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002406 return NULL;
2407
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002408 s = reg_getline_submatch(lnum);
2409 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002410 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002411 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002412 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002413 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002414 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002415 len = rsm.sm_mmatch->endpos[no].col
2416 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002417 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002418 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002419 ++len;
2420 }
2421 else
2422 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002423 // Multiple lines: take start line from start col, middle
2424 // lines completely and end line up to end col.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002425 len = (int)STRLEN(s);
2426 if (round == 2)
2427 {
2428 STRCPY(retval, s);
2429 retval[len] = '\n';
2430 }
2431 ++len;
2432 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002433 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002434 {
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002435 s = reg_getline_submatch(lnum++);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002436 if (round == 2)
2437 STRCPY(retval + len, s);
2438 len += (int)STRLEN(s);
2439 if (round == 2)
2440 retval[len] = '\n';
2441 ++len;
2442 }
2443 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002444 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002445 rsm.sm_mmatch->endpos[no].col);
2446 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002447 if (round == 2)
2448 retval[len] = NUL;
2449 ++len;
2450 }
2451
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002452 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002453 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002454 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002455 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002456 return NULL;
2457 }
2458 }
2459 }
2460 else
2461 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002462 s = rsm.sm_match->startp[no];
2463 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002464 retval = NULL;
2465 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002466 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002467 }
2468
2469 return retval;
2470}
Bram Moolenaar41571762014-04-02 19:00:58 +02002471
2472/*
2473 * Used for the submatch() function with the optional non-zero argument: get
2474 * the list of strings from the n'th submatch in allocated memory with NULs
2475 * represented in NLs.
2476 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2477 * command, for a non-existing submatch and for any error.
2478 */
2479 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002480reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002481{
2482 char_u *s;
2483 linenr_T slnum;
2484 linenr_T elnum;
2485 colnr_T scol;
2486 colnr_T ecol;
2487 int i;
2488 list_T *list;
2489 int error = FALSE;
2490
2491 if (!can_f_submatch || no < 0)
2492 return NULL;
2493
Bram Moolenaar6100d022016-10-02 16:51:57 +02002494 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002495 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002496 slnum = rsm.sm_mmatch->startpos[no].lnum;
2497 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002498 if (slnum < 0 || elnum < 0)
2499 return NULL;
2500
Bram Moolenaar6100d022016-10-02 16:51:57 +02002501 scol = rsm.sm_mmatch->startpos[no].col;
2502 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002503
2504 list = list_alloc();
2505 if (list == NULL)
2506 return NULL;
2507
2508 s = reg_getline_submatch(slnum) + scol;
2509 if (slnum == elnum)
2510 {
2511 if (list_append_string(list, s, ecol - scol) == FAIL)
2512 error = TRUE;
2513 }
2514 else
2515 {
2516 if (list_append_string(list, s, -1) == FAIL)
2517 error = TRUE;
2518 for (i = 1; i < elnum - slnum; i++)
2519 {
2520 s = reg_getline_submatch(slnum + i);
2521 if (list_append_string(list, s, -1) == FAIL)
2522 error = TRUE;
2523 }
2524 s = reg_getline_submatch(elnum);
2525 if (list_append_string(list, s, ecol) == FAIL)
2526 error = TRUE;
2527 }
2528 }
2529 else
2530 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002531 s = rsm.sm_match->startp[no];
2532 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002533 return NULL;
2534 list = list_alloc();
2535 if (list == NULL)
2536 return NULL;
2537 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002538 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002539 error = TRUE;
2540 }
2541
2542 if (error)
2543 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002544 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002545 return NULL;
2546 }
Bram Moolenaar8a0dcf42020-09-06 15:14:45 +02002547 ++list->lv_refcount;
Bram Moolenaar41571762014-04-02 19:00:58 +02002548 return list;
2549}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002550#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002551
Bram Moolenaarf4140482020-02-15 23:06:45 +01002552/*
2553 * Initialize the values used for matching against multiple lines
2554 */
2555 static void
2556init_regexec_multi(
2557 regmmatch_T *rmp,
2558 win_T *win, // window in which to search or NULL
2559 buf_T *buf, // buffer in which to search
2560 linenr_T lnum) // nr of line to start looking for match
2561{
2562 rex.reg_match = NULL;
2563 rex.reg_mmatch = rmp;
2564 rex.reg_buf = buf;
2565 rex.reg_win = win;
2566 rex.reg_firstlnum = lnum;
2567 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2568 rex.reg_line_lbr = FALSE;
2569 rex.reg_ic = rmp->rmm_ic;
2570 rex.reg_icombine = FALSE;
2571 rex.reg_maxcol = rmp->rmm_maxcol;
2572}
2573
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002574#include "regexp_bt.c"
2575
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002576static regengine_T bt_regengine =
2577{
2578 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002579 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002580 bt_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002581 bt_regexec_multi,
2582 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002583};
2584
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002585#include "regexp_nfa.c"
2586
2587static regengine_T nfa_regengine =
2588{
2589 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002590 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002591 nfa_regexec_nl,
Bram Moolenaarfda37292014-11-05 14:27:36 +01002592 nfa_regexec_multi,
2593 (char_u *)""
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002594};
2595
// The regexp engine selected for the pattern being compiled; set by
// vim_regcomp().  The values must match with 'regexpengine'.
static int regexp_engine = 0;

#ifdef DEBUG
// Engine names for the debugging message, indexed by the engine number.
static char_u regname[][30] = {
    "AUTOMATIC Regexp Engine",
    "BACKTRACKING Regexp Engine",
    "NFA Regexp Engine"
};
#endif
2607
2608/*
2609 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002610 * Returns the program in allocated memory.
2611 * Use vim_regfree() to free the memory.
2612 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002613 */
2614 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002615vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002616{
2617 regprog_T *prog = NULL;
2618 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002619 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002620
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002621 regexp_engine = p_re;
2622
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002623 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002624 if (STRNCMP(expr, "\\%#=", 4) == 0)
2625 {
2626 int newengine = expr[4] - '0';
2627
2628 if (newengine == AUTOMATIC_ENGINE
2629 || newengine == BACKTRACKING_ENGINE
2630 || newengine == NFA_ENGINE)
2631 {
2632 regexp_engine = expr[4] - '0';
2633 expr += 5;
2634#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002635 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002636 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002637#endif
2638 }
2639 else
2640 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00002641 emsg(_(e_percent_hash_can_only_be_followed_by_zero_one_two_automatic_engine_will_be_used));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002642 regexp_engine = AUTOMATIC_ENGINE;
2643 }
2644 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002645#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002646 bt_regengine.expr = expr;
2647 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002648#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002649 // reg_iswordc() uses rex.reg_buf
2650 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002651
2652 /*
2653 * First try the NFA engine, unless backtracking was requested.
2654 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002655 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002656 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002657 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002658 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002659 else
2660 prog = bt_regengine.regcomp(expr, re_flags);
2661
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002662 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002663 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002664 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002665#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002666 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002667 {
2668 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002669 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002670 if (f)
2671 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002672 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002673 fclose(f);
2674 }
2675 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002676 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002677 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002678 }
2679#endif
2680 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002681 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002682 * The NFA engine also fails for patterns that it can't handle well
2683 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002684 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002685 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002686 if (regexp_engine == AUTOMATIC_ENGINE
2687 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002688 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002689 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002690#ifdef FEAT_EVAL
2691 report_re_switch(expr);
2692#endif
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002693 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002694 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002695 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002696
Bram Moolenaarfda37292014-11-05 14:27:36 +01002697 if (prog != NULL)
2698 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002699 // Store the info needed to call regcomp() again when the engine turns
2700 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002701 prog->re_engine = regexp_engine;
2702 prog->re_flags = re_flags;
2703 }
2704
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002705 return prog;
2706}
2707
2708/*
Bram Moolenaar473de612013-06-08 18:19:48 +02002709 * Free a compiled regexp program, returned by vim_regcomp().
2710 */
2711 void
Bram Moolenaar05540972016-01-30 20:31:25 +01002712vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02002713{
2714 if (prog != NULL)
2715 prog->engine->regfree(prog);
2716}
2717
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002718#if defined(EXITFREE) || defined(PROTO)
2719 void
2720free_regexp_stuff(void)
2721{
2722 ga_clear(&regstack);
2723 ga_clear(&backpos);
2724 vim_free(reg_tofree);
2725 vim_free(reg_prev_sub);
2726}
2727#endif
2728
Bram Moolenaarfda37292014-11-05 14:27:36 +01002729#ifdef FEAT_EVAL
Bram Moolenaarfda37292014-11-05 14:27:36 +01002730 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01002731report_re_switch(char_u *pat)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002732{
2733 if (p_verbose > 0)
2734 {
2735 verbose_enter();
Bram Moolenaar32526b32019-01-19 17:43:09 +01002736 msg_puts(_("Switching to backtracking RE engine for pattern: "));
2737 msg_puts((char *)pat);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002738 verbose_leave();
2739 }
2740}
2741#endif
2742
Bram Moolenaar651fca82021-11-29 20:39:38 +00002743#if defined(FEAT_X11) || defined(PROTO)
Bram Moolenaar473de612013-06-08 18:19:48 +02002744/*
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002745 * Return whether "prog" is currently being executed.
2746 */
2747 int
2748regprog_in_use(regprog_T *prog)
2749{
2750 return prog->re_in_use;
2751}
Bram Moolenaar113e1072019-01-20 15:30:40 +01002752#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01002753
2754/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002755 * Match a regexp against a string.
2756 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002757 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002758 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002759 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002760 *
2761 * Return TRUE if there is a match, FALSE if not.
2762 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01002763 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002764vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01002765 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002766 char_u *line, // string to match against
2767 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01002768 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002769{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002770 int result;
2771 regexec_T rex_save;
2772 int rex_in_use_save = rex_in_use;
2773
Bram Moolenaar0270f382018-07-17 05:43:58 +02002774 // Cannot use the same prog recursively, it contains state.
2775 if (rmp->regprog->re_in_use)
2776 {
Bram Moolenaar677658a2022-01-05 16:09:06 +00002777 emsg(_(e_cannot_use_pattern_recursively));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002778 return FALSE;
2779 }
2780 rmp->regprog->re_in_use = TRUE;
2781
Bram Moolenaar6100d022016-10-02 16:51:57 +02002782 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02002783 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002784 rex_save = rex;
2785 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002786
Bram Moolenaar6100d022016-10-02 16:51:57 +02002787 rex.reg_startp = NULL;
2788 rex.reg_endp = NULL;
2789 rex.reg_startpos = NULL;
2790 rex.reg_endpos = NULL;
2791
2792 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002793 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002794
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002795 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002796 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2797 && result == NFA_TOO_EXPENSIVE)
2798 {
2799 int save_p_re = p_re;
2800 int re_flags = rmp->regprog->re_flags;
2801 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2802
2803 p_re = BACKTRACKING_ENGINE;
2804 vim_regfree(rmp->regprog);
2805 if (pat != NULL)
2806 {
2807#ifdef FEAT_EVAL
2808 report_re_switch(pat);
2809#endif
2810 rmp->regprog = vim_regcomp(pat, re_flags);
2811 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002812 {
2813 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002814 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02002815 rmp->regprog->re_in_use = FALSE;
2816 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002817 vim_free(pat);
2818 }
2819
2820 p_re = save_p_re;
2821 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002822
2823 rex_in_use = rex_in_use_save;
2824 if (rex_in_use)
2825 rex = rex_save;
2826
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002827 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002828}
2829
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002830/*
2831 * Note: "*prog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002832 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002833 */
2834 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002835vim_regexec_prog(
2836 regprog_T **prog,
2837 int ignore_case,
2838 char_u *line,
2839 colnr_T col)
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002840{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002841 int r;
2842 regmatch_T regmatch;
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002843
2844 regmatch.regprog = *prog;
2845 regmatch.rm_ic = ignore_case;
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002846 r = vim_regexec_string(&regmatch, line, col, FALSE);
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002847 *prog = regmatch.regprog;
2848 return r;
2849}
2850
2851/*
2852 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002853 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002854 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002855 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002856vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002857{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002858 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002859}
2860
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002861/*
2862 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01002863 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002864 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002865 */
2866 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002867vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002868{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02002869 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002870}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002871
2872/*
2873 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002874 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
2875 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002876 * Uses curbuf for line count and 'iskeyword'.
2877 *
2878 * Return zero if there is no match. Return number of lines contained in the
2879 * match otherwise.
2880 */
2881 long
Bram Moolenaar05540972016-01-30 20:31:25 +01002882vim_regexec_multi(
2883 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002884 win_T *win, // window in which to search or NULL
2885 buf_T *buf, // buffer in which to search
2886 linenr_T lnum, // nr of line to start looking for match
2887 colnr_T col, // column to start looking for match
2888 proftime_T *tm, // timeout limit or NULL
2889 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002890{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002891 int result;
2892 regexec_T rex_save;
2893 int rex_in_use_save = rex_in_use;
2894
Bram Moolenaar0270f382018-07-17 05:43:58 +02002895 // Cannot use the same prog recursively, it contains state.
2896 if (rmp->regprog->re_in_use)
2897 {
Bram Moolenaar677658a2022-01-05 16:09:06 +00002898 emsg(_(e_cannot_use_pattern_recursively));
Bram Moolenaar0270f382018-07-17 05:43:58 +02002899 return FALSE;
2900 }
2901 rmp->regprog->re_in_use = TRUE;
2902
Bram Moolenaar6100d022016-10-02 16:51:57 +02002903 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002904 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002905 rex_save = rex;
2906 rex_in_use = TRUE;
2907
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002908 result = rmp->regprog->engine->regexec_multi(
2909 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002910 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002911
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002912 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002913 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
2914 && result == NFA_TOO_EXPENSIVE)
2915 {
2916 int save_p_re = p_re;
2917 int re_flags = rmp->regprog->re_flags;
2918 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
2919
2920 p_re = BACKTRACKING_ENGINE;
2921 vim_regfree(rmp->regprog);
2922 if (pat != NULL)
2923 {
2924#ifdef FEAT_EVAL
2925 report_re_switch(pat);
2926#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002927#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002928 // checking for \z misuse was already done when compiling for NFA,
2929 // allow all here
2930 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002931#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01002932 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002933#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002934 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02002935#endif
Bram Moolenaarbcf94422018-06-23 14:21:42 +02002936
Bram Moolenaarfda37292014-11-05 14:27:36 +01002937 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02002938 {
2939 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01002940 result = rmp->regprog->engine->regexec_multi(
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02002941 rmp, win, buf, lnum, col, tm, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02002942 rmp->regprog->re_in_use = FALSE;
2943 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01002944 vim_free(pat);
2945 }
2946 p_re = save_p_re;
2947 }
2948
Bram Moolenaar6100d022016-10-02 16:51:57 +02002949 rex_in_use = rex_in_use_save;
2950 if (rex_in_use)
2951 rex = rex_save;
2952
Bram Moolenaar66a3e792014-11-20 23:07:05 +01002953 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002954}