blob: ea6079b00850b668732c01731c374ae7346af295 [file] [log] [blame]
Bram Moolenaaredf3f972016-08-29 22:49:24 +02001/* vi:set ts=8 sts=4 sw=4 noet:
Bram Moolenaar071d4272004-06-13 20:20:40 +00002 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
Bram Moolenaar071d4272004-06-13 20:20:40 +00004 */
5
Bram Moolenaarc2d09c92019-04-25 20:07:51 +02006// By default: do not create debugging logs or files related to regular
7// expressions, even when compiling with -DDEBUG.
8// Uncomment the second line to get the regexp debugging.
9#undef DEBUG
10// #define DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +020011
Bram Moolenaar071d4272004-06-13 20:20:40 +000012#include "vim.h"
13
#ifdef DEBUG
// Debugging switches for the backtracking (BT) engine; they only exist when
// DEBUG is defined.
// show/save debugging data when BT engine is used
# define BT_REGEXP_DUMP
// save the debugging data to a file instead of displaying it
# define BT_REGEXP_LOG
# define BT_REGEXP_DEBUG_LOG
# define BT_REGEXP_DEBUG_LOG_NAME	"bt_regexp_debug.log"
#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +000022
#ifdef FEAT_RELTIME
// Placeholder flag: "timeout_flag" points here whenever no regexp timer is
// running (initially, after disable_regexp_timeout() stopped the timer and
// while save_timeout_for_debugging() is in effect).
static sig_atomic_t dummy_timeout_flag = 0;
// The flag currently in use for the regexp timeout; init_regexp_timeout()
// makes it point to whatever start_timeout() returns.
static volatile sig_atomic_t *timeout_flag = &dummy_timeout_flag;
#endif
27
/*
 * Magic characters have a special meaning, they don't match literally.
 * Magic characters are negative.  This separates them from literal characters
 * (possibly multi-byte).  Only ASCII characters can be Magic.
 */
#define Magic(x)	((int)(x) - 256)
#define un_Magic(x)	((x) + 256)
#define is_Magic(x)	((x) < 0)

/*
 * Return the literal character for "x": a Magic (negative) value is converted
 * back with un_Magic(), any other value is returned unchanged.
 */
    static int
no_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : x;
}

/*
 * Flip the magicness of "x": a Magic value becomes the literal character, a
 * literal character becomes its Magic value.
 */
    static int
toggle_Magic(int x)
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
52
#ifdef FEAT_RELTIME
// Number of init_regexp_timeout() calls that have not been matched by a
// disable_regexp_timeout() call yet.
static int timeout_nesting = 0;

/*
 * Start a timer that will cause the regexp to abort after "msec".
 * This doesn't work well recursively.  In case it happens anyway, the first
 * set timeout will prevail, nested ones are ignored.
 * The caller must make sure there is a matching disable_regexp_timeout() call!
 */
    void
init_regexp_timeout(long msec)
{
    // Only the outermost call starts the timer; nested calls just count.
    if (timeout_nesting == 0)
	timeout_flag = start_timeout(msec);
    ++timeout_nesting;
}

/*
 * Undo one init_regexp_timeout() call.  When the outermost call is undone the
 * timer is stopped and "timeout_flag" points to the dummy flag again.
 * Calling this without an active timer gives an internal error message.
 */
    void
disable_regexp_timeout(void)
{
    if (timeout_nesting == 0)
    {
	iemsg("disable_regexp_timeout() called without active timer");
	return;
    }

    // Still inside a nested init_regexp_timeout(): keep the timer running.
    if (--timeout_nesting != 0)
	return;

    stop_timeout();
    timeout_flag = &dummy_timeout_flag;
}
#endif
82
#if defined(FEAT_EVAL) || defined(PROTO)
# ifdef FEAT_RELTIME
// Value of "timeout_flag" remembered by save_timeout_for_debugging().  It has
// the same volatile-qualified type as "timeout_flag", so that the qualifier
// is not cast away when saving and restoring.
static volatile sig_atomic_t *saved_timeout_flag;
# endif

/*
 * Used at the debug prompt: disable the timeout so that expression evaluation
 * can use patterns.
 * Must be followed by calling restore_timeout_for_debugging().
 */
    void
save_timeout_for_debugging(void)
{
# ifdef FEAT_RELTIME
    saved_timeout_flag = timeout_flag;
    timeout_flag = &dummy_timeout_flag;
# endif
}

/*
 * Make "timeout_flag" point again to the flag that was remembered by
 * save_timeout_for_debugging().
 */
    void
restore_timeout_for_debugging(void)
{
# ifdef FEAT_RELTIME
    timeout_flag = saved_timeout_flag;
# endif
}
#endif
110
/*
 * The first byte of the BT regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.  It's used to catch the
 * most severe mutilation of the program by the caller.
 */

#define REGMAGIC	0234

/*
 * Utility definitions.
 */
#define UCHARAT(p)	((int)*(char_u *)(p))

// Used for an error (down from) vim_regcomp(): give the error message, set
// rc_did_emsg and return NULL
// These macros expand to a "return" statement, thus they can only be used in
// a function returning a pointer (the _NULL variants) or an int (the _FAIL
// variants).
#define EMSG_RET_NULL(m) return (emsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define IEMSG_RET_NULL(m) return (iemsg((m)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG_RET_FAIL(m) return (emsg((m)), rc_did_emsg = TRUE, FAIL)
#define EMSG2_RET_NULL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG3_RET_NULL(m, c, a) return (semsg((const char *)(m), (c) ? "" : "\\", (a)), rc_did_emsg = TRUE, (void *)NULL)
#define EMSG2_RET_FAIL(m, c) return (semsg((const char *)(m), (c) ? "" : "\\"), rc_did_emsg = TRUE, FAIL)
#define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_(e_invalid_item_in_str_brackets), reg_magic == MAGIC_ALL)


#define MAX_LIMIT	(32767L << 16L)

// return values for re_multi_type()
#define NOT_MULTI	0
#define MULTI_ONE	1
#define MULTI_MULT	2

// return values for regmatch()
#define RA_FAIL		1	// something failed, abort
#define RA_CONT		2	// continue in inner loop
#define RA_BREAK	3	// break inner loop
#define RA_MATCH	4	// successful match
#define RA_NOMATCH	5	// didn't match
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200147
Bram Moolenaar071d4272004-06-13 20:20:40 +0000148/*
149 * Return NOT_MULTI if c is not a "multi" operator.
150 * Return MULTI_ONE if c is a single "multi" operator.
151 * Return MULTI_MULT if c is a multi "multi" operator.
152 */
153 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100154re_multi_type(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000155{
156 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
157 return MULTI_ONE;
158 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
159 return MULTI_MULT;
160 return NOT_MULTI;
161}
162
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000163static char_u *reg_prev_sub = NULL;
John Marriott82792db2024-05-12 00:07:17 +0200164static size_t reg_prev_sublen = 0;
Bram Moolenaarf461c8e2005-06-25 23:04:51 +0000165
Bram Moolenaar071d4272004-06-13 20:20:40 +0000166/*
167 * REGEXP_INRANGE contains all characters which are always special in a []
168 * range after '\'.
169 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
170 * These are:
171 * \n - New line (NL).
172 * \r - Carriage Return (CR).
173 * \t - Tab (TAB).
174 * \e - Escape (ESC).
175 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000176 * \d - Character code in decimal, eg \d123
177 * \o - Character code in octal, eg \o80
178 * \x - Character code in hex, eg \x4a
179 * \u - Multibyte character code, eg \u20ac
180 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000181 */
182static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000183static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000184
Bram Moolenaar071d4272004-06-13 20:20:40 +0000185/*
186 * Translate '\x' to its control character, except "\n", which is Magic.
187 */
188 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100189backslash_trans(int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000190{
191 switch (c)
192 {
193 case 'r': return CAR;
194 case 't': return TAB;
195 case 'e': return ESC;
196 case 'b': return BS;
197 }
198 return c;
199}
200
// Values returned by get_char_class(): one item for each "[:name:]" class
// that is recognized, CLASS_NONE when no class name was found.
enum
{
    CLASS_ALNUM = 0,
    CLASS_ALPHA,
    CLASS_BLANK,
    CLASS_CNTRL,
    CLASS_DIGIT,
    CLASS_GRAPH,
    CLASS_LOWER,
    CLASS_PRINT,
    CLASS_PUNCT,
    CLASS_SPACE,
    CLASS_UPPER,
    CLASS_XDIGIT,
    CLASS_TAB,
    CLASS_RETURN,
    CLASS_BACKSPACE,
    CLASS_ESCAPE,
    CLASS_IDENT,
    CLASS_KEYWORD,
    CLASS_FNAME,
    CLASS_NONE = 99
};
224
Bram Moolenaar071d4272004-06-13 20:20:40 +0000225/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000226 * Check for a character class name "[:name:]". "pp" points to the '['.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000227 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
228 * recognized. Otherwise "pp" is advanced to after the item.
229 */
230 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100231get_char_class(char_u **pp)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000232{
John Marriott82792db2024-05-12 00:07:17 +0200233 // must be sorted by the 'value' field because it is used by bsearch()!
234 static keyvalue_T char_class_tab[] =
Bram Moolenaar071d4272004-06-13 20:20:40 +0000235 {
John Marriott82792db2024-05-12 00:07:17 +0200236 KEYVALUE_ENTRY(CLASS_ALNUM, "alnum:]"),
237 KEYVALUE_ENTRY(CLASS_ALPHA, "alpha:]"),
238 KEYVALUE_ENTRY(CLASS_BACKSPACE, "backspace:]"),
239 KEYVALUE_ENTRY(CLASS_BLANK, "blank:]"),
240 KEYVALUE_ENTRY(CLASS_CNTRL, "cntrl:]"),
241 KEYVALUE_ENTRY(CLASS_DIGIT, "digit:]"),
242 KEYVALUE_ENTRY(CLASS_ESCAPE, "escape:]"),
243 KEYVALUE_ENTRY(CLASS_FNAME, "fname:]"),
244 KEYVALUE_ENTRY(CLASS_GRAPH, "graph:]"),
245 KEYVALUE_ENTRY(CLASS_IDENT, "ident:]"),
246 KEYVALUE_ENTRY(CLASS_KEYWORD, "keyword:]"),
247 KEYVALUE_ENTRY(CLASS_LOWER, "lower:]"),
248 KEYVALUE_ENTRY(CLASS_PRINT, "print:]"),
249 KEYVALUE_ENTRY(CLASS_PUNCT, "punct:]"),
250 KEYVALUE_ENTRY(CLASS_RETURN, "return:]"),
251 KEYVALUE_ENTRY(CLASS_SPACE, "space:]"),
252 KEYVALUE_ENTRY(CLASS_TAB, "tab:]"),
253 KEYVALUE_ENTRY(CLASS_UPPER, "upper:]"),
254 KEYVALUE_ENTRY(CLASS_XDIGIT, "xdigit:]")
Bram Moolenaar071d4272004-06-13 20:20:40 +0000255 };
Bram Moolenaar071d4272004-06-13 20:20:40 +0000256
John Marriott82792db2024-05-12 00:07:17 +0200257 // check that the value of "pp" has a chance of matching
258 if ((*pp)[1] == ':' && ASCII_ISLOWER((*pp)[2])
259 && ASCII_ISLOWER((*pp)[3]) && ASCII_ISLOWER((*pp)[4]))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000260 {
John Marriott82792db2024-05-12 00:07:17 +0200261 keyvalue_T target;
262 keyvalue_T *entry;
263 // this function can be called repeatedly with the same value for "pp"
264 // so we cache the last found entry.
265 static keyvalue_T *last_entry = NULL;
266
267 target.key = 0;
John Marriott8d4477e2024-11-02 15:59:01 +0100268 target.value.string = *pp + 2;
269 target.value.length = 0; // not used, see cmp_keyvalue_value_n()
John Marriott82792db2024-05-12 00:07:17 +0200270
271 if (last_entry != NULL && cmp_keyvalue_value_n(&target, last_entry) == 0)
272 entry = last_entry;
273 else
274 entry = (keyvalue_T *)bsearch(&target, &char_class_tab,
275 ARRAY_LENGTH(char_class_tab),
276 sizeof(char_class_tab[0]), cmp_keyvalue_value_n);
277 if (entry != NULL)
278 {
279 last_entry = entry;
John Marriott8d4477e2024-11-02 15:59:01 +0100280 *pp += entry->value.length + 2;
John Marriott82792db2024-05-12 00:07:17 +0200281 return entry->key;
282 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000283 }
284 return CLASS_NONE;
285}
286
/*
 * Specific version of character class functions.
 * Using a table to keep this fast.
 * class_tab[] holds the RI_ flags for every byte value, it is filled by
 * init_class_tab().
 */
static short	class_tab[256];

#define	    RI_DIGIT	0x01
#define	    RI_HEX	0x02
#define	    RI_OCTAL	0x04
#define	    RI_WORD	0x08
#define	    RI_HEAD	0x10
#define	    RI_ALPHA	0x20
#define	    RI_LOWER	0x40
#define	    RI_UPPER	0x80
#define	    RI_WHITE	0x100

/*
 * Fill class_tab[].  Only does the work the first time it is called.
 */
    static void
init_class_tab(void)
{
    int		i;
    static int	done = FALSE;

    if (done)
	return;

    for (i = 0; i < 256; ++i)
    {
	if (i >= '0' && i <= '7')
	    class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
	else if (i >= '8' && i <= '9')
	    class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
	else if (i >= 'a' && i <= 'f')
	    class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
	else if (i >= 'g' && i <= 'z')
	    class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
	else if (i >= 'A' && i <= 'F')
	    class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
	else if (i >= 'G' && i <= 'Z')
	    class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
	else if (i == '_')
	    class_tab[i] = RI_WORD + RI_HEAD;
	else
	    class_tab[i] = 0;
    }
    class_tab[' '] |= RI_WHITE;
    class_tab['\t'] |= RI_WHITE;
    done = TRUE;
}

// Test if character "c" has the RI_ flag.  Only values 0 - 255 are looked up
// in class_tab[], for anything else the result is FALSE.  The cast to
// unsigned makes a negative "c" fail the range check instead of indexing
// before the start of the table.
// "c" is evaluated twice and is expected to be an int or a narrower type.
#define ri_digit(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_DIGIT))
#define ri_hex(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_HEX))
#define ri_octal(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_OCTAL))
#define ri_word(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_WORD))
#define ri_head(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_HEAD))
#define ri_alpha(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_ALPHA))
#define ri_lower(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_LOWER))
#define ri_upper(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_UPPER))
#define ri_white(c)	((unsigned)(c) < 0x100 && (class_tab[c] & RI_WHITE))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000345
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100346// flags for regflags
347#define RF_ICASE 1 // ignore case
348#define RF_NOICASE 2 // don't ignore case
349#define RF_HASNL 4 // can match a NL
350#define RF_ICOMBINE 8 // ignore combining characters
351#define RF_LOOKBH 16 // uses "\@<=" or "\@<!"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000352
353/*
354 * Global work variables for vim_regcomp().
355 */
356
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100357static char_u *regparse; // Input-scan pointer.
358static int regnpar; // () count.
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100359static int wants_nfa; // regex should use NFA engine
Bram Moolenaar071d4272004-06-13 20:20:40 +0000360#ifdef FEAT_SYN_HL
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100361static int regnzpar; // \z() count.
362static int re_has_z; // \z item detected
Bram Moolenaar071d4272004-06-13 20:20:40 +0000363#endif
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100364static unsigned regflags; // RF_ flags for prog
Bram Moolenaar071d4272004-06-13 20:20:40 +0000365#if defined(FEAT_SYN_HL) || defined(PROTO)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100366static int had_eol; // TRUE when EOL found by vim_regcomp()
Bram Moolenaar071d4272004-06-13 20:20:40 +0000367#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000368
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100369static magic_T reg_magic; // magicness of the pattern
Bram Moolenaar071d4272004-06-13 20:20:40 +0000370
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100371static int reg_string; // matching with a string instead of a buffer
372 // line
373static int reg_strict; // "[abc" is illegal
Bram Moolenaar071d4272004-06-13 20:20:40 +0000374
/*
 * META contains all characters that may be magic, except '^' and '$'.
 */

// META[] is used often enough to justify turning it into a table.
// The table is indexed by the character value and has an entry for the values
// 0 up to and including '~'; a non-zero entry means the character may be
// magic.
static char_u META_flags[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
//                 %  &     (  )  *  +        .
    0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
//     1  2  3  4  5  6  7  8  9        <  =  >  ?
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
//  @  A     C  D     F     H  I     K  L  M     O
    1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
//  P        S     U  V  W  X     Z  [           _
    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
//     a     c  d     f     h  i     k  l  m  n  o
    0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
//  p        s     u  v  w  x     z  {  |     ~
    1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
};
Bram Moolenaar071d4272004-06-13 20:20:40 +0000396
static int	curchr;		// currently parsed character
// Previous character.  Note: prevchr is sometimes -1 when we are not at the
// start, eg in /[ ^I]^ the pattern was never found even if it existed,
// because ^ was taken to be magic -- webb
static int	prevchr;
static int	prevprevchr;	// previous-previous character
static int	nextchr;	// used for ungetchr()

// arguments for reg()
#define REG_NOPAREN	0	// toplevel reg()
#define REG_PAREN	1	// \(\)
#define REG_ZPAREN	2	// \z(\)
#define REG_NPAREN	3	// \%(\)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000410
// Snapshot of the parser state.  Every field holds the value of the global
// variable with the same name; filled by save_parse_state() and written back
// by restore_parse_state().
typedef struct
{
    char_u	*regparse;	// input-scan pointer
    int		prevchr_len;	// byte length of previous char
    int		curchr;		// currently parsed character
    int		prevchr;	// previous character
    int		prevprevchr;	// previous-previous character
    int		nextchr;	// used for ungetchr()
    int		at_start;	// True when on the first character
    int		prev_at_start;	// True when on the second character
    int		regnpar;	// () count
} parse_state_T;
423
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100424static void initchr(char_u *);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100425static int getchr(void);
426static void skipchr_keepstart(void);
427static int peekchr(void);
428static void skipchr(void);
429static void ungetchr(void);
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100430static long gethexchrs(int maxinputlen);
431static long getoctchrs(void);
432static long getdecchrs(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100433static int coll_get_char(void);
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +0100434static int prog_magic_wrong(void);
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +0200435static int cstrncmp(char_u *s1, char_u *s2, int *n);
436static char_u *cstrchr(char_u *, int);
437static int re_mult_next(char *what);
Bram Moolenaar221cd9f2019-01-31 15:34:40 +0100438static int reg_iswordc(int);
Bram Moolenaar66c50c52021-01-02 17:43:49 +0100439#ifdef FEAT_EVAL
440static void report_re_switch(char_u *pat);
441#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +0000442
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200443static regengine_T bt_regengine;
444static regengine_T nfa_regengine;
445
Bram Moolenaar071d4272004-06-13 20:20:40 +0000446/*
447 * Return TRUE if compiled regular expression "prog" can match a line break.
448 */
449 int
Bram Moolenaar05540972016-01-30 20:31:25 +0100450re_multiline(regprog_T *prog)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000451{
452 return (prog->regflags & RF_HASNL);
453}
454
455/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000456 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
457 * Returns a character representing the class. Zero means that no item was
458 * recognized. Otherwise "pp" is advanced to after the item.
459 */
460 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100461get_equi_class(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000462{
463 int c;
464 int l = 1;
465 char_u *p = *pp;
466
Bram Moolenaar985079c2019-02-16 17:07:47 +0100467 if (p[1] == '=' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000468 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000469 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000470 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000471 if (p[l + 2] == '=' && p[l + 3] == ']')
472 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000473 if (has_mbyte)
474 c = mb_ptr2char(p + 2);
475 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000476 c = p[2];
477 *pp += l + 4;
478 return c;
479 }
480 }
481 return 0;
482}
483
484/*
Bram Moolenaardf177f62005-02-22 08:39:57 +0000485 * Check for a collating element "[.a.]". "pp" points to the '['.
486 * Returns a character. Zero means that no item was recognized. Otherwise
487 * "pp" is advanced to after the item.
488 * Currently only single characters are recognized!
489 */
490 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100491get_coll_element(char_u **pp)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000492{
493 int c;
494 int l = 1;
495 char_u *p = *pp;
496
Bram Moolenaarf1b57ab2019-02-17 13:53:34 +0100497 if (p[0] != NUL && p[1] == '.' && p[2] != NUL)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000498 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000499 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000500 l = (*mb_ptr2len)(p + 2);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000501 if (p[l + 2] == '.' && p[l + 3] == ']')
502 {
Bram Moolenaardf177f62005-02-22 08:39:57 +0000503 if (has_mbyte)
504 c = mb_ptr2char(p + 2);
505 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000506 c = p[2];
507 *pp += l + 4;
508 return c;
509 }
510 }
511 return 0;
512}
513
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100514static int reg_cpo_lit; // 'cpoptions' contains 'l' flag
515static int reg_cpo_bsl; // 'cpoptions' contains '\' flag
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200516
517 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100518get_cpo_flags(void)
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200519{
520 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
521 reg_cpo_bsl = vim_strchr(p_cpo, CPO_BACKSL) != NULL;
522}
Bram Moolenaardf177f62005-02-22 08:39:57 +0000523
524/*
525 * Skip over a "[]" range.
526 * "p" must point to the character after the '['.
527 * The returned pointer is on the matching ']', or the terminating NUL.
528 */
529 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100530skip_anyof(char_u *p)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000531{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000532 int l;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000533
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100534 if (*p == '^') // Complement of range.
Bram Moolenaardf177f62005-02-22 08:39:57 +0000535 ++p;
536 if (*p == ']' || *p == '-')
537 ++p;
538 while (*p != NUL && *p != ']')
539 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000540 if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1)
Bram Moolenaardf177f62005-02-22 08:39:57 +0000541 p += l;
542 else
Bram Moolenaardf177f62005-02-22 08:39:57 +0000543 if (*p == '-')
544 {
545 ++p;
546 if (*p != ']' && *p != NUL)
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100547 MB_PTR_ADV(p);
Bram Moolenaardf177f62005-02-22 08:39:57 +0000548 }
549 else if (*p == '\\'
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200550 && !reg_cpo_bsl
Bram Moolenaardf177f62005-02-22 08:39:57 +0000551 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200552 || (!reg_cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
Bram Moolenaardf177f62005-02-22 08:39:57 +0000553 p += 2;
554 else if (*p == '[')
555 {
556 if (get_char_class(&p) == CLASS_NONE
557 && get_equi_class(&p) == 0
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200558 && get_coll_element(&p) == 0
559 && *p != NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100560 ++p; // it is not a class name and not NUL
Bram Moolenaardf177f62005-02-22 08:39:57 +0000561 }
562 else
563 ++p;
564 }
565
566 return p;
567}
568
569/*
Bram Moolenaar071d4272004-06-13 20:20:40 +0000570 * Skip past regular expression.
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200571 * Stop at end of "startp" or where "delim" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000572 * Take care of characters with a backslash in front of it.
573 * Skip strings inside [ and ].
Bram Moolenaar071d4272004-06-13 20:20:40 +0000574 */
575 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +0100576skip_regexp(
577 char_u *startp,
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200578 int delim,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200579 int magic)
580{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100581 return skip_regexp_ex(startp, delim, magic, NULL, NULL, NULL);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200582}
583
584/*
585 * Call skip_regexp() and when the delimiter does not match give an error and
586 * return NULL.
587 */
588 char_u *
589skip_regexp_err(
590 char_u *startp,
591 int delim,
592 int magic)
593{
594 char_u *p = skip_regexp(startp, delim, magic);
595
596 if (*p != delim)
597 {
Bram Moolenaara6f79292022-01-04 21:30:47 +0000598 semsg(_(e_missing_delimiter_after_search_pattern_str), startp);
Bram Moolenaar2c5ed4e2020-04-20 19:42:10 +0200599 return NULL;
600 }
601 return p;
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200602}
603
604/*
605 * skip_regexp() with extra arguments:
606 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
607 * expression and change "\?" to "?". If "*newp" is not NULL the expression
608 * is changed in-place.
609 * If a "\?" is changed to "?" then "dropped" is incremented, unless NULL.
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100610 * If "magic_val" is not NULL, returns the effective magicness of the pattern
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200611 */
612 char_u *
613skip_regexp_ex(
614 char_u *startp,
615 int dirc,
Bram Moolenaar05540972016-01-30 20:31:25 +0100616 int magic,
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200617 char_u **newp,
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100618 int *dropped,
619 magic_T *magic_val)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000620{
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100621 magic_T mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000622 char_u *p = startp;
zeertzjq30741372024-05-24 07:37:36 +0200623 size_t startplen = 0;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000624
625 if (magic)
626 mymagic = MAGIC_ON;
627 else
628 mymagic = MAGIC_OFF;
Bram Moolenaar1cd3f2c2013-06-05 12:43:09 +0200629 get_cpo_flags();
Bram Moolenaar071d4272004-06-13 20:20:40 +0000630
Bram Moolenaar91acfff2017-03-12 19:22:36 +0100631 for (; p[0] != NUL; MB_PTR_ADV(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000632 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100633 if (p[0] == dirc) // found end of regexp
Bram Moolenaar071d4272004-06-13 20:20:40 +0000634 break;
635 if ((p[0] == '[' && mymagic >= MAGIC_ON)
636 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
637 {
638 p = skip_anyof(p + 1);
639 if (p[0] == NUL)
640 break;
641 }
642 else if (p[0] == '\\' && p[1] != NUL)
643 {
644 if (dirc == '?' && newp != NULL && p[1] == '?')
645 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100646 // change "\?" to "?", make a copy first.
zeertzjq30741372024-05-24 07:37:36 +0200647 if (startplen == 0)
648 startplen = STRLEN(startp);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000649 if (*newp == NULL)
650 {
John Marriott82792db2024-05-12 00:07:17 +0200651 *newp = vim_strnsave(startp, startplen);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000652 if (*newp != NULL)
zeertzjq30741372024-05-24 07:37:36 +0200653 {
Bram Moolenaar071d4272004-06-13 20:20:40 +0000654 p = *newp + (p - startp);
zeertzjq30741372024-05-24 07:37:36 +0200655 startp = *newp;
656 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000657 }
Bram Moolenaare8c4abb2020-04-02 21:13:25 +0200658 if (dropped != NULL)
659 ++*dropped;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000660 if (*newp != NULL)
zeertzjq30741372024-05-24 07:37:36 +0200661 mch_memmove(p, p + 1, startplen - ((p + 1) - startp) + 1);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000662 else
663 ++p;
664 }
665 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100666 ++p; // skip next character
Bram Moolenaar071d4272004-06-13 20:20:40 +0000667 if (*p == 'v')
668 mymagic = MAGIC_ALL;
669 else if (*p == 'V')
670 mymagic = MAGIC_NONE;
671 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000672 }
Bram Moolenaard93a7fc2021-01-04 12:42:13 +0100673 if (magic_val != NULL)
674 *magic_val = mymagic;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000675 return p;
676}
677
/*
 * Functions for getting characters from the regexp input.
 */
static int	prevchr_len;	// byte length of previous char
static int	at_start;	// True when on the first character
static int	prev_at_start;  // True when on the second character
Bram Moolenaar7c29f382016-02-12 19:08:15 +0100684
Bram Moolenaar071d4272004-06-13 20:20:40 +0000685/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200686 * Start parsing at "str".
687 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000688 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100689initchr(char_u *str)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000690{
691 regparse = str;
692 prevchr_len = 0;
693 curchr = prevprevchr = prevchr = nextchr = -1;
694 at_start = TRUE;
695 prev_at_start = FALSE;
696}
697
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200698/*
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200699 * Save the current parse state, so that it can be restored and parsing
700 * starts in the same state again.
701 */
702 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100703save_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200704{
705 ps->regparse = regparse;
706 ps->prevchr_len = prevchr_len;
707 ps->curchr = curchr;
708 ps->prevchr = prevchr;
709 ps->prevprevchr = prevprevchr;
710 ps->nextchr = nextchr;
711 ps->at_start = at_start;
712 ps->prev_at_start = prev_at_start;
713 ps->regnpar = regnpar;
714}
715
716/*
717 * Restore a previously saved parse state.
718 */
719 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100720restore_parse_state(parse_state_T *ps)
Bram Moolenaar3737fc12013-06-01 14:42:56 +0200721{
722 regparse = ps->regparse;
723 prevchr_len = ps->prevchr_len;
724 curchr = ps->curchr;
725 prevchr = ps->prevchr;
726 prevprevchr = ps->prevprevchr;
727 nextchr = ps->nextchr;
728 at_start = ps->at_start;
729 prev_at_start = ps->prev_at_start;
730 regnpar = ps->regnpar;
731}
732
733
734/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +0200735 * Get the next character without advancing.
736 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000737 static int
Bram Moolenaar05540972016-01-30 20:31:25 +0100738peekchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000739{
Bram Moolenaardf177f62005-02-22 08:39:57 +0000740 static int after_slash = FALSE;
741
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000742 if (curchr != -1)
743 return curchr;
744
745 switch (curchr = regparse[0])
Bram Moolenaar071d4272004-06-13 20:20:40 +0000746 {
Bram Moolenaar071d4272004-06-13 20:20:40 +0000747 case '.':
748 case '[':
749 case '~':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100750 // magic when 'magic' is on
Bram Moolenaar071d4272004-06-13 20:20:40 +0000751 if (reg_magic >= MAGIC_ON)
752 curchr = Magic(curchr);
753 break;
754 case '(':
755 case ')':
756 case '{':
757 case '%':
758 case '+':
759 case '=':
760 case '?':
761 case '@':
762 case '!':
763 case '&':
764 case '|':
765 case '<':
766 case '>':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100767 case '#': // future ext.
768 case '"': // future ext.
769 case '\'': // future ext.
770 case ',': // future ext.
771 case '-': // future ext.
772 case ':': // future ext.
773 case ';': // future ext.
774 case '`': // future ext.
775 case '/': // Can't be used in / command
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000776 // magic only after "\v"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000777 if (reg_magic == MAGIC_ALL)
778 curchr = Magic(curchr);
779 break;
780 case '*':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100781 // * is not magic as the very first character, eg "?*ptr", when
782 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
783 // "\(\*" is not magic, thus must be magic if "after_slash"
Bram Moolenaardf177f62005-02-22 08:39:57 +0000784 if (reg_magic >= MAGIC_ON
785 && !at_start
786 && !(prev_at_start && prevchr == Magic('^'))
787 && (after_slash
788 || (prevchr != Magic('(')
789 && prevchr != Magic('&')
790 && prevchr != Magic('|'))))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000791 curchr = Magic('*');
792 break;
793 case '^':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100794 // '^' is only magic as the very first character and if it's after
795 // "\(", "\|", "\&' or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000796 if (reg_magic >= MAGIC_OFF
797 && (at_start
798 || reg_magic == MAGIC_ALL
799 || prevchr == Magic('(')
800 || prevchr == Magic('|')
801 || prevchr == Magic('&')
802 || prevchr == Magic('n')
803 || (no_Magic(prevchr) == '('
804 && prevprevchr == Magic('%'))))
805 {
806 curchr = Magic('^');
807 at_start = TRUE;
808 prev_at_start = FALSE;
809 }
810 break;
811 case '$':
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100812 // '$' is only magic as the very last char and if it's in front of
813 // either "\|", "\)", "\&", or "\n"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000814 if (reg_magic >= MAGIC_OFF)
815 {
816 char_u *p = regparse + 1;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200817 int is_magic_all = (reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000818
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100819 // ignore \c \C \m \M \v \V and \Z after '$'
Bram Moolenaar071d4272004-06-13 20:20:40 +0000820 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000821 || p[1] == 'm' || p[1] == 'M'
822 || p[1] == 'v' || p[1] == 'V' || p[1] == 'Z'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200823 {
824 if (p[1] == 'v')
825 is_magic_all = TRUE;
826 else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V')
827 is_magic_all = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000828 p += 2;
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200829 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000830 if (p[0] == NUL
831 || (p[0] == '\\'
832 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
833 || p[1] == 'n'))
Bram Moolenaarff65ac82014-07-09 19:32:34 +0200834 || (is_magic_all
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +0000835 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000836 || reg_magic == MAGIC_ALL)
837 curchr = Magic('$');
838 }
839 break;
840 case '\\':
841 {
842 int c = regparse[1];
843
844 if (c == NUL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100845 curchr = '\\'; // trailing '\'
Bram Moolenaar424bcae2022-01-31 14:59:41 +0000846 else if (c <= '~' && META_flags[c])
Bram Moolenaar071d4272004-06-13 20:20:40 +0000847 {
848 /*
849 * META contains everything that may be magic sometimes,
850 * except ^ and $ ("\^" and "\$" are only magic after
Bram Moolenaarb878bbb2015-06-09 20:39:24 +0200851 * "\V"). We now fetch the next character and toggle its
Bram Moolenaar071d4272004-06-13 20:20:40 +0000852 * magicness. Therefore, \ is so meta-magic that it is
853 * not in META.
854 */
855 curchr = -1;
856 prev_at_start = at_start;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100857 at_start = FALSE; // be able to say "/\*ptr"
Bram Moolenaar071d4272004-06-13 20:20:40 +0000858 ++regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000859 ++after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000860 peekchr();
861 --regparse;
Bram Moolenaardf177f62005-02-22 08:39:57 +0000862 --after_slash;
Bram Moolenaar071d4272004-06-13 20:20:40 +0000863 curchr = toggle_Magic(curchr);
864 }
865 else if (vim_strchr(REGEXP_ABBR, c))
866 {
867 /*
868 * Handle abbreviations, like "\t" for TAB -- webb
869 */
870 curchr = backslash_trans(c);
871 }
872 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
873 curchr = toggle_Magic(c);
874 else
875 {
876 /*
877 * Next character can never be (made) magic?
878 * Then backslashing it won't do anything.
879 */
Bram Moolenaar071d4272004-06-13 20:20:40 +0000880 if (has_mbyte)
881 curchr = (*mb_ptr2char)(regparse + 1);
882 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000883 curchr = c;
884 }
885 break;
886 }
887
Bram Moolenaar071d4272004-06-13 20:20:40 +0000888 default:
889 if (has_mbyte)
890 curchr = (*mb_ptr2char)(regparse);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000891 }
892
893 return curchr;
894}
895
896/*
897 * Eat one lexed character. Do this in a way that we can undo it.
898 */
899 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100900skipchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000901{
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100902 // peekchr() eats a backslash, do the same here
Bram Moolenaar071d4272004-06-13 20:20:40 +0000903 if (*regparse == '\\')
904 prevchr_len = 1;
905 else
906 prevchr_len = 0;
907 if (regparse[prevchr_len] != NUL)
908 {
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000909 if (enc_utf8)
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100910 // exclude composing chars that mb_ptr2len does include
Bram Moolenaar8f5c5782007-11-29 20:27:21 +0000911 prevchr_len += utf_ptr2len(regparse + prevchr_len);
Bram Moolenaar362e1a32006-03-06 23:29:24 +0000912 else if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +0000913 prevchr_len += (*mb_ptr2len)(regparse + prevchr_len);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000914 else
Bram Moolenaar071d4272004-06-13 20:20:40 +0000915 ++prevchr_len;
916 }
917 regparse += prevchr_len;
918 prev_at_start = at_start;
919 at_start = FALSE;
920 prevprevchr = prevchr;
921 prevchr = curchr;
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100922 curchr = nextchr; // use previously unget char, or -1
Bram Moolenaar071d4272004-06-13 20:20:40 +0000923 nextchr = -1;
924}
925
926/*
927 * Skip a character while keeping the value of prev_at_start for at_start.
928 * prevchr and prevprevchr are also kept.
929 */
930 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100931skipchr_keepstart(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000932{
933 int as = prev_at_start;
934 int pr = prevchr;
935 int prpr = prevprevchr;
936
937 skipchr();
938 at_start = as;
939 prevchr = pr;
940 prevprevchr = prpr;
941}
942
/*
 * Return the next lexed character of the pattern and consume it.  This is
 * peekchr() followed by skipchr(); the lexing is needed because of magic and
 * friends.
 */
    static int
getchr(void)
{
    int     c;

    c = peekchr();
    skipchr();

    return c;
}
955
956/*
957 * put character back. Works only once!
958 */
959 static void
Bram Moolenaar05540972016-01-30 20:31:25 +0100960ungetchr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +0000961{
962 nextchr = curchr;
963 curchr = prevchr;
964 prevchr = prevprevchr;
965 at_start = prev_at_start;
966 prev_at_start = FALSE;
967
Bram Moolenaar63d9e732019-12-05 21:10:38 +0100968 // Backup regparse, so that it's at the same position as before the
969 // getchr().
Bram Moolenaar071d4272004-06-13 20:20:40 +0000970 regparse -= prevchr_len;
971}
972
973/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +0000974 * Get and return the value of the hex string at the current position.
975 * Return -1 if there is no valid hex number.
976 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000977 * blahblah\%x20asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +0000978 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000979 * The parameter controls the maximum number of input characters. This will be
980 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
981 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100982 static long
Bram Moolenaar05540972016-01-30 20:31:25 +0100983gethexchrs(int maxinputlen)
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000984{
Bram Moolenaar4c22a912017-11-02 22:29:38 +0100985 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000986 int c;
987 int i;
988
989 for (i = 0; i < maxinputlen; ++i)
990 {
991 c = regparse[0];
992 if (!vim_isxdigit(c))
993 break;
994 nr <<= 4;
995 nr |= hex2nr(c);
996 ++regparse;
997 }
998
999 if (i == 0)
1000 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001001 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001002}
1003
1004/*
Bram Moolenaar75eb1612013-05-29 18:45:11 +02001005 * Get and return the value of the decimal string immediately after the
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001006 * current position. Return -1 for invalid. Consumes all digits.
1007 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001008 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01001009getdecchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001010{
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001011 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001012 int c;
1013 int i;
1014
1015 for (i = 0; ; ++i)
1016 {
1017 c = regparse[0];
1018 if (c < '0' || c > '9')
1019 break;
1020 nr *= 10;
1021 nr += c - '0';
1022 ++regparse;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001023 curchr = -1; // no longer valid
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001024 }
1025
1026 if (i == 0)
1027 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001028 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001029}
1030
1031/*
1032 * get and return the value of the octal string immediately after the current
1033 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
1034 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
1035 * treat 8 or 9 as recognised characters. Position is updated:
1036 * blahblah\%o210asdf
Bram Moolenaarc9b4b052006-04-30 18:54:39 +00001037 * before-^ ^-after
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001038 */
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001039 static long
Bram Moolenaar05540972016-01-30 20:31:25 +01001040getoctchrs(void)
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001041{
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001042 long_u nr = 0;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001043 int c;
1044 int i;
1045
1046 for (i = 0; i < 3 && nr < 040; ++i)
1047 {
1048 c = regparse[0];
1049 if (c < '0' || c > '7')
1050 break;
1051 nr <<= 3;
1052 nr |= hex2nr(c);
1053 ++regparse;
1054 }
1055
1056 if (i == 0)
1057 return -1;
Bram Moolenaar4c22a912017-11-02 22:29:38 +01001058 return (long)nr;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001059}
1060
1061/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001062 * read_limits - Read two integers to be taken as a minimum and maximum.
1063 * If the first character is '-', then the range is reversed.
1064 * Should end with 'end'. If minval is missing, zero is default, if maxval is
1065 * missing, a very big number is the default.
1066 */
1067 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001068read_limits(long *minval, long *maxval)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001069{
1070 int reverse = FALSE;
1071 char_u *first_char;
1072 long tmp;
1073
1074 if (*regparse == '-')
1075 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001076 // Starts with '-', so reverse the range later
Bram Moolenaar071d4272004-06-13 20:20:40 +00001077 regparse++;
1078 reverse = TRUE;
1079 }
1080 first_char = regparse;
1081 *minval = getdigits(&regparse);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001082 if (*regparse == ',') // There is a comma
Bram Moolenaar071d4272004-06-13 20:20:40 +00001083 {
1084 if (vim_isdigit(*++regparse))
1085 *maxval = getdigits(&regparse);
1086 else
1087 *maxval = MAX_LIMIT;
1088 }
1089 else if (VIM_ISDIGIT(*first_char))
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001090 *maxval = *minval; // It was \{n} or \{-n}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001091 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001092 *maxval = MAX_LIMIT; // It was \{} or \{-}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001093 if (*regparse == '\\')
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001094 regparse++; // Allow either \{...} or \{...\}
Bram Moolenaardf177f62005-02-22 08:39:57 +00001095 if (*regparse != '}')
Bram Moolenaar1d423ef2022-01-02 21:26:16 +00001096 EMSG2_RET_FAIL(_(e_syntax_error_in_str_curlies),
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001097 reg_magic == MAGIC_ALL);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001098
1099 /*
1100 * Reverse the range if there was a '-', or make sure it is in the right
1101 * order otherwise.
1102 */
1103 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
1104 {
1105 tmp = *minval;
1106 *minval = *maxval;
1107 *maxval = tmp;
1108 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001109 skipchr(); // let's be friends with the lexer again
Bram Moolenaar071d4272004-06-13 20:20:40 +00001110 return OK;
1111}
1112
1113/*
1114 * vim_regexec and friends
1115 */
1116
1117/*
1118 * Global work variables for vim_regexec().
1119 */
1120
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001121static void cleanup_subexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001122#ifdef FEAT_SYN_HL
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001123static void cleanup_zsubexpr(void);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001124#endif
Bram Moolenaarbaaa7e92016-01-29 22:47:03 +01001125static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001126
1127/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001128 * Sometimes need to save a copy of a line. Since alloc()/free() is very
1129 * slow, we keep one allocated piece of memory and only re-allocate it when
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001130 * it's too small. It's freed in bt_regexec_both() when finished.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001131 */
Bram Moolenaard4210772008-01-02 14:35:30 +00001132static char_u *reg_tofree = NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001133static unsigned reg_tofreelen;
1134
1135/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001136 * Structure used to store the execution state of the regex engine.
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00001137 * Which ones are set depends on whether a single-line or multi-line match is
Bram Moolenaar071d4272004-06-13 20:20:40 +00001138 * done:
1139 * single-line multi-line
1140 * reg_match &regmatch_T NULL
1141 * reg_mmatch NULL &regmmatch_T
1142 * reg_startp reg_match->startp <invalid>
1143 * reg_endp reg_match->endp <invalid>
1144 * reg_startpos <invalid> reg_mmatch->startpos
1145 * reg_endpos <invalid> reg_mmatch->endpos
1146 * reg_win NULL window in which to search
Bram Moolenaar2f315ab2013-01-25 20:11:01 +01001147 * reg_buf curbuf buffer in which to search
Bram Moolenaar071d4272004-06-13 20:20:40 +00001148 * reg_firstlnum <invalid> first line in which to search
1149 * reg_maxline 0 last line nr
1150 * reg_line_lbr FALSE or TRUE FALSE
1151 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02001152typedef struct {
1153 regmatch_T *reg_match;
1154 regmmatch_T *reg_mmatch;
Bram Moolenaar01105b32022-11-26 11:47:10 +00001155
Bram Moolenaar6100d022016-10-02 16:51:57 +02001156 char_u **reg_startp;
1157 char_u **reg_endp;
1158 lpos_T *reg_startpos;
1159 lpos_T *reg_endpos;
Bram Moolenaar01105b32022-11-26 11:47:10 +00001160
Bram Moolenaar6100d022016-10-02 16:51:57 +02001161 win_T *reg_win;
1162 buf_T *reg_buf;
1163 linenr_T reg_firstlnum;
1164 linenr_T reg_maxline;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001165 int reg_line_lbr; // "\n" in string is line break
Bram Moolenaar6100d022016-10-02 16:51:57 +02001166
Bram Moolenaar0270f382018-07-17 05:43:58 +02001167 // The current match-position is stord in these variables:
1168 linenr_T lnum; // line number, relative to first line
1169 char_u *line; // start of current line
Bram Moolenaar64066b92021-11-17 18:22:56 +00001170 char_u *input; // current input, points into "line"
Bram Moolenaar0270f382018-07-17 05:43:58 +02001171
1172 int need_clear_subexpr; // subexpressions still need to be cleared
1173#ifdef FEAT_SYN_HL
1174 int need_clear_zsubexpr; // extmatch subexpressions still need to be
1175 // cleared
1176#endif
1177
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001178 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
1179 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
1180 // contains '\c' or '\C' the value is overruled.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001181 int reg_ic;
1182
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001183 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
1184 // flag in the regexp. Defaults to false, always.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001185 int reg_icombine;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001186
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001187 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
1188 // there is no maximum.
Bram Moolenaar6100d022016-10-02 16:51:57 +02001189 colnr_T reg_maxcol;
Bram Moolenaar0270f382018-07-17 05:43:58 +02001190
1191 // State for the NFA engine regexec.
1192 int nfa_has_zend; // NFA regexp \ze operator encountered.
1193 int nfa_has_backref; // NFA regexp \1 .. \9 encountered.
1194 int nfa_nsubexpr; // Number of sub expressions actually being used
1195 // during execution. 1 if only the whole match
1196 // (subexpr 0) is used.
1197 // listid is global, so that it increases on recursive calls to
1198 // nfa_regmatch(), which means we don't have to clear the lastlist field of
1199 // all the states.
1200 int nfa_listid;
1201 int nfa_alt_listid;
1202
1203#ifdef FEAT_SYN_HL
1204 int nfa_has_zsubexpr; // NFA regexp has \z( ), set zsubexpr.
1205#endif
Bram Moolenaar6100d022016-10-02 16:51:57 +02001206} regexec_T;
1207
1208static regexec_T rex;
1209static int rex_in_use = FALSE;
1210
Bram Moolenaar071d4272004-06-13 20:20:40 +00001211/*
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001212 * Return TRUE if character 'c' is included in 'iskeyword' option for
1213 * "reg_buf" buffer.
1214 */
1215 static int
1216reg_iswordc(int c)
1217{
1218 return vim_iswordc_buf(c, rex.reg_buf);
1219}
1220
#ifdef FEAT_EVAL
// TRUE when submatch() can be used
static int can_f_submatch = FALSE;

// Match information for reg_submatch().  A separate copy is needed for when
// the substitution string is an expression that contains a call to
// substitute() and submatch().
typedef struct {
    regmatch_T      *sm_match;
    regmmatch_T     *sm_mmatch;
    linenr_T        sm_firstlnum;
    linenr_T        sm_maxline;
    int             sm_line_lbr;
} regsubmatch_T;

// Can only be used when "can_f_submatch" is TRUE.
static regsubmatch_T rsm;
#endif
1237
// Bit flags for the "flags" argument of reg_getline_common().
typedef enum
{
    RGLF_LINE = 1 << 0,
    RGLF_LENGTH = 1 << 1
#ifdef FEAT_EVAL
    ,
    RGLF_SUBMATCH = 1 << 2
#endif
} reg_getline_flags_T;
1247
1248//
1249// common code for reg_getline(), reg_getline_len(), reg_getline_submatch() and
1250// reg_getline_submatch_len().
1251// the flags argument (which is a bitmask) controls what info is to be returned and whether
1252// or not submatch is in effect.
1253// note:
1254// submatch is available only if FEAT_EVAL is defined.
1255 static void
1256reg_getline_common(linenr_T lnum, reg_getline_flags_T flags, char_u **line, colnr_T *length)
1257{
1258 int get_line = flags & RGLF_LINE;
1259 int get_length = flags & RGLF_LENGTH;
1260 linenr_T firstlnum;
1261 linenr_T maxline;
1262
1263#ifdef FEAT_EVAL
1264 if (flags & RGLF_SUBMATCH)
1265 {
1266 firstlnum = rsm.sm_firstlnum + lnum;
1267 maxline = rsm.sm_maxline;
1268 }
1269 else
1270#endif
1271 {
1272 firstlnum = rex.reg_firstlnum + lnum;
1273 maxline = rex.reg_maxline;
1274 }
1275
1276 // when looking behind for a match/no-match lnum is negative. but we
1277 // can't go before line 1.
1278 if (firstlnum < 1)
1279 {
1280 if (get_line)
1281 *line = NULL;
1282 if (get_length)
1283 *length = 0;
1284
1285 return;
1286 }
1287
1288 if (lnum > maxline)
1289 {
1290 // must have matched the "\n" in the last line.
1291 if (get_line)
1292 *line = (char_u *)"";
1293 if (get_length)
1294 *length = 0;
1295
1296 return;
1297 }
1298
1299 if (get_line)
1300 *line = ml_get_buf(rex.reg_buf, firstlnum, FALSE);
1301 if (get_length)
1302 *length = ml_get_buf_len(rex.reg_buf, firstlnum);
1303}
1304
Bram Moolenaar221cd9f2019-01-31 15:34:40 +01001305/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001306 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
1307 */
1308 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001309reg_getline(linenr_T lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001310{
John Marriott82792db2024-05-12 00:07:17 +02001311 char_u *line;
1312
1313 reg_getline_common(lnum, RGLF_LINE, &line, NULL);
1314
1315 return line;
1316}
1317
1318/*
1319 * Get length of line "lnum", which is relative to "reg_firstlnum".
1320 */
1321 static colnr_T
1322reg_getline_len(linenr_T lnum)
1323{
1324 colnr_T length;
1325
1326 reg_getline_common(lnum, RGLF_LENGTH, NULL, &length);
1327
1328 return length;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001329}
1330
#ifdef FEAT_SYN_HL
// Workspace for the \z(...\) matches.
static char_u   *reg_startzp[NSUBEXP];      // beginning of each match
static char_u   *reg_endzp[NSUBEXP];        // end of each match
static lpos_T   reg_startzpos[NSUBEXP];     // idem, beginning pos
static lpos_T   reg_endzpos[NSUBEXP];       // idem, end pos
#endif

// TRUE if using multi-line regexp.
#define REG_MULTI       (rex.reg_match == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001340
Bram Moolenaar071d4272004-06-13 20:20:40 +00001341#ifdef FEAT_SYN_HL
Bram Moolenaar071d4272004-06-13 20:20:40 +00001342/*
1343 * Create a new extmatch and mark it as referenced once.
1344 */
1345 static reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001346make_extmatch(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001347{
1348 reg_extmatch_T *em;
1349
Bram Moolenaarc799fe22019-05-28 23:08:19 +02001350 em = ALLOC_CLEAR_ONE(reg_extmatch_T);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001351 if (em != NULL)
1352 em->refcnt = 1;
1353 return em;
1354}
1355
1356/*
1357 * Add a reference to an extmatch.
1358 */
1359 reg_extmatch_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01001360ref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001361{
1362 if (em != NULL)
1363 em->refcnt++;
1364 return em;
1365}
1366
1367/*
1368 * Remove a reference to an extmatch. If there are no references left, free
1369 * the info.
1370 */
1371 void
Bram Moolenaar05540972016-01-30 20:31:25 +01001372unref_extmatch(reg_extmatch_T *em)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001373{
1374 int i;
1375
1376 if (em != NULL && --em->refcnt <= 0)
1377 {
1378 for (i = 0; i < NSUBEXP; ++i)
1379 vim_free(em->matches[i]);
1380 vim_free(em);
1381 }
1382}
1383#endif
1384
1385/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00001386 * Get class of previous character.
1387 */
1388 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001389reg_prev_class(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001390{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001391 if (rex.input > rex.line)
1392 return mb_get_class_buf(rex.input - 1
Bram Moolenaara12a1612019-01-24 16:39:02 +01001393 - (*mb_head_off)(rex.line, rex.input - 1), rex.reg_buf);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001394 return -1;
1395}
Bram Moolenaarf7ff6e82014-03-23 15:13:05 +01001396
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001397/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001398 * Return TRUE if the current rex.input position matches the Visual area.
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001399 */
1400 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001401reg_match_visual(void)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001402{
1403 pos_T top, bot;
1404 linenr_T lnum;
1405 colnr_T col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001406 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001407 int mode;
1408 colnr_T start, end;
1409 colnr_T start2, end2;
1410 colnr_T cols;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001411 colnr_T curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001412
Bram Moolenaar679d66c2022-01-30 16:42:56 +00001413 // Check if the buffer is the current buffer and not using a string.
Bram Moolenaar44a4d942022-01-30 17:17:41 +00001414 if (rex.reg_buf != curbuf || VIsual.lnum == 0 || !REG_MULTI)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001415 return FALSE;
1416
1417 if (VIsual_active)
1418 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001419 if (LT_POS(VIsual, wp->w_cursor))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001420 {
1421 top = VIsual;
1422 bot = wp->w_cursor;
1423 }
1424 else
1425 {
1426 top = wp->w_cursor;
1427 bot = VIsual;
1428 }
1429 mode = VIsual_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001430 curswant = wp->w_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001431 }
1432 else
1433 {
Bram Moolenaarb5aedf32017-03-12 18:23:53 +01001434 if (LT_POS(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end))
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001435 {
1436 top = curbuf->b_visual.vi_start;
1437 bot = curbuf->b_visual.vi_end;
1438 }
1439 else
1440 {
1441 top = curbuf->b_visual.vi_end;
1442 bot = curbuf->b_visual.vi_start;
1443 }
zeertzjqe7102202024-02-13 20:32:04 +01001444 // a substitute command may have removed some lines
Christian Brabandt7c71db32024-01-22 20:12:34 +01001445 if (bot.lnum > curbuf->b_ml.ml_line_count)
1446 bot.lnum = curbuf->b_ml.ml_line_count;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001447 mode = curbuf->b_visual.vi_mode;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001448 curswant = curbuf->b_visual.vi_curswant;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001449 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001450 lnum = rex.lnum + rex.reg_firstlnum;
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001451 if (lnum < top.lnum || lnum > bot.lnum)
1452 return FALSE;
1453
Bram Moolenaar4c13e5e2021-12-30 14:49:43 +00001454 col = (colnr_T)(rex.input - rex.line);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001455 if (mode == 'v')
1456 {
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001457 if ((lnum == top.lnum && col < top.col)
1458 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e')))
1459 return FALSE;
1460 }
1461 else if (mode == Ctrl_V)
1462 {
1463 getvvcol(wp, &top, &start, NULL, &end);
1464 getvvcol(wp, &bot, &start2, NULL, &end2);
1465 if (start2 < start)
1466 start = start2;
1467 if (end2 > end)
1468 end = end2;
Bram Moolenaare71c0eb2021-05-30 16:43:11 +02001469 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL)
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001470 end = MAXCOL;
Bram Moolenaar4c13e5e2021-12-30 14:49:43 +00001471
1472 // getvvcol() flushes rex.line, need to get it again
1473 rex.line = reg_getline(rex.lnum);
1474 rex.input = rex.line + col;
1475
Bram Moolenaar7f9969c2022-07-25 18:13:54 +01001476 cols = win_linetabsize(wp, rex.reg_firstlnum + rex.lnum, rex.line, col);
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001477 if (cols < start || cols > end - (*p_sel == 'e'))
1478 return FALSE;
1479 }
1480 return TRUE;
1481}
Bram Moolenaardacd7de2013-06-04 18:28:48 +02001482
Bram Moolenaar071d4272004-06-13 20:20:40 +00001483/*
1484 * Check the regexp program for its magic number.
1485 * Return TRUE if it's wrong.
1486 */
1487 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001488prog_magic_wrong(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001489{
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001490 regprog_T *prog;
1491
Bram Moolenaar6100d022016-10-02 16:51:57 +02001492 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001493 if (prog->engine == &nfa_regengine)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001494 // For NFA matcher we don't check the magic
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02001495 return FALSE;
1496
1497 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001498 {
RestorerZ68ebcee2023-05-31 17:12:14 +01001499 iemsg(e_corrupted_regexp_program);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001500 return TRUE;
1501 }
1502 return FALSE;
1503}
1504
1505/*
1506 * Cleanup the subexpressions, if this wasn't done yet.
1507 * This construction is used to clear the subexpressions only when they are
1508 * used (to increase speed).
1509 */
1510 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001511cleanup_subexpr(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001512{
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00001513 if (!rex.need_clear_subexpr)
1514 return;
1515
1516 if (REG_MULTI)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001517 {
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00001518 // Use 0xff to set lnum to -1
1519 vim_memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
1520 vim_memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001521 }
Yegappan Lakshmananf97a2952023-01-18 18:17:48 +00001522 else
1523 {
1524 vim_memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
1525 vim_memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
1526 }
1527 rex.need_clear_subexpr = FALSE;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001528}
1529
#ifdef FEAT_SYN_HL
/*
 * Clear the \z(...\) subexpressions, unless this was already done.
 */
    static void
cleanup_zsubexpr(void)
{
    if (rex.need_clear_zsubexpr)
    {
        if (REG_MULTI)
        {
            // Use 0xff to set lnum to -1
            vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
            vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
        }
        else
        {
            // Set all the pointers to zero.
            vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
            vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
        }
        rex.need_clear_zsubexpr = FALSE;
    }
}
#endif
1551
1552/*
Bram Moolenaar0270f382018-07-17 05:43:58 +02001553 * Advance rex.lnum, rex.line and rex.input to the next line.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001554 */
1555 static void
Bram Moolenaar05540972016-01-30 20:31:25 +01001556reg_nextline(void)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001557{
Bram Moolenaar0270f382018-07-17 05:43:58 +02001558 rex.line = reg_getline(++rex.lnum);
1559 rex.input = rex.line;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001560 fast_breakcheck();
1561}
1562
1563/*
Bram Moolenaar580abea2013-06-14 20:31:28 +02001564 * Check whether a backreference matches.
1565 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001566 * If "bytelen" is not NULL, it is set to the byte length of the match in the
1567 * last line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001568 */
1569 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001570match_with_backref(
1571 linenr_T start_lnum,
1572 colnr_T start_col,
1573 linenr_T end_lnum,
1574 colnr_T end_col,
1575 int *bytelen)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001576{
1577 linenr_T clnum = start_lnum;
1578 colnr_T ccol = start_col;
1579 int len;
1580 char_u *p;
1581
1582 if (bytelen != NULL)
1583 *bytelen = 0;
1584 for (;;)
1585 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001586 // Since getting one line may invalidate the other, need to make copy.
1587 // Slow!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001588 if (rex.line != reg_tofree)
Bram Moolenaar580abea2013-06-14 20:31:28 +02001589 {
Bram Moolenaar0270f382018-07-17 05:43:58 +02001590 len = (int)STRLEN(rex.line);
Bram Moolenaar580abea2013-06-14 20:31:28 +02001591 if (reg_tofree == NULL || len >= (int)reg_tofreelen)
1592 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001593 len += 50; // get some extra
Bram Moolenaar580abea2013-06-14 20:31:28 +02001594 vim_free(reg_tofree);
1595 reg_tofree = alloc(len);
1596 if (reg_tofree == NULL)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001597 return RA_FAIL; // out of memory!
Bram Moolenaar580abea2013-06-14 20:31:28 +02001598 reg_tofreelen = len;
1599 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02001600 STRCPY(reg_tofree, rex.line);
1601 rex.input = reg_tofree + (rex.input - rex.line);
1602 rex.line = reg_tofree;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001603 }
1604
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001605 // Get the line to compare with.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001606 p = reg_getline(clnum);
1607 if (clnum == end_lnum)
1608 len = end_col - ccol;
1609 else
John Marriott82792db2024-05-12 00:07:17 +02001610 len = (int)reg_getline_len(clnum) - ccol;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001611
Bram Moolenaar0270f382018-07-17 05:43:58 +02001612 if (cstrncmp(p + ccol, rex.input, &len) != 0)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001613 return RA_NOMATCH; // doesn't match
Bram Moolenaar580abea2013-06-14 20:31:28 +02001614 if (bytelen != NULL)
1615 *bytelen += len;
1616 if (clnum == end_lnum)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001617 break; // match and at end!
Bram Moolenaar0270f382018-07-17 05:43:58 +02001618 if (rex.lnum >= rex.reg_maxline)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001619 return RA_NOMATCH; // text too short
Bram Moolenaar580abea2013-06-14 20:31:28 +02001620
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001621 // Advance to next line.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001622 reg_nextline();
Bram Moolenaar438ee5b2013-11-21 17:13:00 +01001623 if (bytelen != NULL)
1624 *bytelen = 0;
Bram Moolenaar580abea2013-06-14 20:31:28 +02001625 ++clnum;
1626 ccol = 0;
1627 if (got_int)
1628 return RA_FAIL;
1629 }
1630
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001631 // found a match! Note that rex.line may now point to a copy of the line,
1632 // that should not matter.
Bram Moolenaar580abea2013-06-14 20:31:28 +02001633 return RA_MATCH;
1634}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001635
Bram Moolenaarfb031402014-09-09 17:18:49 +02001636/*
1637 * Used in a place where no * or \+ can follow.
1638 */
1639 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001640re_mult_next(char *what)
Bram Moolenaarfb031402014-09-09 17:18:49 +02001641{
1642 if (re_multi_type(peekchr()) == MULTI_MULT)
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001643 {
Bram Moolenaard82a47d2022-01-05 20:24:39 +00001644 semsg(_(e_nfa_regexp_cannot_repeat_str), what);
Bram Moolenaar1be45b22019-01-14 22:46:15 +01001645 rc_did_emsg = TRUE;
1646 return FAIL;
1647 }
Bram Moolenaarfb031402014-09-09 17:18:49 +02001648 return OK;
1649}
1650
// One decomposition: base character "a", optionally followed by "b" and "c".
// Unused positions are zero.
typedef struct
{
    int a, b, c;
} decomp_T;


// Decompositions for 0xfb20 - 0xfb4f, indexed by "character - 0xfb20".
// Code points without a decomposition map to themselves.
static decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2, 0, 0},		// 0xfb20 alt ayin
    {0x5d0, 0, 0},		// 0xfb21 alt alef
    {0x5d3, 0, 0},		// 0xfb22 alt dalet
    {0x5d4, 0, 0},		// 0xfb23 alt he
    {0x5db, 0, 0},		// 0xfb24 alt kaf
    {0x5dc, 0, 0},		// 0xfb25 alt lamed
    {0x5dd, 0, 0},		// 0xfb26 alt mem-sofit
    {0x5e8, 0, 0},		// 0xfb27 alt resh
    {0x5ea, 0, 0},		// 0xfb28 alt tav
    {'+', 0, 0},		// 0xfb29 alt plus
    {0x5e9, 0x5c1, 0},		// 0xfb2a shin+shin-dot
    {0x5e9, 0x5c2, 0},		// 0xfb2b shin+sin-dot
    {0x5e9, 0x5c1, 0x5bc},	// 0xfb2c shin+shin-dot+dagesh
    {0x5e9, 0x5c2, 0x5bc},	// 0xfb2d shin+sin-dot+dagesh
    {0x5d0, 0x5b7, 0},		// 0xfb2e alef+patah
    {0x5d0, 0x5b8, 0},		// 0xfb2f alef+qamats
    {0x5d0, 0x5bc, 0},		// 0xfb30 alef+mapiq (was 0x5b4, hiriq)
    {0x5d1, 0x5bc, 0},		// 0xfb31 bet+dagesh
    {0x5d2, 0x5bc, 0},		// 0xfb32 gimel+dagesh
    {0x5d3, 0x5bc, 0},		// 0xfb33 dalet+dagesh
    {0x5d4, 0x5bc, 0},		// 0xfb34 he+dagesh
    {0x5d5, 0x5bc, 0},		// 0xfb35 vav+dagesh
    {0x5d6, 0x5bc, 0},		// 0xfb36 zayin+dagesh
    {0xfb37, 0, 0},		// 0xfb37 -- UNUSED
    {0x5d8, 0x5bc, 0},		// 0xfb38 tet+dagesh
    {0x5d9, 0x5bc, 0},		// 0xfb39 yud+dagesh
    {0x5da, 0x5bc, 0},		// 0xfb3a kaf sofit+dagesh
    {0x5db, 0x5bc, 0},		// 0xfb3b kaf+dagesh
    {0x5dc, 0x5bc, 0},		// 0xfb3c lamed+dagesh
    {0xfb3d, 0, 0},		// 0xfb3d -- UNUSED
    {0x5de, 0x5bc, 0},		// 0xfb3e mem+dagesh
    {0xfb3f, 0, 0},		// 0xfb3f -- UNUSED
    {0x5e0, 0x5bc, 0},		// 0xfb40 nun+dagesh
    {0x5e1, 0x5bc, 0},		// 0xfb41 samech+dagesh
    {0xfb42, 0, 0},		// 0xfb42 -- UNUSED
    {0x5e3, 0x5bc, 0},		// 0xfb43 pe sofit+dagesh
    {0x5e4, 0x5bc, 0},		// 0xfb44 pe+dagesh
    {0xfb45, 0, 0},		// 0xfb45 -- UNUSED
    {0x5e6, 0x5bc, 0},		// 0xfb46 tsadi+dagesh
    {0x5e7, 0x5bc, 0},		// 0xfb47 qof+dagesh
    {0x5e8, 0x5bc, 0},		// 0xfb48 resh+dagesh
    {0x5e9, 0x5bc, 0},		// 0xfb49 shin+dagesh
    {0x5ea, 0x5bc, 0},		// 0xfb4a tav+dagesh
    {0x5d5, 0x5b9, 0},		// 0xfb4b vav+holam
    {0x5d1, 0x5bf, 0},		// 0xfb4c bet+rafe
    {0x5db, 0x5bf, 0},		// 0xfb4d kaf+rafe
    {0x5e4, 0x5bf, 0},		// 0xfb4e pe+rafe
    {0x5d0, 0x5dc, 0}		// 0xfb4f alef-lamed
};

/*
 * Decompose character "c" using decomp_table[].
 * "*c1" is set to the base character, "*c2" and "*c3" to the characters that
 * follow it, or zero when there are none.
 * A character outside of 0xfb20 - 0xfb4f is returned unmodified in "*c1", with
 * "*c2" and "*c3" set to zero.
 */
    static void
mb_decompose(int c, int *c1, int *c2, int *c3)
{
    decomp_T	*entry;

    if (c < 0xfb20 || c > 0xfb4f)
    {
	// Not in the table: the character is its own base.
	*c1 = c;
	*c2 = 0;
	*c3 = 0;
	return;
    }

    entry = &decomp_table[c - 0xfb20];
    *c1 = entry->a;
    *c2 = entry->b;
    *c3 = entry->c;
}
Bram Moolenaar071d4272004-06-13 20:20:40 +00001728
1729/*
Bram Moolenaar6100d022016-10-02 16:51:57 +02001730 * Compare two strings, ignore case if rex.reg_ic set.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001731 * Return 0 if strings match, non-zero otherwise.
Christian Brabandt22e8e122024-07-30 20:39:18 +02001732 * Correct the length "*n" when composing characters are ignored
1733 * or for utf8 when both utf codepoints are considered equal because of
1734 * case-folding but have different length (e.g. 's' and 'Å¿')
Bram Moolenaar071d4272004-06-13 20:20:40 +00001735 */
1736 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01001737cstrncmp(char_u *s1, char_u *s2, int *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001738{
1739 int result;
1740
Bram Moolenaar6100d022016-10-02 16:51:57 +02001741 if (!rex.reg_ic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001742 result = STRNCMP(s1, s2, *n);
Christian Brabandt22e8e122024-07-30 20:39:18 +02001743 else if (enc_utf8)
1744 {
1745 char_u *p = s1;
zeertzjqe8feaa32024-08-01 22:48:53 +02001746 int n2 = 0;
Christian Brabandt22e8e122024-07-30 20:39:18 +02001747 int n1 = *n;
1748 // count the number of characters for byte-length of s1
1749 while (n1 > 0 && *p != NUL)
1750 {
1751 n1 -= mb_ptr2len(s1);
1752 MB_PTR_ADV(p);
1753 n2++;
1754 }
1755 // count the number of bytes to advance the same number of chars for s2
1756 p = s2;
1757 while (n2-- > 0 && *p != NUL)
1758 MB_PTR_ADV(p);
1759
1760 n2 = p - s2;
1761
1762 result = MB_STRNICMP2(s1, s2, *n, n2);
zeertzjqe8feaa32024-08-01 22:48:53 +02001763 if (result == 0 && n2 < *n)
Christian Brabandt22e8e122024-07-30 20:39:18 +02001764 *n = n2;
1765 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001766 else
1767 result = MB_STRNICMP(s1, s2, *n);
1768
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001769 // if it failed and it's utf8 and we want to combineignore:
Bram Moolenaar6100d022016-10-02 16:51:57 +02001770 if (result != 0 && enc_utf8 && rex.reg_icombine)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001771 {
1772 char_u *str1, *str2;
1773 int c1, c2, c11, c12;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001774 int junk;
1775
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001776 // we have to handle the strcmp ourselves, since it is necessary to
1777 // deal with the composing characters by ignoring them:
Bram Moolenaar071d4272004-06-13 20:20:40 +00001778 str1 = s1;
1779 str2 = s2;
1780 c1 = c2 = 0;
Bram Moolenaarcafda4f2005-09-06 19:25:11 +00001781 while ((int)(str1 - s1) < *n)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001782 {
1783 c1 = mb_ptr2char_adv(&str1);
1784 c2 = mb_ptr2char_adv(&str2);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001785
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001786 // Decompose the character if necessary, into 'base' characters.
1787 // Currently hard-coded for Hebrew, Arabic to be done...
Bram Moolenaar6100d022016-10-02 16:51:57 +02001788 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001789 {
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02001790 // decomposition necessary?
Bram Moolenaar071d4272004-06-13 20:20:40 +00001791 mb_decompose(c1, &c11, &junk, &junk);
1792 mb_decompose(c2, &c12, &junk, &junk);
1793 c1 = c11;
1794 c2 = c12;
Bram Moolenaar6100d022016-10-02 16:51:57 +02001795 if (c11 != c12
1796 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12)))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001797 break;
1798 }
1799 }
1800 result = c2 - c1;
1801 if (result == 0)
1802 *n = (int)(str2 - s2);
1803 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001804
1805 return result;
1806}
1807
1808/*
1809 * cstrchr: This function is used a lot for simple searches, keep it fast!
1810 */
1811 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001812cstrchr(char_u *s, int c)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001813{
1814 char_u *p;
Christian Brabandt22e8e122024-07-30 20:39:18 +02001815 int cc, lc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001816
Bram Moolenaara12a1612019-01-24 16:39:02 +01001817 if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001818 return vim_strchr(s, c);
1819
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001820 // tolower() and toupper() can be slow, comparing twice should be a lot
1821 // faster (esp. when using MS Visual C++!).
1822 // For UTF-8 need to use folded case.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001823 if (enc_utf8 && c > 0x80)
Christian Brabandt22e8e122024-07-30 20:39:18 +02001824 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00001825 cc = utf_fold(c);
Christian Brabandt22e8e122024-07-30 20:39:18 +02001826 lc = cc;
1827 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001828 else
Christian Brabandt22e8e122024-07-30 20:39:18 +02001829 if (MB_ISUPPER(c))
1830 {
1831 cc = MB_TOLOWER(c);
1832 lc = cc;
1833 }
1834 else if (MB_ISLOWER(c))
1835 {
1836 cc = MB_TOUPPER(c);
1837 lc = c;
1838 }
1839 else
1840 return vim_strchr(s, c);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001841
Bram Moolenaar071d4272004-06-13 20:20:40 +00001842 if (has_mbyte)
1843 {
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001844 for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001845 {
Christian Brabandt22e8e122024-07-30 20:39:18 +02001846 int uc = utf_ptr2char(p);
1847 if (enc_utf8 && (c > 0x80 || uc > 0x80))
Bram Moolenaar071d4272004-06-13 20:20:40 +00001848 {
Bram Moolenaarf5094052022-07-29 16:22:25 +01001849 // Do not match an illegal byte. E.g. 0xff matches 0xc3 0xbf,
1850 // not 0xff.
Christian Brabandt22e8e122024-07-30 20:39:18 +02001851 // compare with lower case of the character
1852 if ((uc < 0x80 || uc != *p) && utf_fold(uc) == lc)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001853 return p;
1854 }
1855 else if (*p == c || *p == cc)
1856 return p;
1857 }
1858 }
1859 else
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001860 // Faster version for when there are no multi-byte characters.
Bram Moolenaar071d4272004-06-13 20:20:40 +00001861 for (p = s; *p != NUL; ++p)
1862 if (*p == c || *p == cc)
1863 return p;
1864
1865 return NULL;
1866}
1867
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001868////////////////////////////////////////////////////////////////
1869// regsub stuff //
1870////////////////////////////////////////////////////////////////
Bram Moolenaar071d4272004-06-13 20:20:40 +00001871
Yee Cheng Chind25021c2023-09-18 19:51:56 +02001872typedef void (*fptr_T)(int *, int);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001873
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01001874static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, int destlen, int flags);
Bram Moolenaar071d4272004-06-13 20:20:40 +00001875
/*
 * Store the upper-case form of character "c", as given by MB_TOUPPER(), in
 * "*d".
 */
    static void
do_upper(int *d, int c)
{
    int	    upper = MB_TOUPPER(c);

    *d = upper;
}
1881
/*
 * Store the lower-case form of character "c", as given by MB_TOLOWER(), in
 * "*d".
 */
    static void
do_lower(int *d, int c)
{
    int	    lower = MB_TOLOWER(c);

    *d = lower;
}
1887
1888/*
1889 * regtilde(): Replace tildes in the pattern by the old pattern.
1890 *
1891 * Short explanation of the tilde: It stands for the previous replacement
1892 * pattern. If that previous pattern also contains a ~ we should go back a
1893 * step further... But we insert the previous pattern into the current one
1894 * and remember that.
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00001895 * This still does not handle the case where "magic" changes. So require the
1896 * user to keep his hands off of "magic".
Bram Moolenaar071d4272004-06-13 20:20:40 +00001897 *
1898 * The tildes are parsed once before the first call to vim_regsub().
1899 */
1900 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01001901regtilde(char_u *source, int magic)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001902{
1903 char_u *newsub = source;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001904 char_u *p;
John Marriott82792db2024-05-12 00:07:17 +02001905 size_t newsublen = 0;
1906 char_u tilde[3] = {'~', NUL, NUL};
1907 size_t tildelen = 1;
1908 int error = FALSE;
1909
1910 if (!magic)
1911 {
1912 tilde[0] = '\\';
1913 tilde[1] = '~';
1914 tilde[2] = NUL;
1915 tildelen = 2;
1916 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001917
1918 for (p = newsub; *p; ++p)
1919 {
John Marriott82792db2024-05-12 00:07:17 +02001920 if (STRNCMP(p, tilde, tildelen) == 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001921 {
John Marriott82792db2024-05-12 00:07:17 +02001922 size_t prefixlen = p - newsub; // not including the tilde
1923 char_u *postfix = p + tildelen;
1924 size_t postfixlen;
1925 size_t tmpsublen;
1926
1927 if (newsublen == 0)
1928 newsublen = STRLEN(newsub);
1929 newsublen -= tildelen;
1930 postfixlen = newsublen - prefixlen;
1931 tmpsublen = prefixlen + reg_prev_sublen + postfixlen;
1932
1933 if (tmpsublen > 0 && reg_prev_sub != NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001934 {
John Marriott82792db2024-05-12 00:07:17 +02001935 char_u *tmpsub;
1936
Bram Moolenaarab9a2d82023-05-09 21:15:30 +01001937 // Avoid making the text longer than MAXCOL, it will cause
1938 // trouble at some point.
John Marriott82792db2024-05-12 00:07:17 +02001939 if (tmpsublen > MAXCOL)
Bram Moolenaarab9a2d82023-05-09 21:15:30 +01001940 {
1941 emsg(_(e_resulting_text_too_long));
John Marriott82792db2024-05-12 00:07:17 +02001942 error = TRUE;
Bram Moolenaarab9a2d82023-05-09 21:15:30 +01001943 break;
1944 }
1945
John Marriott82792db2024-05-12 00:07:17 +02001946 tmpsub = alloc(tmpsublen + 1);
1947 if (tmpsub == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00001948 {
John Marriott82792db2024-05-12 00:07:17 +02001949 emsg(_(e_out_of_memory));
1950 error = TRUE;
1951 break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001952 }
John Marriott82792db2024-05-12 00:07:17 +02001953
1954 // copy prefix
1955 mch_memmove(tmpsub, newsub, prefixlen);
1956 // interpret tilde
1957 mch_memmove(tmpsub + prefixlen, reg_prev_sub, reg_prev_sublen);
1958 // copy postfix
1959 STRCPY(tmpsub + prefixlen + reg_prev_sublen, postfix);
1960
1961 if (newsub != source) // allocated newsub before
1962 vim_free(newsub);
1963 newsub = tmpsub;
1964 newsublen = tmpsublen;
1965 p = newsub + prefixlen + reg_prev_sublen;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001966 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001967 else
John Marriott82792db2024-05-12 00:07:17 +02001968 mch_memmove(p, postfix, postfixlen + 1); // remove the tilde (+1 for the NUL)
1969
Bram Moolenaar071d4272004-06-13 20:20:40 +00001970 --p;
1971 }
1972 else
1973 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01001974 if (*p == '\\' && p[1]) // skip escaped characters
Bram Moolenaar071d4272004-06-13 20:20:40 +00001975 ++p;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001976 if (has_mbyte)
Bram Moolenaar0fa313a2005-08-10 21:07:57 +00001977 p += (*mb_ptr2len)(p) - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00001978 }
1979 }
1980
John Marriott82792db2024-05-12 00:07:17 +02001981 if (error)
1982 {
1983 if (newsub != source)
1984 vim_free(newsub);
1985 return source;
1986 }
1987
Bram Moolenaar32acf1f2022-07-07 22:20:31 +01001988 // Store a copy of newsub in reg_prev_sub. It is always allocated,
1989 // because recursive calls may make the returned string invalid.
John Marriott82792db2024-05-12 00:07:17 +02001990 // Only store it if there something to store.
1991 newsublen = p - newsub;
1992 if (newsublen == 0)
1993 VIM_CLEAR(reg_prev_sub);
1994 else
1995 {
1996 vim_free(reg_prev_sub);
1997 reg_prev_sub = vim_strnsave(newsub, newsublen);
1998 }
1999
2000 if (reg_prev_sub == NULL)
2001 reg_prev_sublen = 0;
2002 else
2003 reg_prev_sublen = newsublen;
Bram Moolenaar32acf1f2022-07-07 22:20:31 +01002004
Bram Moolenaar071d4272004-06-13 20:20:40 +00002005 return newsub;
2006}
2007
2008#ifdef FEAT_EVAL
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002009
2010/*
Bram Moolenaarb0745b22019-11-09 22:28:11 +01002011 * Put the submatches in "argv[argskip]" which is a list passed into
2012 * call_func() by vim_regsub_both().
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002013 */
2014 static int
zeertzjq48db5da2022-09-16 12:10:03 +01002015fill_submatch_list(int argc UNUSED, typval_T *argv, int argskip, ufunc_T *fp)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002016{
2017 listitem_T *li;
2018 int i;
2019 char_u *s;
Bram Moolenaarb0745b22019-11-09 22:28:11 +01002020 typval_T *listarg = argv + argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002021
zeertzjqabd58d82022-09-16 16:06:32 +01002022 if (!has_varargs(fp) && fp->uf_args.ga_len <= argskip)
Bram Moolenaarb0745b22019-11-09 22:28:11 +01002023 // called function doesn't take a submatches argument
2024 return argskip;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002025
Bram Moolenaarb0745b22019-11-09 22:28:11 +01002026 // Relies on sl_list to be the first item in staticList10_T.
2027 init_static_list((staticList10_T *)(listarg->vval.v_list));
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002028
Bram Moolenaarb0745b22019-11-09 22:28:11 +01002029 // There are always 10 list items in staticList10_T.
2030 li = listarg->vval.v_list->lv_first;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002031 for (i = 0; i < 10; ++i)
2032 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002033 s = rsm.sm_match->startp[i];
2034 if (s == NULL || rsm.sm_match->endp[i] == NULL)
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002035 s = NULL;
2036 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002037 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002038 li->li_tv.v_type = VAR_STRING;
2039 li->li_tv.vval.v_string = s;
2040 li = li->li_next;
2041 }
Bram Moolenaarb0745b22019-11-09 22:28:11 +01002042 return argskip + 1;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002043}
2044
2045 static void
2046clear_submatch_list(staticList10_T *sl)
2047{
2048 int i;
2049
2050 for (i = 0; i < 10; ++i)
2051 vim_free(sl->sl_items[i].li_tv.vval.v_string);
2052}
Bram Moolenaarb005cd82019-09-04 15:54:55 +02002053#endif
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002054
Bram Moolenaar071d4272004-06-13 20:20:40 +00002055/*
2056 * vim_regsub() - perform substitutions after a vim_regexec() or
2057 * vim_regexec_multi() match.
2058 *
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002059 * If "flags" has REGSUB_COPY really copy into "dest[destlen]".
dundargocc57b5bc2022-11-02 13:30:51 +00002060 * Otherwise nothing is copied, only compute the length of the result.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002061 *
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002062 * If "flags" has REGSUB_MAGIC then behave like 'magic' is set.
2063 *
2064 * If "flags" has REGSUB_BACKSLASH a backslash will be removed later, need to
2065 * double them to keep them, and insert a backslash before a CR to avoid it
2066 * being replaced with a line break later.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002067 *
2068 * Note: The matched text must not change between the call of
2069 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
2070 * references invalid!
2071 *
2072 * Returns the size of the replacement, including terminating NUL.
2073 */
2074 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002075vim_regsub(
2076 regmatch_T *rmp,
2077 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002078 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01002079 char_u *dest,
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002080 int destlen,
2081 int flags)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002082{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002083 int result;
2084 regexec_T rex_save;
2085 int rex_in_use_save = rex_in_use;
2086
2087 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002088 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002089 rex_save = rex;
2090 rex_in_use = TRUE;
2091
2092 rex.reg_match = rmp;
2093 rex.reg_mmatch = NULL;
2094 rex.reg_maxline = 0;
2095 rex.reg_buf = curbuf;
2096 rex.reg_line_lbr = TRUE;
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002097 result = vim_regsub_both(source, expr, dest, destlen, flags);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002098
2099 rex_in_use = rex_in_use_save;
2100 if (rex_in_use)
2101 rex = rex_save;
2102
2103 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002104}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002105
2106 int
Bram Moolenaar05540972016-01-30 20:31:25 +01002107vim_regsub_multi(
2108 regmmatch_T *rmp,
2109 linenr_T lnum,
2110 char_u *source,
2111 char_u *dest,
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002112 int destlen,
2113 int flags)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002114{
Bram Moolenaar6100d022016-10-02 16:51:57 +02002115 int result;
2116 regexec_T rex_save;
2117 int rex_in_use_save = rex_in_use;
2118
2119 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002120 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002121 rex_save = rex;
2122 rex_in_use = TRUE;
2123
2124 rex.reg_match = NULL;
2125 rex.reg_mmatch = rmp;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002126 rex.reg_buf = curbuf; // always works on the current buffer!
Bram Moolenaar6100d022016-10-02 16:51:57 +02002127 rex.reg_firstlnum = lnum;
2128 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
2129 rex.reg_line_lbr = FALSE;
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002130 result = vim_regsub_both(source, NULL, dest, destlen, flags);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002131
2132 rex_in_use = rex_in_use_save;
2133 if (rex_in_use)
2134 rex = rex_save;
2135
2136 return result;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002137}
2138
#if defined(FEAT_EVAL) || defined(PROTO)
// Nesting deeper than a couple of levels is probably a mistake.
# define MAX_REGSUB_NESTING 4
// Result of evaluating the expression, one entry for every nesting level.
static char_u *eval_result[MAX_REGSUB_NESTING] = {NULL, NULL, NULL, NULL};

# if defined(EXITFREE) || defined(PROTO)
/*
 * Free every string that is still kept in eval_result[].
 */
    void
free_resub_eval_result(void)
{
    int	    level = 0;

    while (level < MAX_REGSUB_NESTING)
    {
	VIM_CLEAR(eval_result[level]);
	++level;
    }
}
# endif
#endif
2155
Bram Moolenaar071d4272004-06-13 20:20:40 +00002156 static int
Bram Moolenaar05540972016-01-30 20:31:25 +01002157vim_regsub_both(
2158 char_u *source,
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002159 typval_T *expr,
Bram Moolenaar05540972016-01-30 20:31:25 +01002160 char_u *dest,
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002161 int destlen,
2162 int flags)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002163{
2164 char_u *src;
2165 char_u *dst;
2166 char_u *s;
2167 int c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002168 int cc;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002169 int no = -1;
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002170 fptr_T func_all = (fptr_T)NULL;
2171 fptr_T func_one = (fptr_T)NULL;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002172 linenr_T clnum = 0; // init for GCC
2173 int len = 0; // init for GCC
Bram Moolenaar071d4272004-06-13 20:20:40 +00002174#ifdef FEAT_EVAL
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002175 static int nesting = 0;
2176 int nested;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002177#endif
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002178 int copy = flags & REGSUB_COPY;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002179
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002180 // Be paranoid...
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002181 if ((source == NULL && expr == NULL) || dest == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002182 {
RestorerZ68ebcee2023-05-31 17:12:14 +01002183 iemsg(e_null_argument);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002184 return 0;
2185 }
2186 if (prog_magic_wrong())
2187 return 0;
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002188#ifdef FEAT_EVAL
2189 if (nesting == MAX_REGSUB_NESTING)
2190 {
2191 emsg(_(e_substitute_nesting_too_deep));
2192 return 0;
2193 }
2194 nested = nesting;
2195#endif
Bram Moolenaar071d4272004-06-13 20:20:40 +00002196 src = source;
2197 dst = dest;
2198
2199 /*
2200 * When the substitute part starts with "\=" evaluate it as an expression.
2201 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002202 if (expr != NULL || (source[0] == '\\' && source[1] == '='))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002203 {
2204#ifdef FEAT_EVAL
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002205 // To make sure that the length doesn't change between checking the
2206 // length and copying the string, and to speed up things, the
Paul Ollis65745772022-06-05 16:55:54 +01002207 // resulting string is saved from the call with
2208 // "flags & REGSUB_COPY" == 0 to the call with
2209 // "flags & REGSUB_COPY" != 0.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002210 if (copy)
2211 {
John Marriott82792db2024-05-12 00:07:17 +02002212 if (eval_result[nested] != NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002213 {
John Marriott82792db2024-05-12 00:07:17 +02002214 int eval_len = (int)STRLEN(eval_result[nested]);
2215
2216 if (eval_len < destlen)
2217 {
2218 STRCPY(dest, eval_result[nested]);
2219 dst += eval_len;
2220 VIM_CLEAR(eval_result[nested]);
2221 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002222 }
2223 }
2224 else
2225 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002226 int prev_can_f_submatch = can_f_submatch;
2227 regsubmatch_T rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002228
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002229 VIM_CLEAR(eval_result[nested]);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002230
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002231 // The expression may contain substitute(), which calls us
2232 // recursively. Make sure submatch() gets the text from the first
2233 // level.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002234 if (can_f_submatch)
2235 rsm_save = rsm;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002236 can_f_submatch = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002237 rsm.sm_match = rex.reg_match;
2238 rsm.sm_mmatch = rex.reg_mmatch;
2239 rsm.sm_firstlnum = rex.reg_firstlnum;
2240 rsm.sm_maxline = rex.reg_maxline;
2241 rsm.sm_line_lbr = rex.reg_line_lbr;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002242
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002243 // Although unlikely, it is possible that the expression invokes a
2244 // substitute command (it might fail, but still). Therefore keep
Bram Moolenaarabd56da2022-06-23 20:46:27 +01002245 // an array of eval results.
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002246 ++nesting;
2247
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002248 if (expr != NULL)
2249 {
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002250 typval_T argv[2];
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002251 char_u buf[NUMBUFLEN];
2252 typval_T rettv;
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002253 staticList10_T matchList;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002254 funcexe_T funcexe;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002255
2256 rettv.v_type = VAR_STRING;
2257 rettv.vval.v_string = NULL;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002258 argv[0].v_type = VAR_LIST;
2259 argv[0].vval.v_list = &matchList.sl_list;
2260 matchList.sl_list.lv_len = 0;
Bram Moolenaara80faa82020-04-12 19:37:17 +02002261 CLEAR_FIELD(funcexe);
Bram Moolenaar851f86b2021-12-13 14:26:44 +00002262 funcexe.fe_argv_func = fill_submatch_list;
2263 funcexe.fe_evaluate = TRUE;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002264 if (expr->v_type == VAR_FUNC)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002265 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002266 s = expr->vval.v_string;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002267 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002268 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002269 else if (expr->v_type == VAR_PARTIAL)
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002270 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002271 partial_T *partial = expr->vval.v_partial;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002272
Bram Moolenaar6100d022016-10-02 16:51:57 +02002273 s = partial_name(partial);
Bram Moolenaar851f86b2021-12-13 14:26:44 +00002274 funcexe.fe_partial = partial;
Bram Moolenaarc6538bc2019-08-03 18:17:11 +02002275 call_func(s, -1, &rettv, 1, argv, &funcexe);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002276 }
LemonBoyf3b48952022-05-05 13:53:03 +01002277 else if (expr->v_type == VAR_INSTR)
2278 {
2279 exe_typval_instr(expr, &rettv);
2280 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02002281 if (matchList.sl_list.lv_len > 0)
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002282 // fill_submatch_list() was called
Bram Moolenaar6100d022016-10-02 16:51:57 +02002283 clear_submatch_list(&matchList);
2284
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002285 if (rettv.v_type == VAR_UNKNOWN)
2286 // something failed, no need to report another error
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002287 eval_result[nested] = NULL;
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002288 else
2289 {
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002290 eval_result[nested] = tv_get_string_buf_chk(&rettv, buf);
2291 if (eval_result[nested] != NULL)
2292 eval_result[nested] = vim_strsave(eval_result[nested]);
Bram Moolenaar4c054e92019-11-10 00:13:50 +01002293 }
Bram Moolenaardf48fb42016-07-22 21:50:18 +02002294 clear_tv(&rettv);
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002295 }
Bram Moolenaar4c137212021-04-19 16:48:48 +02002296 else if (substitute_instr != NULL)
2297 // Execute instructions from ISN_SUBSTITUTE.
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002298 eval_result[nested] = exe_substitute_instr();
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002299 else
Bram Moolenaara4e0b972022-10-01 19:43:52 +01002300 eval_result[nested] = eval_to_string(source + 2, TRUE, FALSE);
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002301 --nesting;
Bram Moolenaar72ab7292016-07-19 19:10:51 +02002302
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002303 if (eval_result[nested] != NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002304 {
Bram Moolenaar06975a42010-03-23 16:27:22 +01002305 int had_backslash = FALSE;
2306
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002307 for (s = eval_result[nested]; *s != NUL; MB_PTR_ADV(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002308 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002309 // Change NL to CR, so that it becomes a line break,
2310 // unless called from vim_regexec_nl().
2311 // Skip over a backslashed character.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002312 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002313 *s = CAR;
2314 else if (*s == '\\' && s[1] != NUL)
Bram Moolenaar06975a42010-03-23 16:27:22 +01002315 {
Bram Moolenaar071d4272004-06-13 20:20:40 +00002316 ++s;
Bram Moolenaar60190782010-05-21 13:08:58 +02002317 /* Change NL to CR here too, so that this works:
2318 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
2319 * abc\
2320 * def
Bram Moolenaar978287b2011-06-19 04:32:15 +02002321 * Not when called from vim_regexec_nl().
Bram Moolenaar60190782010-05-21 13:08:58 +02002322 */
Bram Moolenaar6100d022016-10-02 16:51:57 +02002323 if (*s == NL && !rsm.sm_line_lbr)
Bram Moolenaar60190782010-05-21 13:08:58 +02002324 *s = CAR;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002325 had_backslash = TRUE;
2326 }
2327 }
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002328 if (had_backslash && (flags & REGSUB_BACKSLASH))
Bram Moolenaar06975a42010-03-23 16:27:22 +01002329 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002330 // Backslashes will be consumed, need to double them.
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002331 s = vim_strsave_escaped(eval_result[nested], (char_u *)"\\");
Bram Moolenaar06975a42010-03-23 16:27:22 +01002332 if (s != NULL)
2333 {
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002334 vim_free(eval_result[nested]);
2335 eval_result[nested] = s;
Bram Moolenaar06975a42010-03-23 16:27:22 +01002336 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002337 }
2338
Bram Moolenaar44ddf192022-06-21 22:15:25 +01002339 dst += STRLEN(eval_result[nested]);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002340 }
2341
Bram Moolenaar6100d022016-10-02 16:51:57 +02002342 can_f_submatch = prev_can_f_submatch;
2343 if (can_f_submatch)
2344 rsm = rsm_save;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002345 }
2346#endif
2347 }
2348 else
2349 while ((c = *src++) != NUL)
2350 {
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002351 if (c == '&' && (flags & REGSUB_MAGIC))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002352 no = 0;
2353 else if (c == '\\' && *src != NUL)
2354 {
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002355 if (*src == '&' && !(flags & REGSUB_MAGIC))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002356 {
2357 ++src;
2358 no = 0;
2359 }
2360 else if ('0' <= *src && *src <= '9')
2361 {
2362 no = *src++ - '0';
2363 }
2364 else if (vim_strchr((char_u *)"uUlLeE", *src))
2365 {
2366 switch (*src++)
2367 {
Yee Cheng Chind25021c2023-09-18 19:51:56 +02002368 case 'u': func_one = do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002369 continue;
Yee Cheng Chind25021c2023-09-18 19:51:56 +02002370 case 'U': func_all = do_upper;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002371 continue;
Yee Cheng Chind25021c2023-09-18 19:51:56 +02002372 case 'l': func_one = do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002373 continue;
Yee Cheng Chind25021c2023-09-18 19:51:56 +02002374 case 'L': func_all = do_lower;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002375 continue;
2376 case 'e':
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002377 case 'E': func_one = func_all = (fptr_T)NULL;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002378 continue;
2379 }
2380 }
2381 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002382 if (no < 0) // Ordinary character.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002383 {
Bram Moolenaardb552d602006-03-23 22:59:57 +00002384 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL)
2385 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002386 // Copy a special key as-is.
Bram Moolenaardb552d602006-03-23 22:59:57 +00002387 if (copy)
2388 {
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002389 if (dst + 3 > dest + destlen)
2390 {
2391 iemsg("vim_regsub_both(): not enough space");
2392 return 0;
2393 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002394 *dst++ = c;
2395 *dst++ = *src++;
2396 *dst++ = *src++;
2397 }
2398 else
2399 {
2400 dst += 3;
2401 src += 2;
2402 }
2403 continue;
2404 }
2405
Bram Moolenaar071d4272004-06-13 20:20:40 +00002406 if (c == '\\' && *src != NUL)
2407 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002408 // Check for abbreviations -- webb
Bram Moolenaar071d4272004-06-13 20:20:40 +00002409 switch (*src)
2410 {
2411 case 'r': c = CAR; ++src; break;
2412 case 'n': c = NL; ++src; break;
2413 case 't': c = TAB; ++src; break;
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002414 // Oh no! \e already has meaning in subst pat :-(
2415 // case 'e': c = ESC; ++src; break;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002416 case 'b': c = Ctrl_H; ++src; break;
2417
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002418 // If "backslash" is TRUE the backslash will be removed
2419 // later. Used to insert a literal CR.
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002420 default: if (flags & REGSUB_BACKSLASH)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002421 {
2422 if (copy)
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002423 {
2424 if (dst + 1 > dest + destlen)
2425 {
2426 iemsg("vim_regsub_both(): not enough space");
2427 return 0;
2428 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002429 *dst = '\\';
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002430 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002431 ++dst;
2432 }
2433 c = *src++;
2434 }
2435 }
Bram Moolenaardb552d602006-03-23 22:59:57 +00002436 else if (has_mbyte)
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002437 c = mb_ptr2char(src - 1);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002438
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002439 // Write to buffer, if copy is set.
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002440 if (func_one != (fptr_T)NULL)
Yee Cheng Chind25021c2023-09-18 19:51:56 +02002441 {
2442 func_one(&cc, c);
2443 func_one = NULL;
2444 }
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002445 else if (func_all != (fptr_T)NULL)
Yee Cheng Chind25021c2023-09-18 19:51:56 +02002446 func_all(&cc, c);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002447 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002448 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002449
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002450 if (has_mbyte)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002451 {
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002452 int totlen = mb_ptr2len(src - 1);
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002453 int charlen = mb_char2len(cc);
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002454
Bram Moolenaar071d4272004-06-13 20:20:40 +00002455 if (copy)
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002456 {
2457 if (dst + charlen > dest + destlen)
2458 {
2459 iemsg("vim_regsub_both(): not enough space");
2460 return 0;
2461 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002462 mb_char2bytes(cc, dst);
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002463 }
2464 dst += charlen - 1;
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002465 if (enc_utf8)
2466 {
2467 int clen = utf_ptr2len(src - 1);
2468
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002469 // If the character length is shorter than "totlen", there
2470 // are composing characters; copy them as-is.
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002471 if (clen < totlen)
2472 {
2473 if (copy)
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002474 {
2475 if (dst + totlen - clen > dest + destlen)
2476 {
2477 iemsg("vim_regsub_both(): not enough space");
2478 return 0;
2479 }
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002480 mch_memmove(dst + 1, src - 1 + clen,
2481 (size_t)(totlen - clen));
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002482 }
Bram Moolenaar0c56c602010-07-12 22:42:33 +02002483 dst += totlen - clen;
2484 }
2485 }
2486 src += totlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002487 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002488 else if (copy)
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002489 {
2490 if (dst + 1 > dest + destlen)
2491 {
2492 iemsg("vim_regsub_both(): not enough space");
2493 return 0;
2494 }
2495 *dst = cc;
2496 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002497 dst++;
2498 }
2499 else
2500 {
2501 if (REG_MULTI)
2502 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002503 clnum = rex.reg_mmatch->startpos[no].lnum;
2504 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002505 s = NULL;
2506 else
2507 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002508 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
2509 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2510 len = rex.reg_mmatch->endpos[no].col
2511 - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002512 else
John Marriott82792db2024-05-12 00:07:17 +02002513 len = (int)reg_getline_len(clnum) - rex.reg_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002514 }
2515 }
2516 else
2517 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002518 s = rex.reg_match->startp[no];
2519 if (rex.reg_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002520 s = NULL;
2521 else
Bram Moolenaar6100d022016-10-02 16:51:57 +02002522 len = (int)(rex.reg_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002523 }
2524 if (s != NULL)
2525 {
2526 for (;;)
2527 {
2528 if (len == 0)
2529 {
2530 if (REG_MULTI)
2531 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002532 if (rex.reg_mmatch->endpos[no].lnum == clnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002533 break;
2534 if (copy)
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002535 {
2536 if (dst + 1 > dest + destlen)
2537 {
2538 iemsg("vim_regsub_both(): not enough space");
2539 return 0;
2540 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002541 *dst = CAR;
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002542 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002543 ++dst;
2544 s = reg_getline(++clnum);
Bram Moolenaar6100d022016-10-02 16:51:57 +02002545 if (rex.reg_mmatch->endpos[no].lnum == clnum)
2546 len = rex.reg_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002547 else
John Marriott82792db2024-05-12 00:07:17 +02002548 len = (int)reg_getline_len(clnum);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002549 }
2550 else
2551 break;
2552 }
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002553 else if (*s == NUL) // we hit NUL.
Bram Moolenaar071d4272004-06-13 20:20:40 +00002554 {
2555 if (copy)
RestorerZ68ebcee2023-05-31 17:12:14 +01002556 iemsg(e_damaged_match_string);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002557 goto exit;
2558 }
2559 else
2560 {
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002561 if ((flags & REGSUB_BACKSLASH)
2562 && (*s == CAR || *s == '\\'))
Bram Moolenaar071d4272004-06-13 20:20:40 +00002563 {
2564 /*
2565 * Insert a backslash in front of a CR, otherwise
2566 * it will be replaced by a line break.
2567 * Number of backslashes will be halved later,
2568 * double them here.
2569 */
2570 if (copy)
2571 {
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002572 if (dst + 2 > dest + destlen)
2573 {
2574 iemsg("vim_regsub_both(): not enough space");
2575 return 0;
2576 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002577 dst[0] = '\\';
2578 dst[1] = *s;
2579 }
2580 dst += 2;
2581 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00002582 else
2583 {
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002584 if (has_mbyte)
2585 c = mb_ptr2char(s);
2586 else
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002587 c = *s;
2588
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002589 if (func_one != (fptr_T)NULL)
Yee Cheng Chind25021c2023-09-18 19:51:56 +02002590 {
2591 func_one(&cc, c);
2592 func_one = NULL;
2593 }
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002594 else if (func_all != (fptr_T)NULL)
Yee Cheng Chind25021c2023-09-18 19:51:56 +02002595 func_all(&cc, c);
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002596 else // just copy
Bram Moolenaarc2c355d2013-03-19 17:42:15 +01002597 cc = c;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002598
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002599 if (has_mbyte)
2600 {
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002601 int l;
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002602 int charlen;
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002603
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002604 // Copy composing characters separately, one
2605 // at a time.
Bram Moolenaar9225efb2007-07-30 20:32:53 +00002606 if (enc_utf8)
2607 l = utf_ptr2len(s) - 1;
2608 else
2609 l = mb_ptr2len(s) - 1;
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002610
2611 s += l;
2612 len -= l;
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002613 charlen = mb_char2len(cc);
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002614 if (copy)
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002615 {
2616 if (dst + charlen > dest + destlen)
2617 {
2618 iemsg("vim_regsub_both(): not enough space");
2619 return 0;
2620 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002621 mb_char2bytes(cc, dst);
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002622 }
2623 dst += charlen - 1;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002624 }
Bram Moolenaara12a1612019-01-24 16:39:02 +01002625 else if (copy)
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01002626 {
2627 if (dst + 1 > dest + destlen)
2628 {
2629 iemsg("vim_regsub_both(): not enough space");
2630 return 0;
2631 }
2632 *dst = cc;
2633 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002634 dst++;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002635 }
Bram Moolenaarefd2bf12006-03-16 21:41:35 +00002636
Bram Moolenaar071d4272004-06-13 20:20:40 +00002637 ++s;
2638 --len;
2639 }
2640 }
2641 }
2642 no = -1;
2643 }
2644 }
2645 if (copy)
2646 *dst = NUL;
2647
2648exit:
2649 return (int)((dst - dest) + 1);
2650}
2651
2652#ifdef FEAT_EVAL
John Marriott82792db2024-05-12 00:07:17 +02002653
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002654 static char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002655reg_getline_submatch(linenr_T lnum)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002656{
John Marriott82792db2024-05-12 00:07:17 +02002657 char_u *line;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002658
John Marriott82792db2024-05-12 00:07:17 +02002659 reg_getline_common(lnum, RGLF_LINE | RGLF_SUBMATCH, &line, NULL);
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002660
John Marriott82792db2024-05-12 00:07:17 +02002661 return line;
2662}
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002663
John Marriott82792db2024-05-12 00:07:17 +02002664 static colnr_T
2665reg_getline_submatch_len(linenr_T lnum)
2666{
2667 colnr_T length;
2668
2669 reg_getline_common(lnum, RGLF_LENGTH | RGLF_SUBMATCH, NULL, &length);
2670
2671 return length;
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002672}
2673
2674/*
Bram Moolenaar7aa9f6a2007-05-10 18:00:30 +00002675 * Used for the submatch() function: get the string from the n'th submatch in
Bram Moolenaar071d4272004-06-13 20:20:40 +00002676 * allocated memory.
2677 * Returns NULL when not in a ":s" command and for a non-existing submatch.
2678 */
2679 char_u *
Bram Moolenaar05540972016-01-30 20:31:25 +01002680reg_submatch(int no)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002681{
2682 char_u *retval = NULL;
2683 char_u *s;
2684 int len;
2685 int round;
2686 linenr_T lnum;
2687
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002688 if (!can_f_submatch || no < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002689 return NULL;
2690
Bram Moolenaar6100d022016-10-02 16:51:57 +02002691 if (rsm.sm_match == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002692 {
2693 /*
2694 * First round: compute the length and allocate memory.
2695 * Second round: copy the text.
2696 */
2697 for (round = 1; round <= 2; ++round)
2698 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002699 lnum = rsm.sm_mmatch->startpos[no].lnum;
2700 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002701 return NULL;
2702
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002703 s = reg_getline_submatch(lnum);
2704 if (s == NULL) // anti-crash check, cannot happen?
Bram Moolenaar071d4272004-06-13 20:20:40 +00002705 break;
Bram Moolenaar64c8ed32019-03-20 21:18:34 +01002706 s += rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002707 if (rsm.sm_mmatch->endpos[no].lnum == lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002708 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002709 // Within one line: take form start to end col.
Bram Moolenaar6100d022016-10-02 16:51:57 +02002710 len = rsm.sm_mmatch->endpos[no].col
2711 - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002712 if (round == 2)
Bram Moolenaarbbebc852005-07-18 21:47:53 +00002713 vim_strncpy(retval, s, len);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002714 ++len;
2715 }
2716 else
2717 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002718 // Multiple lines: take start line from start col, middle
2719 // lines completely and end line up to end col.
John Marriott82792db2024-05-12 00:07:17 +02002720 len = (int)reg_getline_submatch_len(lnum) - rsm.sm_mmatch->startpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002721 if (round == 2)
2722 {
2723 STRCPY(retval, s);
2724 retval[len] = '\n';
2725 }
2726 ++len;
2727 ++lnum;
Bram Moolenaar6100d022016-10-02 16:51:57 +02002728 while (lnum < rsm.sm_mmatch->endpos[no].lnum)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002729 {
John Marriott82792db2024-05-12 00:07:17 +02002730 s = reg_getline_submatch(lnum);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002731 if (round == 2)
2732 STRCPY(retval + len, s);
John Marriott82792db2024-05-12 00:07:17 +02002733 len += (int)reg_getline_submatch_len(lnum);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002734 if (round == 2)
2735 retval[len] = '\n';
2736 ++len;
John Marriott82792db2024-05-12 00:07:17 +02002737 ++lnum;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002738 }
2739 if (round == 2)
Bram Moolenaar5ea08a82009-11-25 18:51:24 +00002740 STRNCPY(retval + len, reg_getline_submatch(lnum),
Bram Moolenaar6100d022016-10-02 16:51:57 +02002741 rsm.sm_mmatch->endpos[no].col);
2742 len += rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar071d4272004-06-13 20:20:40 +00002743 if (round == 2)
2744 retval[len] = NUL;
2745 ++len;
2746 }
2747
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002748 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002749 {
Bram Moolenaar18a4ba22019-05-24 19:39:03 +02002750 retval = alloc(len);
Bram Moolenaareb3593b2006-04-22 22:33:57 +00002751 if (retval == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002752 return NULL;
2753 }
2754 }
2755 }
2756 else
2757 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002758 s = rsm.sm_match->startp[no];
2759 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar071d4272004-06-13 20:20:40 +00002760 retval = NULL;
2761 else
Bram Moolenaar71ccd032020-06-12 22:59:11 +02002762 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00002763 }
2764
2765 return retval;
2766}
Bram Moolenaar41571762014-04-02 19:00:58 +02002767
2768/*
2769 * Used for the submatch() function with the optional non-zero argument: get
2770 * the list of strings from the n'th submatch in allocated memory with NULs
2771 * represented in NLs.
2772 * Returns a list of allocated strings. Returns NULL when not in a ":s"
2773 * command, for a non-existing submatch and for any error.
2774 */
2775 list_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002776reg_submatch_list(int no)
Bram Moolenaar41571762014-04-02 19:00:58 +02002777{
2778 char_u *s;
2779 linenr_T slnum;
2780 linenr_T elnum;
2781 colnr_T scol;
2782 colnr_T ecol;
2783 int i;
2784 list_T *list;
2785 int error = FALSE;
2786
2787 if (!can_f_submatch || no < 0)
2788 return NULL;
2789
Bram Moolenaar6100d022016-10-02 16:51:57 +02002790 if (rsm.sm_match == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002791 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002792 slnum = rsm.sm_mmatch->startpos[no].lnum;
2793 elnum = rsm.sm_mmatch->endpos[no].lnum;
Bram Moolenaar41571762014-04-02 19:00:58 +02002794 if (slnum < 0 || elnum < 0)
2795 return NULL;
2796
Bram Moolenaar6100d022016-10-02 16:51:57 +02002797 scol = rsm.sm_mmatch->startpos[no].col;
2798 ecol = rsm.sm_mmatch->endpos[no].col;
Bram Moolenaar41571762014-04-02 19:00:58 +02002799
2800 list = list_alloc();
2801 if (list == NULL)
2802 return NULL;
2803
2804 s = reg_getline_submatch(slnum) + scol;
2805 if (slnum == elnum)
2806 {
2807 if (list_append_string(list, s, ecol - scol) == FAIL)
2808 error = TRUE;
2809 }
2810 else
2811 {
John Marriott82792db2024-05-12 00:07:17 +02002812 int max_lnum = elnum - slnum;
2813
Bram Moolenaar41571762014-04-02 19:00:58 +02002814 if (list_append_string(list, s, -1) == FAIL)
2815 error = TRUE;
John Marriott82792db2024-05-12 00:07:17 +02002816 for (i = 1; i < max_lnum; i++)
Bram Moolenaar41571762014-04-02 19:00:58 +02002817 {
2818 s = reg_getline_submatch(slnum + i);
2819 if (list_append_string(list, s, -1) == FAIL)
2820 error = TRUE;
2821 }
2822 s = reg_getline_submatch(elnum);
2823 if (list_append_string(list, s, ecol) == FAIL)
2824 error = TRUE;
2825 }
2826 }
2827 else
2828 {
Bram Moolenaar6100d022016-10-02 16:51:57 +02002829 s = rsm.sm_match->startp[no];
2830 if (s == NULL || rsm.sm_match->endp[no] == NULL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002831 return NULL;
2832 list = list_alloc();
2833 if (list == NULL)
2834 return NULL;
2835 if (list_append_string(list, s,
Bram Moolenaar6100d022016-10-02 16:51:57 +02002836 (int)(rsm.sm_match->endp[no] - s)) == FAIL)
Bram Moolenaar41571762014-04-02 19:00:58 +02002837 error = TRUE;
2838 }
2839
2840 if (error)
2841 {
Bram Moolenaar107e1ee2016-04-08 17:07:19 +02002842 list_free(list);
Bram Moolenaar41571762014-04-02 19:00:58 +02002843 return NULL;
2844 }
Bram Moolenaar8a0dcf42020-09-06 15:14:45 +02002845 ++list->lv_refcount;
Bram Moolenaar41571762014-04-02 19:00:58 +02002846 return list;
2847}
Bram Moolenaar071d4272004-06-13 20:20:40 +00002848#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002849
Bram Moolenaarf4140482020-02-15 23:06:45 +01002850/*
2851 * Initialize the values used for matching against multiple lines
2852 */
2853 static void
2854init_regexec_multi(
2855 regmmatch_T *rmp,
2856 win_T *win, // window in which to search or NULL
2857 buf_T *buf, // buffer in which to search
2858 linenr_T lnum) // nr of line to start looking for match
2859{
2860 rex.reg_match = NULL;
2861 rex.reg_mmatch = rmp;
2862 rex.reg_buf = buf;
2863 rex.reg_win = win;
2864 rex.reg_firstlnum = lnum;
2865 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
2866 rex.reg_line_lbr = FALSE;
2867 rex.reg_ic = rmp->rmm_ic;
2868 rex.reg_icombine = FALSE;
2869 rex.reg_maxcol = rmp->rmm_maxcol;
2870}
2871
Bram Moolenaar6d7d7cf2019-09-07 23:16:33 +02002872#include "regexp_bt.c"
2873
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002874static regengine_T bt_regengine =
2875{
2876 bt_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002877 bt_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002878 bt_regexec_nl,
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01002879 bt_regexec_multi
2880#ifdef DEBUG
2881 ,(char_u *)""
2882#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002883};
2884
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002885#include "regexp_nfa.c"
2886
2887static regengine_T nfa_regengine =
2888{
2889 nfa_regcomp,
Bram Moolenaar473de612013-06-08 18:19:48 +02002890 nfa_regfree,
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002891 nfa_regexec_nl,
Christian Brabandtd2cc51f2024-01-04 22:54:08 +01002892 nfa_regexec_multi
2893#ifdef DEBUG
2894 ,(char_u *)""
2895#endif
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002896};
2897
// The regexp engine that vim_regcomp() selected for the pattern it compiles.
// The values must match with 'regexpengine'.
static int regexp_engine = 0;

#ifdef DEBUG
// Engine names for debug messages; vim_regcomp() indexes this with the
// engine number taken from a "\%#=" prefix.
static char_u regname[][30] = {
    "AUTOMATIC Regexp Engine",
    "BACKTRACKING Regexp Engine",
    "NFA Regexp Engine",
};
#endif
2909
2910/*
2911 * Compile a regular expression into internal code.
Bram Moolenaar473de612013-06-08 18:19:48 +02002912 * Returns the program in allocated memory.
2913 * Use vim_regfree() to free the memory.
2914 * Returns NULL for an error.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002915 */
2916 regprog_T *
Bram Moolenaar05540972016-01-30 20:31:25 +01002917vim_regcomp(char_u *expr_arg, int re_flags)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002918{
2919 regprog_T *prog = NULL;
2920 char_u *expr = expr_arg;
Bram Moolenaar53989552019-12-23 22:59:18 +01002921 int called_emsg_before;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002922
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002923 regexp_engine = p_re;
2924
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002925 // Check for prefix "\%#=", that sets the regexp engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002926 if (STRNCMP(expr, "\\%#=", 4) == 0)
2927 {
2928 int newengine = expr[4] - '0';
2929
2930 if (newengine == AUTOMATIC_ENGINE
2931 || newengine == BACKTRACKING_ENGINE
2932 || newengine == NFA_ENGINE)
2933 {
2934 regexp_engine = expr[4] - '0';
2935 expr += 5;
2936#ifdef DEBUG
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002937 smsg("New regexp mode selected (%d): %s",
Bram Moolenaar6e132072014-05-13 16:46:32 +02002938 regexp_engine, regname[newengine]);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002939#endif
2940 }
2941 else
2942 {
Bram Moolenaar9d00e4a2022-01-05 17:49:15 +00002943 emsg(_(e_percent_hash_can_only_be_followed_by_zero_one_two_automatic_engine_will_be_used));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002944 regexp_engine = AUTOMATIC_ENGINE;
2945 }
2946 }
Bram Moolenaar0270f382018-07-17 05:43:58 +02002947#ifdef DEBUG
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002948 bt_regengine.expr = expr;
2949 nfa_regengine.expr = expr;
Bram Moolenaar0270f382018-07-17 05:43:58 +02002950#endif
Bram Moolenaar8bfd9462019-02-16 18:07:57 +01002951 // reg_iswordc() uses rex.reg_buf
2952 rex.reg_buf = curbuf;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002953
2954 /*
2955 * First try the NFA engine, unless backtracking was requested.
2956 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002957 called_emsg_before = called_emsg;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002958 if (regexp_engine != BACKTRACKING_ENGINE)
Bram Moolenaard23a8232018-02-10 18:45:26 +01002959 prog = nfa_regengine.regcomp(expr,
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002960 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002961 else
2962 prog = bt_regengine.regcomp(expr, re_flags);
2963
Bram Moolenaar63d9e732019-12-05 21:10:38 +01002964 // Check for error compiling regexp with initial engine.
Bram Moolenaarfda37292014-11-05 14:27:36 +01002965 if (prog == NULL)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002966 {
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002967#ifdef BT_REGEXP_DEBUG_LOG
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002968 if (regexp_engine == BACKTRACKING_ENGINE) // debugging log for BT engine
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002969 {
2970 FILE *f;
Bram Moolenaar7fcff1f2013-05-20 21:49:13 +02002971 f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002972 if (f)
2973 {
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002974 fprintf(f, "Syntax error in \"%s\"\n", expr);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002975 fclose(f);
2976 }
2977 else
Bram Moolenaarf9e3e092019-01-13 23:38:42 +01002978 semsg("(NFA) Could not open \"%s\" to write !!!",
Bram Moolenaard23a8232018-02-10 18:45:26 +01002979 BT_REGEXP_DEBUG_LOG_NAME);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002980 }
2981#endif
2982 /*
Bram Moolenaarfda37292014-11-05 14:27:36 +01002983 * If the NFA engine failed, try the backtracking engine.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002984 * The NFA engine also fails for patterns that it can't handle well
2985 * but are still valid patterns, thus a retry should work.
Bram Moolenaarcd625122019-02-22 17:29:43 +01002986 * But don't try if an error message was given.
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002987 */
Bram Moolenaar53989552019-12-23 22:59:18 +01002988 if (regexp_engine == AUTOMATIC_ENGINE
2989 && called_emsg == called_emsg_before)
Bram Moolenaarfda37292014-11-05 14:27:36 +01002990 {
Bram Moolenaare0ad3652015-01-27 12:59:55 +01002991 regexp_engine = BACKTRACKING_ENGINE;
Bram Moolenaar66c50c52021-01-02 17:43:49 +01002992#ifdef FEAT_EVAL
2993 report_re_switch(expr);
2994#endif
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002995 prog = bt_regengine.regcomp(expr, re_flags);
Bram Moolenaarfda37292014-11-05 14:27:36 +01002996 }
Bram Moolenaarcd2d8bb2013-06-05 21:42:53 +02002997 }
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02002998
Bram Moolenaarfda37292014-11-05 14:27:36 +01002999 if (prog != NULL)
3000 {
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003001 // Store the info needed to call regcomp() again when the engine turns
3002 // out to be very slow when executing it.
Bram Moolenaarfda37292014-11-05 14:27:36 +01003003 prog->re_engine = regexp_engine;
3004 prog->re_flags = re_flags;
3005 }
3006
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003007 return prog;
3008}
3009
3010/*
Bram Moolenaar473de612013-06-08 18:19:48 +02003011 * Free a compiled regexp program, returned by vim_regcomp().
3012 */
3013 void
Bram Moolenaar05540972016-01-30 20:31:25 +01003014vim_regfree(regprog_T *prog)
Bram Moolenaar473de612013-06-08 18:19:48 +02003015{
3016 if (prog != NULL)
3017 prog->engine->regfree(prog);
3018}
3019
#if defined(EXITFREE) || defined(PROTO)
/*
 * Release the memory held by the regexp code: the "reg_prev_sub" and
 * "reg_tofree" allocations and the "backpos" and "regstack" growarrays.
 * Only compiled in with EXITFREE (or PROTO).
 */
    void
free_regexp_stuff(void)
{
    vim_free(reg_prev_sub);
    vim_free(reg_tofree);
    ga_clear(&backpos);
    ga_clear(&regstack);
}
#endif
3030
#ifdef FEAT_EVAL
/*
 * Give a verbose message that the backtracking engine is going to be used
 * for pattern "pat".  Does nothing unless p_verbose is positive.
 */
    static void
report_re_switch(char_u *pat)
{
    if (p_verbose <= 0)
        return;

    verbose_enter();
    msg_puts(_("Switching to backtracking RE engine for pattern: "));
    msg_puts((char *)pat);
    verbose_leave();
}
#endif
3044
#if defined(FEAT_X11) || defined(PROTO)
/*
 * Return the "re_in_use" flag of "prog", which is set while "prog" is being
 * executed.
 * "prog" must not be NULL.
 */
    int
regprog_in_use(regprog_T *prog)
{
    int	    in_use = prog->re_in_use;

    return in_use;
}
#endif
Bram Moolenaara8bfa172018-12-29 22:28:46 +01003055
3056/*
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003057 * Match a regexp against a string.
Bram Moolenaar4aaf3e72022-05-30 20:58:55 +01003058 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
Bram Moolenaardffa5b82014-11-19 16:38:07 +01003059 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003060 * Uses curbuf for line count and 'iskeyword'.
Bram Moolenaarfda37292014-11-05 14:27:36 +01003061 * When "nl" is TRUE consider a "\n" in "line" to be a line break.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003062 *
3063 * Return TRUE if there is a match, FALSE if not.
3064 */
Bram Moolenaarfda37292014-11-05 14:27:36 +01003065 static int
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02003066vim_regexec_string(
Bram Moolenaar05540972016-01-30 20:31:25 +01003067 regmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003068 char_u *line, // string to match against
3069 colnr_T col, // column to start looking for match
Bram Moolenaar05540972016-01-30 20:31:25 +01003070 int nl)
Bram Moolenaarfda37292014-11-05 14:27:36 +01003071{
Bram Moolenaar6100d022016-10-02 16:51:57 +02003072 int result;
3073 regexec_T rex_save;
3074 int rex_in_use_save = rex_in_use;
3075
Bram Moolenaar0270f382018-07-17 05:43:58 +02003076 // Cannot use the same prog recursively, it contains state.
3077 if (rmp->regprog->re_in_use)
3078 {
Bram Moolenaar677658a2022-01-05 16:09:06 +00003079 emsg(_(e_cannot_use_pattern_recursively));
Bram Moolenaar0270f382018-07-17 05:43:58 +02003080 return FALSE;
3081 }
3082 rmp->regprog->re_in_use = TRUE;
3083
Bram Moolenaar6100d022016-10-02 16:51:57 +02003084 if (rex_in_use)
Bram Moolenaar0270f382018-07-17 05:43:58 +02003085 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02003086 rex_save = rex;
3087 rex_in_use = TRUE;
Bram Moolenaar0270f382018-07-17 05:43:58 +02003088
Bram Moolenaar6100d022016-10-02 16:51:57 +02003089 rex.reg_startp = NULL;
3090 rex.reg_endp = NULL;
3091 rex.reg_startpos = NULL;
3092 rex.reg_endpos = NULL;
3093
3094 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02003095 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01003096
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003097 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01003098 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
3099 && result == NFA_TOO_EXPENSIVE)
3100 {
3101 int save_p_re = p_re;
3102 int re_flags = rmp->regprog->re_flags;
3103 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
3104
3105 p_re = BACKTRACKING_ENGINE;
3106 vim_regfree(rmp->regprog);
3107 if (pat != NULL)
3108 {
3109#ifdef FEAT_EVAL
3110 report_re_switch(pat);
3111#endif
3112 rmp->regprog = vim_regcomp(pat, re_flags);
3113 if (rmp->regprog != NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02003114 {
3115 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01003116 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
Bram Moolenaar41499802018-07-18 06:02:09 +02003117 rmp->regprog->re_in_use = FALSE;
3118 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01003119 vim_free(pat);
3120 }
3121
3122 p_re = save_p_re;
3123 }
Bram Moolenaar6100d022016-10-02 16:51:57 +02003124
3125 rex_in_use = rex_in_use_save;
3126 if (rex_in_use)
3127 rex = rex_save;
3128
Bram Moolenaar66a3e792014-11-20 23:07:05 +01003129 return result > 0;
Bram Moolenaarfda37292014-11-05 14:27:36 +01003130}
3131
#if defined(FEAT_SPELL) || defined(FEAT_EVAL) || defined(FEAT_X11) || defined(PROTO)
/*
 * Match "line", starting at column "col", against the compiled regexp
 * "*prog".  "ignore_case" is passed on as the "rm_ic" field of the match.
 * A "\n" in "line" is not treated as a line break.
 * Note: "*prog" may be freed and changed.
 * Return TRUE if there is a match, FALSE if not.
 */
    int
vim_regexec_prog(
    regprog_T	**prog,
    int		ignore_case,
    char_u	*line,
    colnr_T	col)
{
    regmatch_T	rm;
    int		matched;

    rm.rm_ic = ignore_case;
    rm.regprog = *prog;

    matched = vim_regexec_string(&rm, line, col, FALSE);

    // The program may have been replaced while matching, pass it back.
    *prog = rm.regprog;
    return matched;
}
#endif
Bram Moolenaardffa5b82014-11-19 16:38:07 +01003154
3155/*
3156 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01003157 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01003158 */
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003159 int
Bram Moolenaar05540972016-01-30 20:31:25 +01003160vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003161{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02003162 return vim_regexec_string(rmp, line, col, FALSE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003163}
3164
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003165/*
3166 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
Bram Moolenaardffa5b82014-11-19 16:38:07 +01003167 * Note: "rmp->regprog" may be freed and changed.
Bram Moolenaar66a3e792014-11-20 23:07:05 +01003168 * Return TRUE if there is a match, FALSE if not.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003169 */
3170 int
Bram Moolenaar05540972016-01-30 20:31:25 +01003171vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003172{
Bram Moolenaar06f1ed22017-06-18 22:41:03 +02003173 return vim_regexec_string(rmp, line, col, TRUE);
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003174}
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003175
3176/*
3177 * Match a regexp against multiple lines.
Bram Moolenaarbcf94422018-06-23 14:21:42 +02003178 * "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
3179 * Note: "rmp->regprog" may be freed and changed, even set to NULL.
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003180 * Uses curbuf for line count and 'iskeyword'.
3181 *
3182 * Return zero if there is no match. Return number of lines contained in the
3183 * match otherwise.
3184 */
3185 long
Bram Moolenaar05540972016-01-30 20:31:25 +01003186vim_regexec_multi(
3187 regmmatch_T *rmp,
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003188 win_T *win, // window in which to search or NULL
3189 buf_T *buf, // buffer in which to search
3190 linenr_T lnum, // nr of line to start looking for match
3191 colnr_T col, // column to start looking for match
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003192 int *timed_out) // flag is set when timeout limit reached
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003193{
Bram Moolenaar6100d022016-10-02 16:51:57 +02003194 int result;
3195 regexec_T rex_save;
3196 int rex_in_use_save = rex_in_use;
3197
Bram Moolenaar0270f382018-07-17 05:43:58 +02003198 // Cannot use the same prog recursively, it contains state.
3199 if (rmp->regprog->re_in_use)
3200 {
Bram Moolenaar677658a2022-01-05 16:09:06 +00003201 emsg(_(e_cannot_use_pattern_recursively));
Bram Moolenaar0270f382018-07-17 05:43:58 +02003202 return FALSE;
3203 }
3204 rmp->regprog->re_in_use = TRUE;
3205
Bram Moolenaar6100d022016-10-02 16:51:57 +02003206 if (rex_in_use)
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003207 // Being called recursively, save the state.
Bram Moolenaar6100d022016-10-02 16:51:57 +02003208 rex_save = rex;
3209 rex_in_use = TRUE;
3210
Bram Moolenaarfbd0b0a2017-06-17 18:44:21 +02003211 result = rmp->regprog->engine->regexec_multi(
Paul Ollis65745772022-06-05 16:55:54 +01003212 rmp, win, buf, lnum, col, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02003213 rmp->regprog->re_in_use = FALSE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01003214
Bram Moolenaar63d9e732019-12-05 21:10:38 +01003215 // NFA engine aborted because it's very slow.
Bram Moolenaarfda37292014-11-05 14:27:36 +01003216 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
3217 && result == NFA_TOO_EXPENSIVE)
3218 {
3219 int save_p_re = p_re;
3220 int re_flags = rmp->regprog->re_flags;
3221 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
3222
3223 p_re = BACKTRACKING_ENGINE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01003224 if (pat != NULL)
3225 {
Bram Moolenaare8a4c0d2022-04-04 18:14:34 +01003226 regprog_T *prev_prog = rmp->regprog;
3227
Bram Moolenaarfda37292014-11-05 14:27:36 +01003228#ifdef FEAT_EVAL
3229 report_re_switch(pat);
3230#endif
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02003231#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02003232 // checking for \z misuse was already done when compiling for NFA,
3233 // allow all here
3234 reg_do_extmatch = REX_ALL;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02003235#endif
Bram Moolenaarfda37292014-11-05 14:27:36 +01003236 rmp->regprog = vim_regcomp(pat, re_flags);
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02003237#ifdef FEAT_SYN_HL
Bram Moolenaarbcf94422018-06-23 14:21:42 +02003238 reg_do_extmatch = 0;
Bram Moolenaar1f8c4692018-06-23 15:09:10 +02003239#endif
Bram Moolenaare8a4c0d2022-04-04 18:14:34 +01003240 if (rmp->regprog == NULL)
Bram Moolenaar41499802018-07-18 06:02:09 +02003241 {
Bram Moolenaare8a4c0d2022-04-04 18:14:34 +01003242 // Somehow compiling the pattern failed now, put back the
3243 // previous one to avoid "regprog" becoming NULL.
3244 rmp->regprog = prev_prog;
3245 }
3246 else
3247 {
3248 vim_regfree(prev_prog);
3249
Bram Moolenaar41499802018-07-18 06:02:09 +02003250 rmp->regprog->re_in_use = TRUE;
Bram Moolenaarfda37292014-11-05 14:27:36 +01003251 result = rmp->regprog->engine->regexec_multi(
Paul Ollis65745772022-06-05 16:55:54 +01003252 rmp, win, buf, lnum, col, timed_out);
Bram Moolenaar41499802018-07-18 06:02:09 +02003253 rmp->regprog->re_in_use = FALSE;
3254 }
Bram Moolenaarfda37292014-11-05 14:27:36 +01003255 vim_free(pat);
3256 }
3257 p_re = save_p_re;
3258 }
3259
Bram Moolenaar6100d022016-10-02 16:51:57 +02003260 rex_in_use = rex_in_use_save;
3261 if (rex_in_use)
3262 rex = rex_save;
3263
Bram Moolenaar66a3e792014-11-20 23:07:05 +01003264 return result <= 0 ? 0 : result;
Bram Moolenaarfbc0d2e2013-05-19 19:40:29 +02003265}