blob: 23e31af22611851e36d90f03f89b1d93e5f92484 [file] [log] [blame]
Bram Moolenaar071d4272004-06-13 20:20:40 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 *
5 * NOTICE:
6 *
7 * This is NOT the original regular expression code as written by Henry
8 * Spencer. This code has been modified specifically for use with the VIM
9 * editor, and should not be used separately from Vim. If you want a good
10 * regular expression library, get the original code. The copyright notice
11 * that follows is from the original.
12 *
13 * END NOTICE
14 *
15 * Copyright (c) 1986 by University of Toronto.
16 * Written by Henry Spencer. Not derived from licensed software.
17 *
18 * Permission is granted to anyone to use this software for any
19 * purpose on any computer system, and to redistribute it freely,
20 * subject to the following restrictions:
21 *
22 * 1. The author is not responsible for the consequences of use of
23 * this software, no matter how awful, even if they arise
24 * from defects in it.
25 *
26 * 2. The origin of this software must not be misrepresented, either
27 * by explicit claim or by omission.
28 *
29 * 3. Altered versions must be plainly marked as such, and must not
30 * be misrepresented as being the original software.
31 *
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
35 *
Bram Moolenaarc0197e22004-09-13 20:26:32 +000036 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * Webb, Ciaran McCreesh and Bram Moolenaar.
Bram Moolenaar071d4272004-06-13 20:20:40 +000038 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */
40
41#include "vim.h"
42
43#undef DEBUG
44
45/*
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
49 *
50 * regstart char that must begin a match; NUL if none obvious; Can be a
51 * multi-byte character.
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
55 * regflags RF_ values or'ed together
56 *
57 * Regstart and reganch permit very fast decisions on suitable starting points
58 * for a match, cutting down the work a lot. Regmust permits fast rejection
59 * of lines that cannot possibly match. The regmust tests are costly enough
60 * that vim_regcomp() supplies a regmust only if the r.e. contains something
61 * potentially expensive (at present, the only such thing detected is * or +
62 * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
64 * computing it anyway.
65 */
66
67/*
68 * Structure for regexp "program". This is essentially a linear encoding
69 * of a nondeterministic finite-state machine (aka syntax charts or
70 * "railroad normal form" in parsing technology). Each node is an opcode
71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
72 * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
73 * pointer with a BRANCH on both ends of it is connecting two alternatives.
74 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
75 * (as opposed to a collection of them) is never concatenated with anything
76 * because of operator precedence). The "next" pointer of a BRACES_COMPLEX
77 * node points to the node after the stuff to be repeated. The operand of some
78 * types of node is a literal string; for others, it is a node leading into a
79 * sub-FSM. In particular, the operand of a BRANCH node is the first node of
80 * the branch. (NB this is *not* a tree structure: the tail of the branch
81 * connects to the thing following the set of BRANCHes.)
82 *
83 * pattern is coded like:
84 *
85 * +-----------------+
86 * | V
87 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
88 * | ^ | ^
89 * +------+ +----------+
90 *
91 *
92 * +------------------+
93 * V |
94 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
95 * | | ^ ^
96 * | +---------------+ |
97 * +---------------------------------------------+
98 *
99 *
100 * +-------------------------+
101 * V |
102 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
103 * | | ^
104 * | +----------------+
105 * +-----------------------------------------------+
106 *
107 *
108 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
109 * | | ^ ^
110 * | +----------------+ |
111 * +--------------------------------+
112 *
113 * +---------+
114 * | V
115 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
116 * | | | | ^ ^
117 * | | | +-----+ |
118 * | | +----------------+ |
119 * | +---------------------------+ |
120 * +------------------------------------------------------+
121 *
122 * They all start with a BRANCH for "\|" alternatives, even when there is only
123 * one alternative.
124 */
125
126/*
127 * The opcodes are:
128 */
129
130/* definition number opnd? meaning */
131#define END 0 /* End of program or NOMATCH operand. */
132#define BOL 1 /* Match "" at beginning of line. */
133#define EOL 2 /* Match "" at end of line. */
134#define BRANCH 3 /* node Match this alternative, or the
135 * next... */
136#define BACK 4 /* Match "", "next" ptr points backward. */
137#define EXACTLY 5 /* str Match this string. */
138#define NOTHING 6 /* Match empty string. */
139#define STAR 7 /* node Match this (simple) thing 0 or more
140 * times. */
141#define PLUS 8 /* node Match this (simple) thing 1 or more
142 * times. */
143#define MATCH 9 /* node match the operand zero-width */
144#define NOMATCH 10 /* node check for no match with operand */
145#define BEHIND 11 /* node look behind for a match with operand */
146#define NOBEHIND 12 /* node look behind for no match with operand */
147#define SUBPAT 13 /* node match the operand here */
148#define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and
149 * n times (\{m,n\}). */
150#define BOW 15 /* Match "" after [^a-zA-Z0-9_] */
151#define EOW 16 /* Match "" at [^a-zA-Z0-9_] */
152#define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE
153 * and BRACE_COMPLEX. */
154#define NEWL 18 /* Match line-break */
155#define BHPOS 19 /* End position for BEHIND or NOBEHIND */
156
157
/*
 * Character classes: 20-48 normal, 50-78 include a line-break.
 * Adding ADD_NL to a normal class opcode gives the variant that also
 * matches a line-break.
 *
 * FIRST_NL and LAST_NL are fully parenthesized so that they expand to a
 * single value inside any surrounding expression (e.g. "2 * FIRST_NL").
 */
#define ADD_NL          30
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20      /*      Match any one character. */
#define ANYOF           21      /* str  Match any character in this string. */
#define ANYBUT          22      /* str  Match any character not in this
                                 *      string. */
#define IDENT           23      /*      Match identifier char */
#define SIDENT          24      /*      Match identifier char but no digit */
#define KWORD           25      /*      Match keyword char */
#define SKWORD          26      /*      Match word char but no digit */
#define FNAME           27      /*      Match file name char */
#define SFNAME          28      /*      Match file name char but no digit */
#define PRINT           29      /*      Match printable char */
#define SPRINT          30      /*      Match printable char but no digit */
#define WHITE           31      /*      Match whitespace char */
#define NWHITE          32      /*      Match non-whitespace char */
#define DIGIT           33      /*      Match digit char */
#define NDIGIT          34      /*      Match non-digit char */
#define HEX             35      /*      Match hex char */
#define NHEX            36      /*      Match non-hex char */
#define OCTAL           37      /*      Match octal char */
#define NOCTAL          38      /*      Match non-octal char */
#define WORD            39      /*      Match word char */
#define NWORD           40      /*      Match non-word char */
#define HEAD            41      /*      Match head char */
#define NHEAD           42      /*      Match non-head char */
#define ALPHA           43      /*      Match alpha char */
#define NALPHA          44      /*      Match non-alpha char */
#define LOWER           45      /*      Match lowercase char */
#define NLOWER          46      /*      Match non-lowercase char */
#define UPPER           47      /*      Match uppercase char */
#define NUPPER          48      /*      Match non-uppercase char */
#define LAST_NL         (NUPPER + ADD_NL)
/* TRUE when opcode "op" is one of the classes that include a line-break. */
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
193
194#define MOPEN 80 /* -89 Mark this point in input as start of
195 * \( subexpr. MOPEN + 0 marks start of
196 * match. */
197#define MCLOSE 90 /* -99 Analogous to MOPEN. MCLOSE + 0 marks
198 * end of match. */
199#define BACKREF 100 /* -109 node Match same string again \1-\9 */
200
201#ifdef FEAT_SYN_HL
202# define ZOPEN 110 /* -119 Mark this point in input as start of
203 * \z( subexpr. */
204# define ZCLOSE 120 /* -129 Analogous to ZOPEN. */
205# define ZREF 130 /* -139 node Match external submatch \z1-\z9 */
206#endif
207
208#define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */
209
210#define NOPEN 150 /* Mark this point in input as start of
211 \%( subexpr. */
212#define NCLOSE 151 /* Analogous to NOPEN. */
213
214#define MULTIBYTECODE 200 /* mbc Match one multi-byte character */
215#define RE_BOF 201 /* Match "" at beginning of file. */
216#define RE_EOF 202 /* Match "" at end of file. */
217#define CURSOR 203 /* Match location of cursor. */
218
219#define RE_LNUM 204 /* nr cmp Match line number */
220#define RE_COL 205 /* nr cmp Match column number */
221#define RE_VCOL 206 /* nr cmp Match virtual column number */
222
223/*
224 * Magic characters have a special meaning, they don't match literally.
225 * Magic characters are negative. This separates them from literal characters
226 * (possibly multi-byte). Only ASCII characters can be Magic.
227 */
228#define Magic(x) ((int)(x) - 256)
229#define un_Magic(x) ((x) + 256)
230#define is_Magic(x) ((x) < 0)
231
232static int no_Magic __ARGS((int x));
233static int toggle_Magic __ARGS((int x));
234
/*
 * Remove the magicness from "x": when "x" is a Magic character its plain
 * character value is returned, any other value is returned unchanged.
 */
    static int
no_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : x;
}
243
/*
 * Flip the magicness of "x": a Magic character becomes its plain character
 * value, a plain character becomes the corresponding Magic value.
 */
    static int
toggle_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
252
253/*
254 * The first byte of the regexp internal "program" is actually this magic
255 * number; the start node begins in the second byte. It's used to catch the
256 * most severe mutilation of the program by the caller.
257 */
258
259#define REGMAGIC 0234
260
261/*
262 * Opcode notes:
263 *
264 * BRANCH The set of branches constituting a single choice are hooked
265 * together with their "next" pointers, since precedence prevents
266 * anything being concatenated to any individual branch. The
267 * "next" pointer of the last BRANCH in a choice points to the
268 * thing following the whole choice. This is also where the
269 * final "next" pointer of each individual branch points; each
270 * branch starts with the operand node of a BRANCH node.
271 *
272 * BACK Normal "next" pointers all implicitly point forward; BACK
273 * exists to make loop structures possible.
274 *
275 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
276 * BRANCH structures using BACK. Simple cases (one character
277 * per match) are implemented with STAR and PLUS for speed
278 * and to minimize recursive plunges.
279 *
280 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
281 * node, and defines the min and max limits to be used for that
282 * node.
283 *
284 * MOPEN,MCLOSE ...are numbered at compile time.
285 * ZOPEN,ZCLOSE ...ditto
286 */
287
288/*
289 * A node is one char of opcode followed by two chars of "next" pointer.
290 * "Next" pointers are stored as two 8-bit bytes, high order first. The
291 * value is a positive offset from the opcode of the node containing it.
292 * An operand, if any, simply follows the node. (Note that much of the
293 * code generation knows about this implicit relationship.)
294 *
295 * Using two bytes for the "next" pointer is vast overkill for most things,
296 * but allows patterns to get big without disasters.
297 */
298#define OP(p) ((int)*(p))
299#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
300#define OPERAND(p) ((p) + 3)
301/* Obtain an operand that was stored as four bytes, MSB first. */
302#define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
303 + ((long)(p)[5] << 8) + (long)(p)[6])
304/* Obtain a second operand stored as four bytes. */
305#define OPERAND_MAX(p) OPERAND_MIN((p) + 4)
306/* Obtain a second single-byte operand stored after a four bytes operand. */
307#define OPERAND_CMP(p) (p)[7]
308
309/*
310 * Utility definitions.
311 */
312#define UCHARAT(p) ((int)*(char_u *)(p))
313
/*
 * Used for an error (down from) vim_regcomp(): give the error message, set
 * rc_did_emsg and return NULL (EMSG_RET_FAIL returns FAIL instead).
 *
 * EMSG_M_RET_NULL passes "" for the "%s" in the message when "c" is
 * non-zero and a backslash otherwise.  "c" is parenthesized so that any
 * expression can be passed without being regrouped by the "?:" operator.
 * EMSG_ONE_RET_NULL is the E369 message, with "c" being "very magic".
 */
#define EMSG_RET_NULL(m) { EMSG(m); rc_did_emsg = TRUE; return NULL; }
#define EMSG_M_RET_NULL(m, c) { EMSG2(m, (c) ? "" : "\\"); rc_did_emsg = TRUE; return NULL; }
#define EMSG_RET_FAIL(m) { EMSG(m); rc_did_emsg = TRUE; return FAIL; }
#define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
320
321#define MAX_LIMIT (32767L << 16L)
322
323static int re_multi_type __ARGS((int));
324static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
325static char_u *cstrchr __ARGS((char_u *, int));
326
327#ifdef DEBUG
328static void regdump __ARGS((char_u *, regprog_T *));
329static char_u *regprop __ARGS((char_u *));
330#endif
331
332#define NOT_MULTI 0
333#define MULTI_ONE 1
334#define MULTI_MULT 2
335/*
336 * Return NOT_MULTI if c is not a "multi" operator.
337 * Return MULTI_ONE if c is a single "multi" operator.
338 * Return MULTI_MULT if c is a multi "multi" operator.
339 */
340 static int
341re_multi_type(c)
342 int c;
343{
344 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
345 return MULTI_ONE;
346 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
347 return MULTI_MULT;
348 return NOT_MULTI;
349}
350
351/*
352 * Flags to be passed up and down.
353 */
354#define HASWIDTH 0x1 /* Known never to match null string. */
355#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
356#define SPSTART 0x4 /* Starts with * or +. */
357#define HASNL 0x8 /* Contains some \n. */
358#define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */
359#define WORST 0 /* Worst case. */
360
361/*
362 * When regcode is set to this value, code is not emitted and size is computed
363 * instead.
364 */
365#define JUST_CALC_SIZE ((char_u *) -1)
366
367static char_u *reg_prev_sub;
368
369/*
370 * REGEXP_INRANGE contains all characters which are always special in a []
371 * range after '\'.
372 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
373 * These are:
374 * \n - New line (NL).
375 * \r - Carriage Return (CR).
376 * \t - Tab (TAB).
377 * \e - Escape (ESC).
378 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000379 * \d - Character code in decimal, eg \d123
380 * \o - Character code in octal, eg \o40
381 * \x - Character code in hex, eg \x4a
382 * \u - Multibyte character code, eg \u20ac
383 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384 */
385static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000386static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000387
388static int backslash_trans __ARGS((int c));
389static int skip_class_name __ARGS((char_u **pp));
390static char_u *skip_anyof __ARGS((char_u *p));
391static void init_class_tab __ARGS((void));
392
393/*
394 * Translate '\x' to its control character, except "\n", which is Magic.
395 */
396 static int
397backslash_trans(c)
398 int c;
399{
400 switch (c)
401 {
402 case 'r': return CAR;
403 case 't': return TAB;
404 case 'e': return ESC;
405 case 'b': return BS;
406 }
407 return c;
408}
409
410/*
411 * Check for a character class name. "pp" points to the '['.
412 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
413 * recognized. Otherwise "pp" is advanced to after the item.
414 */
415 static int
416skip_class_name(pp)
417 char_u **pp;
418{
419 static const char *(class_names[]) =
420 {
421 "alnum:]",
422#define CLASS_ALNUM 0
423 "alpha:]",
424#define CLASS_ALPHA 1
425 "blank:]",
426#define CLASS_BLANK 2
427 "cntrl:]",
428#define CLASS_CNTRL 3
429 "digit:]",
430#define CLASS_DIGIT 4
431 "graph:]",
432#define CLASS_GRAPH 5
433 "lower:]",
434#define CLASS_LOWER 6
435 "print:]",
436#define CLASS_PRINT 7
437 "punct:]",
438#define CLASS_PUNCT 8
439 "space:]",
440#define CLASS_SPACE 9
441 "upper:]",
442#define CLASS_UPPER 10
443 "xdigit:]",
444#define CLASS_XDIGIT 11
445 "tab:]",
446#define CLASS_TAB 12
447 "return:]",
448#define CLASS_RETURN 13
449 "backspace:]",
450#define CLASS_BACKSPACE 14
451 "escape:]",
452#define CLASS_ESCAPE 15
453 };
454#define CLASS_NONE 99
455 int i;
456
457 if ((*pp)[1] == ':')
458 {
459 for (i = 0; i < sizeof(class_names) / sizeof(*class_names); ++i)
460 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
461 {
462 *pp += STRLEN(class_names[i]) + 2;
463 return i;
464 }
465 }
466 return CLASS_NONE;
467}
468
469/*
470 * Skip over a "[]" range.
471 * "p" must point to the character after the '['.
472 * The returned pointer is on the matching ']', or the terminating NUL.
473 */
474 static char_u *
475skip_anyof(p)
476 char_u *p;
477{
478 int cpo_lit; /* 'cpoptions' contains 'l' flag */
479#ifdef FEAT_MBYTE
480 int l;
481#endif
482
483 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
484
485 if (*p == '^') /* Complement of range. */
486 ++p;
487 if (*p == ']' || *p == '-')
488 ++p;
489 while (*p != NUL && *p != ']')
490 {
491#ifdef FEAT_MBYTE
492 if (has_mbyte && (l = (*mb_ptr2len_check)(p)) > 1)
493 p += l;
494 else
495#endif
496 if (*p == '-')
497 {
498 ++p;
499 if (*p != ']' && *p != NUL)
500 {
501#ifdef FEAT_MBYTE
502 if (has_mbyte)
503 p += (*mb_ptr2len_check)(p);
504 else
505#endif
506 ++p;
507 }
508 }
509 else if (*p == '\\'
510 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
511 || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
512 p += 2;
513 else if (*p == '[')
514 {
515 if (skip_class_name(&p) == CLASS_NONE)
516 ++p; /* It was not a class name */
517 }
518 else
519 ++p;
520 }
521
522 return p;
523}
524
525/*
526 * Specific version of character class functions.
527 * Using a table to keep this fast.
528 */
529static short class_tab[256];
530
531#define RI_DIGIT 0x01
532#define RI_HEX 0x02
533#define RI_OCTAL 0x04
534#define RI_WORD 0x08
535#define RI_HEAD 0x10
536#define RI_ALPHA 0x20
537#define RI_LOWER 0x40
538#define RI_UPPER 0x80
539#define RI_WHITE 0x100
540
541 static void
542init_class_tab()
543{
544 int i;
545 static int done = FALSE;
546
547 if (done)
548 return;
549
550 for (i = 0; i < 256; ++i)
551 {
552 if (i >= '0' && i <= '7')
553 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
554 else if (i >= '8' && i <= '9')
555 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
556 else if (i >= 'a' && i <= 'f')
557 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
558#ifdef EBCDIC
559 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
560 || (i >= 's' && i <= 'z'))
561#else
562 else if (i >= 'g' && i <= 'z')
563#endif
564 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
565 else if (i >= 'A' && i <= 'F')
566 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
567#ifdef EBCDIC
568 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
569 || (i >= 'S' && i <= 'Z'))
570#else
571 else if (i >= 'G' && i <= 'Z')
572#endif
573 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
574 else if (i == '_')
575 class_tab[i] = RI_WORD + RI_HEAD;
576 else
577 class_tab[i] = 0;
578 }
579 class_tab[' '] |= RI_WHITE;
580 class_tab['\t'] |= RI_WHITE;
581 done = TRUE;
582}
583
/*
 * Character class tests using class_tab[].  Each evaluates to non-zero when
 * "c" has the corresponding RI_ flag.
 * With FEAT_MBYTE "c" can be a character above 255, for which the result is
 * always zero because class_tab[] only has 256 entries.
 * The argument is parenthesized in the range check so that an expression
 * argument is compared as a whole.  Note that "c" is evaluated twice in the
 * FEAT_MBYTE versions: do not pass an expression with side effects.
 */
#ifdef FEAT_MBYTE
# define ri_digit(c)    ((c) < 0x100 && (class_tab[c] & RI_DIGIT))
# define ri_hex(c)      ((c) < 0x100 && (class_tab[c] & RI_HEX))
# define ri_octal(c)    ((c) < 0x100 && (class_tab[c] & RI_OCTAL))
# define ri_word(c)     ((c) < 0x100 && (class_tab[c] & RI_WORD))
# define ri_head(c)     ((c) < 0x100 && (class_tab[c] & RI_HEAD))
# define ri_alpha(c)    ((c) < 0x100 && (class_tab[c] & RI_ALPHA))
# define ri_lower(c)    ((c) < 0x100 && (class_tab[c] & RI_LOWER))
# define ri_upper(c)    ((c) < 0x100 && (class_tab[c] & RI_UPPER))
# define ri_white(c)    ((c) < 0x100 && (class_tab[c] & RI_WHITE))
#else
# define ri_digit(c)    (class_tab[c] & RI_DIGIT)
# define ri_hex(c)      (class_tab[c] & RI_HEX)
# define ri_octal(c)    (class_tab[c] & RI_OCTAL)
# define ri_word(c)     (class_tab[c] & RI_WORD)
# define ri_head(c)     (class_tab[c] & RI_HEAD)
# define ri_alpha(c)    (class_tab[c] & RI_ALPHA)
# define ri_lower(c)    (class_tab[c] & RI_LOWER)
# define ri_upper(c)    (class_tab[c] & RI_UPPER)
# define ri_white(c)    (class_tab[c] & RI_WHITE)
#endif
605
606/* flags for regflags */
607#define RF_ICASE 1 /* ignore case */
608#define RF_NOICASE 2 /* don't ignore case */
609#define RF_HASNL 4 /* can match a NL */
610#define RF_ICOMBINE 8 /* ignore combining characters */
611#define RF_LOOKBH 16 /* uses "\@<=" or "\@<!" */
612
613/*
614 * Global work variables for vim_regcomp().
615 */
616
617static char_u *regparse; /* Input-scan pointer. */
618static int prevchr_len; /* byte length of previous char */
619static int num_complex_braces; /* Complex \{...} count */
620static int regnpar; /* () count. */
621#ifdef FEAT_SYN_HL
622static int regnzpar; /* \z() count. */
623static int re_has_z; /* \z item detected */
624#endif
625static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
626static long regsize; /* Code size. */
627static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */
628static unsigned regflags; /* RF_ flags for prog */
629static long brace_min[10]; /* Minimums for complex brace repeats */
630static long brace_max[10]; /* Maximums for complex brace repeats */
631static int brace_count[10]; /* Current counts for complex brace repeats */
632#if defined(FEAT_SYN_HL) || defined(PROTO)
633static int had_eol; /* TRUE when EOL found by vim_regcomp() */
634#endif
635static int one_exactly = FALSE; /* only do one char for EXACTLY */
636
637static int reg_magic; /* magicness of the pattern: */
638#define MAGIC_NONE 1 /* "\V" very unmagic */
639#define MAGIC_OFF 2 /* "\M" or 'magic' off */
640#define MAGIC_ON 3 /* "\m" or 'magic' */
641#define MAGIC_ALL 4 /* "\v" very magic */
642
643static int reg_string; /* matching with a string instead of a buffer
644 line */
645
646/*
647 * META contains all characters that may be magic, except '^' and '$'.
648 */
649
650#ifdef EBCDIC
651static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
652#else
653/* META[] is used often enough to justify turning it into a table. */
654static char_u META_flags[] = {
655 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
656 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
657/* % & ( ) * + . */
658 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
659/* 1 2 3 4 5 6 7 8 9 < = > ? */
660 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
661/* @ A C D F H I K L M O */
662 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
663/* P S U V W X Z [ _ */
664 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
665/* a c d f h i k l m n o */
666 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
667/* p s u v w x z { | ~ */
668 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
669};
670#endif
671
672static int curchr;
673
674/* arguments for reg() */
675#define REG_NOPAREN 0 /* toplevel reg() */
676#define REG_PAREN 1 /* \(\) */
677#define REG_ZPAREN 2 /* \z(\) */
678#define REG_NPAREN 3 /* \%(\) */
679
680/*
681 * Forward declarations for vim_regcomp()'s friends.
682 */
683static void initchr __ARGS((char_u *));
684static int getchr __ARGS((void));
685static void skipchr_keepstart __ARGS((void));
686static int peekchr __ARGS((void));
687static void skipchr __ARGS((void));
688static void ungetchr __ARGS((void));
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000689static int gethexchrs __ARGS((int maxinputlen));
690static int getoctchrs __ARGS((void));
691static int getdecchrs __ARGS((void));
692static int coll_get_char __ARGS((void));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000693static void regcomp_start __ARGS((char_u *expr, int flags));
694static char_u *reg __ARGS((int, int *));
695static char_u *regbranch __ARGS((int *flagp));
696static char_u *regconcat __ARGS((int *flagp));
697static char_u *regpiece __ARGS((int *));
698static char_u *regatom __ARGS((int *));
699static char_u *regnode __ARGS((int));
700static int prog_magic_wrong __ARGS((void));
701static char_u *regnext __ARGS((char_u *));
702static void regc __ARGS((int b));
703#ifdef FEAT_MBYTE
704static void regmbc __ARGS((int c));
705#endif
706static void reginsert __ARGS((int, char_u *));
707static void reginsert_limits __ARGS((int, long, long, char_u *));
708static char_u *re_put_long __ARGS((char_u *pr, long_u val));
709static int read_limits __ARGS((long *, long *));
710static void regtail __ARGS((char_u *, char_u *));
711static void regoptail __ARGS((char_u *, char_u *));
712
713/*
714 * Return TRUE if compiled regular expression "prog" can match a line break.
715 */
716 int
717re_multiline(prog)
718 regprog_T *prog;
719{
720 return (prog->regflags & RF_HASNL);
721}
722
723/*
724 * Return TRUE if compiled regular expression "prog" looks before the start
725 * position (pattern contains "\@<=" or "\@<!").
726 */
727 int
728re_lookbehind(prog)
729 regprog_T *prog;
730{
731 return (prog->regflags & RF_LOOKBH);
732}
733
734/*
735 * Skip past regular expression.
736 * Stop at end of 'p' of where 'dirc' is found ('/', '?', etc).
737 * Take care of characters with a backslash in front of it.
738 * Skip strings inside [ and ].
739 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
740 * expression and change "\?" to "?". If "*newp" is not NULL the expression
741 * is changed in-place.
742 */
743 char_u *
744skip_regexp(startp, dirc, magic, newp)
745 char_u *startp;
746 int dirc;
747 int magic;
748 char_u **newp;
749{
750 int mymagic;
751 char_u *p = startp;
752
753 if (magic)
754 mymagic = MAGIC_ON;
755 else
756 mymagic = MAGIC_OFF;
757
758 for (; p[0] != NUL; ++p)
759 {
760 if (p[0] == dirc) /* found end of regexp */
761 break;
762 if ((p[0] == '[' && mymagic >= MAGIC_ON)
763 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
764 {
765 p = skip_anyof(p + 1);
766 if (p[0] == NUL)
767 break;
768 }
769 else if (p[0] == '\\' && p[1] != NUL)
770 {
771 if (dirc == '?' && newp != NULL && p[1] == '?')
772 {
773 /* change "\?" to "?", make a copy first. */
774 if (*newp == NULL)
775 {
776 *newp = vim_strsave(startp);
777 if (*newp != NULL)
778 p = *newp + (p - startp);
779 }
780 if (*newp != NULL)
781 mch_memmove(p, p + 1, STRLEN(p));
782 else
783 ++p;
784 }
785 else
786 ++p; /* skip next character */
787 if (*p == 'v')
788 mymagic = MAGIC_ALL;
789 else if (*p == 'V')
790 mymagic = MAGIC_NONE;
791 }
792#ifdef FEAT_MBYTE
793 else if (has_mbyte)
794 p += (*mb_ptr2len_check)(p) - 1;
795#endif
796 }
797 return p;
798}
799
800/*
801 * vim_regcomp - compile a regular expression into internal code
802 *
803 * We can't allocate space until we know how big the compiled form will be,
804 * but we can't compile it (and thus know how big it is) until we've got a
805 * place to put the code. So we cheat: we compile it twice, once with code
806 * generation turned off and size counting turned on, and once "for real".
807 * This also means that we don't allocate space until we are sure that the
808 * thing really will compile successfully, and we never have to move the
809 * code and thus invalidate pointers into it. (Note that it has to be in
810 * one piece because vim_free() must be able to free it all.)
811 *
812 * Whether upper/lower case is to be ignored is decided when executing the
813 * program, it does not matter here.
814 *
815 * Beware that the optimization-preparation code in here knows about some
816 * of the structure of the compiled regexp.
817 * "re_flags": RE_MAGIC and/or RE_STRING.
818 */
819 regprog_T *
820vim_regcomp(expr, re_flags)
821 char_u *expr;
822 int re_flags;
823{
824 regprog_T *r;
825 char_u *scan;
826 char_u *longest;
827 int len;
828 int flags;
829
830 if (expr == NULL)
831 EMSG_RET_NULL(_(e_null));
832
833 init_class_tab();
834
835 /*
836 * First pass: determine size, legality.
837 */
838 regcomp_start(expr, re_flags);
839 regcode = JUST_CALC_SIZE;
840 regc(REGMAGIC);
841 if (reg(REG_NOPAREN, &flags) == NULL)
842 return NULL;
843
844 /* Small enough for pointer-storage convention? */
845#ifdef SMALL_MALLOC /* 16 bit storage allocation */
846 if (regsize >= 65536L - 256L)
847 EMSG_RET_NULL(_("E339: Pattern too long"));
848#endif
849
850 /* Allocate space. */
851 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE);
852 if (r == NULL)
853 return NULL;
854
855 /*
856 * Second pass: emit code.
857 */
858 regcomp_start(expr, re_flags);
859 regcode = r->program;
860 regc(REGMAGIC);
861 if (reg(REG_NOPAREN, &flags) == NULL)
862 {
863 vim_free(r);
864 return NULL;
865 }
866
867 /* Dig out information for optimizations. */
868 r->regstart = NUL; /* Worst-case defaults. */
869 r->reganch = 0;
870 r->regmust = NULL;
871 r->regmlen = 0;
872 r->regflags = regflags;
873 if (flags & HASNL)
874 r->regflags |= RF_HASNL;
875 if (flags & HASLOOKBH)
876 r->regflags |= RF_LOOKBH;
877#ifdef FEAT_SYN_HL
878 /* Remember whether this pattern has any \z specials in it. */
879 r->reghasz = re_has_z;
880#endif
881 scan = r->program + 1; /* First BRANCH. */
882 if (OP(regnext(scan)) == END) /* Only one top-level choice. */
883 {
884 scan = OPERAND(scan);
885
886 /* Starting-point info. */
887 if (OP(scan) == BOL || OP(scan) == RE_BOF)
888 {
889 r->reganch++;
890 scan = regnext(scan);
891 }
892
893 if (OP(scan) == EXACTLY)
894 {
895#ifdef FEAT_MBYTE
896 if (has_mbyte)
897 r->regstart = (*mb_ptr2char)(OPERAND(scan));
898 else
899#endif
900 r->regstart = *OPERAND(scan);
901 }
902 else if ((OP(scan) == BOW
903 || OP(scan) == EOW
904 || OP(scan) == NOTHING
905 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
906 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
907 && OP(regnext(scan)) == EXACTLY)
908 {
909#ifdef FEAT_MBYTE
910 if (has_mbyte)
911 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
912 else
913#endif
914 r->regstart = *OPERAND(regnext(scan));
915 }
916
917 /*
918 * If there's something expensive in the r.e., find the longest
919 * literal string that must appear and make it the regmust. Resolve
920 * ties in favor of later strings, since the regstart check works
921 * with the beginning of the r.e. and avoiding duplication
922 * strengthens checking. Not a strong reason, but sufficient in the
923 * absence of others.
924 */
925 /*
926 * When the r.e. starts with BOW, it is faster to look for a regmust
927 * first. Used a lot for "#" and "*" commands. (Added by mool).
928 */
929 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
930 && !(flags & HASNL))
931 {
932 longest = NULL;
933 len = 0;
934 for (; scan != NULL; scan = regnext(scan))
935 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
936 {
937 longest = OPERAND(scan);
938 len = (int)STRLEN(OPERAND(scan));
939 }
940 r->regmust = longest;
941 r->regmlen = len;
942 }
943 }
944#ifdef DEBUG
945 regdump(expr, r);
946#endif
947 return r;
948}
949
950/*
951 * Setup to parse the regexp. Used once to get the length and once to do it.
952 */
953 static void
954regcomp_start(expr, re_flags)
955 char_u *expr;
956 int re_flags; /* see vim_regcomp() */
957{
958 initchr(expr);
959 if (re_flags & RE_MAGIC)
960 reg_magic = MAGIC_ON;
961 else
962 reg_magic = MAGIC_OFF;
963 reg_string = (re_flags & RE_STRING);
964
965 num_complex_braces = 0;
966 regnpar = 1;
967 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
968#ifdef FEAT_SYN_HL
969 regnzpar = 1;
970 re_has_z = 0;
971#endif
972 regsize = 0L;
973 regflags = 0;
974#if defined(FEAT_SYN_HL) || defined(PROTO)
975 had_eol = FALSE;
976#endif
977}
978
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Check if during the previous call to vim_regcomp the EOL item "$" has been
 * found.  This is messy, but it works fine.
 *
 * Returns the file-level "had_eol" flag: regcomp_start() resets it to FALSE
 * and regatom() sets it to TRUE when it emits an EOL node for "$" or "\_$".
 */
    int
vim_regcomp_had_eol()
{
    return had_eol;
}
#endif
990
/*
 * reg - regular expression, i.e. main body or parenthesized thing
 *
 * Caller must absorb opening parenthesis.
 *
 * Combining parenthesis handling with the base level of regular expression
 * is a trifle forced, but the need to tie the tails of the branches to what
 * follows makes it hard to avoid.
 *
 * "paren" tells what kind of group is being parsed and thereby which
 * open/close nodes are emitted: ZOPEN/ZCLOSE, MOPEN/MCLOSE, NOPEN/NCLOSE or,
 * for REG_NOPAREN, no open node and an END node.
 * "flagp" is set to HASWIDTH unless one of the branches lacks it, plus the
 * SPSTART, HASNL and HASLOOKBH bits of every branch.
 *
 * Returns a pointer to the first emitted node, or NULL when a branch failed
 * to parse or an error message was given.
 */
    static char_u *
reg(paren, flagp)
    int         paren;  /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
    int         *flagp;
{
    char_u      *ret;
    char_u      *br;
    char_u      *ender;
    int         parno = 0;
    int         flags;

    *flagp = HASWIDTH;          /* Tentatively. */

#ifdef FEAT_SYN_HL
    if (paren == REG_ZPAREN)
    {
        /* Make a ZOPEN node. */
        if (regnzpar >= NSUBEXP)
            EMSG_RET_NULL(_("E50: Too many \\z("));
        parno = regnzpar;
        regnzpar++;
        ret = regnode(ZOPEN + parno);
    }
    else
#endif
        if (paren == REG_PAREN)
    {
        /* Make a MOPEN node. */
        if (regnpar >= NSUBEXP)
            EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
        parno = regnpar;
        ++regnpar;
        ret = regnode(MOPEN + parno);
    }
    else if (paren == REG_NPAREN)
    {
        /* Make a NOPEN node. */
        ret = regnode(NOPEN);
    }
    else
        ret = NULL;             /* top level: the first branch starts it */

    /* Pick up the branches, linking them together. */
    br = regbranch(&flags);
    if (br == NULL)
        return NULL;
    if (ret != NULL)
        regtail(ret, br);       /* [MZ]OPEN -> first. */
    else
        ret = br;
    /* If one of the branches can be zero-width, the whole thing can.
     * If one of the branches has * at start or matches a line-break, the
     * whole thing can. */
    if (!(flags & HASWIDTH))
        *flagp &= ~HASWIDTH;
    *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
    while (peekchr() == Magic('|'))
    {
        skipchr();
        br = regbranch(&flags);
        if (br == NULL)
            return NULL;
        regtail(ret, br);       /* BRANCH -> BRANCH. */
        if (!(flags & HASWIDTH))
            *flagp &= ~HASWIDTH;
        *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
    }

    /* Make a closing node, and hook it on the end. */
    ender = regnode(
#ifdef FEAT_SYN_HL
            paren == REG_ZPAREN ? ZCLOSE + parno :
#endif
            paren == REG_PAREN ? MCLOSE + parno :
            paren == REG_NPAREN ? NCLOSE : END);
    regtail(ret, ender);

    /* Hook the tails of the branches to the closing node. */
    for (br = ret; br != NULL; br = regnext(br))
        regoptail(br, ender);

    /* Check for proper termination. */
    /* NOTE: the EMSG_*_RET_NULL uses directly before an "else" below have no
     * trailing semicolon on purpose; adding one would detach the "else". */
    if (paren != REG_NOPAREN && getchr() != Magic(')'))
    {
#ifdef FEAT_SYN_HL
        if (paren == REG_ZPAREN)
            EMSG_RET_NULL(_("E52: Unmatched \\z("))
        else
#endif
            if (paren == REG_NPAREN)
            EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL)
        else
            EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL)
    }
    else if (paren == REG_NOPAREN && peekchr() != NUL)
    {
        if (curchr == Magic(')'))
            EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL)
        else
            EMSG_RET_NULL(_(e_trailing))        /* "Can't happen". */
        /* NOTREACHED */
    }
    /*
     * Here we set the flag allowing back references to this set of
     * parentheses.
     */
    if (paren == REG_PAREN)
        had_endbrace[parno] = TRUE;     /* have seen the close paren */
    return ret;
}
1110
1111/*
1112 * regbranch - one alternative of an | operator
1113 *
1114 * Implements the & operator.
1115 */
1116 static char_u *
1117regbranch(flagp)
1118 int *flagp;
1119{
1120 char_u *ret;
1121 char_u *chain = NULL;
1122 char_u *latest;
1123 int flags;
1124
1125 *flagp = WORST | HASNL; /* Tentatively. */
1126
1127 ret = regnode(BRANCH);
1128 for (;;)
1129 {
1130 latest = regconcat(&flags);
1131 if (latest == NULL)
1132 return NULL;
1133 /* If one of the branches has width, the whole thing has. If one of
1134 * the branches anchors at start-of-line, the whole thing does.
1135 * If one of the branches uses look-behind, the whole thing does. */
1136 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1137 /* If one of the branches doesn't match a line-break, the whole thing
1138 * doesn't. */
1139 *flagp &= ~HASNL | (flags & HASNL);
1140 if (chain != NULL)
1141 regtail(chain, latest);
1142 if (peekchr() != Magic('&'))
1143 break;
1144 skipchr();
1145 regtail(latest, regnode(END)); /* operand ends */
1146 reginsert(MATCH, latest);
1147 chain = latest;
1148 }
1149
1150 return ret;
1151}
1152
1153/*
1154 * regbranch - one alternative of an | or & operator
1155 *
1156 * Implements the concatenation operator.
1157 */
1158 static char_u *
1159regconcat(flagp)
1160 int *flagp;
1161{
1162 char_u *first = NULL;
1163 char_u *chain = NULL;
1164 char_u *latest;
1165 int flags;
1166 int cont = TRUE;
1167
1168 *flagp = WORST; /* Tentatively. */
1169
1170 while (cont)
1171 {
1172 switch (peekchr())
1173 {
1174 case NUL:
1175 case Magic('|'):
1176 case Magic('&'):
1177 case Magic(')'):
1178 cont = FALSE;
1179 break;
1180 case Magic('Z'):
1181#ifdef FEAT_MBYTE
1182 regflags |= RF_ICOMBINE;
1183#endif
1184 skipchr_keepstart();
1185 break;
1186 case Magic('c'):
1187 regflags |= RF_ICASE;
1188 skipchr_keepstart();
1189 break;
1190 case Magic('C'):
1191 regflags |= RF_NOICASE;
1192 skipchr_keepstart();
1193 break;
1194 case Magic('v'):
1195 reg_magic = MAGIC_ALL;
1196 skipchr_keepstart();
1197 curchr = -1;
1198 break;
1199 case Magic('m'):
1200 reg_magic = MAGIC_ON;
1201 skipchr_keepstart();
1202 curchr = -1;
1203 break;
1204 case Magic('M'):
1205 reg_magic = MAGIC_OFF;
1206 skipchr_keepstart();
1207 curchr = -1;
1208 break;
1209 case Magic('V'):
1210 reg_magic = MAGIC_NONE;
1211 skipchr_keepstart();
1212 curchr = -1;
1213 break;
1214 default:
1215 latest = regpiece(&flags);
1216 if (latest == NULL)
1217 return NULL;
1218 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1219 if (chain == NULL) /* First piece. */
1220 *flagp |= flags & SPSTART;
1221 else
1222 regtail(chain, latest);
1223 chain = latest;
1224 if (first == NULL)
1225 first = latest;
1226 break;
1227 }
1228 }
1229 if (first == NULL) /* Loop ran zero times. */
1230 first = regnode(NOTHING);
1231 return first;
1232}
1233
/*
 * regpiece - something followed by possible [*+=]
 *
 * Note that the branching code sequences used for = and the general cases
 * of * and + are somewhat optimized:  they use the same NOTHING node as
 * both the endmarker for their branch list and the body of the last branch.
 * It might seem that this node could be dispensed with entirely, but the
 * endmarker role is not redundant.
 *
 * Besides "*", "+" and "=" (or "?") this also handles the "@" items (\@=,
 * \@!, \@>, \@<= and \@<!) and the "{" multi with limits.
 *
 * "flagp" gets the flags of the atom when no multi follows, otherwise flags
 * that depend on the multi (see the cases below).
 *
 * Returns the first node of the piece, or NULL when the atom or the limits
 * could not be parsed or an error message was given.
 */
    static char_u *
regpiece(flagp)
    int             *flagp;
{
    char_u          *ret;
    int             op;
    char_u          *next;
    int             flags;
    long            minval;
    long            maxval;

    ret = regatom(&flags);
    if (ret == NULL)
        return NULL;

    op = peekchr();
    if (re_multi_type(op) == NOT_MULTI)
    {
        /* Plain atom without a multi: pass its flags on unchanged. */
        *flagp = flags;
        return ret;
    }
    if (!(flags & HASWIDTH) && re_multi_type(op) == MULTI_MULT)
    {
        if (op == Magic('*'))
            EMSG_M_RET_NULL(_("E56: %s* operand could be empty"),
                                                       reg_magic >= MAGIC_ON);
        if (op == Magic('+'))
            EMSG_M_RET_NULL(_("E57: %s+ operand could be empty"),
                                                     reg_magic == MAGIC_ALL);
        /* "\{}" is checked below, it's allowed when there is an upper limit */
    }
    /* default flags */
    *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));

    skipchr();
    switch (op)
    {
        case Magic('*'):
            if (flags & SIMPLE)
                reginsert(STAR, ret);
            else
            {
                /* Emit x* as (x&|), where & means "self". */
                reginsert(BRANCH, ret); /* Either x */
                regoptail(ret, regnode(BACK));  /* and loop */
                regoptail(ret, ret);    /* back */
                regtail(ret, regnode(BRANCH));  /* or */
                regtail(ret, regnode(NOTHING)); /* null. */
            }
            break;

        case Magic('+'):
            if (flags & SIMPLE)
                reginsert(PLUS, ret);
            else
            {
                /* Emit x+ as x(&|), where & means "self". */
                next = regnode(BRANCH); /* Either */
                regtail(ret, next);
                regtail(regnode(BACK), ret);    /* loop back */
                regtail(next, regnode(BRANCH)); /* or */
                regtail(ret, regnode(NOTHING)); /* null. */
            }
            /* At least one match is required, thus this has width. */
            *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
            break;

        case Magic('@'):
            {
                int     lop = END;      /* END means: nothing valid found */

                switch (no_Magic(getchr()))
                {
                    case '=': lop = MATCH; break;                 /* \@= */
                    case '!': lop = NOMATCH; break;               /* \@! */
                    case '>': lop = SUBPAT; break;                /* \@> */
                    case '<': switch (no_Magic(getchr()))
                              {
                                  case '=': lop = BEHIND; break;   /* \@<= */
                                  case '!': lop = NOBEHIND; break; /* \@<! */
                              }
                }
                if (lop == END)
                    EMSG_M_RET_NULL(_("E59: invalid character after %s@"),
                                                      reg_magic == MAGIC_ALL);
                /* Look behind must match with behind_pos. */
                if (lop == BEHIND || lop == NOBEHIND)
                {
                    regtail(ret, regnode(BHPOS));
                    *flagp |= HASLOOKBH;
                }
                regtail(ret, regnode(END)); /* operand ends */
                reginsert(lop, ret);
                break;
            }

        case Magic('?'):
        case Magic('='):
            /* Emit x= as (x|) */
            reginsert(BRANCH, ret);             /* Either x */
            regtail(ret, regnode(BRANCH));      /* or */
            next = regnode(NOTHING);            /* null. */
            regtail(ret, next);
            regoptail(ret, next);
            break;

        case Magic('{'):
            if (!read_limits(&minval, &maxval))
                return NULL;
            /* An operand that could be empty is only refused when the
             * larger of the two limits is MAX_LIMIT or more. */
            if (!(flags & HASWIDTH) && (maxval > minval
                                 ? maxval >= MAX_LIMIT : minval >= MAX_LIMIT))
                EMSG_M_RET_NULL(_("E58: %s{ operand could be empty"),
                                                      reg_magic == MAGIC_ALL);
            if (flags & SIMPLE)
            {
                reginsert(BRACE_SIMPLE, ret);
                reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
            }
            else
            {
                /* Each complex brace gets its own opcode
                 * BRACE_COMPLEX + 0 .. BRACE_COMPLEX + 9, thus at most ten
                 * can be used in one pattern. */
                if (num_complex_braces >= 10)
                    EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"),
                                                      reg_magic == MAGIC_ALL);
                reginsert(BRACE_COMPLEX + num_complex_braces, ret);
                regoptail(ret, regnode(BACK));
                regoptail(ret, ret);
                reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
                ++num_complex_braces;
            }
            if (minval > 0 && maxval > 0)
                *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
            break;
    }
    if (re_multi_type(peekchr()) != NOT_MULTI)
    {
        /* Can't have a multi follow a multi. */
        if (peekchr() == Magic('*'))
            sprintf((char *)IObuff, _("E61: Nested %s*"),
                                            reg_magic >= MAGIC_ON ? "" : "\\");
        else
            sprintf((char *)IObuff, _("E62: Nested %s%c"),
                reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
        EMSG_RET_NULL(IObuff);
    }

    return ret;
}
1389
/*
 * regatom - the lowest level
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Don't do this when one_exactly is set.
 *
 * Gets one character from the lexer with getchr() and emits the node(s) for
 * the atom it starts: anchors, character classes, groups, back references,
 * "\z" and "\%" items, "[]" collections or literal text.
 *
 * "flagp" is set to WORST and then gets HASWIDTH, SIMPLE, HASNL, SPSTART
 * and/or HASLOOKBH added, depending on the atom.
 *
 * Returns the first emitted node, or NULL when an error message was given
 * or a nested reg() or regatom() call failed.
 */
    static char_u *
regatom(flagp)
    int            *flagp;
{
    char_u          *ret;
    int             flags;
    int             cpo_lit;        /* 'cpoptions' contains 'l' flag */
    int             c;
    /* classchars[i] is the character of the class with opcode
     * classcodes[i]; the two must be kept in the same order. */
    static char_u   *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
    static int      classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
                                    FNAME, SFNAME, PRINT, SPRINT,
                                    WHITE, NWHITE, DIGIT, NDIGIT,
                                    HEX, NHEX, OCTAL, NOCTAL,
                                    WORD, NWORD, HEAD, NHEAD,
                                    ALPHA, NALPHA, LOWER, NLOWER,
                                    UPPER, NUPPER
                                    };
    char_u          *p;
    int             extra = 0;      /* ADD_NL after "\_", added to opcode */

    *flagp = WORST;             /* Tentatively. */
    cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);

    c = getchr();
    switch (c)
    {
      case Magic('^'):
        ret = regnode(BOL);
        break;

      case Magic('$'):
        ret = regnode(EOL);
#if defined(FEAT_SYN_HL) || defined(PROTO)
        had_eol = TRUE;
#endif
        break;

      case Magic('<'):
        ret = regnode(BOW);
        break;

      case Magic('>'):
        ret = regnode(EOW);
        break;

      case Magic('_'):
        c = no_Magic(getchr());
        if (c == '^')           /* "\_^" is start-of-line */
        {
            ret = regnode(BOL);
            break;
        }
        if (c == '$')           /* "\_$" is end-of-line */
        {
            ret = regnode(EOL);
#if defined(FEAT_SYN_HL) || defined(PROTO)
            had_eol = TRUE;
#endif
            break;
        }

        extra = ADD_NL;
        *flagp |= HASNL;

        /* "\_[" is character range plus newline */
        if (c == '[')
            goto collection;

        /* "\_x" is character class plus newline */
        /*FALLTHROUGH*/

        /*
         * Character classes.
         */
      case Magic('.'):
      case Magic('i'):
      case Magic('I'):
      case Magic('k'):
      case Magic('K'):
      case Magic('f'):
      case Magic('F'):
      case Magic('p'):
      case Magic('P'):
      case Magic('s'):
      case Magic('S'):
      case Magic('d'):
      case Magic('D'):
      case Magic('x'):
      case Magic('X'):
      case Magic('o'):
      case Magic('O'):
      case Magic('w'):
      case Magic('W'):
      case Magic('h'):
      case Magic('H'):
      case Magic('a'):
      case Magic('A'):
      case Magic('l'):
      case Magic('L'):
      case Magic('u'):
      case Magic('U'):
        /* Also reached by falling through from "\_x", where "x" may be any
         * character; that is why the lookup can fail. */
        p = vim_strchr(classchars, no_Magic(c));
        if (p == NULL)
            EMSG_RET_NULL(_("E63: invalid use of \\_"));
        ret = regnode(classcodes[p - classchars] + extra);
        *flagp |= HASWIDTH | SIMPLE;
        break;

      case Magic('n'):
        if (reg_string)
        {
            /* In a string "\n" matches a newline character. */
            ret = regnode(EXACTLY);
            regc(NL);
            regc(NUL);
            *flagp |= HASWIDTH | SIMPLE;
        }
        else
        {
            /* In buffer text "\n" matches the end of a line. */
            ret = regnode(NEWL);
            *flagp |= HASWIDTH | HASNL;
        }
        break;

      case Magic('('):
        if (one_exactly)
            EMSG_ONE_RET_NULL;
        ret = reg(REG_PAREN, &flags);
        if (ret == NULL)
            return NULL;
        *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
        break;

      case NUL:
      case Magic('|'):
      case Magic('&'):
      case Magic(')'):
        EMSG_RET_NULL(_(e_internal));   /* Supposed to be caught earlier. */
        /* NOTREACHED */

      case Magic('='):
      case Magic('?'):
      case Magic('+'):
      case Magic('@'):
      case Magic('{'):
      case Magic('*'):
        /* A multi without an atom in front of it. */
        c = no_Magic(c);
        sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
                (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
                ? "" : "\\", c);
        EMSG_RET_NULL(IObuff);
        /* NOTREACHED */

      case Magic('~'):          /* previous substitute pattern */
        if (reg_prev_sub)
        {
            char_u          *lp;

            /* Emit the text of reg_prev_sub as literal text. */
            ret = regnode(EXACTLY);
            lp = reg_prev_sub;
            while (*lp != NUL)
                regc(*lp++);
            regc(NUL);
            if (*reg_prev_sub != NUL)
            {
                *flagp |= HASWIDTH;
                if ((lp - reg_prev_sub) == 1)
                    *flagp |= SIMPLE;
            }
        }
        else
            EMSG_RET_NULL(_(e_nopresub));
        break;

      case Magic('1'):
      case Magic('2'):
      case Magic('3'):
      case Magic('4'):
      case Magic('5'):
      case Magic('6'):
      case Magic('7'):
      case Magic('8'):
      case Magic('9'):
            {
                int                 refnum;

                refnum = c - Magic('0');
                /*
                 * Check if the back reference is legal. We must have seen the
                 * close brace.
                 * TODO: Should also check that we don't refer to something
                 * that is repeated (+*=): what instance of the repetition
                 * should we match?
                 */
                if (!had_endbrace[refnum])
                {
                    /* Trick: check if "@<=" or "@<!" follows, in which case
                     * the \1 can appear before the referenced match. */
                    for (p = regparse; *p != NUL; ++p)
                        if (p[0] == '@' && p[1] == '<'
                                              && (p[2] == '!' || p[2] == '='))
                            break;
                    if (*p == NUL)
                        EMSG_RET_NULL(_("E65: Illegal back reference"));
                }
                ret = regnode(BACKREF + refnum);
            }
            break;

#ifdef FEAT_SYN_HL
      case Magic('z'):
        {
            c = no_Magic(getchr());
            switch (c)
            {
                case '(': if (reg_do_extmatch != REX_SET)
                              EMSG_RET_NULL(_("E66: \\z( not allowed here"));
                          if (one_exactly)
                              EMSG_ONE_RET_NULL;
                          ret = reg(REG_ZPAREN, &flags);
                          if (ret == NULL)
                              return NULL;
                          *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
                          re_has_z = REX_SET;
                          break;

                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9': if (reg_do_extmatch != REX_USE)
                              EMSG_RET_NULL(_("E67: \\z1 et al. not allowed here"));
                          ret = regnode(ZREF + c - '0');
                          re_has_z = REX_USE;
                          break;

                /* "\zs" and "\ze" are emitted as the open and close node of
                 * sub-expression zero. */
                case 's': ret = regnode(MOPEN + 0);
                          break;

                case 'e': ret = regnode(MCLOSE + 0);
                          break;

                default:  EMSG_RET_NULL(_("E68: Invalid character after \\z"));
            }
        }
        break;
#endif

      case Magic('%'):
        {
            c = no_Magic(getchr());
            switch (c)
            {
                /* () without a back reference */
                case '(':
                    if (one_exactly)
                        EMSG_ONE_RET_NULL;
                    ret = reg(REG_NPAREN, &flags);
                    if (ret == NULL)
                        return NULL;
                    *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
                    break;

                /* Catch \%^ and \%$ regardless of where they appear in the
                 * pattern -- regardless of whether or not it makes sense. */
                case '^':
                    ret = regnode(RE_BOF);
                    break;

                case '$':
                    ret = regnode(RE_EOF);
                    break;

                case '#':
                    ret = regnode(CURSOR);
                    break;

                /* \%[abc]: Emit as a list of branches, all ending at the last
                 * branch which matches nothing. */
                case '[':
                          if (one_exactly)      /* doesn't nest */
                              EMSG_ONE_RET_NULL;
                          {
                              char_u    *lastbranch;
                              char_u    *lastnode = NULL;
                              char_u    *br;

                              ret = NULL;
                              while ((c = getchr()) != ']')
                              {
                                  if (c == NUL)
                                      EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["),
                                                      reg_magic == MAGIC_ALL);
                                  br = regnode(BRANCH);
                                  if (ret == NULL)
                                      ret = br;
                                  else
                                      regtail(lastnode, br);

                                  /* Put the character back and parse it as
                                   * an atom of its own; "one_exactly" stops
                                   * the literal-text case below from taking
                                   * more than one character. */
                                  ungetchr();
                                  one_exactly = TRUE;
                                  lastnode = regatom(flagp);
                                  one_exactly = FALSE;
                                  if (lastnode == NULL)
                                      return NULL;
                              }
                              if (ret == NULL)
                                  EMSG_M_RET_NULL(_("E70: Empty %s%%[]"),
                                                      reg_magic == MAGIC_ALL);
                              lastbranch = regnode(BRANCH);
                              br = regnode(NOTHING);
                              if (ret != JUST_CALC_SIZE)
                              {
                                  regtail(lastnode, br);
                                  regtail(lastbranch, br);
                                  /* connect all branches to the NOTHING
                                   * branch at the end */
                                  for (br = ret; br != lastnode; )
                                  {
                                      if (OP(br) == BRANCH)
                                      {
                                          regtail(br, lastbranch);
                                          br = OPERAND(br);
                                      }
                                      else
                                          br = regnext(br);
                                  }
                              }
                              /* The whole item may match nothing. */
                              *flagp &= ~HASWIDTH;
                              break;
                          }

                case 'd':   /* %d123 decimal */
                case 'o':   /* %o123 octal */
                case 'x':   /* %xab hex 2 */
                case 'u':   /* %uabcd hex 4 */
                case 'U':   /* %U1234abcd hex 8 */
                          {
                              int i;

                              switch (c)
                              {
                                  case 'd': i = getdecchrs(); break;
                                  case 'o': i = getoctchrs(); break;
                                  case 'x': i = gethexchrs(2); break;
                                  case 'u': i = gethexchrs(4); break;
                                  case 'U': i = gethexchrs(8); break;
                                  default:  i = -1; break;
                              }

                              if (i < 0)
                                  EMSG_M_RET_NULL(
                                          _("E678: Invalid character after %s%%[dxouU]"),
                                          reg_magic == MAGIC_ALL);
                              ret = regnode(EXACTLY);
                              /* A value of zero is stored as 0x0a;
                               * presumably because a NUL byte would end the
                               * operand -- confirm against the matching
                               * code. */
                              if (i == 0)
                                  regc(0x0a);
                              else
#ifdef FEAT_MBYTE
                                  regmbc(i);
#else
                                  regc(i);
#endif
                              regc(NUL);
                              *flagp |= HASWIDTH;
                              break;
                          }

                default:
                          /* "\%23l", "\%<23c", "\%>23v" and the like: an
                           * optional '<' or '>', a number and then 'l', 'c'
                           * or 'v'. */
                          if (VIM_ISDIGIT(c) || c == '<' || c == '>')
                          {
                              long_u    n = 0;
                              int       cmp;

                              cmp = c;
                              if (cmp == '<' || cmp == '>')
                                  c = getchr();
                              while (VIM_ISDIGIT(c))
                              {
                                  n = n * 10 + (c - '0');
                                  c = getchr();
                              }
                              if (c == 'l' || c == 'c' || c == 'v')
                              {
                                  if (c == 'l')
                                      ret = regnode(RE_LNUM);
                                  else if (c == 'c')
                                      ret = regnode(RE_COL);
                                  else
                                      ret = regnode(RE_VCOL);
                                  /* Five extra bytes: four for the number
                                   * and one for the comparator. */
                                  if (ret == JUST_CALC_SIZE)
                                      regsize += 5;
                                  else
                                  {
                                      /* put the number and the optional
                                       * comparator after the opcode */
                                      regcode = re_put_long(regcode, n);
                                      *regcode++ = cmp;
                                  }
                                  break;
                              }
                          }

                          EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"),
                                                      reg_magic == MAGIC_ALL);
            }
        }
        break;

      case Magic('['):
collection:
        {
            char_u      *lp;

            /*
             * If there is no matching ']', we assume the '[' is a normal
             * character.  This makes 'incsearch' and ":help [" work.
             */
            lp = skip_anyof(regparse);
            if (*lp == ']')     /* there is a matching ']' */
            {
                int     startc = -1;    /* > 0 when next '-' is a range */
                int     endc;

                /*
                 * In a character class, different parsing rules apply.
                 * Not even \ is special anymore, nothing is.
                 */
                if (*regparse == '^')       /* Complement of range. */
                {
                    ret = regnode(ANYBUT + extra);
                    regparse++;
                }
                else
                    ret = regnode(ANYOF + extra);

                /* At the start ']' and '-' mean the literal character. */
                if (*regparse == ']' || *regparse == '-')
                    regc(*regparse++);

                while (*regparse != NUL && *regparse != ']')
                {
                    if (*regparse == '-')
                    {
                        ++regparse;
                        /* The '-' is not used for a range at the end and
                         * after or before a '\n'. */
                        if (*regparse == ']' || *regparse == NUL
                                || startc == -1
                                || (regparse[0] == '\\' && regparse[1] == 'n'))
                        {
                            regc('-');
                            startc = '-';       /* [--x] is a range */
                        }
                        else
                        {
#ifdef FEAT_MBYTE
                            if (has_mbyte)
                                endc = mb_ptr2char_adv(&regparse);
                            else
#endif
                                endc = *regparse++;

                            /* Handle \o40, \x20 and \u20AC style sequences */
                            if (endc == '\\' && !cpo_lit)
                                endc = coll_get_char();

                            if (startc > endc)
                                EMSG_RET_NULL(_(e_invrange));
                            /* "startc" itself was already emitted, the
                             * loops below start with the next character. */
#ifdef FEAT_MBYTE
                            if (has_mbyte && ((*mb_char2len)(startc) > 1
                                                 || (*mb_char2len)(endc) > 1))
                            {
                                /* Limit to a range of 256 chars */
                                if (endc > startc + 256)
                                    EMSG_RET_NULL(_(e_invrange));
                                while (++startc <= endc)
                                    regmbc(startc);
                            }
                            else
#endif
                            {
#ifdef EBCDIC
                                int     alpha_only = FALSE;

                                /* for alphabetical range skip the gaps
                                 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'.  */
                                if (isalpha(startc) && isalpha(endc))
                                    alpha_only = TRUE;
#endif
                                while (++startc <= endc)
#ifdef EBCDIC
                                    if (!alpha_only || isalpha(startc))
#endif
                                        regc(startc);
                            }
                            startc = -1;
                        }
                    }
                    /*
                     * Only "\]", "\^", "\]" and "\\" are special in Vi.  Vim
                     * accepts "\t", "\e", etc., but only when the 'l' flag in
                     * 'cpoptions' is not included.
                     * NOTE(review): "\]" is listed twice above; one of them
                     * was probably meant to be "\-" -- confirm against the
                     * definition of REGEXP_INRANGE.
                     */
                    else if (*regparse == '\\'
                            && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
                                || (!cpo_lit
                                    && vim_strchr(REGEXP_ABBR,
                                                       regparse[1]) != NULL)))
                    {
                        regparse++;
                        if (*regparse == 'n')
                        {
                            /* '\n' in range: also match NL */
                            if (ret != JUST_CALC_SIZE)
                            {
                                if (*ret == ANYBUT)
                                    *ret = ANYBUT + ADD_NL;
                                else if (*ret == ANYOF)
                                    *ret = ANYOF + ADD_NL;
                                /* else: must have had a \n already */
                            }
                            *flagp |= HASNL;
                            regparse++;
                            startc = -1;
                        }
                        else if (*regparse == 'd'
                                    || *regparse == 'o'
                                    || *regparse == 'x'
                                    || *regparse == 'u'
                                    || *regparse == 'U')
                        {
                            /* "\d123", "\o40", "\x20", "\u20AC" and the
                             * like: a character given by its number. */
                            startc = coll_get_char();
                            if (startc == 0)
                                regc(0x0a);
                            else
#ifdef FEAT_MBYTE
                                regmbc(startc);
#else
                                regc(startc);
#endif
                        }
                        else
                        {
                            startc = backslash_trans(*regparse++);
                            regc(startc);
                        }
                    }
                    else if (*regparse == '[')
                    {
                        int c_class;
                        int cu;

                        c_class = skip_class_name(&regparse);
                        startc = -1;
                        /* Characters assumed to be 8 bits! */
                        switch (c_class)
                        {
                            case CLASS_NONE:
                                /* literal '[', allow [[-x] as a range */
                                startc = *regparse++;
                                regc(startc);
                                break;
                            case CLASS_ALNUM:
                                for (cu = 1; cu <= 255; cu++)
                                    if (isalnum(cu))
                                        regc(cu);
                                break;
                            case CLASS_ALPHA:
                                for (cu = 1; cu <= 255; cu++)
                                    if (isalpha(cu))
                                        regc(cu);
                                break;
                            case CLASS_BLANK:
                                regc(' ');
                                regc('\t');
                                break;
                            case CLASS_CNTRL:
                                for (cu = 1; cu <= 255; cu++)
                                    if (iscntrl(cu))
                                        regc(cu);
                                break;
                            case CLASS_DIGIT:
                                for (cu = 1; cu <= 255; cu++)
                                    if (VIM_ISDIGIT(cu))
                                        regc(cu);
                                break;
                            case CLASS_GRAPH:
                                for (cu = 1; cu <= 255; cu++)
                                    if (isgraph(cu))
                                        regc(cu);
                                break;
                            case CLASS_LOWER:
                                for (cu = 1; cu <= 255; cu++)
                                    if (islower(cu))
                                        regc(cu);
                                break;
                            case CLASS_PRINT:
                                for (cu = 1; cu <= 255; cu++)
                                    if (vim_isprintc(cu))
                                        regc(cu);
                                break;
                            case CLASS_PUNCT:
                                for (cu = 1; cu <= 255; cu++)
                                    if (ispunct(cu))
                                        regc(cu);
                                break;
                            case CLASS_SPACE:
                                for (cu = 9; cu <= 13; cu++)
                                    regc(cu);
                                regc(' ');
                                break;
                            case CLASS_UPPER:
                                for (cu = 1; cu <= 255; cu++)
                                    if (isupper(cu))
                                        regc(cu);
                                break;
                            case CLASS_XDIGIT:
                                for (cu = 1; cu <= 255; cu++)
                                    if (vim_isxdigit(cu))
                                        regc(cu);
                                break;
                            case CLASS_TAB:
                                regc('\t');
                                break;
                            case CLASS_RETURN:
                                regc('\r');
                                break;
                            case CLASS_BACKSPACE:
                                regc('\b');
                                break;
                            case CLASS_ESCAPE:
                                regc('\033');
                                break;
                        }
                    }
                    else
                    {
#ifdef FEAT_MBYTE
                        if (has_mbyte)
                        {
                            int len;

                            /* produce a multibyte character, including any
                             * following composing characters */
                            startc = mb_ptr2char(regparse);
                            len = (*mb_ptr2len_check)(regparse);
                            if (enc_utf8 && utf_char2len(startc) != len)
                                startc = -1;    /* composing chars */
                            while (--len >= 0)
                                regc(*regparse++);
                        }
                        else
#endif
                        {
                            startc = *regparse++;
                            regc(startc);
                        }
                    }
                }
                regc(NUL);
                prevchr_len = 1;        /* last char was the ']' */
                if (*regparse != ']')
                    EMSG_RET_NULL(_(e_toomsbra));       /* Cannot happen? */
                skipchr();          /* let's be friends with the lexer again */
                *flagp |= HASWIDTH | SIMPLE;
                break;
            }
        }
        /* FALLTHROUGH */

      default:
        {
            int         len;

#ifdef FEAT_MBYTE
            /* A multi-byte character is handled as a separate atom if it's
             * before a multi. */
            if (has_mbyte && (*mb_char2len)(c) > 1
                                     && re_multi_type(peekchr()) != NOT_MULTI)
            {
                ret = regnode(MULTIBYTECODE);
                regmbc(c);
                *flagp |= HASWIDTH | SIMPLE;
                break;
            }
#endif

            ret = regnode(EXACTLY);

            /*
             * Append characters as long as:
             * - there is no following multi, we then need the character in
             *   front of it as a single character operand
             * - not running into a Magic character
             * - "one_exactly" is not set
             * But always emit at least one character.  Might be a Multi,
             * e.g., a "[" without matching "]".
             */
            for (len = 0; c != NUL && (len == 0
                        || (re_multi_type(peekchr()) == NOT_MULTI
                            && !one_exactly
                            && !is_Magic(c))); ++len)
            {
                c = no_Magic(c);
#ifdef FEAT_MBYTE
                if (has_mbyte)
                {
                    regmbc(c);
                    if (enc_utf8)
                    {
                        int     off;
                        int     l;

                        /* Need to get composing character too, directly
                         * access regparse for that, because skipchr() skips
                         * over composing chars. */
                        ungetchr();
                        if (*regparse == '\\' && regparse[1] != NUL)
                            off = 1;
                        else
                            off = 0;
                        for (;;)
                        {
                            l = utf_ptr2len_check(regparse + off);
                            if (!UTF_COMPOSINGLIKE(regparse + off,
                                                          regparse + off + l))
                                break;
                            off += l;
                            regmbc(utf_ptr2char(regparse + off));
                        }
                        skipchr();
                    }
                }
                else
#endif
                    regc(c);
                c = getchr();
            }
            /* The character that ended the loop is not part of this atom. */
            ungetchr();

            regc(NUL);
            *flagp |= HASWIDTH;
            if (len == 1)
                *flagp |= SIMPLE;
        }
        break;
    }

    return ret;
}
2153
2154/*
2155 * emit a node
2156 * Return pointer to generated code.
2157 */
2158 static char_u *
2159regnode(op)
2160 int op;
2161{
2162 char_u *ret;
2163
2164 ret = regcode;
2165 if (ret == JUST_CALC_SIZE)
2166 regsize += 3;
2167 else
2168 {
2169 *regcode++ = op;
2170 *regcode++ = NUL; /* Null "next" pointer. */
2171 *regcode++ = NUL;
2172 }
2173 return ret;
2174}
2175
2176/*
2177 * Emit (if appropriate) a byte of code
2178 */
2179 static void
2180regc(b)
2181 int b;
2182{
2183 if (regcode == JUST_CALC_SIZE)
2184 regsize++;
2185 else
2186 *regcode++ = b;
2187}
2188
#ifdef FEAT_MBYTE
/*
 * regmbc - emit the bytes of the multi-byte character "c" at "regcode", or
 * just add its byte length to "regsize" when only the size is being
 * computed.
 */
    static void
regmbc(c)
    int         c;
{
    if (regcode != JUST_CALC_SIZE)
        regcode += (*mb_char2bytes)(c, regcode);
    else
        regsize += (*mb_char2len)(c);
}
#endif
2203
2204/*
2205 * reginsert - insert an operator in front of already-emitted operand
2206 *
2207 * Means relocating the operand.
2208 */
2209 static void
2210reginsert(op, opnd)
2211 int op;
2212 char_u *opnd;
2213{
2214 char_u *src;
2215 char_u *dst;
2216 char_u *place;
2217
2218 if (regcode == JUST_CALC_SIZE)
2219 {
2220 regsize += 3;
2221 return;
2222 }
2223 src = regcode;
2224 regcode += 3;
2225 dst = regcode;
2226 while (src > opnd)
2227 *--dst = *--src;
2228
2229 place = opnd; /* Op node, where operand used to be. */
2230 *place++ = op;
2231 *place++ = NUL;
2232 *place = NUL;
2233}
2234
2235/*
2236 * reginsert_limits - insert an operator in front of already-emitted operand.
2237 * The operator has the given limit values as operands. Also set next pointer.
2238 *
2239 * Means relocating the operand.
2240 */
2241 static void
2242reginsert_limits(op, minval, maxval, opnd)
2243 int op;
2244 long minval;
2245 long maxval;
2246 char_u *opnd;
2247{
2248 char_u *src;
2249 char_u *dst;
2250 char_u *place;
2251
2252 if (regcode == JUST_CALC_SIZE)
2253 {
2254 regsize += 11;
2255 return;
2256 }
2257 src = regcode;
2258 regcode += 11;
2259 dst = regcode;
2260 while (src > opnd)
2261 *--dst = *--src;
2262
2263 place = opnd; /* Op node, where operand used to be. */
2264 *place++ = op;
2265 *place++ = NUL;
2266 *place++ = NUL;
2267 place = re_put_long(place, (long_u)minval);
2268 place = re_put_long(place, (long_u)maxval);
2269 regtail(opnd, place);
2270}
2271
2272/*
2273 * Write a long as four bytes at "p" and return pointer to the next char.
2274 */
2275 static char_u *
2276re_put_long(p, val)
2277 char_u *p;
2278 long_u val;
2279{
2280 *p++ = (char_u) ((val >> 24) & 0377);
2281 *p++ = (char_u) ((val >> 16) & 0377);
2282 *p++ = (char_u) ((val >> 8) & 0377);
2283 *p++ = (char_u) (val & 0377);
2284 return p;
2285}
2286
2287/*
2288 * regtail - set the next-pointer at the end of a node chain
2289 */
2290 static void
2291regtail(p, val)
2292 char_u *p;
2293 char_u *val;
2294{
2295 char_u *scan;
2296 char_u *temp;
2297 int offset;
2298
2299 if (p == JUST_CALC_SIZE)
2300 return;
2301
2302 /* Find last node. */
2303 scan = p;
2304 for (;;)
2305 {
2306 temp = regnext(scan);
2307 if (temp == NULL)
2308 break;
2309 scan = temp;
2310 }
2311
2312 if (OP(scan) == BACK)
2313 offset = (int)(scan - val);
2314 else
2315 offset = (int)(val - scan);
2316 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
2317 *(scan + 2) = (char_u) (offset & 0377);
2318}
2319
2320/*
2321 * regoptail - regtail on item after a BRANCH; nop if none
2322 */
2323 static void
2324regoptail(p, val)
2325 char_u *p;
2326 char_u *val;
2327{
2328 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2329 if (p == NULL || p == JUST_CALC_SIZE
2330 || (OP(p) != BRANCH
2331 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2332 return;
2333 regtail(OPERAND(p), val);
2334}
2335
2336/*
 * getchr() - get the next character from the pattern.  We know about
 * magic and such, so therefore we need a lexical analyzer.
 *
 * The variables below hold the lexer state shared by initchr(), peekchr(),
 * skipchr(), getchr() and ungetchr().
 */
2340
2341/* static int curchr; */
2342static int prevprevchr; /* value prevchr had before the last skipchr() */
2343static int prevchr; /* value curchr had when skipchr() was last called */
2344static int nextchr; /* used for ungetchr() */
2345/*
 * Note: prevchr is sometimes -1 when we are not at the start,
 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
 * taken to be magic -- webb
 */
2350static int at_start; /* True when on the first character */
2351static int prev_at_start; /* True when on the second character */
2352
2353 static void
2354initchr(str)
2355 char_u *str;
2356{
2357 regparse = str;
2358 prevchr_len = 0;
2359 curchr = prevprevchr = prevchr = nextchr = -1;
2360 at_start = TRUE;
2361 prev_at_start = FALSE;
2362}
2363
/*
 * peekchr() - return the lexed value of the character at regparse without
 * moving regparse.
 *
 * The result is cached in curchr: the pattern is only examined when curchr is
 * -1.  Depending on reg_magic and on what came before, a character is
 * returned either as itself or wrapped in Magic().  A backslash is handled
 * together with the character that follows it.
 */
2364 static int
2365peekchr()
2366{
2367 if (curchr == -1)
2368 {
2369 switch (curchr = regparse[0])
2370 {
2371 case '.':
2372 case '[':
2373 case '~':
2374 /* magic when 'magic' is on */
2375 if (reg_magic >= MAGIC_ON)
2376 curchr = Magic(curchr);
2377 break;
2378 case '(':
2379 case ')':
2380 case '{':
2381 case '%':
2382 case '+':
2383 case '=':
2384 case '?':
2385 case '@':
2386 case '!':
2387 case '&':
2388 case '|':
2389 case '<':
2390 case '>':
2391 case '#': /* future ext. */
2392 case '"': /* future ext. */
2393 case '\'': /* future ext. */
2394 case ',': /* future ext. */
2395 case '-': /* future ext. */
2396 case ':': /* future ext. */
2397 case ';': /* future ext. */
2398 case '`': /* future ext. */
2399 case '/': /* Can't be used in / command */
2400 /* magic only after "\v" */
2401 if (reg_magic == MAGIC_ALL)
2402 curchr = Magic(curchr);
2403 break;
2404 case '*':
2405 /* * is not magic as the very first character, eg "?*ptr" and when
 * after '^', eg "/^*ptr" */
2407 if (reg_magic >= MAGIC_ON && !at_start
2408 && !(prev_at_start && prevchr == Magic('^')))
2409 curchr = Magic('*');
2410 break;
2411 case '^':
2412 /* '^' is only magic as the very first character and if it's after
 * "\(", "\|", "\&", "\n" or "\%(".  After "\v" it is always magic. */
2414 if (reg_magic >= MAGIC_OFF
2415 && (at_start
2416 || reg_magic == MAGIC_ALL
2417 || prevchr == Magic('(')
2418 || prevchr == Magic('|')
2419 || prevchr == Magic('&')
2420 || prevchr == Magic('n')
2421 || (no_Magic(prevchr) == '('
2422 && prevprevchr == Magic('%'))))
2423 {
2424 curchr = Magic('^');
 /* What follows a magic '^' is lexed as if at the pattern start. */
2425 at_start = TRUE;
2426 prev_at_start = FALSE;
2427 }
2428 break;
2429 case '$':
2430 /* '$' is only magic as the very last char and if it's in front of
 * either "\|", "\)", "\&", or "\n".  After "\v" it is always magic. */
2432 if (reg_magic >= MAGIC_OFF)
2433 {
2434 char_u *p = regparse + 1;
2435
2436 /* ignore \c \C \m \M and \Z after '$' */
2437 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
2438 || p[1] == 'm' || p[1] == 'M' || p[1] == 'Z'))
2439 p += 2;
2440 if (p[0] == NUL
2441 || (p[0] == '\\'
2442 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
2443 || p[1] == 'n'))
2444 || reg_magic == MAGIC_ALL)
2445 curchr = Magic('$');
2446 }
2447 break;
2448 case '\\':
2449 {
2450 int c = regparse[1];
2451
2452 if (c == NUL)
2453 curchr = '\\'; /* trailing '\' */
2454 else if (
2455#ifdef EBCDIC
2456 vim_strchr(META, c)
2457#else
2458 c <= '~' && META_flags[c]
2459#endif
2460 )
2461 {
2462 /*
 * META contains everything that may be magic sometimes,
 * except ^ and $ ("\^" and "\$" are only magic after
 * "\v").  We now fetch the next character and toggle its
 * magicness.  Therefore, \ is so meta-magic that it is
 * not in META.
 *
 * The fetch is done by calling peekchr() again with regparse
 * temporarily moved past the backslash and curchr reset to -1.
 */
2469 curchr = -1;
2470 prev_at_start = at_start;
2471 at_start = FALSE; /* be able to say "/\*ptr" */
2472 ++regparse;
2473 peekchr();
2474 --regparse;
2475 curchr = toggle_Magic(curchr);
2476 }
2477 else if (vim_strchr(REGEXP_ABBR, c))
2478 {
2479 /*
 * Handle abbreviations, like "\t" for TAB -- webb
 */
2482 curchr = backslash_trans(c);
2483 }
2484 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
2485 curchr = toggle_Magic(c);
2486 else
2487 {
2488 /*
 * Next character can never be (made) magic?
 * Then backslashing it won't do anything.
 */
2492#ifdef FEAT_MBYTE
2493 if (has_mbyte)
2494 curchr = (*mb_ptr2char)(regparse + 1);
2495 else
2496#endif
2497 curchr = c;
2498 }
2499 break;
2500 }
2501
2502#ifdef FEAT_MBYTE
2503 default:
 /* Any other character: decode the multi-byte character at regparse. */
2504 if (has_mbyte)
2505 curchr = (*mb_ptr2char)(regparse);
2506#endif
2507 }
2508 }
2509
2510 return curchr;
2511}
2512
2513/*
2514 * Eat one lexed character. Do this in a way that we can undo it.
2515 */
2516 static void
2517skipchr()
2518{
2519 /* peekchr() eats a backslash, do the same here */
2520 if (*regparse == '\\')
2521 prevchr_len = 1;
2522 else
2523 prevchr_len = 0;
2524 if (regparse[prevchr_len] != NUL)
2525 {
2526#ifdef FEAT_MBYTE
2527 if (has_mbyte)
2528 prevchr_len += (*mb_ptr2len_check)(regparse + prevchr_len);
2529 else
2530#endif
2531 ++prevchr_len;
2532 }
2533 regparse += prevchr_len;
2534 prev_at_start = at_start;
2535 at_start = FALSE;
2536 prevprevchr = prevchr;
2537 prevchr = curchr;
2538 curchr = nextchr; /* use previously unget char, or -1 */
2539 nextchr = -1;
2540}
2541
2542/*
2543 * Skip a character while keeping the value of prev_at_start for at_start.
2544 * prevchr and prevprevchr are also kept.
2545 */
2546 static void
2547skipchr_keepstart()
2548{
2549 int as = prev_at_start;
2550 int pr = prevchr;
2551 int prpr = prevprevchr;
2552
2553 skipchr();
2554 at_start = as;
2555 prevchr = pr;
2556 prevprevchr = prpr;
2557}
2558
/*
 * Return the next lexed character and consume it.
 */
    static int
getchr()
{
    int         lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
2567
2568/*
2569 * put character back. Works only once!
2570 */
2571 static void
2572ungetchr()
2573{
2574 nextchr = curchr;
2575 curchr = prevchr;
2576 prevchr = prevprevchr;
2577 at_start = prev_at_start;
2578 prev_at_start = FALSE;
2579
2580 /* Backup regparse, so that it's at the same position as before the
2581 * getchr(). */
2582 regparse -= prevchr_len;
2583}
2584
2585/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +00002586 * Get and return the value of the hex string at the current position.
2587 * Return -1 if there is no valid hex number.
2588 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002589 * blahblah\%x20asdf
2590 * before-^ ^-after
2591 * The parameter controls the maximum number of input characters. This will be
2592 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
2593 */
2594 static int
2595gethexchrs(maxinputlen)
2596 int maxinputlen;
2597{
2598 int nr = 0;
2599 int c;
2600 int i;
2601
2602 for (i = 0; i < maxinputlen; ++i)
2603 {
2604 c = regparse[0];
2605 if (!vim_isxdigit(c))
2606 break;
2607 nr <<= 4;
2608 nr |= hex2nr(c);
2609 ++regparse;
2610 }
2611
2612 if (i == 0)
2613 return -1;
2614 return nr;
2615}
2616
2617/*
2618 * get and return the value of the decimal string immediately after the
2619 * current position. Return -1 for invalid. Consumes all digits.
2620 */
2621 static int
2622getdecchrs()
2623{
2624 int nr = 0;
2625 int c;
2626 int i;
2627
2628 for (i = 0; ; ++i)
2629 {
2630 c = regparse[0];
2631 if (c < '0' || c > '9')
2632 break;
2633 nr *= 10;
2634 nr += c - '0';
2635 ++regparse;
2636 }
2637
2638 if (i == 0)
2639 return -1;
2640 return nr;
2641}
2642
2643/*
2644 * get and return the value of the octal string immediately after the current
2645 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
2646 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
2647 * treat 8 or 9 as recognised characters. Position is updated:
2648 * blahblah\%o210asdf
2649 * before-^ ^-after
2650 */
2651 static int
2652getoctchrs()
2653{
2654 int nr = 0;
2655 int c;
2656 int i;
2657
2658 for (i = 0; i < 3 && nr < 040; ++i)
2659 {
2660 c = regparse[0];
2661 if (c < '0' || c > '7')
2662 break;
2663 nr <<= 3;
2664 nr |= hex2nr(c);
2665 ++regparse;
2666 }
2667
2668 if (i == 0)
2669 return -1;
2670 return nr;
2671}
2672
2673/*
2674 * Get a number after a backslash that is inside [].
2675 * When nothing is recognized return a backslash.
2676 */
2677 static int
2678coll_get_char()
2679{
2680 int nr = -1;
2681
2682 switch (*regparse++)
2683 {
2684 case 'd': nr = getdecchrs(); break;
2685 case 'o': nr = getoctchrs(); break;
2686 case 'x': nr = gethexchrs(2); break;
2687 case 'u': nr = gethexchrs(4); break;
2688 case 'U': nr = gethexchrs(8); break;
2689 }
2690 if (nr < 0)
2691 {
2692 /* If getting the number fails be backwards compatible: the character
2693 * is a backslash. */
2694 --regparse;
2695 nr = '\\';
2696 }
2697 return nr;
2698}
2699
2700/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00002701 * read_limits - Read two integers to be taken as a minimum and maximum.
2702 * If the first character is '-', then the range is reversed.
2703 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2704 * missing, a very big number is the default.
2705 */
2706 static int
2707read_limits(minval, maxval)
2708 long *minval;
2709 long *maxval;
2710{
2711 int reverse = FALSE;
2712 char_u *first_char;
2713 long tmp;
2714
2715 if (*regparse == '-')
2716 {
2717 /* Starts with '-', so reverse the range later */
2718 regparse++;
2719 reverse = TRUE;
2720 }
2721 first_char = regparse;
2722 *minval = getdigits(&regparse);
2723 if (*regparse == ',') /* There is a comma */
2724 {
2725 if (vim_isdigit(*++regparse))
2726 *maxval = getdigits(&regparse);
2727 else
2728 *maxval = MAX_LIMIT;
2729 }
2730 else if (VIM_ISDIGIT(*first_char))
2731 *maxval = *minval; /* It was \{n} or \{-n} */
2732 else
2733 *maxval = MAX_LIMIT; /* It was \{} or \{-} */
2734 if (*regparse == '\\')
2735 regparse++; /* Allow either \{...} or \{...\} */
2736 if (*regparse != '}' || (*maxval == 0 && *minval == 0))
2737 {
2738 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
2739 reg_magic == MAGIC_ALL ? "" : "\\");
2740 EMSG_RET_FAIL(IObuff);
2741 }
2742
2743 /*
2744 * Reverse the range if there was a '-', or make sure it is in the right
2745 * order otherwise.
2746 */
2747 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
2748 {
2749 tmp = *minval;
2750 *minval = *maxval;
2751 *maxval = tmp;
2752 }
2753 skipchr(); /* let's be friends with the lexer again */
2754 return OK;
2755}
2756
2757/*
 * vim_regexec and friends
 */
2760
2761/*
 * Global work variables for vim_regexec().
 */
2764
2765/* The current match-position is remembered with these variables: */
2766static linenr_T reglnum; /* line number, relative to first line */
2767static char_u *regline; /* start of current line */
2768static char_u *reginput; /* current input, points into "regline" */
2769
2770static int need_clear_subexpr; /* subexpressions still need to be
 * cleared */
2772#ifdef FEAT_SYN_HL
2773static int need_clear_zsubexpr = FALSE; /* extmatch subexpressions
 * still need to be cleared */
2775#endif
2776
2777static int out_of_stack; /* TRUE when ran out of stack space */
2778
2779/*
 * Structure used to save the current input state, when it needs to be
 * restored after trying a match.  Used by reg_save() and reg_restore().
 */
2782typedef struct
2783{
2784 union
2785 {
2786 char_u *ptr; /* reginput pointer, for single-line regexp */
2787 lpos_T pos; /* reginput pos, for multi-line regexp */
2788 } rs_u;
2789} regsave_T;
2790
2791/* struct to save start/end pointer/position in for \(\) */
2792typedef struct
2793{
2794 union
2795 {
2796 char_u *ptr; /* used by save_se_one() and the non-REG_MULTI restore_se() */
2797 lpos_T pos; /* used by save_se_multi() and the REG_MULTI restore_se() */
2798 } se_u;
2799} save_se_T;
2800
 /* Forward declarations of the static functions used for matching. */
2801static char_u *reg_getline __ARGS((linenr_T lnum));
2802static long vim_regexec_both __ARGS((char_u *line, colnr_T col));
2803static long regtry __ARGS((regprog_T *prog, colnr_T col));
2804static void cleanup_subexpr __ARGS((void));
2805#ifdef FEAT_SYN_HL
2806static void cleanup_zsubexpr __ARGS((void));
2807#endif
2808static void reg_nextline __ARGS((void));
2809static void reg_save __ARGS((regsave_T *save));
2810static void reg_restore __ARGS((regsave_T *save));
2811static int reg_save_equal __ARGS((regsave_T *save));
2812static void save_se_multi __ARGS((save_se_T *savep, lpos_T *posp));
2813static void save_se_one __ARGS((save_se_T *savep, char_u **pp));
2814
2815/* Save the sub-expressions before attempting a match.
 * Uses "posp" for a multi-line match and "pp" otherwise. */
2817#define save_se(savep, posp, pp) \
2818 REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))
2819
2820/* After a failed match restore the sub-expressions. */
2821#define restore_se(savep, posp, pp) { \
2822 if (REG_MULTI) \
2823 *(posp) = (savep)->se_u.pos; \
2824 else \
2825 *(pp) = (savep)->se_u.ptr; }
2826
2827static int re_num_cmp __ARGS((long_u val, char_u *scan));
2828static int regmatch __ARGS((char_u *prog));
2829static int regrepeat __ARGS((char_u *p, long maxcount));
2830
2831#ifdef DEBUG
2832int regnarrate = 0; /* when non-zero regmatch() reports each node it visits */
2833#endif
2834
2835/*
 * Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
 * Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
 * contains '\c' or '\C' the value is overruled.
 */
2840static int ireg_ic;
2841
2842#ifdef FEAT_MBYTE
2843/*
 * Similar to ireg_ic, but only for 'combining' characters.  Set with \Z flag
 * in the regexp.  Defaults to false, always.
 */
2847static int ireg_icombine;
2848#endif
2849
2850/*
 * Sometimes need to save a copy of a line.  Since alloc()/free() is very
 * slow, we keep one allocated piece of memory and only re-allocate it when
 * it's too small.  It's freed in vim_regexec_both() when finished.
 */
2855static char_u *reg_tofree;
2856static unsigned reg_tofreelen;
2857
2858/*
 * These variables are set when executing a regexp to speed up the execution.
 * Which ones are set depends on whether a single-line or multi-line match is
 * done:
 *                      single-line             multi-line
 * reg_match            &regmatch_T             NULL
 * reg_mmatch           NULL                    &regmmatch_T
 * reg_startp           reg_match->startp       <invalid>
 * reg_endp             reg_match->endp         <invalid>
 * reg_startpos         <invalid>               reg_mmatch->startpos
 * reg_endpos           <invalid>               reg_mmatch->endpos
 * reg_win              NULL                    window in which to search
 * reg_buf              <invalid>               buffer in which to search
 * reg_firstlnum        <invalid>               first line in which to search
 * reg_maxline          0                       last line nr
 * reg_line_lbr         FALSE or TRUE           FALSE
 */
2875static regmatch_T *reg_match;
2876static regmmatch_T *reg_mmatch;
2877static char_u **reg_startp = NULL;
2878static char_u **reg_endp = NULL;
2879static lpos_T *reg_startpos = NULL;
2880static lpos_T *reg_endpos = NULL;
2881static win_T *reg_win;
2882static buf_T *reg_buf;
2883static linenr_T reg_firstlnum;
2884static linenr_T reg_maxline;
2885static int reg_line_lbr; /* "\n" in string is line break */
2886
2887/*
2888 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
2889 */
2890 static char_u *
2891reg_getline(lnum)
2892 linenr_T lnum;
2893{
2894 /* when looking behind for a match/no-match lnum is negative. But we
2895 * can't go before line 1 */
2896 if (reg_firstlnum + lnum < 1)
2897 return NULL;
2898 return ml_get_buf(reg_buf, reg_firstlnum + lnum, FALSE);
2899}
2900
 /* NOTE(review): presumably the input position saved for look-behind
  * matching; its users are not visible in this part of the file - confirm. */
2901static regsave_T behind_pos;
2902
2903#ifdef FEAT_SYN_HL
2904static char_u *reg_startzp[NSUBEXP]; /* Workspace to mark beginning */
2905static char_u *reg_endzp[NSUBEXP]; /* and end of \z(...\) matches */
2906static lpos_T reg_startzpos[NSUBEXP]; /* idem, beginning pos */
2907static lpos_T reg_endzpos[NSUBEXP]; /* idem, end pos */
2908#endif
2909
2910/* TRUE if using multi-line regexp.  vim_regexec_multi() sets reg_match to
 * NULL, vim_regexec() and vim_regexec_nl() set it to their match argument. */
2911#define REG_MULTI (reg_match == NULL)
2912
2913/*
2914 * Match a regexp against a string.
2915 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2916 * Uses curbuf for line count and 'iskeyword'.
2917 *
2918 * Return TRUE if there is a match, FALSE if not.
2919 */
2920 int
2921vim_regexec(rmp, line, col)
2922 regmatch_T *rmp;
2923 char_u *line; /* string to match against */
2924 colnr_T col; /* column to start looking for match */
2925{
2926 reg_match = rmp;
2927 reg_mmatch = NULL;
2928 reg_maxline = 0;
2929 reg_line_lbr = FALSE;
2930 reg_win = NULL;
2931 ireg_ic = rmp->rm_ic;
2932#ifdef FEAT_MBYTE
2933 ireg_icombine = FALSE;
2934#endif
2935 return (vim_regexec_both(line, col) != 0);
2936}
2937
#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
/*
 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
 *
 * Return TRUE if there is a match, FALSE if not.
 */
    int
vim_regexec_nl(rmp, line, col)
    regmatch_T  *rmp;
    char_u      *line;  /* string to match against */
    colnr_T     col;    /* column to start looking for match */
{
    long        nlines;

    /* Same setup as vim_regexec(), except for "reg_line_lbr". */
    reg_mmatch = NULL;
    reg_match = rmp;
    reg_win = NULL;
    reg_maxline = 0;
    reg_line_lbr = TRUE;
#ifdef FEAT_MBYTE
    ireg_icombine = FALSE;
#endif
    ireg_ic = rmp->rm_ic;

    nlines = vim_regexec_both(line, col);
    return (nlines != 0);
}
#endif
2960
2961/*
2962 * Match a regexp against multiple lines.
2963 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2964 * Uses curbuf for line count and 'iskeyword'.
2965 *
2966 * Return zero if there is no match. Return number of lines contained in the
2967 * match otherwise.
2968 */
2969 long
2970vim_regexec_multi(rmp, win, buf, lnum, col)
2971 regmmatch_T *rmp;
2972 win_T *win; /* window in which to search or NULL */
2973 buf_T *buf; /* buffer in which to search */
2974 linenr_T lnum; /* nr of line to start looking for match */
2975 colnr_T col; /* column to start looking for match */
2976{
2977 long r;
2978 buf_T *save_curbuf = curbuf;
2979
2980 reg_match = NULL;
2981 reg_mmatch = rmp;
2982 reg_buf = buf;
2983 reg_win = win;
2984 reg_firstlnum = lnum;
2985 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
2986 reg_line_lbr = FALSE;
2987 ireg_ic = rmp->rmm_ic;
2988#ifdef FEAT_MBYTE
2989 ireg_icombine = FALSE;
2990#endif
2991
2992 /* Need to switch to buffer "buf" to make vim_iswordc() work. */
2993 curbuf = buf;
2994 r = vim_regexec_both(NULL, col);
2995 curbuf = save_curbuf;
2996
2997 return r;
2998}
2999
3000/*
 * Match a regexp against a string ("line" points to the string) or multiple
 * lines ("line" is NULL, use reg_getline()).
 *
 * The caller must have set up reg_match or reg_mmatch and the other work
 * variables, see vim_regexec() and vim_regexec_multi().
 *
 * Returns zero when there is no match or when something went wrong,
 * otherwise the value returned by regtry().
 */
3004#ifdef HAVE_SETJMP_H
3005 static long
3006vim_regexec_both(line_arg, col_arg)
3007 char_u *line_arg;
3008 colnr_T col_arg; /* column to start looking for match */
3009#else
3010 static long
3011vim_regexec_both(line, col)
3012 char_u *line;
3013 colnr_T col; /* column to start looking for match */
3014#endif
3015{
3016 regprog_T *prog;
3017 char_u *s;
3018 long retval;
3019#ifdef HAVE_SETJMP_H
3020 char_u *line;
3021 colnr_T col;
3022#endif
3023
3024 reg_tofree = NULL;
3025
3026#ifdef HAVE_TRY_EXCEPT
3027 __try
3028 {
3029#endif
3030
3031#ifdef HAVE_SETJMP_H
3032 /*
 * Matching with a regexp may cause a very deep recursive call of
 * regmatch().  Vim will crash when running out of stack space.  Catch
 * this here if the system supports it.
 */
3037 mch_startjmp();
3038 if (SETJMP(lc_jump_env) != 0)
3039 {
3040 mch_didjmp();
3041# ifdef SIGHASARG
3042 if (lc_signal != SIGINT)
3043# endif
3044 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3045 retval = 0L;
3046 goto theend;
3047 }
3048
3049 /* Trick to avoid "might be clobbered by `longjmp'" warning from gcc. */
3050 line = line_arg;
3051 col = col_arg;
3052#endif
3053 retval = 0L;
3054
 /* Pick up the program and the places to store the match positions. */
3055 if (REG_MULTI)
3056 {
3057 prog = reg_mmatch->regprog;
3058 line = reg_getline((linenr_T)0);
3059 reg_startpos = reg_mmatch->startpos;
3060 reg_endpos = reg_mmatch->endpos;
3061 }
3062 else
3063 {
3064 prog = reg_match->regprog;
3065 reg_startp = reg_match->startp;
3066 reg_endp = reg_match->endp;
3067 }
3068
3069 /* Be paranoid... */
3070 if (prog == NULL || line == NULL)
3071 {
3072 EMSG(_(e_null));
3073 goto theend;
3074 }
3075
3076 /* Check validity of program. */
3077 if (prog_magic_wrong())
3078 goto theend;
3079
3080 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
3081 if (prog->regflags & RF_ICASE)
3082 ireg_ic = TRUE;
3083 else if (prog->regflags & RF_NOICASE)
3084 ireg_ic = FALSE;
3085
3086#ifdef FEAT_MBYTE
3087 /* If pattern contains "\Z" overrule value of ireg_icombine */
3088 if (prog->regflags & RF_ICOMBINE)
3089 ireg_icombine = TRUE;
3090#endif
3091
3092 /* If there is a "must appear" string, look for it.  When it is not
 * present from "col" onwards there cannot be a match. */
3093 if (prog->regmust != NULL)
3094 {
3095 int c;
3096
3097#ifdef FEAT_MBYTE
3098 if (has_mbyte)
3099 c = (*mb_ptr2char)(prog->regmust);
3100 else
3101#endif
3102 c = *prog->regmust;
3103 s = line + col;
3104 while ((s = cstrchr(s, c)) != NULL)
3105 {
3106 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
3107 break; /* Found it. */
3108#ifdef FEAT_MBYTE
3109 if (has_mbyte)
3110 s += (*mb_ptr2len_check)(s);
3111 else
3112#endif
3113 ++s;
3114 }
3115 if (s == NULL) /* Not present. */
3116 goto theend;
3117 }
3118
3119 regline = line;
3120 reglnum = 0;
3121 out_of_stack = FALSE;
3122
3123 /* Simplest case: Anchored match need be tried only once. */
3124 if (prog->reganch)
3125 {
3126 int c;
3127
3128#ifdef FEAT_MBYTE
3129 if (has_mbyte)
3130 c = (*mb_ptr2char)(regline + col);
3131 else
3132#endif
3133 c = regline[col];
 /* Only try when the character at "col" can be the first character of
  * a match: no known start character, an exact match, or a match when
  * ignoring case. */
3134 if (prog->regstart == NUL
3135 || prog->regstart == c
3136 || (ireg_ic && ((
3137#ifdef FEAT_MBYTE
3138 (enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
3139 || (c < 255 && prog->regstart < 255 &&
3140#endif
3141 TOLOWER_LOC(prog->regstart) == TOLOWER_LOC(c)))))
3142 retval = regtry(prog, col);
3143 else
3144 retval = 0;
3145 }
3146 else
3147 {
3148 /* Messy cases: unanchored match.  Try every position from "col" to
 * the end of the line, until interrupted or out of stack. */
3149 while (!got_int && !out_of_stack)
3150 {
3151 if (prog->regstart != NUL)
3152 {
3153 /* Skip until the char we know it must start with. */
3154 s = cstrchr(regline + col, prog->regstart);
3155 if (s == NULL)
3156 {
3157 retval = 0;
3158 break;
3159 }
3160 col = (int)(s - regline);
3161 }
3162
3163 retval = regtry(prog, col);
3164 if (retval > 0)
3165 break;
3166
3167 /* if not currently on the first line, get it again */
3168 if (reglnum != 0)
3169 {
3170 regline = reg_getline((linenr_T)0);
3171 reglnum = 0;
3172 }
3173 if (regline[col] == NUL)
3174 break;
 /* Advance to the next character and try again. */
3175#ifdef FEAT_MBYTE
3176 if (has_mbyte)
3177 col += (*mb_ptr2len_check)(regline + col);
3178 else
3179#endif
3180 ++col;
3181 }
3182 }
3183
3184 if (out_of_stack)
3185 EMSG(_("E363: pattern caused out-of-stack error"));
3186
3187#ifdef HAVE_TRY_EXCEPT
3188 }
3189 __except(EXCEPTION_EXECUTE_HANDLER)
3190 {
3191 if (GetExceptionCode() == EXCEPTION_STACK_OVERFLOW)
3192 {
3193 RESETSTKOFLW();
3194 EMSG(_("E363: pattern caused out-of-stack error"));
3195 }
3196 else
3197 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3198 retval = 0L;
3199 }
3200#endif
3201
3202theend:
3203 /* Common exit, with or without a match: free the saved copy of a line,
 * if there is one, and end the jump protection. */
3204 vim_free(reg_tofree);
3205#ifdef HAVE_SETJMP_H
3206 mch_endjmp();
3207#endif
3208 return retval;
3209}
3210
3211#ifdef FEAT_SYN_HL
3212static reg_extmatch_T *make_extmatch __ARGS((void));
3213
3214/*
3215 * Create a new extmatch and mark it as referenced once.
3216 */
3217 static reg_extmatch_T *
3218make_extmatch()
3219{
3220 reg_extmatch_T *em;
3221
3222 em = (reg_extmatch_T *)alloc_clear((unsigned)sizeof(reg_extmatch_T));
3223 if (em != NULL)
3224 em->refcnt = 1;
3225 return em;
3226}
3227
3228/*
3229 * Add a reference to an extmatch.
3230 */
3231 reg_extmatch_T *
3232ref_extmatch(em)
3233 reg_extmatch_T *em;
3234{
3235 if (em != NULL)
3236 em->refcnt++;
3237 return em;
3238}
3239
3240/*
3241 * Remove a reference to an extmatch. If there are no references left, free
3242 * the info.
3243 */
3244 void
3245unref_extmatch(em)
3246 reg_extmatch_T *em;
3247{
3248 int i;
3249
3250 if (em != NULL && --em->refcnt <= 0)
3251 {
3252 for (i = 0; i < NSUBEXP; ++i)
3253 vim_free(em->matches[i]);
3254 vim_free(em);
3255 }
3256}
3257#endif
3258
3259/*
3260 * regtry - try match of "prog" with at regline["col"].
3261 * Returns 0 for failure, number of lines contained in the match otherwise.
3262 */
3263 static long
3264regtry(prog, col)
3265 regprog_T *prog;
3266 colnr_T col;
3267{
3268 reginput = regline + col;
3269 need_clear_subexpr = TRUE;
3270#ifdef FEAT_SYN_HL
3271 /* Clear the external match subpointers if necessary. */
3272 if (prog->reghasz == REX_SET)
3273 need_clear_zsubexpr = TRUE;
3274#endif
3275
3276 if (regmatch(prog->program + 1))
3277 {
3278 cleanup_subexpr();
3279 if (REG_MULTI)
3280 {
3281 if (reg_startpos[0].lnum < 0)
3282 {
3283 reg_startpos[0].lnum = 0;
3284 reg_startpos[0].col = col;
3285 }
3286 if (reg_endpos[0].lnum < 0)
3287 {
3288 reg_endpos[0].lnum = reglnum;
3289 reg_endpos[0].col = (int)(reginput - regline);
3290 }
3291 else
3292 /* Use line number of "\ze". */
3293 reglnum = reg_endpos[0].lnum;
3294 }
3295 else
3296 {
3297 if (reg_startp[0] == NULL)
3298 reg_startp[0] = regline + col;
3299 if (reg_endp[0] == NULL)
3300 reg_endp[0] = reginput;
3301 }
3302#ifdef FEAT_SYN_HL
3303 /* Package any found \z(...\) matches for export. Default is none. */
3304 unref_extmatch(re_extmatch_out);
3305 re_extmatch_out = NULL;
3306
3307 if (prog->reghasz == REX_SET)
3308 {
3309 int i;
3310
3311 cleanup_zsubexpr();
3312 re_extmatch_out = make_extmatch();
3313 for (i = 0; i < NSUBEXP; i++)
3314 {
3315 if (REG_MULTI)
3316 {
3317 /* Only accept single line matches. */
3318 if (reg_startzpos[i].lnum >= 0
3319 && reg_endzpos[i].lnum == reg_startzpos[i].lnum)
3320 re_extmatch_out->matches[i] =
3321 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3322 + reg_startzpos[i].col,
3323 reg_endzpos[i].col - reg_startzpos[i].col);
3324 }
3325 else
3326 {
3327 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3328 re_extmatch_out->matches[i] =
3329 vim_strnsave(reg_startzp[i],
3330 (int)(reg_endzp[i] - reg_startzp[i]));
3331 }
3332 }
3333 }
3334#endif
3335 return 1 + reglnum;
3336 }
3337 return 0;
3338}
3339
#ifdef FEAT_MBYTE
/* multi-byte: a character may take several bytes, advance with a function */
# define ADVANCE_REGINPUT() advance_reginput()

static void advance_reginput __ARGS((void));
static int reg_prev_class __ARGS((void));

/*
 * Move "reginput" to the next character: by the length reported by
 * mb_ptr2len_check() when "has_mbyte" is set, by one byte otherwise.
 */
    static void
advance_reginput()
{
    if (!has_mbyte)
    {
        ++reginput;
        return;
    }
    reginput += (*mb_ptr2len_check)(reginput);
}

/*
 * Get class of previous character.
 * Returns -1 when "reginput" is at the start of the line.
 */
    static int
reg_prev_class()
{
    char_u      *prev;

    if (reginput <= regline)
        return -1;

    /* Back up one byte, then to the first byte of that character. */
    prev = reginput - 1;
    return mb_get_class(prev - (*mb_head_off)(regline, prev));
}

#else
/* No multi-byte: It's too simple to make a function for. */
# define ADVANCE_REGINPUT() ++reginput
#endif
3372
3373/*
 * The arguments from BRACE_LIMITS are stored here.  They are actually local
 * to regmatch(), but they are here to reduce the amount of stack space used
 * (it can be called recursively many times).
 *
 * NOTE(review): presumably the minimum and maximum repeat count of the
 * BRACE_LIMITS node being handled; where they are set is not visible in this
 * part of the file - confirm in regmatch().
 */
3378static long bl_minval;
3379static long bl_maxval;
3380
3381/*
3382 * regmatch - main matching routine
3383 *
3384 * Conceptually the strategy is simple: Check to see whether the current
3385 * node matches, call self recursively to see whether the rest matches,
3386 * and then act accordingly. In practice we make some effort to avoid
3387 * recursion, in particular by going through "ordinary" nodes (that don't
3388 * need to know whether the rest of the match failed) by a loop instead of
3389 * by recursion.
3390 *
3391 * Returns TRUE when there is a match. Leaves reginput and reglnum just after
3392 * the last matched character.
3393 * Returns FALSE when there is no match. Leaves reginput and reglnum in an
3394 * undefined state!
3395 */
3396 static int
3397regmatch(scan)
3398 char_u *scan; /* Current node. */
3399{
3400 char_u *next; /* Next node. */
3401 int op;
3402 int c;
3403
3404#ifdef HAVE_GETRLIMIT
3405 /* Check if we are running out of stack space. Could be caused by
3406 * recursively calling ourselves. */
3407 if (out_of_stack || mch_stackcheck((char *)&op) == FAIL)
3408 {
3409 out_of_stack = TRUE;
3410 return FALSE;
3411 }
3412#endif
3413
3414 /* Some patterns my cause a long time to match, even though they are not
3415 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */
3416 fast_breakcheck();
3417
3418#ifdef DEBUG
3419 if (scan != NULL && regnarrate)
3420 {
3421 mch_errmsg(regprop(scan));
3422 mch_errmsg("(\n");
3423 }
3424#endif
3425 while (scan != NULL)
3426 {
3427 if (got_int || out_of_stack)
3428 return FALSE;
3429#ifdef DEBUG
3430 if (regnarrate)
3431 {
3432 mch_errmsg(regprop(scan));
3433 mch_errmsg("...\n");
3434# ifdef FEAT_SYN_HL
3435 if (re_extmatch_in != NULL)
3436 {
3437 int i;
3438
3439 mch_errmsg(_("External submatches:\n"));
3440 for (i = 0; i < NSUBEXP; i++)
3441 {
3442 mch_errmsg(" \"");
3443 if (re_extmatch_in->matches[i] != NULL)
3444 mch_errmsg(re_extmatch_in->matches[i]);
3445 mch_errmsg("\"\n");
3446 }
3447 }
3448# endif
3449 }
3450#endif
3451 next = regnext(scan);
3452
3453 op = OP(scan);
3454 /* Check for character class with NL added. */
3455 if (WITH_NL(op) && *reginput == NUL && reglnum < reg_maxline)
3456 {
3457 reg_nextline();
3458 }
3459 else if (reg_line_lbr && WITH_NL(op) && *reginput == '\n')
3460 {
3461 ADVANCE_REGINPUT();
3462 }
3463 else
3464 {
3465 if (WITH_NL(op))
3466 op -= ADD_NL;
3467#ifdef FEAT_MBYTE
3468 if (has_mbyte)
3469 c = (*mb_ptr2char)(reginput);
3470 else
3471#endif
3472 c = *reginput;
3473 switch (op)
3474 {
3475 case BOL:
3476 if (reginput != regline)
3477 return FALSE;
3478 break;
3479
3480 case EOL:
3481 if (c != NUL)
3482 return FALSE;
3483 break;
3484
3485 case RE_BOF:
3486 /* Passing -1 to the getline() function provided for the search
3487 * should always return NULL if the current line is the first
3488 * line of the file. */
3489 if (reglnum != 0 || reginput != regline
3490 || (REG_MULTI && reg_getline((linenr_T)-1) != NULL))
3491 return FALSE;
3492 break;
3493
3494 case RE_EOF:
3495 if (reglnum != reg_maxline || c != NUL)
3496 return FALSE;
3497 break;
3498
3499 case CURSOR:
3500 /* Check if the buffer is in a window and compare the
3501 * reg_win->w_cursor position to the match position. */
3502 if (reg_win == NULL
3503 || (reglnum + reg_firstlnum != reg_win->w_cursor.lnum)
3504 || ((colnr_T)(reginput - regline) != reg_win->w_cursor.col))
3505 return FALSE;
3506 break;
3507
3508 case RE_LNUM:
3509 if (!REG_MULTI || !re_num_cmp((long_u)(reglnum + reg_firstlnum),
3510 scan))
3511 return FALSE;
3512 break;
3513
3514 case RE_COL:
3515 if (!re_num_cmp((long_u)(reginput - regline) + 1, scan))
3516 return FALSE;
3517 break;
3518
3519 case RE_VCOL:
3520 if (!re_num_cmp((long_u)win_linetabsize(
3521 reg_win == NULL ? curwin : reg_win,
3522 regline, (colnr_T)(reginput - regline)) + 1, scan))
3523 return FALSE;
3524 break;
3525
3526 case BOW: /* \<word; reginput points to w */
3527 if (c == NUL) /* Can't match at end of line */
3528 return FALSE;
3529#ifdef FEAT_MBYTE
3530 if (has_mbyte)
3531 {
3532 int this_class;
3533
3534 /* Get class of current and previous char (if it exists). */
3535 this_class = mb_get_class(reginput);
3536 if (this_class <= 1)
3537 return FALSE; /* not on a word at all */
3538 if (reg_prev_class() == this_class)
3539 return FALSE; /* previous char is in same word */
3540 }
3541#endif
3542 else
3543 {
3544 if (!vim_iswordc(c)
3545 || (reginput > regline && vim_iswordc(reginput[-1])))
3546 return FALSE;
3547 }
3548 break;
3549
3550 case EOW: /* word\>; reginput points after d */
3551 if (reginput == regline) /* Can't match at start of line */
3552 return FALSE;
3553#ifdef FEAT_MBYTE
3554 if (has_mbyte)
3555 {
3556 int this_class, prev_class;
3557
3558 /* Get class of current and previous char (if it exists). */
3559 this_class = mb_get_class(reginput);
3560 prev_class = reg_prev_class();
3561 if (this_class == prev_class)
3562 return FALSE;
3563 if (prev_class == 0 || prev_class == 1)
3564 return FALSE;
3565 }
3566 else
3567#endif
3568 {
3569 if (!vim_iswordc(reginput[-1]))
3570 return FALSE;
3571 if (reginput[0] != NUL && vim_iswordc(c))
3572 return FALSE;
3573 }
3574 break; /* Matched with EOW */
3575
3576 case ANY:
3577 if (c == NUL)
3578 return FALSE;
3579 ADVANCE_REGINPUT();
3580 break;
3581
3582 case IDENT:
3583 if (!vim_isIDc(c))
3584 return FALSE;
3585 ADVANCE_REGINPUT();
3586 break;
3587
3588 case SIDENT:
3589 if (VIM_ISDIGIT(*reginput) || !vim_isIDc(c))
3590 return FALSE;
3591 ADVANCE_REGINPUT();
3592 break;
3593
3594 case KWORD:
3595 if (!vim_iswordp(reginput))
3596 return FALSE;
3597 ADVANCE_REGINPUT();
3598 break;
3599
3600 case SKWORD:
3601 if (VIM_ISDIGIT(*reginput) || !vim_iswordp(reginput))
3602 return FALSE;
3603 ADVANCE_REGINPUT();
3604 break;
3605
3606 case FNAME:
3607 if (!vim_isfilec(c))
3608 return FALSE;
3609 ADVANCE_REGINPUT();
3610 break;
3611
3612 case SFNAME:
3613 if (VIM_ISDIGIT(*reginput) || !vim_isfilec(c))
3614 return FALSE;
3615 ADVANCE_REGINPUT();
3616 break;
3617
3618 case PRINT:
3619 if (ptr2cells(reginput) != 1)
3620 return FALSE;
3621 ADVANCE_REGINPUT();
3622 break;
3623
3624 case SPRINT:
3625 if (VIM_ISDIGIT(*reginput) || ptr2cells(reginput) != 1)
3626 return FALSE;
3627 ADVANCE_REGINPUT();
3628 break;
3629
3630 case WHITE:
3631 if (!vim_iswhite(c))
3632 return FALSE;
3633 ADVANCE_REGINPUT();
3634 break;
3635
3636 case NWHITE:
3637 if (c == NUL || vim_iswhite(c))
3638 return FALSE;
3639 ADVANCE_REGINPUT();
3640 break;
3641
3642 case DIGIT:
3643 if (!ri_digit(c))
3644 return FALSE;
3645 ADVANCE_REGINPUT();
3646 break;
3647
3648 case NDIGIT:
3649 if (c == NUL || ri_digit(c))
3650 return FALSE;
3651 ADVANCE_REGINPUT();
3652 break;
3653
3654 case HEX:
3655 if (!ri_hex(c))
3656 return FALSE;
3657 ADVANCE_REGINPUT();
3658 break;
3659
3660 case NHEX:
3661 if (c == NUL || ri_hex(c))
3662 return FALSE;
3663 ADVANCE_REGINPUT();
3664 break;
3665
3666 case OCTAL:
3667 if (!ri_octal(c))
3668 return FALSE;
3669 ADVANCE_REGINPUT();
3670 break;
3671
3672 case NOCTAL:
3673 if (c == NUL || ri_octal(c))
3674 return FALSE;
3675 ADVANCE_REGINPUT();
3676 break;
3677
3678 case WORD:
3679 if (!ri_word(c))
3680 return FALSE;
3681 ADVANCE_REGINPUT();
3682 break;
3683
3684 case NWORD:
3685 if (c == NUL || ri_word(c))
3686 return FALSE;
3687 ADVANCE_REGINPUT();
3688 break;
3689
3690 case HEAD:
3691 if (!ri_head(c))
3692 return FALSE;
3693 ADVANCE_REGINPUT();
3694 break;
3695
3696 case NHEAD:
3697 if (c == NUL || ri_head(c))
3698 return FALSE;
3699 ADVANCE_REGINPUT();
3700 break;
3701
3702 case ALPHA:
3703 if (!ri_alpha(c))
3704 return FALSE;
3705 ADVANCE_REGINPUT();
3706 break;
3707
3708 case NALPHA:
3709 if (c == NUL || ri_alpha(c))
3710 return FALSE;
3711 ADVANCE_REGINPUT();
3712 break;
3713
3714 case LOWER:
3715 if (!ri_lower(c))
3716 return FALSE;
3717 ADVANCE_REGINPUT();
3718 break;
3719
3720 case NLOWER:
3721 if (c == NUL || ri_lower(c))
3722 return FALSE;
3723 ADVANCE_REGINPUT();
3724 break;
3725
3726 case UPPER:
3727 if (!ri_upper(c))
3728 return FALSE;
3729 ADVANCE_REGINPUT();
3730 break;
3731
3732 case NUPPER:
3733 if (c == NUL || ri_upper(c))
3734 return FALSE;
3735 ADVANCE_REGINPUT();
3736 break;
3737
3738 case EXACTLY:
3739 {
3740 int len;
3741 char_u *opnd;
3742
3743 opnd = OPERAND(scan);
3744 /* Inline the first byte, for speed. */
3745 if (*opnd != *reginput
3746 && (!ireg_ic || (
3747#ifdef FEAT_MBYTE
3748 !enc_utf8 &&
3749#endif
3750 TOLOWER_LOC(*opnd) != TOLOWER_LOC(*reginput))))
3751 return FALSE;
3752 if (*opnd == NUL)
3753 {
3754 /* match empty string always works; happens when "~" is
3755 * empty. */
3756 }
3757 else if (opnd[1] == NUL
3758#ifdef FEAT_MBYTE
3759 && !(enc_utf8 && ireg_ic)
3760#endif
3761 )
3762 ++reginput; /* matched a single char */
3763 else
3764 {
3765 len = (int)STRLEN(opnd);
3766 /* Need to match first byte again for multi-byte. */
3767 if (cstrncmp(opnd, reginput, &len) != 0)
3768 return FALSE;
3769#ifdef FEAT_MBYTE
3770 /* Check for following composing character. */
3771 if (enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len))
3772 {
3773 /* raaron: This code makes a composing character get
3774 * ignored, which is the correct behavior (sometimes)
3775 * for voweled Hebrew texts. */
3776 if (!ireg_icombine)
3777 return FALSE;
3778 }
3779 else
3780#endif
3781 reginput += len;
3782 }
3783 }
3784 break;
3785
3786 case ANYOF:
3787 case ANYBUT:
3788 if (c == NUL)
3789 return FALSE;
3790 if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3791 return FALSE;
3792 ADVANCE_REGINPUT();
3793 break;
3794
3795#ifdef FEAT_MBYTE
3796 case MULTIBYTECODE:
3797 if (has_mbyte)
3798 {
3799 int i, len;
3800 char_u *opnd;
3801
3802 opnd = OPERAND(scan);
3803 /* Safety check (just in case 'encoding' was changed since
3804 * compiling the program). */
3805 if ((len = (*mb_ptr2len_check)(opnd)) < 2)
3806 return FALSE;
3807 for (i = 0; i < len; ++i)
3808 if (opnd[i] != reginput[i])
3809 return FALSE;
3810 reginput += len;
3811 }
3812 else
3813 return FALSE;
3814 break;
3815#endif
3816
3817 case NOTHING:
3818 break;
3819
3820 case BACK:
3821 break;
3822
3823 case MOPEN + 0: /* Match start: \zs */
3824 case MOPEN + 1: /* \( */
3825 case MOPEN + 2:
3826 case MOPEN + 3:
3827 case MOPEN + 4:
3828 case MOPEN + 5:
3829 case MOPEN + 6:
3830 case MOPEN + 7:
3831 case MOPEN + 8:
3832 case MOPEN + 9:
3833 {
3834 int no;
3835 save_se_T save;
3836
3837 no = op - MOPEN;
3838 cleanup_subexpr();
3839 save_se(&save, &reg_startpos[no], &reg_startp[no]);
3840
3841 if (regmatch(next))
3842 return TRUE;
3843
3844 restore_se(&save, &reg_startpos[no], &reg_startp[no]);
3845 return FALSE;
3846 }
3847 /* break; Not Reached */
3848
3849 case NOPEN: /* \%( */
3850 case NCLOSE: /* \) after \%( */
3851 if (regmatch(next))
3852 return TRUE;
3853 return FALSE;
3854 /* break; Not Reached */
3855
3856#ifdef FEAT_SYN_HL
3857 case ZOPEN + 1:
3858 case ZOPEN + 2:
3859 case ZOPEN + 3:
3860 case ZOPEN + 4:
3861 case ZOPEN + 5:
3862 case ZOPEN + 6:
3863 case ZOPEN + 7:
3864 case ZOPEN + 8:
3865 case ZOPEN + 9:
3866 {
3867 int no;
3868 save_se_T save;
3869
3870 no = op - ZOPEN;
3871 cleanup_zsubexpr();
3872 save_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3873
3874 if (regmatch(next))
3875 return TRUE;
3876
3877 restore_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3878 return FALSE;
3879 }
3880 /* break; Not Reached */
3881#endif
3882
3883 case MCLOSE + 0: /* Match end: \ze */
3884 case MCLOSE + 1: /* \) */
3885 case MCLOSE + 2:
3886 case MCLOSE + 3:
3887 case MCLOSE + 4:
3888 case MCLOSE + 5:
3889 case MCLOSE + 6:
3890 case MCLOSE + 7:
3891 case MCLOSE + 8:
3892 case MCLOSE + 9:
3893 {
3894 int no;
3895 save_se_T save;
3896
3897 no = op - MCLOSE;
3898 cleanup_subexpr();
3899 save_se(&save, &reg_endpos[no], &reg_endp[no]);
3900
3901 if (regmatch(next))
3902 return TRUE;
3903
3904 restore_se(&save, &reg_endpos[no], &reg_endp[no]);
3905 return FALSE;
3906 }
3907 /* break; Not Reached */
3908
3909#ifdef FEAT_SYN_HL
3910 case ZCLOSE + 1: /* \) after \z( */
3911 case ZCLOSE + 2:
3912 case ZCLOSE + 3:
3913 case ZCLOSE + 4:
3914 case ZCLOSE + 5:
3915 case ZCLOSE + 6:
3916 case ZCLOSE + 7:
3917 case ZCLOSE + 8:
3918 case ZCLOSE + 9:
3919 {
3920 int no;
3921 save_se_T save;
3922
3923 no = op - ZCLOSE;
3924 cleanup_zsubexpr();
3925 save_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3926
3927 if (regmatch(next))
3928 return TRUE;
3929
3930 restore_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3931 return FALSE;
3932 }
3933 /* break; Not Reached */
3934#endif
3935
3936 case BACKREF + 1:
3937 case BACKREF + 2:
3938 case BACKREF + 3:
3939 case BACKREF + 4:
3940 case BACKREF + 5:
3941 case BACKREF + 6:
3942 case BACKREF + 7:
3943 case BACKREF + 8:
3944 case BACKREF + 9:
3945 {
3946 int no;
3947 int len;
3948 linenr_T clnum;
3949 colnr_T ccol;
3950 char_u *p;
3951
3952 no = op - BACKREF;
3953 cleanup_subexpr();
3954 if (!REG_MULTI) /* Single-line regexp */
3955 {
3956 if (reg_endp[no] == NULL)
3957 {
3958 /* Backref was not set: Match an empty string. */
3959 len = 0;
3960 }
3961 else
3962 {
3963 /* Compare current input with back-ref in the same
3964 * line. */
3965 len = (int)(reg_endp[no] - reg_startp[no]);
3966 if (cstrncmp(reg_startp[no], reginput, &len) != 0)
3967 return FALSE;
3968 }
3969 }
3970 else /* Multi-line regexp */
3971 {
3972 if (reg_endpos[no].lnum < 0)
3973 {
3974 /* Backref was not set: Match an empty string. */
3975 len = 0;
3976 }
3977 else
3978 {
3979 if (reg_startpos[no].lnum == reglnum
3980 && reg_endpos[no].lnum == reglnum)
3981 {
3982 /* Compare back-ref within the current line. */
3983 len = reg_endpos[no].col - reg_startpos[no].col;
3984 if (cstrncmp(regline + reg_startpos[no].col,
3985 reginput, &len) != 0)
3986 return FALSE;
3987 }
3988 else
3989 {
3990 /* Messy situation: Need to compare between two
3991 * lines. */
3992 ccol = reg_startpos[no].col;
3993 clnum = reg_startpos[no].lnum;
3994 for (;;)
3995 {
3996 /* Since getting one line may invalidate
3997 * the other, need to make copy. Slow! */
3998 if (regline != reg_tofree)
3999 {
4000 len = (int)STRLEN(regline);
4001 if (reg_tofree == NULL
4002 || len >= (int)reg_tofreelen)
4003 {
4004 len += 50; /* get some extra */
4005 vim_free(reg_tofree);
4006 reg_tofree = alloc(len);
4007 if (reg_tofree == NULL)
4008 return FALSE; /* out of memory! */
4009 reg_tofreelen = len;
4010 }
4011 STRCPY(reg_tofree, regline);
4012 reginput = reg_tofree
4013 + (reginput - regline);
4014 regline = reg_tofree;
4015 }
4016
4017 /* Get the line to compare with. */
4018 p = reg_getline(clnum);
4019 if (clnum == reg_endpos[no].lnum)
4020 len = reg_endpos[no].col - ccol;
4021 else
4022 len = (int)STRLEN(p + ccol);
4023
4024 if (cstrncmp(p + ccol, reginput, &len) != 0)
4025 return FALSE; /* doesn't match */
4026 if (clnum == reg_endpos[no].lnum)
4027 break; /* match and at end! */
4028 if (reglnum == reg_maxline)
4029 return FALSE; /* text too short */
4030
4031 /* Advance to next line. */
4032 reg_nextline();
4033 ++clnum;
4034 ccol = 0;
4035 if (got_int || out_of_stack)
4036 return FALSE;
4037 }
4038
4039 /* found a match! Note that regline may now point
4040 * to a copy of the line, that should not matter. */
4041 }
4042 }
4043 }
4044
4045 /* Matched the backref, skip over it. */
4046 reginput += len;
4047 }
4048 break;
4049
4050#ifdef FEAT_SYN_HL
4051 case ZREF + 1:
4052 case ZREF + 2:
4053 case ZREF + 3:
4054 case ZREF + 4:
4055 case ZREF + 5:
4056 case ZREF + 6:
4057 case ZREF + 7:
4058 case ZREF + 8:
4059 case ZREF + 9:
4060 {
4061 int no;
4062 int len;
4063
4064 cleanup_zsubexpr();
4065 no = op - ZREF;
4066 if (re_extmatch_in != NULL
4067 && re_extmatch_in->matches[no] != NULL)
4068 {
4069 len = (int)STRLEN(re_extmatch_in->matches[no]);
4070 if (cstrncmp(re_extmatch_in->matches[no],
4071 reginput, &len) != 0)
4072 return FALSE;
4073 reginput += len;
4074 }
4075 else
4076 {
4077 /* Backref was not set: Match an empty string. */
4078 }
4079 }
4080 break;
4081#endif
4082
4083 case BRANCH:
4084 {
4085 if (OP(next) != BRANCH) /* No choice. */
4086 next = OPERAND(scan); /* Avoid recursion. */
4087 else
4088 {
4089 regsave_T save;
4090
4091 do
4092 {
4093 reg_save(&save);
4094 if (regmatch(OPERAND(scan)))
4095 return TRUE;
4096 reg_restore(&save);
4097 scan = regnext(scan);
4098 } while (scan != NULL && OP(scan) == BRANCH);
4099 return FALSE;
4100 /* NOTREACHED */
4101 }
4102 }
4103 break;
4104
4105 case BRACE_LIMITS:
4106 {
4107 int no;
4108
4109 if (OP(next) == BRACE_SIMPLE)
4110 {
4111 bl_minval = OPERAND_MIN(scan);
4112 bl_maxval = OPERAND_MAX(scan);
4113 }
4114 else if (OP(next) >= BRACE_COMPLEX
4115 && OP(next) < BRACE_COMPLEX + 10)
4116 {
4117 no = OP(next) - BRACE_COMPLEX;
4118 brace_min[no] = OPERAND_MIN(scan);
4119 brace_max[no] = OPERAND_MAX(scan);
4120 brace_count[no] = 0;
4121 }
4122 else
4123 {
4124 EMSG(_(e_internal)); /* Shouldn't happen */
4125 return FALSE;
4126 }
4127 }
4128 break;
4129
4130 case BRACE_COMPLEX + 0:
4131 case BRACE_COMPLEX + 1:
4132 case BRACE_COMPLEX + 2:
4133 case BRACE_COMPLEX + 3:
4134 case BRACE_COMPLEX + 4:
4135 case BRACE_COMPLEX + 5:
4136 case BRACE_COMPLEX + 6:
4137 case BRACE_COMPLEX + 7:
4138 case BRACE_COMPLEX + 8:
4139 case BRACE_COMPLEX + 9:
4140 {
4141 int no;
4142 regsave_T save;
4143
4144 no = op - BRACE_COMPLEX;
4145 ++brace_count[no];
4146
4147 /* If not matched enough times yet, try one more */
4148 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4149 ? brace_min[no] : brace_max[no]))
4150 {
4151 reg_save(&save);
4152 if (regmatch(OPERAND(scan)))
4153 return TRUE;
4154 reg_restore(&save);
4155 --brace_count[no]; /* failed, decrement match count */
4156 return FALSE;
4157 }
4158
4159 /* If matched enough times, may try matching some more */
4160 if (brace_min[no] <= brace_max[no])
4161 {
4162 /* Range is the normal way around, use longest match */
4163 if (brace_count[no] <= brace_max[no])
4164 {
4165 reg_save(&save);
4166 if (regmatch(OPERAND(scan)))
4167 return TRUE; /* matched some more times */
4168 reg_restore(&save);
4169 --brace_count[no]; /* matched just enough times */
4170 /* continue with the items after \{} */
4171 }
4172 }
4173 else
4174 {
4175 /* Range is backwards, use shortest match first */
4176 if (brace_count[no] <= brace_min[no])
4177 {
4178 reg_save(&save);
4179 if (regmatch(next))
4180 return TRUE;
4181 reg_restore(&save);
4182 next = OPERAND(scan);
4183 /* must try to match one more item */
4184 }
4185 }
4186 }
4187 break;
4188
4189 case BRACE_SIMPLE:
4190 case STAR:
4191 case PLUS:
4192 {
4193 int nextb; /* next byte */
4194 int nextb_ic; /* next byte reverse case */
4195 long count;
4196 regsave_T save;
4197 long minval;
4198 long maxval;
4199
4200 /*
4201 * Lookahead to avoid useless match attempts when we know
4202 * what character comes next.
4203 */
4204 if (OP(next) == EXACTLY)
4205 {
4206 nextb = *OPERAND(next);
4207 if (ireg_ic)
4208 {
4209 if (isupper(nextb))
4210 nextb_ic = TOLOWER_LOC(nextb);
4211 else
4212 nextb_ic = TOUPPER_LOC(nextb);
4213 }
4214 else
4215 nextb_ic = nextb;
4216 }
4217 else
4218 {
4219 nextb = NUL;
4220 nextb_ic = NUL;
4221 }
4222 if (op != BRACE_SIMPLE)
4223 {
4224 minval = (op == STAR) ? 0 : 1;
4225 maxval = MAX_LIMIT;
4226 }
4227 else
4228 {
4229 minval = bl_minval;
4230 maxval = bl_maxval;
4231 }
4232
4233 /*
4234 * When maxval > minval, try matching as much as possible, up
4235 * to maxval. When maxval < minval, try matching at least the
4236 * minimal number (since the range is backwards, that's also
4237 * maxval!).
4238 */
4239 count = regrepeat(OPERAND(scan), maxval);
4240 if (got_int)
4241 return FALSE;
4242 if (minval <= maxval)
4243 {
4244 /* Range is the normal way around, use longest match */
4245 while (count >= minval)
4246 {
4247 /* If it could match, try it. */
4248 if (nextb == NUL || *reginput == nextb
4249 || *reginput == nextb_ic)
4250 {
4251 reg_save(&save);
4252 if (regmatch(next))
4253 return TRUE;
4254 reg_restore(&save);
4255 }
4256 /* Couldn't or didn't match -- back up one char. */
4257 if (--count < minval)
4258 break;
4259 if (reginput == regline)
4260 {
4261 /* backup to last char of previous line */
4262 --reglnum;
4263 regline = reg_getline(reglnum);
4264 /* Just in case regrepeat() didn't count right. */
4265 if (regline == NULL)
4266 return FALSE;
4267 reginput = regline + STRLEN(regline);
4268 fast_breakcheck();
4269 if (got_int || out_of_stack)
4270 return FALSE;
4271 }
4272 else
4273 {
4274 --reginput;
4275#ifdef FEAT_MBYTE
4276 if (has_mbyte)
4277 reginput -= (*mb_head_off)(regline, reginput);
4278#endif
4279 }
4280 }
4281 }
4282 else
4283 {
4284 /* Range is backwards, use shortest match first.
4285 * Careful: maxval and minval are exchanged! */
4286 if (count < maxval)
4287 return FALSE;
4288 for (;;)
4289 {
4290 /* If it could work, try it. */
4291 if (nextb == NUL || *reginput == nextb
4292 || *reginput == nextb_ic)
4293 {
4294 reg_save(&save);
4295 if (regmatch(next))
4296 return TRUE;
4297 reg_restore(&save);
4298 }
4299 /* Couldn't or didn't match: try advancing one char. */
4300 if (count == minval
4301 || regrepeat(OPERAND(scan), 1L) == 0)
4302 break;
4303 ++count;
4304 if (got_int || out_of_stack)
4305 return FALSE;
4306 }
4307 }
4308 return FALSE;
4309 }
4310 /* break; Not Reached */
4311
4312 case NOMATCH:
4313 {
4314 regsave_T save;
4315
4316 /* If the operand matches, we fail. Otherwise backup and
4317 * continue with the next item. */
4318 reg_save(&save);
4319 if (regmatch(OPERAND(scan)))
4320 return FALSE;
4321 reg_restore(&save);
4322 }
4323 break;
4324
4325 case MATCH:
4326 case SUBPAT:
4327 {
4328 regsave_T save;
4329
4330 /* If the operand doesn't match, we fail. Otherwise backup
4331 * and continue with the next item. */
4332 reg_save(&save);
4333 if (!regmatch(OPERAND(scan)))
4334 return FALSE;
4335 if (op == MATCH) /* zero-width */
4336 reg_restore(&save);
4337 }
4338 break;
4339
4340 case BEHIND:
4341 case NOBEHIND:
4342 {
4343 regsave_T save_after, save_start;
4344 regsave_T save_behind_pos;
4345 int needmatch = (op == BEHIND);
4346
4347 /*
4348 * Look back in the input of the operand matches or not. This
4349 * must be done at every position in the input and checking if
4350 * the match ends at the current position.
4351 * First check if the next item matches, that's probably
4352 * faster.
4353 */
4354 reg_save(&save_start);
4355 if (regmatch(next))
4356 {
4357 /* save the position after the found match for next */
4358 reg_save(&save_after);
4359
4360 /* start looking for a match with operand at the current
4361 * postion. Go back one character until we find the
4362 * result, hitting the start of the line or the previous
4363 * line (for multi-line matching).
4364 * Set behind_pos to where the match should end, BHPOS
4365 * will match it. */
4366 save_behind_pos = behind_pos;
4367 behind_pos = save_start;
4368 for (;;)
4369 {
4370 reg_restore(&save_start);
4371 if (regmatch(OPERAND(scan))
4372 && reg_save_equal(&behind_pos))
4373 {
4374 behind_pos = save_behind_pos;
4375 /* found a match that ends where "next" started */
4376 if (needmatch)
4377 {
4378 reg_restore(&save_after);
4379 return TRUE;
4380 }
4381 return FALSE;
4382 }
4383 /*
4384 * No match: Go back one character. May go to
4385 * previous line once.
4386 */
4387 if (REG_MULTI)
4388 {
4389 if (save_start.rs_u.pos.col == 0)
4390 {
4391 if (save_start.rs_u.pos.lnum
4392 < behind_pos.rs_u.pos.lnum
4393 || reg_getline(
4394 --save_start.rs_u.pos.lnum) == NULL)
4395 break;
4396 reg_restore(&save_start);
4397 save_start.rs_u.pos.col =
4398 (colnr_T)STRLEN(regline);
4399 }
4400 else
4401 --save_start.rs_u.pos.col;
4402 }
4403 else
4404 {
4405 if (save_start.rs_u.ptr == regline)
4406 break;
4407 --save_start.rs_u.ptr;
4408 }
4409 }
4410
4411 /* NOBEHIND succeeds when no match was found */
4412 behind_pos = save_behind_pos;
4413 if (!needmatch)
4414 {
4415 reg_restore(&save_after);
4416 return TRUE;
4417 }
4418 }
4419 return FALSE;
4420 }
4421
4422 case BHPOS:
4423 if (REG_MULTI)
4424 {
4425 if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline)
4426 || behind_pos.rs_u.pos.lnum != reglnum)
4427 return FALSE;
4428 }
4429 else if (behind_pos.rs_u.ptr != reginput)
4430 return FALSE;
4431 break;
4432
4433 case NEWL:
4434 if ((c != NUL || reglnum == reg_maxline)
4435 && (c != '\n' || !reg_line_lbr))
4436 return FALSE;
4437 if (reg_line_lbr)
4438 ADVANCE_REGINPUT();
4439 else
4440 reg_nextline();
4441 break;
4442
4443 case END:
4444 return TRUE; /* Success! */
4445
4446 default:
4447 EMSG(_(e_re_corr));
4448#ifdef DEBUG
4449 printf("Illegal op code %d\n", op);
4450#endif
4451 return FALSE;
4452 }
4453 }
4454
4455 scan = next;
4456 }
4457
4458 /*
4459 * We get here only if there's trouble -- normally "case END" is the
4460 * terminating point.
4461 */
4462 EMSG(_(e_re_corr));
4463#ifdef DEBUG
4464 printf("Premature EOL\n");
4465#endif
4466 return FALSE;
4467}
4468
/*
 * ADVANCE_P(x) - move pointer "x" forward to the next character.
 * With FEAT_MBYTE and has_mbyte set the pointer is advanced by the byte
 * length that mb_ptr2len_check reports for the character at "x"; in all other
 * cases it is advanced by a single byte.
 * Use it as a statement: "ADVANCE_P(ptr);".  "x" may be evaluated more than
 * once, so it must not have side effects.
 */
#ifdef FEAT_MBYTE
# define ADVANCE_P(x) \
    do { \
        if (has_mbyte) \
            (x) += (*mb_ptr2len_check)(x); \
        else \
            ++(x); \
    } while (0)
#else
# define ADVANCE_P(x) ++(x)
#endif
4474
4475/*
4476 * regrepeat - repeatedly match something simple, return how many.
4477 * Advances reginput (and reglnum) to just after the matched chars.
4478 */
4479 static int
4480regrepeat(p, maxcount)
4481 char_u *p;
4482 long maxcount; /* maximum number of matches allowed */
4483{
4484 long count = 0;
4485 char_u *scan;
4486 char_u *opnd;
4487 int mask;
4488 int testval = 0;
4489
4490 scan = reginput; /* Make local copy of reginput for speed. */
4491 opnd = OPERAND(p);
4492 switch (OP(p))
4493 {
4494 case ANY:
4495 case ANY + ADD_NL:
4496 while (count < maxcount)
4497 {
4498 /* Matching anything means we continue until end-of-line (or
4499 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
4500 while (*scan != NUL && count < maxcount)
4501 {
4502 ++count;
4503 ADVANCE_P(scan);
4504 }
4505 if (!WITH_NL(OP(p)) || reglnum == reg_maxline || count == maxcount)
4506 break;
4507 ++count; /* count the line-break */
4508 reg_nextline();
4509 scan = reginput;
4510 if (got_int)
4511 break;
4512 }
4513 break;
4514
4515 case IDENT:
4516 case IDENT + ADD_NL:
4517 testval = TRUE;
4518 /*FALLTHROUGH*/
4519 case SIDENT:
4520 case SIDENT + ADD_NL:
4521 while (count < maxcount)
4522 {
4523 if (vim_isIDc(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4524 {
4525 ADVANCE_P(scan);
4526 }
4527 else if (*scan == NUL)
4528 {
4529 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4530 break;
4531 reg_nextline();
4532 scan = reginput;
4533 if (got_int)
4534 break;
4535 }
4536 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4537 ++scan;
4538 else
4539 break;
4540 ++count;
4541 }
4542 break;
4543
4544 case KWORD:
4545 case KWORD + ADD_NL:
4546 testval = TRUE;
4547 /*FALLTHROUGH*/
4548 case SKWORD:
4549 case SKWORD + ADD_NL:
4550 while (count < maxcount)
4551 {
4552 if (vim_iswordp(scan) && (testval || !VIM_ISDIGIT(*scan)))
4553 {
4554 ADVANCE_P(scan);
4555 }
4556 else if (*scan == NUL)
4557 {
4558 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4559 break;
4560 reg_nextline();
4561 scan = reginput;
4562 if (got_int)
4563 break;
4564 }
4565 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4566 ++scan;
4567 else
4568 break;
4569 ++count;
4570 }
4571 break;
4572
4573 case FNAME:
4574 case FNAME + ADD_NL:
4575 testval = TRUE;
4576 /*FALLTHROUGH*/
4577 case SFNAME:
4578 case SFNAME + ADD_NL:
4579 while (count < maxcount)
4580 {
4581 if (vim_isfilec(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4582 {
4583 ADVANCE_P(scan);
4584 }
4585 else if (*scan == NUL)
4586 {
4587 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4588 break;
4589 reg_nextline();
4590 scan = reginput;
4591 if (got_int)
4592 break;
4593 }
4594 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4595 ++scan;
4596 else
4597 break;
4598 ++count;
4599 }
4600 break;
4601
4602 case PRINT:
4603 case PRINT + ADD_NL:
4604 testval = TRUE;
4605 /*FALLTHROUGH*/
4606 case SPRINT:
4607 case SPRINT + ADD_NL:
4608 while (count < maxcount)
4609 {
4610 if (*scan == NUL)
4611 {
4612 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4613 break;
4614 reg_nextline();
4615 scan = reginput;
4616 if (got_int)
4617 break;
4618 }
4619 else if (ptr2cells(scan) == 1 && (testval || !VIM_ISDIGIT(*scan)))
4620 {
4621 ADVANCE_P(scan);
4622 }
4623 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4624 ++scan;
4625 else
4626 break;
4627 ++count;
4628 }
4629 break;
4630
4631 case WHITE:
4632 case WHITE + ADD_NL:
4633 testval = mask = RI_WHITE;
4634do_class:
4635 while (count < maxcount)
4636 {
4637#ifdef FEAT_MBYTE
4638 int l;
4639#endif
4640 if (*scan == NUL)
4641 {
4642 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4643 break;
4644 reg_nextline();
4645 scan = reginput;
4646 if (got_int)
4647 break;
4648 }
4649#ifdef FEAT_MBYTE
4650 else if (has_mbyte && (l = (*mb_ptr2len_check)(scan)) > 1)
4651 {
4652 if (testval != 0)
4653 break;
4654 scan += l;
4655 }
4656#endif
4657 else if ((class_tab[*scan] & mask) == testval)
4658 ++scan;
4659 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4660 ++scan;
4661 else
4662 break;
4663 ++count;
4664 }
4665 break;
4666
4667 case NWHITE:
4668 case NWHITE + ADD_NL:
4669 mask = RI_WHITE;
4670 goto do_class;
4671 case DIGIT:
4672 case DIGIT + ADD_NL:
4673 testval = mask = RI_DIGIT;
4674 goto do_class;
4675 case NDIGIT:
4676 case NDIGIT + ADD_NL:
4677 mask = RI_DIGIT;
4678 goto do_class;
4679 case HEX:
4680 case HEX + ADD_NL:
4681 testval = mask = RI_HEX;
4682 goto do_class;
4683 case NHEX:
4684 case NHEX + ADD_NL:
4685 mask = RI_HEX;
4686 goto do_class;
4687 case OCTAL:
4688 case OCTAL + ADD_NL:
4689 testval = mask = RI_OCTAL;
4690 goto do_class;
4691 case NOCTAL:
4692 case NOCTAL + ADD_NL:
4693 mask = RI_OCTAL;
4694 goto do_class;
4695 case WORD:
4696 case WORD + ADD_NL:
4697 testval = mask = RI_WORD;
4698 goto do_class;
4699 case NWORD:
4700 case NWORD + ADD_NL:
4701 mask = RI_WORD;
4702 goto do_class;
4703 case HEAD:
4704 case HEAD + ADD_NL:
4705 testval = mask = RI_HEAD;
4706 goto do_class;
4707 case NHEAD:
4708 case NHEAD + ADD_NL:
4709 mask = RI_HEAD;
4710 goto do_class;
4711 case ALPHA:
4712 case ALPHA + ADD_NL:
4713 testval = mask = RI_ALPHA;
4714 goto do_class;
4715 case NALPHA:
4716 case NALPHA + ADD_NL:
4717 mask = RI_ALPHA;
4718 goto do_class;
4719 case LOWER:
4720 case LOWER + ADD_NL:
4721 testval = mask = RI_LOWER;
4722 goto do_class;
4723 case NLOWER:
4724 case NLOWER + ADD_NL:
4725 mask = RI_LOWER;
4726 goto do_class;
4727 case UPPER:
4728 case UPPER + ADD_NL:
4729 testval = mask = RI_UPPER;
4730 goto do_class;
4731 case NUPPER:
4732 case NUPPER + ADD_NL:
4733 mask = RI_UPPER;
4734 goto do_class;
4735
4736 case EXACTLY:
4737 {
4738 int cu, cl;
4739
4740 /* This doesn't do a multi-byte character, because a MULTIBYTECODE
4741 * would have been used for it. */
4742 if (ireg_ic)
4743 {
4744 cu = TOUPPER_LOC(*opnd);
4745 cl = TOLOWER_LOC(*opnd);
4746 while (count < maxcount && (*scan == cu || *scan == cl))
4747 {
4748 count++;
4749 scan++;
4750 }
4751 }
4752 else
4753 {
4754 cu = *opnd;
4755 while (count < maxcount && *scan == cu)
4756 {
4757 count++;
4758 scan++;
4759 }
4760 }
4761 break;
4762 }
4763
4764#ifdef FEAT_MBYTE
4765 case MULTIBYTECODE:
4766 {
4767 int i, len, cf = 0;
4768
4769 /* Safety check (just in case 'encoding' was changed since
4770 * compiling the program). */
4771 if ((len = (*mb_ptr2len_check)(opnd)) > 1)
4772 {
4773 if (ireg_ic && enc_utf8)
4774 cf = utf_fold(utf_ptr2char(opnd));
4775 while (count < maxcount)
4776 {
4777 for (i = 0; i < len; ++i)
4778 if (opnd[i] != scan[i])
4779 break;
4780 if (i < len && (!ireg_ic || !enc_utf8
4781 || utf_fold(utf_ptr2char(scan)) != cf))
4782 break;
4783 scan += len;
4784 ++count;
4785 }
4786 }
4787 }
4788 break;
4789#endif
4790
4791 case ANYOF:
4792 case ANYOF + ADD_NL:
4793 testval = TRUE;
4794 /*FALLTHROUGH*/
4795
4796 case ANYBUT:
4797 case ANYBUT + ADD_NL:
4798 while (count < maxcount)
4799 {
4800#ifdef FEAT_MBYTE
4801 int len;
4802#endif
4803 if (*scan == NUL)
4804 {
4805 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4806 break;
4807 reg_nextline();
4808 scan = reginput;
4809 if (got_int)
4810 break;
4811 }
4812 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4813 ++scan;
4814#ifdef FEAT_MBYTE
4815 else if (has_mbyte && (len = (*mb_ptr2len_check)(scan)) > 1)
4816 {
4817 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
4818 break;
4819 scan += len;
4820 }
4821#endif
4822 else
4823 {
4824 if ((cstrchr(opnd, *scan) == NULL) == testval)
4825 break;
4826 ++scan;
4827 }
4828 ++count;
4829 }
4830 break;
4831
4832 case NEWL:
4833 while (count < maxcount
4834 && ((*scan == NUL && reglnum < reg_maxline)
4835 || (*scan == '\n' && reg_line_lbr)))
4836 {
4837 count++;
4838 if (reg_line_lbr)
4839 ADVANCE_REGINPUT();
4840 else
4841 reg_nextline();
4842 scan = reginput;
4843 if (got_int)
4844 break;
4845 }
4846 break;
4847
4848 default: /* Oh dear. Called inappropriately. */
4849 EMSG(_(e_re_corr));
4850#ifdef DEBUG
4851 printf("Called regrepeat with op code %d\n", OP(p));
4852#endif
4853 break;
4854 }
4855
4856 reginput = scan;
4857
4858 return (int)count;
4859}
4860
4861/*
4862 * regnext - dig the "next" pointer out of a node
4863 */
4864 static char_u *
4865regnext(p)
4866 char_u *p;
4867{
4868 int offset;
4869
4870 if (p == JUST_CALC_SIZE)
4871 return NULL;
4872
4873 offset = NEXT(p);
4874 if (offset == 0)
4875 return NULL;
4876
4877 if (OP(p) == BACK)
4878 return p - offset;
4879 else
4880 return p + offset;
4881}
4882
4883/*
4884 * Check the regexp program for its magic number.
4885 * Return TRUE if it's wrong.
4886 */
4887 static int
4888prog_magic_wrong()
4889{
4890 if (UCHARAT(REG_MULTI
4891 ? reg_mmatch->regprog->program
4892 : reg_match->regprog->program) != REGMAGIC)
4893 {
4894 EMSG(_(e_re_corr));
4895 return TRUE;
4896 }
4897 return FALSE;
4898}
4899
4900/*
4901 * Cleanup the subexpressions, if this wasn't done yet.
4902 * This construction is used to clear the subexpressions only when they are
4903 * used (to increase speed).
4904 */
4905 static void
4906cleanup_subexpr()
4907{
4908 if (need_clear_subexpr)
4909 {
4910 if (REG_MULTI)
4911 {
4912 /* Use 0xff to set lnum to -1 */
4913 vim_memset(reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4914 vim_memset(reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4915 }
4916 else
4917 {
4918 vim_memset(reg_startp, 0, sizeof(char_u *) * NSUBEXP);
4919 vim_memset(reg_endp, 0, sizeof(char_u *) * NSUBEXP);
4920 }
4921 need_clear_subexpr = FALSE;
4922 }
4923}
4924
#ifdef FEAT_SYN_HL
/*
 * Reset the recorded \z( subexpression positions, but only when
 * need_clear_zsubexpr says this is still pending.
 */
    static void
cleanup_zsubexpr()
{
    if (!need_clear_zsubexpr)
        return;

    if (!REG_MULTI)
    {
        /* Single-line matching: NULL pointers mean "not set". */
        vim_memset(reg_startzp, 0, NSUBEXP * sizeof(char_u *));
        vim_memset(reg_endzp, 0, NSUBEXP * sizeof(char_u *));
    }
    else
    {
        /* Multi-line matching: filling with 0xff bytes makes lnum -1. */
        vim_memset(reg_startzpos, 0xff, NSUBEXP * sizeof(lpos_T));
        vim_memset(reg_endzpos, 0xff, NSUBEXP * sizeof(lpos_T));
    }
    need_clear_zsubexpr = FALSE;
}
#endif
4946
4947/*
4948 * Advance reglnum, regline and reginput to the next line.
4949 */
4950 static void
4951reg_nextline()
4952{
4953 regline = reg_getline(++reglnum);
4954 reginput = regline;
4955 fast_breakcheck();
4956}
4957
4958/*
4959 * Save the input line and position in a regsave_T.
4960 */
4961 static void
4962reg_save(save)
4963 regsave_T *save;
4964{
4965 if (REG_MULTI)
4966 {
4967 save->rs_u.pos.col = (colnr_T)(reginput - regline);
4968 save->rs_u.pos.lnum = reglnum;
4969 }
4970 else
4971 save->rs_u.ptr = reginput;
4972}
4973
4974/*
4975 * Restore the input line and position from a regsave_T.
4976 */
4977 static void
4978reg_restore(save)
4979 regsave_T *save;
4980{
4981 if (REG_MULTI)
4982 {
4983 if (reglnum != save->rs_u.pos.lnum)
4984 {
4985 /* only call reg_getline() when the line number changed to save
4986 * a bit of time */
4987 reglnum = save->rs_u.pos.lnum;
4988 regline = reg_getline(reglnum);
4989 }
4990 reginput = regline + save->rs_u.pos.col;
4991 }
4992 else
4993 reginput = save->rs_u.ptr;
4994}
4995
4996/*
4997 * Return TRUE if current position is equal to saved position.
4998 */
4999 static int
5000reg_save_equal(save)
5001 regsave_T *save;
5002{
5003 if (REG_MULTI)
5004 return reglnum == save->rs_u.pos.lnum
5005 && reginput == regline + save->rs_u.pos.col;
5006 return reginput == save->rs_u.ptr;
5007}
5008
5009/*
5010 * Tentatively set the sub-expression start to the current position (after
5011 * calling regmatch() they will have changed). Need to save the existing
5012 * values for when there is no match.
5013 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
5014 * depending on REG_MULTI.
5015 */
5016 static void
5017save_se_multi(savep, posp)
5018 save_se_T *savep;
5019 lpos_T *posp;
5020{
5021 savep->se_u.pos = *posp;
5022 posp->lnum = reglnum;
5023 posp->col = (colnr_T)(reginput - regline);
5024}
5025
5026 static void
5027save_se_one(savep, pp)
5028 save_se_T *savep;
5029 char_u **pp;
5030{
5031 savep->se_u.ptr = *pp;
5032 *pp = reginput;
5033}
5034
5035/*
5036 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5037 */
5038 static int
5039re_num_cmp(val, scan)
5040 long_u val;
5041 char_u *scan;
5042{
5043 long_u n = OPERAND_MIN(scan);
5044
5045 if (OPERAND_CMP(scan) == '>')
5046 return val > n;
5047 if (OPERAND_CMP(scan) == '<')
5048 return val < n;
5049 return val == n;
5050}
5051
5052
5053#ifdef DEBUG
5054
5055/*
5056 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5057 */
5058 static void
5059regdump(pattern, r)
5060 char_u *pattern;
5061 regprog_T *r;
5062{
5063 char_u *s;
5064 int op = EXACTLY; /* Arbitrary non-END op. */
5065 char_u *next;
5066 char_u *end = NULL;
5067
5068 printf("\r\nregcomp(%s):\r\n", pattern);
5069
5070 s = r->program + 1;
5071 /*
5072 * Loop until we find the END that isn't before a referred next (an END
5073 * can also appear in a NOMATCH operand).
5074 */
5075 while (op != END || s <= end)
5076 {
5077 op = OP(s);
5078 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
5079 next = regnext(s);
5080 if (next == NULL) /* Next ptr. */
5081 printf("(0)");
5082 else
5083 printf("(%d)", (int)((s - r->program) + (next - s)));
5084 if (end < next)
5085 end = next;
5086 if (op == BRACE_LIMITS)
5087 {
5088 /* Two short ints */
5089 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5090 s += 8;
5091 }
5092 s += 3;
5093 if (op == ANYOF || op == ANYOF + ADD_NL
5094 || op == ANYBUT || op == ANYBUT + ADD_NL
5095 || op == EXACTLY)
5096 {
5097 /* Literal string, where present. */
5098 while (*s != NUL)
5099 printf("%c", *s++);
5100 s++;
5101 }
5102 printf("\r\n");
5103 }
5104
5105 /* Header fields of interest. */
5106 if (r->regstart != NUL)
5107 printf("start `%s' 0x%x; ", r->regstart < 256
5108 ? (char *)transchar(r->regstart)
5109 : "multibyte", r->regstart);
5110 if (r->reganch)
5111 printf("anchored; ");
5112 if (r->regmust != NULL)
5113 printf("must have \"%s\"", r->regmust);
5114 printf("\r\n");
5115}
5116
5117/*
5118 * regprop - printable representation of opcode
5119 */
5120 static char_u *
5121regprop(op)
5122 char_u *op;
5123{
5124 char_u *p;
5125 static char_u buf[50];
5126
5127 (void) strcpy(buf, ":");
5128
5129 switch (OP(op))
5130 {
5131 case BOL:
5132 p = "BOL";
5133 break;
5134 case EOL:
5135 p = "EOL";
5136 break;
5137 case RE_BOF:
5138 p = "BOF";
5139 break;
5140 case RE_EOF:
5141 p = "EOF";
5142 break;
5143 case CURSOR:
5144 p = "CURSOR";
5145 break;
5146 case RE_LNUM:
5147 p = "RE_LNUM";
5148 break;
5149 case RE_COL:
5150 p = "RE_COL";
5151 break;
5152 case RE_VCOL:
5153 p = "RE_VCOL";
5154 break;
5155 case BOW:
5156 p = "BOW";
5157 break;
5158 case EOW:
5159 p = "EOW";
5160 break;
5161 case ANY:
5162 p = "ANY";
5163 break;
5164 case ANY + ADD_NL:
5165 p = "ANY+NL";
5166 break;
5167 case ANYOF:
5168 p = "ANYOF";
5169 break;
5170 case ANYOF + ADD_NL:
5171 p = "ANYOF+NL";
5172 break;
5173 case ANYBUT:
5174 p = "ANYBUT";
5175 break;
5176 case ANYBUT + ADD_NL:
5177 p = "ANYBUT+NL";
5178 break;
5179 case IDENT:
5180 p = "IDENT";
5181 break;
5182 case IDENT + ADD_NL:
5183 p = "IDENT+NL";
5184 break;
5185 case SIDENT:
5186 p = "SIDENT";
5187 break;
5188 case SIDENT + ADD_NL:
5189 p = "SIDENT+NL";
5190 break;
5191 case KWORD:
5192 p = "KWORD";
5193 break;
5194 case KWORD + ADD_NL:
5195 p = "KWORD+NL";
5196 break;
5197 case SKWORD:
5198 p = "SKWORD";
5199 break;
5200 case SKWORD + ADD_NL:
5201 p = "SKWORD+NL";
5202 break;
5203 case FNAME:
5204 p = "FNAME";
5205 break;
5206 case FNAME + ADD_NL:
5207 p = "FNAME+NL";
5208 break;
5209 case SFNAME:
5210 p = "SFNAME";
5211 break;
5212 case SFNAME + ADD_NL:
5213 p = "SFNAME+NL";
5214 break;
5215 case PRINT:
5216 p = "PRINT";
5217 break;
5218 case PRINT + ADD_NL:
5219 p = "PRINT+NL";
5220 break;
5221 case SPRINT:
5222 p = "SPRINT";
5223 break;
5224 case SPRINT + ADD_NL:
5225 p = "SPRINT+NL";
5226 break;
5227 case WHITE:
5228 p = "WHITE";
5229 break;
5230 case WHITE + ADD_NL:
5231 p = "WHITE+NL";
5232 break;
5233 case NWHITE:
5234 p = "NWHITE";
5235 break;
5236 case NWHITE + ADD_NL:
5237 p = "NWHITE+NL";
5238 break;
5239 case DIGIT:
5240 p = "DIGIT";
5241 break;
5242 case DIGIT + ADD_NL:
5243 p = "DIGIT+NL";
5244 break;
5245 case NDIGIT:
5246 p = "NDIGIT";
5247 break;
5248 case NDIGIT + ADD_NL:
5249 p = "NDIGIT+NL";
5250 break;
5251 case HEX:
5252 p = "HEX";
5253 break;
5254 case HEX + ADD_NL:
5255 p = "HEX+NL";
5256 break;
5257 case NHEX:
5258 p = "NHEX";
5259 break;
5260 case NHEX + ADD_NL:
5261 p = "NHEX+NL";
5262 break;
5263 case OCTAL:
5264 p = "OCTAL";
5265 break;
5266 case OCTAL + ADD_NL:
5267 p = "OCTAL+NL";
5268 break;
5269 case NOCTAL:
5270 p = "NOCTAL";
5271 break;
5272 case NOCTAL + ADD_NL:
5273 p = "NOCTAL+NL";
5274 break;
5275 case WORD:
5276 p = "WORD";
5277 break;
5278 case WORD + ADD_NL:
5279 p = "WORD+NL";
5280 break;
5281 case NWORD:
5282 p = "NWORD";
5283 break;
5284 case NWORD + ADD_NL:
5285 p = "NWORD+NL";
5286 break;
5287 case HEAD:
5288 p = "HEAD";
5289 break;
5290 case HEAD + ADD_NL:
5291 p = "HEAD+NL";
5292 break;
5293 case NHEAD:
5294 p = "NHEAD";
5295 break;
5296 case NHEAD + ADD_NL:
5297 p = "NHEAD+NL";
5298 break;
5299 case ALPHA:
5300 p = "ALPHA";
5301 break;
5302 case ALPHA + ADD_NL:
5303 p = "ALPHA+NL";
5304 break;
5305 case NALPHA:
5306 p = "NALPHA";
5307 break;
5308 case NALPHA + ADD_NL:
5309 p = "NALPHA+NL";
5310 break;
5311 case LOWER:
5312 p = "LOWER";
5313 break;
5314 case LOWER + ADD_NL:
5315 p = "LOWER+NL";
5316 break;
5317 case NLOWER:
5318 p = "NLOWER";
5319 break;
5320 case NLOWER + ADD_NL:
5321 p = "NLOWER+NL";
5322 break;
5323 case UPPER:
5324 p = "UPPER";
5325 break;
5326 case UPPER + ADD_NL:
5327 p = "UPPER+NL";
5328 break;
5329 case NUPPER:
5330 p = "NUPPER";
5331 break;
5332 case NUPPER + ADD_NL:
5333 p = "NUPPER+NL";
5334 break;
5335 case BRANCH:
5336 p = "BRANCH";
5337 break;
5338 case EXACTLY:
5339 p = "EXACTLY";
5340 break;
5341 case NOTHING:
5342 p = "NOTHING";
5343 break;
5344 case BACK:
5345 p = "BACK";
5346 break;
5347 case END:
5348 p = "END";
5349 break;
5350 case MOPEN + 0:
5351 p = "MATCH START";
5352 break;
5353 case MOPEN + 1:
5354 case MOPEN + 2:
5355 case MOPEN + 3:
5356 case MOPEN + 4:
5357 case MOPEN + 5:
5358 case MOPEN + 6:
5359 case MOPEN + 7:
5360 case MOPEN + 8:
5361 case MOPEN + 9:
5362 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5363 p = NULL;
5364 break;
5365 case MCLOSE + 0:
5366 p = "MATCH END";
5367 break;
5368 case MCLOSE + 1:
5369 case MCLOSE + 2:
5370 case MCLOSE + 3:
5371 case MCLOSE + 4:
5372 case MCLOSE + 5:
5373 case MCLOSE + 6:
5374 case MCLOSE + 7:
5375 case MCLOSE + 8:
5376 case MCLOSE + 9:
5377 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5378 p = NULL;
5379 break;
5380 case BACKREF + 1:
5381 case BACKREF + 2:
5382 case BACKREF + 3:
5383 case BACKREF + 4:
5384 case BACKREF + 5:
5385 case BACKREF + 6:
5386 case BACKREF + 7:
5387 case BACKREF + 8:
5388 case BACKREF + 9:
5389 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5390 p = NULL;
5391 break;
5392 case NOPEN:
5393 p = "NOPEN";
5394 break;
5395 case NCLOSE:
5396 p = "NCLOSE";
5397 break;
5398#ifdef FEAT_SYN_HL
5399 case ZOPEN + 1:
5400 case ZOPEN + 2:
5401 case ZOPEN + 3:
5402 case ZOPEN + 4:
5403 case ZOPEN + 5:
5404 case ZOPEN + 6:
5405 case ZOPEN + 7:
5406 case ZOPEN + 8:
5407 case ZOPEN + 9:
5408 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5409 p = NULL;
5410 break;
5411 case ZCLOSE + 1:
5412 case ZCLOSE + 2:
5413 case ZCLOSE + 3:
5414 case ZCLOSE + 4:
5415 case ZCLOSE + 5:
5416 case ZCLOSE + 6:
5417 case ZCLOSE + 7:
5418 case ZCLOSE + 8:
5419 case ZCLOSE + 9:
5420 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5421 p = NULL;
5422 break;
5423 case ZREF + 1:
5424 case ZREF + 2:
5425 case ZREF + 3:
5426 case ZREF + 4:
5427 case ZREF + 5:
5428 case ZREF + 6:
5429 case ZREF + 7:
5430 case ZREF + 8:
5431 case ZREF + 9:
5432 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5433 p = NULL;
5434 break;
5435#endif
5436 case STAR:
5437 p = "STAR";
5438 break;
5439 case PLUS:
5440 p = "PLUS";
5441 break;
5442 case NOMATCH:
5443 p = "NOMATCH";
5444 break;
5445 case MATCH:
5446 p = "MATCH";
5447 break;
5448 case BEHIND:
5449 p = "BEHIND";
5450 break;
5451 case NOBEHIND:
5452 p = "NOBEHIND";
5453 break;
5454 case SUBPAT:
5455 p = "SUBPAT";
5456 break;
5457 case BRACE_LIMITS:
5458 p = "BRACE_LIMITS";
5459 break;
5460 case BRACE_SIMPLE:
5461 p = "BRACE_SIMPLE";
5462 break;
5463 case BRACE_COMPLEX + 0:
5464 case BRACE_COMPLEX + 1:
5465 case BRACE_COMPLEX + 2:
5466 case BRACE_COMPLEX + 3:
5467 case BRACE_COMPLEX + 4:
5468 case BRACE_COMPLEX + 5:
5469 case BRACE_COMPLEX + 6:
5470 case BRACE_COMPLEX + 7:
5471 case BRACE_COMPLEX + 8:
5472 case BRACE_COMPLEX + 9:
5473 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5474 p = NULL;
5475 break;
5476#ifdef FEAT_MBYTE
5477 case MULTIBYTECODE:
5478 p = "MULTIBYTECODE";
5479 break;
5480#endif
5481 case NEWL:
5482 p = "NEWL";
5483 break;
5484 default:
5485 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5486 p = NULL;
5487 break;
5488 }
5489 if (p != NULL)
5490 (void) strcat(buf, p);
5491 return buf;
5492}
5493#endif
5494
5495#ifdef FEAT_MBYTE
5496static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
5497
/*
 * One entry of the decomposition table: up to three characters that a
 * precomposed character stands for.  Unused positions are zero.
 */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decompositions for the characters 0xfb20 - 0xfb4f, indexed by
 * "character - 0xfb20".  Entries marked UNUSED map to themselves.
 */
decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},                /* 0xfb20       alt ayin */
    {0x5d0,0,0},                /* 0xfb21       alt alef */
    {0x5d3,0,0},                /* 0xfb22       alt dalet */
    {0x5d4,0,0},                /* 0xfb23       alt he */
    {0x5db,0,0},                /* 0xfb24       alt kaf */
    {0x5dc,0,0},                /* 0xfb25       alt lamed */
    {0x5dd,0,0},                /* 0xfb26       alt mem-sofit */
    {0x5e8,0,0},                /* 0xfb27       alt resh */
    {0x5ea,0,0},                /* 0xfb28       alt tav */
    {'+', 0, 0},                /* 0xfb29       alt plus */
    {0x5e9, 0x5c1, 0},          /* 0xfb2a       shin+shin-dot */
    {0x5e9, 0x5c2, 0},          /* 0xfb2b       shin+sin-dot */
    {0x5e9, 0x5c1, 0x5bc},      /* 0xfb2c       shin+shin-dot+dagesh */
    {0x5e9, 0x5c2, 0x5bc},      /* 0xfb2d       shin+sin-dot+dagesh */
    {0x5d0, 0x5b7, 0},          /* 0xfb2e       alef+patah */
    {0x5d0, 0x5b8, 0},          /* 0xfb2f       alef+qamats */
    {0x5d0, 0x5b4, 0},          /* 0xfb30       alef+hiriq */
    {0x5d1, 0x5bc, 0},          /* 0xfb31       bet+dagesh */
    {0x5d2, 0x5bc, 0},          /* 0xfb32       gimel+dagesh */
    {0x5d3, 0x5bc, 0},          /* 0xfb33       dalet+dagesh */
    {0x5d4, 0x5bc, 0},          /* 0xfb34       he+dagesh */
    {0x5d5, 0x5bc, 0},          /* 0xfb35       vav+dagesh */
    {0x5d6, 0x5bc, 0},          /* 0xfb36       zayin+dagesh */
    {0xfb37, 0, 0},             /* 0xfb37 -- UNUSED */
    {0x5d8, 0x5bc, 0},          /* 0xfb38       tet+dagesh */
    {0x5d9, 0x5bc, 0},          /* 0xfb39       yud+dagesh */
    {0x5da, 0x5bc, 0},          /* 0xfb3a       kaf sofit+dagesh */
    {0x5db, 0x5bc, 0},          /* 0xfb3b       kaf+dagesh */
    {0x5dc, 0x5bc, 0},          /* 0xfb3c       lamed+dagesh */
    {0xfb3d, 0, 0},             /* 0xfb3d -- UNUSED */
    {0x5de, 0x5bc, 0},          /* 0xfb3e       mem+dagesh */
    {0xfb3f, 0, 0},             /* 0xfb3f -- UNUSED */
    {0x5e0, 0x5bc, 0},          /* 0xfb40       nun+dagesh */
    {0x5e1, 0x5bc, 0},          /* 0xfb41       samech+dagesh */
    {0xfb42, 0, 0},             /* 0xfb42 -- UNUSED */
    {0x5e3, 0x5bc, 0},          /* 0xfb43       pe sofit+dagesh */
    {0x5e4, 0x5bc,0},           /* 0xfb44       pe+dagesh */
    {0xfb45, 0, 0},             /* 0xfb45 -- UNUSED */
    {0x5e6, 0x5bc, 0},          /* 0xfb46       tsadi+dagesh */
    {0x5e7, 0x5bc, 0},          /* 0xfb47       qof+dagesh */
    {0x5e8, 0x5bc, 0},          /* 0xfb48       resh+dagesh */
    {0x5e9, 0x5bc, 0},          /* 0xfb49       shin+dagesh */
    {0x5ea, 0x5bc, 0},          /* 0xfb4a       tav+dagesh */
    {0x5d5, 0x5b9, 0},          /* 0xfb4b       vav+holam */
    {0x5d1, 0x5bf, 0},          /* 0xfb4c       bet+rafe */
    {0x5db, 0x5bf, 0},          /* 0xfb4d       kaf+rafe */
    {0x5e4, 0x5bf, 0},          /* 0xfb4e       pe+rafe */
    {0x5d0, 0x5dc, 0}           /* 0xfb4f       alef-lamed */
};

/*
 * Decompose character "c" into up to three characters, stored in "*c1",
 * "*c2" and "*c3".
 * Only the characters 0xfb20 - 0xfb4f have an entry in decomp_table[]; any
 * other character is returned unchanged in "*c1" with "*c2" and "*c3" zero.
 */
    static void
mb_decompose(c, c1, c2, c3)
    int         c, *c1, *c2, *c3;
{
    decomp_T    d;

    /* The lower bound must be the first character of the table, otherwise
     * the index below becomes negative and reads outside of the table. */
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
        d = decomp_table[c - 0xfb20];
        *c1 = d.a;
        *c2 = d.b;
        *c3 = d.c;
    }
    else
    {
        *c1 = c;
        *c2 = *c3 = 0;
    }
}
5576#endif
5577
5578/*
5579 * Compare two strings, ignore case if ireg_ic set.
5580 * Return 0 if strings match, non-zero otherwise.
5581 * Correct the length "*n" when composing characters are ignored.
5582 */
5583 static int
5584cstrncmp(s1, s2, n)
5585 char_u *s1, *s2;
5586 int *n;
5587{
5588 int result;
5589
5590 if (!ireg_ic)
5591 result = STRNCMP(s1, s2, *n);
5592 else
5593 result = MB_STRNICMP(s1, s2, *n);
5594
5595#ifdef FEAT_MBYTE
5596 /* if it failed and it's utf8 and we want to combineignore: */
5597 if (result != 0 && enc_utf8 && ireg_icombine)
5598 {
5599 char_u *str1, *str2;
5600 int c1, c2, c11, c12;
5601 int ix;
5602 int junk;
5603
5604 /* we have to handle the strcmp ourselves, since it is necessary to
5605 * deal with the composing characters by ignoring them: */
5606 str1 = s1;
5607 str2 = s2;
5608 c1 = c2 = 0;
5609 for (ix = 0; ix < *n; )
5610 {
5611 c1 = mb_ptr2char_adv(&str1);
5612 c2 = mb_ptr2char_adv(&str2);
5613 ix += utf_char2len(c1);
5614
5615 /* decompose the character if necessary, into 'base' characters
5616 * because I don't care about Arabic, I will hard-code the Hebrew
5617 * which I *do* care about! So sue me... */
5618 if (c1 != c2 && (!ireg_ic || utf_fold(c1) != utf_fold(c2)))
5619 {
5620 /* decomposition necessary? */
5621 mb_decompose(c1, &c11, &junk, &junk);
5622 mb_decompose(c2, &c12, &junk, &junk);
5623 c1 = c11;
5624 c2 = c12;
5625 if (c11 != c12 && (!ireg_ic || utf_fold(c11) != utf_fold(c12)))
5626 break;
5627 }
5628 }
5629 result = c2 - c1;
5630 if (result == 0)
5631 *n = (int)(str2 - s2);
5632 }
5633#endif
5634
5635 return result;
5636}
5637
5638/*
5639 * cstrchr: This function is used a lot for simple searches, keep it fast!
5640 */
5641 static char_u *
5642cstrchr(s, c)
5643 char_u *s;
5644 int c;
5645{
5646 char_u *p;
5647 int cc;
5648
5649 if (!ireg_ic
5650#ifdef FEAT_MBYTE
5651 || (!enc_utf8 && mb_char2len(c) > 1)
5652#endif
5653 )
5654 return vim_strchr(s, c);
5655
5656 /* tolower() and toupper() can be slow, comparing twice should be a lot
5657 * faster (esp. when using MS Visual C++!).
5658 * For UTF-8 need to use folded case. */
5659#ifdef FEAT_MBYTE
5660 if (enc_utf8 && c > 0x80)
5661 cc = utf_fold(c);
5662 else
5663#endif
5664 if (isupper(c))
5665 cc = TOLOWER_LOC(c);
5666 else if (islower(c))
5667 cc = TOUPPER_LOC(c);
5668 else
5669 return vim_strchr(s, c);
5670
5671#ifdef FEAT_MBYTE
5672 if (has_mbyte)
5673 {
5674 for (p = s; *p != NUL; p += (*mb_ptr2len_check)(p))
5675 {
5676 if (enc_utf8 && c > 0x80)
5677 {
5678 if (utf_fold(utf_ptr2char(p)) == cc)
5679 return p;
5680 }
5681 else if (*p == c || *p == cc)
5682 return p;
5683 }
5684 }
5685 else
5686#endif
5687 /* Faster version for when there are no multi-byte characters. */
5688 for (p = s; *p != NUL; ++p)
5689 if (*p == c || *p == cc)
5690 return p;
5691
5692 return NULL;
5693}
5694
5695/***************************************************************
5696 * regsub stuff *
5697 ***************************************************************/
5698
5699/* This stuff below really confuses cc on an SGI -- webb */
5700#ifdef __sgi
5701# undef __ARGS
5702# define __ARGS(x) ()
5703#endif
5704
/*
 * We should define fptr as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * pointer to a function returning void. This should work for all compilers.
 */
5711typedef void (*(*fptr) __ARGS((char_u *, int)))();
5712
5713static fptr do_upper __ARGS((char_u *, int));
5714static fptr do_Upper __ARGS((char_u *, int));
5715static fptr do_lower __ARGS((char_u *, int));
5716static fptr do_Lower __ARGS((char_u *, int));
5717
5718static int vim_regsub_both __ARGS((char_u *source, char_u *dest, int copy, int magic, int backslash));
5719
5720 static fptr
5721do_upper(d, c)
5722 char_u *d;
5723 int c;
5724{
5725 *d = TOUPPER_LOC(c);
5726
5727 return (fptr)NULL;
5728}
5729
5730 static fptr
5731do_Upper(d, c)
5732 char_u *d;
5733 int c;
5734{
5735 *d = TOUPPER_LOC(c);
5736
5737 return (fptr)do_Upper;
5738}
5739
5740 static fptr
5741do_lower(d, c)
5742 char_u *d;
5743 int c;
5744{
5745 *d = TOLOWER_LOC(c);
5746
5747 return (fptr)NULL;
5748}
5749
5750 static fptr
5751do_Lower(d, c)
5752 char_u *d;
5753 int c;
5754{
5755 *d = TOLOWER_LOC(c);
5756
5757 return (fptr)do_Lower;
5758}
5759
5760/*
5761 * regtilde(): Replace tildes in the pattern by the old pattern.
5762 *
5763 * Short explanation of the tilde: It stands for the previous replacement
5764 * pattern. If that previous pattern also contains a ~ we should go back a
5765 * step further... But we insert the previous pattern into the current one
5766 * and remember that.
5767 * This still does not handle the case where "magic" changes. TODO?
5768 *
5769 * The tildes are parsed once before the first call to vim_regsub().
5770 */
5771 char_u *
5772regtilde(source, magic)
5773 char_u *source;
5774 int magic;
5775{
5776 char_u *newsub = source;
5777 char_u *tmpsub;
5778 char_u *p;
5779 int len;
5780 int prevlen;
5781
5782 for (p = newsub; *p; ++p)
5783 {
5784 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
5785 {
5786 if (reg_prev_sub != NULL)
5787 {
5788 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
5789 prevlen = (int)STRLEN(reg_prev_sub);
5790 tmpsub = alloc((unsigned)(STRLEN(newsub) + prevlen));
5791 if (tmpsub != NULL)
5792 {
5793 /* copy prefix */
5794 len = (int)(p - newsub); /* not including ~ */
5795 mch_memmove(tmpsub, newsub, (size_t)len);
5796 /* interpretate tilde */
5797 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
5798 /* copy postfix */
5799 if (!magic)
5800 ++p; /* back off \ */
5801 STRCPY(tmpsub + len + prevlen, p + 1);
5802
5803 if (newsub != source) /* already allocated newsub */
5804 vim_free(newsub);
5805 newsub = tmpsub;
5806 p = newsub + len + prevlen;
5807 }
5808 }
5809 else if (magic)
5810 STRCPY(p, p + 1); /* remove '~' */
5811 else
5812 STRCPY(p, p + 2); /* remove '\~' */
5813 --p;
5814 }
5815 else
5816 {
5817 if (*p == '\\' && p[1]) /* skip escaped characters */
5818 ++p;
5819#ifdef FEAT_MBYTE
5820 if (has_mbyte)
5821 p += (*mb_ptr2len_check)(p) - 1;
5822#endif
5823 }
5824 }
5825
5826 vim_free(reg_prev_sub);
5827 if (newsub != source) /* newsub was allocated, just keep it */
5828 reg_prev_sub = newsub;
5829 else /* no ~ found, need to save newsub */
5830 reg_prev_sub = vim_strsave(newsub);
5831 return newsub;
5832}
5833
5834#ifdef FEAT_EVAL
5835static int can_f_submatch = FALSE; /* TRUE when submatch() can be used */
5836
5837/* These pointers are used instead of reg_match and reg_mmatch for
5838 * reg_submatch(). Needed for when the substitution string is an expression
5839 * that contains a call to substitute() and submatch(). */
5840static regmatch_T *submatch_match;
5841static regmmatch_T *submatch_mmatch;
5842#endif
5843
5844#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
5845/*
5846 * vim_regsub() - perform substitutions after a vim_regexec() or
5847 * vim_regexec_multi() match.
5848 *
5849 * If "copy" is TRUE really copy into "dest".
5850 * If "copy" is FALSE nothing is copied, this is just to find out the length
5851 * of the result.
5852 *
5853 * If "backslash" is TRUE, a backslash will be removed later, need to double
5854 * them to keep them, and insert a backslash before a CR to avoid it being
5855 * replaced with a line break later.
5856 *
5857 * Note: The matched text must not change between the call of
5858 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
5859 * references invalid!
5860 *
5861 * Returns the size of the replacement, including terminating NUL.
5862 */
5863 int
5864vim_regsub(rmp, source, dest, copy, magic, backslash)
5865 regmatch_T *rmp;
5866 char_u *source;
5867 char_u *dest;
5868 int copy;
5869 int magic;
5870 int backslash;
5871{
5872 reg_match = rmp;
5873 reg_mmatch = NULL;
5874 reg_maxline = 0;
5875 return vim_regsub_both(source, dest, copy, magic, backslash);
5876}
5877#endif
5878
5879 int
5880vim_regsub_multi(rmp, lnum, source, dest, copy, magic, backslash)
5881 regmmatch_T *rmp;
5882 linenr_T lnum;
5883 char_u *source;
5884 char_u *dest;
5885 int copy;
5886 int magic;
5887 int backslash;
5888{
5889 reg_match = NULL;
5890 reg_mmatch = rmp;
5891 reg_buf = curbuf; /* always works on the current buffer! */
5892 reg_firstlnum = lnum;
5893 reg_maxline = curbuf->b_ml.ml_line_count - lnum;
5894 return vim_regsub_both(source, dest, copy, magic, backslash);
5895}
5896
5897 static int
5898vim_regsub_both(source, dest, copy, magic, backslash)
5899 char_u *source;
5900 char_u *dest;
5901 int copy;
5902 int magic;
5903 int backslash;
5904{
5905 char_u *src;
5906 char_u *dst;
5907 char_u *s;
5908 int c;
5909 int no = -1;
5910 fptr func = (fptr)NULL;
5911 linenr_T clnum = 0; /* init for GCC */
5912 int len = 0; /* init for GCC */
5913#ifdef FEAT_EVAL
5914 static char_u *eval_result = NULL;
5915#endif
5916#ifdef FEAT_MBYTE
5917 int l;
5918#endif
5919
5920
5921 /* Be paranoid... */
5922 if (source == NULL || dest == NULL)
5923 {
5924 EMSG(_(e_null));
5925 return 0;
5926 }
5927 if (prog_magic_wrong())
5928 return 0;
5929 src = source;
5930 dst = dest;
5931
5932 /*
5933 * When the substitute part starts with "\=" evaluate it as an expression.
5934 */
5935 if (source[0] == '\\' && source[1] == '='
5936#ifdef FEAT_EVAL
5937 && !can_f_submatch /* can't do this recursively */
5938#endif
5939 )
5940 {
5941#ifdef FEAT_EVAL
5942 /* To make sure that the length doesn't change between checking the
5943 * length and copying the string, and to speed up things, the
5944 * resulting string is saved from the call with "copy" == FALSE to the
5945 * call with "copy" == TRUE. */
5946 if (copy)
5947 {
5948 if (eval_result != NULL)
5949 {
5950 STRCPY(dest, eval_result);
5951 dst += STRLEN(eval_result);
5952 vim_free(eval_result);
5953 eval_result = NULL;
5954 }
5955 }
5956 else
5957 {
5958 linenr_T save_reg_maxline;
5959 win_T *save_reg_win;
5960 int save_ireg_ic;
5961
5962 vim_free(eval_result);
5963
5964 /* The expression may contain substitute(), which calls us
5965 * recursively. Make sure submatch() gets the text from the first
5966 * level. Don't need to save "reg_buf", because
5967 * vim_regexec_multi() can't be called recursively. */
5968 submatch_match = reg_match;
5969 submatch_mmatch = reg_mmatch;
5970 save_reg_maxline = reg_maxline;
5971 save_reg_win = reg_win;
5972 save_ireg_ic = ireg_ic;
5973 can_f_submatch = TRUE;
5974
5975 eval_result = eval_to_string(source + 2, NULL);
5976 if (eval_result != NULL)
5977 {
5978 for (s = eval_result; *s != NUL; ++s)
5979 {
5980 /* Change NL to CR, so that it becomes a line break.
5981 * Skip over a backslashed character. */
5982 if (*s == NL)
5983 *s = CAR;
5984 else if (*s == '\\' && s[1] != NUL)
5985 ++s;
5986#ifdef FEAT_MBYTE
5987 if (has_mbyte)
5988 s += (*mb_ptr2len_check)(s) - 1;
5989#endif
5990 }
5991
5992 dst += STRLEN(eval_result);
5993 }
5994
5995 reg_match = submatch_match;
5996 reg_mmatch = submatch_mmatch;
5997 reg_maxline = save_reg_maxline;
5998 reg_win = save_reg_win;
5999 ireg_ic = save_ireg_ic;
6000 can_f_submatch = FALSE;
6001 }
6002#endif
6003 }
6004 else
6005 while ((c = *src++) != NUL)
6006 {
6007 if (c == '&' && magic)
6008 no = 0;
6009 else if (c == '\\' && *src != NUL)
6010 {
6011 if (*src == '&' && !magic)
6012 {
6013 ++src;
6014 no = 0;
6015 }
6016 else if ('0' <= *src && *src <= '9')
6017 {
6018 no = *src++ - '0';
6019 }
6020 else if (vim_strchr((char_u *)"uUlLeE", *src))
6021 {
6022 switch (*src++)
6023 {
6024 case 'u': func = (fptr)do_upper;
6025 continue;
6026 case 'U': func = (fptr)do_Upper;
6027 continue;
6028 case 'l': func = (fptr)do_lower;
6029 continue;
6030 case 'L': func = (fptr)do_Lower;
6031 continue;
6032 case 'e':
6033 case 'E': func = (fptr)NULL;
6034 continue;
6035 }
6036 }
6037 }
6038 if (no < 0) /* Ordinary character. */
6039 {
6040 if (c == '\\' && *src != NUL)
6041 {
6042 /* Check for abbreviations -- webb */
6043 switch (*src)
6044 {
6045 case 'r': c = CAR; ++src; break;
6046 case 'n': c = NL; ++src; break;
6047 case 't': c = TAB; ++src; break;
6048 /* Oh no! \e already has meaning in subst pat :-( */
6049 /* case 'e': c = ESC; ++src; break; */
6050 case 'b': c = Ctrl_H; ++src; break;
6051
6052 /* If "backslash" is TRUE the backslash will be removed
6053 * later. Used to insert a literal CR. */
6054 default: if (backslash)
6055 {
6056 if (copy)
6057 *dst = '\\';
6058 ++dst;
6059 }
6060 c = *src++;
6061 }
6062 }
6063
6064 /* Write to buffer, if copy is set. */
6065#ifdef FEAT_MBYTE
6066 if (has_mbyte && (l = (*mb_ptr2len_check)(src - 1)) > 1)
6067 {
6068 /* TODO: should use "func" here. */
6069 if (copy)
6070 mch_memmove(dst, src - 1, l);
6071 dst += l - 1;
6072 src += l - 1;
6073 }
6074 else
6075 {
6076#endif
6077 if (copy)
6078 {
6079 if (func == (fptr)NULL) /* just copy */
6080 *dst = c;
6081 else /* change case */
6082 func = (fptr)(func(dst, c));
6083 /* Turbo C complains without the typecast */
6084 }
6085#ifdef FEAT_MBYTE
6086 }
6087#endif
6088 dst++;
6089 }
6090 else
6091 {
6092 if (REG_MULTI)
6093 {
6094 clnum = reg_mmatch->startpos[no].lnum;
6095 if (clnum < 0 || reg_mmatch->endpos[no].lnum < 0)
6096 s = NULL;
6097 else
6098 {
6099 s = reg_getline(clnum) + reg_mmatch->startpos[no].col;
6100 if (reg_mmatch->endpos[no].lnum == clnum)
6101 len = reg_mmatch->endpos[no].col
6102 - reg_mmatch->startpos[no].col;
6103 else
6104 len = (int)STRLEN(s);
6105 }
6106 }
6107 else
6108 {
6109 s = reg_match->startp[no];
6110 if (reg_match->endp[no] == NULL)
6111 s = NULL;
6112 else
6113 len = (int)(reg_match->endp[no] - s);
6114 }
6115 if (s != NULL)
6116 {
6117 for (;;)
6118 {
6119 if (len == 0)
6120 {
6121 if (REG_MULTI)
6122 {
6123 if (reg_mmatch->endpos[no].lnum == clnum)
6124 break;
6125 if (copy)
6126 *dst = CAR;
6127 ++dst;
6128 s = reg_getline(++clnum);
6129 if (reg_mmatch->endpos[no].lnum == clnum)
6130 len = reg_mmatch->endpos[no].col;
6131 else
6132 len = (int)STRLEN(s);
6133 }
6134 else
6135 break;
6136 }
6137 else if (*s == NUL) /* we hit NUL. */
6138 {
6139 if (copy)
6140 EMSG(_(e_re_damg));
6141 goto exit;
6142 }
6143 else
6144 {
6145 if (backslash && (*s == CAR || *s == '\\'))
6146 {
6147 /*
6148 * Insert a backslash in front of a CR, otherwise
6149 * it will be replaced by a line break.
6150 * Number of backslashes will be halved later,
6151 * double them here.
6152 */
6153 if (copy)
6154 {
6155 dst[0] = '\\';
6156 dst[1] = *s;
6157 }
6158 dst += 2;
6159 }
6160#ifdef FEAT_MBYTE
6161 else if (has_mbyte && (l = (*mb_ptr2len_check)(s)) > 1)
6162 {
6163 /* TODO: should use "func" here. */
6164 if (copy)
6165 mch_memmove(dst, s, l);
6166 dst += l;
6167 s += l - 1;
6168 len -= l - 1;
6169 }
6170#endif
6171 else
6172 {
6173 if (copy)
6174 {
6175 if (func == (fptr)NULL) /* just copy */
6176 *dst = *s;
6177 else /* change case */
6178 func = (fptr)(func(dst, *s));
6179 /* Turbo C complains without the typecast */
6180 }
6181 ++dst;
6182 }
6183 ++s;
6184 --len;
6185 }
6186 }
6187 }
6188 no = -1;
6189 }
6190 }
6191 if (copy)
6192 *dst = NUL;
6193
6194exit:
6195 return (int)((dst - dest) + 1);
6196}
6197
6198#ifdef FEAT_EVAL
6199/*
6200 * Used for the submatch() function: get the string from tne n'th submatch in
6201 * allocated memory.
6202 * Returns NULL when not in a ":s" command and for a non-existing submatch.
6203 */
6204 char_u *
6205reg_submatch(no)
6206 int no;
6207{
6208 char_u *retval = NULL;
6209 char_u *s;
6210 int len;
6211 int round;
6212 linenr_T lnum;
6213
6214 if (!can_f_submatch)
6215 return NULL;
6216
6217 if (submatch_match == NULL)
6218 {
6219 /*
6220 * First round: compute the length and allocate memory.
6221 * Second round: copy the text.
6222 */
6223 for (round = 1; round <= 2; ++round)
6224 {
6225 lnum = submatch_mmatch->startpos[no].lnum;
6226 if (lnum < 0 || submatch_mmatch->endpos[no].lnum < 0)
6227 return NULL;
6228
6229 s = reg_getline(lnum) + submatch_mmatch->startpos[no].col;
6230 if (s == NULL) /* anti-crash check, cannot happen? */
6231 break;
6232 if (submatch_mmatch->endpos[no].lnum == lnum)
6233 {
6234 /* Within one line: take form start to end col. */
6235 len = submatch_mmatch->endpos[no].col
6236 - submatch_mmatch->startpos[no].col;
6237 if (round == 2)
6238 {
6239 STRNCPY(retval, s, len);
6240 retval[len] = NUL;
6241 }
6242 ++len;
6243 }
6244 else
6245 {
6246 /* Multiple lines: take start line from start col, middle
6247 * lines completely and end line up to end col. */
6248 len = (int)STRLEN(s);
6249 if (round == 2)
6250 {
6251 STRCPY(retval, s);
6252 retval[len] = '\n';
6253 }
6254 ++len;
6255 ++lnum;
6256 while (lnum < submatch_mmatch->endpos[no].lnum)
6257 {
6258 s = reg_getline(lnum++);
6259 if (round == 2)
6260 STRCPY(retval + len, s);
6261 len += (int)STRLEN(s);
6262 if (round == 2)
6263 retval[len] = '\n';
6264 ++len;
6265 }
6266 if (round == 2)
6267 STRNCPY(retval + len, reg_getline(lnum),
6268 submatch_mmatch->endpos[no].col);
6269 len += submatch_mmatch->endpos[no].col;
6270 if (round == 2)
6271 retval[len] = NUL;
6272 ++len;
6273 }
6274
6275 if (round == 1)
6276 {
6277 retval = lalloc((long_u)len, TRUE);
6278 if (s == NULL)
6279 return NULL;
6280 }
6281 }
6282 }
6283 else
6284 {
6285 if (submatch_match->endp[no] == NULL)
6286 retval = NULL;
6287 else
6288 {
6289 s = submatch_match->startp[no];
6290 retval = vim_strnsave(s, (int)(submatch_match->endp[no] - s));
6291 }
6292 }
6293
6294 return retval;
6295}
6296#endif