blob: dac34a2b3bcf6f31d2024889d04d33a220162db3 [file] [log] [blame]
Bram Moolenaar071d4272004-06-13 20:20:40 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 *
5 * NOTICE:
6 *
7 * This is NOT the original regular expression code as written by Henry
8 * Spencer. This code has been modified specifically for use with the VIM
9 * editor, and should not be used separately from Vim. If you want a good
10 * regular expression library, get the original code. The copyright notice
11 * that follows is from the original.
12 *
13 * END NOTICE
14 *
15 * Copyright (c) 1986 by University of Toronto.
16 * Written by Henry Spencer. Not derived from licensed software.
17 *
18 * Permission is granted to anyone to use this software for any
19 * purpose on any computer system, and to redistribute it freely,
20 * subject to the following restrictions:
21 *
22 * 1. The author is not responsible for the consequences of use of
23 * this software, no matter how awful, even if they arise
24 * from defects in it.
25 *
26 * 2. The origin of this software must not be misrepresented, either
27 * by explicit claim or by omission.
28 *
29 * 3. Altered versions must be plainly marked as such, and must not
30 * be misrepresented as being the original software.
31 *
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
35 *
Bram Moolenaarc0197e22004-09-13 20:26:32 +000036 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * Webb, Ciaran McCreesh and Bram Moolenaar.
Bram Moolenaar071d4272004-06-13 20:20:40 +000038 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */
40
41#include "vim.h"
42
43#undef DEBUG
44
45/*
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
49 *
50 * regstart char that must begin a match; NUL if none obvious; Can be a
51 * multi-byte character.
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
55 * regflags RF_ values or'ed together
56 *
57 * Regstart and reganch permit very fast decisions on suitable starting points
58 * for a match, cutting down the work a lot. Regmust permits fast rejection
59 * of lines that cannot possibly match. The regmust tests are costly enough
60 * that vim_regcomp() supplies a regmust only if the r.e. contains something
61 * potentially expensive (at present, the only such thing detected is * or +
62 * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
64 * computing it anyway.
65 */
66
67/*
68 * Structure for regexp "program". This is essentially a linear encoding
69 * of a nondeterministic finite-state machine (aka syntax charts or
70 * "railroad normal form" in parsing technology). Each node is an opcode
71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
72 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
73 * pointer with a BRANCH on both ends of it is connecting two alternatives.
74 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
75 * (as opposed to a collection of them) is never concatenated with anything
76 * because of operator precedence). The "next" pointer of a BRACE_COMPLEX
77 * node points to the node after the stuff to be repeated. The operand of some
78 * types of node is a literal string; for others, it is a node leading into a
79 * sub-FSM. In particular, the operand of a BRANCH node is the first node of
80 * the branch. (NB this is *not* a tree structure: the tail of the branch
81 * connects to the thing following the set of BRANCHes.)
82 *
83 * pattern is coded like:
84 *
85 * +-----------------+
86 * | V
87 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
88 * | ^ | ^
89 * +------+ +----------+
90 *
91 *
92 * +------------------+
93 * V |
94 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
95 * | | ^ ^
96 * | +---------------+ |
97 * +---------------------------------------------+
98 *
99 *
100 * +-------------------------+
101 * V |
102 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
103 * | | ^
104 * | +----------------+
105 * +-----------------------------------------------+
106 *
107 *
108 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
109 * | | ^ ^
110 * | +----------------+ |
111 * +--------------------------------+
112 *
113 * +---------+
114 * | V
115 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
116 * | | | | ^ ^
117 * | | | +-----+ |
118 * | | +----------------+ |
119 * | +---------------------------+ |
120 * +------------------------------------------------------+
121 *
122 * They all start with a BRANCH for "\|" alternatives, even when there is only
123 * one alternative.
124 */
125
/*
 * The opcodes of the compiled program.
 *
 * The "opnd" column names the operand that follows the node, if any:
 * "node" is a sub-program, "str" a literal string, "nr" a number.
 */

/* definition          number  opnd?   meaning */
#define END             0   /*       End of program or NOMATCH operand. */
#define BOL             1   /*       Match "" at beginning of line. */
#define EOL             2   /*       Match "" at end of line. */
#define BRANCH          3   /* node  Match this alternative, or the
                             *       next... */
#define BACK            4   /*       Match "", "next" ptr points backward. */
#define EXACTLY         5   /* str   Match this string. */
#define NOTHING         6   /*       Match empty string. */
#define STAR            7   /* node  Match this (simple) thing 0 or more
                             *       times. */
#define PLUS            8   /* node  Match this (simple) thing 1 or more
                             *       times. */
#define MATCH           9   /* node  Match the operand zero-width. */
#define NOMATCH         10  /* node  Check for no match with operand. */
#define BEHIND          11  /* node  Look behind for a match with operand. */
#define NOBEHIND        12  /* node  Look behind for no match with
                             *       operand. */
#define SUBPAT          13  /* node  Match the operand here. */
#define BRACE_SIMPLE    14  /* node  Match this (simple) thing between m and
                             *       n times (\{m,n\}). */
#define BOW             15  /*       Match "" after [^a-zA-Z0-9_] */
#define EOW             16  /*       Match "" at [^a-zA-Z0-9_] */
#define BRACE_LIMITS    17  /* nr nr Define the min & max for BRACE_SIMPLE
                             *       and BRACE_COMPLEX. */
#define NEWL            18  /*       Match line-break. */
#define BHPOS           19  /*       End position for BEHIND or NOBEHIND. */
156
157
/*
 * Character classes: 20-48 are the normal items, 50-78 (the normal opcode
 * plus ADD_NL) are the same items that also match a line-break.
 *
 * FIRST_NL and LAST_NL are fully parenthesized so that they can be used
 * safely inside any larger expression.
 */
#define ADD_NL          30
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20  /*      Match any one character. */
#define ANYOF           21  /* str  Match any character in this string. */
#define ANYBUT          22  /* str  Match any character not in this
                             *      string. */
#define IDENT           23  /*      Match identifier char */
#define SIDENT          24  /*      Match identifier char but no digit */
#define KWORD           25  /*      Match keyword char */
#define SKWORD          26  /*      Match word char but no digit */
#define FNAME           27  /*      Match file name char */
#define SFNAME          28  /*      Match file name char but no digit */
#define PRINT           29  /*      Match printable char */
#define SPRINT          30  /*      Match printable char but no digit */
#define WHITE           31  /*      Match whitespace char */
#define NWHITE          32  /*      Match non-whitespace char */
#define DIGIT           33  /*      Match digit char */
#define NDIGIT          34  /*      Match non-digit char */
#define HEX             35  /*      Match hex char */
#define NHEX            36  /*      Match non-hex char */
#define OCTAL           37  /*      Match octal char */
#define NOCTAL          38  /*      Match non-octal char */
#define WORD            39  /*      Match word char */
#define NWORD           40  /*      Match non-word char */
#define HEAD            41  /*      Match head char */
#define NHEAD           42  /*      Match non-head char */
#define ALPHA           43  /*      Match alpha char */
#define NALPHA          44  /*      Match non-alpha char */
#define LOWER           45  /*      Match lowercase char */
#define NLOWER          46  /*      Match non-lowercase char */
#define UPPER           47  /*      Match uppercase char */
#define NUPPER          48  /*      Match non-uppercase char */
#define LAST_NL         (NUPPER + ADD_NL)
/* TRUE when opcode "op" is a character class that also matches a
 * line-break.  Note that "op" is evaluated twice. */
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
193
/*
 * Opcodes that come in groups of ten: the sub-expression number is added to
 * the base value.
 */
#define MOPEN           80  /* 80-89        Mark this point in input as start
                             *              of \( subexpr.  MOPEN + 0 marks
                             *              start of match. */
#define MCLOSE          90  /* 90-99        Analogous to MOPEN.  MCLOSE + 0
                             *              marks end of match. */
#define BACKREF         100 /* 100-109 node Match same string again \1-\9 */

#ifdef FEAT_SYN_HL
# define ZOPEN          110 /* 110-119      Mark this point in input as start
                             *              of \z( subexpr. */
# define ZCLOSE         120 /* 120-129      Analogous to ZOPEN. */
# define ZREF           130 /* 130-139 node Match external submatch
                             *              \z1-\z9 */
#endif

#define BRACE_COMPLEX   140 /* 140-149 node Match nodes between m & n times */

#define NOPEN           150 /*      Mark this point in input as start of
                             *      \%( subexpr. */
#define NCLOSE          151 /*      Analogous to NOPEN. */

#define MULTIBYTECODE   200 /* mbc  Match one multi-byte character */
#define RE_BOF          201 /*      Match "" at beginning of file. */
#define RE_EOF          202 /*      Match "" at end of file. */
#define CURSOR          203 /*      Match location of cursor. */

#define RE_LNUM         204 /* nr cmp  Match line number */
#define RE_COL          205 /* nr cmp  Match column number */
#define RE_VCOL         206 /* nr cmp  Match virtual column number */
222
223/*
224 * Magic characters have a special meaning, they don't match literally.
225 * Magic characters are negative. This separates them from literal characters
226 * (possibly multi-byte). Only ASCII characters can be Magic.
227 */
228#define Magic(x) ((int)(x) - 256)
229#define un_Magic(x) ((x) + 256)
230#define is_Magic(x) ((x) < 0)
231
232static int no_Magic __ARGS((int x));
233static int toggle_Magic __ARGS((int x));
234
/*
 * Return "x" without magicness: a Magic value is turned back into its
 * literal character, any other value is returned as it is.
 */
    static int
no_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : x;
}
243
/*
 * Invert the magicness of "x": a Magic value becomes the literal character,
 * a literal character becomes its Magic value.
 */
    static int
toggle_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
252
/*
 * Magic number stored in the first byte of the regexp internal "program";
 * the start node begins in the second byte.  It is used to catch the most
 * severe mutilation of the program by the caller.
 */
#define REGMAGIC        0234
260
261/*
262 * Opcode notes:
263 *
264 * BRANCH The set of branches constituting a single choice are hooked
265 * together with their "next" pointers, since precedence prevents
266 * anything being concatenated to any individual branch. The
267 * "next" pointer of the last BRANCH in a choice points to the
268 * thing following the whole choice. This is also where the
269 * final "next" pointer of each individual branch points; each
270 * branch starts with the operand node of a BRANCH node.
271 *
272 * BACK Normal "next" pointers all implicitly point forward; BACK
273 * exists to make loop structures possible.
274 *
275 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
276 * BRANCH structures using BACK. Simple cases (one character
277 * per match) are implemented with STAR and PLUS for speed
278 * and to minimize recursive plunges.
279 *
280 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
281 * node, and defines the min and max limits to be used for that
282 * node.
283 *
284 * MOPEN,MCLOSE ...are numbered at compile time.
285 * ZOPEN,ZCLOSE ...ditto
286 */
287
/*
 * Layout of a node: one char of opcode followed by two chars of "next"
 * pointer.  The "next" pointer is stored as two 8-bit bytes, high order
 * first; its value is a positive offset from the opcode of the node that
 * contains it.  An operand, if any, simply follows the node.  (Note that
 * much of the code generation knows about this implicit relationship.)
 *
 * Two bytes for the "next" pointer is vast overkill for most things, but it
 * allows patterns to get big without disasters.
 */
#define OP(p)           ((int)*(p))
#define NEXT(p)         ((((p)[1] & 0377) << 8) + ((p)[2] & 0377))
#define OPERAND(p)      ((p) + 3)
/* Obtain an operand that was stored as four bytes, MSB first. */
#define OPERAND_MIN(p)  (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
                            + ((long)(p)[5] << 8) + (long)(p)[6])
/* Obtain a second operand stored as four bytes, right after the first. */
#define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
/* Obtain a second single-byte operand stored after a four bytes operand. */
#define OPERAND_CMP(p)  (p)[7]
308
309/*
310 * Utility definitions.
311 */
312#define UCHARAT(p) ((int)*(char_u *)(p))
313
314/* Used for an error (down from) vim_regcomp(): give the error message, set
315 * rc_did_emsg and return NULL */
316#define EMSG_RET_NULL(m) { EMSG(m); rc_did_emsg = TRUE; return NULL; }
317#define EMSG_M_RET_NULL(m, c) { EMSG2(m, c ? "" : "\\"); rc_did_emsg = TRUE; return NULL; }
318#define EMSG_RET_FAIL(m) { EMSG(m); rc_did_emsg = TRUE; return FAIL; }
319#define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
320
321#define MAX_LIMIT (32767L << 16L)
322
323static int re_multi_type __ARGS((int));
324static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
325static char_u *cstrchr __ARGS((char_u *, int));
326
327#ifdef DEBUG
328static void regdump __ARGS((char_u *, regprog_T *));
329static char_u *regprop __ARGS((char_u *));
330#endif
331
332#define NOT_MULTI 0
333#define MULTI_ONE 1
334#define MULTI_MULT 2
335/*
336 * Return NOT_MULTI if c is not a "multi" operator.
337 * Return MULTI_ONE if c is a single "multi" operator.
338 * Return MULTI_MULT if c is a multi "multi" operator.
339 */
340 static int
341re_multi_type(c)
342 int c;
343{
344 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
345 return MULTI_ONE;
346 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
347 return MULTI_MULT;
348 return NOT_MULTI;
349}
350
351/*
352 * Flags to be passed up and down.
353 */
354#define HASWIDTH 0x1 /* Known never to match null string. */
355#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
356#define SPSTART 0x4 /* Starts with * or +. */
357#define HASNL 0x8 /* Contains some \n. */
358#define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */
359#define WORST 0 /* Worst case. */
360
361/*
362 * When regcode is set to this value, code is not emitted and size is computed
363 * instead.
364 */
365#define JUST_CALC_SIZE ((char_u *) -1)
366
367static char_u *reg_prev_sub;
368
369/*
370 * REGEXP_INRANGE contains all characters which are always special in a []
371 * range after '\'.
372 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
373 * These are:
374 * \n - New line (NL).
375 * \r - Carriage Return (CR).
376 * \t - Tab (TAB).
377 * \e - Escape (ESC).
378 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000379 * \d - Character code in decimal, eg \d123
380 * \o - Character code in octal, eg \o80
381 * \x - Character code in hex, eg \x4a
382 * \u - Multibyte character code, eg \u20ac
383 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384 */
385static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000386static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000387
388static int backslash_trans __ARGS((int c));
389static int skip_class_name __ARGS((char_u **pp));
390static char_u *skip_anyof __ARGS((char_u *p));
391static void init_class_tab __ARGS((void));
392
393/*
394 * Translate '\x' to its control character, except "\n", which is Magic.
395 */
396 static int
397backslash_trans(c)
398 int c;
399{
400 switch (c)
401 {
402 case 'r': return CAR;
403 case 't': return TAB;
404 case 'e': return ESC;
405 case 'b': return BS;
406 }
407 return c;
408}
409
410/*
411 * Check for a character class name. "pp" points to the '['.
412 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
413 * recognized. Otherwise "pp" is advanced to after the item.
414 */
415 static int
416skip_class_name(pp)
417 char_u **pp;
418{
419 static const char *(class_names[]) =
420 {
421 "alnum:]",
422#define CLASS_ALNUM 0
423 "alpha:]",
424#define CLASS_ALPHA 1
425 "blank:]",
426#define CLASS_BLANK 2
427 "cntrl:]",
428#define CLASS_CNTRL 3
429 "digit:]",
430#define CLASS_DIGIT 4
431 "graph:]",
432#define CLASS_GRAPH 5
433 "lower:]",
434#define CLASS_LOWER 6
435 "print:]",
436#define CLASS_PRINT 7
437 "punct:]",
438#define CLASS_PUNCT 8
439 "space:]",
440#define CLASS_SPACE 9
441 "upper:]",
442#define CLASS_UPPER 10
443 "xdigit:]",
444#define CLASS_XDIGIT 11
445 "tab:]",
446#define CLASS_TAB 12
447 "return:]",
448#define CLASS_RETURN 13
449 "backspace:]",
450#define CLASS_BACKSPACE 14
451 "escape:]",
452#define CLASS_ESCAPE 15
453 };
454#define CLASS_NONE 99
455 int i;
456
457 if ((*pp)[1] == ':')
458 {
459 for (i = 0; i < sizeof(class_names) / sizeof(*class_names); ++i)
460 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
461 {
462 *pp += STRLEN(class_names[i]) + 2;
463 return i;
464 }
465 }
466 return CLASS_NONE;
467}
468
469/*
470 * Skip over a "[]" range.
471 * "p" must point to the character after the '['.
472 * The returned pointer is on the matching ']', or the terminating NUL.
473 */
474 static char_u *
475skip_anyof(p)
476 char_u *p;
477{
478 int cpo_lit; /* 'cpoptions' contains 'l' flag */
479#ifdef FEAT_MBYTE
480 int l;
481#endif
482
483 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
484
485 if (*p == '^') /* Complement of range. */
486 ++p;
487 if (*p == ']' || *p == '-')
488 ++p;
489 while (*p != NUL && *p != ']')
490 {
491#ifdef FEAT_MBYTE
492 if (has_mbyte && (l = (*mb_ptr2len_check)(p)) > 1)
493 p += l;
494 else
495#endif
496 if (*p == '-')
497 {
498 ++p;
499 if (*p != ']' && *p != NUL)
500 {
501#ifdef FEAT_MBYTE
502 if (has_mbyte)
503 p += (*mb_ptr2len_check)(p);
504 else
505#endif
506 ++p;
507 }
508 }
509 else if (*p == '\\'
510 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
511 || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
512 p += 2;
513 else if (*p == '[')
514 {
515 if (skip_class_name(&p) == CLASS_NONE)
516 ++p; /* It was not a class name */
517 }
518 else
519 ++p;
520 }
521
522 return p;
523}
524
/*
 * Specific version of character class functions.
 * A table with one entry of RI_ flags per byte value keeps this fast.
 */
static short    class_tab[256];

#define RI_DIGIT    0x01
#define RI_HEX      0x02
#define RI_OCTAL    0x04
#define RI_WORD     0x08
#define RI_HEAD     0x10
#define RI_ALPHA    0x20
#define RI_LOWER    0x40
#define RI_UPPER    0x80
#define RI_WHITE    0x100
540
541 static void
542init_class_tab()
543{
544 int i;
545 static int done = FALSE;
546
547 if (done)
548 return;
549
550 for (i = 0; i < 256; ++i)
551 {
552 if (i >= '0' && i <= '7')
553 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
554 else if (i >= '8' && i <= '9')
555 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
556 else if (i >= 'a' && i <= 'f')
557 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
558#ifdef EBCDIC
559 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
560 || (i >= 's' && i <= 'z'))
561#else
562 else if (i >= 'g' && i <= 'z')
563#endif
564 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
565 else if (i >= 'A' && i <= 'F')
566 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
567#ifdef EBCDIC
568 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
569 || (i >= 'S' && i <= 'Z'))
570#else
571 else if (i >= 'G' && i <= 'Z')
572#endif
573 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
574 else if (i == '_')
575 class_tab[i] = RI_WORD + RI_HEAD;
576 else
577 class_tab[i] = 0;
578 }
579 class_tab[' '] |= RI_WHITE;
580 class_tab['\t'] |= RI_WHITE;
581 done = TRUE;
582}
583
/*
 * Character class tests that use class_tab[]; init_class_tab() must have
 * been called before using them.
 *
 * With FEAT_MBYTE the character can be above 255, class_tab[] has no entry
 * for it and the result is always false.  In that variant "c" is evaluated
 * twice, thus it must not have side effects.  The argument is parenthesized
 * so that any expression can be passed for it.
 */
#ifdef FEAT_MBYTE
# define ri_digit(c)    ((c) < 0x100 && (class_tab[c] & RI_DIGIT))
# define ri_hex(c)      ((c) < 0x100 && (class_tab[c] & RI_HEX))
# define ri_octal(c)    ((c) < 0x100 && (class_tab[c] & RI_OCTAL))
# define ri_word(c)     ((c) < 0x100 && (class_tab[c] & RI_WORD))
# define ri_head(c)     ((c) < 0x100 && (class_tab[c] & RI_HEAD))
# define ri_alpha(c)    ((c) < 0x100 && (class_tab[c] & RI_ALPHA))
# define ri_lower(c)    ((c) < 0x100 && (class_tab[c] & RI_LOWER))
# define ri_upper(c)    ((c) < 0x100 && (class_tab[c] & RI_UPPER))
# define ri_white(c)    ((c) < 0x100 && (class_tab[c] & RI_WHITE))
#else
# define ri_digit(c)    (class_tab[c] & RI_DIGIT)
# define ri_hex(c)      (class_tab[c] & RI_HEX)
# define ri_octal(c)    (class_tab[c] & RI_OCTAL)
# define ri_word(c)     (class_tab[c] & RI_WORD)
# define ri_head(c)     (class_tab[c] & RI_HEAD)
# define ri_alpha(c)    (class_tab[c] & RI_ALPHA)
# define ri_lower(c)    (class_tab[c] & RI_LOWER)
# define ri_upper(c)    (class_tab[c] & RI_UPPER)
# define ri_white(c)    (class_tab[c] & RI_WHITE)
#endif
605
606/* flags for regflags */
607#define RF_ICASE 1 /* ignore case */
608#define RF_NOICASE 2 /* don't ignore case */
609#define RF_HASNL 4 /* can match a NL */
610#define RF_ICOMBINE 8 /* ignore combining characters */
611#define RF_LOOKBH 16 /* uses "\@<=" or "\@<!" */
612
613/*
614 * Global work variables for vim_regcomp().
615 */
616
617static char_u *regparse; /* Input-scan pointer. */
618static int prevchr_len; /* byte length of previous char */
619static int num_complex_braces; /* Complex \{...} count */
620static int regnpar; /* () count. */
621#ifdef FEAT_SYN_HL
622static int regnzpar; /* \z() count. */
623static int re_has_z; /* \z item detected */
624#endif
625static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
626static long regsize; /* Code size. */
627static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */
628static unsigned regflags; /* RF_ flags for prog */
629static long brace_min[10]; /* Minimums for complex brace repeats */
630static long brace_max[10]; /* Maximums for complex brace repeats */
631static int brace_count[10]; /* Current counts for complex brace repeats */
632#if defined(FEAT_SYN_HL) || defined(PROTO)
633static int had_eol; /* TRUE when EOL found by vim_regcomp() */
634#endif
635static int one_exactly = FALSE; /* only do one char for EXACTLY */
636
637static int reg_magic; /* magicness of the pattern: */
638#define MAGIC_NONE 1 /* "\V" very unmagic */
639#define MAGIC_OFF 2 /* "\M" or 'magic' off */
640#define MAGIC_ON 3 /* "\m" or 'magic' */
641#define MAGIC_ALL 4 /* "\v" very magic */
642
643static int reg_string; /* matching with a string instead of a buffer
644 line */
645
646/*
647 * META contains all characters that may be magic, except '^' and '$'.
648 */
649
650#ifdef EBCDIC
651static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
652#else
653/* META[] is used often enough to justify turning it into a table. */
654static char_u META_flags[] = {
655 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
656 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
657/* % & ( ) * + . */
658 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
659/* 1 2 3 4 5 6 7 8 9 < = > ? */
660 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
661/* @ A C D F H I K L M O */
662 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
663/* P S U V W X Z [ _ */
664 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
665/* a c d f h i k l m n o */
666 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
667/* p s u v w x z { | ~ */
668 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
669};
670#endif
671
672static int curchr;
673
674/* arguments for reg() */
675#define REG_NOPAREN 0 /* toplevel reg() */
676#define REG_PAREN 1 /* \(\) */
677#define REG_ZPAREN 2 /* \z(\) */
678#define REG_NPAREN 3 /* \%(\) */
679
680/*
681 * Forward declarations for vim_regcomp()'s friends.
682 */
683static void initchr __ARGS((char_u *));
684static int getchr __ARGS((void));
685static void skipchr_keepstart __ARGS((void));
686static int peekchr __ARGS((void));
687static void skipchr __ARGS((void));
688static void ungetchr __ARGS((void));
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000689static int gethexchrs __ARGS((int maxinputlen));
690static int getoctchrs __ARGS((void));
691static int getdecchrs __ARGS((void));
692static int coll_get_char __ARGS((void));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000693static void regcomp_start __ARGS((char_u *expr, int flags));
694static char_u *reg __ARGS((int, int *));
695static char_u *regbranch __ARGS((int *flagp));
696static char_u *regconcat __ARGS((int *flagp));
697static char_u *regpiece __ARGS((int *));
698static char_u *regatom __ARGS((int *));
699static char_u *regnode __ARGS((int));
700static int prog_magic_wrong __ARGS((void));
701static char_u *regnext __ARGS((char_u *));
702static void regc __ARGS((int b));
703#ifdef FEAT_MBYTE
704static void regmbc __ARGS((int c));
705#endif
706static void reginsert __ARGS((int, char_u *));
707static void reginsert_limits __ARGS((int, long, long, char_u *));
708static char_u *re_put_long __ARGS((char_u *pr, long_u val));
709static int read_limits __ARGS((long *, long *));
710static void regtail __ARGS((char_u *, char_u *));
711static void regoptail __ARGS((char_u *, char_u *));
712
713/*
714 * Return TRUE if compiled regular expression "prog" can match a line break.
715 */
716 int
717re_multiline(prog)
718 regprog_T *prog;
719{
720 return (prog->regflags & RF_HASNL);
721}
722
723/*
724 * Return TRUE if compiled regular expression "prog" looks before the start
725 * position (pattern contains "\@<=" or "\@<!").
726 */
727 int
728re_lookbehind(prog)
729 regprog_T *prog;
730{
731 return (prog->regflags & RF_LOOKBH);
732}
733
734/*
735 * Skip past regular expression.
736 * Stop at end of 'p' of where 'dirc' is found ('/', '?', etc).
737 * Take care of characters with a backslash in front of it.
738 * Skip strings inside [ and ].
739 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
740 * expression and change "\?" to "?". If "*newp" is not NULL the expression
741 * is changed in-place.
742 */
743 char_u *
744skip_regexp(startp, dirc, magic, newp)
745 char_u *startp;
746 int dirc;
747 int magic;
748 char_u **newp;
749{
750 int mymagic;
751 char_u *p = startp;
752
753 if (magic)
754 mymagic = MAGIC_ON;
755 else
756 mymagic = MAGIC_OFF;
757
758 for (; p[0] != NUL; ++p)
759 {
760 if (p[0] == dirc) /* found end of regexp */
761 break;
762 if ((p[0] == '[' && mymagic >= MAGIC_ON)
763 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
764 {
765 p = skip_anyof(p + 1);
766 if (p[0] == NUL)
767 break;
768 }
769 else if (p[0] == '\\' && p[1] != NUL)
770 {
771 if (dirc == '?' && newp != NULL && p[1] == '?')
772 {
773 /* change "\?" to "?", make a copy first. */
774 if (*newp == NULL)
775 {
776 *newp = vim_strsave(startp);
777 if (*newp != NULL)
778 p = *newp + (p - startp);
779 }
780 if (*newp != NULL)
781 mch_memmove(p, p + 1, STRLEN(p));
782 else
783 ++p;
784 }
785 else
786 ++p; /* skip next character */
787 if (*p == 'v')
788 mymagic = MAGIC_ALL;
789 else if (*p == 'V')
790 mymagic = MAGIC_NONE;
791 }
792#ifdef FEAT_MBYTE
793 else if (has_mbyte)
794 p += (*mb_ptr2len_check)(p) - 1;
795#endif
796 }
797 return p;
798}
799
800/*
801 * vim_regcomp - compile a regular expression into internal code
802 *
803 * We can't allocate space until we know how big the compiled form will be,
804 * but we can't compile it (and thus know how big it is) until we've got a
805 * place to put the code. So we cheat: we compile it twice, once with code
806 * generation turned off and size counting turned on, and once "for real".
807 * This also means that we don't allocate space until we are sure that the
808 * thing really will compile successfully, and we never have to move the
809 * code and thus invalidate pointers into it. (Note that it has to be in
810 * one piece because vim_free() must be able to free it all.)
811 *
812 * Whether upper/lower case is to be ignored is decided when executing the
813 * program, it does not matter here.
814 *
815 * Beware that the optimization-preparation code in here knows about some
816 * of the structure of the compiled regexp.
817 * "re_flags": RE_MAGIC and/or RE_STRING.
818 */
819 regprog_T *
820vim_regcomp(expr, re_flags)
821 char_u *expr;
822 int re_flags;
823{
824 regprog_T *r;
825 char_u *scan;
826 char_u *longest;
827 int len;
828 int flags;
829
830 if (expr == NULL)
831 EMSG_RET_NULL(_(e_null));
832
833 init_class_tab();
834
835 /*
836 * First pass: determine size, legality.
837 */
838 regcomp_start(expr, re_flags);
839 regcode = JUST_CALC_SIZE;
840 regc(REGMAGIC);
841 if (reg(REG_NOPAREN, &flags) == NULL)
842 return NULL;
843
844 /* Small enough for pointer-storage convention? */
845#ifdef SMALL_MALLOC /* 16 bit storage allocation */
846 if (regsize >= 65536L - 256L)
847 EMSG_RET_NULL(_("E339: Pattern too long"));
848#endif
849
850 /* Allocate space. */
851 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE);
852 if (r == NULL)
853 return NULL;
854
855 /*
856 * Second pass: emit code.
857 */
858 regcomp_start(expr, re_flags);
859 regcode = r->program;
860 regc(REGMAGIC);
861 if (reg(REG_NOPAREN, &flags) == NULL)
862 {
863 vim_free(r);
864 return NULL;
865 }
866
867 /* Dig out information for optimizations. */
868 r->regstart = NUL; /* Worst-case defaults. */
869 r->reganch = 0;
870 r->regmust = NULL;
871 r->regmlen = 0;
872 r->regflags = regflags;
873 if (flags & HASNL)
874 r->regflags |= RF_HASNL;
875 if (flags & HASLOOKBH)
876 r->regflags |= RF_LOOKBH;
877#ifdef FEAT_SYN_HL
878 /* Remember whether this pattern has any \z specials in it. */
879 r->reghasz = re_has_z;
880#endif
881 scan = r->program + 1; /* First BRANCH. */
882 if (OP(regnext(scan)) == END) /* Only one top-level choice. */
883 {
884 scan = OPERAND(scan);
885
886 /* Starting-point info. */
887 if (OP(scan) == BOL || OP(scan) == RE_BOF)
888 {
889 r->reganch++;
890 scan = regnext(scan);
891 }
892
893 if (OP(scan) == EXACTLY)
894 {
895#ifdef FEAT_MBYTE
896 if (has_mbyte)
897 r->regstart = (*mb_ptr2char)(OPERAND(scan));
898 else
899#endif
900 r->regstart = *OPERAND(scan);
901 }
902 else if ((OP(scan) == BOW
903 || OP(scan) == EOW
904 || OP(scan) == NOTHING
905 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
906 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
907 && OP(regnext(scan)) == EXACTLY)
908 {
909#ifdef FEAT_MBYTE
910 if (has_mbyte)
911 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
912 else
913#endif
914 r->regstart = *OPERAND(regnext(scan));
915 }
916
917 /*
918 * If there's something expensive in the r.e., find the longest
919 * literal string that must appear and make it the regmust. Resolve
920 * ties in favor of later strings, since the regstart check works
921 * with the beginning of the r.e. and avoiding duplication
922 * strengthens checking. Not a strong reason, but sufficient in the
923 * absence of others.
924 */
925 /*
926 * When the r.e. starts with BOW, it is faster to look for a regmust
927 * first. Used a lot for "#" and "*" commands. (Added by mool).
928 */
929 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
930 && !(flags & HASNL))
931 {
932 longest = NULL;
933 len = 0;
934 for (; scan != NULL; scan = regnext(scan))
935 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
936 {
937 longest = OPERAND(scan);
938 len = (int)STRLEN(OPERAND(scan));
939 }
940 r->regmust = longest;
941 r->regmlen = len;
942 }
943 }
944#ifdef DEBUG
945 regdump(expr, r);
946#endif
947 return r;
948}
949
950/*
951 * Setup to parse the regexp. Used once to get the length and once to do it.
952 */
953 static void
954regcomp_start(expr, re_flags)
955 char_u *expr;
956 int re_flags; /* see vim_regcomp() */
957{
958 initchr(expr);
959 if (re_flags & RE_MAGIC)
960 reg_magic = MAGIC_ON;
961 else
962 reg_magic = MAGIC_OFF;
963 reg_string = (re_flags & RE_STRING);
964
965 num_complex_braces = 0;
966 regnpar = 1;
967 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
968#ifdef FEAT_SYN_HL
969 regnzpar = 1;
970 re_has_z = 0;
971#endif
972 regsize = 0L;
973 regflags = 0;
974#if defined(FEAT_SYN_HL) || defined(PROTO)
975 had_eol = FALSE;
976#endif
977}
978
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Tell whether the EOL item "$" was encountered while compiling the most
 * recent pattern.  The answer lives in a file-level flag that
 * regcomp_start() clears and regatom() sets; not pretty, but it works.
 */
    int
vim_regcomp_had_eol()
{
    int         seen_eol = had_eol;

    return seen_eol;
}
#endif
990
991/*
992 * reg - regular expression, i.e. main body or parenthesized thing
993 *
994 * Caller must absorb opening parenthesis.
995 *
996 * Combining parenthesis handling with the base level of regular expression
997 * is a trifle forced, but the need to tie the tails of the branches to what
998 * follows makes it hard to avoid.
999 */
1000 static char_u *
1001reg(paren, flagp)
1002 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
1003 int *flagp; /* out: HASWIDTH, SPSTART, HASNL and HASLOOKBH summary of all branches */
1004{
    /*
     * Returns the first node of the group: the opening node when "paren"
     * asks for one, otherwise the first branch.  NULL is returned when
     * regbranch() fails; the EMSG_*RET_NULL macros are defined elsewhere
     * and presumably report the error and return NULL as well.
     */
1005 char_u *ret;
1006 char_u *br;
1007 char_u *ender;
1008 int parno = 0; /* stays 0 unless a numbered MOPEN/ZOPEN node is made */
1009 int flags;
1010
1011 *flagp = HASWIDTH; /* Tentatively. */
1012
1013#ifdef FEAT_SYN_HL
1014 if (paren == REG_ZPAREN)
1015 {
1016 /* Make a ZOPEN node. */
1017 if (regnzpar >= NSUBEXP)
1018 EMSG_RET_NULL(_("E50: Too many \\z("));
1019 parno = regnzpar;
1020 regnzpar++;
1021 ret = regnode(ZOPEN + parno);
1022 }
1023 else
1024#endif
1025 if (paren == REG_PAREN)
1026 {
1027 /* Make a MOPEN node. */
1028 if (regnpar >= NSUBEXP)
1029 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1030 parno = regnpar;
1031 ++regnpar;
1032 ret = regnode(MOPEN + parno);
1033 }
1034 else if (paren == REG_NPAREN)
1035 {
1036 /* Make a NOPEN node. */
1037 ret = regnode(NOPEN);
1038 }
1039 else
    /* REG_NOPAREN: no opening node, the first branch becomes the result. */
1040 ret = NULL;
1041
1042 /* Pick up the branches, linking them together. */
1043 br = regbranch(&flags);
1044 if (br == NULL)
1045 return NULL;
1046 if (ret != NULL)
1047 regtail(ret, br); /* [MZ]OPEN -> first. */
1048 else
1049 ret = br;
1050 /* If one of the branches can be zero-width, the whole thing can.
1051 * If one of the branches has * at start or matches a line-break, the
1052 * whole thing can. */
1053 if (!(flags & HASWIDTH))
1054 *flagp &= ~HASWIDTH;
1055 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
    /* Each further alternative gets the same flag treatment as the first. */
1056 while (peekchr() == Magic('|'))
1057 {
1058 skipchr();
1059 br = regbranch(&flags);
1060 if (br == NULL)
1061 return NULL;
1062 regtail(ret, br); /* BRANCH -> BRANCH. */
1063 if (!(flags & HASWIDTH))
1064 *flagp &= ~HASWIDTH;
1065 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1066 }
1067
1068 /* Make a closing node, and hook it on the end. */
    /* The closing opcode mirrors the opening one; plain END when there are
     * no parentheses. */
1069 ender = regnode(
1070#ifdef FEAT_SYN_HL
1071 paren == REG_ZPAREN ? ZCLOSE + parno :
1072#endif
1073 paren == REG_PAREN ? MCLOSE + parno :
1074 paren == REG_NPAREN ? NCLOSE : END);
1075 regtail(ret, ender);
1076
1077 /* Hook the tails of the branches to the closing node. */
1078 for (br = ret; br != NULL; br = regnext(br))
1079 regoptail(br, ender);
1080
1081 /* Check for proper termination. */
    /* Inside parentheses the next character is taken with getchr() and
     * must be the closing ')'. */
1082 if (paren != REG_NOPAREN && getchr() != Magic(')'))
1083 {
1084#ifdef FEAT_SYN_HL
1085 if (paren == REG_ZPAREN)
1086 EMSG_RET_NULL(_("E52: Unmatched \\z("))
1087 else
1088#endif
1089 if (paren == REG_NPAREN)
1090 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL)
1091 else
1092 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL)
1093 }
    /* At the top level nothing may be left over in the pattern. */
1094 else if (paren == REG_NOPAREN && peekchr() != NUL)
1095 {
1096 if (curchr == Magic(')'))
1097 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL)
1098 else
1099 EMSG_RET_NULL(_(e_trailing)) /* "Can't happen". */
1100 /* NOTREACHED */
1101 }
1102 /*
1103 * Here we set the flag allowing back references to this set of
1104 * parentheses.
1105 */
1106 if (paren == REG_PAREN)
1107 had_endbrace[parno] = TRUE; /* have seen the close paren */
1108 return ret;
1109}
1110
1111/*
1112 * regbranch - one alternative of an | operator
1113 *
1114 * Implements the & operator.
1115 */
1116 static char_u *
1117regbranch(flagp)
1118 int *flagp;
1119{
1120 char_u *ret;
1121 char_u *chain = NULL;
1122 char_u *latest;
1123 int flags;
1124
1125 *flagp = WORST | HASNL; /* Tentatively. */
1126
1127 ret = regnode(BRANCH);
1128 for (;;)
1129 {
1130 latest = regconcat(&flags);
1131 if (latest == NULL)
1132 return NULL;
1133 /* If one of the branches has width, the whole thing has. If one of
1134 * the branches anchors at start-of-line, the whole thing does.
1135 * If one of the branches uses look-behind, the whole thing does. */
1136 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1137 /* If one of the branches doesn't match a line-break, the whole thing
1138 * doesn't. */
1139 *flagp &= ~HASNL | (flags & HASNL);
1140 if (chain != NULL)
1141 regtail(chain, latest);
1142 if (peekchr() != Magic('&'))
1143 break;
1144 skipchr();
1145 regtail(latest, regnode(END)); /* operand ends */
1146 reginsert(MATCH, latest);
1147 chain = latest;
1148 }
1149
1150 return ret;
1151}
1152
1153/*
1154 * regbranch - one alternative of an | or & operator
1155 *
1156 * Implements the concatenation operator.
1157 */
1158 static char_u *
1159regconcat(flagp)
1160 int *flagp;
1161{
1162 char_u *first = NULL;
1163 char_u *chain = NULL;
1164 char_u *latest;
1165 int flags;
1166 int cont = TRUE;
1167
1168 *flagp = WORST; /* Tentatively. */
1169
1170 while (cont)
1171 {
1172 switch (peekchr())
1173 {
1174 case NUL:
1175 case Magic('|'):
1176 case Magic('&'):
1177 case Magic(')'):
1178 cont = FALSE;
1179 break;
1180 case Magic('Z'):
1181#ifdef FEAT_MBYTE
1182 regflags |= RF_ICOMBINE;
1183#endif
1184 skipchr_keepstart();
1185 break;
1186 case Magic('c'):
1187 regflags |= RF_ICASE;
1188 skipchr_keepstart();
1189 break;
1190 case Magic('C'):
1191 regflags |= RF_NOICASE;
1192 skipchr_keepstart();
1193 break;
1194 case Magic('v'):
1195 reg_magic = MAGIC_ALL;
1196 skipchr_keepstart();
1197 curchr = -1;
1198 break;
1199 case Magic('m'):
1200 reg_magic = MAGIC_ON;
1201 skipchr_keepstart();
1202 curchr = -1;
1203 break;
1204 case Magic('M'):
1205 reg_magic = MAGIC_OFF;
1206 skipchr_keepstart();
1207 curchr = -1;
1208 break;
1209 case Magic('V'):
1210 reg_magic = MAGIC_NONE;
1211 skipchr_keepstart();
1212 curchr = -1;
1213 break;
1214 default:
1215 latest = regpiece(&flags);
1216 if (latest == NULL)
1217 return NULL;
1218 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1219 if (chain == NULL) /* First piece. */
1220 *flagp |= flags & SPSTART;
1221 else
1222 regtail(chain, latest);
1223 chain = latest;
1224 if (first == NULL)
1225 first = latest;
1226 break;
1227 }
1228 }
1229 if (first == NULL) /* Loop ran zero times. */
1230 first = regnode(NOTHING);
1231 return first;
1232}
1233
1234/*
1235 * regpiece - something followed by possible [*+=]
1236 *
1237 * Note that the branching code sequences used for = and the general cases
1238 * of * and + are somewhat optimized: they use the same NOTHING node as
1239 * both the endmarker for their branch list and the body of the last branch.
1240 * It might seem that this node could be dispensed with entirely, but the
1241 * endmarker role is not redundant.
1242 */
1243 static char_u *
1244regpiece(flagp)
1245 int *flagp; /* out: flags describing the atom together with its multi */
1246{
    /*
     * Parses one atom with regatom() and, when a multi follows it, wraps
     * the atom in the nodes implementing that multi.
     * Returns the first node of the piece.  NULL is returned when
     * regatom() or read_limits() fails; the EMSG_*RET_NULL macros are
     * defined elsewhere and presumably report the error and return NULL.
     */
1247 char_u *ret;
1248 int op;
1249 char_u *next;
1250 int flags;
1251 long minval;
1252 long maxval;
1253
1254 ret = regatom(&flags);
1255 if (ret == NULL)
1256 return NULL;
1257
1258 op = peekchr();
    /* No multi follows: the piece is just the atom, flags pass through. */
1259 if (re_multi_type(op) == NOT_MULTI)
1260 {
1261 *flagp = flags;
1262 return ret;
1263 }
1264 if (!(flags & HASWIDTH) && re_multi_type(op) == MULTI_MULT)
1265 {
1266 if (op == Magic('*'))
1267 EMSG_M_RET_NULL(_("E56: %s* operand could be empty"),
1268 reg_magic >= MAGIC_ON);
1269 if (op == Magic('+'))
1270 EMSG_M_RET_NULL(_("E57: %s+ operand could be empty"),
1271 reg_magic == MAGIC_ALL);
1272 /* "\{}" is checked below, it's allowed when there is an upper limit */
1273 }
1274 /* default flags */
1275 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1276
    /* Consume the multi character that was peeked at above. */
1277 skipchr();
1278 switch (op)
1279 {
1280 case Magic('*'):
1281 if (flags & SIMPLE)
1282 reginsert(STAR, ret);
1283 else
1284 {
1285 /* Emit x* as (x&|), where & means "self". */
1286 reginsert(BRANCH, ret); /* Either x */
1287 regoptail(ret, regnode(BACK)); /* and loop */
1288 regoptail(ret, ret); /* back */
1289 regtail(ret, regnode(BRANCH)); /* or */
1290 regtail(ret, regnode(NOTHING)); /* null. */
1291 }
1292 break;
1293
1294 case Magic('+'):
1295 if (flags & SIMPLE)
1296 reginsert(PLUS, ret);
1297 else
1298 {
1299 /* Emit x+ as x(&|), where & means "self". */
1300 next = regnode(BRANCH); /* Either */
1301 regtail(ret, next);
1302 regtail(regnode(BACK), ret); /* loop back */
1303 regtail(next, regnode(BRANCH)); /* or */
1304 regtail(ret, regnode(NOTHING)); /* null. */
1305 }
    /* Unlike the default flags, a "+" piece is flagged HASWIDTH and not
     * SPSTART. */
1306 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1307 break;
1308
1309 case Magic('@'):
1310 {
    /* "lop" stays END when what follows the '@' is not recognized. */
1311 int lop = END;
1312
1313 switch (no_Magic(getchr()))
1314 {
1315 case '=': lop = MATCH; break; /* \@= */
1316 case '!': lop = NOMATCH; break; /* \@! */
1317 case '>': lop = SUBPAT; break; /* \@> */
1318 case '<': switch (no_Magic(getchr()))
1319 {
1320 case '=': lop = BEHIND; break; /* \@<= */
1321 case '!': lop = NOBEHIND; break; /* \@<! */
1322 }
1323 }
1324 if (lop == END)
1325 EMSG_M_RET_NULL(_("E59: invalid character after %s@"),
1326 reg_magic == MAGIC_ALL);
1327 /* Look behind must match with behind_pos. */
1328 if (lop == BEHIND || lop == NOBEHIND)
1329 {
1330 regtail(ret, regnode(BHPOS));
1331 *flagp |= HASLOOKBH;
1332 }
1333 regtail(ret, regnode(END)); /* operand ends */
1334 reginsert(lop, ret);
1335 break;
1336 }
1337
1338 case Magic('?'):
1339 case Magic('='):
1340 /* Emit x= as (x|) */
1341 reginsert(BRANCH, ret); /* Either x */
1342 regtail(ret, regnode(BRANCH)); /* or */
1343 next = regnode(NOTHING); /* null. */
1344 regtail(ret, next);
1345 regoptail(ret, next);
1346 break;
1347
1348 case Magic('{'):
1349 if (!read_limits(&minval, &maxval))
1350 return NULL;
    /* A zero-width operand is only refused when the larger of the two
     * limits reaches MAX_LIMIT. */
1351 if (!(flags & HASWIDTH) && (maxval > minval
1352 ? maxval >= MAX_LIMIT : minval >= MAX_LIMIT))
1353 EMSG_M_RET_NULL(_("E58: %s{ operand could be empty"),
1354 reg_magic == MAGIC_ALL);
1355 if (flags & SIMPLE)
1356 {
    /* Both inserts go in front of the operand, so the BRACE_LIMITS node
     * ends up before the BRACE_SIMPLE node. */
1357 reginsert(BRACE_SIMPLE, ret);
1358 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1359 }
1360 else
1361 {
    /* Only BRACE_COMPLEX + 0 .. BRACE_COMPLEX + 9 are used as opcodes. */
1362 if (num_complex_braces >= 10)
1363 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"),
1364 reg_magic == MAGIC_ALL);
1365 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1366 regoptail(ret, regnode(BACK));
1367 regoptail(ret, ret);
1368 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1369 ++num_complex_braces;
1370 }
1371 if (minval > 0 && maxval > 0)
1372 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1373 break;
1374 }
1375 if (re_multi_type(peekchr()) != NOT_MULTI)
1376 {
1377 /* Can't have a multi follow a multi. */
1378 if (peekchr() == Magic('*'))
1379 sprintf((char *)IObuff, _("E61: Nested %s*"),
1380 reg_magic >= MAGIC_ON ? "" : "\\");
1381 else
1382 sprintf((char *)IObuff, _("E62: Nested %s%c"),
1383 reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1384 EMSG_RET_NULL(IObuff);
1385 }
1386
1387 return ret;
1388}
1389
1390/*
1391 * regatom - the lowest level
1392 *
1393 * Optimization: gobbles an entire sequence of ordinary characters so that
1394 * it can turn them into a single node, which is smaller to store and
1395 * faster to run. Don't do this when one_exactly is set.
1396 */
1397 static char_u *
1398regatom(flagp)
1399 int *flagp; /* out: starts as WORST; HASWIDTH, SIMPLE, HASNL etc. get OR'ed in */
1400{
    /*
     * Parses a single atom and emits the node(s) for it.
     * Returns the first emitted node.  NULL is returned when a nested
     * reg() or regatom() call fails; the EMSG_*RET_NULL macros are defined
     * elsewhere and presumably report the error and return NULL as well.
     */
1401 char_u *ret;
1402 int flags;
1403 int cpo_lit; /* 'cpoptions' contains 'l' flag */
1404 int c;
    /* classchars and classcodes are parallel: the index of a class letter
     * in classchars selects its opcode in classcodes (see the lookup in
     * the "Character classes" case). */
1405 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1406 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
1407 FNAME, SFNAME, PRINT, SPRINT,
1408 WHITE, NWHITE, DIGIT, NDIGIT,
1409 HEX, NHEX, OCTAL, NOCTAL,
1410 WORD, NWORD, HEAD, NHEAD,
1411 ALPHA, NALPHA, LOWER, NLOWER,
1412 UPPER, NUPPER
1413 };
1414 char_u *p;
    /* "extra" is added to the opcode; it becomes ADD_NL after "\_". */
1415 int extra = 0;
1416
1417 *flagp = WORST; /* Tentatively. */
1418 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
1419
1420 c = getchr();
1421 switch (c)
1422 {
1423 case Magic('^'):
1424 ret = regnode(BOL);
1425 break;
1426
1427 case Magic('$'):
1428 ret = regnode(EOL);
1429#if defined(FEAT_SYN_HL) || defined(PROTO)
1430 had_eol = TRUE;
1431#endif
1432 break;
1433
1434 case Magic('<'):
1435 ret = regnode(BOW);
1436 break;
1437
1438 case Magic('>'):
1439 ret = regnode(EOW);
1440 break;
1441
1442 case Magic('_'):
1443 c = no_Magic(getchr());
1444 if (c == '^') /* "\_^" is start-of-line */
1445 {
1446 ret = regnode(BOL);
1447 break;
1448 }
1449 if (c == '$') /* "\_$" is end-of-line */
1450 {
1451 ret = regnode(EOL);
1452#if defined(FEAT_SYN_HL) || defined(PROTO)
1453 had_eol = TRUE;
1454#endif
1455 break;
1456 }
1457
1458 extra = ADD_NL;
1459 *flagp |= HASNL;
1460
1461 /* "\_[" is character range plus newline */
1462 if (c == '[')
1463 goto collection;
1464
1465 /* "\_x" is character class plus newline */
1466 /*FALLTHROUGH*/
1467
1468 /*
1469 * Character classes.
1470 */
1471 case Magic('.'):
1472 case Magic('i'):
1473 case Magic('I'):
1474 case Magic('k'):
1475 case Magic('K'):
1476 case Magic('f'):
1477 case Magic('F'):
1478 case Magic('p'):
1479 case Magic('P'):
1480 case Magic('s'):
1481 case Magic('S'):
1482 case Magic('d'):
1483 case Magic('D'):
1484 case Magic('x'):
1485 case Magic('X'):
1486 case Magic('o'):
1487 case Magic('O'):
1488 case Magic('w'):
1489 case Magic('W'):
1490 case Magic('h'):
1491 case Magic('H'):
1492 case Magic('a'):
1493 case Magic('A'):
1494 case Magic('l'):
1495 case Magic('L'):
1496 case Magic('u'):
1497 case Magic('U'):
    /* When arriving here through the "\_" fall-through, "c" may be any
     * character, hence the check for a failed lookup. */
1498 p = vim_strchr(classchars, no_Magic(c));
1499 if (p == NULL)
1500 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1501 ret = regnode(classcodes[p - classchars] + extra);
1502 *flagp |= HASWIDTH | SIMPLE;
1503 break;
1504
1505 case Magic('n'):
1506 if (reg_string)
1507 {
1508 /* In a string "\n" matches a newline character. */
1509 ret = regnode(EXACTLY);
1510 regc(NL);
1511 regc(NUL);
1512 *flagp |= HASWIDTH | SIMPLE;
1513 }
1514 else
1515 {
1516 /* In buffer text "\n" matches the end of a line. */
1517 ret = regnode(NEWL);
1518 *flagp |= HASWIDTH | HASNL;
1519 }
1520 break;
1521
1522 case Magic('('):
1523 if (one_exactly)
1524 EMSG_ONE_RET_NULL;
1525 ret = reg(REG_PAREN, &flags);
1526 if (ret == NULL)
1527 return NULL;
1528 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1529 break;
1530
1531 case NUL:
1532 case Magic('|'):
1533 case Magic('&'):
1534 case Magic(')'):
1535 EMSG_RET_NULL(_(e_internal)); /* Supposed to be caught earlier. */
1536 /* NOTREACHED */
1537
    /* A multi where an atom is expected. */
1538 case Magic('='):
1539 case Magic('?'):
1540 case Magic('+'):
1541 case Magic('@'):
1542 case Magic('{'):
1543 case Magic('*'):
1544 c = no_Magic(c);
1545 sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1546 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1547 ? "" : "\\", c);
1548 EMSG_RET_NULL(IObuff);
1549 /* NOTREACHED */
1550
1551 case Magic('~'): /* previous substitute pattern */
1552 if (reg_prev_sub)
1553 {
1554 char_u *lp;
1555
    /* Copy reg_prev_sub into an EXACTLY node. */
1556 ret = regnode(EXACTLY);
1557 lp = reg_prev_sub;
1558 while (*lp != NUL)
1559 regc(*lp++);
1560 regc(NUL);
1561 if (*reg_prev_sub != NUL)
1562 {
1563 *flagp |= HASWIDTH;
    /* Exactly one byte was copied. */
1564 if ((lp - reg_prev_sub) == 1)
1565 *flagp |= SIMPLE;
1566 }
1567 }
1568 else
1569 EMSG_RET_NULL(_(e_nopresub));
1570 break;
1571
1572 case Magic('1'):
1573 case Magic('2'):
1574 case Magic('3'):
1575 case Magic('4'):
1576 case Magic('5'):
1577 case Magic('6'):
1578 case Magic('7'):
1579 case Magic('8'):
1580 case Magic('9'):
1581 {
1582 int refnum;
1583
1584 refnum = c - Magic('0');
1585 /*
1586 * Check if the back reference is legal. We must have seen the
1587 * close brace.
1588 * TODO: Should also check that we don't refer to something
1589 * that is repeated (+*=): what instance of the repetition
1590 * should we match?
1591 */
1592 if (!had_endbrace[refnum])
1593 {
1594 /* Trick: check if "@<=" or "@<!" follows, in which case
1595 * the \1 can appear before the referenced match. */
1596 for (p = regparse; *p != NUL; ++p)
1597 if (p[0] == '@' && p[1] == '<'
1598 && (p[2] == '!' || p[2] == '='))
1599 break;
1600 if (*p == NUL)
1601 EMSG_RET_NULL(_("E65: Illegal back reference"));
1602 }
1603 ret = regnode(BACKREF + refnum);
1604 }
1605 break;
1606
1607#ifdef FEAT_SYN_HL
1608 case Magic('z'):
1609 {
1610 c = no_Magic(getchr());
1611 switch (c)
1612 {
1613 case '(': if (reg_do_extmatch != REX_SET)
1614 EMSG_RET_NULL(_("E66: \\z( not allowed here"));
1615 if (one_exactly)
1616 EMSG_ONE_RET_NULL;
1617 ret = reg(REG_ZPAREN, &flags);
1618 if (ret == NULL)
1619 return NULL;
1620 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1621 re_has_z = REX_SET;
1622 break;
1623
1624 case '1':
1625 case '2':
1626 case '3':
1627 case '4':
1628 case '5':
1629 case '6':
1630 case '7':
1631 case '8':
1632 case '9': if (reg_do_extmatch != REX_USE)
1633 EMSG_RET_NULL(_("E67: \\z1 et al. not allowed here"));
1634 ret = regnode(ZREF + c - '0');
1635 re_has_z = REX_USE;
1636 break;
1637
    /* "\zs" and "\ze" emit the open and close node for number 0. */
1638 case 's': ret = regnode(MOPEN + 0);
1639 break;
1640
1641 case 'e': ret = regnode(MCLOSE + 0);
1642 break;
1643
1644 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1645 }
1646 }
1647 break;
1648#endif
1649
1650 case Magic('%'):
1651 {
1652 c = no_Magic(getchr());
1653 switch (c)
1654 {
1655 /* () without a back reference */
1656 case '(':
1657 if (one_exactly)
1658 EMSG_ONE_RET_NULL;
1659 ret = reg(REG_NPAREN, &flags);
1660 if (ret == NULL)
1661 return NULL;
1662 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1663 break;
1664
1665 /* Catch \%^ and \%$ regardless of where they appear in the
1666 * pattern -- regardless of whether or not it makes sense. */
1667 case '^':
1668 ret = regnode(RE_BOF);
1669 break;
1670
1671 case '$':
1672 ret = regnode(RE_EOF);
1673 break;
1674
1675 case '#':
1676 ret = regnode(CURSOR);
1677 break;
1678
1679 /* \%[abc]: Emit as a list of branches, all ending at the last
1680 * branch which matches nothing. */
1681 case '[':
1682 if (one_exactly) /* doesn't nest */
1683 EMSG_ONE_RET_NULL;
1684 {
1685 char_u *lastbranch;
1686 char_u *lastnode = NULL;
1687 char_u *br;
1688
1689 ret = NULL;
1690 while ((c = getchr()) != ']')
1691 {
1692 if (c == NUL)
1693 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["),
1694 reg_magic == MAGIC_ALL);
1695 br = regnode(BRANCH);
1696 if (ret == NULL)
1697 ret = br;
1698 else
1699 regtail(lastnode, br);
1700
    /* Put the character back and parse it as an atom of its own;
     * one_exactly is set only for the duration of this recursive call. */
1701 ungetchr();
1702 one_exactly = TRUE;
1703 lastnode = regatom(flagp);
1704 one_exactly = FALSE;
1705 if (lastnode == NULL)
1706 return NULL;
1707 }
1708 if (ret == NULL)
1709 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"),
1710 reg_magic == MAGIC_ALL);
1711 lastbranch = regnode(BRANCH);
1712 br = regnode(NOTHING);
    /* The nodes can only be linked when code is really being emitted. */
1713 if (ret != JUST_CALC_SIZE)
1714 {
1715 regtail(lastnode, br);
1716 regtail(lastbranch, br);
1717 /* connect all branches to the NOTHING
1718 * branch at the end */
1719 for (br = ret; br != lastnode; )
1720 {
1721 if (OP(br) == BRANCH)
1722 {
1723 regtail(br, lastbranch);
1724 br = OPERAND(br);
1725 }
1726 else
1727 br = regnext(br);
1728 }
1729 }
    /* Clear HASWIDTH, which the nested regatom() calls may have set. */
1730 *flagp &= ~HASWIDTH;
1731 break;
1732 }
1733
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001734 case 'd': /* %d123 decimal */
1735 case 'o': /* %o123 octal */
1736 case 'x': /* %xab hex 2 */
1737 case 'u': /* %uabcd hex 4 */
1738 case 'U': /* %U1234abcd hex 8 */
1739 {
1740 int i;
1741
1742 switch (c)
1743 {
1744 case 'd': i = getdecchrs(); break;
1745 case 'o': i = getoctchrs(); break;
1746 case 'x': i = gethexchrs(2); break;
1747 case 'u': i = gethexchrs(4); break;
1748 case 'U': i = gethexchrs(8); break;
1749 default: i = -1; break;
1750 }
1751
    /* A negative value is treated as "no valid number found". */
1752 if (i < 0)
1753 EMSG_M_RET_NULL(
1754 _("E678: Invalid character after %s%%[dxouU]"),
1755 reg_magic == MAGIC_ALL);
1756 ret = regnode(EXACTLY);
    /* Character value zero is stored as 0x0a. */
1757 if (i == 0)
1758 regc(0x0a);
1759 else
1760#ifdef FEAT_MBYTE
1761 regmbc(i);
1762#else
1763 regc(i);
1764#endif
1765 regc(NUL);
1766 *flagp |= HASWIDTH;
1767 break;
1768 }
1769
Bram Moolenaar071d4272004-06-13 20:20:40 +00001770 default:
    /* An optional '<' or '>', then a number, then 'l', 'c' or 'v'. */
1771 if (VIM_ISDIGIT(c) || c == '<' || c == '>')
1772 {
1773 long_u n = 0;
1774 int cmp;
1775
1776 cmp = c;
1777 if (cmp == '<' || cmp == '>')
1778 c = getchr();
1779 while (VIM_ISDIGIT(c))
1780 {
1781 n = n * 10 + (c - '0');
1782 c = getchr();
1783 }
1784 if (c == 'l' || c == 'c' || c == 'v')
1785 {
1786 if (c == 'l')
1787 ret = regnode(RE_LNUM);
1788 else if (c == 'c')
1789 ret = regnode(RE_COL);
1790 else
1791 ret = regnode(RE_VCOL);
    /* Five bytes follow the opcode: four for the number written by
     * re_put_long() and one for the comparator. */
1792 if (ret == JUST_CALC_SIZE)
1793 regsize += 5;
1794 else
1795 {
1796 /* put the number and the optional
1797 * comparator after the opcode */
1798 regcode = re_put_long(regcode, n);
1799 *regcode++ = cmp;
1800 }
1801 break;
1802 }
1803 }
1804
1805 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"),
1806 reg_magic == MAGIC_ALL);
1807 }
1808 }
1809 break;
1810
1811 case Magic('['):
1812collection:
1813 {
1814 char_u *lp;
1815
1816 /*
1817 * If there is no matching ']', we assume the '[' is a normal
1818 * character. This makes 'incsearch' and ":help [" work.
1819 */
1820 lp = skip_anyof(regparse);
1821 if (*lp == ']') /* there is a matching ']' */
1822 {
1823 int startc = -1; /* > 0 when next '-' is a range */
1824 int endc;
1825
1826 /*
1827 * In a character class, different parsing rules apply.
1828 * Not even \ is special anymore, nothing is.
1829 */
1830 if (*regparse == '^') /* Complement of range. */
1831 {
1832 ret = regnode(ANYBUT + extra);
1833 regparse++;
1834 }
1835 else
1836 ret = regnode(ANYOF + extra);
1837
1838 /* At the start ']' and '-' mean the literal character. */
1839 if (*regparse == ']' || *regparse == '-')
1840 regc(*regparse++);
1841
1842 while (*regparse != NUL && *regparse != ']')
1843 {
1844 if (*regparse == '-')
1845 {
1846 ++regparse;
1847 /* The '-' is not used for a range at the end and
1848 * after or before a '\n'. */
1849 if (*regparse == ']' || *regparse == NUL
1850 || startc == -1
1851 || (regparse[0] == '\\' && regparse[1] == 'n'))
1852 {
1853 regc('-');
1854 startc = '-'; /* [--x] is a range */
1855 }
1856 else
1857 {
1858#ifdef FEAT_MBYTE
1859 if (has_mbyte)
1860 endc = mb_ptr2char_adv(&regparse);
1861 else
1862#endif
1863 endc = *regparse++;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001864
1865 /* Handle \o40, \x20 and \u20AC style sequences */
1866 if (endc == '\\' && !cpo_lit)
1867 endc = coll_get_char();
1868
Bram Moolenaar071d4272004-06-13 20:20:40 +00001869 if (startc > endc)
1870 EMSG_RET_NULL(_(e_invrange));
1871#ifdef FEAT_MBYTE
1872 if (has_mbyte && ((*mb_char2len)(startc) > 1
1873 || (*mb_char2len)(endc) > 1))
1874 {
1875 /* Limit to a range of 256 chars */
1876 if (endc > startc + 256)
1877 EMSG_RET_NULL(_(e_invrange));
    /* startc itself was already emitted, continue with the next one. */
1878 while (++startc <= endc)
1879 regmbc(startc);
1880 }
1881 else
1882#endif
1883 {
1884#ifdef EBCDIC
1885 int alpha_only = FALSE;
1886
1887 /* for alphabetical range skip the gaps
1888 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
1889 if (isalpha(startc) && isalpha(endc))
1890 alpha_only = TRUE;
1891#endif
1892 while (++startc <= endc)
1893#ifdef EBCDIC
1894 if (!alpha_only || isalpha(startc))
1895#endif
1896 regc(startc);
1897 }
1898 startc = -1;
1899 }
1900 }
1901 /*
1902 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1903 * accepts "\t", "\e", etc., but only when the 'l' flag in
1904 * 'cpoptions' is not included.
1905 * NOTE(review): "\]" is listed twice above; presumably one of
     * them was meant to be "\-" -- confirm against REGEXP_INRANGE.
     */
1906 else if (*regparse == '\\'
1907 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1908 || (!cpo_lit
1909 && vim_strchr(REGEXP_ABBR,
1910 regparse[1]) != NULL)))
1911 {
1912 regparse++;
1913 if (*regparse == 'n')
1914 {
1915 /* '\n' in range: also match NL */
1916 if (ret != JUST_CALC_SIZE)
1917 {
    /* Patch the opcode of the node that was already emitted. */
1918 if (*ret == ANYBUT)
1919 *ret = ANYBUT + ADD_NL;
1920 else if (*ret == ANYOF)
1921 *ret = ANYOF + ADD_NL;
1922 /* else: must have had a \n already */
1923 }
1924 *flagp |= HASNL;
1925 regparse++;
1926 startc = -1;
1927 }
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001928 else if (*regparse == 'd'
1929 || *regparse == 'o'
1930 || *regparse == 'x'
1931 || *regparse == 'u'
1932 || *regparse == 'U')
1933 {
1934 startc = coll_get_char();
1935 if (startc == 0)
1936 regc(0x0a);
1937 else
1938#ifdef FEAT_MBYTE
1939 regmbc(startc);
1940#else
1941 regc(startc);
1942#endif
1943 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001944 else
1945 {
1946 startc = backslash_trans(*regparse++);
1947 regc(startc);
1948 }
1949 }
1950 else if (*regparse == '[')
1951 {
1952 int c_class;
1953 int cu;
1954
1955 c_class = skip_class_name(&regparse);
1956 startc = -1;
1957 /* Characters assumed to be 8 bits! */
    /* Each class is expanded into the list of characters it contains. */
1958 switch (c_class)
1959 {
1960 case CLASS_NONE:
1961 /* literal '[', allow [[-x] as a range */
1962 startc = *regparse++;
1963 regc(startc);
1964 break;
1965 case CLASS_ALNUM:
1966 for (cu = 1; cu <= 255; cu++)
1967 if (isalnum(cu))
1968 regc(cu);
1969 break;
1970 case CLASS_ALPHA:
1971 for (cu = 1; cu <= 255; cu++)
1972 if (isalpha(cu))
1973 regc(cu);
1974 break;
1975 case CLASS_BLANK:
1976 regc(' ');
1977 regc('\t');
1978 break;
1979 case CLASS_CNTRL:
1980 for (cu = 1; cu <= 255; cu++)
1981 if (iscntrl(cu))
1982 regc(cu);
1983 break;
1984 case CLASS_DIGIT:
1985 for (cu = 1; cu <= 255; cu++)
1986 if (VIM_ISDIGIT(cu))
1987 regc(cu);
1988 break;
1989 case CLASS_GRAPH:
1990 for (cu = 1; cu <= 255; cu++)
1991 if (isgraph(cu))
1992 regc(cu);
1993 break;
1994 case CLASS_LOWER:
1995 for (cu = 1; cu <= 255; cu++)
1996 if (islower(cu))
1997 regc(cu);
1998 break;
1999 case CLASS_PRINT:
2000 for (cu = 1; cu <= 255; cu++)
2001 if (vim_isprintc(cu))
2002 regc(cu);
2003 break;
2004 case CLASS_PUNCT:
2005 for (cu = 1; cu <= 255; cu++)
2006 if (ispunct(cu))
2007 regc(cu);
2008 break;
2009 case CLASS_SPACE:
2010 for (cu = 9; cu <= 13; cu++)
2011 regc(cu);
2012 regc(' ');
2013 break;
2014 case CLASS_UPPER:
2015 for (cu = 1; cu <= 255; cu++)
2016 if (isupper(cu))
2017 regc(cu);
2018 break;
2019 case CLASS_XDIGIT:
2020 for (cu = 1; cu <= 255; cu++)
2021 if (vim_isxdigit(cu))
2022 regc(cu);
2023 break;
2024 case CLASS_TAB:
2025 regc('\t');
2026 break;
2027 case CLASS_RETURN:
2028 regc('\r');
2029 break;
2030 case CLASS_BACKSPACE:
2031 regc('\b');
2032 break;
2033 case CLASS_ESCAPE:
2034 regc('\033');
2035 break;
2036 }
2037 }
2038 else
2039 {
2040#ifdef FEAT_MBYTE
2041 if (has_mbyte)
2042 {
2043 int len;
2044
2045 /* produce a multibyte character, including any
2046 * following composing characters */
2047 startc = mb_ptr2char(regparse);
2048 len = (*mb_ptr2len_check)(regparse);
2049 if (enc_utf8 && utf_char2len(startc) != len)
2050 startc = -1; /* composing chars */
2051 while (--len >= 0)
2052 regc(*regparse++);
2053 }
2054 else
2055#endif
2056 {
2057 startc = *regparse++;
2058 regc(startc);
2059 }
2060 }
2061 }
2062 regc(NUL);
2063 prevchr_len = 1; /* last char was the ']' */
2064 if (*regparse != ']')
2065 EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */
2066 skipchr(); /* let's be friends with the lexer again */
2067 *flagp |= HASWIDTH | SIMPLE;
2068 break;
2069 }
2070 }
    /* No matching ']': the '[' is handled below as an ordinary character. */
2071 /* FALLTHROUGH */
2072
2073 default:
2074 {
2075 int len;
2076
2077#ifdef FEAT_MBYTE
2078 /* A multi-byte character is handled as a separate atom if it's
2079 * before a multi. */
2080 if (has_mbyte && (*mb_char2len)(c) > 1
2081 && re_multi_type(peekchr()) != NOT_MULTI)
2082 {
2083 ret = regnode(MULTIBYTECODE);
2084 regmbc(c);
2085 *flagp |= HASWIDTH | SIMPLE;
2086 break;
2087 }
2088#endif
2089
2090 ret = regnode(EXACTLY);
2091
2092 /*
2093 * Append characters as long as:
2094 * - there is no following multi, we then need the character in
2095 * front of it as a single character operand
2096 * - not running into a Magic character
2097 * - "one_exactly" is not set
2098 * But always emit at least one character. Might be a Multi,
2099 * e.g., a "[" without matching "]".
2100 */
2101 for (len = 0; c != NUL && (len == 0
2102 || (re_multi_type(peekchr()) == NOT_MULTI
2103 && !one_exactly
2104 && !is_Magic(c))); ++len)
2105 {
2106 c = no_Magic(c);
2107#ifdef FEAT_MBYTE
2108 if (has_mbyte)
2109 {
2110 regmbc(c);
2111 if (enc_utf8)
2112 {
2113 int off;
2114 int l;
2115
2116 /* Need to get composing character too, directly
2117 * access regparse for that, because skipchr() skips
2118 * over composing chars. */
2119 ungetchr();
2120 if (*regparse == '\\' && regparse[1] != NUL)
2121 off = 1;
2122 else
2123 off = 0;
2124 for (;;)
2125 {
2126 l = utf_ptr2len_check(regparse + off);
2127 if (!UTF_COMPOSINGLIKE(regparse + off,
2128 regparse + off + l))
2129 break;
2130 off += l;
2131 regmbc(utf_ptr2char(regparse + off));
2132 }
2133 skipchr();
2134 }
2135 }
2136 else
2137#endif
2138 regc(c);
2139 c = getchr();
2140 }
    /* The character that ended the loop is not part of this atom. */
2141 ungetchr();
2142
2143 regc(NUL);
2144 *flagp |= HASWIDTH;
2145 if (len == 1)
2146 *flagp |= SIMPLE;
2147 }
2148 break;
2149 }
2150
2151 return ret;
2152}
2153
2154/*
2155 * emit a node
2156 * Return pointer to generated code.
2157 */
2158 static char_u *
2159regnode(op)
2160 int op;
2161{
2162 char_u *ret;
2163
2164 ret = regcode;
2165 if (ret == JUST_CALC_SIZE)
2166 regsize += 3;
2167 else
2168 {
2169 *regcode++ = op;
2170 *regcode++ = NUL; /* Null "next" pointer. */
2171 *regcode++ = NUL;
2172 }
2173 return ret;
2174}
2175
2176/*
2177 * Emit (if appropriate) a byte of code
2178 */
2179 static void
2180regc(b)
2181 int b;
2182{
2183 if (regcode == JUST_CALC_SIZE)
2184 regsize++;
2185 else
2186 *regcode++ = b;
2187}
2188
#ifdef FEAT_MBYTE
/*
 * Append the multi-byte character "c" to the program, or only add its
 * byte length to regsize when in the sizing pass.
 */
    static void
regmbc(c)
    int         c;
{
    if (regcode != JUST_CALC_SIZE)
        regcode += (*mb_char2bytes)(c, regcode);
    else
        regsize += (*mb_char2len)(c);
}
#endif
2203
2204/*
2205 * reginsert - insert an operator in front of already-emitted operand
2206 *
2207 * Means relocating the operand.
2208 */
2209 static void
2210reginsert(op, opnd)
2211 int op;
2212 char_u *opnd;
2213{
2214 char_u *src;
2215 char_u *dst;
2216 char_u *place;
2217
2218 if (regcode == JUST_CALC_SIZE)
2219 {
2220 regsize += 3;
2221 return;
2222 }
2223 src = regcode;
2224 regcode += 3;
2225 dst = regcode;
2226 while (src > opnd)
2227 *--dst = *--src;
2228
2229 place = opnd; /* Op node, where operand used to be. */
2230 *place++ = op;
2231 *place++ = NUL;
2232 *place = NUL;
2233}
2234
2235/*
2236 * reginsert_limits - insert an operator in front of already-emitted operand.
2237 * The operator has the given limit values as operands. Also set next pointer.
2238 *
2239 * Means relocating the operand.
2240 */
2241 static void
2242reginsert_limits(op, minval, maxval, opnd)
2243 int op;
2244 long minval;
2245 long maxval;
2246 char_u *opnd;
2247{
2248 char_u *src;
2249 char_u *dst;
2250 char_u *place;
2251
2252 if (regcode == JUST_CALC_SIZE)
2253 {
2254 regsize += 11;
2255 return;
2256 }
2257 src = regcode;
2258 regcode += 11;
2259 dst = regcode;
2260 while (src > opnd)
2261 *--dst = *--src;
2262
2263 place = opnd; /* Op node, where operand used to be. */
2264 *place++ = op;
2265 *place++ = NUL;
2266 *place++ = NUL;
2267 place = re_put_long(place, (long_u)minval);
2268 place = re_put_long(place, (long_u)maxval);
2269 regtail(opnd, place);
2270}
2271
2272/*
2273 * Write a long as four bytes at "p" and return pointer to the next char.
2274 */
2275 static char_u *
2276re_put_long(p, val)
2277 char_u *p;
2278 long_u val;
2279{
2280 *p++ = (char_u) ((val >> 24) & 0377);
2281 *p++ = (char_u) ((val >> 16) & 0377);
2282 *p++ = (char_u) ((val >> 8) & 0377);
2283 *p++ = (char_u) (val & 0377);
2284 return p;
2285}
2286
2287/*
2288 * regtail - set the next-pointer at the end of a node chain
2289 */
2290 static void
2291regtail(p, val)
2292 char_u *p;
2293 char_u *val;
2294{
2295 char_u *scan;
2296 char_u *temp;
2297 int offset;
2298
2299 if (p == JUST_CALC_SIZE)
2300 return;
2301
2302 /* Find last node. */
2303 scan = p;
2304 for (;;)
2305 {
2306 temp = regnext(scan);
2307 if (temp == NULL)
2308 break;
2309 scan = temp;
2310 }
2311
2312 if (OP(scan) == BACK)
2313 offset = (int)(scan - val);
2314 else
2315 offset = (int)(val - scan);
2316 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
2317 *(scan + 2) = (char_u) (offset & 0377);
2318}
2319
2320/*
2321 * regoptail - regtail on item after a BRANCH; nop if none
2322 */
2323 static void
2324regoptail(p, val)
2325 char_u *p;
2326 char_u *val;
2327{
2328 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2329 if (p == NULL || p == JUST_CALC_SIZE
2330 || (OP(p) != BRANCH
2331 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2332 return;
2333 regtail(OPERAND(p), val);
2334}
2335
2336/*
2337 * getchr() - get the next character from the pattern. We know about
2338 * magic and such, so therefore we need a lexical analyzer.
2339 */
2340
2341/* static int curchr; */
2342static int prevprevchr;
2343static int prevchr;
2344static int nextchr; /* used for ungetchr() */
2345/*
2346 * Note: prevchr is sometimes -1 when we are not at the start,
2347 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
2348 * taken to be magic -- webb
2349 */
2350static int at_start; /* True when on the first character */
2351static int prev_at_start; /* True when on the second character */
2352
2353 static void
2354initchr(str)
2355 char_u *str;
2356{
2357 regparse = str;
2358 prevchr_len = 0;
2359 curchr = prevprevchr = prevchr = nextchr = -1;
2360 at_start = TRUE;
2361 prev_at_start = FALSE;
2362}
2363
/*
 * peekchr() - return the next lexed character of the pattern without
 * consuming it.
 *
 * The result is cached in "curchr"; the pattern at "regparse" is only
 * examined when "curchr" is -1.  A character that is special in the current
 * "reg_magic" mode is returned wrapped in Magic().  A backslash toggles the
 * magicness of the character that follows it, or is translated when it is
 * an abbreviation such as "\t".
 * "regparse" has the same value on return as on entry.  "at_start" and
 * "prev_at_start" may be changed, see the '^' and '\\' cases.
 */
    static int
peekchr()
{
    if (curchr == -1)
    {
        switch (curchr = regparse[0])
        {
        case '.':
        case '[':
        case '~':
            /* magic when 'magic' is on */
            if (reg_magic >= MAGIC_ON)
                curchr = Magic(curchr);
            break;
        case '(':
        case ')':
        case '{':
        case '%':
        case '+':
        case '=':
        case '?':
        case '@':
        case '!':
        case '&':
        case '|':
        case '<':
        case '>':
        case '#':       /* future ext. */
        case '"':       /* future ext. */
        case '\'':      /* future ext. */
        case ',':       /* future ext. */
        case '-':       /* future ext. */
        case ':':       /* future ext. */
        case ';':       /* future ext. */
        case '`':       /* future ext. */
        case '/':       /* Can't be used in / command */
            /* magic only after "\v" */
            if (reg_magic == MAGIC_ALL)
                curchr = Magic(curchr);
            break;
        case '*':
            /* * is not magic as the very first character, eg "?*ptr" and when
             * after '^', eg "/^*ptr" */
            if (reg_magic >= MAGIC_ON && !at_start
                                && !(prev_at_start && prevchr == Magic('^')))
                curchr = Magic('*');
            break;
        case '^':
            /* '^' is only magic as the very first character and if it's after
             * "\(", "\|", "\&" or "\n".  "\%(" counts as well, and after
             * "\v" it is always magic. */
            if (reg_magic >= MAGIC_OFF
                    && (at_start
                        || reg_magic == MAGIC_ALL
                        || prevchr == Magic('(')
                        || prevchr == Magic('|')
                        || prevchr == Magic('&')
                        || prevchr == Magic('n')
                        || (no_Magic(prevchr) == '('
                            && prevprevchr == Magic('%'))))
            {
                curchr = Magic('^');
                /* What follows the '^' is again at the start, so that "*"
                 * after it is taken literally. */
                at_start = TRUE;
                prev_at_start = FALSE;
            }
            break;
        case '$':
            /* '$' is only magic as the very last char and if it's in front of
             * either "\|", "\)", "\&", or "\n".  After "\v" it is always
             * magic. */
            if (reg_magic >= MAGIC_OFF)
            {
                char_u *p = regparse + 1;

                /* ignore \c \C \m \M and \Z after '$' */
                while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
                                || p[1] == 'm' || p[1] == 'M' || p[1] == 'Z'))
                    p += 2;
                if (p[0] == NUL
                        || (p[0] == '\\'
                            && (p[1] == '|' || p[1] == '&' || p[1] == ')'
                                || p[1] == 'n'))
                        || reg_magic == MAGIC_ALL)
                    curchr = Magic('$');
            }
            break;
        case '\\':
            {
                int c = regparse[1];

                if (c == NUL)
                    curchr = '\\';      /* trailing '\' */
                else if (
#ifdef EBCDIC
                        vim_strchr(META, c)
#else
                        c <= '~' && META_flags[c]
#endif
                        )
                {
                    /*
                     * META contains everything that may be magic sometimes,
                     * except ^ and $ ("\^" and "\$" are only magic after
                     * "\v").  We now fetch the next character and toggle its
                     * magicness.  Therefore, \ is so meta-magic that it is
                     * not in META.
                     */
                    /* Reset the cache so the recursive call lexes the
                     * character after the backslash. */
                    curchr = -1;
                    prev_at_start = at_start;
                    at_start = FALSE;   /* be able to say "/\*ptr" */
                    ++regparse;
                    peekchr();
                    --regparse;
                    curchr = toggle_Magic(curchr);
                }
                else if (vim_strchr(REGEXP_ABBR, c))
                {
                    /*
                     * Handle abbreviations, like "\t" for TAB -- webb
                     */
                    curchr = backslash_trans(c);
                }
                else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
                    curchr = toggle_Magic(c);
                else
                {
                    /*
                     * Next character can never be (made) magic?
                     * Then backslashing it won't do anything.
                     */
#ifdef FEAT_MBYTE
                    if (has_mbyte)
                        curchr = (*mb_ptr2char)(regparse + 1);
                    else
#endif
                        curchr = c;
                }
                break;
            }

#ifdef FEAT_MBYTE
        default:
            /* Any other character: take the whole multi-byte character. */
            if (has_mbyte)
                curchr = (*mb_ptr2char)(regparse);
#endif
        }
    }

    return curchr;
}
2512
2513/*
2514 * Eat one lexed character. Do this in a way that we can undo it.
2515 */
2516 static void
2517skipchr()
2518{
2519 /* peekchr() eats a backslash, do the same here */
2520 if (*regparse == '\\')
2521 prevchr_len = 1;
2522 else
2523 prevchr_len = 0;
2524 if (regparse[prevchr_len] != NUL)
2525 {
2526#ifdef FEAT_MBYTE
2527 if (has_mbyte)
2528 prevchr_len += (*mb_ptr2len_check)(regparse + prevchr_len);
2529 else
2530#endif
2531 ++prevchr_len;
2532 }
2533 regparse += prevchr_len;
2534 prev_at_start = at_start;
2535 at_start = FALSE;
2536 prevprevchr = prevchr;
2537 prevchr = curchr;
2538 curchr = nextchr; /* use previously unget char, or -1 */
2539 nextchr = -1;
2540}
2541
2542/*
2543 * Skip a character while keeping the value of prev_at_start for at_start.
2544 * prevchr and prevprevchr are also kept.
2545 */
2546 static void
2547skipchr_keepstart()
2548{
2549 int as = prev_at_start;
2550 int pr = prevchr;
2551 int prpr = prevprevchr;
2552
2553 skipchr();
2554 at_start = as;
2555 prevchr = pr;
2556 prevprevchr = prpr;
2557}
2558
/*
 * Return the next lexed character of the pattern and consume it.
 */
    static int
getchr()
{
    int         lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
2567
2568/*
2569 * put character back. Works only once!
2570 */
2571 static void
2572ungetchr()
2573{
2574 nextchr = curchr;
2575 curchr = prevchr;
2576 prevchr = prevprevchr;
2577 at_start = prev_at_start;
2578 prev_at_start = FALSE;
2579
2580 /* Backup regparse, so that it's at the same position as before the
2581 * getchr(). */
2582 regparse -= prevchr_len;
2583}
2584
2585/*
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002586 * get and return the value of the hex string immediately after the current
2587 * position. Return -1 for invalid, or 0-255 for valid. Position is updated:
2588 * blahblah\%x20asdf
2589 * before-^ ^-after
2590 * The parameter controls the maximum number of input characters. This will be
2591 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
2592 */
2593 static int
2594gethexchrs(maxinputlen)
2595 int maxinputlen;
2596{
2597 int nr = 0;
2598 int c;
2599 int i;
2600
2601 for (i = 0; i < maxinputlen; ++i)
2602 {
2603 c = regparse[0];
2604 if (!vim_isxdigit(c))
2605 break;
2606 nr <<= 4;
2607 nr |= hex2nr(c);
2608 ++regparse;
2609 }
2610
2611 if (i == 0)
2612 return -1;
2613 return nr;
2614}
2615
2616/*
2617 * get and return the value of the decimal string immediately after the
2618 * current position. Return -1 for invalid. Consumes all digits.
2619 */
2620 static int
2621getdecchrs()
2622{
2623 int nr = 0;
2624 int c;
2625 int i;
2626
2627 for (i = 0; ; ++i)
2628 {
2629 c = regparse[0];
2630 if (c < '0' || c > '9')
2631 break;
2632 nr *= 10;
2633 nr += c - '0';
2634 ++regparse;
2635 }
2636
2637 if (i == 0)
2638 return -1;
2639 return nr;
2640}
2641
2642/*
2643 * get and return the value of the octal string immediately after the current
2644 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
2645 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
2646 * treat 8 or 9 as recognised characters. Position is updated:
2647 * blahblah\%o210asdf
2648 * before-^ ^-after
2649 */
2650 static int
2651getoctchrs()
2652{
2653 int nr = 0;
2654 int c;
2655 int i;
2656
2657 for (i = 0; i < 3 && nr < 040; ++i)
2658 {
2659 c = regparse[0];
2660 if (c < '0' || c > '7')
2661 break;
2662 nr <<= 3;
2663 nr |= hex2nr(c);
2664 ++regparse;
2665 }
2666
2667 if (i == 0)
2668 return -1;
2669 return nr;
2670}
2671
2672/*
2673 * Get a number after a backslash that is inside [].
2674 * When nothing is recognized return a backslash.
2675 */
2676 static int
2677coll_get_char()
2678{
2679 int nr = -1;
2680
2681 switch (*regparse++)
2682 {
2683 case 'd': nr = getdecchrs(); break;
2684 case 'o': nr = getoctchrs(); break;
2685 case 'x': nr = gethexchrs(2); break;
2686 case 'u': nr = gethexchrs(4); break;
2687 case 'U': nr = gethexchrs(8); break;
2688 }
2689 if (nr < 0)
2690 {
2691 /* If getting the number fails be backwards compatible: the character
2692 * is a backslash. */
2693 --regparse;
2694 nr = '\\';
2695 }
2696 return nr;
2697}
2698
2699/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00002700 * read_limits - Read two integers to be taken as a minimum and maximum.
2701 * If the first character is '-', then the range is reversed.
2702 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2703 * missing, a very big number is the default.
2704 */
2705 static int
2706read_limits(minval, maxval)
2707 long *minval;
2708 long *maxval;
2709{
2710 int reverse = FALSE;
2711 char_u *first_char;
2712 long tmp;
2713
2714 if (*regparse == '-')
2715 {
2716 /* Starts with '-', so reverse the range later */
2717 regparse++;
2718 reverse = TRUE;
2719 }
2720 first_char = regparse;
2721 *minval = getdigits(&regparse);
2722 if (*regparse == ',') /* There is a comma */
2723 {
2724 if (vim_isdigit(*++regparse))
2725 *maxval = getdigits(&regparse);
2726 else
2727 *maxval = MAX_LIMIT;
2728 }
2729 else if (VIM_ISDIGIT(*first_char))
2730 *maxval = *minval; /* It was \{n} or \{-n} */
2731 else
2732 *maxval = MAX_LIMIT; /* It was \{} or \{-} */
2733 if (*regparse == '\\')
2734 regparse++; /* Allow either \{...} or \{...\} */
2735 if (*regparse != '}' || (*maxval == 0 && *minval == 0))
2736 {
2737 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
2738 reg_magic == MAGIC_ALL ? "" : "\\");
2739 EMSG_RET_FAIL(IObuff);
2740 }
2741
2742 /*
2743 * Reverse the range if there was a '-', or make sure it is in the right
2744 * order otherwise.
2745 */
2746 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
2747 {
2748 tmp = *minval;
2749 *minval = *maxval;
2750 *maxval = tmp;
2751 }
2752 skipchr(); /* let's be friends with the lexer again */
2753 return OK;
2754}
2755
2756/*
2757 * vim_regexec and friends
2758 */
2759
2760/*
2761 * Global work variables for vim_regexec().
2762 */
2763
2764/* The current match-position is remembered with these variables: */
2765static linenr_T reglnum; /* line number, relative to first line */
2766static char_u *regline; /* start of current line */
2767static char_u *reginput; /* current input, points into "regline" */
2768
2769static int need_clear_subexpr; /* subexpressions still need to be
2770 * cleared */
2771#ifdef FEAT_SYN_HL
2772static int need_clear_zsubexpr = FALSE; /* extmatch subexpressions
2773 * still need to be cleared */
2774#endif
2775
2776static int out_of_stack; /* TRUE when ran out of stack space */
2777
2778/*
2779 * Structure used to save the current input state, when it needs to be
2780 * restored after trying a match. Used by reg_save() and reg_restore().
2781 */
2782typedef struct
2783{
2784 union
2785 {
2786 char_u *ptr; /* reginput pointer, for single-line regexp */
2787 lpos_T pos; /* reginput pos, for multi-line regexp */
2788 } rs_u;
2789} regsave_T;
2790
2791/* struct to save start/end pointer/position in for \(\) */
2792typedef struct
2793{
2794 union
2795 {
2796 char_u *ptr;
2797 lpos_T pos;
2798 } se_u;
2799} save_se_T;
2800
/* Prototypes for the static functions used while executing a regexp. */
static char_u   *reg_getline __ARGS((linenr_T lnum));
static long     vim_regexec_both __ARGS((char_u *line, colnr_T col));
static long     regtry __ARGS((regprog_T *prog, colnr_T col));
static void     cleanup_subexpr __ARGS((void));
#ifdef FEAT_SYN_HL
static void     cleanup_zsubexpr __ARGS((void));
#endif
static void     reg_nextline __ARGS((void));
static void     reg_save __ARGS((regsave_T *save));
static void     reg_restore __ARGS((regsave_T *save));
static int      reg_save_equal __ARGS((regsave_T *save));
static void     save_se_multi __ARGS((save_se_T *savep, lpos_T *posp));
static void     save_se_one __ARGS((save_se_T *savep, char_u **pp));

/* Save the sub-expressions before attempting a match.
 * "posp" is used for a multi-line match, "pp" for a single-line match.
 * NOTE: the expansion is a bare conditional expression without surrounding
 * parentheses; only use it as a statement of its own. */
#define save_se(savep, posp, pp) \
    REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))

/* After a failed match restore the sub-expressions.
 * NOTE: expands to a braced block, not a single expression. */
#define restore_se(savep, posp, pp) { \
    if (REG_MULTI) \
        *(posp) = (savep)->se_u.pos; \
    else \
        *(pp) = (savep)->se_u.ptr; }

static int      re_num_cmp __ARGS((long_u val, char_u *scan));
static int      regmatch __ARGS((char_u *prog));
static int      regrepeat __ARGS((char_u *p, long maxcount));

#ifdef DEBUG
/* When non-zero regmatch() reports each node it visits with mch_errmsg(). */
int             regnarrate = 0;
#endif
2833
2834/*
2835 * Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
2836 * Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
2837 * contains '\c' or '\C' the value is overruled.
2838 */
2839static int ireg_ic;
2840
2841#ifdef FEAT_MBYTE
2842/*
2843 * Similar to ireg_ic, but only for 'combining' characters. Set with \Z flag
2844 * in the regexp. Defaults to false, always.
2845 */
2846static int ireg_icombine;
2847#endif
2848
2849/*
2850 * Sometimes need to save a copy of a line. Since alloc()/free() is very
2851 * slow, we keep one allocated piece of memory and only re-allocate it when
2852 * it's too small. It's freed in vim_regexec_both() when finished.
2853 */
2854static char_u *reg_tofree;
2855static unsigned reg_tofreelen;
2856
2857/*
2858 * These variables are set when executing a regexp to speed up the execution.
2859 * Which ones are set depends on whethere a single-line or multi-line match is
2860 * done:
2861 * single-line multi-line
2862 * reg_match &regmatch_T NULL
2863 * reg_mmatch NULL &regmmatch_T
2864 * reg_startp reg_match->startp <invalid>
2865 * reg_endp reg_match->endp <invalid>
2866 * reg_startpos <invalid> reg_mmatch->startpos
2867 * reg_endpos <invalid> reg_mmatch->endpos
2868 * reg_win NULL window in which to search
2869 * reg_buf <invalid> buffer in which to search
2870 * reg_firstlnum <invalid> first line in which to search
2871 * reg_maxline 0 last line nr
2872 * reg_line_lbr FALSE or TRUE FALSE
2873 */
2874static regmatch_T *reg_match;
2875static regmmatch_T *reg_mmatch;
2876static char_u **reg_startp = NULL;
2877static char_u **reg_endp = NULL;
2878static lpos_T *reg_startpos = NULL;
2879static lpos_T *reg_endpos = NULL;
2880static win_T *reg_win;
2881static buf_T *reg_buf;
2882static linenr_T reg_firstlnum;
2883static linenr_T reg_maxline;
2884static int reg_line_lbr; /* "\n" in string is line break */
2885
2886/*
2887 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
2888 */
2889 static char_u *
2890reg_getline(lnum)
2891 linenr_T lnum;
2892{
2893 /* when looking behind for a match/no-match lnum is negative. But we
2894 * can't go before line 1 */
2895 if (reg_firstlnum + lnum < 1)
2896 return NULL;
2897 return ml_get_buf(reg_buf, reg_firstlnum + lnum, FALSE);
2898}
2899
/* NOTE(review): not used in the part of the file visible here; presumably
 * the saved position for look-behind matching -- confirm in regmatch(). */
static regsave_T behind_pos;

#ifdef FEAT_SYN_HL
/* The pointer arrays are used for a single-line match, the position arrays
 * for a multi-line match, see regtry(). */
static char_u   *reg_startzp[NSUBEXP];  /* Workspace to mark beginning */
static char_u   *reg_endzp[NSUBEXP];    /*   and end of \z(...\) matches */
static lpos_T   reg_startzpos[NSUBEXP]; /* idem, beginning pos */
static lpos_T   reg_endzpos[NSUBEXP];   /* idem, end pos */
#endif

/* TRUE if using multi-line regexp.  vim_regexec_multi() sets "reg_match" to
 * NULL, vim_regexec() and vim_regexec_nl() set it to their argument. */
#define REG_MULTI       (reg_match == NULL)
2911
2912/*
2913 * Match a regexp against a string.
2914 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2915 * Uses curbuf for line count and 'iskeyword'.
2916 *
2917 * Return TRUE if there is a match, FALSE if not.
2918 */
2919 int
2920vim_regexec(rmp, line, col)
2921 regmatch_T *rmp;
2922 char_u *line; /* string to match against */
2923 colnr_T col; /* column to start looking for match */
2924{
2925 reg_match = rmp;
2926 reg_mmatch = NULL;
2927 reg_maxline = 0;
2928 reg_line_lbr = FALSE;
2929 reg_win = NULL;
2930 ireg_ic = rmp->rm_ic;
2931#ifdef FEAT_MBYTE
2932 ireg_icombine = FALSE;
2933#endif
2934 return (vim_regexec_both(line, col) != 0);
2935}
2936
2937#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
2938/*
2939 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
2940 */
2941 int
2942vim_regexec_nl(rmp, line, col)
2943 regmatch_T *rmp;
2944 char_u *line; /* string to match against */
2945 colnr_T col; /* column to start looking for match */
2946{
2947 reg_match = rmp;
2948 reg_mmatch = NULL;
2949 reg_maxline = 0;
2950 reg_line_lbr = TRUE;
2951 reg_win = NULL;
2952 ireg_ic = rmp->rm_ic;
2953#ifdef FEAT_MBYTE
2954 ireg_icombine = FALSE;
2955#endif
2956 return (vim_regexec_both(line, col) != 0);
2957}
2958#endif
2959
2960/*
2961 * Match a regexp against multiple lines.
2962 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2963 * Uses curbuf for line count and 'iskeyword'.
2964 *
2965 * Return zero if there is no match. Return number of lines contained in the
2966 * match otherwise.
2967 */
2968 long
2969vim_regexec_multi(rmp, win, buf, lnum, col)
2970 regmmatch_T *rmp;
2971 win_T *win; /* window in which to search or NULL */
2972 buf_T *buf; /* buffer in which to search */
2973 linenr_T lnum; /* nr of line to start looking for match */
2974 colnr_T col; /* column to start looking for match */
2975{
2976 long r;
2977 buf_T *save_curbuf = curbuf;
2978
2979 reg_match = NULL;
2980 reg_mmatch = rmp;
2981 reg_buf = buf;
2982 reg_win = win;
2983 reg_firstlnum = lnum;
2984 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
2985 reg_line_lbr = FALSE;
2986 ireg_ic = rmp->rmm_ic;
2987#ifdef FEAT_MBYTE
2988 ireg_icombine = FALSE;
2989#endif
2990
2991 /* Need to switch to buffer "buf" to make vim_iswordc() work. */
2992 curbuf = buf;
2993 r = vim_regexec_both(NULL, col);
2994 curbuf = save_curbuf;
2995
2996 return r;
2997}
2998
2999/*
3000 * Match a regexp against a string ("line" points to the string) or multiple
3001 * lines ("line" is NULL, use reg_getline()).
3002 */
3003#ifdef HAVE_SETJMP_H
3004 static long
3005vim_regexec_both(line_arg, col_arg)
3006 char_u *line_arg;
3007 colnr_T col_arg; /* column to start looking for match */
3008#else
3009 static long
3010vim_regexec_both(line, col)
3011 char_u *line;
3012 colnr_T col; /* column to start looking for match */
3013#endif
3014{
3015 regprog_T *prog;
3016 char_u *s;
3017 long retval;
3018#ifdef HAVE_SETJMP_H
3019 char_u *line;
3020 colnr_T col;
3021#endif
3022
3023 reg_tofree = NULL;
3024
3025#ifdef HAVE_TRY_EXCEPT
3026 __try
3027 {
3028#endif
3029
3030#ifdef HAVE_SETJMP_H
3031 /*
3032 * Matching with a regexp may cause a very deep recursive call of
3033 * regmatch(). Vim will crash when running out of stack space. Catch
3034 * this here if the system supports it.
3035 */
3036 mch_startjmp();
3037 if (SETJMP(lc_jump_env) != 0)
3038 {
3039 mch_didjmp();
3040# ifdef SIGHASARG
3041 if (lc_signal != SIGINT)
3042# endif
3043 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3044 retval = 0L;
3045 goto theend;
3046 }
3047
3048 /* Trick to avoid "might be clobbered by `longjmp'" warning from gcc. */
3049 line = line_arg;
3050 col = col_arg;
3051#endif
3052 retval = 0L;
3053
3054 if (REG_MULTI)
3055 {
3056 prog = reg_mmatch->regprog;
3057 line = reg_getline((linenr_T)0);
3058 reg_startpos = reg_mmatch->startpos;
3059 reg_endpos = reg_mmatch->endpos;
3060 }
3061 else
3062 {
3063 prog = reg_match->regprog;
3064 reg_startp = reg_match->startp;
3065 reg_endp = reg_match->endp;
3066 }
3067
3068 /* Be paranoid... */
3069 if (prog == NULL || line == NULL)
3070 {
3071 EMSG(_(e_null));
3072 goto theend;
3073 }
3074
3075 /* Check validity of program. */
3076 if (prog_magic_wrong())
3077 goto theend;
3078
3079 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
3080 if (prog->regflags & RF_ICASE)
3081 ireg_ic = TRUE;
3082 else if (prog->regflags & RF_NOICASE)
3083 ireg_ic = FALSE;
3084
3085#ifdef FEAT_MBYTE
3086 /* If pattern contains "\Z" overrule value of ireg_icombine */
3087 if (prog->regflags & RF_ICOMBINE)
3088 ireg_icombine = TRUE;
3089#endif
3090
3091 /* If there is a "must appear" string, look for it. */
3092 if (prog->regmust != NULL)
3093 {
3094 int c;
3095
3096#ifdef FEAT_MBYTE
3097 if (has_mbyte)
3098 c = (*mb_ptr2char)(prog->regmust);
3099 else
3100#endif
3101 c = *prog->regmust;
3102 s = line + col;
3103 while ((s = cstrchr(s, c)) != NULL)
3104 {
3105 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
3106 break; /* Found it. */
3107#ifdef FEAT_MBYTE
3108 if (has_mbyte)
3109 s += (*mb_ptr2len_check)(s);
3110 else
3111#endif
3112 ++s;
3113 }
3114 if (s == NULL) /* Not present. */
3115 goto theend;
3116 }
3117
3118 regline = line;
3119 reglnum = 0;
3120 out_of_stack = FALSE;
3121
3122 /* Simplest case: Anchored match need be tried only once. */
3123 if (prog->reganch)
3124 {
3125 int c;
3126
3127#ifdef FEAT_MBYTE
3128 if (has_mbyte)
3129 c = (*mb_ptr2char)(regline + col);
3130 else
3131#endif
3132 c = regline[col];
3133 if (prog->regstart == NUL
3134 || prog->regstart == c
3135 || (ireg_ic && ((
3136#ifdef FEAT_MBYTE
3137 (enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
3138 || (c < 255 && prog->regstart < 255 &&
3139#endif
3140 TOLOWER_LOC(prog->regstart) == TOLOWER_LOC(c)))))
3141 retval = regtry(prog, col);
3142 else
3143 retval = 0;
3144 }
3145 else
3146 {
3147 /* Messy cases: unanchored match. */
3148 while (!got_int && !out_of_stack)
3149 {
3150 if (prog->regstart != NUL)
3151 {
3152 /* Skip until the char we know it must start with. */
3153 s = cstrchr(regline + col, prog->regstart);
3154 if (s == NULL)
3155 {
3156 retval = 0;
3157 break;
3158 }
3159 col = (int)(s - regline);
3160 }
3161
3162 retval = regtry(prog, col);
3163 if (retval > 0)
3164 break;
3165
3166 /* if not currently on the first line, get it again */
3167 if (reglnum != 0)
3168 {
3169 regline = reg_getline((linenr_T)0);
3170 reglnum = 0;
3171 }
3172 if (regline[col] == NUL)
3173 break;
3174#ifdef FEAT_MBYTE
3175 if (has_mbyte)
3176 col += (*mb_ptr2len_check)(regline + col);
3177 else
3178#endif
3179 ++col;
3180 }
3181 }
3182
3183 if (out_of_stack)
3184 EMSG(_("E363: pattern caused out-of-stack error"));
3185
3186#ifdef HAVE_TRY_EXCEPT
3187 }
3188 __except(EXCEPTION_EXECUTE_HANDLER)
3189 {
3190 if (GetExceptionCode() == EXCEPTION_STACK_OVERFLOW)
3191 {
3192 RESETSTKOFLW();
3193 EMSG(_("E363: pattern caused out-of-stack error"));
3194 }
3195 else
3196 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3197 retval = 0L;
3198 }
3199#endif
3200
3201theend:
3202 /* Didn't find a match. */
3203 vim_free(reg_tofree);
3204#ifdef HAVE_SETJMP_H
3205 mch_endjmp();
3206#endif
3207 return retval;
3208}
3209
3210#ifdef FEAT_SYN_HL
3211static reg_extmatch_T *make_extmatch __ARGS((void));
3212
3213/*
3214 * Create a new extmatch and mark it as referenced once.
3215 */
3216 static reg_extmatch_T *
3217make_extmatch()
3218{
3219 reg_extmatch_T *em;
3220
3221 em = (reg_extmatch_T *)alloc_clear((unsigned)sizeof(reg_extmatch_T));
3222 if (em != NULL)
3223 em->refcnt = 1;
3224 return em;
3225}
3226
3227/*
3228 * Add a reference to an extmatch.
3229 */
3230 reg_extmatch_T *
3231ref_extmatch(em)
3232 reg_extmatch_T *em;
3233{
3234 if (em != NULL)
3235 em->refcnt++;
3236 return em;
3237}
3238
3239/*
3240 * Remove a reference to an extmatch. If there are no references left, free
3241 * the info.
3242 */
3243 void
3244unref_extmatch(em)
3245 reg_extmatch_T *em;
3246{
3247 int i;
3248
3249 if (em != NULL && --em->refcnt <= 0)
3250 {
3251 for (i = 0; i < NSUBEXP; ++i)
3252 vim_free(em->matches[i]);
3253 vim_free(em);
3254 }
3255}
3256#endif
3257
3258/*
3259 * regtry - try match of "prog" with at regline["col"].
3260 * Returns 0 for failure, number of lines contained in the match otherwise.
3261 */
3262 static long
3263regtry(prog, col)
3264 regprog_T *prog;
3265 colnr_T col;
3266{
3267 reginput = regline + col;
3268 need_clear_subexpr = TRUE;
3269#ifdef FEAT_SYN_HL
3270 /* Clear the external match subpointers if necessary. */
3271 if (prog->reghasz == REX_SET)
3272 need_clear_zsubexpr = TRUE;
3273#endif
3274
3275 if (regmatch(prog->program + 1))
3276 {
3277 cleanup_subexpr();
3278 if (REG_MULTI)
3279 {
3280 if (reg_startpos[0].lnum < 0)
3281 {
3282 reg_startpos[0].lnum = 0;
3283 reg_startpos[0].col = col;
3284 }
3285 if (reg_endpos[0].lnum < 0)
3286 {
3287 reg_endpos[0].lnum = reglnum;
3288 reg_endpos[0].col = (int)(reginput - regline);
3289 }
3290 else
3291 /* Use line number of "\ze". */
3292 reglnum = reg_endpos[0].lnum;
3293 }
3294 else
3295 {
3296 if (reg_startp[0] == NULL)
3297 reg_startp[0] = regline + col;
3298 if (reg_endp[0] == NULL)
3299 reg_endp[0] = reginput;
3300 }
3301#ifdef FEAT_SYN_HL
3302 /* Package any found \z(...\) matches for export. Default is none. */
3303 unref_extmatch(re_extmatch_out);
3304 re_extmatch_out = NULL;
3305
3306 if (prog->reghasz == REX_SET)
3307 {
3308 int i;
3309
3310 cleanup_zsubexpr();
3311 re_extmatch_out = make_extmatch();
3312 for (i = 0; i < NSUBEXP; i++)
3313 {
3314 if (REG_MULTI)
3315 {
3316 /* Only accept single line matches. */
3317 if (reg_startzpos[i].lnum >= 0
3318 && reg_endzpos[i].lnum == reg_startzpos[i].lnum)
3319 re_extmatch_out->matches[i] =
3320 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3321 + reg_startzpos[i].col,
3322 reg_endzpos[i].col - reg_startzpos[i].col);
3323 }
3324 else
3325 {
3326 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3327 re_extmatch_out->matches[i] =
3328 vim_strnsave(reg_startzp[i],
3329 (int)(reg_endzp[i] - reg_startzp[i]));
3330 }
3331 }
3332 }
3333#endif
3334 return 1 + reglnum;
3335 }
3336 return 0;
3337}
3338
#ifdef FEAT_MBYTE
/* multi-byte: advance reginput with a function */
# define ADVANCE_REGINPUT() advance_reginput()

static void advance_reginput __ARGS((void));
static int reg_prev_class __ARGS((void));

/*
 * Move "reginput" to the next character: one byte, or the length of the
 * multi-byte character when "has_mbyte" is set.
 */
    static void
advance_reginput()
{
    if (!has_mbyte)
    {
        ++reginput;
        return;
    }
    reginput += (*mb_ptr2len_check)(reginput);
}

/*
 * Get class of previous character.
 * Returns -1 when "reginput" is at the start of the line.
 */
    static int
reg_prev_class()
{
    char_u      *prev;

    if (reginput <= regline)
        return -1;

    /* Go back one byte and then to the first byte of that character. */
    prev = reginput - 1;
    return mb_get_class(prev - (*mb_head_off)(regline, prev));
}

#else
/* No multi-byte: It's too simple to make a function for. */
# define ADVANCE_REGINPUT() ++reginput
#endif
3371
3372/*
3373 * The arguments from BRACE_LIMITS are stored here. They are actually local
3374 * to regmatch(), but they are here to reduce the amount of stack space used
3375 * (it can be called recursively many times).
3376 */
3377static long bl_minval;
3378static long bl_maxval;
3379
3380/*
3381 * regmatch - main matching routine
3382 *
3383 * Conceptually the strategy is simple: Check to see whether the current
3384 * node matches, call self recursively to see whether the rest matches,
3385 * and then act accordingly. In practice we make some effort to avoid
3386 * recursion, in particular by going through "ordinary" nodes (that don't
3387 * need to know whether the rest of the match failed) by a loop instead of
3388 * by recursion.
3389 *
3390 * Returns TRUE when there is a match. Leaves reginput and reglnum just after
3391 * the last matched character.
3392 * Returns FALSE when there is no match. Leaves reginput and reglnum in an
3393 * undefined state!
3394 */
3395 static int
3396regmatch(scan)
3397 char_u *scan; /* Current node. */
3398{
3399 char_u *next; /* Next node. */
3400 int op;
3401 int c;
3402
3403#ifdef HAVE_GETRLIMIT
3404 /* Check if we are running out of stack space. Could be caused by
3405 * recursively calling ourselves. */
3406 if (out_of_stack || mch_stackcheck((char *)&op) == FAIL)
3407 {
3408 out_of_stack = TRUE;
3409 return FALSE;
3410 }
3411#endif
3412
3413 /* Some patterns my cause a long time to match, even though they are not
3414 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */
3415 fast_breakcheck();
3416
3417#ifdef DEBUG
3418 if (scan != NULL && regnarrate)
3419 {
3420 mch_errmsg(regprop(scan));
3421 mch_errmsg("(\n");
3422 }
3423#endif
3424 while (scan != NULL)
3425 {
3426 if (got_int || out_of_stack)
3427 return FALSE;
3428#ifdef DEBUG
3429 if (regnarrate)
3430 {
3431 mch_errmsg(regprop(scan));
3432 mch_errmsg("...\n");
3433# ifdef FEAT_SYN_HL
3434 if (re_extmatch_in != NULL)
3435 {
3436 int i;
3437
3438 mch_errmsg(_("External submatches:\n"));
3439 for (i = 0; i < NSUBEXP; i++)
3440 {
3441 mch_errmsg(" \"");
3442 if (re_extmatch_in->matches[i] != NULL)
3443 mch_errmsg(re_extmatch_in->matches[i]);
3444 mch_errmsg("\"\n");
3445 }
3446 }
3447# endif
3448 }
3449#endif
3450 next = regnext(scan);
3451
3452 op = OP(scan);
3453 /* Check for character class with NL added. */
3454 if (WITH_NL(op) && *reginput == NUL && reglnum < reg_maxline)
3455 {
3456 reg_nextline();
3457 }
3458 else if (reg_line_lbr && WITH_NL(op) && *reginput == '\n')
3459 {
3460 ADVANCE_REGINPUT();
3461 }
3462 else
3463 {
3464 if (WITH_NL(op))
3465 op -= ADD_NL;
3466#ifdef FEAT_MBYTE
3467 if (has_mbyte)
3468 c = (*mb_ptr2char)(reginput);
3469 else
3470#endif
3471 c = *reginput;
3472 switch (op)
3473 {
3474 case BOL:
3475 if (reginput != regline)
3476 return FALSE;
3477 break;
3478
3479 case EOL:
3480 if (c != NUL)
3481 return FALSE;
3482 break;
3483
3484 case RE_BOF:
3485 /* Passing -1 to the getline() function provided for the search
3486 * should always return NULL if the current line is the first
3487 * line of the file. */
3488 if (reglnum != 0 || reginput != regline
3489 || (REG_MULTI && reg_getline((linenr_T)-1) != NULL))
3490 return FALSE;
3491 break;
3492
3493 case RE_EOF:
3494 if (reglnum != reg_maxline || c != NUL)
3495 return FALSE;
3496 break;
3497
3498 case CURSOR:
3499 /* Check if the buffer is in a window and compare the
3500 * reg_win->w_cursor position to the match position. */
3501 if (reg_win == NULL
3502 || (reglnum + reg_firstlnum != reg_win->w_cursor.lnum)
3503 || ((colnr_T)(reginput - regline) != reg_win->w_cursor.col))
3504 return FALSE;
3505 break;
3506
3507 case RE_LNUM:
3508 if (!REG_MULTI || !re_num_cmp((long_u)(reglnum + reg_firstlnum),
3509 scan))
3510 return FALSE;
3511 break;
3512
3513 case RE_COL:
3514 if (!re_num_cmp((long_u)(reginput - regline) + 1, scan))
3515 return FALSE;
3516 break;
3517
3518 case RE_VCOL:
3519 if (!re_num_cmp((long_u)win_linetabsize(
3520 reg_win == NULL ? curwin : reg_win,
3521 regline, (colnr_T)(reginput - regline)) + 1, scan))
3522 return FALSE;
3523 break;
3524
3525 case BOW: /* \<word; reginput points to w */
3526 if (c == NUL) /* Can't match at end of line */
3527 return FALSE;
3528#ifdef FEAT_MBYTE
3529 if (has_mbyte)
3530 {
3531 int this_class;
3532
3533 /* Get class of current and previous char (if it exists). */
3534 this_class = mb_get_class(reginput);
3535 if (this_class <= 1)
3536 return FALSE; /* not on a word at all */
3537 if (reg_prev_class() == this_class)
3538 return FALSE; /* previous char is in same word */
3539 }
3540#endif
3541 else
3542 {
3543 if (!vim_iswordc(c)
3544 || (reginput > regline && vim_iswordc(reginput[-1])))
3545 return FALSE;
3546 }
3547 break;
3548
3549 case EOW: /* word\>; reginput points after d */
3550 if (reginput == regline) /* Can't match at start of line */
3551 return FALSE;
3552#ifdef FEAT_MBYTE
3553 if (has_mbyte)
3554 {
3555 int this_class, prev_class;
3556
3557 /* Get class of current and previous char (if it exists). */
3558 this_class = mb_get_class(reginput);
3559 prev_class = reg_prev_class();
3560 if (this_class == prev_class)
3561 return FALSE;
3562 if (prev_class == 0 || prev_class == 1)
3563 return FALSE;
3564 }
3565 else
3566#endif
3567 {
3568 if (!vim_iswordc(reginput[-1]))
3569 return FALSE;
3570 if (reginput[0] != NUL && vim_iswordc(c))
3571 return FALSE;
3572 }
3573 break; /* Matched with EOW */
3574
3575 case ANY:
3576 if (c == NUL)
3577 return FALSE;
3578 ADVANCE_REGINPUT();
3579 break;
3580
3581 case IDENT:
3582 if (!vim_isIDc(c))
3583 return FALSE;
3584 ADVANCE_REGINPUT();
3585 break;
3586
3587 case SIDENT:
3588 if (VIM_ISDIGIT(*reginput) || !vim_isIDc(c))
3589 return FALSE;
3590 ADVANCE_REGINPUT();
3591 break;
3592
3593 case KWORD:
3594 if (!vim_iswordp(reginput))
3595 return FALSE;
3596 ADVANCE_REGINPUT();
3597 break;
3598
3599 case SKWORD:
3600 if (VIM_ISDIGIT(*reginput) || !vim_iswordp(reginput))
3601 return FALSE;
3602 ADVANCE_REGINPUT();
3603 break;
3604
3605 case FNAME:
3606 if (!vim_isfilec(c))
3607 return FALSE;
3608 ADVANCE_REGINPUT();
3609 break;
3610
3611 case SFNAME:
3612 if (VIM_ISDIGIT(*reginput) || !vim_isfilec(c))
3613 return FALSE;
3614 ADVANCE_REGINPUT();
3615 break;
3616
3617 case PRINT:
3618 if (ptr2cells(reginput) != 1)
3619 return FALSE;
3620 ADVANCE_REGINPUT();
3621 break;
3622
3623 case SPRINT:
3624 if (VIM_ISDIGIT(*reginput) || ptr2cells(reginput) != 1)
3625 return FALSE;
3626 ADVANCE_REGINPUT();
3627 break;
3628
3629 case WHITE:
3630 if (!vim_iswhite(c))
3631 return FALSE;
3632 ADVANCE_REGINPUT();
3633 break;
3634
3635 case NWHITE:
3636 if (c == NUL || vim_iswhite(c))
3637 return FALSE;
3638 ADVANCE_REGINPUT();
3639 break;
3640
3641 case DIGIT:
3642 if (!ri_digit(c))
3643 return FALSE;
3644 ADVANCE_REGINPUT();
3645 break;
3646
3647 case NDIGIT:
3648 if (c == NUL || ri_digit(c))
3649 return FALSE;
3650 ADVANCE_REGINPUT();
3651 break;
3652
3653 case HEX:
3654 if (!ri_hex(c))
3655 return FALSE;
3656 ADVANCE_REGINPUT();
3657 break;
3658
3659 case NHEX:
3660 if (c == NUL || ri_hex(c))
3661 return FALSE;
3662 ADVANCE_REGINPUT();
3663 break;
3664
3665 case OCTAL:
3666 if (!ri_octal(c))
3667 return FALSE;
3668 ADVANCE_REGINPUT();
3669 break;
3670
3671 case NOCTAL:
3672 if (c == NUL || ri_octal(c))
3673 return FALSE;
3674 ADVANCE_REGINPUT();
3675 break;
3676
3677 case WORD:
3678 if (!ri_word(c))
3679 return FALSE;
3680 ADVANCE_REGINPUT();
3681 break;
3682
3683 case NWORD:
3684 if (c == NUL || ri_word(c))
3685 return FALSE;
3686 ADVANCE_REGINPUT();
3687 break;
3688
3689 case HEAD:
3690 if (!ri_head(c))
3691 return FALSE;
3692 ADVANCE_REGINPUT();
3693 break;
3694
3695 case NHEAD:
3696 if (c == NUL || ri_head(c))
3697 return FALSE;
3698 ADVANCE_REGINPUT();
3699 break;
3700
3701 case ALPHA:
3702 if (!ri_alpha(c))
3703 return FALSE;
3704 ADVANCE_REGINPUT();
3705 break;
3706
3707 case NALPHA:
3708 if (c == NUL || ri_alpha(c))
3709 return FALSE;
3710 ADVANCE_REGINPUT();
3711 break;
3712
3713 case LOWER:
3714 if (!ri_lower(c))
3715 return FALSE;
3716 ADVANCE_REGINPUT();
3717 break;
3718
3719 case NLOWER:
3720 if (c == NUL || ri_lower(c))
3721 return FALSE;
3722 ADVANCE_REGINPUT();
3723 break;
3724
3725 case UPPER:
3726 if (!ri_upper(c))
3727 return FALSE;
3728 ADVANCE_REGINPUT();
3729 break;
3730
3731 case NUPPER:
3732 if (c == NUL || ri_upper(c))
3733 return FALSE;
3734 ADVANCE_REGINPUT();
3735 break;
3736
3737 case EXACTLY:
3738 {
3739 int len;
3740 char_u *opnd;
3741
3742 opnd = OPERAND(scan);
3743 /* Inline the first byte, for speed. */
3744 if (*opnd != *reginput
3745 && (!ireg_ic || (
3746#ifdef FEAT_MBYTE
3747 !enc_utf8 &&
3748#endif
3749 TOLOWER_LOC(*opnd) != TOLOWER_LOC(*reginput))))
3750 return FALSE;
3751 if (*opnd == NUL)
3752 {
3753 /* match empty string always works; happens when "~" is
3754 * empty. */
3755 }
3756 else if (opnd[1] == NUL
3757#ifdef FEAT_MBYTE
3758 && !(enc_utf8 && ireg_ic)
3759#endif
3760 )
3761 ++reginput; /* matched a single char */
3762 else
3763 {
3764 len = (int)STRLEN(opnd);
3765 /* Need to match first byte again for multi-byte. */
3766 if (cstrncmp(opnd, reginput, &len) != 0)
3767 return FALSE;
3768#ifdef FEAT_MBYTE
3769 /* Check for following composing character. */
3770 if (enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len))
3771 {
3772 /* raaron: This code makes a composing character get
3773 * ignored, which is the correct behavior (sometimes)
3774 * for voweled Hebrew texts. */
3775 if (!ireg_icombine)
3776 return FALSE;
3777 }
3778 else
3779#endif
3780 reginput += len;
3781 }
3782 }
3783 break;
3784
3785 case ANYOF:
3786 case ANYBUT:
3787 if (c == NUL)
3788 return FALSE;
3789 if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3790 return FALSE;
3791 ADVANCE_REGINPUT();
3792 break;
3793
3794#ifdef FEAT_MBYTE
3795 case MULTIBYTECODE:
3796 if (has_mbyte)
3797 {
3798 int i, len;
3799 char_u *opnd;
3800
3801 opnd = OPERAND(scan);
3802 /* Safety check (just in case 'encoding' was changed since
3803 * compiling the program). */
3804 if ((len = (*mb_ptr2len_check)(opnd)) < 2)
3805 return FALSE;
3806 for (i = 0; i < len; ++i)
3807 if (opnd[i] != reginput[i])
3808 return FALSE;
3809 reginput += len;
3810 }
3811 else
3812 return FALSE;
3813 break;
3814#endif
3815
3816 case NOTHING:
3817 break;
3818
3819 case BACK:
3820 break;
3821
3822 case MOPEN + 0: /* Match start: \zs */
3823 case MOPEN + 1: /* \( */
3824 case MOPEN + 2:
3825 case MOPEN + 3:
3826 case MOPEN + 4:
3827 case MOPEN + 5:
3828 case MOPEN + 6:
3829 case MOPEN + 7:
3830 case MOPEN + 8:
3831 case MOPEN + 9:
3832 {
3833 int no;
3834 save_se_T save;
3835
3836 no = op - MOPEN;
3837 cleanup_subexpr();
3838 save_se(&save, &reg_startpos[no], &reg_startp[no]);
3839
3840 if (regmatch(next))
3841 return TRUE;
3842
3843 restore_se(&save, &reg_startpos[no], &reg_startp[no]);
3844 return FALSE;
3845 }
3846 /* break; Not Reached */
3847
3848 case NOPEN: /* \%( */
3849 case NCLOSE: /* \) after \%( */
3850 if (regmatch(next))
3851 return TRUE;
3852 return FALSE;
3853 /* break; Not Reached */
3854
3855#ifdef FEAT_SYN_HL
3856 case ZOPEN + 1:
3857 case ZOPEN + 2:
3858 case ZOPEN + 3:
3859 case ZOPEN + 4:
3860 case ZOPEN + 5:
3861 case ZOPEN + 6:
3862 case ZOPEN + 7:
3863 case ZOPEN + 8:
3864 case ZOPEN + 9:
3865 {
3866 int no;
3867 save_se_T save;
3868
3869 no = op - ZOPEN;
3870 cleanup_zsubexpr();
3871 save_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3872
3873 if (regmatch(next))
3874 return TRUE;
3875
3876 restore_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3877 return FALSE;
3878 }
3879 /* break; Not Reached */
3880#endif
3881
3882 case MCLOSE + 0: /* Match end: \ze */
3883 case MCLOSE + 1: /* \) */
3884 case MCLOSE + 2:
3885 case MCLOSE + 3:
3886 case MCLOSE + 4:
3887 case MCLOSE + 5:
3888 case MCLOSE + 6:
3889 case MCLOSE + 7:
3890 case MCLOSE + 8:
3891 case MCLOSE + 9:
3892 {
3893 int no;
3894 save_se_T save;
3895
3896 no = op - MCLOSE;
3897 cleanup_subexpr();
3898 save_se(&save, &reg_endpos[no], &reg_endp[no]);
3899
3900 if (regmatch(next))
3901 return TRUE;
3902
3903 restore_se(&save, &reg_endpos[no], &reg_endp[no]);
3904 return FALSE;
3905 }
3906 /* break; Not Reached */
3907
3908#ifdef FEAT_SYN_HL
3909 case ZCLOSE + 1: /* \) after \z( */
3910 case ZCLOSE + 2:
3911 case ZCLOSE + 3:
3912 case ZCLOSE + 4:
3913 case ZCLOSE + 5:
3914 case ZCLOSE + 6:
3915 case ZCLOSE + 7:
3916 case ZCLOSE + 8:
3917 case ZCLOSE + 9:
3918 {
3919 int no;
3920 save_se_T save;
3921
3922 no = op - ZCLOSE;
3923 cleanup_zsubexpr();
3924 save_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3925
3926 if (regmatch(next))
3927 return TRUE;
3928
3929 restore_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3930 return FALSE;
3931 }
3932 /* break; Not Reached */
3933#endif
3934
3935 case BACKREF + 1:
3936 case BACKREF + 2:
3937 case BACKREF + 3:
3938 case BACKREF + 4:
3939 case BACKREF + 5:
3940 case BACKREF + 6:
3941 case BACKREF + 7:
3942 case BACKREF + 8:
3943 case BACKREF + 9:
3944 {
3945 int no;
3946 int len;
3947 linenr_T clnum;
3948 colnr_T ccol;
3949 char_u *p;
3950
3951 no = op - BACKREF;
3952 cleanup_subexpr();
3953 if (!REG_MULTI) /* Single-line regexp */
3954 {
3955 if (reg_endp[no] == NULL)
3956 {
3957 /* Backref was not set: Match an empty string. */
3958 len = 0;
3959 }
3960 else
3961 {
3962 /* Compare current input with back-ref in the same
3963 * line. */
3964 len = (int)(reg_endp[no] - reg_startp[no]);
3965 if (cstrncmp(reg_startp[no], reginput, &len) != 0)
3966 return FALSE;
3967 }
3968 }
3969 else /* Multi-line regexp */
3970 {
3971 if (reg_endpos[no].lnum < 0)
3972 {
3973 /* Backref was not set: Match an empty string. */
3974 len = 0;
3975 }
3976 else
3977 {
3978 if (reg_startpos[no].lnum == reglnum
3979 && reg_endpos[no].lnum == reglnum)
3980 {
3981 /* Compare back-ref within the current line. */
3982 len = reg_endpos[no].col - reg_startpos[no].col;
3983 if (cstrncmp(regline + reg_startpos[no].col,
3984 reginput, &len) != 0)
3985 return FALSE;
3986 }
3987 else
3988 {
3989 /* Messy situation: Need to compare between two
3990 * lines. */
3991 ccol = reg_startpos[no].col;
3992 clnum = reg_startpos[no].lnum;
3993 for (;;)
3994 {
3995 /* Since getting one line may invalidate
3996 * the other, need to make copy. Slow! */
3997 if (regline != reg_tofree)
3998 {
3999 len = (int)STRLEN(regline);
4000 if (reg_tofree == NULL
4001 || len >= (int)reg_tofreelen)
4002 {
4003 len += 50; /* get some extra */
4004 vim_free(reg_tofree);
4005 reg_tofree = alloc(len);
4006 if (reg_tofree == NULL)
4007 return FALSE; /* out of memory! */
4008 reg_tofreelen = len;
4009 }
4010 STRCPY(reg_tofree, regline);
4011 reginput = reg_tofree
4012 + (reginput - regline);
4013 regline = reg_tofree;
4014 }
4015
4016 /* Get the line to compare with. */
4017 p = reg_getline(clnum);
4018 if (clnum == reg_endpos[no].lnum)
4019 len = reg_endpos[no].col - ccol;
4020 else
4021 len = (int)STRLEN(p + ccol);
4022
4023 if (cstrncmp(p + ccol, reginput, &len) != 0)
4024 return FALSE; /* doesn't match */
4025 if (clnum == reg_endpos[no].lnum)
4026 break; /* match and at end! */
4027 if (reglnum == reg_maxline)
4028 return FALSE; /* text too short */
4029
4030 /* Advance to next line. */
4031 reg_nextline();
4032 ++clnum;
4033 ccol = 0;
4034 if (got_int || out_of_stack)
4035 return FALSE;
4036 }
4037
4038 /* found a match! Note that regline may now point
4039 * to a copy of the line, that should not matter. */
4040 }
4041 }
4042 }
4043
4044 /* Matched the backref, skip over it. */
4045 reginput += len;
4046 }
4047 break;
4048
4049#ifdef FEAT_SYN_HL
4050 case ZREF + 1:
4051 case ZREF + 2:
4052 case ZREF + 3:
4053 case ZREF + 4:
4054 case ZREF + 5:
4055 case ZREF + 6:
4056 case ZREF + 7:
4057 case ZREF + 8:
4058 case ZREF + 9:
4059 {
4060 int no;
4061 int len;
4062
4063 cleanup_zsubexpr();
4064 no = op - ZREF;
4065 if (re_extmatch_in != NULL
4066 && re_extmatch_in->matches[no] != NULL)
4067 {
4068 len = (int)STRLEN(re_extmatch_in->matches[no]);
4069 if (cstrncmp(re_extmatch_in->matches[no],
4070 reginput, &len) != 0)
4071 return FALSE;
4072 reginput += len;
4073 }
4074 else
4075 {
4076 /* Backref was not set: Match an empty string. */
4077 }
4078 }
4079 break;
4080#endif
4081
4082 case BRANCH:
4083 {
4084 if (OP(next) != BRANCH) /* No choice. */
4085 next = OPERAND(scan); /* Avoid recursion. */
4086 else
4087 {
4088 regsave_T save;
4089
4090 do
4091 {
4092 reg_save(&save);
4093 if (regmatch(OPERAND(scan)))
4094 return TRUE;
4095 reg_restore(&save);
4096 scan = regnext(scan);
4097 } while (scan != NULL && OP(scan) == BRANCH);
4098 return FALSE;
4099 /* NOTREACHED */
4100 }
4101 }
4102 break;
4103
4104 case BRACE_LIMITS:
4105 {
4106 int no;
4107
4108 if (OP(next) == BRACE_SIMPLE)
4109 {
4110 bl_minval = OPERAND_MIN(scan);
4111 bl_maxval = OPERAND_MAX(scan);
4112 }
4113 else if (OP(next) >= BRACE_COMPLEX
4114 && OP(next) < BRACE_COMPLEX + 10)
4115 {
4116 no = OP(next) - BRACE_COMPLEX;
4117 brace_min[no] = OPERAND_MIN(scan);
4118 brace_max[no] = OPERAND_MAX(scan);
4119 brace_count[no] = 0;
4120 }
4121 else
4122 {
4123 EMSG(_(e_internal)); /* Shouldn't happen */
4124 return FALSE;
4125 }
4126 }
4127 break;
4128
4129 case BRACE_COMPLEX + 0:
4130 case BRACE_COMPLEX + 1:
4131 case BRACE_COMPLEX + 2:
4132 case BRACE_COMPLEX + 3:
4133 case BRACE_COMPLEX + 4:
4134 case BRACE_COMPLEX + 5:
4135 case BRACE_COMPLEX + 6:
4136 case BRACE_COMPLEX + 7:
4137 case BRACE_COMPLEX + 8:
4138 case BRACE_COMPLEX + 9:
4139 {
4140 int no;
4141 regsave_T save;
4142
4143 no = op - BRACE_COMPLEX;
4144 ++brace_count[no];
4145
4146 /* If not matched enough times yet, try one more */
4147 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4148 ? brace_min[no] : brace_max[no]))
4149 {
4150 reg_save(&save);
4151 if (regmatch(OPERAND(scan)))
4152 return TRUE;
4153 reg_restore(&save);
4154 --brace_count[no]; /* failed, decrement match count */
4155 return FALSE;
4156 }
4157
4158 /* If matched enough times, may try matching some more */
4159 if (brace_min[no] <= brace_max[no])
4160 {
4161 /* Range is the normal way around, use longest match */
4162 if (brace_count[no] <= brace_max[no])
4163 {
4164 reg_save(&save);
4165 if (regmatch(OPERAND(scan)))
4166 return TRUE; /* matched some more times */
4167 reg_restore(&save);
4168 --brace_count[no]; /* matched just enough times */
4169 /* continue with the items after \{} */
4170 }
4171 }
4172 else
4173 {
4174 /* Range is backwards, use shortest match first */
4175 if (brace_count[no] <= brace_min[no])
4176 {
4177 reg_save(&save);
4178 if (regmatch(next))
4179 return TRUE;
4180 reg_restore(&save);
4181 next = OPERAND(scan);
4182 /* must try to match one more item */
4183 }
4184 }
4185 }
4186 break;
4187
4188 case BRACE_SIMPLE:
4189 case STAR:
4190 case PLUS:
4191 {
4192 int nextb; /* next byte */
4193 int nextb_ic; /* next byte reverse case */
4194 long count;
4195 regsave_T save;
4196 long minval;
4197 long maxval;
4198
4199 /*
4200 * Lookahead to avoid useless match attempts when we know
4201 * what character comes next.
4202 */
4203 if (OP(next) == EXACTLY)
4204 {
4205 nextb = *OPERAND(next);
4206 if (ireg_ic)
4207 {
4208 if (isupper(nextb))
4209 nextb_ic = TOLOWER_LOC(nextb);
4210 else
4211 nextb_ic = TOUPPER_LOC(nextb);
4212 }
4213 else
4214 nextb_ic = nextb;
4215 }
4216 else
4217 {
4218 nextb = NUL;
4219 nextb_ic = NUL;
4220 }
4221 if (op != BRACE_SIMPLE)
4222 {
4223 minval = (op == STAR) ? 0 : 1;
4224 maxval = MAX_LIMIT;
4225 }
4226 else
4227 {
4228 minval = bl_minval;
4229 maxval = bl_maxval;
4230 }
4231
4232 /*
4233 * When maxval > minval, try matching as much as possible, up
4234 * to maxval. When maxval < minval, try matching at least the
4235 * minimal number (since the range is backwards, that's also
4236 * maxval!).
4237 */
4238 count = regrepeat(OPERAND(scan), maxval);
4239 if (got_int)
4240 return FALSE;
4241 if (minval <= maxval)
4242 {
4243 /* Range is the normal way around, use longest match */
4244 while (count >= minval)
4245 {
4246 /* If it could match, try it. */
4247 if (nextb == NUL || *reginput == nextb
4248 || *reginput == nextb_ic)
4249 {
4250 reg_save(&save);
4251 if (regmatch(next))
4252 return TRUE;
4253 reg_restore(&save);
4254 }
4255 /* Couldn't or didn't match -- back up one char. */
4256 if (--count < minval)
4257 break;
4258 if (reginput == regline)
4259 {
4260 /* backup to last char of previous line */
4261 --reglnum;
4262 regline = reg_getline(reglnum);
4263 /* Just in case regrepeat() didn't count right. */
4264 if (regline == NULL)
4265 return FALSE;
4266 reginput = regline + STRLEN(regline);
4267 fast_breakcheck();
4268 if (got_int || out_of_stack)
4269 return FALSE;
4270 }
4271 else
4272 {
4273 --reginput;
4274#ifdef FEAT_MBYTE
4275 if (has_mbyte)
4276 reginput -= (*mb_head_off)(regline, reginput);
4277#endif
4278 }
4279 }
4280 }
4281 else
4282 {
4283 /* Range is backwards, use shortest match first.
4284 * Careful: maxval and minval are exchanged! */
4285 if (count < maxval)
4286 return FALSE;
4287 for (;;)
4288 {
4289 /* If it could work, try it. */
4290 if (nextb == NUL || *reginput == nextb
4291 || *reginput == nextb_ic)
4292 {
4293 reg_save(&save);
4294 if (regmatch(next))
4295 return TRUE;
4296 reg_restore(&save);
4297 }
4298 /* Couldn't or didn't match: try advancing one char. */
4299 if (count == minval
4300 || regrepeat(OPERAND(scan), 1L) == 0)
4301 break;
4302 ++count;
4303 if (got_int || out_of_stack)
4304 return FALSE;
4305 }
4306 }
4307 return FALSE;
4308 }
4309 /* break; Not Reached */
4310
4311 case NOMATCH:
4312 {
4313 regsave_T save;
4314
4315 /* If the operand matches, we fail. Otherwise backup and
4316 * continue with the next item. */
4317 reg_save(&save);
4318 if (regmatch(OPERAND(scan)))
4319 return FALSE;
4320 reg_restore(&save);
4321 }
4322 break;
4323
4324 case MATCH:
4325 case SUBPAT:
4326 {
4327 regsave_T save;
4328
4329 /* If the operand doesn't match, we fail. Otherwise backup
4330 * and continue with the next item. */
4331 reg_save(&save);
4332 if (!regmatch(OPERAND(scan)))
4333 return FALSE;
4334 if (op == MATCH) /* zero-width */
4335 reg_restore(&save);
4336 }
4337 break;
4338
4339 case BEHIND:
4340 case NOBEHIND:
4341 {
4342 regsave_T save_after, save_start;
4343 regsave_T save_behind_pos;
4344 int needmatch = (op == BEHIND);
4345
4346 /*
4347 * Look back in the input if the operand matches or not. This
4348 * must be done at every position in the input and checking if
4349 * the match ends at the current position.
4350 * First check if the next item matches, that's probably
4351 * faster.
4352 */
4353 reg_save(&save_start);
4354 if (regmatch(next))
4355 {
4356 /* save the position after the found match for next */
4357 reg_save(&save_after);
4358
4359 /* start looking for a match with operand at the current
4360 * position. Go back one character until we find the
4361 * result, hitting the start of the line or the previous
4362 * line (for multi-line matching).
4363 * Set behind_pos to where the match should end, BHPOS
4364 * will match it. */
4365 save_behind_pos = behind_pos;
4366 behind_pos = save_start;
4367 for (;;)
4368 {
4369 reg_restore(&save_start);
4370 if (regmatch(OPERAND(scan))
4371 && reg_save_equal(&behind_pos))
4372 {
4373 behind_pos = save_behind_pos;
4374 /* found a match that ends where "next" started */
4375 if (needmatch)
4376 {
4377 reg_restore(&save_after);
4378 return TRUE;
4379 }
4380 return FALSE;
4381 }
4382 /*
4383 * No match: Go back one character. May go to
4384 * previous line once.
4385 */
4386 if (REG_MULTI)
4387 {
4388 if (save_start.rs_u.pos.col == 0)
4389 {
4390 if (save_start.rs_u.pos.lnum
4391 < behind_pos.rs_u.pos.lnum
4392 || reg_getline(
4393 --save_start.rs_u.pos.lnum) == NULL)
4394 break;
4395 reg_restore(&save_start);
4396 save_start.rs_u.pos.col =
4397 (colnr_T)STRLEN(regline);
4398 }
4399 else
4400 --save_start.rs_u.pos.col;
4401 }
4402 else
4403 {
4404 if (save_start.rs_u.ptr == regline)
4405 break;
4406 --save_start.rs_u.ptr;
4407 }
4408 }
4409
4410 /* NOBEHIND succeeds when no match was found */
4411 behind_pos = save_behind_pos;
4412 if (!needmatch)
4413 {
4414 reg_restore(&save_after);
4415 return TRUE;
4416 }
4417 }
4418 return FALSE;
4419 }
4420
4421 case BHPOS:
4422 if (REG_MULTI)
4423 {
4424 if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline)
4425 || behind_pos.rs_u.pos.lnum != reglnum)
4426 return FALSE;
4427 }
4428 else if (behind_pos.rs_u.ptr != reginput)
4429 return FALSE;
4430 break;
4431
4432 case NEWL:
4433 if ((c != NUL || reglnum == reg_maxline)
4434 && (c != '\n' || !reg_line_lbr))
4435 return FALSE;
4436 if (reg_line_lbr)
4437 ADVANCE_REGINPUT();
4438 else
4439 reg_nextline();
4440 break;
4441
4442 case END:
4443 return TRUE; /* Success! */
4444
4445 default:
4446 EMSG(_(e_re_corr));
4447#ifdef DEBUG
4448 printf("Illegal op code %d\n", op);
4449#endif
4450 return FALSE;
4451 }
4452 }
4453
4454 scan = next;
4455 }
4456
4457 /*
4458 * We get here only if there's trouble -- normally "case END" is the
4459 * terminating point.
4460 */
4461 EMSG(_(e_re_corr));
4462#ifdef DEBUG
4463 printf("Premature EOL\n");
4464#endif
4465 return FALSE;
4466}
4467
/*
 * ADVANCE_P(x) - move pointer "x" forward to the next character.
 *
 * With FEAT_MBYTE and "has_mbyte" set the pointer is advanced by the length
 * that mb_ptr2len_check() reports for the character at "x", otherwise by one
 * byte.
 *
 * The multi-statement form is wrapped in do { } while (0) so that the macro
 * behaves as one statement (it can be the body of an unbraced if/else) and
 * the argument is parenthesized.  "x" is evaluated more than once: do not
 * pass an expression with side effects.  Follow the invocation with ';'.
 */
#ifdef FEAT_MBYTE
# define ADVANCE_P(x) \
    do { \
        if (has_mbyte) \
            (x) += (*mb_ptr2len_check)(x); \
        else \
            ++(x); \
    } while (0)
#else
# define ADVANCE_P(x) (++(x))
#endif
4473
4474/*
4475 * regrepeat - repeatedly match something simple, return how many.
4476 * Advances reginput (and reglnum) to just after the matched chars.
4477 */
4478 static int
4479regrepeat(p, maxcount)
4480 char_u *p;
4481 long maxcount; /* maximum number of matches allowed */
4482{
4483 long count = 0;
4484 char_u *scan;
4485 char_u *opnd;
4486 int mask;
4487 int testval = 0;
4488
4489 scan = reginput; /* Make local copy of reginput for speed. */
4490 opnd = OPERAND(p);
4491 switch (OP(p))
4492 {
4493 case ANY:
4494 case ANY + ADD_NL:
4495 while (count < maxcount)
4496 {
4497 /* Matching anything means we continue until end-of-line (or
4498 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
4499 while (*scan != NUL && count < maxcount)
4500 {
4501 ++count;
4502 ADVANCE_P(scan);
4503 }
4504 if (!WITH_NL(OP(p)) || reglnum == reg_maxline || count == maxcount)
4505 break;
4506 ++count; /* count the line-break */
4507 reg_nextline();
4508 scan = reginput;
4509 if (got_int)
4510 break;
4511 }
4512 break;
4513
4514 case IDENT:
4515 case IDENT + ADD_NL:
4516 testval = TRUE;
4517 /*FALLTHROUGH*/
4518 case SIDENT:
4519 case SIDENT + ADD_NL:
4520 while (count < maxcount)
4521 {
4522 if (vim_isIDc(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4523 {
4524 ADVANCE_P(scan);
4525 }
4526 else if (*scan == NUL)
4527 {
4528 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4529 break;
4530 reg_nextline();
4531 scan = reginput;
4532 if (got_int)
4533 break;
4534 }
4535 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4536 ++scan;
4537 else
4538 break;
4539 ++count;
4540 }
4541 break;
4542
4543 case KWORD:
4544 case KWORD + ADD_NL:
4545 testval = TRUE;
4546 /*FALLTHROUGH*/
4547 case SKWORD:
4548 case SKWORD + ADD_NL:
4549 while (count < maxcount)
4550 {
4551 if (vim_iswordp(scan) && (testval || !VIM_ISDIGIT(*scan)))
4552 {
4553 ADVANCE_P(scan);
4554 }
4555 else if (*scan == NUL)
4556 {
4557 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4558 break;
4559 reg_nextline();
4560 scan = reginput;
4561 if (got_int)
4562 break;
4563 }
4564 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4565 ++scan;
4566 else
4567 break;
4568 ++count;
4569 }
4570 break;
4571
4572 case FNAME:
4573 case FNAME + ADD_NL:
4574 testval = TRUE;
4575 /*FALLTHROUGH*/
4576 case SFNAME:
4577 case SFNAME + ADD_NL:
4578 while (count < maxcount)
4579 {
4580 if (vim_isfilec(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4581 {
4582 ADVANCE_P(scan);
4583 }
4584 else if (*scan == NUL)
4585 {
4586 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4587 break;
4588 reg_nextline();
4589 scan = reginput;
4590 if (got_int)
4591 break;
4592 }
4593 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4594 ++scan;
4595 else
4596 break;
4597 ++count;
4598 }
4599 break;
4600
4601 case PRINT:
4602 case PRINT + ADD_NL:
4603 testval = TRUE;
4604 /*FALLTHROUGH*/
4605 case SPRINT:
4606 case SPRINT + ADD_NL:
4607 while (count < maxcount)
4608 {
4609 if (*scan == NUL)
4610 {
4611 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4612 break;
4613 reg_nextline();
4614 scan = reginput;
4615 if (got_int)
4616 break;
4617 }
4618 else if (ptr2cells(scan) == 1 && (testval || !VIM_ISDIGIT(*scan)))
4619 {
4620 ADVANCE_P(scan);
4621 }
4622 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4623 ++scan;
4624 else
4625 break;
4626 ++count;
4627 }
4628 break;
4629
4630 case WHITE:
4631 case WHITE + ADD_NL:
4632 testval = mask = RI_WHITE;
4633do_class:
4634 while (count < maxcount)
4635 {
4636#ifdef FEAT_MBYTE
4637 int l;
4638#endif
4639 if (*scan == NUL)
4640 {
4641 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4642 break;
4643 reg_nextline();
4644 scan = reginput;
4645 if (got_int)
4646 break;
4647 }
4648#ifdef FEAT_MBYTE
4649 else if (has_mbyte && (l = (*mb_ptr2len_check)(scan)) > 1)
4650 {
4651 if (testval != 0)
4652 break;
4653 scan += l;
4654 }
4655#endif
4656 else if ((class_tab[*scan] & mask) == testval)
4657 ++scan;
4658 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4659 ++scan;
4660 else
4661 break;
4662 ++count;
4663 }
4664 break;
4665
4666 case NWHITE:
4667 case NWHITE + ADD_NL:
4668 mask = RI_WHITE;
4669 goto do_class;
4670 case DIGIT:
4671 case DIGIT + ADD_NL:
4672 testval = mask = RI_DIGIT;
4673 goto do_class;
4674 case NDIGIT:
4675 case NDIGIT + ADD_NL:
4676 mask = RI_DIGIT;
4677 goto do_class;
4678 case HEX:
4679 case HEX + ADD_NL:
4680 testval = mask = RI_HEX;
4681 goto do_class;
4682 case NHEX:
4683 case NHEX + ADD_NL:
4684 mask = RI_HEX;
4685 goto do_class;
4686 case OCTAL:
4687 case OCTAL + ADD_NL:
4688 testval = mask = RI_OCTAL;
4689 goto do_class;
4690 case NOCTAL:
4691 case NOCTAL + ADD_NL:
4692 mask = RI_OCTAL;
4693 goto do_class;
4694 case WORD:
4695 case WORD + ADD_NL:
4696 testval = mask = RI_WORD;
4697 goto do_class;
4698 case NWORD:
4699 case NWORD + ADD_NL:
4700 mask = RI_WORD;
4701 goto do_class;
4702 case HEAD:
4703 case HEAD + ADD_NL:
4704 testval = mask = RI_HEAD;
4705 goto do_class;
4706 case NHEAD:
4707 case NHEAD + ADD_NL:
4708 mask = RI_HEAD;
4709 goto do_class;
4710 case ALPHA:
4711 case ALPHA + ADD_NL:
4712 testval = mask = RI_ALPHA;
4713 goto do_class;
4714 case NALPHA:
4715 case NALPHA + ADD_NL:
4716 mask = RI_ALPHA;
4717 goto do_class;
4718 case LOWER:
4719 case LOWER + ADD_NL:
4720 testval = mask = RI_LOWER;
4721 goto do_class;
4722 case NLOWER:
4723 case NLOWER + ADD_NL:
4724 mask = RI_LOWER;
4725 goto do_class;
4726 case UPPER:
4727 case UPPER + ADD_NL:
4728 testval = mask = RI_UPPER;
4729 goto do_class;
4730 case NUPPER:
4731 case NUPPER + ADD_NL:
4732 mask = RI_UPPER;
4733 goto do_class;
4734
4735 case EXACTLY:
4736 {
4737 int cu, cl;
4738
4739 /* This doesn't do a multi-byte character, because a MULTIBYTECODE
4740 * would have been used for it. */
4741 if (ireg_ic)
4742 {
4743 cu = TOUPPER_LOC(*opnd);
4744 cl = TOLOWER_LOC(*opnd);
4745 while (count < maxcount && (*scan == cu || *scan == cl))
4746 {
4747 count++;
4748 scan++;
4749 }
4750 }
4751 else
4752 {
4753 cu = *opnd;
4754 while (count < maxcount && *scan == cu)
4755 {
4756 count++;
4757 scan++;
4758 }
4759 }
4760 break;
4761 }
4762
4763#ifdef FEAT_MBYTE
4764 case MULTIBYTECODE:
4765 {
4766 int i, len, cf = 0;
4767
4768 /* Safety check (just in case 'encoding' was changed since
4769 * compiling the program). */
4770 if ((len = (*mb_ptr2len_check)(opnd)) > 1)
4771 {
4772 if (ireg_ic && enc_utf8)
4773 cf = utf_fold(utf_ptr2char(opnd));
4774 while (count < maxcount)
4775 {
4776 for (i = 0; i < len; ++i)
4777 if (opnd[i] != scan[i])
4778 break;
4779 if (i < len && (!ireg_ic || !enc_utf8
4780 || utf_fold(utf_ptr2char(scan)) != cf))
4781 break;
4782 scan += len;
4783 ++count;
4784 }
4785 }
4786 }
4787 break;
4788#endif
4789
4790 case ANYOF:
4791 case ANYOF + ADD_NL:
4792 testval = TRUE;
4793 /*FALLTHROUGH*/
4794
4795 case ANYBUT:
4796 case ANYBUT + ADD_NL:
4797 while (count < maxcount)
4798 {
4799#ifdef FEAT_MBYTE
4800 int len;
4801#endif
4802 if (*scan == NUL)
4803 {
4804 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4805 break;
4806 reg_nextline();
4807 scan = reginput;
4808 if (got_int)
4809 break;
4810 }
4811 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4812 ++scan;
4813#ifdef FEAT_MBYTE
4814 else if (has_mbyte && (len = (*mb_ptr2len_check)(scan)) > 1)
4815 {
4816 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
4817 break;
4818 scan += len;
4819 }
4820#endif
4821 else
4822 {
4823 if ((cstrchr(opnd, *scan) == NULL) == testval)
4824 break;
4825 ++scan;
4826 }
4827 ++count;
4828 }
4829 break;
4830
4831 case NEWL:
4832 while (count < maxcount
4833 && ((*scan == NUL && reglnum < reg_maxline)
4834 || (*scan == '\n' && reg_line_lbr)))
4835 {
4836 count++;
4837 if (reg_line_lbr)
4838 ADVANCE_REGINPUT();
4839 else
4840 reg_nextline();
4841 scan = reginput;
4842 if (got_int)
4843 break;
4844 }
4845 break;
4846
4847 default: /* Oh dear. Called inappropriately. */
4848 EMSG(_(e_re_corr));
4849#ifdef DEBUG
4850 printf("Called regrepeat with op code %d\n", OP(p));
4851#endif
4852 break;
4853 }
4854
4855 reginput = scan;
4856
4857 return (int)count;
4858}
4859
4860/*
4861 * regnext - dig the "next" pointer out of a node
4862 */
4863 static char_u *
4864regnext(p)
4865 char_u *p;
4866{
4867 int offset;
4868
4869 if (p == JUST_CALC_SIZE)
4870 return NULL;
4871
4872 offset = NEXT(p);
4873 if (offset == 0)
4874 return NULL;
4875
4876 if (OP(p) == BACK)
4877 return p - offset;
4878 else
4879 return p + offset;
4880}
4881
4882/*
4883 * Check the regexp program for its magic number.
4884 * Return TRUE if it's wrong.
4885 */
4886 static int
4887prog_magic_wrong()
4888{
4889 if (UCHARAT(REG_MULTI
4890 ? reg_mmatch->regprog->program
4891 : reg_match->regprog->program) != REGMAGIC)
4892 {
4893 EMSG(_(e_re_corr));
4894 return TRUE;
4895 }
4896 return FALSE;
4897}
4898
4899/*
4900 * Cleanup the subexpressions, if this wasn't done yet.
4901 * This construction is used to clear the subexpressions only when they are
4902 * used (to increase speed).
4903 */
4904 static void
4905cleanup_subexpr()
4906{
4907 if (need_clear_subexpr)
4908 {
4909 if (REG_MULTI)
4910 {
4911 /* Use 0xff to set lnum to -1 */
4912 vim_memset(reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4913 vim_memset(reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4914 }
4915 else
4916 {
4917 vim_memset(reg_startp, 0, sizeof(char_u *) * NSUBEXP);
4918 vim_memset(reg_endp, 0, sizeof(char_u *) * NSUBEXP);
4919 }
4920 need_clear_subexpr = FALSE;
4921 }
4922}
4923
#ifdef FEAT_SYN_HL
/*
 * Reset the "z" sub-expression start/end info, unless that was already done.
 * Does for reg_startzpos/reg_endzpos and reg_startzp/reg_endzp what
 * cleanup_subexpr() does for the normal sub-expressions;
 * need_clear_zsubexpr tells whether clearing is still pending.
 */
static void
cleanup_zsubexpr()
{
    if (!need_clear_zsubexpr)
        return;                 /* nothing pending */

    if (REG_MULTI)
    {
        /* Filling with 0xff bytes sets lnum to -1 */
        vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
        vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
    }
    else
    {
        vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
        vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
    }
    need_clear_zsubexpr = FALSE;
}
#endif
4945
4946/*
4947 * Advance reglnum, regline and reginput to the next line.
4948 */
4949 static void
4950reg_nextline()
4951{
4952 regline = reg_getline(++reglnum);
4953 reginput = regline;
4954 fast_breakcheck();
4955}
4956
4957/*
4958 * Save the input line and position in a regsave_T.
4959 */
4960 static void
4961reg_save(save)
4962 regsave_T *save;
4963{
4964 if (REG_MULTI)
4965 {
4966 save->rs_u.pos.col = (colnr_T)(reginput - regline);
4967 save->rs_u.pos.lnum = reglnum;
4968 }
4969 else
4970 save->rs_u.ptr = reginput;
4971}
4972
4973/*
4974 * Restore the input line and position from a regsave_T.
4975 */
4976 static void
4977reg_restore(save)
4978 regsave_T *save;
4979{
4980 if (REG_MULTI)
4981 {
4982 if (reglnum != save->rs_u.pos.lnum)
4983 {
4984 /* only call reg_getline() when the line number changed to save
4985 * a bit of time */
4986 reglnum = save->rs_u.pos.lnum;
4987 regline = reg_getline(reglnum);
4988 }
4989 reginput = regline + save->rs_u.pos.col;
4990 }
4991 else
4992 reginput = save->rs_u.ptr;
4993}
4994
4995/*
4996 * Return TRUE if current position is equal to saved position.
4997 */
4998 static int
4999reg_save_equal(save)
5000 regsave_T *save;
5001{
5002 if (REG_MULTI)
5003 return reglnum == save->rs_u.pos.lnum
5004 && reginput == regline + save->rs_u.pos.col;
5005 return reginput == save->rs_u.ptr;
5006}
5007
5008/*
5009 * Tentatively set the sub-expression start to the current position (after
5010 * calling regmatch() they will have changed). Need to save the existing
5011 * values for when there is no match.
5012 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
5013 * depending on REG_MULTI.
5014 */
5015 static void
5016save_se_multi(savep, posp)
5017 save_se_T *savep;
5018 lpos_T *posp;
5019{
5020 savep->se_u.pos = *posp;
5021 posp->lnum = reglnum;
5022 posp->col = (colnr_T)(reginput - regline);
5023}
5024
5025 static void
5026save_se_one(savep, pp)
5027 save_se_T *savep;
5028 char_u **pp;
5029{
5030 savep->se_u.ptr = *pp;
5031 *pp = reginput;
5032}
5033
5034/*
5035 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5036 */
5037 static int
5038re_num_cmp(val, scan)
5039 long_u val;
5040 char_u *scan;
5041{
5042 long_u n = OPERAND_MIN(scan);
5043
5044 if (OPERAND_CMP(scan) == '>')
5045 return val > n;
5046 if (OPERAND_CMP(scan) == '<')
5047 return val < n;
5048 return val == n;
5049}
5050
5051
5052#ifdef DEBUG
5053
/*
 * regdump - dump a regexp onto stdout in vaguely comprehensible form
 *
 * "pattern" is only used for the heading, "r" is the compiled program.
 * One line is printed per node: the offset of the node in r->program, the
 * name returned by regprop(), the offset of the node regnext() returns
 * (0 when there is none) and, for some opcodes, the operand.
 * The regstart, reganch and regmust header fields are printed last.
 */
    static void
regdump(pattern, r)
    char_u      *pattern;
    regprog_T   *r;
{
    char_u  *s;
    int     op = EXACTLY;       /* Arbitrary non-END op. */
    char_u  *next;
    char_u  *end = NULL;        /* furthest "next" node seen so far */

    printf("\r\nregcomp(%s):\r\n", pattern);

    /* The dump starts at the second byte of the program. */
    s = r->program + 1;
    /*
     * Loop until we find the END that isn't before a referred next (an END
     * can also appear in a NOMATCH operand).
     */
    while (op != END || s <= end)
    {
        op = OP(s);
        printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
        next = regnext(s);
        if (next == NULL)       /* Next ptr. */
            printf("(0)");
        else
            printf("(%d)", (int)((s - r->program) + (next - s)));
        /* Remember the furthest node referred to, an END before that one is
         * not the end of the program. */
        if (end < next)
            end = next;
        if (op == BRACE_LIMITS)
        {
            /* Two short ints */
            printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
            /* Step over the operand; the fixed part is skipped below. */
            s += 8;
        }
        /* Skip the fixed part of the node (presumably the opcode byte plus
         * the two-byte "next" offset -- confirm against the node layout). */
        s += 3;
        if (op == ANYOF || op == ANYOF + ADD_NL
                || op == ANYBUT || op == ANYBUT + ADD_NL
                || op == EXACTLY)
        {
            /* Literal string, where present. */
            while (*s != NUL)
                printf("%c", *s++);
            s++;                /* also skip the terminating NUL */
        }
        printf("\r\n");
    }

    /* Header fields of interest. */
    if (r->regstart != NUL)
        printf("start `%s' 0x%x; ", r->regstart < 256
                ? (char *)transchar(r->regstart)
                : "multibyte", r->regstart);
    if (r->reganch)
        printf("anchored; ");
    if (r->regmust != NULL)
        printf("must have \"%s\"", r->regmust);
    printf("\r\n");
}
5115
5116/*
5117 * regprop - printable representation of opcode
5118 */
5119 static char_u *
5120regprop(op)
5121 char_u *op;
5122{
5123 char_u *p;
5124 static char_u buf[50];
5125
5126 (void) strcpy(buf, ":");
5127
5128 switch (OP(op))
5129 {
5130 case BOL:
5131 p = "BOL";
5132 break;
5133 case EOL:
5134 p = "EOL";
5135 break;
5136 case RE_BOF:
5137 p = "BOF";
5138 break;
5139 case RE_EOF:
5140 p = "EOF";
5141 break;
5142 case CURSOR:
5143 p = "CURSOR";
5144 break;
5145 case RE_LNUM:
5146 p = "RE_LNUM";
5147 break;
5148 case RE_COL:
5149 p = "RE_COL";
5150 break;
5151 case RE_VCOL:
5152 p = "RE_VCOL";
5153 break;
5154 case BOW:
5155 p = "BOW";
5156 break;
5157 case EOW:
5158 p = "EOW";
5159 break;
5160 case ANY:
5161 p = "ANY";
5162 break;
5163 case ANY + ADD_NL:
5164 p = "ANY+NL";
5165 break;
5166 case ANYOF:
5167 p = "ANYOF";
5168 break;
5169 case ANYOF + ADD_NL:
5170 p = "ANYOF+NL";
5171 break;
5172 case ANYBUT:
5173 p = "ANYBUT";
5174 break;
5175 case ANYBUT + ADD_NL:
5176 p = "ANYBUT+NL";
5177 break;
5178 case IDENT:
5179 p = "IDENT";
5180 break;
5181 case IDENT + ADD_NL:
5182 p = "IDENT+NL";
5183 break;
5184 case SIDENT:
5185 p = "SIDENT";
5186 break;
5187 case SIDENT + ADD_NL:
5188 p = "SIDENT+NL";
5189 break;
5190 case KWORD:
5191 p = "KWORD";
5192 break;
5193 case KWORD + ADD_NL:
5194 p = "KWORD+NL";
5195 break;
5196 case SKWORD:
5197 p = "SKWORD";
5198 break;
5199 case SKWORD + ADD_NL:
5200 p = "SKWORD+NL";
5201 break;
5202 case FNAME:
5203 p = "FNAME";
5204 break;
5205 case FNAME + ADD_NL:
5206 p = "FNAME+NL";
5207 break;
5208 case SFNAME:
5209 p = "SFNAME";
5210 break;
5211 case SFNAME + ADD_NL:
5212 p = "SFNAME+NL";
5213 break;
5214 case PRINT:
5215 p = "PRINT";
5216 break;
5217 case PRINT + ADD_NL:
5218 p = "PRINT+NL";
5219 break;
5220 case SPRINT:
5221 p = "SPRINT";
5222 break;
5223 case SPRINT + ADD_NL:
5224 p = "SPRINT+NL";
5225 break;
5226 case WHITE:
5227 p = "WHITE";
5228 break;
5229 case WHITE + ADD_NL:
5230 p = "WHITE+NL";
5231 break;
5232 case NWHITE:
5233 p = "NWHITE";
5234 break;
5235 case NWHITE + ADD_NL:
5236 p = "NWHITE+NL";
5237 break;
5238 case DIGIT:
5239 p = "DIGIT";
5240 break;
5241 case DIGIT + ADD_NL:
5242 p = "DIGIT+NL";
5243 break;
5244 case NDIGIT:
5245 p = "NDIGIT";
5246 break;
5247 case NDIGIT + ADD_NL:
5248 p = "NDIGIT+NL";
5249 break;
5250 case HEX:
5251 p = "HEX";
5252 break;
5253 case HEX + ADD_NL:
5254 p = "HEX+NL";
5255 break;
5256 case NHEX:
5257 p = "NHEX";
5258 break;
5259 case NHEX + ADD_NL:
5260 p = "NHEX+NL";
5261 break;
5262 case OCTAL:
5263 p = "OCTAL";
5264 break;
5265 case OCTAL + ADD_NL:
5266 p = "OCTAL+NL";
5267 break;
5268 case NOCTAL:
5269 p = "NOCTAL";
5270 break;
5271 case NOCTAL + ADD_NL:
5272 p = "NOCTAL+NL";
5273 break;
5274 case WORD:
5275 p = "WORD";
5276 break;
5277 case WORD + ADD_NL:
5278 p = "WORD+NL";
5279 break;
5280 case NWORD:
5281 p = "NWORD";
5282 break;
5283 case NWORD + ADD_NL:
5284 p = "NWORD+NL";
5285 break;
5286 case HEAD:
5287 p = "HEAD";
5288 break;
5289 case HEAD + ADD_NL:
5290 p = "HEAD+NL";
5291 break;
5292 case NHEAD:
5293 p = "NHEAD";
5294 break;
5295 case NHEAD + ADD_NL:
5296 p = "NHEAD+NL";
5297 break;
5298 case ALPHA:
5299 p = "ALPHA";
5300 break;
5301 case ALPHA + ADD_NL:
5302 p = "ALPHA+NL";
5303 break;
5304 case NALPHA:
5305 p = "NALPHA";
5306 break;
5307 case NALPHA + ADD_NL:
5308 p = "NALPHA+NL";
5309 break;
5310 case LOWER:
5311 p = "LOWER";
5312 break;
5313 case LOWER + ADD_NL:
5314 p = "LOWER+NL";
5315 break;
5316 case NLOWER:
5317 p = "NLOWER";
5318 break;
5319 case NLOWER + ADD_NL:
5320 p = "NLOWER+NL";
5321 break;
5322 case UPPER:
5323 p = "UPPER";
5324 break;
5325 case UPPER + ADD_NL:
5326 p = "UPPER+NL";
5327 break;
5328 case NUPPER:
5329 p = "NUPPER";
5330 break;
5331 case NUPPER + ADD_NL:
5332 p = "NUPPER+NL";
5333 break;
5334 case BRANCH:
5335 p = "BRANCH";
5336 break;
5337 case EXACTLY:
5338 p = "EXACTLY";
5339 break;
5340 case NOTHING:
5341 p = "NOTHING";
5342 break;
5343 case BACK:
5344 p = "BACK";
5345 break;
5346 case END:
5347 p = "END";
5348 break;
5349 case MOPEN + 0:
5350 p = "MATCH START";
5351 break;
5352 case MOPEN + 1:
5353 case MOPEN + 2:
5354 case MOPEN + 3:
5355 case MOPEN + 4:
5356 case MOPEN + 5:
5357 case MOPEN + 6:
5358 case MOPEN + 7:
5359 case MOPEN + 8:
5360 case MOPEN + 9:
5361 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5362 p = NULL;
5363 break;
5364 case MCLOSE + 0:
5365 p = "MATCH END";
5366 break;
5367 case MCLOSE + 1:
5368 case MCLOSE + 2:
5369 case MCLOSE + 3:
5370 case MCLOSE + 4:
5371 case MCLOSE + 5:
5372 case MCLOSE + 6:
5373 case MCLOSE + 7:
5374 case MCLOSE + 8:
5375 case MCLOSE + 9:
5376 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5377 p = NULL;
5378 break;
5379 case BACKREF + 1:
5380 case BACKREF + 2:
5381 case BACKREF + 3:
5382 case BACKREF + 4:
5383 case BACKREF + 5:
5384 case BACKREF + 6:
5385 case BACKREF + 7:
5386 case BACKREF + 8:
5387 case BACKREF + 9:
5388 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5389 p = NULL;
5390 break;
5391 case NOPEN:
5392 p = "NOPEN";
5393 break;
5394 case NCLOSE:
5395 p = "NCLOSE";
5396 break;
5397#ifdef FEAT_SYN_HL
5398 case ZOPEN + 1:
5399 case ZOPEN + 2:
5400 case ZOPEN + 3:
5401 case ZOPEN + 4:
5402 case ZOPEN + 5:
5403 case ZOPEN + 6:
5404 case ZOPEN + 7:
5405 case ZOPEN + 8:
5406 case ZOPEN + 9:
5407 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5408 p = NULL;
5409 break;
5410 case ZCLOSE + 1:
5411 case ZCLOSE + 2:
5412 case ZCLOSE + 3:
5413 case ZCLOSE + 4:
5414 case ZCLOSE + 5:
5415 case ZCLOSE + 6:
5416 case ZCLOSE + 7:
5417 case ZCLOSE + 8:
5418 case ZCLOSE + 9:
5419 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5420 p = NULL;
5421 break;
5422 case ZREF + 1:
5423 case ZREF + 2:
5424 case ZREF + 3:
5425 case ZREF + 4:
5426 case ZREF + 5:
5427 case ZREF + 6:
5428 case ZREF + 7:
5429 case ZREF + 8:
5430 case ZREF + 9:
5431 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5432 p = NULL;
5433 break;
5434#endif
5435 case STAR:
5436 p = "STAR";
5437 break;
5438 case PLUS:
5439 p = "PLUS";
5440 break;
5441 case NOMATCH:
5442 p = "NOMATCH";
5443 break;
5444 case MATCH:
5445 p = "MATCH";
5446 break;
5447 case BEHIND:
5448 p = "BEHIND";
5449 break;
5450 case NOBEHIND:
5451 p = "NOBEHIND";
5452 break;
5453 case SUBPAT:
5454 p = "SUBPAT";
5455 break;
5456 case BRACE_LIMITS:
5457 p = "BRACE_LIMITS";
5458 break;
5459 case BRACE_SIMPLE:
5460 p = "BRACE_SIMPLE";
5461 break;
5462 case BRACE_COMPLEX + 0:
5463 case BRACE_COMPLEX + 1:
5464 case BRACE_COMPLEX + 2:
5465 case BRACE_COMPLEX + 3:
5466 case BRACE_COMPLEX + 4:
5467 case BRACE_COMPLEX + 5:
5468 case BRACE_COMPLEX + 6:
5469 case BRACE_COMPLEX + 7:
5470 case BRACE_COMPLEX + 8:
5471 case BRACE_COMPLEX + 9:
5472 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5473 p = NULL;
5474 break;
5475#ifdef FEAT_MBYTE
5476 case MULTIBYTECODE:
5477 p = "MULTIBYTECODE";
5478 break;
5479#endif
5480 case NEWL:
5481 p = "NEWL";
5482 break;
5483 default:
5484 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5485 p = NULL;
5486 break;
5487 }
5488 if (p != NULL)
5489 (void) strcat(buf, p);
5490 return buf;
5491}
5492#endif
5493
5494#ifdef FEAT_MBYTE
5495static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
5496
/*
 * One entry of the decomposition table: a character split into up to three
 * characters.  Unused trailing members are zero.
 */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decompositions for the characters 0xfb20 - 0xfb4f, indexed by
 * "character - 0xfb20".  Entries marked UNUSED map the character to itself.
 */
decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},                /* 0xfb20       alt ayin */
    {0x5d0,0,0},                /* 0xfb21       alt alef */
    {0x5d3,0,0},                /* 0xfb22       alt dalet */
    {0x5d4,0,0},                /* 0xfb23       alt he */
    {0x5db,0,0},                /* 0xfb24       alt kaf */
    {0x5dc,0,0},                /* 0xfb25       alt lamed */
    {0x5dd,0,0},                /* 0xfb26       alt mem-sofit */
    {0x5e8,0,0},                /* 0xfb27       alt resh */
    {0x5ea,0,0},                /* 0xfb28       alt tav */
    {'+', 0, 0},                /* 0xfb29       alt plus */
    {0x5e9, 0x5c1, 0},          /* 0xfb2a       shin+shin-dot */
    {0x5e9, 0x5c2, 0},          /* 0xfb2b       shin+sin-dot */
    {0x5e9, 0x5c1, 0x5bc},      /* 0xfb2c       shin+shin-dot+dagesh */
    {0x5e9, 0x5c2, 0x5bc},      /* 0xfb2d       shin+sin-dot+dagesh */
    {0x5d0, 0x5b7, 0},          /* 0xfb2e       alef+patah */
    {0x5d0, 0x5b8, 0},          /* 0xfb2f       alef+qamats */
    {0x5d0, 0x5b4, 0},          /* 0xfb30       alef+hiriq */
    {0x5d1, 0x5bc, 0},          /* 0xfb31       bet+dagesh */
    {0x5d2, 0x5bc, 0},          /* 0xfb32       gimel+dagesh */
    {0x5d3, 0x5bc, 0},          /* 0xfb33       dalet+dagesh */
    {0x5d4, 0x5bc, 0},          /* 0xfb34       he+dagesh */
    {0x5d5, 0x5bc, 0},          /* 0xfb35       vav+dagesh */
    {0x5d6, 0x5bc, 0},          /* 0xfb36       zayin+dagesh */
    {0xfb37, 0, 0},             /* 0xfb37 -- UNUSED */
    {0x5d8, 0x5bc, 0},          /* 0xfb38       tet+dagesh */
    {0x5d9, 0x5bc, 0},          /* 0xfb39       yud+dagesh */
    {0x5da, 0x5bc, 0},          /* 0xfb3a       kaf sofit+dagesh */
    {0x5db, 0x5bc, 0},          /* 0xfb3b       kaf+dagesh */
    {0x5dc, 0x5bc, 0},          /* 0xfb3c       lamed+dagesh */
    {0xfb3d, 0, 0},             /* 0xfb3d -- UNUSED */
    {0x5de, 0x5bc, 0},          /* 0xfb3e       mem+dagesh */
    {0xfb3f, 0, 0},             /* 0xfb3f -- UNUSED */
    {0x5e0, 0x5bc, 0},          /* 0xfb40       nun+dagesh */
    {0x5e1, 0x5bc, 0},          /* 0xfb41       samech+dagesh */
    {0xfb42, 0, 0},             /* 0xfb42 -- UNUSED */
    {0x5e3, 0x5bc, 0},          /* 0xfb43       pe sofit+dagesh */
    {0x5e4, 0x5bc,0},           /* 0xfb44       pe+dagesh */
    {0xfb45, 0, 0},             /* 0xfb45 -- UNUSED */
    {0x5e6, 0x5bc, 0},          /* 0xfb46       tsadi+dagesh */
    {0x5e7, 0x5bc, 0},          /* 0xfb47       qof+dagesh */
    {0x5e8, 0x5bc, 0},          /* 0xfb48       resh+dagesh */
    {0x5e9, 0x5bc, 0},          /* 0xfb49       shin+dagesh */
    {0x5ea, 0x5bc, 0},          /* 0xfb4a       tav+dagesh */
    {0x5d5, 0x5b9, 0},          /* 0xfb4b       vav+holam */
    {0x5d1, 0x5bf, 0},          /* 0xfb4c       bet+rafe */
    {0x5db, 0x5bf, 0},          /* 0xfb4d       kaf+rafe */
    {0x5e4, 0x5bf, 0},          /* 0xfb4e       pe+rafe */
    {0x5d0, 0x5dc, 0}           /* 0xfb4f       alef-lamed */
};

/*
 * Decompose character "c" into "*c1", "*c2" and "*c3".
 * Only characters in the range covered by decomp_table (0xfb20 - 0xfb4f) are
 * decomposed.  Any other character is returned unchanged in "*c1" with "*c2"
 * and "*c3" set to zero.
 */
    static void
mb_decompose(c, c1, c2, c3)
    int c, *c1, *c2, *c3;
{
    decomp_T    d;

    /* The lower bound must be the first character of the table, otherwise
     * "c - 0xfb20" becomes a negative index and reads outside the table. */
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
        d = decomp_table[c - 0xfb20];
        *c1 = d.a;
        *c2 = d.b;
        *c3 = d.c;
    }
    else
    {
        *c1 = c;
        *c2 = *c3 = 0;
    }
}
5575#endif
5576
5577/*
5578 * Compare two strings, ignore case if ireg_ic set.
5579 * Return 0 if strings match, non-zero otherwise.
5580 * Correct the length "*n" when composing characters are ignored.
5581 */
5582 static int
5583cstrncmp(s1, s2, n)
5584 char_u *s1, *s2;
5585 int *n;
5586{
5587 int result;
5588
5589 if (!ireg_ic)
5590 result = STRNCMP(s1, s2, *n);
5591 else
5592 result = MB_STRNICMP(s1, s2, *n);
5593
5594#ifdef FEAT_MBYTE
5595 /* if it failed and it's utf8 and we want to combineignore: */
5596 if (result != 0 && enc_utf8 && ireg_icombine)
5597 {
5598 char_u *str1, *str2;
5599 int c1, c2, c11, c12;
5600 int ix;
5601 int junk;
5602
5603 /* we have to handle the strcmp ourselves, since it is necessary to
5604 * deal with the composing characters by ignoring them: */
5605 str1 = s1;
5606 str2 = s2;
5607 c1 = c2 = 0;
5608 for (ix = 0; ix < *n; )
5609 {
5610 c1 = mb_ptr2char_adv(&str1);
5611 c2 = mb_ptr2char_adv(&str2);
5612 ix += utf_char2len(c1);
5613
5614 /* decompose the character if necessary, into 'base' characters
5615 * because I don't care about Arabic, I will hard-code the Hebrew
5616 * which I *do* care about! So sue me... */
5617 if (c1 != c2 && (!ireg_ic || utf_fold(c1) != utf_fold(c2)))
5618 {
5619 /* decomposition necessary? */
5620 mb_decompose(c1, &c11, &junk, &junk);
5621 mb_decompose(c2, &c12, &junk, &junk);
5622 c1 = c11;
5623 c2 = c12;
5624 if (c11 != c12 && (!ireg_ic || utf_fold(c11) != utf_fold(c12)))
5625 break;
5626 }
5627 }
5628 result = c2 - c1;
5629 if (result == 0)
5630 *n = (int)(str2 - s2);
5631 }
5632#endif
5633
5634 return result;
5635}
5636
/*
 * cstrchr: This function is used a lot for simple searches, keep it fast!
 *
 * Find character "c" in string "s", ignoring case when ireg_ic is set.
 * Returns a pointer to the first match or NULL when there is none.
 */
    static char_u *
cstrchr(s, c)
    char_u      *s;
    int         c;
{
    char_u      *p;
    int         cc;         /* the other-case (or folded) form of "c" */

    /* When case matters, or "c" takes more than one byte in an encoding
     * other than UTF-8, the plain search is used. */
    if (!ireg_ic
#ifdef FEAT_MBYTE
            || (!enc_utf8 && mb_char2len(c) > 1)
#endif
            )
        return vim_strchr(s, c);

    /* tolower() and toupper() can be slow, comparing twice should be a lot
     * faster (esp. when using MS Visual C++!).
     * For UTF-8 need to use folded case. */
#ifdef FEAT_MBYTE
    if (enc_utf8 && c > 0x80)
        cc = utf_fold(c);
    else
#endif
         if (isupper(c))
        cc = TOLOWER_LOC(c);
    else if (islower(c))
        cc = TOUPPER_LOC(c);
    else
        /* "c" has no other case: the plain search is enough. */
        return vim_strchr(s, c);

#ifdef FEAT_MBYTE
    if (has_mbyte)
    {
        /* Advance by whole characters. */
        for (p = s; *p != NUL; p += (*mb_ptr2len_check)(p))
        {
            if (enc_utf8 && c > 0x80)
            {
                /* "cc" is the folded form here: fold the text as well. */
                if (utf_fold(utf_ptr2char(p)) == cc)
                    return p;
            }
            else if (*p == c || *p == cc)
                return p;
        }
    }
    else
#endif
        /* Faster version for when there are no multi-byte characters. */
        for (p = s; *p != NUL; ++p)
            if (*p == c || *p == cc)
                return p;

    return NULL;
}
5693
5694/***************************************************************
5695 * regsub stuff *
5696 ***************************************************************/
5697
5698/* This stuff below really confuses cc on an SGI -- webb */
5699#ifdef __sgi
5700# undef __ARGS
5701# define __ARGS(x) ()
5702#endif
5703
/*
 * We should define fptr as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * pointer to a function returning void. This should work for all compilers.
 */
5710typedef void (*(*fptr) __ARGS((char_u *, int)))();
5711
5712static fptr do_upper __ARGS((char_u *, int));
5713static fptr do_Upper __ARGS((char_u *, int));
5714static fptr do_lower __ARGS((char_u *, int));
5715static fptr do_Lower __ARGS((char_u *, int));
5716
5717static int vim_regsub_both __ARGS((char_u *source, char_u *dest, int copy, int magic, int backslash));
5718
5719 static fptr
5720do_upper(d, c)
5721 char_u *d;
5722 int c;
5723{
5724 *d = TOUPPER_LOC(c);
5725
5726 return (fptr)NULL;
5727}
5728
5729 static fptr
5730do_Upper(d, c)
5731 char_u *d;
5732 int c;
5733{
5734 *d = TOUPPER_LOC(c);
5735
5736 return (fptr)do_Upper;
5737}
5738
5739 static fptr
5740do_lower(d, c)
5741 char_u *d;
5742 int c;
5743{
5744 *d = TOLOWER_LOC(c);
5745
5746 return (fptr)NULL;
5747}
5748
5749 static fptr
5750do_Lower(d, c)
5751 char_u *d;
5752 int c;
5753{
5754 *d = TOLOWER_LOC(c);
5755
5756 return (fptr)do_Lower;
5757}
5758
5759/*
5760 * regtilde(): Replace tildes in the pattern by the old pattern.
5761 *
5762 * Short explanation of the tilde: It stands for the previous replacement
5763 * pattern. If that previous pattern also contains a ~ we should go back a
5764 * step further... But we insert the previous pattern into the current one
5765 * and remember that.
5766 * This still does not handle the case where "magic" changes. TODO?
5767 *
5768 * The tildes are parsed once before the first call to vim_regsub().
5769 */
5770 char_u *
5771regtilde(source, magic)
5772 char_u *source;
5773 int magic;
5774{
5775 char_u *newsub = source;
5776 char_u *tmpsub;
5777 char_u *p;
5778 int len;
5779 int prevlen;
5780
5781 for (p = newsub; *p; ++p)
5782 {
5783 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
5784 {
5785 if (reg_prev_sub != NULL)
5786 {
5787 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
5788 prevlen = (int)STRLEN(reg_prev_sub);
5789 tmpsub = alloc((unsigned)(STRLEN(newsub) + prevlen));
5790 if (tmpsub != NULL)
5791 {
5792 /* copy prefix */
5793 len = (int)(p - newsub); /* not including ~ */
5794 mch_memmove(tmpsub, newsub, (size_t)len);
5795 /* interpretate tilde */
5796 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
5797 /* copy postfix */
5798 if (!magic)
5799 ++p; /* back off \ */
5800 STRCPY(tmpsub + len + prevlen, p + 1);
5801
5802 if (newsub != source) /* already allocated newsub */
5803 vim_free(newsub);
5804 newsub = tmpsub;
5805 p = newsub + len + prevlen;
5806 }
5807 }
5808 else if (magic)
5809 STRCPY(p, p + 1); /* remove '~' */
5810 else
5811 STRCPY(p, p + 2); /* remove '\~' */
5812 --p;
5813 }
5814 else
5815 {
5816 if (*p == '\\' && p[1]) /* skip escaped characters */
5817 ++p;
5818#ifdef FEAT_MBYTE
5819 if (has_mbyte)
5820 p += (*mb_ptr2len_check)(p) - 1;
5821#endif
5822 }
5823 }
5824
5825 vim_free(reg_prev_sub);
5826 if (newsub != source) /* newsub was allocated, just keep it */
5827 reg_prev_sub = newsub;
5828 else /* no ~ found, need to save newsub */
5829 reg_prev_sub = vim_strsave(newsub);
5830 return newsub;
5831}
5832
5833#ifdef FEAT_EVAL
5834static int can_f_submatch = FALSE; /* TRUE when submatch() can be used */
5835
5836/* These pointers are used instead of reg_match and reg_mmatch for
5837 * reg_submatch(). Needed for when the substitution string is an expression
5838 * that contains a call to substitute() and submatch(). */
5839static regmatch_T *submatch_match;
5840static regmmatch_T *submatch_mmatch;
5841#endif
5842
5843#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
5844/*
5845 * vim_regsub() - perform substitutions after a vim_regexec() or
5846 * vim_regexec_multi() match.
5847 *
5848 * If "copy" is TRUE really copy into "dest".
5849 * If "copy" is FALSE nothing is copied, this is just to find out the length
5850 * of the result.
5851 *
5852 * If "backslash" is TRUE, a backslash will be removed later, need to double
5853 * them to keep them, and insert a backslash before a CR to avoid it being
5854 * replaced with a line break later.
5855 *
5856 * Note: The matched text must not change between the call of
5857 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
5858 * references invalid!
5859 *
5860 * Returns the size of the replacement, including terminating NUL.
5861 */
5862 int
5863vim_regsub(rmp, source, dest, copy, magic, backslash)
5864 regmatch_T *rmp;
5865 char_u *source;
5866 char_u *dest;
5867 int copy;
5868 int magic;
5869 int backslash;
5870{
5871 reg_match = rmp;
5872 reg_mmatch = NULL;
5873 reg_maxline = 0;
5874 return vim_regsub_both(source, dest, copy, magic, backslash);
5875}
5876#endif
5877
5878 int
5879vim_regsub_multi(rmp, lnum, source, dest, copy, magic, backslash)
5880 regmmatch_T *rmp;
5881 linenr_T lnum;
5882 char_u *source;
5883 char_u *dest;
5884 int copy;
5885 int magic;
5886 int backslash;
5887{
5888 reg_match = NULL;
5889 reg_mmatch = rmp;
5890 reg_buf = curbuf; /* always works on the current buffer! */
5891 reg_firstlnum = lnum;
5892 reg_maxline = curbuf->b_ml.ml_line_count - lnum;
5893 return vim_regsub_both(source, dest, copy, magic, backslash);
5894}
5895
/*
 * Common part of vim_regsub() and vim_regsub_multi(): expand the substitute
 * string "source" for the match in reg_match or reg_mmatch.
 *
 * When "copy" is TRUE the result is written to "dest", otherwise only the
 * length is computed.  With "magic" a plain "&" stands for the whole match,
 * without it "\&" does.  With "backslash" extra backslashes are inserted,
 * because the caller removes one level of them later.
 *
 * Returns the size of the result including the terminating NUL, or zero when
 * "source" or "dest" is NULL or prog_magic_wrong() reports a problem.
 */
    static int
vim_regsub_both(source, dest, copy, magic, backslash)
    char_u      *source;
    char_u      *dest;
    int         copy;
    int         magic;
    int         backslash;
{
    char_u      *src;
    char_u      *dst;
    char_u      *s;
    int         c;
    int         no = -1;        /* number of sub-match to insert, -1: none */
    fptr        func = (fptr)NULL;  /* pending case conversion, if any */
    linenr_T    clnum = 0;      /* init for GCC */
    int         len = 0;        /* init for GCC */
#ifdef FEAT_EVAL
    /* Result of the expression, kept between the "copy" == FALSE call and
     * the "copy" == TRUE call. */
    static char_u *eval_result = NULL;
#endif
#ifdef FEAT_MBYTE
    int         l;
#endif


    /* Be paranoid... */
    if (source == NULL || dest == NULL)
    {
        EMSG(_(e_null));
        return 0;
    }
    if (prog_magic_wrong())
        return 0;
    src = source;
    dst = dest;

    /*
     * When the substitute part starts with "\=" evaluate it as an expression.
     */
    if (source[0] == '\\' && source[1] == '='
#ifdef FEAT_EVAL
            && !can_f_submatch      /* can't do this recursively */
#endif
       )
    {
#ifdef FEAT_EVAL
        /* To make sure that the length doesn't change between checking the
         * length and copying the string, and to speed up things, the
         * resulting string is saved from the call with "copy" == FALSE to the
         * call with "copy" == TRUE. */
        if (copy)
        {
            if (eval_result != NULL)
            {
                STRCPY(dest, eval_result);
                dst += STRLEN(eval_result);
                vim_free(eval_result);
                eval_result = NULL;
            }
        }
        else
        {
            linenr_T    save_reg_maxline;
            win_T       *save_reg_win;
            int         save_ireg_ic;

            /* Drop a result that was never fetched with "copy" == TRUE. */
            vim_free(eval_result);

            /* The expression may contain substitute(), which calls us
             * recursively.  Make sure submatch() gets the text from the first
             * level.  Don't need to save "reg_buf", because
             * vim_regexec_multi() can't be called recursively. */
            submatch_match = reg_match;
            submatch_mmatch = reg_mmatch;
            save_reg_maxline = reg_maxline;
            save_reg_win = reg_win;
            save_ireg_ic = ireg_ic;
            can_f_submatch = TRUE;

            eval_result = eval_to_string(source + 2, NULL);
            if (eval_result != NULL)
            {
                for (s = eval_result; *s != NUL; ++s)
                {
                    /* Change NL to CR, so that it becomes a line break.
                     * Skip over a backslashed character. */
                    if (*s == NL)
                        *s = CAR;
                    else if (*s == '\\' && s[1] != NUL)
                        ++s;
#ifdef FEAT_MBYTE
                    if (has_mbyte)
                        s += (*mb_ptr2len_check)(s) - 1;
#endif
                }

                dst += STRLEN(eval_result);
            }

            /* Put back what evaluating the expression may have changed. */
            reg_match = submatch_match;
            reg_mmatch = submatch_mmatch;
            reg_maxline = save_reg_maxline;
            reg_win = save_reg_win;
            ireg_ic = save_ireg_ic;
            can_f_submatch = FALSE;
        }
#endif
    }
    else
      while ((c = *src++) != NUL)
      {
        /* First find out if this is a reference to a (sub-)match, which sets
         * "no", or a case-conversion item, which only sets "func". */
        if (c == '&' && magic)
            no = 0;
        else if (c == '\\' && *src != NUL)
        {
            if (*src == '&' && !magic)
            {
                ++src;
                no = 0;
            }
            else if ('0' <= *src && *src <= '9')
            {
                no = *src++ - '0';
            }
            else if (vim_strchr((char_u *)"uUlLeE", *src))
            {
                switch (*src++)
                {
                case 'u':   func = (fptr)do_upper;
                            continue;
                case 'U':   func = (fptr)do_Upper;
                            continue;
                case 'l':   func = (fptr)do_lower;
                            continue;
                case 'L':   func = (fptr)do_Lower;
                            continue;
                case 'e':
                case 'E':   func = (fptr)NULL;
                            continue;
                }
            }
        }
        if (no < 0)           /* Ordinary character. */
        {
            if (c == '\\' && *src != NUL)
            {
                /* Check for abbreviations -- webb */
                switch (*src)
                {
                    case 'r':   c = CAR;        ++src;  break;
                    case 'n':   c = NL;         ++src;  break;
                    case 't':   c = TAB;        ++src;  break;
                 /* Oh no!  \e already has meaning in subst pat :-( */
                 /* case 'e':   c = ESC;        ++src;  break; */
                    case 'b':   c = Ctrl_H;     ++src;  break;

                    /* If "backslash" is TRUE the backslash will be removed
                     * later.  Used to insert a literal CR. */
                    default:    if (backslash)
                                {
                                    if (copy)
                                        *dst = '\\';
                                    ++dst;
                                }
                                c = *src++;
                }
            }

            /* Write to buffer, if copy is set. */
#ifdef FEAT_MBYTE
            if (has_mbyte && (l = (*mb_ptr2len_check)(src - 1)) > 1)
            {
                /* TODO: should use "func" here. */
                if (copy)
                    mch_memmove(dst, src - 1, l);
                /* Only "l - 1" here: the "dst++" below and the "src++" that
                 * fetched "c" account for the first byte. */
                dst += l - 1;
                src += l - 1;
            }
            else
            {
#endif
                if (copy)
                {
                    if (func == (fptr)NULL)     /* just copy */
                        *dst = c;
                    else                        /* change case */
                        func = (fptr)(func(dst, c));
                            /* Turbo C complains without the typecast */
                }
#ifdef FEAT_MBYTE
            }
#endif
            dst++;
        }
        else
        {
            /* Insert (sub-)match "no": find the start of its text in "s"
             * and the number of bytes to take from the first line in "len";
             * "s" is NULL when the sub-match is not set. */
            if (REG_MULTI)
            {
                clnum = reg_mmatch->startpos[no].lnum;
                if (clnum < 0 || reg_mmatch->endpos[no].lnum < 0)
                    s = NULL;
                else
                {
                    s = reg_getline(clnum) + reg_mmatch->startpos[no].col;
                    if (reg_mmatch->endpos[no].lnum == clnum)
                        len = reg_mmatch->endpos[no].col
                                               - reg_mmatch->startpos[no].col;
                    else
                        len = (int)STRLEN(s);
                }
            }
            else
            {
                s = reg_match->startp[no];
                if (reg_match->endp[no] == NULL)
                    s = NULL;
                else
                    len = (int)(reg_match->endp[no] - s);
            }
            if (s != NULL)
            {
                for (;;)
                {
                    if (len == 0)
                    {
                        /* End of this line of the match: done, unless the
                         * match continues in the next line.  Then a CR is
                         * inserted for the line break. */
                        if (REG_MULTI)
                        {
                            if (reg_mmatch->endpos[no].lnum == clnum)
                                break;
                            if (copy)
                                *dst = CAR;
                            ++dst;
                            s = reg_getline(++clnum);
                            if (reg_mmatch->endpos[no].lnum == clnum)
                                len = reg_mmatch->endpos[no].col;
                            else
                                len = (int)STRLEN(s);
                        }
                        else
                            break;
                    }
                    else if (*s == NUL) /* we hit NUL. */
                    {
                        /* The text is shorter than the match says.  Note
                         * that "dest" is not NUL terminated here. */
                        if (copy)
                            EMSG(_(e_re_damg));
                        goto exit;
                    }
                    else
                    {
                        if (backslash && (*s == CAR || *s == '\\'))
                        {
                            /*
                             * Insert a backslash in front of a CR, otherwise
                             * it will be replaced by a line break.
                             * Number of backslashes will be halved later,
                             * double them here.
                             */
                            if (copy)
                            {
                                dst[0] = '\\';
                                dst[1] = *s;
                            }
                            dst += 2;
                        }
#ifdef FEAT_MBYTE
                        else if (has_mbyte && (l = (*mb_ptr2len_check)(s)) > 1)
                        {
                            /* TODO: should use "func" here. */
                            if (copy)
                                mch_memmove(dst, s, l);
                            dst += l;
                            /* Only "l - 1" here: the "++s" and "--len" below
                             * account for the first byte. */
                            s += l - 1;
                            len -= l - 1;
                        }
#endif
                        else
                        {
                            if (copy)
                            {
                                if (func == (fptr)NULL)     /* just copy */
                                    *dst = *s;
                                else                        /* change case */
                                    func = (fptr)(func(dst, *s));
                                /* Turbo C complains without the typecast */
                            }
                            ++dst;
                        }
                        ++s;
                        --len;
                    }
                }
            }
            no = -1;
        }
      }
    if (copy)
        *dst = NUL;

exit:
    /* "+ 1" for the terminating NUL. */
    return (int)((dst - dest) + 1);
}
6196
6197#ifdef FEAT_EVAL
6198/*
6199 * Used for the submatch() function: get the string from tne n'th submatch in
6200 * allocated memory.
6201 * Returns NULL when not in a ":s" command and for a non-existing submatch.
6202 */
6203 char_u *
6204reg_submatch(no)
6205 int no;
6206{
6207 char_u *retval = NULL;
6208 char_u *s;
6209 int len;
6210 int round;
6211 linenr_T lnum;
6212
6213 if (!can_f_submatch)
6214 return NULL;
6215
6216 if (submatch_match == NULL)
6217 {
6218 /*
6219 * First round: compute the length and allocate memory.
6220 * Second round: copy the text.
6221 */
6222 for (round = 1; round <= 2; ++round)
6223 {
6224 lnum = submatch_mmatch->startpos[no].lnum;
6225 if (lnum < 0 || submatch_mmatch->endpos[no].lnum < 0)
6226 return NULL;
6227
6228 s = reg_getline(lnum) + submatch_mmatch->startpos[no].col;
6229 if (s == NULL) /* anti-crash check, cannot happen? */
6230 break;
6231 if (submatch_mmatch->endpos[no].lnum == lnum)
6232 {
6233 /* Within one line: take form start to end col. */
6234 len = submatch_mmatch->endpos[no].col
6235 - submatch_mmatch->startpos[no].col;
6236 if (round == 2)
6237 {
6238 STRNCPY(retval, s, len);
6239 retval[len] = NUL;
6240 }
6241 ++len;
6242 }
6243 else
6244 {
6245 /* Multiple lines: take start line from start col, middle
6246 * lines completely and end line up to end col. */
6247 len = (int)STRLEN(s);
6248 if (round == 2)
6249 {
6250 STRCPY(retval, s);
6251 retval[len] = '\n';
6252 }
6253 ++len;
6254 ++lnum;
6255 while (lnum < submatch_mmatch->endpos[no].lnum)
6256 {
6257 s = reg_getline(lnum++);
6258 if (round == 2)
6259 STRCPY(retval + len, s);
6260 len += (int)STRLEN(s);
6261 if (round == 2)
6262 retval[len] = '\n';
6263 ++len;
6264 }
6265 if (round == 2)
6266 STRNCPY(retval + len, reg_getline(lnum),
6267 submatch_mmatch->endpos[no].col);
6268 len += submatch_mmatch->endpos[no].col;
6269 if (round == 2)
6270 retval[len] = NUL;
6271 ++len;
6272 }
6273
6274 if (round == 1)
6275 {
6276 retval = lalloc((long_u)len, TRUE);
6277 if (s == NULL)
6278 return NULL;
6279 }
6280 }
6281 }
6282 else
6283 {
6284 if (submatch_match->endp[no] == NULL)
6285 retval = NULL;
6286 else
6287 {
6288 s = submatch_match->startp[no];
6289 retval = vim_strnsave(s, (int)(submatch_match->endp[no] - s));
6290 }
6291 }
6292
6293 return retval;
6294}
6295#endif