blob: a9915a3b17cd03e0a9398ebb15c9939d12f71735 [file] [log] [blame]
Bram Moolenaar071d4272004-06-13 20:20:40 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 *
5 * NOTICE:
6 *
7 * This is NOT the original regular expression code as written by Henry
8 * Spencer. This code has been modified specifically for use with the VIM
9 * editor, and should not be used separately from Vim. If you want a good
10 * regular expression library, get the original code. The copyright notice
11 * that follows is from the original.
12 *
13 * END NOTICE
14 *
15 * Copyright (c) 1986 by University of Toronto.
16 * Written by Henry Spencer. Not derived from licensed software.
17 *
18 * Permission is granted to anyone to use this software for any
19 * purpose on any computer system, and to redistribute it freely,
20 * subject to the following restrictions:
21 *
22 * 1. The author is not responsible for the consequences of use of
23 * this software, no matter how awful, even if they arise
24 * from defects in it.
25 *
26 * 2. The origin of this software must not be misrepresented, either
27 * by explicit claim or by omission.
28 *
29 * 3. Altered versions must be plainly marked as such, and must not
30 * be misrepresented as being the original software.
31 *
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
35 *
Bram Moolenaarc0197e22004-09-13 20:26:32 +000036 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * Webb, Ciaran McCreesh and Bram Moolenaar.
Bram Moolenaar071d4272004-06-13 20:20:40 +000038 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */
40
41#include "vim.h"
42
43#undef DEBUG
44
45/*
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
49 *
50 * regstart char that must begin a match; NUL if none obvious; Can be a
51 * multi-byte character.
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
55 * regflags RF_ values or'ed together
56 *
57 * Regstart and reganch permit very fast decisions on suitable starting points
58 * for a match, cutting down the work a lot. Regmust permits fast rejection
59 * of lines that cannot possibly match. The regmust tests are costly enough
60 * that vim_regcomp() supplies a regmust only if the r.e. contains something
61 * potentially expensive (at present, the only such thing detected is * or +
62 * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
64 * computing it anyway.
65 */
66
67/*
68 * Structure for regexp "program". This is essentially a linear encoding
69 * of a nondeterministic finite-state machine (aka syntax charts or
70 * "railroad normal form" in parsing technology). Each node is an opcode
71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
73 * pointer with a BRANCH on both ends of it is connecting two alternatives.
74 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
75 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
77 * node points to the node after the stuff to be repeated. The operand of some
78 * types of node is a literal string; for others, it is a node leading into a
79 * sub-FSM. In particular, the operand of a BRANCH node is the first node of
80 * the branch. (NB this is *not* a tree structure: the tail of the branch
81 * connects to the thing following the set of BRANCHes.)
82 *
83 * pattern is coded like:
84 *
85 * +-----------------+
86 * | V
87 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
88 * | ^ | ^
89 * +------+ +----------+
90 *
91 *
92 * +------------------+
93 * V |
94 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
95 * | | ^ ^
96 * | +---------------+ |
97 * +---------------------------------------------+
98 *
99 *
100 * +-------------------------+
101 * V |
102 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
103 * | | ^
104 * | +----------------+
105 * +-----------------------------------------------+
106 *
107 *
108 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
109 * | | ^ ^
110 * | +----------------+ |
111 * +--------------------------------+
112 *
113 * +---------+
114 * | V
115 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
116 * | | | | ^ ^
117 * | | | +-----+ |
118 * | | +----------------+ |
119 * | +---------------------------+ |
120 * +------------------------------------------------------+
121 *
 * They all start with a BRANCH for "\|" alternatives, even when there is only
123 * one alternative.
124 */
125
126/*
127 * The opcodes are:
128 */
129
130/* definition number opnd? meaning */
131#define END 0 /* End of program or NOMATCH operand. */
132#define BOL 1 /* Match "" at beginning of line. */
133#define EOL 2 /* Match "" at end of line. */
134#define BRANCH 3 /* node Match this alternative, or the
135 * next... */
136#define BACK 4 /* Match "", "next" ptr points backward. */
137#define EXACTLY 5 /* str Match this string. */
138#define NOTHING 6 /* Match empty string. */
139#define STAR 7 /* node Match this (simple) thing 0 or more
140 * times. */
141#define PLUS 8 /* node Match this (simple) thing 1 or more
142 * times. */
143#define MATCH 9 /* node match the operand zero-width */
144#define NOMATCH 10 /* node check for no match with operand */
145#define BEHIND 11 /* node look behind for a match with operand */
146#define NOBEHIND 12 /* node look behind for no match with operand */
147#define SUBPAT 13 /* node match the operand here */
148#define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and
149 * n times (\{m,n\}). */
150#define BOW 15 /* Match "" after [^a-zA-Z0-9_] */
151#define EOW 16 /* Match "" at [^a-zA-Z0-9_] */
152#define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE
153 * and BRACE_COMPLEX. */
154#define NEWL 18 /* Match line-break */
155#define BHPOS 19 /* End position for BEHIND or NOBEHIND */
156
157
/*
 * Character classes: 20-48 normal, 50-78 include a line-break.
 * The line-break variant of a class is its opcode plus ADD_NL.
 * FIRST_NL and LAST_NL are fully parenthesized so that they keep their value
 * when used inside a larger expression (e.g. "2 * FIRST_NL").
 */
#define ADD_NL      30
#define FIRST_NL    (ANY + ADD_NL)
#define ANY         20  /*      Match any one character. */
#define ANYOF       21  /* str  Match any character in this string. */
#define ANYBUT      22  /* str  Match any character not in this
                         *      string. */
#define IDENT       23  /*      Match identifier char */
#define SIDENT      24  /*      Match identifier char but no digit */
#define KWORD       25  /*      Match keyword char */
#define SKWORD      26  /*      Match word char but no digit */
#define FNAME       27  /*      Match file name char */
#define SFNAME      28  /*      Match file name char but no digit */
#define PRINT       29  /*      Match printable char */
#define SPRINT      30  /*      Match printable char but no digit */
#define WHITE       31  /*      Match whitespace char */
#define NWHITE      32  /*      Match non-whitespace char */
#define DIGIT       33  /*      Match digit char */
#define NDIGIT      34  /*      Match non-digit char */
#define HEX         35  /*      Match hex char */
#define NHEX        36  /*      Match non-hex char */
#define OCTAL       37  /*      Match octal char */
#define NOCTAL      38  /*      Match non-octal char */
#define WORD        39  /*      Match word char */
#define NWORD       40  /*      Match non-word char */
#define HEAD        41  /*      Match head char */
#define NHEAD       42  /*      Match non-head char */
#define ALPHA       43  /*      Match alpha char */
#define NALPHA      44  /*      Match non-alpha char */
#define LOWER       45  /*      Match lowercase char */
#define NLOWER      46  /*      Match non-lowercase char */
#define UPPER       47  /*      Match uppercase char */
#define NUPPER      48  /*      Match non-uppercase char */
#define LAST_NL     (NUPPER + ADD_NL)
/* TRUE when "op" is one of the character classes that also match a
 * line-break.  "op" is evaluated twice. */
#define WITH_NL(op) ((op) >= FIRST_NL && (op) <= LAST_NL)
193
194#define MOPEN 80 /* -89 Mark this point in input as start of
195 * \( subexpr. MOPEN + 0 marks start of
196 * match. */
197#define MCLOSE 90 /* -99 Analogous to MOPEN. MCLOSE + 0 marks
198 * end of match. */
199#define BACKREF 100 /* -109 node Match same string again \1-\9 */
200
201#ifdef FEAT_SYN_HL
202# define ZOPEN 110 /* -119 Mark this point in input as start of
203 * \z( subexpr. */
204# define ZCLOSE 120 /* -129 Analogous to ZOPEN. */
205# define ZREF 130 /* -139 node Match external submatch \z1-\z9 */
206#endif
207
208#define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */
209
210#define NOPEN 150 /* Mark this point in input as start of
211 \%( subexpr. */
212#define NCLOSE 151 /* Analogous to NOPEN. */
213
214#define MULTIBYTECODE 200 /* mbc Match one multi-byte character */
215#define RE_BOF 201 /* Match "" at beginning of file. */
216#define RE_EOF 202 /* Match "" at end of file. */
217#define CURSOR 203 /* Match location of cursor. */
218
219#define RE_LNUM 204 /* nr cmp Match line number */
220#define RE_COL 205 /* nr cmp Match column number */
221#define RE_VCOL 206 /* nr cmp Match virtual column number */
222
223/*
224 * Magic characters have a special meaning, they don't match literally.
225 * Magic characters are negative. This separates them from literal characters
226 * (possibly multi-byte). Only ASCII characters can be Magic.
227 */
228#define Magic(x) ((int)(x) - 256)
229#define un_Magic(x) ((x) + 256)
230#define is_Magic(x) ((x) < 0)
231
232static int no_Magic __ARGS((int x));
233static int toggle_Magic __ARGS((int x));
234
/*
 * Strip the Magic-ness from "x": a Magic (negative) value is converted back
 * with un_Magic(), any other value is returned unchanged.
 */
    static int
no_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : x;
}
243
/*
 * Flip the Magic-ness of "x": a Magic (negative) value is converted back
 * with un_Magic(), any other value is made Magic with Magic().
 */
    static int
toggle_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
252
253/*
254 * The first byte of the regexp internal "program" is actually this magic
255 * number; the start node begins in the second byte. It's used to catch the
256 * most severe mutilation of the program by the caller.
257 */
258
259#define REGMAGIC 0234
260
261/*
262 * Opcode notes:
263 *
264 * BRANCH The set of branches constituting a single choice are hooked
265 * together with their "next" pointers, since precedence prevents
266 * anything being concatenated to any individual branch. The
267 * "next" pointer of the last BRANCH in a choice points to the
268 * thing following the whole choice. This is also where the
269 * final "next" pointer of each individual branch points; each
270 * branch starts with the operand node of a BRANCH node.
271 *
272 * BACK Normal "next" pointers all implicitly point forward; BACK
273 * exists to make loop structures possible.
274 *
275 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
276 * BRANCH structures using BACK. Simple cases (one character
277 * per match) are implemented with STAR and PLUS for speed
278 * and to minimize recursive plunges.
279 *
280 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
281 * node, and defines the min and max limits to be used for that
282 * node.
283 *
284 * MOPEN,MCLOSE ...are numbered at compile time.
285 * ZOPEN,ZCLOSE ...ditto
286 */
287
288/*
289 * A node is one char of opcode followed by two chars of "next" pointer.
290 * "Next" pointers are stored as two 8-bit bytes, high order first. The
291 * value is a positive offset from the opcode of the node containing it.
292 * An operand, if any, simply follows the node. (Note that much of the
293 * code generation knows about this implicit relationship.)
294 *
295 * Using two bytes for the "next" pointer is vast overkill for most things,
296 * but allows patterns to get big without disasters.
297 */
298#define OP(p) ((int)*(p))
299#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
300#define OPERAND(p) ((p) + 3)
301/* Obtain an operand that was stored as four bytes, MSB first. */
302#define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
303 + ((long)(p)[5] << 8) + (long)(p)[6])
304/* Obtain a second operand stored as four bytes. */
305#define OPERAND_MAX(p) OPERAND_MIN((p) + 4)
306/* Obtain a second single-byte operand stored after a four bytes operand. */
307#define OPERAND_CMP(p) (p)[7]
308
309/*
310 * Utility definitions.
311 */
312#define UCHARAT(p) ((int)*(char_u *)(p))
313
/*
 * Used for an error (down from) vim_regcomp(): give the error message, set
 * rc_did_emsg and return NULL (or FAIL for EMSG_RET_FAIL).
 *
 * Each macro expands to a brace block that contains a "return" statement, so
 * it can only be used inside a function with a matching return type.
 *
 * EMSG_M_RET_NULL(m, c): "m" is a format with one "%s"; it is filled with ""
 * when "c" is non-zero and with a backslash otherwise.  "c" is parenthesized
 * so that any expression can be passed as the condition.
 */
#define EMSG_RET_NULL(m) { EMSG(m); rc_did_emsg = TRUE; return NULL; }
#define EMSG_M_RET_NULL(m, c) { EMSG2(m, (c) ? "" : "\\"); rc_did_emsg = TRUE; return NULL; }
#define EMSG_RET_FAIL(m) { EMSG(m); rc_did_emsg = TRUE; return FAIL; }
#define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
320
321#define MAX_LIMIT (32767L << 16L)
322
323static int re_multi_type __ARGS((int));
324static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
325static char_u *cstrchr __ARGS((char_u *, int));
326
327#ifdef DEBUG
328static void regdump __ARGS((char_u *, regprog_T *));
329static char_u *regprop __ARGS((char_u *));
330#endif
331
332#define NOT_MULTI 0
333#define MULTI_ONE 1
334#define MULTI_MULT 2
335/*
336 * Return NOT_MULTI if c is not a "multi" operator.
337 * Return MULTI_ONE if c is a single "multi" operator.
338 * Return MULTI_MULT if c is a multi "multi" operator.
339 */
340 static int
341re_multi_type(c)
342 int c;
343{
344 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
345 return MULTI_ONE;
346 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
347 return MULTI_MULT;
348 return NOT_MULTI;
349}
350
351/*
352 * Flags to be passed up and down.
353 */
354#define HASWIDTH 0x1 /* Known never to match null string. */
355#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
356#define SPSTART 0x4 /* Starts with * or +. */
357#define HASNL 0x8 /* Contains some \n. */
358#define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */
359#define WORST 0 /* Worst case. */
360
361/*
362 * When regcode is set to this value, code is not emitted and size is computed
363 * instead.
364 */
365#define JUST_CALC_SIZE ((char_u *) -1)
366
367static char_u *reg_prev_sub;
368
369/*
370 * REGEXP_INRANGE contains all characters which are always special in a []
371 * range after '\'.
372 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
373 * These are:
374 * \n - New line (NL).
375 * \r - Carriage Return (CR).
376 * \t - Tab (TAB).
377 * \e - Escape (ESC).
378 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000379 * \d - Character code in decimal, eg \d123
 *  \o  - Character code in octal, eg \o40
381 * \x - Character code in hex, eg \x4a
382 * \u - Multibyte character code, eg \u20ac
383 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384 */
385static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000386static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000387
388static int backslash_trans __ARGS((int c));
389static int skip_class_name __ARGS((char_u **pp));
390static char_u *skip_anyof __ARGS((char_u *p));
391static void init_class_tab __ARGS((void));
392
393/*
394 * Translate '\x' to its control character, except "\n", which is Magic.
395 */
396 static int
397backslash_trans(c)
398 int c;
399{
400 switch (c)
401 {
402 case 'r': return CAR;
403 case 't': return TAB;
404 case 'e': return ESC;
405 case 'b': return BS;
406 }
407 return c;
408}
409
410/*
411 * Check for a character class name. "pp" points to the '['.
412 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
413 * recognized. Otherwise "pp" is advanced to after the item.
414 */
415 static int
416skip_class_name(pp)
417 char_u **pp;
418{
419 static const char *(class_names[]) =
420 {
421 "alnum:]",
422#define CLASS_ALNUM 0
423 "alpha:]",
424#define CLASS_ALPHA 1
425 "blank:]",
426#define CLASS_BLANK 2
427 "cntrl:]",
428#define CLASS_CNTRL 3
429 "digit:]",
430#define CLASS_DIGIT 4
431 "graph:]",
432#define CLASS_GRAPH 5
433 "lower:]",
434#define CLASS_LOWER 6
435 "print:]",
436#define CLASS_PRINT 7
437 "punct:]",
438#define CLASS_PUNCT 8
439 "space:]",
440#define CLASS_SPACE 9
441 "upper:]",
442#define CLASS_UPPER 10
443 "xdigit:]",
444#define CLASS_XDIGIT 11
445 "tab:]",
446#define CLASS_TAB 12
447 "return:]",
448#define CLASS_RETURN 13
449 "backspace:]",
450#define CLASS_BACKSPACE 14
451 "escape:]",
452#define CLASS_ESCAPE 15
453 };
454#define CLASS_NONE 99
455 int i;
456
457 if ((*pp)[1] == ':')
458 {
459 for (i = 0; i < sizeof(class_names) / sizeof(*class_names); ++i)
460 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
461 {
462 *pp += STRLEN(class_names[i]) + 2;
463 return i;
464 }
465 }
466 return CLASS_NONE;
467}
468
469/*
470 * Skip over a "[]" range.
471 * "p" must point to the character after the '['.
472 * The returned pointer is on the matching ']', or the terminating NUL.
473 */
474 static char_u *
475skip_anyof(p)
476 char_u *p;
477{
478 int cpo_lit; /* 'cpoptions' contains 'l' flag */
479#ifdef FEAT_MBYTE
480 int l;
481#endif
482
483 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
484
485 if (*p == '^') /* Complement of range. */
486 ++p;
487 if (*p == ']' || *p == '-')
488 ++p;
489 while (*p != NUL && *p != ']')
490 {
491#ifdef FEAT_MBYTE
492 if (has_mbyte && (l = (*mb_ptr2len_check)(p)) > 1)
493 p += l;
494 else
495#endif
496 if (*p == '-')
497 {
498 ++p;
499 if (*p != ']' && *p != NUL)
Bram Moolenaar1cd871b2004-12-19 22:46:22 +0000500 mb_ptr_adv(p);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000501 }
502 else if (*p == '\\'
503 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
504 || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
505 p += 2;
506 else if (*p == '[')
507 {
508 if (skip_class_name(&p) == CLASS_NONE)
509 ++p; /* It was not a class name */
510 }
511 else
512 ++p;
513 }
514
515 return p;
516}
517
518/*
519 * Specific version of character class functions.
520 * Using a table to keep this fast.
521 */
522static short class_tab[256];
523
524#define RI_DIGIT 0x01
525#define RI_HEX 0x02
526#define RI_OCTAL 0x04
527#define RI_WORD 0x08
528#define RI_HEAD 0x10
529#define RI_ALPHA 0x20
530#define RI_LOWER 0x40
531#define RI_UPPER 0x80
532#define RI_WHITE 0x100
533
534 static void
535init_class_tab()
536{
537 int i;
538 static int done = FALSE;
539
540 if (done)
541 return;
542
543 for (i = 0; i < 256; ++i)
544 {
545 if (i >= '0' && i <= '7')
546 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
547 else if (i >= '8' && i <= '9')
548 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
549 else if (i >= 'a' && i <= 'f')
550 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
551#ifdef EBCDIC
552 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
553 || (i >= 's' && i <= 'z'))
554#else
555 else if (i >= 'g' && i <= 'z')
556#endif
557 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
558 else if (i >= 'A' && i <= 'F')
559 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
560#ifdef EBCDIC
561 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
562 || (i >= 'S' && i <= 'Z'))
563#else
564 else if (i >= 'G' && i <= 'Z')
565#endif
566 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
567 else if (i == '_')
568 class_tab[i] = RI_WORD + RI_HEAD;
569 else
570 class_tab[i] = 0;
571 }
572 class_tab[' '] |= RI_WHITE;
573 class_tab['\t'] |= RI_WHITE;
574 done = TRUE;
575}
576
/*
 * Character class tests using class_tab[].  Each macro is non-zero when "c"
 * has the corresponding RI_ flag.
 * With FEAT_MBYTE "c" may be above 255; such a value is never in a class and
 * is not used as an index.
 * The argument is parenthesized so that any expression can be passed.  It is
 * evaluated twice in the FEAT_MBYTE versions, thus it must not have side
 * effects.
 */
#ifdef FEAT_MBYTE
# define ri_digit(c)    ((c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
# define ri_hex(c)      ((c) < 0x100 && (class_tab[(c)] & RI_HEX))
# define ri_octal(c)    ((c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
# define ri_word(c)     ((c) < 0x100 && (class_tab[(c)] & RI_WORD))
# define ri_head(c)     ((c) < 0x100 && (class_tab[(c)] & RI_HEAD))
# define ri_alpha(c)    ((c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
# define ri_lower(c)    ((c) < 0x100 && (class_tab[(c)] & RI_LOWER))
# define ri_upper(c)    ((c) < 0x100 && (class_tab[(c)] & RI_UPPER))
# define ri_white(c)    ((c) < 0x100 && (class_tab[(c)] & RI_WHITE))
#else
# define ri_digit(c)    (class_tab[(c)] & RI_DIGIT)
# define ri_hex(c)      (class_tab[(c)] & RI_HEX)
# define ri_octal(c)    (class_tab[(c)] & RI_OCTAL)
# define ri_word(c)     (class_tab[(c)] & RI_WORD)
# define ri_head(c)     (class_tab[(c)] & RI_HEAD)
# define ri_alpha(c)    (class_tab[(c)] & RI_ALPHA)
# define ri_lower(c)    (class_tab[(c)] & RI_LOWER)
# define ri_upper(c)    (class_tab[(c)] & RI_UPPER)
# define ri_white(c)    (class_tab[(c)] & RI_WHITE)
#endif
598
599/* flags for regflags */
600#define RF_ICASE 1 /* ignore case */
601#define RF_NOICASE 2 /* don't ignore case */
602#define RF_HASNL 4 /* can match a NL */
603#define RF_ICOMBINE 8 /* ignore combining characters */
604#define RF_LOOKBH 16 /* uses "\@<=" or "\@<!" */
605
606/*
607 * Global work variables for vim_regcomp().
608 */
609
610static char_u *regparse; /* Input-scan pointer. */
611static int prevchr_len; /* byte length of previous char */
612static int num_complex_braces; /* Complex \{...} count */
613static int regnpar; /* () count. */
614#ifdef FEAT_SYN_HL
615static int regnzpar; /* \z() count. */
616static int re_has_z; /* \z item detected */
617#endif
618static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
619static long regsize; /* Code size. */
620static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */
621static unsigned regflags; /* RF_ flags for prog */
622static long brace_min[10]; /* Minimums for complex brace repeats */
623static long brace_max[10]; /* Maximums for complex brace repeats */
624static int brace_count[10]; /* Current counts for complex brace repeats */
625#if defined(FEAT_SYN_HL) || defined(PROTO)
626static int had_eol; /* TRUE when EOL found by vim_regcomp() */
627#endif
628static int one_exactly = FALSE; /* only do one char for EXACTLY */
629
630static int reg_magic; /* magicness of the pattern: */
631#define MAGIC_NONE 1 /* "\V" very unmagic */
632#define MAGIC_OFF 2 /* "\M" or 'magic' off */
633#define MAGIC_ON 3 /* "\m" or 'magic' */
634#define MAGIC_ALL 4 /* "\v" very magic */
635
636static int reg_string; /* matching with a string instead of a buffer
637 line */
638
639/*
640 * META contains all characters that may be magic, except '^' and '$'.
641 */
642
643#ifdef EBCDIC
644static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
645#else
646/* META[] is used often enough to justify turning it into a table. */
647static char_u META_flags[] = {
648 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
649 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
650/* % & ( ) * + . */
651 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
652/* 1 2 3 4 5 6 7 8 9 < = > ? */
653 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
654/* @ A C D F H I K L M O */
655 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
656/* P S U V W X Z [ _ */
657 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
658/* a c d f h i k l m n o */
659 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
660/* p s u v w x z { | ~ */
661 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
662};
663#endif
664
665static int curchr;
666
667/* arguments for reg() */
668#define REG_NOPAREN 0 /* toplevel reg() */
669#define REG_PAREN 1 /* \(\) */
670#define REG_ZPAREN 2 /* \z(\) */
671#define REG_NPAREN 3 /* \%(\) */
672
673/*
674 * Forward declarations for vim_regcomp()'s friends.
675 */
676static void initchr __ARGS((char_u *));
677static int getchr __ARGS((void));
678static void skipchr_keepstart __ARGS((void));
679static int peekchr __ARGS((void));
680static void skipchr __ARGS((void));
681static void ungetchr __ARGS((void));
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000682static int gethexchrs __ARGS((int maxinputlen));
683static int getoctchrs __ARGS((void));
684static int getdecchrs __ARGS((void));
685static int coll_get_char __ARGS((void));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000686static void regcomp_start __ARGS((char_u *expr, int flags));
687static char_u *reg __ARGS((int, int *));
688static char_u *regbranch __ARGS((int *flagp));
689static char_u *regconcat __ARGS((int *flagp));
690static char_u *regpiece __ARGS((int *));
691static char_u *regatom __ARGS((int *));
692static char_u *regnode __ARGS((int));
693static int prog_magic_wrong __ARGS((void));
694static char_u *regnext __ARGS((char_u *));
695static void regc __ARGS((int b));
696#ifdef FEAT_MBYTE
697static void regmbc __ARGS((int c));
698#endif
699static void reginsert __ARGS((int, char_u *));
700static void reginsert_limits __ARGS((int, long, long, char_u *));
701static char_u *re_put_long __ARGS((char_u *pr, long_u val));
702static int read_limits __ARGS((long *, long *));
703static void regtail __ARGS((char_u *, char_u *));
704static void regoptail __ARGS((char_u *, char_u *));
705
706/*
707 * Return TRUE if compiled regular expression "prog" can match a line break.
708 */
709 int
710re_multiline(prog)
711 regprog_T *prog;
712{
713 return (prog->regflags & RF_HASNL);
714}
715
716/*
717 * Return TRUE if compiled regular expression "prog" looks before the start
718 * position (pattern contains "\@<=" or "\@<!").
719 */
720 int
721re_lookbehind(prog)
722 regprog_T *prog;
723{
724 return (prog->regflags & RF_LOOKBH);
725}
726
727/*
728 * Skip past regular expression.
Bram Moolenaar748bf032005-02-02 23:04:36 +0000729 * Stop at end of "startp" or where "dirc" is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000730 * Take care of characters with a backslash in front of it.
731 * Skip strings inside [ and ].
732 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
733 * expression and change "\?" to "?". If "*newp" is not NULL the expression
734 * is changed in-place.
735 */
736 char_u *
737skip_regexp(startp, dirc, magic, newp)
738 char_u *startp;
739 int dirc;
740 int magic;
741 char_u **newp;
742{
743 int mymagic;
744 char_u *p = startp;
745
746 if (magic)
747 mymagic = MAGIC_ON;
748 else
749 mymagic = MAGIC_OFF;
750
Bram Moolenaar1cd871b2004-12-19 22:46:22 +0000751 for (; p[0] != NUL; mb_ptr_adv(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000752 {
753 if (p[0] == dirc) /* found end of regexp */
754 break;
755 if ((p[0] == '[' && mymagic >= MAGIC_ON)
756 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
757 {
758 p = skip_anyof(p + 1);
759 if (p[0] == NUL)
760 break;
761 }
762 else if (p[0] == '\\' && p[1] != NUL)
763 {
764 if (dirc == '?' && newp != NULL && p[1] == '?')
765 {
766 /* change "\?" to "?", make a copy first. */
767 if (*newp == NULL)
768 {
769 *newp = vim_strsave(startp);
770 if (*newp != NULL)
771 p = *newp + (p - startp);
772 }
773 if (*newp != NULL)
774 mch_memmove(p, p + 1, STRLEN(p));
775 else
776 ++p;
777 }
778 else
779 ++p; /* skip next character */
780 if (*p == 'v')
781 mymagic = MAGIC_ALL;
782 else if (*p == 'V')
783 mymagic = MAGIC_NONE;
784 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000785 }
786 return p;
787}
788
789/*
Bram Moolenaar86b68352004-12-27 21:59:20 +0000790 * vim_regcomp() - compile a regular expression into internal code
791 * Returns the program in allocated space. Returns NULL for an error.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000792 *
793 * We can't allocate space until we know how big the compiled form will be,
794 * but we can't compile it (and thus know how big it is) until we've got a
795 * place to put the code. So we cheat: we compile it twice, once with code
796 * generation turned off and size counting turned on, and once "for real".
797 * This also means that we don't allocate space until we are sure that the
798 * thing really will compile successfully, and we never have to move the
799 * code and thus invalidate pointers into it. (Note that it has to be in
800 * one piece because vim_free() must be able to free it all.)
801 *
802 * Whether upper/lower case is to be ignored is decided when executing the
803 * program, it does not matter here.
804 *
805 * Beware that the optimization-preparation code in here knows about some
806 * of the structure of the compiled regexp.
807 * "re_flags": RE_MAGIC and/or RE_STRING.
808 */
809 regprog_T *
810vim_regcomp(expr, re_flags)
811 char_u *expr;
812 int re_flags;
813{
814 regprog_T *r;
815 char_u *scan;
816 char_u *longest;
817 int len;
818 int flags;
819
820 if (expr == NULL)
821 EMSG_RET_NULL(_(e_null));
822
823 init_class_tab();
824
825 /*
826 * First pass: determine size, legality.
827 */
828 regcomp_start(expr, re_flags);
829 regcode = JUST_CALC_SIZE;
830 regc(REGMAGIC);
831 if (reg(REG_NOPAREN, &flags) == NULL)
832 return NULL;
833
834 /* Small enough for pointer-storage convention? */
835#ifdef SMALL_MALLOC /* 16 bit storage allocation */
836 if (regsize >= 65536L - 256L)
837 EMSG_RET_NULL(_("E339: Pattern too long"));
838#endif
839
840 /* Allocate space. */
841 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE);
842 if (r == NULL)
843 return NULL;
844
845 /*
846 * Second pass: emit code.
847 */
848 regcomp_start(expr, re_flags);
849 regcode = r->program;
850 regc(REGMAGIC);
851 if (reg(REG_NOPAREN, &flags) == NULL)
852 {
853 vim_free(r);
854 return NULL;
855 }
856
857 /* Dig out information for optimizations. */
858 r->regstart = NUL; /* Worst-case defaults. */
859 r->reganch = 0;
860 r->regmust = NULL;
861 r->regmlen = 0;
862 r->regflags = regflags;
863 if (flags & HASNL)
864 r->regflags |= RF_HASNL;
865 if (flags & HASLOOKBH)
866 r->regflags |= RF_LOOKBH;
867#ifdef FEAT_SYN_HL
868 /* Remember whether this pattern has any \z specials in it. */
869 r->reghasz = re_has_z;
870#endif
871 scan = r->program + 1; /* First BRANCH. */
872 if (OP(regnext(scan)) == END) /* Only one top-level choice. */
873 {
874 scan = OPERAND(scan);
875
876 /* Starting-point info. */
877 if (OP(scan) == BOL || OP(scan) == RE_BOF)
878 {
879 r->reganch++;
880 scan = regnext(scan);
881 }
882
883 if (OP(scan) == EXACTLY)
884 {
885#ifdef FEAT_MBYTE
886 if (has_mbyte)
887 r->regstart = (*mb_ptr2char)(OPERAND(scan));
888 else
889#endif
890 r->regstart = *OPERAND(scan);
891 }
892 else if ((OP(scan) == BOW
893 || OP(scan) == EOW
894 || OP(scan) == NOTHING
895 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
896 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
897 && OP(regnext(scan)) == EXACTLY)
898 {
899#ifdef FEAT_MBYTE
900 if (has_mbyte)
901 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
902 else
903#endif
904 r->regstart = *OPERAND(regnext(scan));
905 }
906
907 /*
908 * If there's something expensive in the r.e., find the longest
909 * literal string that must appear and make it the regmust. Resolve
910 * ties in favor of later strings, since the regstart check works
911 * with the beginning of the r.e. and avoiding duplication
912 * strengthens checking. Not a strong reason, but sufficient in the
913 * absence of others.
914 */
915 /*
916 * When the r.e. starts with BOW, it is faster to look for a regmust
917 * first. Used a lot for "#" and "*" commands. (Added by mool).
918 */
919 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
920 && !(flags & HASNL))
921 {
922 longest = NULL;
923 len = 0;
924 for (; scan != NULL; scan = regnext(scan))
925 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
926 {
927 longest = OPERAND(scan);
928 len = (int)STRLEN(OPERAND(scan));
929 }
930 r->regmust = longest;
931 r->regmlen = len;
932 }
933 }
934#ifdef DEBUG
935 regdump(expr, r);
936#endif
937 return r;
938}
939
940/*
941 * Setup to parse the regexp. Used once to get the length and once to do it.
942 */
943 static void
944regcomp_start(expr, re_flags)
945 char_u *expr;
946 int re_flags; /* see vim_regcomp() */
947{
948 initchr(expr);
949 if (re_flags & RE_MAGIC)
950 reg_magic = MAGIC_ON;
951 else
952 reg_magic = MAGIC_OFF;
953 reg_string = (re_flags & RE_STRING);
954
955 num_complex_braces = 0;
956 regnpar = 1;
957 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
958#ifdef FEAT_SYN_HL
959 regnzpar = 1;
960 re_has_z = 0;
961#endif
962 regsize = 0L;
963 regflags = 0;
964#if defined(FEAT_SYN_HL) || defined(PROTO)
965 had_eol = FALSE;
966#endif
967}
968
969#if defined(FEAT_SYN_HL) || defined(PROTO)
970/*
971 * Check if during the previous call to vim_regcomp the EOL item "$" has been
972 * found. This is messy, but it works fine.
973 */
974 int
975vim_regcomp_had_eol()
976{
977 return had_eol;
978}
979#endif
980
981/*
982 * reg - regular expression, i.e. main body or parenthesized thing
983 *
984 * Caller must absorb opening parenthesis.
985 *
986 * Combining parenthesis handling with the base level of regular expression
987 * is a trifle forced, but the need to tie the tails of the branches to what
988 * follows makes it hard to avoid.
989 */
990 static char_u *
991reg(paren, flagp)
992 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
993 int *flagp;
994{
995 char_u *ret;
996 char_u *br;
997 char_u *ender;
998 int parno = 0;
999 int flags;
1000
1001 *flagp = HASWIDTH; /* Tentatively. */
1002
1003#ifdef FEAT_SYN_HL
1004 if (paren == REG_ZPAREN)
1005 {
1006 /* Make a ZOPEN node. */
1007 if (regnzpar >= NSUBEXP)
1008 EMSG_RET_NULL(_("E50: Too many \\z("));
1009 parno = regnzpar;
1010 regnzpar++;
1011 ret = regnode(ZOPEN + parno);
1012 }
1013 else
1014#endif
1015 if (paren == REG_PAREN)
1016 {
1017 /* Make a MOPEN node. */
1018 if (regnpar >= NSUBEXP)
1019 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1020 parno = regnpar;
1021 ++regnpar;
1022 ret = regnode(MOPEN + parno);
1023 }
1024 else if (paren == REG_NPAREN)
1025 {
1026 /* Make a NOPEN node. */
1027 ret = regnode(NOPEN);
1028 }
1029 else
1030 ret = NULL;
1031
1032 /* Pick up the branches, linking them together. */
1033 br = regbranch(&flags);
1034 if (br == NULL)
1035 return NULL;
1036 if (ret != NULL)
1037 regtail(ret, br); /* [MZ]OPEN -> first. */
1038 else
1039 ret = br;
1040 /* If one of the branches can be zero-width, the whole thing can.
1041 * If one of the branches has * at start or matches a line-break, the
1042 * whole thing can. */
1043 if (!(flags & HASWIDTH))
1044 *flagp &= ~HASWIDTH;
1045 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1046 while (peekchr() == Magic('|'))
1047 {
1048 skipchr();
1049 br = regbranch(&flags);
1050 if (br == NULL)
1051 return NULL;
1052 regtail(ret, br); /* BRANCH -> BRANCH. */
1053 if (!(flags & HASWIDTH))
1054 *flagp &= ~HASWIDTH;
1055 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1056 }
1057
1058 /* Make a closing node, and hook it on the end. */
1059 ender = regnode(
1060#ifdef FEAT_SYN_HL
1061 paren == REG_ZPAREN ? ZCLOSE + parno :
1062#endif
1063 paren == REG_PAREN ? MCLOSE + parno :
1064 paren == REG_NPAREN ? NCLOSE : END);
1065 regtail(ret, ender);
1066
1067 /* Hook the tails of the branches to the closing node. */
1068 for (br = ret; br != NULL; br = regnext(br))
1069 regoptail(br, ender);
1070
1071 /* Check for proper termination. */
1072 if (paren != REG_NOPAREN && getchr() != Magic(')'))
1073 {
1074#ifdef FEAT_SYN_HL
1075 if (paren == REG_ZPAREN)
1076 EMSG_RET_NULL(_("E52: Unmatched \\z("))
1077 else
1078#endif
1079 if (paren == REG_NPAREN)
1080 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL)
1081 else
1082 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL)
1083 }
1084 else if (paren == REG_NOPAREN && peekchr() != NUL)
1085 {
1086 if (curchr == Magic(')'))
1087 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL)
1088 else
1089 EMSG_RET_NULL(_(e_trailing)) /* "Can't happen". */
1090 /* NOTREACHED */
1091 }
1092 /*
1093 * Here we set the flag allowing back references to this set of
1094 * parentheses.
1095 */
1096 if (paren == REG_PAREN)
1097 had_endbrace[parno] = TRUE; /* have seen the close paren */
1098 return ret;
1099}
1100
1101/*
1102 * regbranch - one alternative of an | operator
1103 *
1104 * Implements the & operator.
1105 */
1106 static char_u *
1107regbranch(flagp)
1108 int *flagp;
1109{
1110 char_u *ret;
1111 char_u *chain = NULL;
1112 char_u *latest;
1113 int flags;
1114
1115 *flagp = WORST | HASNL; /* Tentatively. */
1116
1117 ret = regnode(BRANCH);
1118 for (;;)
1119 {
1120 latest = regconcat(&flags);
1121 if (latest == NULL)
1122 return NULL;
1123 /* If one of the branches has width, the whole thing has. If one of
1124 * the branches anchors at start-of-line, the whole thing does.
1125 * If one of the branches uses look-behind, the whole thing does. */
1126 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1127 /* If one of the branches doesn't match a line-break, the whole thing
1128 * doesn't. */
1129 *flagp &= ~HASNL | (flags & HASNL);
1130 if (chain != NULL)
1131 regtail(chain, latest);
1132 if (peekchr() != Magic('&'))
1133 break;
1134 skipchr();
1135 regtail(latest, regnode(END)); /* operand ends */
1136 reginsert(MATCH, latest);
1137 chain = latest;
1138 }
1139
1140 return ret;
1141}
1142
1143/*
1144 * regbranch - one alternative of an | or & operator
1145 *
1146 * Implements the concatenation operator.
1147 */
1148 static char_u *
1149regconcat(flagp)
1150 int *flagp;
1151{
1152 char_u *first = NULL;
1153 char_u *chain = NULL;
1154 char_u *latest;
1155 int flags;
1156 int cont = TRUE;
1157
1158 *flagp = WORST; /* Tentatively. */
1159
1160 while (cont)
1161 {
1162 switch (peekchr())
1163 {
1164 case NUL:
1165 case Magic('|'):
1166 case Magic('&'):
1167 case Magic(')'):
1168 cont = FALSE;
1169 break;
1170 case Magic('Z'):
1171#ifdef FEAT_MBYTE
1172 regflags |= RF_ICOMBINE;
1173#endif
1174 skipchr_keepstart();
1175 break;
1176 case Magic('c'):
1177 regflags |= RF_ICASE;
1178 skipchr_keepstart();
1179 break;
1180 case Magic('C'):
1181 regflags |= RF_NOICASE;
1182 skipchr_keepstart();
1183 break;
1184 case Magic('v'):
1185 reg_magic = MAGIC_ALL;
1186 skipchr_keepstart();
1187 curchr = -1;
1188 break;
1189 case Magic('m'):
1190 reg_magic = MAGIC_ON;
1191 skipchr_keepstart();
1192 curchr = -1;
1193 break;
1194 case Magic('M'):
1195 reg_magic = MAGIC_OFF;
1196 skipchr_keepstart();
1197 curchr = -1;
1198 break;
1199 case Magic('V'):
1200 reg_magic = MAGIC_NONE;
1201 skipchr_keepstart();
1202 curchr = -1;
1203 break;
1204 default:
1205 latest = regpiece(&flags);
1206 if (latest == NULL)
1207 return NULL;
1208 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1209 if (chain == NULL) /* First piece. */
1210 *flagp |= flags & SPSTART;
1211 else
1212 regtail(chain, latest);
1213 chain = latest;
1214 if (first == NULL)
1215 first = latest;
1216 break;
1217 }
1218 }
1219 if (first == NULL) /* Loop ran zero times. */
1220 first = regnode(NOTHING);
1221 return first;
1222}
1223
1224/*
1225 * regpiece - something followed by possible [*+=]
1226 *
1227 * Note that the branching code sequences used for = and the general cases
1228 * of * and + are somewhat optimized: they use the same NOTHING node as
1229 * both the endmarker for their branch list and the body of the last branch.
1230 * It might seem that this node could be dispensed with entirely, but the
1231 * endmarker role is not redundant.
1232 */
1233 static char_u *
1234regpiece(flagp)
1235 int *flagp;
1236{
1237 char_u *ret;
1238 int op;
1239 char_u *next;
1240 int flags;
1241 long minval;
1242 long maxval;
1243
1244 ret = regatom(&flags);
1245 if (ret == NULL)
1246 return NULL;
1247
1248 op = peekchr();
1249 if (re_multi_type(op) == NOT_MULTI)
1250 {
1251 *flagp = flags;
1252 return ret;
1253 }
1254 if (!(flags & HASWIDTH) && re_multi_type(op) == MULTI_MULT)
1255 {
1256 if (op == Magic('*'))
1257 EMSG_M_RET_NULL(_("E56: %s* operand could be empty"),
1258 reg_magic >= MAGIC_ON);
1259 if (op == Magic('+'))
1260 EMSG_M_RET_NULL(_("E57: %s+ operand could be empty"),
1261 reg_magic == MAGIC_ALL);
1262 /* "\{}" is checked below, it's allowed when there is an upper limit */
1263 }
1264 /* default flags */
1265 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1266
1267 skipchr();
1268 switch (op)
1269 {
1270 case Magic('*'):
1271 if (flags & SIMPLE)
1272 reginsert(STAR, ret);
1273 else
1274 {
1275 /* Emit x* as (x&|), where & means "self". */
1276 reginsert(BRANCH, ret); /* Either x */
1277 regoptail(ret, regnode(BACK)); /* and loop */
1278 regoptail(ret, ret); /* back */
1279 regtail(ret, regnode(BRANCH)); /* or */
1280 regtail(ret, regnode(NOTHING)); /* null. */
1281 }
1282 break;
1283
1284 case Magic('+'):
1285 if (flags & SIMPLE)
1286 reginsert(PLUS, ret);
1287 else
1288 {
1289 /* Emit x+ as x(&|), where & means "self". */
1290 next = regnode(BRANCH); /* Either */
1291 regtail(ret, next);
1292 regtail(regnode(BACK), ret); /* loop back */
1293 regtail(next, regnode(BRANCH)); /* or */
1294 regtail(ret, regnode(NOTHING)); /* null. */
1295 }
1296 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1297 break;
1298
1299 case Magic('@'):
1300 {
1301 int lop = END;
1302
1303 switch (no_Magic(getchr()))
1304 {
1305 case '=': lop = MATCH; break; /* \@= */
1306 case '!': lop = NOMATCH; break; /* \@! */
1307 case '>': lop = SUBPAT; break; /* \@> */
1308 case '<': switch (no_Magic(getchr()))
1309 {
1310 case '=': lop = BEHIND; break; /* \@<= */
1311 case '!': lop = NOBEHIND; break; /* \@<! */
1312 }
1313 }
1314 if (lop == END)
1315 EMSG_M_RET_NULL(_("E59: invalid character after %s@"),
1316 reg_magic == MAGIC_ALL);
1317 /* Look behind must match with behind_pos. */
1318 if (lop == BEHIND || lop == NOBEHIND)
1319 {
1320 regtail(ret, regnode(BHPOS));
1321 *flagp |= HASLOOKBH;
1322 }
1323 regtail(ret, regnode(END)); /* operand ends */
1324 reginsert(lop, ret);
1325 break;
1326 }
1327
1328 case Magic('?'):
1329 case Magic('='):
1330 /* Emit x= as (x|) */
1331 reginsert(BRANCH, ret); /* Either x */
1332 regtail(ret, regnode(BRANCH)); /* or */
1333 next = regnode(NOTHING); /* null. */
1334 regtail(ret, next);
1335 regoptail(ret, next);
1336 break;
1337
1338 case Magic('{'):
1339 if (!read_limits(&minval, &maxval))
1340 return NULL;
1341 if (!(flags & HASWIDTH) && (maxval > minval
1342 ? maxval >= MAX_LIMIT : minval >= MAX_LIMIT))
1343 EMSG_M_RET_NULL(_("E58: %s{ operand could be empty"),
1344 reg_magic == MAGIC_ALL);
1345 if (flags & SIMPLE)
1346 {
1347 reginsert(BRACE_SIMPLE, ret);
1348 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1349 }
1350 else
1351 {
1352 if (num_complex_braces >= 10)
1353 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"),
1354 reg_magic == MAGIC_ALL);
1355 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1356 regoptail(ret, regnode(BACK));
1357 regoptail(ret, ret);
1358 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1359 ++num_complex_braces;
1360 }
1361 if (minval > 0 && maxval > 0)
1362 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1363 break;
1364 }
1365 if (re_multi_type(peekchr()) != NOT_MULTI)
1366 {
1367 /* Can't have a multi follow a multi. */
1368 if (peekchr() == Magic('*'))
1369 sprintf((char *)IObuff, _("E61: Nested %s*"),
1370 reg_magic >= MAGIC_ON ? "" : "\\");
1371 else
1372 sprintf((char *)IObuff, _("E62: Nested %s%c"),
1373 reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1374 EMSG_RET_NULL(IObuff);
1375 }
1376
1377 return ret;
1378}
1379
1380/*
1381 * regatom - the lowest level
1382 *
1383 * Optimization: gobbles an entire sequence of ordinary characters so that
1384 * it can turn them into a single node, which is smaller to store and
1385 * faster to run. Don't do this when one_exactly is set.
1386 */
1387 static char_u *
1388regatom(flagp)
1389 int *flagp;
1390{
1391 char_u *ret;
1392 int flags;
1393 int cpo_lit; /* 'cpoptions' contains 'l' flag */
1394 int c;
1395 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1396 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
1397 FNAME, SFNAME, PRINT, SPRINT,
1398 WHITE, NWHITE, DIGIT, NDIGIT,
1399 HEX, NHEX, OCTAL, NOCTAL,
1400 WORD, NWORD, HEAD, NHEAD,
1401 ALPHA, NALPHA, LOWER, NLOWER,
1402 UPPER, NUPPER
1403 };
1404 char_u *p;
1405 int extra = 0;
1406
1407 *flagp = WORST; /* Tentatively. */
1408 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
1409
1410 c = getchr();
1411 switch (c)
1412 {
1413 case Magic('^'):
1414 ret = regnode(BOL);
1415 break;
1416
1417 case Magic('$'):
1418 ret = regnode(EOL);
1419#if defined(FEAT_SYN_HL) || defined(PROTO)
1420 had_eol = TRUE;
1421#endif
1422 break;
1423
1424 case Magic('<'):
1425 ret = regnode(BOW);
1426 break;
1427
1428 case Magic('>'):
1429 ret = regnode(EOW);
1430 break;
1431
1432 case Magic('_'):
1433 c = no_Magic(getchr());
1434 if (c == '^') /* "\_^" is start-of-line */
1435 {
1436 ret = regnode(BOL);
1437 break;
1438 }
1439 if (c == '$') /* "\_$" is end-of-line */
1440 {
1441 ret = regnode(EOL);
1442#if defined(FEAT_SYN_HL) || defined(PROTO)
1443 had_eol = TRUE;
1444#endif
1445 break;
1446 }
1447
1448 extra = ADD_NL;
1449 *flagp |= HASNL;
1450
1451 /* "\_[" is character range plus newline */
1452 if (c == '[')
1453 goto collection;
1454
1455 /* "\_x" is character class plus newline */
1456 /*FALLTHROUGH*/
1457
1458 /*
1459 * Character classes.
1460 */
1461 case Magic('.'):
1462 case Magic('i'):
1463 case Magic('I'):
1464 case Magic('k'):
1465 case Magic('K'):
1466 case Magic('f'):
1467 case Magic('F'):
1468 case Magic('p'):
1469 case Magic('P'):
1470 case Magic('s'):
1471 case Magic('S'):
1472 case Magic('d'):
1473 case Magic('D'):
1474 case Magic('x'):
1475 case Magic('X'):
1476 case Magic('o'):
1477 case Magic('O'):
1478 case Magic('w'):
1479 case Magic('W'):
1480 case Magic('h'):
1481 case Magic('H'):
1482 case Magic('a'):
1483 case Magic('A'):
1484 case Magic('l'):
1485 case Magic('L'):
1486 case Magic('u'):
1487 case Magic('U'):
1488 p = vim_strchr(classchars, no_Magic(c));
1489 if (p == NULL)
1490 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1491 ret = regnode(classcodes[p - classchars] + extra);
1492 *flagp |= HASWIDTH | SIMPLE;
1493 break;
1494
1495 case Magic('n'):
1496 if (reg_string)
1497 {
1498 /* In a string "\n" matches a newline character. */
1499 ret = regnode(EXACTLY);
1500 regc(NL);
1501 regc(NUL);
1502 *flagp |= HASWIDTH | SIMPLE;
1503 }
1504 else
1505 {
1506 /* In buffer text "\n" matches the end of a line. */
1507 ret = regnode(NEWL);
1508 *flagp |= HASWIDTH | HASNL;
1509 }
1510 break;
1511
1512 case Magic('('):
1513 if (one_exactly)
1514 EMSG_ONE_RET_NULL;
1515 ret = reg(REG_PAREN, &flags);
1516 if (ret == NULL)
1517 return NULL;
1518 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1519 break;
1520
1521 case NUL:
1522 case Magic('|'):
1523 case Magic('&'):
1524 case Magic(')'):
1525 EMSG_RET_NULL(_(e_internal)); /* Supposed to be caught earlier. */
1526 /* NOTREACHED */
1527
1528 case Magic('='):
1529 case Magic('?'):
1530 case Magic('+'):
1531 case Magic('@'):
1532 case Magic('{'):
1533 case Magic('*'):
1534 c = no_Magic(c);
1535 sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1536 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1537 ? "" : "\\", c);
1538 EMSG_RET_NULL(IObuff);
1539 /* NOTREACHED */
1540
1541 case Magic('~'): /* previous substitute pattern */
1542 if (reg_prev_sub)
1543 {
1544 char_u *lp;
1545
1546 ret = regnode(EXACTLY);
1547 lp = reg_prev_sub;
1548 while (*lp != NUL)
1549 regc(*lp++);
1550 regc(NUL);
1551 if (*reg_prev_sub != NUL)
1552 {
1553 *flagp |= HASWIDTH;
1554 if ((lp - reg_prev_sub) == 1)
1555 *flagp |= SIMPLE;
1556 }
1557 }
1558 else
1559 EMSG_RET_NULL(_(e_nopresub));
1560 break;
1561
1562 case Magic('1'):
1563 case Magic('2'):
1564 case Magic('3'):
1565 case Magic('4'):
1566 case Magic('5'):
1567 case Magic('6'):
1568 case Magic('7'):
1569 case Magic('8'):
1570 case Magic('9'):
1571 {
1572 int refnum;
1573
1574 refnum = c - Magic('0');
1575 /*
1576 * Check if the back reference is legal. We must have seen the
1577 * close brace.
1578 * TODO: Should also check that we don't refer to something
1579 * that is repeated (+*=): what instance of the repetition
1580 * should we match?
1581 */
1582 if (!had_endbrace[refnum])
1583 {
1584 /* Trick: check if "@<=" or "@<!" follows, in which case
1585 * the \1 can appear before the referenced match. */
1586 for (p = regparse; *p != NUL; ++p)
1587 if (p[0] == '@' && p[1] == '<'
1588 && (p[2] == '!' || p[2] == '='))
1589 break;
1590 if (*p == NUL)
1591 EMSG_RET_NULL(_("E65: Illegal back reference"));
1592 }
1593 ret = regnode(BACKREF + refnum);
1594 }
1595 break;
1596
1597#ifdef FEAT_SYN_HL
1598 case Magic('z'):
1599 {
1600 c = no_Magic(getchr());
1601 switch (c)
1602 {
1603 case '(': if (reg_do_extmatch != REX_SET)
1604 EMSG_RET_NULL(_("E66: \\z( not allowed here"));
1605 if (one_exactly)
1606 EMSG_ONE_RET_NULL;
1607 ret = reg(REG_ZPAREN, &flags);
1608 if (ret == NULL)
1609 return NULL;
1610 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1611 re_has_z = REX_SET;
1612 break;
1613
1614 case '1':
1615 case '2':
1616 case '3':
1617 case '4':
1618 case '5':
1619 case '6':
1620 case '7':
1621 case '8':
1622 case '9': if (reg_do_extmatch != REX_USE)
1623 EMSG_RET_NULL(_("E67: \\z1 et al. not allowed here"));
1624 ret = regnode(ZREF + c - '0');
1625 re_has_z = REX_USE;
1626 break;
1627
1628 case 's': ret = regnode(MOPEN + 0);
1629 break;
1630
1631 case 'e': ret = regnode(MCLOSE + 0);
1632 break;
1633
1634 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1635 }
1636 }
1637 break;
1638#endif
1639
1640 case Magic('%'):
1641 {
1642 c = no_Magic(getchr());
1643 switch (c)
1644 {
1645 /* () without a back reference */
1646 case '(':
1647 if (one_exactly)
1648 EMSG_ONE_RET_NULL;
1649 ret = reg(REG_NPAREN, &flags);
1650 if (ret == NULL)
1651 return NULL;
1652 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1653 break;
1654
1655 /* Catch \%^ and \%$ regardless of where they appear in the
1656 * pattern -- regardless of whether or not it makes sense. */
1657 case '^':
1658 ret = regnode(RE_BOF);
1659 break;
1660
1661 case '$':
1662 ret = regnode(RE_EOF);
1663 break;
1664
1665 case '#':
1666 ret = regnode(CURSOR);
1667 break;
1668
1669 /* \%[abc]: Emit as a list of branches, all ending at the last
1670 * branch which matches nothing. */
1671 case '[':
1672 if (one_exactly) /* doesn't nest */
1673 EMSG_ONE_RET_NULL;
1674 {
1675 char_u *lastbranch;
1676 char_u *lastnode = NULL;
1677 char_u *br;
1678
1679 ret = NULL;
1680 while ((c = getchr()) != ']')
1681 {
1682 if (c == NUL)
1683 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["),
1684 reg_magic == MAGIC_ALL);
1685 br = regnode(BRANCH);
1686 if (ret == NULL)
1687 ret = br;
1688 else
1689 regtail(lastnode, br);
1690
1691 ungetchr();
1692 one_exactly = TRUE;
1693 lastnode = regatom(flagp);
1694 one_exactly = FALSE;
1695 if (lastnode == NULL)
1696 return NULL;
1697 }
1698 if (ret == NULL)
1699 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"),
1700 reg_magic == MAGIC_ALL);
1701 lastbranch = regnode(BRANCH);
1702 br = regnode(NOTHING);
1703 if (ret != JUST_CALC_SIZE)
1704 {
1705 regtail(lastnode, br);
1706 regtail(lastbranch, br);
1707 /* connect all branches to the NOTHING
1708 * branch at the end */
1709 for (br = ret; br != lastnode; )
1710 {
1711 if (OP(br) == BRANCH)
1712 {
1713 regtail(br, lastbranch);
1714 br = OPERAND(br);
1715 }
1716 else
1717 br = regnext(br);
1718 }
1719 }
1720 *flagp &= ~HASWIDTH;
1721 break;
1722 }
1723
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001724 case 'd': /* %d123 decimal */
1725 case 'o': /* %o123 octal */
1726 case 'x': /* %xab hex 2 */
1727 case 'u': /* %uabcd hex 4 */
1728 case 'U': /* %U1234abcd hex 8 */
1729 {
1730 int i;
1731
1732 switch (c)
1733 {
1734 case 'd': i = getdecchrs(); break;
1735 case 'o': i = getoctchrs(); break;
1736 case 'x': i = gethexchrs(2); break;
1737 case 'u': i = gethexchrs(4); break;
1738 case 'U': i = gethexchrs(8); break;
1739 default: i = -1; break;
1740 }
1741
1742 if (i < 0)
1743 EMSG_M_RET_NULL(
1744 _("E678: Invalid character after %s%%[dxouU]"),
1745 reg_magic == MAGIC_ALL);
1746 ret = regnode(EXACTLY);
1747 if (i == 0)
1748 regc(0x0a);
1749 else
1750#ifdef FEAT_MBYTE
1751 regmbc(i);
1752#else
1753 regc(i);
1754#endif
1755 regc(NUL);
1756 *flagp |= HASWIDTH;
1757 break;
1758 }
1759
Bram Moolenaar071d4272004-06-13 20:20:40 +00001760 default:
1761 if (VIM_ISDIGIT(c) || c == '<' || c == '>')
1762 {
1763 long_u n = 0;
1764 int cmp;
1765
1766 cmp = c;
1767 if (cmp == '<' || cmp == '>')
1768 c = getchr();
1769 while (VIM_ISDIGIT(c))
1770 {
1771 n = n * 10 + (c - '0');
1772 c = getchr();
1773 }
1774 if (c == 'l' || c == 'c' || c == 'v')
1775 {
1776 if (c == 'l')
1777 ret = regnode(RE_LNUM);
1778 else if (c == 'c')
1779 ret = regnode(RE_COL);
1780 else
1781 ret = regnode(RE_VCOL);
1782 if (ret == JUST_CALC_SIZE)
1783 regsize += 5;
1784 else
1785 {
1786 /* put the number and the optional
1787 * comparator after the opcode */
1788 regcode = re_put_long(regcode, n);
1789 *regcode++ = cmp;
1790 }
1791 break;
1792 }
1793 }
1794
1795 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"),
1796 reg_magic == MAGIC_ALL);
1797 }
1798 }
1799 break;
1800
1801 case Magic('['):
1802collection:
1803 {
1804 char_u *lp;
1805
1806 /*
1807 * If there is no matching ']', we assume the '[' is a normal
1808 * character. This makes 'incsearch' and ":help [" work.
1809 */
1810 lp = skip_anyof(regparse);
1811 if (*lp == ']') /* there is a matching ']' */
1812 {
1813 int startc = -1; /* > 0 when next '-' is a range */
1814 int endc;
1815
1816 /*
1817 * In a character class, different parsing rules apply.
1818 * Not even \ is special anymore, nothing is.
1819 */
1820 if (*regparse == '^') /* Complement of range. */
1821 {
1822 ret = regnode(ANYBUT + extra);
1823 regparse++;
1824 }
1825 else
1826 ret = regnode(ANYOF + extra);
1827
1828 /* At the start ']' and '-' mean the literal character. */
1829 if (*regparse == ']' || *regparse == '-')
1830 regc(*regparse++);
1831
1832 while (*regparse != NUL && *regparse != ']')
1833 {
1834 if (*regparse == '-')
1835 {
1836 ++regparse;
1837 /* The '-' is not used for a range at the end and
1838 * after or before a '\n'. */
1839 if (*regparse == ']' || *regparse == NUL
1840 || startc == -1
1841 || (regparse[0] == '\\' && regparse[1] == 'n'))
1842 {
1843 regc('-');
1844 startc = '-'; /* [--x] is a range */
1845 }
1846 else
1847 {
1848#ifdef FEAT_MBYTE
1849 if (has_mbyte)
1850 endc = mb_ptr2char_adv(&regparse);
1851 else
1852#endif
1853 endc = *regparse++;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001854
1855 /* Handle \o40, \x20 and \u20AC style sequences */
1856 if (endc == '\\' && !cpo_lit)
1857 endc = coll_get_char();
1858
Bram Moolenaar071d4272004-06-13 20:20:40 +00001859 if (startc > endc)
1860 EMSG_RET_NULL(_(e_invrange));
1861#ifdef FEAT_MBYTE
1862 if (has_mbyte && ((*mb_char2len)(startc) > 1
1863 || (*mb_char2len)(endc) > 1))
1864 {
1865 /* Limit to a range of 256 chars */
1866 if (endc > startc + 256)
1867 EMSG_RET_NULL(_(e_invrange));
1868 while (++startc <= endc)
1869 regmbc(startc);
1870 }
1871 else
1872#endif
1873 {
1874#ifdef EBCDIC
1875 int alpha_only = FALSE;
1876
1877 /* for alphabetical range skip the gaps
1878 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
1879 if (isalpha(startc) && isalpha(endc))
1880 alpha_only = TRUE;
1881#endif
1882 while (++startc <= endc)
1883#ifdef EBCDIC
1884 if (!alpha_only || isalpha(startc))
1885#endif
1886 regc(startc);
1887 }
1888 startc = -1;
1889 }
1890 }
1891 /*
1892 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1893 * accepts "\t", "\e", etc., but only when the 'l' flag in
1894 * 'cpoptions' is not included.
1895 */
1896 else if (*regparse == '\\'
1897 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1898 || (!cpo_lit
1899 && vim_strchr(REGEXP_ABBR,
1900 regparse[1]) != NULL)))
1901 {
1902 regparse++;
1903 if (*regparse == 'n')
1904 {
1905 /* '\n' in range: also match NL */
1906 if (ret != JUST_CALC_SIZE)
1907 {
1908 if (*ret == ANYBUT)
1909 *ret = ANYBUT + ADD_NL;
1910 else if (*ret == ANYOF)
1911 *ret = ANYOF + ADD_NL;
1912 /* else: must have had a \n already */
1913 }
1914 *flagp |= HASNL;
1915 regparse++;
1916 startc = -1;
1917 }
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001918 else if (*regparse == 'd'
1919 || *regparse == 'o'
1920 || *regparse == 'x'
1921 || *regparse == 'u'
1922 || *regparse == 'U')
1923 {
1924 startc = coll_get_char();
1925 if (startc == 0)
1926 regc(0x0a);
1927 else
1928#ifdef FEAT_MBYTE
1929 regmbc(startc);
1930#else
1931 regc(startc);
1932#endif
1933 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001934 else
1935 {
1936 startc = backslash_trans(*regparse++);
1937 regc(startc);
1938 }
1939 }
1940 else if (*regparse == '[')
1941 {
1942 int c_class;
1943 int cu;
1944
1945 c_class = skip_class_name(&regparse);
1946 startc = -1;
1947 /* Characters assumed to be 8 bits! */
1948 switch (c_class)
1949 {
1950 case CLASS_NONE:
1951 /* literal '[', allow [[-x] as a range */
1952 startc = *regparse++;
1953 regc(startc);
1954 break;
1955 case CLASS_ALNUM:
1956 for (cu = 1; cu <= 255; cu++)
1957 if (isalnum(cu))
1958 regc(cu);
1959 break;
1960 case CLASS_ALPHA:
1961 for (cu = 1; cu <= 255; cu++)
1962 if (isalpha(cu))
1963 regc(cu);
1964 break;
1965 case CLASS_BLANK:
1966 regc(' ');
1967 regc('\t');
1968 break;
1969 case CLASS_CNTRL:
1970 for (cu = 1; cu <= 255; cu++)
1971 if (iscntrl(cu))
1972 regc(cu);
1973 break;
1974 case CLASS_DIGIT:
1975 for (cu = 1; cu <= 255; cu++)
1976 if (VIM_ISDIGIT(cu))
1977 regc(cu);
1978 break;
1979 case CLASS_GRAPH:
1980 for (cu = 1; cu <= 255; cu++)
1981 if (isgraph(cu))
1982 regc(cu);
1983 break;
1984 case CLASS_LOWER:
1985 for (cu = 1; cu <= 255; cu++)
1986 if (islower(cu))
1987 regc(cu);
1988 break;
1989 case CLASS_PRINT:
1990 for (cu = 1; cu <= 255; cu++)
1991 if (vim_isprintc(cu))
1992 regc(cu);
1993 break;
1994 case CLASS_PUNCT:
1995 for (cu = 1; cu <= 255; cu++)
1996 if (ispunct(cu))
1997 regc(cu);
1998 break;
1999 case CLASS_SPACE:
2000 for (cu = 9; cu <= 13; cu++)
2001 regc(cu);
2002 regc(' ');
2003 break;
2004 case CLASS_UPPER:
2005 for (cu = 1; cu <= 255; cu++)
2006 if (isupper(cu))
2007 regc(cu);
2008 break;
2009 case CLASS_XDIGIT:
2010 for (cu = 1; cu <= 255; cu++)
2011 if (vim_isxdigit(cu))
2012 regc(cu);
2013 break;
2014 case CLASS_TAB:
2015 regc('\t');
2016 break;
2017 case CLASS_RETURN:
2018 regc('\r');
2019 break;
2020 case CLASS_BACKSPACE:
2021 regc('\b');
2022 break;
2023 case CLASS_ESCAPE:
2024 regc('\033');
2025 break;
2026 }
2027 }
2028 else
2029 {
2030#ifdef FEAT_MBYTE
2031 if (has_mbyte)
2032 {
2033 int len;
2034
2035 /* produce a multibyte character, including any
2036 * following composing characters */
2037 startc = mb_ptr2char(regparse);
2038 len = (*mb_ptr2len_check)(regparse);
2039 if (enc_utf8 && utf_char2len(startc) != len)
2040 startc = -1; /* composing chars */
2041 while (--len >= 0)
2042 regc(*regparse++);
2043 }
2044 else
2045#endif
2046 {
2047 startc = *regparse++;
2048 regc(startc);
2049 }
2050 }
2051 }
2052 regc(NUL);
2053 prevchr_len = 1; /* last char was the ']' */
2054 if (*regparse != ']')
2055 EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */
2056 skipchr(); /* let's be friends with the lexer again */
2057 *flagp |= HASWIDTH | SIMPLE;
2058 break;
2059 }
2060 }
2061 /* FALLTHROUGH */
2062
2063 default:
2064 {
2065 int len;
2066
2067#ifdef FEAT_MBYTE
2068 /* A multi-byte character is handled as a separate atom if it's
2069 * before a multi. */
2070 if (has_mbyte && (*mb_char2len)(c) > 1
2071 && re_multi_type(peekchr()) != NOT_MULTI)
2072 {
2073 ret = regnode(MULTIBYTECODE);
2074 regmbc(c);
2075 *flagp |= HASWIDTH | SIMPLE;
2076 break;
2077 }
2078#endif
2079
2080 ret = regnode(EXACTLY);
2081
2082 /*
2083 * Append characters as long as:
2084 * - there is no following multi, we then need the character in
2085 * front of it as a single character operand
2086 * - not running into a Magic character
2087 * - "one_exactly" is not set
2088 * But always emit at least one character. Might be a Multi,
2089 * e.g., a "[" without matching "]".
2090 */
2091 for (len = 0; c != NUL && (len == 0
2092 || (re_multi_type(peekchr()) == NOT_MULTI
2093 && !one_exactly
2094 && !is_Magic(c))); ++len)
2095 {
2096 c = no_Magic(c);
2097#ifdef FEAT_MBYTE
2098 if (has_mbyte)
2099 {
2100 regmbc(c);
2101 if (enc_utf8)
2102 {
2103 int off;
2104 int l;
2105
2106 /* Need to get composing character too, directly
2107 * access regparse for that, because skipchr() skips
2108 * over composing chars. */
2109 ungetchr();
2110 if (*regparse == '\\' && regparse[1] != NUL)
2111 off = 1;
2112 else
2113 off = 0;
2114 for (;;)
2115 {
2116 l = utf_ptr2len_check(regparse + off);
2117 if (!UTF_COMPOSINGLIKE(regparse + off,
2118 regparse + off + l))
2119 break;
2120 off += l;
2121 regmbc(utf_ptr2char(regparse + off));
2122 }
2123 skipchr();
2124 }
2125 }
2126 else
2127#endif
2128 regc(c);
2129 c = getchr();
2130 }
2131 ungetchr();
2132
2133 regc(NUL);
2134 *flagp |= HASWIDTH;
2135 if (len == 1)
2136 *flagp |= SIMPLE;
2137 }
2138 break;
2139 }
2140
2141 return ret;
2142}
2143
2144/*
2145 * emit a node
2146 * Return pointer to generated code.
2147 */
2148 static char_u *
2149regnode(op)
2150 int op;
2151{
2152 char_u *ret;
2153
2154 ret = regcode;
2155 if (ret == JUST_CALC_SIZE)
2156 regsize += 3;
2157 else
2158 {
2159 *regcode++ = op;
2160 *regcode++ = NUL; /* Null "next" pointer. */
2161 *regcode++ = NUL;
2162 }
2163 return ret;
2164}
2165
2166/*
2167 * Emit (if appropriate) a byte of code
2168 */
2169 static void
2170regc(b)
2171 int b;
2172{
2173 if (regcode == JUST_CALC_SIZE)
2174 regsize++;
2175 else
2176 *regcode++ = b;
2177}
2178
2179#ifdef FEAT_MBYTE
2180/*
2181 * Emit (if appropriate) a multi-byte character of code
2182 */
2183 static void
2184regmbc(c)
2185 int c;
2186{
2187 if (regcode == JUST_CALC_SIZE)
2188 regsize += (*mb_char2len)(c);
2189 else
2190 regcode += (*mb_char2bytes)(c, regcode);
2191}
2192#endif
2193
2194/*
2195 * reginsert - insert an operator in front of already-emitted operand
2196 *
2197 * Means relocating the operand.
2198 */
2199 static void
2200reginsert(op, opnd)
2201 int op;
2202 char_u *opnd;
2203{
2204 char_u *src;
2205 char_u *dst;
2206 char_u *place;
2207
2208 if (regcode == JUST_CALC_SIZE)
2209 {
2210 regsize += 3;
2211 return;
2212 }
2213 src = regcode;
2214 regcode += 3;
2215 dst = regcode;
2216 while (src > opnd)
2217 *--dst = *--src;
2218
2219 place = opnd; /* Op node, where operand used to be. */
2220 *place++ = op;
2221 *place++ = NUL;
2222 *place = NUL;
2223}
2224
2225/*
2226 * reginsert_limits - insert an operator in front of already-emitted operand.
2227 * The operator has the given limit values as operands. Also set next pointer.
2228 *
2229 * Means relocating the operand.
2230 */
2231 static void
2232reginsert_limits(op, minval, maxval, opnd)
2233 int op;
2234 long minval;
2235 long maxval;
2236 char_u *opnd;
2237{
2238 char_u *src;
2239 char_u *dst;
2240 char_u *place;
2241
2242 if (regcode == JUST_CALC_SIZE)
2243 {
2244 regsize += 11;
2245 return;
2246 }
2247 src = regcode;
2248 regcode += 11;
2249 dst = regcode;
2250 while (src > opnd)
2251 *--dst = *--src;
2252
2253 place = opnd; /* Op node, where operand used to be. */
2254 *place++ = op;
2255 *place++ = NUL;
2256 *place++ = NUL;
2257 place = re_put_long(place, (long_u)minval);
2258 place = re_put_long(place, (long_u)maxval);
2259 regtail(opnd, place);
2260}
2261
2262/*
2263 * Write a long as four bytes at "p" and return pointer to the next char.
2264 */
2265 static char_u *
2266re_put_long(p, val)
2267 char_u *p;
2268 long_u val;
2269{
2270 *p++ = (char_u) ((val >> 24) & 0377);
2271 *p++ = (char_u) ((val >> 16) & 0377);
2272 *p++ = (char_u) ((val >> 8) & 0377);
2273 *p++ = (char_u) (val & 0377);
2274 return p;
2275}
2276
2277/*
2278 * regtail - set the next-pointer at the end of a node chain
2279 */
2280 static void
2281regtail(p, val)
2282 char_u *p;
2283 char_u *val;
2284{
2285 char_u *scan;
2286 char_u *temp;
2287 int offset;
2288
2289 if (p == JUST_CALC_SIZE)
2290 return;
2291
2292 /* Find last node. */
2293 scan = p;
2294 for (;;)
2295 {
2296 temp = regnext(scan);
2297 if (temp == NULL)
2298 break;
2299 scan = temp;
2300 }
2301
2302 if (OP(scan) == BACK)
2303 offset = (int)(scan - val);
2304 else
2305 offset = (int)(val - scan);
2306 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
2307 *(scan + 2) = (char_u) (offset & 0377);
2308}
2309
2310/*
2311 * regoptail - regtail on item after a BRANCH; nop if none
2312 */
2313 static void
2314regoptail(p, val)
2315 char_u *p;
2316 char_u *val;
2317{
2318 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2319 if (p == NULL || p == JUST_CALC_SIZE
2320 || (OP(p) != BRANCH
2321 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2322 return;
2323 regtail(OPERAND(p), val);
2324}
2325
/*
 * getchr() - get the next character from the pattern. We know about
 * magic and such, so therefore we need a lexical analyzer.
 *
 * The variables below hold the state of that lexer.  They are reset by
 * initchr(), updated by skipchr() and partly restored by ungetchr().
 */

/* static int       curchr; */  /* not defined here; the functions below use
                                 * a "curchr" that is declared elsewhere */
static int      prevprevchr;    /* value "prevchr" had before the last
                                 * skipchr() */
static int      prevchr;        /* value "curchr" had when skipchr() was last
                                 * called */
static int      nextchr;        /* used for ungetchr(): the character that
                                 * was put back, -1 when there is none */
/*
 * Note: prevchr is sometimes -1 when we are not at the start,
 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
 * taken to be magic -- webb
 */
static int      at_start;       /* True when on the first character */
static int      prev_at_start;  /* True when on the second character */
2342
2343 static void
2344initchr(str)
2345 char_u *str;
2346{
2347 regparse = str;
2348 prevchr_len = 0;
2349 curchr = prevprevchr = prevchr = nextchr = -1;
2350 at_start = TRUE;
2351 prev_at_start = FALSE;
2352}
2353
/*
 * Return the next lexed character of the pattern without consuming it.
 *
 * The character is only computed when "curchr" is -1; otherwise the value
 * that was computed before is returned.  A character that has a special
 * meaning at this position, given the value of "reg_magic", is returned as
 * Magic(c).  A backslash is handled together with the character following
 * it, "regparse" keeps pointing to the backslash.
 */
    static int
peekchr()
{
    if (curchr == -1)
    {
        switch (curchr = regparse[0])
        {
        case '.':
        case '[':
        case '~':
            /* magic when 'magic' is on */
            if (reg_magic >= MAGIC_ON)
                curchr = Magic(curchr);
            break;
        case '(':
        case ')':
        case '{':
        case '%':
        case '+':
        case '=':
        case '?':
        case '@':
        case '!':
        case '&':
        case '|':
        case '<':
        case '>':
        case '#':       /* future ext. */
        case '"':       /* future ext. */
        case '\'':      /* future ext. */
        case ',':       /* future ext. */
        case '-':       /* future ext. */
        case ':':       /* future ext. */
        case ';':       /* future ext. */
        case '`':       /* future ext. */
        case '/':       /* Can't be used in / command */
            /* magic only after "\v" */
            if (reg_magic == MAGIC_ALL)
                curchr = Magic(curchr);
            break;
        case '*':
            /* * is not magic as the very first character, eg "?*ptr" and when
             * after '^', eg "/^*ptr" */
            if (reg_magic >= MAGIC_ON && !at_start
                    && !(prev_at_start && prevchr == Magic('^')))
                curchr = Magic('*');
            break;
        case '^':
            /* '^' is only magic as the very first character and if it's after
             * "\(", "\|", "\&" or "\n"; also after "\%(" and always when
             * reg_magic is MAGIC_ALL */
            if (reg_magic >= MAGIC_OFF
                    && (at_start
                        || reg_magic == MAGIC_ALL
                        || prevchr == Magic('(')
                        || prevchr == Magic('|')
                        || prevchr == Magic('&')
                        || prevchr == Magic('n')
                        || (no_Magic(prevchr) == '('
                            && prevprevchr == Magic('%'))))
            {
                curchr = Magic('^');
                /* What follows the '^' is again at the start. */
                at_start = TRUE;
                prev_at_start = FALSE;
            }
            break;
        case '$':
            /* '$' is only magic as the very last char and if it's in front of
             * either "\|", "\)", "\&", or "\n"; always when reg_magic is
             * MAGIC_ALL */
            if (reg_magic >= MAGIC_OFF)
            {
                char_u *p = regparse + 1;

                /* ignore \c \C \m \M and \Z after '$' */
                while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
                                || p[1] == 'm' || p[1] == 'M' || p[1] == 'Z'))
                    p += 2;
                if (p[0] == NUL
                        || (p[0] == '\\'
                            && (p[1] == '|' || p[1] == '&' || p[1] == ')'
                                || p[1] == 'n'))
                        || reg_magic == MAGIC_ALL)
                    curchr = Magic('$');
            }
            break;
        case '\\':
            {
                int c = regparse[1];

                if (c == NUL)
                    curchr = '\\';      /* trailing '\' */
                else if (
#ifdef EBCDIC
                        vim_strchr(META, c)
#else
                        c <= '~' && META_flags[c]
#endif
                        )
                {
                    /*
                     * META contains everything that may be magic sometimes,
                     * except ^ and $ ("\^" and "\$" are only magic after
                     * "\v").  We now fetch the next character and toggle its
                     * magicness.  Therefore, \ is so meta-magic that it is
                     * not in META.
                     */
                    curchr = -1;
                    prev_at_start = at_start;
                    at_start = FALSE;   /* be able to say "/\*ptr" */
                    /* Lex the character after the backslash with a recursive
                     * call, then put "regparse" back on the backslash. */
                    ++regparse;
                    peekchr();
                    --regparse;
                    curchr = toggle_Magic(curchr);
                }
                else if (vim_strchr(REGEXP_ABBR, c))
                {
                    /*
                     * Handle abbreviations, like "\t" for TAB -- webb
                     */
                    curchr = backslash_trans(c);
                }
                else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
                    curchr = toggle_Magic(c);
                else
                {
                    /*
                     * Next character can never be (made) magic?
                     * Then backslashing it won't do anything.
                     */
#ifdef FEAT_MBYTE
                    if (has_mbyte)
                        curchr = (*mb_ptr2char)(regparse + 1);
                    else
#endif
                        curchr = c;
                }
                break;
            }

#ifdef FEAT_MBYTE
        default:
            /* Any other character: get the whole multi-byte character. */
            if (has_mbyte)
                curchr = (*mb_ptr2char)(regparse);
#endif
        }
    }

    return curchr;
}
2502
2503/*
2504 * Eat one lexed character. Do this in a way that we can undo it.
2505 */
2506 static void
2507skipchr()
2508{
2509 /* peekchr() eats a backslash, do the same here */
2510 if (*regparse == '\\')
2511 prevchr_len = 1;
2512 else
2513 prevchr_len = 0;
2514 if (regparse[prevchr_len] != NUL)
2515 {
2516#ifdef FEAT_MBYTE
2517 if (has_mbyte)
2518 prevchr_len += (*mb_ptr2len_check)(regparse + prevchr_len);
2519 else
2520#endif
2521 ++prevchr_len;
2522 }
2523 regparse += prevchr_len;
2524 prev_at_start = at_start;
2525 at_start = FALSE;
2526 prevprevchr = prevchr;
2527 prevchr = curchr;
2528 curchr = nextchr; /* use previously unget char, or -1 */
2529 nextchr = -1;
2530}
2531
2532/*
2533 * Skip a character while keeping the value of prev_at_start for at_start.
2534 * prevchr and prevprevchr are also kept.
2535 */
2536 static void
2537skipchr_keepstart()
2538{
2539 int as = prev_at_start;
2540 int pr = prevchr;
2541 int prpr = prevprevchr;
2542
2543 skipchr();
2544 at_start = as;
2545 prevchr = pr;
2546 prevprevchr = prpr;
2547}
2548
/*
 * Return the next lexed character of the pattern and consume it.
 */
    static int
getchr()
{
    int         lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
2557
2558/*
2559 * put character back. Works only once!
2560 */
2561 static void
2562ungetchr()
2563{
2564 nextchr = curchr;
2565 curchr = prevchr;
2566 prevchr = prevprevchr;
2567 at_start = prev_at_start;
2568 prev_at_start = FALSE;
2569
2570 /* Backup regparse, so that it's at the same position as before the
2571 * getchr(). */
2572 regparse -= prevchr_len;
2573}
2574
2575/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +00002576 * Get and return the value of the hex string at the current position.
2577 * Return -1 if there is no valid hex number.
2578 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002579 * blahblah\%x20asdf
2580 * before-^ ^-after
2581 * The parameter controls the maximum number of input characters. This will be
2582 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
2583 */
2584 static int
2585gethexchrs(maxinputlen)
2586 int maxinputlen;
2587{
2588 int nr = 0;
2589 int c;
2590 int i;
2591
2592 for (i = 0; i < maxinputlen; ++i)
2593 {
2594 c = regparse[0];
2595 if (!vim_isxdigit(c))
2596 break;
2597 nr <<= 4;
2598 nr |= hex2nr(c);
2599 ++regparse;
2600 }
2601
2602 if (i == 0)
2603 return -1;
2604 return nr;
2605}
2606
2607/*
2608 * get and return the value of the decimal string immediately after the
2609 * current position. Return -1 for invalid. Consumes all digits.
2610 */
2611 static int
2612getdecchrs()
2613{
2614 int nr = 0;
2615 int c;
2616 int i;
2617
2618 for (i = 0; ; ++i)
2619 {
2620 c = regparse[0];
2621 if (c < '0' || c > '9')
2622 break;
2623 nr *= 10;
2624 nr += c - '0';
2625 ++regparse;
2626 }
2627
2628 if (i == 0)
2629 return -1;
2630 return nr;
2631}
2632
2633/*
2634 * get and return the value of the octal string immediately after the current
2635 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
2636 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
2637 * treat 8 or 9 as recognised characters. Position is updated:
2638 * blahblah\%o210asdf
2639 * before-^ ^-after
2640 */
2641 static int
2642getoctchrs()
2643{
2644 int nr = 0;
2645 int c;
2646 int i;
2647
2648 for (i = 0; i < 3 && nr < 040; ++i)
2649 {
2650 c = regparse[0];
2651 if (c < '0' || c > '7')
2652 break;
2653 nr <<= 3;
2654 nr |= hex2nr(c);
2655 ++regparse;
2656 }
2657
2658 if (i == 0)
2659 return -1;
2660 return nr;
2661}
2662
2663/*
2664 * Get a number after a backslash that is inside [].
2665 * When nothing is recognized return a backslash.
2666 */
2667 static int
2668coll_get_char()
2669{
2670 int nr = -1;
2671
2672 switch (*regparse++)
2673 {
2674 case 'd': nr = getdecchrs(); break;
2675 case 'o': nr = getoctchrs(); break;
2676 case 'x': nr = gethexchrs(2); break;
2677 case 'u': nr = gethexchrs(4); break;
2678 case 'U': nr = gethexchrs(8); break;
2679 }
2680 if (nr < 0)
2681 {
2682 /* If getting the number fails be backwards compatible: the character
2683 * is a backslash. */
2684 --regparse;
2685 nr = '\\';
2686 }
2687 return nr;
2688}
2689
2690/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00002691 * read_limits - Read two integers to be taken as a minimum and maximum.
2692 * If the first character is '-', then the range is reversed.
2693 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2694 * missing, a very big number is the default.
2695 */
2696 static int
2697read_limits(minval, maxval)
2698 long *minval;
2699 long *maxval;
2700{
2701 int reverse = FALSE;
2702 char_u *first_char;
2703 long tmp;
2704
2705 if (*regparse == '-')
2706 {
2707 /* Starts with '-', so reverse the range later */
2708 regparse++;
2709 reverse = TRUE;
2710 }
2711 first_char = regparse;
2712 *minval = getdigits(&regparse);
2713 if (*regparse == ',') /* There is a comma */
2714 {
2715 if (vim_isdigit(*++regparse))
2716 *maxval = getdigits(&regparse);
2717 else
2718 *maxval = MAX_LIMIT;
2719 }
2720 else if (VIM_ISDIGIT(*first_char))
2721 *maxval = *minval; /* It was \{n} or \{-n} */
2722 else
2723 *maxval = MAX_LIMIT; /* It was \{} or \{-} */
2724 if (*regparse == '\\')
2725 regparse++; /* Allow either \{...} or \{...\} */
2726 if (*regparse != '}' || (*maxval == 0 && *minval == 0))
2727 {
2728 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
2729 reg_magic == MAGIC_ALL ? "" : "\\");
2730 EMSG_RET_FAIL(IObuff);
2731 }
2732
2733 /*
2734 * Reverse the range if there was a '-', or make sure it is in the right
2735 * order otherwise.
2736 */
2737 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
2738 {
2739 tmp = *minval;
2740 *minval = *maxval;
2741 *maxval = tmp;
2742 }
2743 skipchr(); /* let's be friends with the lexer again */
2744 return OK;
2745}
2746
/*
 * vim_regexec and friends
 */

/*
 * Global work variables for vim_regexec().
 */

/* The current match-position is remembered with these variables: */
static linenr_T reglnum;        /* line number, relative to first line */
static char_u   *regline;       /* start of current line */
static char_u   *reginput;      /* current input, points into "regline" */

static int      need_clear_subexpr;     /* subexpressions still need to be
                                         * cleared */
#ifdef FEAT_SYN_HL
static int      need_clear_zsubexpr = FALSE;    /* extmatch subexpressions
                                                 * still need to be cleared */
#endif

static int      out_of_stack;   /* TRUE when ran out of stack space */

/*
 * Structure used to save the current input state, when it needs to be
 * restored after trying a match.  Used by reg_save() and reg_restore().
 */
typedef struct
{
    union
    {
        char_u  *ptr;   /* reginput pointer, for single-line regexp */
        lpos_T  pos;    /* reginput pos, for multi-line regexp */
    } rs_u;
} regsave_T;

/* struct to save start/end pointer/position in for \(\) */
typedef struct
{
    union
    {
        char_u  *ptr;   /* pointer, for single-line regexp */
        lpos_T  pos;    /* position, for multi-line regexp */
    } se_u;
} save_se_T;

static char_u   *reg_getline __ARGS((linenr_T lnum));
static long     vim_regexec_both __ARGS((char_u *line, colnr_T col));
static long     regtry __ARGS((regprog_T *prog, colnr_T col));
static void     cleanup_subexpr __ARGS((void));
#ifdef FEAT_SYN_HL
static void     cleanup_zsubexpr __ARGS((void));
#endif
static void     reg_nextline __ARGS((void));
static void     reg_save __ARGS((regsave_T *save));
static void     reg_restore __ARGS((regsave_T *save));
static int      reg_save_equal __ARGS((regsave_T *save));
static void     save_se_multi __ARGS((save_se_T *savep, lpos_T *posp));
static void     save_se_one __ARGS((save_se_T *savep, char_u **pp));

/* Save the sub-expressions before attempting a match.
 * Uses "posp" for a multi-line regexp and "pp" otherwise.
 * NOTE(review): expands to a conditional expression without parentheses
 * around it, only use this as a complete statement. */
#define save_se(savep, posp, pp) \
    REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))

/* After a failed match restore the sub-expressions.
 * NOTE(review): expands to a block in braces, "restore_se(...);" directly
 * followed by "else" will not compile. */
#define restore_se(savep, posp, pp) { \
    if (REG_MULTI) \
        *(posp) = (savep)->se_u.pos; \
    else \
        *(pp) = (savep)->se_u.ptr; }

static int      re_num_cmp __ARGS((long_u val, char_u *scan));
static int      regmatch __ARGS((char_u *prog));
static int      regrepeat __ARGS((char_u *p, long maxcount));

#ifdef DEBUG
int             regnarrate = 0;
#endif

/*
 * Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
 * Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
 * contains '\c' or '\C' the value is overruled.
 */
static int      ireg_ic;

#ifdef FEAT_MBYTE
/*
 * Similar to ireg_ic, but only for 'combining' characters.  Set with \Z flag
 * in the regexp.  Defaults to false, always.
 */
static int      ireg_icombine;
#endif

/*
 * Sometimes need to save a copy of a line.  Since alloc()/free() is very
 * slow, we keep one allocated piece of memory and only re-allocate it when
 * it's too small.  It's freed in vim_regexec_both() when finished.
 */
static char_u   *reg_tofree;
static unsigned reg_tofreelen;

/*
 * These variables are set when executing a regexp to speed up the execution.
 * Which ones are set depends on whether a single-line or multi-line match is
 * done:
 *                      single-line             multi-line
 * reg_match            &regmatch_T             NULL
 * reg_mmatch           NULL                    &regmmatch_T
 * reg_startp           reg_match->startp       <invalid>
 * reg_endp             reg_match->endp         <invalid>
 * reg_startpos         <invalid>               reg_mmatch->startpos
 * reg_endpos           <invalid>               reg_mmatch->endpos
 * reg_win              NULL                    window in which to search
 * reg_buf              <invalid>               buffer in which to search
 * reg_firstlnum        <invalid>               first line in which to search
 * reg_maxline          0                       last line nr
 * reg_line_lbr         FALSE or TRUE           FALSE
 */
static regmatch_T       *reg_match;
static regmmatch_T      *reg_mmatch;
static char_u           **reg_startp = NULL;
static char_u           **reg_endp = NULL;
static lpos_T           *reg_startpos = NULL;
static lpos_T           *reg_endpos = NULL;
static win_T            *reg_win;
static buf_T            *reg_buf;
static linenr_T         reg_firstlnum;
static linenr_T         reg_maxline;    /* relative to reg_firstlnum */
static int              reg_line_lbr;   /* "\n" in string is line break */
2876
2877/*
2878 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
2879 */
2880 static char_u *
2881reg_getline(lnum)
2882 linenr_T lnum;
2883{
2884 /* when looking behind for a match/no-match lnum is negative. But we
2885 * can't go before line 1 */
2886 if (reg_firstlnum + lnum < 1)
2887 return NULL;
2888 return ml_get_buf(reg_buf, reg_firstlnum + lnum, FALSE);
2889}
2890
/* Saved input position.  NOTE(review): presumably the position where a
 * look-behind match must end - confirm in regmatch(). */
static regsave_T behind_pos;

#ifdef FEAT_SYN_HL
static char_u   *reg_startzp[NSUBEXP];  /* Workspace to mark beginning */
static char_u   *reg_endzp[NSUBEXP];    /*   and end of \z(...\) matches */
static lpos_T   reg_startzpos[NSUBEXP]; /* idem, beginning pos */
static lpos_T   reg_endzpos[NSUBEXP];   /* idem, end pos */
#endif

/* TRUE if using multi-line regexp: "reg_match" is NULL then. */
#define REG_MULTI       (reg_match == NULL)
2902
2903/*
2904 * Match a regexp against a string.
2905 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2906 * Uses curbuf for line count and 'iskeyword'.
2907 *
2908 * Return TRUE if there is a match, FALSE if not.
2909 */
2910 int
2911vim_regexec(rmp, line, col)
2912 regmatch_T *rmp;
2913 char_u *line; /* string to match against */
2914 colnr_T col; /* column to start looking for match */
2915{
2916 reg_match = rmp;
2917 reg_mmatch = NULL;
2918 reg_maxline = 0;
2919 reg_line_lbr = FALSE;
2920 reg_win = NULL;
2921 ireg_ic = rmp->rm_ic;
2922#ifdef FEAT_MBYTE
2923 ireg_icombine = FALSE;
2924#endif
2925 return (vim_regexec_both(line, col) != 0);
2926}
2927
#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
        || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
/*
 * Match a regexp against a string, where a "\n" in "line" is considered to
 * be a line break.  Otherwise works like vim_regexec().
 *
 * Return TRUE if there is a match, FALSE if not.
 */
    int
vim_regexec_nl(rmp, line, col)
    regmatch_T  *rmp;
    char_u      *line;  /* string to match against */
    colnr_T     col;    /* column to start looking for match */
{
    long        result;

    /* Set up for matching in a single string. */
    reg_mmatch = NULL;
    reg_match = rmp;
    reg_win = NULL;
    reg_maxline = 0;
    reg_line_lbr = TRUE;

    ireg_ic = rmp->rm_ic;
#ifdef FEAT_MBYTE
    ireg_icombine = FALSE;
#endif

    result = vim_regexec_both(line, col);
    return (result != 0);
}
#endif
2951
2952/*
2953 * Match a regexp against multiple lines.
2954 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2955 * Uses curbuf for line count and 'iskeyword'.
2956 *
2957 * Return zero if there is no match. Return number of lines contained in the
2958 * match otherwise.
2959 */
2960 long
2961vim_regexec_multi(rmp, win, buf, lnum, col)
2962 regmmatch_T *rmp;
2963 win_T *win; /* window in which to search or NULL */
2964 buf_T *buf; /* buffer in which to search */
2965 linenr_T lnum; /* nr of line to start looking for match */
2966 colnr_T col; /* column to start looking for match */
2967{
2968 long r;
2969 buf_T *save_curbuf = curbuf;
2970
2971 reg_match = NULL;
2972 reg_mmatch = rmp;
2973 reg_buf = buf;
2974 reg_win = win;
2975 reg_firstlnum = lnum;
2976 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
2977 reg_line_lbr = FALSE;
2978 ireg_ic = rmp->rmm_ic;
2979#ifdef FEAT_MBYTE
2980 ireg_icombine = FALSE;
2981#endif
2982
2983 /* Need to switch to buffer "buf" to make vim_iswordc() work. */
2984 curbuf = buf;
2985 r = vim_regexec_both(NULL, col);
2986 curbuf = save_curbuf;
2987
2988 return r;
2989}
2990
/*
 * Match a regexp against a string ("line" points to the string) or multiple
 * lines ("line" is NULL, use reg_getline()).
 * The caller must have set "reg_match" or "reg_mmatch" and the other work
 * variables, see vim_regexec() and vim_regexec_multi().
 *
 * Returns zero when there is no match, when an error was encountered or when
 * matching was aborted.  Otherwise returns what regtry() returned.
 */
#ifdef HAVE_SETJMP_H
    static long
vim_regexec_both(line_arg, col_arg)
    char_u      *line_arg;
    colnr_T     col_arg;        /* column to start looking for match */
#else
    static long
vim_regexec_both(line, col)
    char_u      *line;
    colnr_T     col;            /* column to start looking for match */
#endif
{
    regprog_T   *prog;
    char_u      *s;
    long        retval;
#ifdef HAVE_SETJMP_H
    char_u      *line;
    colnr_T     col;
    int         did_mch_startjmp = FALSE;   /* mch_endjmp() must be called */
#endif

    reg_tofree = NULL;

#ifdef HAVE_SETJMP_H
    /* Trick to avoid "might be clobbered by `longjmp'" warning from gcc. */
    line = line_arg;
    col = col_arg;
#endif
    retval = 0L;

    if (REG_MULTI)
    {
        /* Multi-line: start in the first line of the buffer range. */
        prog = reg_mmatch->regprog;
        line = reg_getline((linenr_T)0);
        reg_startpos = reg_mmatch->startpos;
        reg_endpos = reg_mmatch->endpos;
    }
    else
    {
        prog = reg_match->regprog;
        reg_startp = reg_match->startp;
        reg_endp = reg_match->endp;
    }

    /* Be paranoid... */
    if (prog == NULL || line == NULL)
    {
        EMSG(_(e_null));
        goto theend;
    }

    /* Check validity of program. */
    if (prog_magic_wrong())
        goto theend;

    /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
    if (prog->regflags & RF_ICASE)
        ireg_ic = TRUE;
    else if (prog->regflags & RF_NOICASE)
        ireg_ic = FALSE;

#ifdef FEAT_MBYTE
    /* If pattern contains "\Z" overrule value of ireg_icombine */
    if (prog->regflags & RF_ICOMBINE)
        ireg_icombine = TRUE;
#endif

    /* If there is a "must appear" string, look for it.  When it is not
     * present in "line" from "col" on there can't be a match. */
    if (prog->regmust != NULL)
    {
        int c;

#ifdef FEAT_MBYTE
        if (has_mbyte)
            c = (*mb_ptr2char)(prog->regmust);
        else
#endif
            c = *prog->regmust;
        s = line + col;
        while ((s = cstrchr(s, c)) != NULL)
        {
            if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
                break;          /* Found it. */
            mb_ptr_adv(s);
        }
        if (s == NULL)          /* Not present. */
            goto theend;
    }

#ifdef HAVE_TRY_EXCEPT
    __try
    {
#endif

#ifdef HAVE_SETJMP_H
    /*
     * Matching with a regexp may cause a very deep recursive call of
     * regmatch().  Vim will crash when running out of stack space.  Catch
     * this here if the system supports it.
     * It's a bit slow, do it after the check for "regmust".
     * Don't do it if the caller already set it up.
     */
    if (!lc_active)
    {
        did_mch_startjmp = TRUE;
        mch_startjmp();
        if (SETJMP(lc_jump_env) != 0)
        {
            /* Got here through a jump while matching: give up. */
            mch_didjmp();
# ifdef SIGHASARG
            if (lc_signal != SIGINT)
# endif
                EMSG(_(e_complex));
            retval = 0L;
            goto inner_end;
        }
    }
#endif

    regline = line;
    reglnum = 0;
    out_of_stack = FALSE;

    /* Simplest case: Anchored match need be tried only once. */
    if (prog->reganch)
    {
        int     c;

#ifdef FEAT_MBYTE
        if (has_mbyte)
            c = (*mb_ptr2char)(regline + col);
        else
#endif
            c = regline[col];
        /* Only try when there is no known start character or when it equals
         * the character at "col", ignoring case when "ireg_ic" is set. */
        if (prog->regstart == NUL
                || prog->regstart == c
                || (ireg_ic && ((
#ifdef FEAT_MBYTE
                        (enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
                        || (c < 255 && prog->regstart < 255 &&
#endif
                            TOLOWER_LOC(prog->regstart) == TOLOWER_LOC(c)))))
            retval = regtry(prog, col);
        else
            retval = 0;
    }
    else
    {
        /* Messy cases:  unanchored match.  Try at every column until there
         * is a match, the end of the line is reached, the user interrupts or
         * the stack runs out. */
        while (!got_int && !out_of_stack)
        {
            if (prog->regstart != NUL)
            {
                /* Skip until the char we know it must start with. */
                s = cstrchr(regline + col, prog->regstart);
                if (s == NULL)
                {
                    retval = 0;
                    break;
                }
                col = (int)(s - regline);
            }

            retval = regtry(prog, col);
            if (retval > 0)
                break;

            /* if not currently on the first line, get it again */
            if (reglnum != 0)
            {
                regline = reg_getline((linenr_T)0);
                reglnum = 0;
            }
            if (regline[col] == NUL)
                break;
#ifdef FEAT_MBYTE
            if (has_mbyte)
                col += (*mb_ptr2len_check)(regline + col);
            else
#endif
                ++col;
        }
    }

    if (out_of_stack)
        EMSG(_(e_outofstack));

#ifdef HAVE_SETJMP_H
inner_end:
    ;
#endif
#ifdef HAVE_TRY_EXCEPT
    }
    __except(EXCEPTION_EXECUTE_HANDLER)
    {
        if (GetExceptionCode() == EXCEPTION_STACK_OVERFLOW)
        {
            RESETSTKOFLW();
            EMSG(_(e_outofstack));
        }
        else
            EMSG(_(e_complex));
        retval = 0L;
    }
#endif
#ifdef HAVE_SETJMP_H
    if (did_mch_startjmp)
        mch_endjmp();
#endif

theend:
    /* Always get here, with or without a match: free the copy of a line
     * that may have been made while matching. */
    vim_free(reg_tofree);
    return retval;
}
3210
3211#ifdef FEAT_SYN_HL
3212static reg_extmatch_T *make_extmatch __ARGS((void));
3213
3214/*
3215 * Create a new extmatch and mark it as referenced once.
3216 */
3217 static reg_extmatch_T *
3218make_extmatch()
3219{
3220 reg_extmatch_T *em;
3221
3222 em = (reg_extmatch_T *)alloc_clear((unsigned)sizeof(reg_extmatch_T));
3223 if (em != NULL)
3224 em->refcnt = 1;
3225 return em;
3226}
3227
3228/*
3229 * Add a reference to an extmatch.
3230 */
3231 reg_extmatch_T *
3232ref_extmatch(em)
3233 reg_extmatch_T *em;
3234{
3235 if (em != NULL)
3236 em->refcnt++;
3237 return em;
3238}
3239
3240/*
3241 * Remove a reference to an extmatch. If there are no references left, free
3242 * the info.
3243 */
3244 void
3245unref_extmatch(em)
3246 reg_extmatch_T *em;
3247{
3248 int i;
3249
3250 if (em != NULL && --em->refcnt <= 0)
3251 {
3252 for (i = 0; i < NSUBEXP; ++i)
3253 vim_free(em->matches[i]);
3254 vim_free(em);
3255 }
3256}
3257#endif
3258
3259/*
3260 * regtry - try match of "prog" with at regline["col"].
3261 * Returns 0 for failure, number of lines contained in the match otherwise.
3262 */
3263 static long
3264regtry(prog, col)
3265 regprog_T *prog;
3266 colnr_T col;
3267{
3268 reginput = regline + col;
3269 need_clear_subexpr = TRUE;
3270#ifdef FEAT_SYN_HL
3271 /* Clear the external match subpointers if necessary. */
3272 if (prog->reghasz == REX_SET)
3273 need_clear_zsubexpr = TRUE;
3274#endif
3275
3276 if (regmatch(prog->program + 1))
3277 {
3278 cleanup_subexpr();
3279 if (REG_MULTI)
3280 {
3281 if (reg_startpos[0].lnum < 0)
3282 {
3283 reg_startpos[0].lnum = 0;
3284 reg_startpos[0].col = col;
3285 }
3286 if (reg_endpos[0].lnum < 0)
3287 {
3288 reg_endpos[0].lnum = reglnum;
3289 reg_endpos[0].col = (int)(reginput - regline);
3290 }
3291 else
3292 /* Use line number of "\ze". */
3293 reglnum = reg_endpos[0].lnum;
3294 }
3295 else
3296 {
3297 if (reg_startp[0] == NULL)
3298 reg_startp[0] = regline + col;
3299 if (reg_endp[0] == NULL)
3300 reg_endp[0] = reginput;
3301 }
3302#ifdef FEAT_SYN_HL
3303 /* Package any found \z(...\) matches for export. Default is none. */
3304 unref_extmatch(re_extmatch_out);
3305 re_extmatch_out = NULL;
3306
3307 if (prog->reghasz == REX_SET)
3308 {
3309 int i;
3310
3311 cleanup_zsubexpr();
3312 re_extmatch_out = make_extmatch();
3313 for (i = 0; i < NSUBEXP; i++)
3314 {
3315 if (REG_MULTI)
3316 {
3317 /* Only accept single line matches. */
3318 if (reg_startzpos[i].lnum >= 0
3319 && reg_endzpos[i].lnum == reg_startzpos[i].lnum)
3320 re_extmatch_out->matches[i] =
3321 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3322 + reg_startzpos[i].col,
3323 reg_endzpos[i].col - reg_startzpos[i].col);
3324 }
3325 else
3326 {
3327 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3328 re_extmatch_out->matches[i] =
3329 vim_strnsave(reg_startzp[i],
3330 (int)(reg_endzp[i] - reg_startzp[i]));
3331 }
3332 }
3333 }
3334#endif
3335 return 1 + reglnum;
3336 }
3337 return 0;
3338}
3339
#ifdef FEAT_MBYTE
static int reg_prev_class __ARGS((void));

/*
 * Return the class, as given by mb_get_class(), of the character just before
 * "reginput".
 * Return -1 when "reginput" is at the start of the line.
 */
    static int
reg_prev_class()
{
    char_u      *prev;

    if (reginput <= regline)
        return -1;

    /* Go back one byte and then to the first byte of that character. */
    prev = reginput - 1;
    return mb_get_class(prev - (*mb_head_off)(regline, prev));
}

#endif
/* Advance "reginput" to the next character. */
#define ADVANCE_REGINPUT() mb_ptr_adv(reginput)
Bram Moolenaar071d4272004-06-13 20:20:40 +00003357
3358/*
3359 * The arguments from BRACE_LIMITS are stored here. They are actually local
3360 * to regmatch(), but they are here to reduce the amount of stack space used
3361 * (it can be called recursively many times).
3362 */
3363static long bl_minval;
3364static long bl_maxval;
3365
3366/*
3367 * regmatch - main matching routine
3368 *
3369 * Conceptually the strategy is simple: Check to see whether the current
3370 * node matches, call self recursively to see whether the rest matches,
3371 * and then act accordingly. In practice we make some effort to avoid
3372 * recursion, in particular by going through "ordinary" nodes (that don't
3373 * need to know whether the rest of the match failed) by a loop instead of
3374 * by recursion.
3375 *
3376 * Returns TRUE when there is a match. Leaves reginput and reglnum just after
3377 * the last matched character.
3378 * Returns FALSE when there is no match. Leaves reginput and reglnum in an
3379 * undefined state!
3380 */
3381 static int
3382regmatch(scan)
3383 char_u *scan; /* Current node. */
3384{
3385 char_u *next; /* Next node. */
3386 int op;
3387 int c;
3388
3389#ifdef HAVE_GETRLIMIT
3390 /* Check if we are running out of stack space. Could be caused by
3391 * recursively calling ourselves. */
3392 if (out_of_stack || mch_stackcheck((char *)&op) == FAIL)
3393 {
3394 out_of_stack = TRUE;
3395 return FALSE;
3396 }
3397#endif
3398
    /* Some patterns may cause a long time to match, even though they are not
     * illegal.  E.g., "\([a-z]\+\)\+Q".  Allow breaking them with CTRL-C. */
3401 fast_breakcheck();
3402
3403#ifdef DEBUG
3404 if (scan != NULL && regnarrate)
3405 {
3406 mch_errmsg(regprop(scan));
3407 mch_errmsg("(\n");
3408 }
3409#endif
3410 while (scan != NULL)
3411 {
3412 if (got_int || out_of_stack)
3413 return FALSE;
3414#ifdef DEBUG
3415 if (regnarrate)
3416 {
3417 mch_errmsg(regprop(scan));
3418 mch_errmsg("...\n");
3419# ifdef FEAT_SYN_HL
3420 if (re_extmatch_in != NULL)
3421 {
3422 int i;
3423
3424 mch_errmsg(_("External submatches:\n"));
3425 for (i = 0; i < NSUBEXP; i++)
3426 {
3427 mch_errmsg(" \"");
3428 if (re_extmatch_in->matches[i] != NULL)
3429 mch_errmsg(re_extmatch_in->matches[i]);
3430 mch_errmsg("\"\n");
3431 }
3432 }
3433# endif
3434 }
3435#endif
3436 next = regnext(scan);
3437
3438 op = OP(scan);
3439 /* Check for character class with NL added. */
3440 if (WITH_NL(op) && *reginput == NUL && reglnum < reg_maxline)
3441 {
3442 reg_nextline();
3443 }
3444 else if (reg_line_lbr && WITH_NL(op) && *reginput == '\n')
3445 {
3446 ADVANCE_REGINPUT();
3447 }
3448 else
3449 {
3450 if (WITH_NL(op))
3451 op -= ADD_NL;
3452#ifdef FEAT_MBYTE
3453 if (has_mbyte)
3454 c = (*mb_ptr2char)(reginput);
3455 else
3456#endif
3457 c = *reginput;
3458 switch (op)
3459 {
3460 case BOL:
3461 if (reginput != regline)
3462 return FALSE;
3463 break;
3464
3465 case EOL:
3466 if (c != NUL)
3467 return FALSE;
3468 break;
3469
3470 case RE_BOF:
3471 /* Passing -1 to the getline() function provided for the search
3472 * should always return NULL if the current line is the first
3473 * line of the file. */
3474 if (reglnum != 0 || reginput != regline
3475 || (REG_MULTI && reg_getline((linenr_T)-1) != NULL))
3476 return FALSE;
3477 break;
3478
3479 case RE_EOF:
3480 if (reglnum != reg_maxline || c != NUL)
3481 return FALSE;
3482 break;
3483
3484 case CURSOR:
3485 /* Check if the buffer is in a window and compare the
3486 * reg_win->w_cursor position to the match position. */
3487 if (reg_win == NULL
3488 || (reglnum + reg_firstlnum != reg_win->w_cursor.lnum)
3489 || ((colnr_T)(reginput - regline) != reg_win->w_cursor.col))
3490 return FALSE;
3491 break;
3492
3493 case RE_LNUM:
3494 if (!REG_MULTI || !re_num_cmp((long_u)(reglnum + reg_firstlnum),
3495 scan))
3496 return FALSE;
3497 break;
3498
3499 case RE_COL:
3500 if (!re_num_cmp((long_u)(reginput - regline) + 1, scan))
3501 return FALSE;
3502 break;
3503
3504 case RE_VCOL:
3505 if (!re_num_cmp((long_u)win_linetabsize(
3506 reg_win == NULL ? curwin : reg_win,
3507 regline, (colnr_T)(reginput - regline)) + 1, scan))
3508 return FALSE;
3509 break;
3510
3511 case BOW: /* \<word; reginput points to w */
3512 if (c == NUL) /* Can't match at end of line */
3513 return FALSE;
3514#ifdef FEAT_MBYTE
3515 if (has_mbyte)
3516 {
3517 int this_class;
3518
3519 /* Get class of current and previous char (if it exists). */
3520 this_class = mb_get_class(reginput);
3521 if (this_class <= 1)
3522 return FALSE; /* not on a word at all */
3523 if (reg_prev_class() == this_class)
3524 return FALSE; /* previous char is in same word */
3525 }
3526#endif
3527 else
3528 {
3529 if (!vim_iswordc(c)
3530 || (reginput > regline && vim_iswordc(reginput[-1])))
3531 return FALSE;
3532 }
3533 break;
3534
3535 case EOW: /* word\>; reginput points after d */
3536 if (reginput == regline) /* Can't match at start of line */
3537 return FALSE;
3538#ifdef FEAT_MBYTE
3539 if (has_mbyte)
3540 {
3541 int this_class, prev_class;
3542
3543 /* Get class of current and previous char (if it exists). */
3544 this_class = mb_get_class(reginput);
3545 prev_class = reg_prev_class();
3546 if (this_class == prev_class)
3547 return FALSE;
3548 if (prev_class == 0 || prev_class == 1)
3549 return FALSE;
3550 }
3551 else
3552#endif
3553 {
3554 if (!vim_iswordc(reginput[-1]))
3555 return FALSE;
3556 if (reginput[0] != NUL && vim_iswordc(c))
3557 return FALSE;
3558 }
3559 break; /* Matched with EOW */
3560
3561 case ANY:
3562 if (c == NUL)
3563 return FALSE;
3564 ADVANCE_REGINPUT();
3565 break;
3566
3567 case IDENT:
3568 if (!vim_isIDc(c))
3569 return FALSE;
3570 ADVANCE_REGINPUT();
3571 break;
3572
3573 case SIDENT:
3574 if (VIM_ISDIGIT(*reginput) || !vim_isIDc(c))
3575 return FALSE;
3576 ADVANCE_REGINPUT();
3577 break;
3578
3579 case KWORD:
3580 if (!vim_iswordp(reginput))
3581 return FALSE;
3582 ADVANCE_REGINPUT();
3583 break;
3584
3585 case SKWORD:
3586 if (VIM_ISDIGIT(*reginput) || !vim_iswordp(reginput))
3587 return FALSE;
3588 ADVANCE_REGINPUT();
3589 break;
3590
3591 case FNAME:
3592 if (!vim_isfilec(c))
3593 return FALSE;
3594 ADVANCE_REGINPUT();
3595 break;
3596
3597 case SFNAME:
3598 if (VIM_ISDIGIT(*reginput) || !vim_isfilec(c))
3599 return FALSE;
3600 ADVANCE_REGINPUT();
3601 break;
3602
3603 case PRINT:
3604 if (ptr2cells(reginput) != 1)
3605 return FALSE;
3606 ADVANCE_REGINPUT();
3607 break;
3608
3609 case SPRINT:
3610 if (VIM_ISDIGIT(*reginput) || ptr2cells(reginput) != 1)
3611 return FALSE;
3612 ADVANCE_REGINPUT();
3613 break;
3614
3615 case WHITE:
3616 if (!vim_iswhite(c))
3617 return FALSE;
3618 ADVANCE_REGINPUT();
3619 break;
3620
3621 case NWHITE:
3622 if (c == NUL || vim_iswhite(c))
3623 return FALSE;
3624 ADVANCE_REGINPUT();
3625 break;
3626
3627 case DIGIT:
3628 if (!ri_digit(c))
3629 return FALSE;
3630 ADVANCE_REGINPUT();
3631 break;
3632
3633 case NDIGIT:
3634 if (c == NUL || ri_digit(c))
3635 return FALSE;
3636 ADVANCE_REGINPUT();
3637 break;
3638
3639 case HEX:
3640 if (!ri_hex(c))
3641 return FALSE;
3642 ADVANCE_REGINPUT();
3643 break;
3644
3645 case NHEX:
3646 if (c == NUL || ri_hex(c))
3647 return FALSE;
3648 ADVANCE_REGINPUT();
3649 break;
3650
3651 case OCTAL:
3652 if (!ri_octal(c))
3653 return FALSE;
3654 ADVANCE_REGINPUT();
3655 break;
3656
3657 case NOCTAL:
3658 if (c == NUL || ri_octal(c))
3659 return FALSE;
3660 ADVANCE_REGINPUT();
3661 break;
3662
3663 case WORD:
3664 if (!ri_word(c))
3665 return FALSE;
3666 ADVANCE_REGINPUT();
3667 break;
3668
3669 case NWORD:
3670 if (c == NUL || ri_word(c))
3671 return FALSE;
3672 ADVANCE_REGINPUT();
3673 break;
3674
3675 case HEAD:
3676 if (!ri_head(c))
3677 return FALSE;
3678 ADVANCE_REGINPUT();
3679 break;
3680
3681 case NHEAD:
3682 if (c == NUL || ri_head(c))
3683 return FALSE;
3684 ADVANCE_REGINPUT();
3685 break;
3686
3687 case ALPHA:
3688 if (!ri_alpha(c))
3689 return FALSE;
3690 ADVANCE_REGINPUT();
3691 break;
3692
3693 case NALPHA:
3694 if (c == NUL || ri_alpha(c))
3695 return FALSE;
3696 ADVANCE_REGINPUT();
3697 break;
3698
3699 case LOWER:
3700 if (!ri_lower(c))
3701 return FALSE;
3702 ADVANCE_REGINPUT();
3703 break;
3704
3705 case NLOWER:
3706 if (c == NUL || ri_lower(c))
3707 return FALSE;
3708 ADVANCE_REGINPUT();
3709 break;
3710
3711 case UPPER:
3712 if (!ri_upper(c))
3713 return FALSE;
3714 ADVANCE_REGINPUT();
3715 break;
3716
3717 case NUPPER:
3718 if (c == NUL || ri_upper(c))
3719 return FALSE;
3720 ADVANCE_REGINPUT();
3721 break;
3722
3723 case EXACTLY:
3724 {
3725 int len;
3726 char_u *opnd;
3727
3728 opnd = OPERAND(scan);
3729 /* Inline the first byte, for speed. */
3730 if (*opnd != *reginput
3731 && (!ireg_ic || (
3732#ifdef FEAT_MBYTE
3733 !enc_utf8 &&
3734#endif
3735 TOLOWER_LOC(*opnd) != TOLOWER_LOC(*reginput))))
3736 return FALSE;
3737 if (*opnd == NUL)
3738 {
3739 /* match empty string always works; happens when "~" is
3740 * empty. */
3741 }
3742 else if (opnd[1] == NUL
3743#ifdef FEAT_MBYTE
3744 && !(enc_utf8 && ireg_ic)
3745#endif
3746 )
3747 ++reginput; /* matched a single char */
3748 else
3749 {
3750 len = (int)STRLEN(opnd);
3751 /* Need to match first byte again for multi-byte. */
3752 if (cstrncmp(opnd, reginput, &len) != 0)
3753 return FALSE;
3754#ifdef FEAT_MBYTE
3755 /* Check for following composing character. */
3756 if (enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len))
3757 {
3758 /* raaron: This code makes a composing character get
3759 * ignored, which is the correct behavior (sometimes)
3760 * for voweled Hebrew texts. */
3761 if (!ireg_icombine)
3762 return FALSE;
3763 }
3764 else
3765#endif
3766 reginput += len;
3767 }
3768 }
3769 break;
3770
3771 case ANYOF:
3772 case ANYBUT:
3773 if (c == NUL)
3774 return FALSE;
3775 if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3776 return FALSE;
3777 ADVANCE_REGINPUT();
3778 break;
3779
3780#ifdef FEAT_MBYTE
3781 case MULTIBYTECODE:
3782 if (has_mbyte)
3783 {
3784 int i, len;
3785 char_u *opnd;
3786
3787 opnd = OPERAND(scan);
3788 /* Safety check (just in case 'encoding' was changed since
3789 * compiling the program). */
3790 if ((len = (*mb_ptr2len_check)(opnd)) < 2)
3791 return FALSE;
3792 for (i = 0; i < len; ++i)
3793 if (opnd[i] != reginput[i])
3794 return FALSE;
3795 reginput += len;
3796 }
3797 else
3798 return FALSE;
3799 break;
3800#endif
3801
3802 case NOTHING:
3803 break;
3804
3805 case BACK:
3806 break;
3807
3808 case MOPEN + 0: /* Match start: \zs */
3809 case MOPEN + 1: /* \( */
3810 case MOPEN + 2:
3811 case MOPEN + 3:
3812 case MOPEN + 4:
3813 case MOPEN + 5:
3814 case MOPEN + 6:
3815 case MOPEN + 7:
3816 case MOPEN + 8:
3817 case MOPEN + 9:
3818 {
3819 int no;
3820 save_se_T save;
3821
3822 no = op - MOPEN;
3823 cleanup_subexpr();
3824 save_se(&save, &reg_startpos[no], &reg_startp[no]);
3825
3826 if (regmatch(next))
3827 return TRUE;
3828
3829 restore_se(&save, &reg_startpos[no], &reg_startp[no]);
3830 return FALSE;
3831 }
3832 /* break; Not Reached */
3833
3834 case NOPEN: /* \%( */
3835 case NCLOSE: /* \) after \%( */
3836 if (regmatch(next))
3837 return TRUE;
3838 return FALSE;
3839 /* break; Not Reached */
3840
3841#ifdef FEAT_SYN_HL
3842 case ZOPEN + 1:
3843 case ZOPEN + 2:
3844 case ZOPEN + 3:
3845 case ZOPEN + 4:
3846 case ZOPEN + 5:
3847 case ZOPEN + 6:
3848 case ZOPEN + 7:
3849 case ZOPEN + 8:
3850 case ZOPEN + 9:
3851 {
3852 int no;
3853 save_se_T save;
3854
3855 no = op - ZOPEN;
3856 cleanup_zsubexpr();
3857 save_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3858
3859 if (regmatch(next))
3860 return TRUE;
3861
3862 restore_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3863 return FALSE;
3864 }
3865 /* break; Not Reached */
3866#endif
3867
3868 case MCLOSE + 0: /* Match end: \ze */
3869 case MCLOSE + 1: /* \) */
3870 case MCLOSE + 2:
3871 case MCLOSE + 3:
3872 case MCLOSE + 4:
3873 case MCLOSE + 5:
3874 case MCLOSE + 6:
3875 case MCLOSE + 7:
3876 case MCLOSE + 8:
3877 case MCLOSE + 9:
3878 {
3879 int no;
3880 save_se_T save;
3881
3882 no = op - MCLOSE;
3883 cleanup_subexpr();
3884 save_se(&save, &reg_endpos[no], &reg_endp[no]);
3885
3886 if (regmatch(next))
3887 return TRUE;
3888
3889 restore_se(&save, &reg_endpos[no], &reg_endp[no]);
3890 return FALSE;
3891 }
3892 /* break; Not Reached */
3893
3894#ifdef FEAT_SYN_HL
3895 case ZCLOSE + 1: /* \) after \z( */
3896 case ZCLOSE + 2:
3897 case ZCLOSE + 3:
3898 case ZCLOSE + 4:
3899 case ZCLOSE + 5:
3900 case ZCLOSE + 6:
3901 case ZCLOSE + 7:
3902 case ZCLOSE + 8:
3903 case ZCLOSE + 9:
3904 {
3905 int no;
3906 save_se_T save;
3907
3908 no = op - ZCLOSE;
3909 cleanup_zsubexpr();
3910 save_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3911
3912 if (regmatch(next))
3913 return TRUE;
3914
3915 restore_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3916 return FALSE;
3917 }
3918 /* break; Not Reached */
3919#endif
3920
3921 case BACKREF + 1:
3922 case BACKREF + 2:
3923 case BACKREF + 3:
3924 case BACKREF + 4:
3925 case BACKREF + 5:
3926 case BACKREF + 6:
3927 case BACKREF + 7:
3928 case BACKREF + 8:
3929 case BACKREF + 9:
3930 {
3931 int no;
3932 int len;
3933 linenr_T clnum;
3934 colnr_T ccol;
3935 char_u *p;
3936
3937 no = op - BACKREF;
3938 cleanup_subexpr();
3939 if (!REG_MULTI) /* Single-line regexp */
3940 {
3941 if (reg_endp[no] == NULL)
3942 {
3943 /* Backref was not set: Match an empty string. */
3944 len = 0;
3945 }
3946 else
3947 {
3948 /* Compare current input with back-ref in the same
3949 * line. */
3950 len = (int)(reg_endp[no] - reg_startp[no]);
3951 if (cstrncmp(reg_startp[no], reginput, &len) != 0)
3952 return FALSE;
3953 }
3954 }
3955 else /* Multi-line regexp */
3956 {
3957 if (reg_endpos[no].lnum < 0)
3958 {
3959 /* Backref was not set: Match an empty string. */
3960 len = 0;
3961 }
3962 else
3963 {
3964 if (reg_startpos[no].lnum == reglnum
3965 && reg_endpos[no].lnum == reglnum)
3966 {
3967 /* Compare back-ref within the current line. */
3968 len = reg_endpos[no].col - reg_startpos[no].col;
3969 if (cstrncmp(regline + reg_startpos[no].col,
3970 reginput, &len) != 0)
3971 return FALSE;
3972 }
3973 else
3974 {
3975 /* Messy situation: Need to compare between two
3976 * lines. */
3977 ccol = reg_startpos[no].col;
3978 clnum = reg_startpos[no].lnum;
3979 for (;;)
3980 {
3981 /* Since getting one line may invalidate
3982 * the other, need to make copy. Slow! */
3983 if (regline != reg_tofree)
3984 {
3985 len = (int)STRLEN(regline);
3986 if (reg_tofree == NULL
3987 || len >= (int)reg_tofreelen)
3988 {
3989 len += 50; /* get some extra */
3990 vim_free(reg_tofree);
3991 reg_tofree = alloc(len);
3992 if (reg_tofree == NULL)
3993 return FALSE; /* out of memory! */
3994 reg_tofreelen = len;
3995 }
3996 STRCPY(reg_tofree, regline);
3997 reginput = reg_tofree
3998 + (reginput - regline);
3999 regline = reg_tofree;
4000 }
4001
4002 /* Get the line to compare with. */
4003 p = reg_getline(clnum);
4004 if (clnum == reg_endpos[no].lnum)
4005 len = reg_endpos[no].col - ccol;
4006 else
4007 len = (int)STRLEN(p + ccol);
4008
4009 if (cstrncmp(p + ccol, reginput, &len) != 0)
4010 return FALSE; /* doesn't match */
4011 if (clnum == reg_endpos[no].lnum)
4012 break; /* match and at end! */
4013 if (reglnum == reg_maxline)
4014 return FALSE; /* text too short */
4015
4016 /* Advance to next line. */
4017 reg_nextline();
4018 ++clnum;
4019 ccol = 0;
4020 if (got_int || out_of_stack)
4021 return FALSE;
4022 }
4023
4024 /* found a match! Note that regline may now point
4025 * to a copy of the line, that should not matter. */
4026 }
4027 }
4028 }
4029
4030 /* Matched the backref, skip over it. */
4031 reginput += len;
4032 }
4033 break;
4034
4035#ifdef FEAT_SYN_HL
4036 case ZREF + 1:
4037 case ZREF + 2:
4038 case ZREF + 3:
4039 case ZREF + 4:
4040 case ZREF + 5:
4041 case ZREF + 6:
4042 case ZREF + 7:
4043 case ZREF + 8:
4044 case ZREF + 9:
4045 {
4046 int no;
4047 int len;
4048
4049 cleanup_zsubexpr();
4050 no = op - ZREF;
4051 if (re_extmatch_in != NULL
4052 && re_extmatch_in->matches[no] != NULL)
4053 {
4054 len = (int)STRLEN(re_extmatch_in->matches[no]);
4055 if (cstrncmp(re_extmatch_in->matches[no],
4056 reginput, &len) != 0)
4057 return FALSE;
4058 reginput += len;
4059 }
4060 else
4061 {
4062 /* Backref was not set: Match an empty string. */
4063 }
4064 }
4065 break;
4066#endif
4067
4068 case BRANCH:
4069 {
4070 if (OP(next) != BRANCH) /* No choice. */
4071 next = OPERAND(scan); /* Avoid recursion. */
4072 else
4073 {
4074 regsave_T save;
4075
4076 do
4077 {
4078 reg_save(&save);
4079 if (regmatch(OPERAND(scan)))
4080 return TRUE;
4081 reg_restore(&save);
4082 scan = regnext(scan);
4083 } while (scan != NULL && OP(scan) == BRANCH);
4084 return FALSE;
4085 /* NOTREACHED */
4086 }
4087 }
4088 break;
4089
4090 case BRACE_LIMITS:
4091 {
4092 int no;
4093
4094 if (OP(next) == BRACE_SIMPLE)
4095 {
4096 bl_minval = OPERAND_MIN(scan);
4097 bl_maxval = OPERAND_MAX(scan);
4098 }
4099 else if (OP(next) >= BRACE_COMPLEX
4100 && OP(next) < BRACE_COMPLEX + 10)
4101 {
4102 no = OP(next) - BRACE_COMPLEX;
4103 brace_min[no] = OPERAND_MIN(scan);
4104 brace_max[no] = OPERAND_MAX(scan);
4105 brace_count[no] = 0;
4106 }
4107 else
4108 {
4109 EMSG(_(e_internal)); /* Shouldn't happen */
4110 return FALSE;
4111 }
4112 }
4113 break;
4114
4115 case BRACE_COMPLEX + 0:
4116 case BRACE_COMPLEX + 1:
4117 case BRACE_COMPLEX + 2:
4118 case BRACE_COMPLEX + 3:
4119 case BRACE_COMPLEX + 4:
4120 case BRACE_COMPLEX + 5:
4121 case BRACE_COMPLEX + 6:
4122 case BRACE_COMPLEX + 7:
4123 case BRACE_COMPLEX + 8:
4124 case BRACE_COMPLEX + 9:
4125 {
4126 int no;
4127 regsave_T save;
4128
4129 no = op - BRACE_COMPLEX;
4130 ++brace_count[no];
4131
4132 /* If not matched enough times yet, try one more */
4133 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4134 ? brace_min[no] : brace_max[no]))
4135 {
4136 reg_save(&save);
4137 if (regmatch(OPERAND(scan)))
4138 return TRUE;
4139 reg_restore(&save);
4140 --brace_count[no]; /* failed, decrement match count */
4141 return FALSE;
4142 }
4143
4144 /* If matched enough times, may try matching some more */
4145 if (brace_min[no] <= brace_max[no])
4146 {
4147 /* Range is the normal way around, use longest match */
4148 if (brace_count[no] <= brace_max[no])
4149 {
4150 reg_save(&save);
4151 if (regmatch(OPERAND(scan)))
4152 return TRUE; /* matched some more times */
4153 reg_restore(&save);
4154 --brace_count[no]; /* matched just enough times */
4155 /* continue with the items after \{} */
4156 }
4157 }
4158 else
4159 {
4160 /* Range is backwards, use shortest match first */
4161 if (brace_count[no] <= brace_min[no])
4162 {
4163 reg_save(&save);
4164 if (regmatch(next))
4165 return TRUE;
4166 reg_restore(&save);
4167 next = OPERAND(scan);
4168 /* must try to match one more item */
4169 }
4170 }
4171 }
4172 break;
4173
4174 case BRACE_SIMPLE:
4175 case STAR:
4176 case PLUS:
4177 {
4178 int nextb; /* next byte */
4179 int nextb_ic; /* next byte reverse case */
4180 long count;
4181 regsave_T save;
4182 long minval;
4183 long maxval;
4184
4185 /*
4186 * Lookahead to avoid useless match attempts when we know
4187 * what character comes next.
4188 */
4189 if (OP(next) == EXACTLY)
4190 {
4191 nextb = *OPERAND(next);
4192 if (ireg_ic)
4193 {
4194 if (isupper(nextb))
4195 nextb_ic = TOLOWER_LOC(nextb);
4196 else
4197 nextb_ic = TOUPPER_LOC(nextb);
4198 }
4199 else
4200 nextb_ic = nextb;
4201 }
4202 else
4203 {
4204 nextb = NUL;
4205 nextb_ic = NUL;
4206 }
4207 if (op != BRACE_SIMPLE)
4208 {
4209 minval = (op == STAR) ? 0 : 1;
4210 maxval = MAX_LIMIT;
4211 }
4212 else
4213 {
4214 minval = bl_minval;
4215 maxval = bl_maxval;
4216 }
4217
4218 /*
4219 * When maxval > minval, try matching as much as possible, up
4220 * to maxval. When maxval < minval, try matching at least the
4221 * minimal number (since the range is backwards, that's also
4222 * maxval!).
4223 */
4224 count = regrepeat(OPERAND(scan), maxval);
4225 if (got_int)
4226 return FALSE;
4227 if (minval <= maxval)
4228 {
4229 /* Range is the normal way around, use longest match */
4230 while (count >= minval)
4231 {
4232 /* If it could match, try it. */
4233 if (nextb == NUL || *reginput == nextb
4234 || *reginput == nextb_ic)
4235 {
4236 reg_save(&save);
4237 if (regmatch(next))
4238 return TRUE;
4239 reg_restore(&save);
4240 }
4241 /* Couldn't or didn't match -- back up one char. */
4242 if (--count < minval)
4243 break;
4244 if (reginput == regline)
4245 {
4246 /* backup to last char of previous line */
4247 --reglnum;
4248 regline = reg_getline(reglnum);
4249 /* Just in case regrepeat() didn't count right. */
4250 if (regline == NULL)
4251 return FALSE;
4252 reginput = regline + STRLEN(regline);
4253 fast_breakcheck();
4254 if (got_int || out_of_stack)
4255 return FALSE;
4256 }
4257 else
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004258 mb_ptr_back(regline, reginput);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004259 }
4260 }
4261 else
4262 {
4263 /* Range is backwards, use shortest match first.
4264 * Careful: maxval and minval are exchanged! */
4265 if (count < maxval)
4266 return FALSE;
4267 for (;;)
4268 {
4269 /* If it could work, try it. */
4270 if (nextb == NUL || *reginput == nextb
4271 || *reginput == nextb_ic)
4272 {
4273 reg_save(&save);
4274 if (regmatch(next))
4275 return TRUE;
4276 reg_restore(&save);
4277 }
4278 /* Couldn't or didn't match: try advancing one char. */
4279 if (count == minval
4280 || regrepeat(OPERAND(scan), 1L) == 0)
4281 break;
4282 ++count;
4283 if (got_int || out_of_stack)
4284 return FALSE;
4285 }
4286 }
4287 return FALSE;
4288 }
4289 /* break; Not Reached */
4290
4291 case NOMATCH:
4292 {
4293 regsave_T save;
4294
4295 /* If the operand matches, we fail. Otherwise backup and
4296 * continue with the next item. */
4297 reg_save(&save);
4298 if (regmatch(OPERAND(scan)))
4299 return FALSE;
4300 reg_restore(&save);
4301 }
4302 break;
4303
4304 case MATCH:
4305 case SUBPAT:
4306 {
4307 regsave_T save;
4308
4309 /* If the operand doesn't match, we fail. Otherwise backup
4310 * and continue with the next item. */
4311 reg_save(&save);
4312 if (!regmatch(OPERAND(scan)))
4313 return FALSE;
4314 if (op == MATCH) /* zero-width */
4315 reg_restore(&save);
4316 }
4317 break;
4318
4319 case BEHIND:
4320 case NOBEHIND:
4321 {
4322 regsave_T save_after, save_start;
4323 regsave_T save_behind_pos;
4324 int needmatch = (op == BEHIND);
4325
4326 /*
4327 * Look back in the input of the operand matches or not. This
4328 * must be done at every position in the input and checking if
4329 * the match ends at the current position.
4330 * First check if the next item matches, that's probably
4331 * faster.
4332 */
4333 reg_save(&save_start);
4334 if (regmatch(next))
4335 {
4336 /* save the position after the found match for next */
4337 reg_save(&save_after);
4338
4339 /* start looking for a match with operand at the current
4340 * position. Go back one character until we find the
4341 * result, hitting the start of the line or the previous
4342 * line (for multi-line matching).
4343 * Set behind_pos to where the match should end, BHPOS
4344 * will match it. */
4345 save_behind_pos = behind_pos;
4346 behind_pos = save_start;
4347 for (;;)
4348 {
4349 reg_restore(&save_start);
4350 if (regmatch(OPERAND(scan))
4351 && reg_save_equal(&behind_pos))
4352 {
4353 behind_pos = save_behind_pos;
4354 /* found a match that ends where "next" started */
4355 if (needmatch)
4356 {
4357 reg_restore(&save_after);
4358 return TRUE;
4359 }
4360 return FALSE;
4361 }
4362 /*
4363 * No match: Go back one character. May go to
4364 * previous line once.
4365 */
4366 if (REG_MULTI)
4367 {
4368 if (save_start.rs_u.pos.col == 0)
4369 {
4370 if (save_start.rs_u.pos.lnum
4371 < behind_pos.rs_u.pos.lnum
4372 || reg_getline(
4373 --save_start.rs_u.pos.lnum) == NULL)
4374 break;
4375 reg_restore(&save_start);
4376 save_start.rs_u.pos.col =
4377 (colnr_T)STRLEN(regline);
4378 }
4379 else
4380 --save_start.rs_u.pos.col;
4381 }
4382 else
4383 {
4384 if (save_start.rs_u.ptr == regline)
4385 break;
4386 --save_start.rs_u.ptr;
4387 }
4388 }
4389
4390 /* NOBEHIND succeeds when no match was found */
4391 behind_pos = save_behind_pos;
4392 if (!needmatch)
4393 {
4394 reg_restore(&save_after);
4395 return TRUE;
4396 }
4397 }
4398 return FALSE;
4399 }
4400
4401 case BHPOS:
4402 if (REG_MULTI)
4403 {
4404 if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline)
4405 || behind_pos.rs_u.pos.lnum != reglnum)
4406 return FALSE;
4407 }
4408 else if (behind_pos.rs_u.ptr != reginput)
4409 return FALSE;
4410 break;
4411
4412 case NEWL:
4413 if ((c != NUL || reglnum == reg_maxline)
4414 && (c != '\n' || !reg_line_lbr))
4415 return FALSE;
4416 if (reg_line_lbr)
4417 ADVANCE_REGINPUT();
4418 else
4419 reg_nextline();
4420 break;
4421
4422 case END:
4423 return TRUE; /* Success! */
4424
4425 default:
4426 EMSG(_(e_re_corr));
4427#ifdef DEBUG
4428 printf("Illegal op code %d\n", op);
4429#endif
4430 return FALSE;
4431 }
4432 }
4433
4434 scan = next;
4435 }
4436
4437 /*
4438 * We get here only if there's trouble -- normally "case END" is the
4439 * terminating point.
4440 */
4441 EMSG(_(e_re_corr));
4442#ifdef DEBUG
4443 printf("Premature EOL\n");
4444#endif
4445 return FALSE;
4446}
4447
Bram Moolenaar071d4272004-06-13 20:20:40 +00004448/*
4449 * regrepeat - repeatedly match something simple, return how many.
4450 * Advances reginput (and reglnum) to just after the matched chars.
4451 */
4452 static int
4453regrepeat(p, maxcount)
4454 char_u *p;
4455 long maxcount; /* maximum number of matches allowed */
4456{
4457 long count = 0;
4458 char_u *scan;
4459 char_u *opnd;
4460 int mask;
4461 int testval = 0;
4462
4463 scan = reginput; /* Make local copy of reginput for speed. */
4464 opnd = OPERAND(p);
4465 switch (OP(p))
4466 {
4467 case ANY:
4468 case ANY + ADD_NL:
4469 while (count < maxcount)
4470 {
4471 /* Matching anything means we continue until end-of-line (or
4472 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
4473 while (*scan != NUL && count < maxcount)
4474 {
4475 ++count;
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004476 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004477 }
4478 if (!WITH_NL(OP(p)) || reglnum == reg_maxline || count == maxcount)
4479 break;
4480 ++count; /* count the line-break */
4481 reg_nextline();
4482 scan = reginput;
4483 if (got_int)
4484 break;
4485 }
4486 break;
4487
4488 case IDENT:
4489 case IDENT + ADD_NL:
4490 testval = TRUE;
4491 /*FALLTHROUGH*/
4492 case SIDENT:
4493 case SIDENT + ADD_NL:
4494 while (count < maxcount)
4495 {
4496 if (vim_isIDc(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4497 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004498 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004499 }
4500 else if (*scan == NUL)
4501 {
4502 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4503 break;
4504 reg_nextline();
4505 scan = reginput;
4506 if (got_int)
4507 break;
4508 }
4509 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4510 ++scan;
4511 else
4512 break;
4513 ++count;
4514 }
4515 break;
4516
4517 case KWORD:
4518 case KWORD + ADD_NL:
4519 testval = TRUE;
4520 /*FALLTHROUGH*/
4521 case SKWORD:
4522 case SKWORD + ADD_NL:
4523 while (count < maxcount)
4524 {
4525 if (vim_iswordp(scan) && (testval || !VIM_ISDIGIT(*scan)))
4526 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004527 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004528 }
4529 else if (*scan == NUL)
4530 {
4531 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4532 break;
4533 reg_nextline();
4534 scan = reginput;
4535 if (got_int)
4536 break;
4537 }
4538 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4539 ++scan;
4540 else
4541 break;
4542 ++count;
4543 }
4544 break;
4545
4546 case FNAME:
4547 case FNAME + ADD_NL:
4548 testval = TRUE;
4549 /*FALLTHROUGH*/
4550 case SFNAME:
4551 case SFNAME + ADD_NL:
4552 while (count < maxcount)
4553 {
4554 if (vim_isfilec(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4555 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004556 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004557 }
4558 else if (*scan == NUL)
4559 {
4560 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4561 break;
4562 reg_nextline();
4563 scan = reginput;
4564 if (got_int)
4565 break;
4566 }
4567 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4568 ++scan;
4569 else
4570 break;
4571 ++count;
4572 }
4573 break;
4574
4575 case PRINT:
4576 case PRINT + ADD_NL:
4577 testval = TRUE;
4578 /*FALLTHROUGH*/
4579 case SPRINT:
4580 case SPRINT + ADD_NL:
4581 while (count < maxcount)
4582 {
4583 if (*scan == NUL)
4584 {
4585 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4586 break;
4587 reg_nextline();
4588 scan = reginput;
4589 if (got_int)
4590 break;
4591 }
4592 else if (ptr2cells(scan) == 1 && (testval || !VIM_ISDIGIT(*scan)))
4593 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004594 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004595 }
4596 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4597 ++scan;
4598 else
4599 break;
4600 ++count;
4601 }
4602 break;
4603
4604 case WHITE:
4605 case WHITE + ADD_NL:
4606 testval = mask = RI_WHITE;
4607do_class:
4608 while (count < maxcount)
4609 {
4610#ifdef FEAT_MBYTE
4611 int l;
4612#endif
4613 if (*scan == NUL)
4614 {
4615 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4616 break;
4617 reg_nextline();
4618 scan = reginput;
4619 if (got_int)
4620 break;
4621 }
4622#ifdef FEAT_MBYTE
4623 else if (has_mbyte && (l = (*mb_ptr2len_check)(scan)) > 1)
4624 {
4625 if (testval != 0)
4626 break;
4627 scan += l;
4628 }
4629#endif
4630 else if ((class_tab[*scan] & mask) == testval)
4631 ++scan;
4632 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4633 ++scan;
4634 else
4635 break;
4636 ++count;
4637 }
4638 break;
4639
4640 case NWHITE:
4641 case NWHITE + ADD_NL:
4642 mask = RI_WHITE;
4643 goto do_class;
4644 case DIGIT:
4645 case DIGIT + ADD_NL:
4646 testval = mask = RI_DIGIT;
4647 goto do_class;
4648 case NDIGIT:
4649 case NDIGIT + ADD_NL:
4650 mask = RI_DIGIT;
4651 goto do_class;
4652 case HEX:
4653 case HEX + ADD_NL:
4654 testval = mask = RI_HEX;
4655 goto do_class;
4656 case NHEX:
4657 case NHEX + ADD_NL:
4658 mask = RI_HEX;
4659 goto do_class;
4660 case OCTAL:
4661 case OCTAL + ADD_NL:
4662 testval = mask = RI_OCTAL;
4663 goto do_class;
4664 case NOCTAL:
4665 case NOCTAL + ADD_NL:
4666 mask = RI_OCTAL;
4667 goto do_class;
4668 case WORD:
4669 case WORD + ADD_NL:
4670 testval = mask = RI_WORD;
4671 goto do_class;
4672 case NWORD:
4673 case NWORD + ADD_NL:
4674 mask = RI_WORD;
4675 goto do_class;
4676 case HEAD:
4677 case HEAD + ADD_NL:
4678 testval = mask = RI_HEAD;
4679 goto do_class;
4680 case NHEAD:
4681 case NHEAD + ADD_NL:
4682 mask = RI_HEAD;
4683 goto do_class;
4684 case ALPHA:
4685 case ALPHA + ADD_NL:
4686 testval = mask = RI_ALPHA;
4687 goto do_class;
4688 case NALPHA:
4689 case NALPHA + ADD_NL:
4690 mask = RI_ALPHA;
4691 goto do_class;
4692 case LOWER:
4693 case LOWER + ADD_NL:
4694 testval = mask = RI_LOWER;
4695 goto do_class;
4696 case NLOWER:
4697 case NLOWER + ADD_NL:
4698 mask = RI_LOWER;
4699 goto do_class;
4700 case UPPER:
4701 case UPPER + ADD_NL:
4702 testval = mask = RI_UPPER;
4703 goto do_class;
4704 case NUPPER:
4705 case NUPPER + ADD_NL:
4706 mask = RI_UPPER;
4707 goto do_class;
4708
4709 case EXACTLY:
4710 {
4711 int cu, cl;
4712
4713 /* This doesn't do a multi-byte character, because a MULTIBYTECODE
4714 * would have been used for it. */
4715 if (ireg_ic)
4716 {
4717 cu = TOUPPER_LOC(*opnd);
4718 cl = TOLOWER_LOC(*opnd);
4719 while (count < maxcount && (*scan == cu || *scan == cl))
4720 {
4721 count++;
4722 scan++;
4723 }
4724 }
4725 else
4726 {
4727 cu = *opnd;
4728 while (count < maxcount && *scan == cu)
4729 {
4730 count++;
4731 scan++;
4732 }
4733 }
4734 break;
4735 }
4736
4737#ifdef FEAT_MBYTE
4738 case MULTIBYTECODE:
4739 {
4740 int i, len, cf = 0;
4741
4742 /* Safety check (just in case 'encoding' was changed since
4743 * compiling the program). */
4744 if ((len = (*mb_ptr2len_check)(opnd)) > 1)
4745 {
4746 if (ireg_ic && enc_utf8)
4747 cf = utf_fold(utf_ptr2char(opnd));
4748 while (count < maxcount)
4749 {
4750 for (i = 0; i < len; ++i)
4751 if (opnd[i] != scan[i])
4752 break;
4753 if (i < len && (!ireg_ic || !enc_utf8
4754 || utf_fold(utf_ptr2char(scan)) != cf))
4755 break;
4756 scan += len;
4757 ++count;
4758 }
4759 }
4760 }
4761 break;
4762#endif
4763
4764 case ANYOF:
4765 case ANYOF + ADD_NL:
4766 testval = TRUE;
4767 /*FALLTHROUGH*/
4768
4769 case ANYBUT:
4770 case ANYBUT + ADD_NL:
4771 while (count < maxcount)
4772 {
4773#ifdef FEAT_MBYTE
4774 int len;
4775#endif
4776 if (*scan == NUL)
4777 {
4778 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4779 break;
4780 reg_nextline();
4781 scan = reginput;
4782 if (got_int)
4783 break;
4784 }
4785 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4786 ++scan;
4787#ifdef FEAT_MBYTE
4788 else if (has_mbyte && (len = (*mb_ptr2len_check)(scan)) > 1)
4789 {
4790 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
4791 break;
4792 scan += len;
4793 }
4794#endif
4795 else
4796 {
4797 if ((cstrchr(opnd, *scan) == NULL) == testval)
4798 break;
4799 ++scan;
4800 }
4801 ++count;
4802 }
4803 break;
4804
4805 case NEWL:
4806 while (count < maxcount
4807 && ((*scan == NUL && reglnum < reg_maxline)
4808 || (*scan == '\n' && reg_line_lbr)))
4809 {
4810 count++;
4811 if (reg_line_lbr)
4812 ADVANCE_REGINPUT();
4813 else
4814 reg_nextline();
4815 scan = reginput;
4816 if (got_int)
4817 break;
4818 }
4819 break;
4820
4821 default: /* Oh dear. Called inappropriately. */
4822 EMSG(_(e_re_corr));
4823#ifdef DEBUG
4824 printf("Called regrepeat with op code %d\n", OP(p));
4825#endif
4826 break;
4827 }
4828
4829 reginput = scan;
4830
4831 return (int)count;
4832}
4833
4834/*
4835 * regnext - dig the "next" pointer out of a node
4836 */
4837 static char_u *
4838regnext(p)
4839 char_u *p;
4840{
4841 int offset;
4842
4843 if (p == JUST_CALC_SIZE)
4844 return NULL;
4845
4846 offset = NEXT(p);
4847 if (offset == 0)
4848 return NULL;
4849
4850 if (OP(p) == BACK)
4851 return p - offset;
4852 else
4853 return p + offset;
4854}
4855
4856/*
4857 * Check the regexp program for its magic number.
4858 * Return TRUE if it's wrong.
4859 */
4860 static int
4861prog_magic_wrong()
4862{
4863 if (UCHARAT(REG_MULTI
4864 ? reg_mmatch->regprog->program
4865 : reg_match->regprog->program) != REGMAGIC)
4866 {
4867 EMSG(_(e_re_corr));
4868 return TRUE;
4869 }
4870 return FALSE;
4871}
4872
4873/*
4874 * Cleanup the subexpressions, if this wasn't done yet.
4875 * This construction is used to clear the subexpressions only when they are
4876 * used (to increase speed).
4877 */
4878 static void
4879cleanup_subexpr()
4880{
4881 if (need_clear_subexpr)
4882 {
4883 if (REG_MULTI)
4884 {
4885 /* Use 0xff to set lnum to -1 */
4886 vim_memset(reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4887 vim_memset(reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4888 }
4889 else
4890 {
4891 vim_memset(reg_startp, 0, sizeof(char_u *) * NSUBEXP);
4892 vim_memset(reg_endp, 0, sizeof(char_u *) * NSUBEXP);
4893 }
4894 need_clear_subexpr = FALSE;
4895 }
4896}
4897
#ifdef FEAT_SYN_HL
/*
 * Clear the \z( \) subexpression start/end info, but only when
 * "need_clear_zsubexpr" says this still has to be done.
 */
    static void
cleanup_zsubexpr()
{
    if (!need_clear_zsubexpr)
        return;                 /* already done */

    if (!REG_MULTI)
    {
        /* Single-line matching uses pointers: make them all NULL. */
        vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
        vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
    }
    else
    {
        /* Multi-line matching uses positions.
         * Use 0xff to set lnum to -1 */
        vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
        vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
    }
    need_clear_zsubexpr = FALSE;
}
#endif
4919
4920/*
4921 * Advance reglnum, regline and reginput to the next line.
4922 */
4923 static void
4924reg_nextline()
4925{
4926 regline = reg_getline(++reglnum);
4927 reginput = regline;
4928 fast_breakcheck();
4929}
4930
4931/*
4932 * Save the input line and position in a regsave_T.
4933 */
4934 static void
4935reg_save(save)
4936 regsave_T *save;
4937{
4938 if (REG_MULTI)
4939 {
4940 save->rs_u.pos.col = (colnr_T)(reginput - regline);
4941 save->rs_u.pos.lnum = reglnum;
4942 }
4943 else
4944 save->rs_u.ptr = reginput;
4945}
4946
4947/*
4948 * Restore the input line and position from a regsave_T.
4949 */
4950 static void
4951reg_restore(save)
4952 regsave_T *save;
4953{
4954 if (REG_MULTI)
4955 {
4956 if (reglnum != save->rs_u.pos.lnum)
4957 {
4958 /* only call reg_getline() when the line number changed to save
4959 * a bit of time */
4960 reglnum = save->rs_u.pos.lnum;
4961 regline = reg_getline(reglnum);
4962 }
4963 reginput = regline + save->rs_u.pos.col;
4964 }
4965 else
4966 reginput = save->rs_u.ptr;
4967}
4968
4969/*
4970 * Return TRUE if current position is equal to saved position.
4971 */
4972 static int
4973reg_save_equal(save)
4974 regsave_T *save;
4975{
4976 if (REG_MULTI)
4977 return reglnum == save->rs_u.pos.lnum
4978 && reginput == regline + save->rs_u.pos.col;
4979 return reginput == save->rs_u.ptr;
4980}
4981
4982/*
4983 * Tentatively set the sub-expression start to the current position (after
4984 * calling regmatch() they will have changed). Need to save the existing
4985 * values for when there is no match.
4986 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
4987 * depending on REG_MULTI.
4988 */
4989 static void
4990save_se_multi(savep, posp)
4991 save_se_T *savep;
4992 lpos_T *posp;
4993{
4994 savep->se_u.pos = *posp;
4995 posp->lnum = reglnum;
4996 posp->col = (colnr_T)(reginput - regline);
4997}
4998
4999 static void
5000save_se_one(savep, pp)
5001 save_se_T *savep;
5002 char_u **pp;
5003{
5004 savep->se_u.ptr = *pp;
5005 *pp = reginput;
5006}
5007
5008/*
5009 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5010 */
5011 static int
5012re_num_cmp(val, scan)
5013 long_u val;
5014 char_u *scan;
5015{
5016 long_u n = OPERAND_MIN(scan);
5017
5018 if (OPERAND_CMP(scan) == '>')
5019 return val > n;
5020 if (OPERAND_CMP(scan) == '<')
5021 return val < n;
5022 return val == n;
5023}
5024
5025
5026#ifdef DEBUG
5027
5028/*
5029 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5030 */
5031 static void
5032regdump(pattern, r)
5033 char_u *pattern;
5034 regprog_T *r;
5035{
5036 char_u *s;
5037 int op = EXACTLY; /* Arbitrary non-END op. */
5038 char_u *next;
5039 char_u *end = NULL;
5040
5041 printf("\r\nregcomp(%s):\r\n", pattern);
5042
5043 s = r->program + 1;
5044 /*
5045 * Loop until we find the END that isn't before a referred next (an END
5046 * can also appear in a NOMATCH operand).
5047 */
5048 while (op != END || s <= end)
5049 {
5050 op = OP(s);
5051 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
5052 next = regnext(s);
5053 if (next == NULL) /* Next ptr. */
5054 printf("(0)");
5055 else
5056 printf("(%d)", (int)((s - r->program) + (next - s)));
5057 if (end < next)
5058 end = next;
5059 if (op == BRACE_LIMITS)
5060 {
5061 /* Two short ints */
5062 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5063 s += 8;
5064 }
5065 s += 3;
5066 if (op == ANYOF || op == ANYOF + ADD_NL
5067 || op == ANYBUT || op == ANYBUT + ADD_NL
5068 || op == EXACTLY)
5069 {
5070 /* Literal string, where present. */
5071 while (*s != NUL)
5072 printf("%c", *s++);
5073 s++;
5074 }
5075 printf("\r\n");
5076 }
5077
5078 /* Header fields of interest. */
5079 if (r->regstart != NUL)
5080 printf("start `%s' 0x%x; ", r->regstart < 256
5081 ? (char *)transchar(r->regstart)
5082 : "multibyte", r->regstart);
5083 if (r->reganch)
5084 printf("anchored; ");
5085 if (r->regmust != NULL)
5086 printf("must have \"%s\"", r->regmust);
5087 printf("\r\n");
5088}
5089
5090/*
5091 * regprop - printable representation of opcode
5092 */
5093 static char_u *
5094regprop(op)
5095 char_u *op;
5096{
5097 char_u *p;
5098 static char_u buf[50];
5099
5100 (void) strcpy(buf, ":");
5101
5102 switch (OP(op))
5103 {
5104 case BOL:
5105 p = "BOL";
5106 break;
5107 case EOL:
5108 p = "EOL";
5109 break;
5110 case RE_BOF:
5111 p = "BOF";
5112 break;
5113 case RE_EOF:
5114 p = "EOF";
5115 break;
5116 case CURSOR:
5117 p = "CURSOR";
5118 break;
5119 case RE_LNUM:
5120 p = "RE_LNUM";
5121 break;
5122 case RE_COL:
5123 p = "RE_COL";
5124 break;
5125 case RE_VCOL:
5126 p = "RE_VCOL";
5127 break;
5128 case BOW:
5129 p = "BOW";
5130 break;
5131 case EOW:
5132 p = "EOW";
5133 break;
5134 case ANY:
5135 p = "ANY";
5136 break;
5137 case ANY + ADD_NL:
5138 p = "ANY+NL";
5139 break;
5140 case ANYOF:
5141 p = "ANYOF";
5142 break;
5143 case ANYOF + ADD_NL:
5144 p = "ANYOF+NL";
5145 break;
5146 case ANYBUT:
5147 p = "ANYBUT";
5148 break;
5149 case ANYBUT + ADD_NL:
5150 p = "ANYBUT+NL";
5151 break;
5152 case IDENT:
5153 p = "IDENT";
5154 break;
5155 case IDENT + ADD_NL:
5156 p = "IDENT+NL";
5157 break;
5158 case SIDENT:
5159 p = "SIDENT";
5160 break;
5161 case SIDENT + ADD_NL:
5162 p = "SIDENT+NL";
5163 break;
5164 case KWORD:
5165 p = "KWORD";
5166 break;
5167 case KWORD + ADD_NL:
5168 p = "KWORD+NL";
5169 break;
5170 case SKWORD:
5171 p = "SKWORD";
5172 break;
5173 case SKWORD + ADD_NL:
5174 p = "SKWORD+NL";
5175 break;
5176 case FNAME:
5177 p = "FNAME";
5178 break;
5179 case FNAME + ADD_NL:
5180 p = "FNAME+NL";
5181 break;
5182 case SFNAME:
5183 p = "SFNAME";
5184 break;
5185 case SFNAME + ADD_NL:
5186 p = "SFNAME+NL";
5187 break;
5188 case PRINT:
5189 p = "PRINT";
5190 break;
5191 case PRINT + ADD_NL:
5192 p = "PRINT+NL";
5193 break;
5194 case SPRINT:
5195 p = "SPRINT";
5196 break;
5197 case SPRINT + ADD_NL:
5198 p = "SPRINT+NL";
5199 break;
5200 case WHITE:
5201 p = "WHITE";
5202 break;
5203 case WHITE + ADD_NL:
5204 p = "WHITE+NL";
5205 break;
5206 case NWHITE:
5207 p = "NWHITE";
5208 break;
5209 case NWHITE + ADD_NL:
5210 p = "NWHITE+NL";
5211 break;
5212 case DIGIT:
5213 p = "DIGIT";
5214 break;
5215 case DIGIT + ADD_NL:
5216 p = "DIGIT+NL";
5217 break;
5218 case NDIGIT:
5219 p = "NDIGIT";
5220 break;
5221 case NDIGIT + ADD_NL:
5222 p = "NDIGIT+NL";
5223 break;
5224 case HEX:
5225 p = "HEX";
5226 break;
5227 case HEX + ADD_NL:
5228 p = "HEX+NL";
5229 break;
5230 case NHEX:
5231 p = "NHEX";
5232 break;
5233 case NHEX + ADD_NL:
5234 p = "NHEX+NL";
5235 break;
5236 case OCTAL:
5237 p = "OCTAL";
5238 break;
5239 case OCTAL + ADD_NL:
5240 p = "OCTAL+NL";
5241 break;
5242 case NOCTAL:
5243 p = "NOCTAL";
5244 break;
5245 case NOCTAL + ADD_NL:
5246 p = "NOCTAL+NL";
5247 break;
5248 case WORD:
5249 p = "WORD";
5250 break;
5251 case WORD + ADD_NL:
5252 p = "WORD+NL";
5253 break;
5254 case NWORD:
5255 p = "NWORD";
5256 break;
5257 case NWORD + ADD_NL:
5258 p = "NWORD+NL";
5259 break;
5260 case HEAD:
5261 p = "HEAD";
5262 break;
5263 case HEAD + ADD_NL:
5264 p = "HEAD+NL";
5265 break;
5266 case NHEAD:
5267 p = "NHEAD";
5268 break;
5269 case NHEAD + ADD_NL:
5270 p = "NHEAD+NL";
5271 break;
5272 case ALPHA:
5273 p = "ALPHA";
5274 break;
5275 case ALPHA + ADD_NL:
5276 p = "ALPHA+NL";
5277 break;
5278 case NALPHA:
5279 p = "NALPHA";
5280 break;
5281 case NALPHA + ADD_NL:
5282 p = "NALPHA+NL";
5283 break;
5284 case LOWER:
5285 p = "LOWER";
5286 break;
5287 case LOWER + ADD_NL:
5288 p = "LOWER+NL";
5289 break;
5290 case NLOWER:
5291 p = "NLOWER";
5292 break;
5293 case NLOWER + ADD_NL:
5294 p = "NLOWER+NL";
5295 break;
5296 case UPPER:
5297 p = "UPPER";
5298 break;
5299 case UPPER + ADD_NL:
5300 p = "UPPER+NL";
5301 break;
5302 case NUPPER:
5303 p = "NUPPER";
5304 break;
5305 case NUPPER + ADD_NL:
5306 p = "NUPPER+NL";
5307 break;
5308 case BRANCH:
5309 p = "BRANCH";
5310 break;
5311 case EXACTLY:
5312 p = "EXACTLY";
5313 break;
5314 case NOTHING:
5315 p = "NOTHING";
5316 break;
5317 case BACK:
5318 p = "BACK";
5319 break;
5320 case END:
5321 p = "END";
5322 break;
5323 case MOPEN + 0:
5324 p = "MATCH START";
5325 break;
5326 case MOPEN + 1:
5327 case MOPEN + 2:
5328 case MOPEN + 3:
5329 case MOPEN + 4:
5330 case MOPEN + 5:
5331 case MOPEN + 6:
5332 case MOPEN + 7:
5333 case MOPEN + 8:
5334 case MOPEN + 9:
5335 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5336 p = NULL;
5337 break;
5338 case MCLOSE + 0:
5339 p = "MATCH END";
5340 break;
5341 case MCLOSE + 1:
5342 case MCLOSE + 2:
5343 case MCLOSE + 3:
5344 case MCLOSE + 4:
5345 case MCLOSE + 5:
5346 case MCLOSE + 6:
5347 case MCLOSE + 7:
5348 case MCLOSE + 8:
5349 case MCLOSE + 9:
5350 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5351 p = NULL;
5352 break;
5353 case BACKREF + 1:
5354 case BACKREF + 2:
5355 case BACKREF + 3:
5356 case BACKREF + 4:
5357 case BACKREF + 5:
5358 case BACKREF + 6:
5359 case BACKREF + 7:
5360 case BACKREF + 8:
5361 case BACKREF + 9:
5362 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5363 p = NULL;
5364 break;
5365 case NOPEN:
5366 p = "NOPEN";
5367 break;
5368 case NCLOSE:
5369 p = "NCLOSE";
5370 break;
5371#ifdef FEAT_SYN_HL
5372 case ZOPEN + 1:
5373 case ZOPEN + 2:
5374 case ZOPEN + 3:
5375 case ZOPEN + 4:
5376 case ZOPEN + 5:
5377 case ZOPEN + 6:
5378 case ZOPEN + 7:
5379 case ZOPEN + 8:
5380 case ZOPEN + 9:
5381 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5382 p = NULL;
5383 break;
5384 case ZCLOSE + 1:
5385 case ZCLOSE + 2:
5386 case ZCLOSE + 3:
5387 case ZCLOSE + 4:
5388 case ZCLOSE + 5:
5389 case ZCLOSE + 6:
5390 case ZCLOSE + 7:
5391 case ZCLOSE + 8:
5392 case ZCLOSE + 9:
5393 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5394 p = NULL;
5395 break;
5396 case ZREF + 1:
5397 case ZREF + 2:
5398 case ZREF + 3:
5399 case ZREF + 4:
5400 case ZREF + 5:
5401 case ZREF + 6:
5402 case ZREF + 7:
5403 case ZREF + 8:
5404 case ZREF + 9:
5405 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5406 p = NULL;
5407 break;
5408#endif
5409 case STAR:
5410 p = "STAR";
5411 break;
5412 case PLUS:
5413 p = "PLUS";
5414 break;
5415 case NOMATCH:
5416 p = "NOMATCH";
5417 break;
5418 case MATCH:
5419 p = "MATCH";
5420 break;
5421 case BEHIND:
5422 p = "BEHIND";
5423 break;
5424 case NOBEHIND:
5425 p = "NOBEHIND";
5426 break;
5427 case SUBPAT:
5428 p = "SUBPAT";
5429 break;
5430 case BRACE_LIMITS:
5431 p = "BRACE_LIMITS";
5432 break;
5433 case BRACE_SIMPLE:
5434 p = "BRACE_SIMPLE";
5435 break;
5436 case BRACE_COMPLEX + 0:
5437 case BRACE_COMPLEX + 1:
5438 case BRACE_COMPLEX + 2:
5439 case BRACE_COMPLEX + 3:
5440 case BRACE_COMPLEX + 4:
5441 case BRACE_COMPLEX + 5:
5442 case BRACE_COMPLEX + 6:
5443 case BRACE_COMPLEX + 7:
5444 case BRACE_COMPLEX + 8:
5445 case BRACE_COMPLEX + 9:
5446 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5447 p = NULL;
5448 break;
5449#ifdef FEAT_MBYTE
5450 case MULTIBYTECODE:
5451 p = "MULTIBYTECODE";
5452 break;
5453#endif
5454 case NEWL:
5455 p = "NEWL";
5456 break;
5457 default:
5458 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5459 p = NULL;
5460 break;
5461 }
5462 if (p != NULL)
5463 (void) strcat(buf, p);
5464 return buf;
5465}
5466#endif
5467
5468#ifdef FEAT_MBYTE
5469static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
5470
/*
 * One entry of the decomposition table: up to three code points that
 * together make up a precomposed character.  Unused slots are zero.
 */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decomposition of the characters 0xfb20 - 0xfb4f, indexed by
 * "character - 0xfb20".  Entries marked UNUSED map the character to itself.
 */
decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},                /* 0xfb20       alt ayin */
    {0x5d0,0,0},                /* 0xfb21       alt alef */
    {0x5d3,0,0},                /* 0xfb22       alt dalet */
    {0x5d4,0,0},                /* 0xfb23       alt he */
    {0x5db,0,0},                /* 0xfb24       alt kaf */
    {0x5dc,0,0},                /* 0xfb25       alt lamed */
    {0x5dd,0,0},                /* 0xfb26       alt mem-sofit */
    {0x5e8,0,0},                /* 0xfb27       alt resh */
    {0x5ea,0,0},                /* 0xfb28       alt tav */
    {'+', 0, 0},                /* 0xfb29       alt plus */
    {0x5e9, 0x5c1, 0},          /* 0xfb2a       shin+shin-dot */
    {0x5e9, 0x5c2, 0},          /* 0xfb2b       shin+sin-dot */
    {0x5e9, 0x5c1, 0x5bc},      /* 0xfb2c       shin+shin-dot+dagesh */
    {0x5e9, 0x5c2, 0x5bc},      /* 0xfb2d       shin+sin-dot+dagesh */
    {0x5d0, 0x5b7, 0},          /* 0xfb2e       alef+patah */
    {0x5d0, 0x5b8, 0},          /* 0xfb2f       alef+qamats */
    {0x5d0, 0x5b4, 0},          /* 0xfb30       alef+hiriq */
    {0x5d1, 0x5bc, 0},          /* 0xfb31       bet+dagesh */
    {0x5d2, 0x5bc, 0},          /* 0xfb32       gimel+dagesh */
    {0x5d3, 0x5bc, 0},          /* 0xfb33       dalet+dagesh */
    {0x5d4, 0x5bc, 0},          /* 0xfb34       he+dagesh */
    {0x5d5, 0x5bc, 0},          /* 0xfb35       vav+dagesh */
    {0x5d6, 0x5bc, 0},          /* 0xfb36       zayin+dagesh */
    {0xfb37, 0, 0},             /* 0xfb37 -- UNUSED */
    {0x5d8, 0x5bc, 0},          /* 0xfb38       tet+dagesh */
    {0x5d9, 0x5bc, 0},          /* 0xfb39       yud+dagesh */
    {0x5da, 0x5bc, 0},          /* 0xfb3a       kaf sofit+dagesh */
    {0x5db, 0x5bc, 0},          /* 0xfb3b       kaf+dagesh */
    {0x5dc, 0x5bc, 0},          /* 0xfb3c       lamed+dagesh */
    {0xfb3d, 0, 0},             /* 0xfb3d -- UNUSED */
    {0x5de, 0x5bc, 0},          /* 0xfb3e       mem+dagesh */
    {0xfb3f, 0, 0},             /* 0xfb3f -- UNUSED */
    {0x5e0, 0x5bc, 0},          /* 0xfb40       nun+dagesh */
    {0x5e1, 0x5bc, 0},          /* 0xfb41       samech+dagesh */
    {0xfb42, 0, 0},             /* 0xfb42 -- UNUSED */
    {0x5e3, 0x5bc, 0},          /* 0xfb43       pe sofit+dagesh */
    {0x5e4, 0x5bc,0},           /* 0xfb44       pe+dagesh */
    {0xfb45, 0, 0},             /* 0xfb45 -- UNUSED */
    {0x5e6, 0x5bc, 0},          /* 0xfb46       tsadi+dagesh */
    {0x5e7, 0x5bc, 0},          /* 0xfb47       qof+dagesh */
    {0x5e8, 0x5bc, 0},          /* 0xfb48       resh+dagesh */
    {0x5e9, 0x5bc, 0},          /* 0xfb49       shin+dagesh */
    {0x5ea, 0x5bc, 0},          /* 0xfb4a       tav+dagesh */
    {0x5d5, 0x5b9, 0},          /* 0xfb4b       vav+holam */
    {0x5d1, 0x5bf, 0},          /* 0xfb4c       bet+rafe */
    {0x5db, 0x5bf, 0},          /* 0xfb4d       kaf+rafe */
    {0x5e4, 0x5bf, 0},          /* 0xfb4e       pe+rafe */
    {0x5d0, 0x5dc, 0}           /* 0xfb4f       alef-lamed */
};

/*
 * Decompose character "c" into its parts.
 * For a character in the range covered by decomp_table (0xfb20 - 0xfb4f)
 * "*c1", "*c2" and "*c3" are set to the table entry.  For any other
 * character "*c1" is set to "c" and "*c2" and "*c3" to zero.
 */
    static void
mb_decompose(c, c1, c2, c3)
    int c, *c1, *c2, *c3;
{
    decomp_T    d;

    /* The lower bound must be the first character of the table, otherwise
     * the index below becomes negative and reads outside of the table. */
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
        d = decomp_table[c - 0xfb20];
        *c1 = d.a;
        *c2 = d.b;
        *c3 = d.c;
    }
    else
    {
        *c1 = c;
        *c2 = *c3 = 0;
    }
}
5549#endif
5550
5551/*
5552 * Compare two strings, ignore case if ireg_ic set.
5553 * Return 0 if strings match, non-zero otherwise.
5554 * Correct the length "*n" when composing characters are ignored.
5555 */
5556 static int
5557cstrncmp(s1, s2, n)
5558 char_u *s1, *s2;
5559 int *n;
5560{
5561 int result;
5562
5563 if (!ireg_ic)
5564 result = STRNCMP(s1, s2, *n);
5565 else
5566 result = MB_STRNICMP(s1, s2, *n);
5567
5568#ifdef FEAT_MBYTE
5569 /* if it failed and it's utf8 and we want to combineignore: */
5570 if (result != 0 && enc_utf8 && ireg_icombine)
5571 {
5572 char_u *str1, *str2;
5573 int c1, c2, c11, c12;
5574 int ix;
5575 int junk;
5576
5577 /* we have to handle the strcmp ourselves, since it is necessary to
5578 * deal with the composing characters by ignoring them: */
5579 str1 = s1;
5580 str2 = s2;
5581 c1 = c2 = 0;
5582 for (ix = 0; ix < *n; )
5583 {
5584 c1 = mb_ptr2char_adv(&str1);
5585 c2 = mb_ptr2char_adv(&str2);
5586 ix += utf_char2len(c1);
5587
5588 /* decompose the character if necessary, into 'base' characters
5589 * because I don't care about Arabic, I will hard-code the Hebrew
5590 * which I *do* care about! So sue me... */
5591 if (c1 != c2 && (!ireg_ic || utf_fold(c1) != utf_fold(c2)))
5592 {
5593 /* decomposition necessary? */
5594 mb_decompose(c1, &c11, &junk, &junk);
5595 mb_decompose(c2, &c12, &junk, &junk);
5596 c1 = c11;
5597 c2 = c12;
5598 if (c11 != c12 && (!ireg_ic || utf_fold(c11) != utf_fold(c12)))
5599 break;
5600 }
5601 }
5602 result = c2 - c1;
5603 if (result == 0)
5604 *n = (int)(str2 - s2);
5605 }
5606#endif
5607
5608 return result;
5609}
5610
5611/*
5612 * cstrchr: This function is used a lot for simple searches, keep it fast!
5613 */
5614 static char_u *
5615cstrchr(s, c)
5616 char_u *s;
5617 int c;
5618{
5619 char_u *p;
5620 int cc;
5621
5622 if (!ireg_ic
5623#ifdef FEAT_MBYTE
5624 || (!enc_utf8 && mb_char2len(c) > 1)
5625#endif
5626 )
5627 return vim_strchr(s, c);
5628
5629 /* tolower() and toupper() can be slow, comparing twice should be a lot
5630 * faster (esp. when using MS Visual C++!).
5631 * For UTF-8 need to use folded case. */
5632#ifdef FEAT_MBYTE
5633 if (enc_utf8 && c > 0x80)
5634 cc = utf_fold(c);
5635 else
5636#endif
5637 if (isupper(c))
5638 cc = TOLOWER_LOC(c);
5639 else if (islower(c))
5640 cc = TOUPPER_LOC(c);
5641 else
5642 return vim_strchr(s, c);
5643
5644#ifdef FEAT_MBYTE
5645 if (has_mbyte)
5646 {
5647 for (p = s; *p != NUL; p += (*mb_ptr2len_check)(p))
5648 {
5649 if (enc_utf8 && c > 0x80)
5650 {
5651 if (utf_fold(utf_ptr2char(p)) == cc)
5652 return p;
5653 }
5654 else if (*p == c || *p == cc)
5655 return p;
5656 }
5657 }
5658 else
5659#endif
5660 /* Faster version for when there are no multi-byte characters. */
5661 for (p = s; *p != NUL; ++p)
5662 if (*p == c || *p == cc)
5663 return p;
5664
5665 return NULL;
5666}
5667
5668/***************************************************************
5669 * regsub stuff *
5670 ***************************************************************/
5671
5672/* This stuff below really confuses cc on an SGI -- webb */
5673#ifdef __sgi
5674# undef __ARGS
5675# define __ARGS(x) ()
5676#endif
5677
5678/*
5679 * We should define fptr as a pointer to a function returning a pointer to
5680 * a function returning a pointer to a function ...
5681 * This is impossible, so we declare a pointer to a function returning a
5682 * pointer to a function returning void. This should work for all compilers.
5683 */
5684typedef void (*(*fptr) __ARGS((char_u *, int)))();
5685
5686static fptr do_upper __ARGS((char_u *, int));
5687static fptr do_Upper __ARGS((char_u *, int));
5688static fptr do_lower __ARGS((char_u *, int));
5689static fptr do_Lower __ARGS((char_u *, int));
5690
5691static int vim_regsub_both __ARGS((char_u *source, char_u *dest, int copy, int magic, int backslash));
5692
5693 static fptr
5694do_upper(d, c)
5695 char_u *d;
5696 int c;
5697{
5698 *d = TOUPPER_LOC(c);
5699
5700 return (fptr)NULL;
5701}
5702
5703 static fptr
5704do_Upper(d, c)
5705 char_u *d;
5706 int c;
5707{
5708 *d = TOUPPER_LOC(c);
5709
5710 return (fptr)do_Upper;
5711}
5712
5713 static fptr
5714do_lower(d, c)
5715 char_u *d;
5716 int c;
5717{
5718 *d = TOLOWER_LOC(c);
5719
5720 return (fptr)NULL;
5721}
5722
5723 static fptr
5724do_Lower(d, c)
5725 char_u *d;
5726 int c;
5727{
5728 *d = TOLOWER_LOC(c);
5729
5730 return (fptr)do_Lower;
5731}
5732
5733/*
5734 * regtilde(): Replace tildes in the pattern by the old pattern.
5735 *
5736 * Short explanation of the tilde: It stands for the previous replacement
5737 * pattern. If that previous pattern also contains a ~ we should go back a
5738 * step further... But we insert the previous pattern into the current one
5739 * and remember that.
5740 * This still does not handle the case where "magic" changes. TODO?
5741 *
5742 * The tildes are parsed once before the first call to vim_regsub().
5743 */
5744 char_u *
5745regtilde(source, magic)
5746 char_u *source;
5747 int magic;
5748{
5749 char_u *newsub = source;
5750 char_u *tmpsub;
5751 char_u *p;
5752 int len;
5753 int prevlen;
5754
5755 for (p = newsub; *p; ++p)
5756 {
5757 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
5758 {
5759 if (reg_prev_sub != NULL)
5760 {
5761 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
5762 prevlen = (int)STRLEN(reg_prev_sub);
5763 tmpsub = alloc((unsigned)(STRLEN(newsub) + prevlen));
5764 if (tmpsub != NULL)
5765 {
5766 /* copy prefix */
5767 len = (int)(p - newsub); /* not including ~ */
5768 mch_memmove(tmpsub, newsub, (size_t)len);
5769 /* interpretate tilde */
5770 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
5771 /* copy postfix */
5772 if (!magic)
5773 ++p; /* back off \ */
5774 STRCPY(tmpsub + len + prevlen, p + 1);
5775
5776 if (newsub != source) /* already allocated newsub */
5777 vim_free(newsub);
5778 newsub = tmpsub;
5779 p = newsub + len + prevlen;
5780 }
5781 }
5782 else if (magic)
5783 STRCPY(p, p + 1); /* remove '~' */
5784 else
5785 STRCPY(p, p + 2); /* remove '\~' */
5786 --p;
5787 }
5788 else
5789 {
5790 if (*p == '\\' && p[1]) /* skip escaped characters */
5791 ++p;
5792#ifdef FEAT_MBYTE
5793 if (has_mbyte)
5794 p += (*mb_ptr2len_check)(p) - 1;
5795#endif
5796 }
5797 }
5798
5799 vim_free(reg_prev_sub);
5800 if (newsub != source) /* newsub was allocated, just keep it */
5801 reg_prev_sub = newsub;
5802 else /* no ~ found, need to save newsub */
5803 reg_prev_sub = vim_strsave(newsub);
5804 return newsub;
5805}
5806
5807#ifdef FEAT_EVAL
5808static int can_f_submatch = FALSE; /* TRUE when submatch() can be used */
5809
5810/* These pointers are used instead of reg_match and reg_mmatch for
5811 * reg_submatch(). Needed for when the substitution string is an expression
5812 * that contains a call to substitute() and submatch(). */
5813static regmatch_T *submatch_match;
5814static regmmatch_T *submatch_mmatch;
5815#endif
5816
5817#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
5818/*
5819 * vim_regsub() - perform substitutions after a vim_regexec() or
5820 * vim_regexec_multi() match.
5821 *
5822 * If "copy" is TRUE really copy into "dest".
5823 * If "copy" is FALSE nothing is copied, this is just to find out the length
5824 * of the result.
5825 *
5826 * If "backslash" is TRUE, a backslash will be removed later, need to double
5827 * them to keep them, and insert a backslash before a CR to avoid it being
5828 * replaced with a line break later.
5829 *
5830 * Note: The matched text must not change between the call of
5831 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
5832 * references invalid!
5833 *
5834 * Returns the size of the replacement, including terminating NUL.
5835 */
5836 int
5837vim_regsub(rmp, source, dest, copy, magic, backslash)
5838 regmatch_T *rmp;
5839 char_u *source;
5840 char_u *dest;
5841 int copy;
5842 int magic;
5843 int backslash;
5844{
5845 reg_match = rmp;
5846 reg_mmatch = NULL;
5847 reg_maxline = 0;
5848 return vim_regsub_both(source, dest, copy, magic, backslash);
5849}
5850#endif
5851
5852 int
5853vim_regsub_multi(rmp, lnum, source, dest, copy, magic, backslash)
5854 regmmatch_T *rmp;
5855 linenr_T lnum;
5856 char_u *source;
5857 char_u *dest;
5858 int copy;
5859 int magic;
5860 int backslash;
5861{
5862 reg_match = NULL;
5863 reg_mmatch = rmp;
5864 reg_buf = curbuf; /* always works on the current buffer! */
5865 reg_firstlnum = lnum;
5866 reg_maxline = curbuf->b_ml.ml_line_count - lnum;
5867 return vim_regsub_both(source, dest, copy, magic, backslash);
5868}
5869
5870 static int
5871vim_regsub_both(source, dest, copy, magic, backslash)
5872 char_u *source;
5873 char_u *dest;
5874 int copy;
5875 int magic;
5876 int backslash;
5877{
5878 char_u *src;
5879 char_u *dst;
5880 char_u *s;
5881 int c;
5882 int no = -1;
5883 fptr func = (fptr)NULL;
5884 linenr_T clnum = 0; /* init for GCC */
5885 int len = 0; /* init for GCC */
5886#ifdef FEAT_EVAL
5887 static char_u *eval_result = NULL;
5888#endif
5889#ifdef FEAT_MBYTE
5890 int l;
5891#endif
5892
5893
5894 /* Be paranoid... */
5895 if (source == NULL || dest == NULL)
5896 {
5897 EMSG(_(e_null));
5898 return 0;
5899 }
5900 if (prog_magic_wrong())
5901 return 0;
5902 src = source;
5903 dst = dest;
5904
5905 /*
5906 * When the substitute part starts with "\=" evaluate it as an expression.
5907 */
5908 if (source[0] == '\\' && source[1] == '='
5909#ifdef FEAT_EVAL
5910 && !can_f_submatch /* can't do this recursively */
5911#endif
5912 )
5913 {
5914#ifdef FEAT_EVAL
5915 /* To make sure that the length doesn't change between checking the
5916 * length and copying the string, and to speed up things, the
5917 * resulting string is saved from the call with "copy" == FALSE to the
5918 * call with "copy" == TRUE. */
5919 if (copy)
5920 {
5921 if (eval_result != NULL)
5922 {
5923 STRCPY(dest, eval_result);
5924 dst += STRLEN(eval_result);
5925 vim_free(eval_result);
5926 eval_result = NULL;
5927 }
5928 }
5929 else
5930 {
5931 linenr_T save_reg_maxline;
5932 win_T *save_reg_win;
5933 int save_ireg_ic;
5934
5935 vim_free(eval_result);
5936
5937 /* The expression may contain substitute(), which calls us
5938 * recursively. Make sure submatch() gets the text from the first
5939 * level. Don't need to save "reg_buf", because
5940 * vim_regexec_multi() can't be called recursively. */
5941 submatch_match = reg_match;
5942 submatch_mmatch = reg_mmatch;
5943 save_reg_maxline = reg_maxline;
5944 save_reg_win = reg_win;
5945 save_ireg_ic = ireg_ic;
5946 can_f_submatch = TRUE;
5947
5948 eval_result = eval_to_string(source + 2, NULL);
5949 if (eval_result != NULL)
5950 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00005951 for (s = eval_result; *s != NUL; mb_ptr_adv(s))
Bram Moolenaar071d4272004-06-13 20:20:40 +00005952 {
5953 /* Change NL to CR, so that it becomes a line break.
5954 * Skip over a backslashed character. */
5955 if (*s == NL)
5956 *s = CAR;
5957 else if (*s == '\\' && s[1] != NUL)
5958 ++s;
Bram Moolenaar071d4272004-06-13 20:20:40 +00005959 }
5960
5961 dst += STRLEN(eval_result);
5962 }
5963
5964 reg_match = submatch_match;
5965 reg_mmatch = submatch_mmatch;
5966 reg_maxline = save_reg_maxline;
5967 reg_win = save_reg_win;
5968 ireg_ic = save_ireg_ic;
5969 can_f_submatch = FALSE;
5970 }
5971#endif
5972 }
5973 else
5974 while ((c = *src++) != NUL)
5975 {
5976 if (c == '&' && magic)
5977 no = 0;
5978 else if (c == '\\' && *src != NUL)
5979 {
5980 if (*src == '&' && !magic)
5981 {
5982 ++src;
5983 no = 0;
5984 }
5985 else if ('0' <= *src && *src <= '9')
5986 {
5987 no = *src++ - '0';
5988 }
5989 else if (vim_strchr((char_u *)"uUlLeE", *src))
5990 {
5991 switch (*src++)
5992 {
5993 case 'u': func = (fptr)do_upper;
5994 continue;
5995 case 'U': func = (fptr)do_Upper;
5996 continue;
5997 case 'l': func = (fptr)do_lower;
5998 continue;
5999 case 'L': func = (fptr)do_Lower;
6000 continue;
6001 case 'e':
6002 case 'E': func = (fptr)NULL;
6003 continue;
6004 }
6005 }
6006 }
6007 if (no < 0) /* Ordinary character. */
6008 {
6009 if (c == '\\' && *src != NUL)
6010 {
6011 /* Check for abbreviations -- webb */
6012 switch (*src)
6013 {
6014 case 'r': c = CAR; ++src; break;
6015 case 'n': c = NL; ++src; break;
6016 case 't': c = TAB; ++src; break;
6017 /* Oh no! \e already has meaning in subst pat :-( */
6018 /* case 'e': c = ESC; ++src; break; */
6019 case 'b': c = Ctrl_H; ++src; break;
6020
6021 /* If "backslash" is TRUE the backslash will be removed
6022 * later. Used to insert a literal CR. */
6023 default: if (backslash)
6024 {
6025 if (copy)
6026 *dst = '\\';
6027 ++dst;
6028 }
6029 c = *src++;
6030 }
6031 }
6032
6033 /* Write to buffer, if copy is set. */
6034#ifdef FEAT_MBYTE
6035 if (has_mbyte && (l = (*mb_ptr2len_check)(src - 1)) > 1)
6036 {
6037 /* TODO: should use "func" here. */
6038 if (copy)
6039 mch_memmove(dst, src - 1, l);
6040 dst += l - 1;
6041 src += l - 1;
6042 }
6043 else
6044 {
6045#endif
6046 if (copy)
6047 {
6048 if (func == (fptr)NULL) /* just copy */
6049 *dst = c;
6050 else /* change case */
6051 func = (fptr)(func(dst, c));
6052 /* Turbo C complains without the typecast */
6053 }
6054#ifdef FEAT_MBYTE
6055 }
6056#endif
6057 dst++;
6058 }
6059 else
6060 {
6061 if (REG_MULTI)
6062 {
6063 clnum = reg_mmatch->startpos[no].lnum;
6064 if (clnum < 0 || reg_mmatch->endpos[no].lnum < 0)
6065 s = NULL;
6066 else
6067 {
6068 s = reg_getline(clnum) + reg_mmatch->startpos[no].col;
6069 if (reg_mmatch->endpos[no].lnum == clnum)
6070 len = reg_mmatch->endpos[no].col
6071 - reg_mmatch->startpos[no].col;
6072 else
6073 len = (int)STRLEN(s);
6074 }
6075 }
6076 else
6077 {
6078 s = reg_match->startp[no];
6079 if (reg_match->endp[no] == NULL)
6080 s = NULL;
6081 else
6082 len = (int)(reg_match->endp[no] - s);
6083 }
6084 if (s != NULL)
6085 {
6086 for (;;)
6087 {
6088 if (len == 0)
6089 {
6090 if (REG_MULTI)
6091 {
6092 if (reg_mmatch->endpos[no].lnum == clnum)
6093 break;
6094 if (copy)
6095 *dst = CAR;
6096 ++dst;
6097 s = reg_getline(++clnum);
6098 if (reg_mmatch->endpos[no].lnum == clnum)
6099 len = reg_mmatch->endpos[no].col;
6100 else
6101 len = (int)STRLEN(s);
6102 }
6103 else
6104 break;
6105 }
6106 else if (*s == NUL) /* we hit NUL. */
6107 {
6108 if (copy)
6109 EMSG(_(e_re_damg));
6110 goto exit;
6111 }
6112 else
6113 {
6114 if (backslash && (*s == CAR || *s == '\\'))
6115 {
6116 /*
6117 * Insert a backslash in front of a CR, otherwise
6118 * it will be replaced by a line break.
6119 * Number of backslashes will be halved later,
6120 * double them here.
6121 */
6122 if (copy)
6123 {
6124 dst[0] = '\\';
6125 dst[1] = *s;
6126 }
6127 dst += 2;
6128 }
6129#ifdef FEAT_MBYTE
6130 else if (has_mbyte && (l = (*mb_ptr2len_check)(s)) > 1)
6131 {
6132 /* TODO: should use "func" here. */
6133 if (copy)
6134 mch_memmove(dst, s, l);
6135 dst += l;
6136 s += l - 1;
6137 len -= l - 1;
6138 }
6139#endif
6140 else
6141 {
6142 if (copy)
6143 {
6144 if (func == (fptr)NULL) /* just copy */
6145 *dst = *s;
6146 else /* change case */
6147 func = (fptr)(func(dst, *s));
6148 /* Turbo C complains without the typecast */
6149 }
6150 ++dst;
6151 }
6152 ++s;
6153 --len;
6154 }
6155 }
6156 }
6157 no = -1;
6158 }
6159 }
6160 if (copy)
6161 *dst = NUL;
6162
6163exit:
6164 return (int)((dst - dest) + 1);
6165}
6166
6167#ifdef FEAT_EVAL
6168/*
6169 * Used for the submatch() function: get the string from tne n'th submatch in
6170 * allocated memory.
6171 * Returns NULL when not in a ":s" command and for a non-existing submatch.
6172 */
6173 char_u *
6174reg_submatch(no)
6175 int no;
6176{
6177 char_u *retval = NULL;
6178 char_u *s;
6179 int len;
6180 int round;
6181 linenr_T lnum;
6182
6183 if (!can_f_submatch)
6184 return NULL;
6185
6186 if (submatch_match == NULL)
6187 {
6188 /*
6189 * First round: compute the length and allocate memory.
6190 * Second round: copy the text.
6191 */
6192 for (round = 1; round <= 2; ++round)
6193 {
6194 lnum = submatch_mmatch->startpos[no].lnum;
6195 if (lnum < 0 || submatch_mmatch->endpos[no].lnum < 0)
6196 return NULL;
6197
6198 s = reg_getline(lnum) + submatch_mmatch->startpos[no].col;
6199 if (s == NULL) /* anti-crash check, cannot happen? */
6200 break;
6201 if (submatch_mmatch->endpos[no].lnum == lnum)
6202 {
6203 /* Within one line: take form start to end col. */
6204 len = submatch_mmatch->endpos[no].col
6205 - submatch_mmatch->startpos[no].col;
6206 if (round == 2)
6207 {
6208 STRNCPY(retval, s, len);
6209 retval[len] = NUL;
6210 }
6211 ++len;
6212 }
6213 else
6214 {
6215 /* Multiple lines: take start line from start col, middle
6216 * lines completely and end line up to end col. */
6217 len = (int)STRLEN(s);
6218 if (round == 2)
6219 {
6220 STRCPY(retval, s);
6221 retval[len] = '\n';
6222 }
6223 ++len;
6224 ++lnum;
6225 while (lnum < submatch_mmatch->endpos[no].lnum)
6226 {
6227 s = reg_getline(lnum++);
6228 if (round == 2)
6229 STRCPY(retval + len, s);
6230 len += (int)STRLEN(s);
6231 if (round == 2)
6232 retval[len] = '\n';
6233 ++len;
6234 }
6235 if (round == 2)
6236 STRNCPY(retval + len, reg_getline(lnum),
6237 submatch_mmatch->endpos[no].col);
6238 len += submatch_mmatch->endpos[no].col;
6239 if (round == 2)
6240 retval[len] = NUL;
6241 ++len;
6242 }
6243
6244 if (round == 1)
6245 {
6246 retval = lalloc((long_u)len, TRUE);
6247 if (s == NULL)
6248 return NULL;
6249 }
6250 }
6251 }
6252 else
6253 {
6254 if (submatch_match->endp[no] == NULL)
6255 retval = NULL;
6256 else
6257 {
6258 s = submatch_match->startp[no];
6259 retval = vim_strnsave(s, (int)(submatch_match->endp[no] - s));
6260 }
6261 }
6262
6263 return retval;
6264}
6265#endif