blob: 4aa5b6aba6bbd4d727c076951f85cdf3853ccd43 [file] [log] [blame]
Bram Moolenaar071d4272004-06-13 20:20:40 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
4 *
5 * NOTICE:
6 *
7 * This is NOT the original regular expression code as written by Henry
8 * Spencer. This code has been modified specifically for use with the VIM
9 * editor, and should not be used separately from Vim. If you want a good
10 * regular expression library, get the original code. The copyright notice
11 * that follows is from the original.
12 *
13 * END NOTICE
14 *
15 * Copyright (c) 1986 by University of Toronto.
16 * Written by Henry Spencer. Not derived from licensed software.
17 *
18 * Permission is granted to anyone to use this software for any
19 * purpose on any computer system, and to redistribute it freely,
20 * subject to the following restrictions:
21 *
22 * 1. The author is not responsible for the consequences of use of
23 * this software, no matter how awful, even if they arise
24 * from defects in it.
25 *
26 * 2. The origin of this software must not be misrepresented, either
27 * by explicit claim or by omission.
28 *
29 * 3. Altered versions must be plainly marked as such, and must not
30 * be misrepresented as being the original software.
31 *
32 * Beware that some of this code is subtly aware of the way operator
33 * precedence is structured in regular expressions. Serious changes in
34 * regular-expression syntax might require a total rethink.
35 *
Bram Moolenaarc0197e22004-09-13 20:26:32 +000036 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
37 * Webb, Ciaran McCreesh and Bram Moolenaar.
Bram Moolenaar071d4272004-06-13 20:20:40 +000038 * Named character class support added by Walter Briscoe (1998 Jul 01)
39 */
40
41#include "vim.h"
42
43#undef DEBUG
44
45/*
46 * The "internal use only" fields in regexp.h are present to pass info from
47 * compile to execute that permits the execute phase to run lots faster on
48 * simple cases. They are:
49 *
50 * regstart char that must begin a match; NUL if none obvious; Can be a
51 * multi-byte character.
52 * reganch is the match anchored (at beginning-of-line only)?
53 * regmust string (pointer into program) that match must include, or NULL
54 * regmlen length of regmust string
55 * regflags RF_ values or'ed together
56 *
57 * Regstart and reganch permit very fast decisions on suitable starting points
58 * for a match, cutting down the work a lot. Regmust permits fast rejection
59 * of lines that cannot possibly match. The regmust tests are costly enough
60 * that vim_regcomp() supplies a regmust only if the r.e. contains something
61 * potentially expensive (at present, the only such thing detected is * or +
62 * at the start of the r.e., which can involve a lot of backup). Regmlen is
63 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
64 * computing it anyway.
65 */
66
67/*
68 * Structure for regexp "program". This is essentially a linear encoding
69 * of a nondeterministic finite-state machine (aka syntax charts or
70 * "railroad normal form" in parsing technology). Each node is an opcode
71 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
72 * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
73 * pointer with a BRANCH on both ends of it is connecting two alternatives.
74 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
75 * (as opposed to a collection of them) is never concatenated with anything
76 * because of operator precedence). The "next" pointer of a BRACES_COMPLEX
77 * node points to the node after the stuff to be repeated. The operand of some
78 * types of node is a literal string; for others, it is a node leading into a
79 * sub-FSM. In particular, the operand of a BRANCH node is the first node of
80 * the branch. (NB this is *not* a tree structure: the tail of the branch
81 * connects to the thing following the set of BRANCHes.)
82 *
83 * pattern is coded like:
84 *
85 * +-----------------+
86 * | V
87 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
88 * | ^ | ^
89 * +------+ +----------+
90 *
91 *
92 * +------------------+
93 * V |
94 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
95 * | | ^ ^
96 * | +---------------+ |
97 * +---------------------------------------------+
98 *
99 *
100 * +-------------------------+
101 * V |
102 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
103 * | | ^
104 * | +----------------+
105 * +-----------------------------------------------+
106 *
107 *
108 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
109 * | | ^ ^
110 * | +----------------+ |
111 * +--------------------------------+
112 *
113 * +---------+
114 * | V
115 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
116 * | | | | ^ ^
117 * | | | +-----+ |
118 * | | +----------------+ |
119 * | +---------------------------+ |
120 * +------------------------------------------------------+
121 *
 * They all start with a BRANCH for "\|" alternatives, even when there is only
 * one alternative.
124 */
125
126/*
127 * The opcodes are:
128 */
129
130/* definition number opnd? meaning */
131#define END 0 /* End of program or NOMATCH operand. */
132#define BOL 1 /* Match "" at beginning of line. */
133#define EOL 2 /* Match "" at end of line. */
134#define BRANCH 3 /* node Match this alternative, or the
135 * next... */
136#define BACK 4 /* Match "", "next" ptr points backward. */
137#define EXACTLY 5 /* str Match this string. */
138#define NOTHING 6 /* Match empty string. */
139#define STAR 7 /* node Match this (simple) thing 0 or more
140 * times. */
141#define PLUS 8 /* node Match this (simple) thing 1 or more
142 * times. */
143#define MATCH 9 /* node match the operand zero-width */
144#define NOMATCH 10 /* node check for no match with operand */
145#define BEHIND 11 /* node look behind for a match with operand */
146#define NOBEHIND 12 /* node look behind for no match with operand */
147#define SUBPAT 13 /* node match the operand here */
148#define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and
149 * n times (\{m,n\}). */
150#define BOW 15 /* Match "" after [^a-zA-Z0-9_] */
151#define EOW 16 /* Match "" at [^a-zA-Z0-9_] */
152#define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE
153 * and BRACE_COMPLEX. */
154#define NEWL 18 /* Match line-break */
155#define BHPOS 19 /* End position for BEHIND or NOBEHIND */
156
157
/*
 * Character classes: 20-48 are the normal items, 50-78 are the same items
 * that also match a line-break (normal opcode + ADD_NL).
 * FIRST_NL and LAST_NL are fully parenthesized so that they can be used
 * safely inside larger expressions.
 */
#define ADD_NL          30
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20      /* Match any one character. */
#define ANYOF           21      /* str  Match any character in this string. */
#define ANYBUT          22      /* str  Match any character not in this
                                 *      string. */
#define IDENT           23      /* Match identifier char */
#define SIDENT          24      /* Match identifier char but no digit */
#define KWORD           25      /* Match keyword char */
#define SKWORD          26      /* Match word char but no digit */
#define FNAME           27      /* Match file name char */
#define SFNAME          28      /* Match file name char but no digit */
#define PRINT           29      /* Match printable char */
#define SPRINT          30      /* Match printable char but no digit */
#define WHITE           31      /* Match whitespace char */
#define NWHITE          32      /* Match non-whitespace char */
#define DIGIT           33      /* Match digit char */
#define NDIGIT          34      /* Match non-digit char */
#define HEX             35      /* Match hex char */
#define NHEX            36      /* Match non-hex char */
#define OCTAL           37      /* Match octal char */
#define NOCTAL          38      /* Match non-octal char */
#define WORD            39      /* Match word char */
#define NWORD           40      /* Match non-word char */
#define HEAD            41      /* Match head char */
#define NHEAD           42      /* Match non-head char */
#define ALPHA           43      /* Match alpha char */
#define NALPHA          44      /* Match non-alpha char */
#define LOWER           45      /* Match lowercase char */
#define NLOWER          46      /* Match non-lowercase char */
#define UPPER           47      /* Match uppercase char */
#define NUPPER          48      /* Match non-uppercase char */
#define LAST_NL         (NUPPER + ADD_NL)
/* TRUE when "op" is one of the character classes that include a line-break. */
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
193
194#define MOPEN 80 /* -89 Mark this point in input as start of
195 * \( subexpr. MOPEN + 0 marks start of
196 * match. */
197#define MCLOSE 90 /* -99 Analogous to MOPEN. MCLOSE + 0 marks
198 * end of match. */
199#define BACKREF 100 /* -109 node Match same string again \1-\9 */
200
201#ifdef FEAT_SYN_HL
202# define ZOPEN 110 /* -119 Mark this point in input as start of
203 * \z( subexpr. */
204# define ZCLOSE 120 /* -129 Analogous to ZOPEN. */
205# define ZREF 130 /* -139 node Match external submatch \z1-\z9 */
206#endif
207
208#define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */
209
210#define NOPEN 150 /* Mark this point in input as start of
211 \%( subexpr. */
212#define NCLOSE 151 /* Analogous to NOPEN. */
213
214#define MULTIBYTECODE 200 /* mbc Match one multi-byte character */
215#define RE_BOF 201 /* Match "" at beginning of file. */
216#define RE_EOF 202 /* Match "" at end of file. */
217#define CURSOR 203 /* Match location of cursor. */
218
219#define RE_LNUM 204 /* nr cmp Match line number */
220#define RE_COL 205 /* nr cmp Match column number */
221#define RE_VCOL 206 /* nr cmp Match virtual column number */
222
223/*
224 * Magic characters have a special meaning, they don't match literally.
225 * Magic characters are negative. This separates them from literal characters
226 * (possibly multi-byte). Only ASCII characters can be Magic.
227 */
228#define Magic(x) ((int)(x) - 256)
229#define un_Magic(x) ((x) + 256)
230#define is_Magic(x) ((x) < 0)
231
232static int no_Magic __ARGS((int x));
233static int toggle_Magic __ARGS((int x));
234
/*
 * Return the non-Magic form of "x": a Magic (negative) value gets the Magic
 * offset removed, any other value is returned as it is.
 */
static int
no_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : x;
}
243
/*
 * Flip the magicness of "x": a Magic value is converted with un_Magic(),
 * any other value is converted with Magic().
 */
static int
toggle_Magic(x)
    int         x;
{
    return is_Magic(x) ? un_Magic(x) : Magic(x);
}
252
253/*
254 * The first byte of the regexp internal "program" is actually this magic
255 * number; the start node begins in the second byte. It's used to catch the
256 * most severe mutilation of the program by the caller.
257 */
258
259#define REGMAGIC 0234
260
261/*
262 * Opcode notes:
263 *
264 * BRANCH The set of branches constituting a single choice are hooked
265 * together with their "next" pointers, since precedence prevents
266 * anything being concatenated to any individual branch. The
267 * "next" pointer of the last BRANCH in a choice points to the
268 * thing following the whole choice. This is also where the
269 * final "next" pointer of each individual branch points; each
270 * branch starts with the operand node of a BRANCH node.
271 *
272 * BACK Normal "next" pointers all implicitly point forward; BACK
273 * exists to make loop structures possible.
274 *
275 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
276 * BRANCH structures using BACK. Simple cases (one character
277 * per match) are implemented with STAR and PLUS for speed
278 * and to minimize recursive plunges.
279 *
280 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
281 * node, and defines the min and max limits to be used for that
282 * node.
283 *
284 * MOPEN,MCLOSE ...are numbered at compile time.
285 * ZOPEN,ZCLOSE ...ditto
286 */
287
288/*
289 * A node is one char of opcode followed by two chars of "next" pointer.
290 * "Next" pointers are stored as two 8-bit bytes, high order first. The
291 * value is a positive offset from the opcode of the node containing it.
292 * An operand, if any, simply follows the node. (Note that much of the
293 * code generation knows about this implicit relationship.)
294 *
295 * Using two bytes for the "next" pointer is vast overkill for most things,
296 * but allows patterns to get big without disasters.
297 */
298#define OP(p) ((int)*(p))
299#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
300#define OPERAND(p) ((p) + 3)
301/* Obtain an operand that was stored as four bytes, MSB first. */
302#define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
303 + ((long)(p)[5] << 8) + (long)(p)[6])
304/* Obtain a second operand stored as four bytes. */
305#define OPERAND_MAX(p) OPERAND_MIN((p) + 4)
306/* Obtain a second single-byte operand stored after a four bytes operand. */
307#define OPERAND_CMP(p) (p)[7]
308
309/*
310 * Utility definitions.
311 */
312#define UCHARAT(p) ((int)*(char_u *)(p))
313
314/* Used for an error (down from) vim_regcomp(): give the error message, set
315 * rc_did_emsg and return NULL */
316#define EMSG_RET_NULL(m) { EMSG(m); rc_did_emsg = TRUE; return NULL; }
/* Like EMSG_RET_NULL(), but "m" gets one string argument: "" when "c" is
 * non-zero, a backslash otherwise.  "c" is parenthesized so that any
 * expression can be passed for it. */
#define EMSG_M_RET_NULL(m, c) { EMSG2(m, (c) ? "" : "\\"); rc_did_emsg = TRUE; return NULL; }
318#define EMSG_RET_FAIL(m) { EMSG(m); rc_did_emsg = TRUE; return FAIL; }
319#define EMSG_ONE_RET_NULL EMSG_M_RET_NULL(_("E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
320
321#define MAX_LIMIT (32767L << 16L)
322
323static int re_multi_type __ARGS((int));
324static int cstrncmp __ARGS((char_u *s1, char_u *s2, int *n));
325static char_u *cstrchr __ARGS((char_u *, int));
326
327#ifdef DEBUG
328static void regdump __ARGS((char_u *, regprog_T *));
329static char_u *regprop __ARGS((char_u *));
330#endif
331
332#define NOT_MULTI 0
333#define MULTI_ONE 1
334#define MULTI_MULT 2
335/*
336 * Return NOT_MULTI if c is not a "multi" operator.
337 * Return MULTI_ONE if c is a single "multi" operator.
338 * Return MULTI_MULT if c is a multi "multi" operator.
339 */
340 static int
341re_multi_type(c)
342 int c;
343{
344 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
345 return MULTI_ONE;
346 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
347 return MULTI_MULT;
348 return NOT_MULTI;
349}
350
351/*
352 * Flags to be passed up and down.
353 */
354#define HASWIDTH 0x1 /* Known never to match null string. */
355#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
356#define SPSTART 0x4 /* Starts with * or +. */
357#define HASNL 0x8 /* Contains some \n. */
358#define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */
359#define WORST 0 /* Worst case. */
360
361/*
362 * When regcode is set to this value, code is not emitted and size is computed
363 * instead.
364 */
365#define JUST_CALC_SIZE ((char_u *) -1)
366
367static char_u *reg_prev_sub;
368
369/*
370 * REGEXP_INRANGE contains all characters which are always special in a []
371 * range after '\'.
372 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
373 * These are:
374 * \n - New line (NL).
375 * \r - Carriage Return (CR).
376 * \t - Tab (TAB).
377 * \e - Escape (ESC).
378 * \b - Backspace (Ctrl_H).
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000379 * \d - Character code in decimal, eg \d123
380 * \o - Character code in octal, eg \o80
381 * \x - Character code in hex, eg \x4a
382 * \u - Multibyte character code, eg \u20ac
383 * \U - Long multibyte character code, eg \U12345678
Bram Moolenaar071d4272004-06-13 20:20:40 +0000384 */
385static char_u REGEXP_INRANGE[] = "]^-n\\";
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000386static char_u REGEXP_ABBR[] = "nrtebdoxuU";
Bram Moolenaar071d4272004-06-13 20:20:40 +0000387
388static int backslash_trans __ARGS((int c));
389static int skip_class_name __ARGS((char_u **pp));
390static char_u *skip_anyof __ARGS((char_u *p));
391static void init_class_tab __ARGS((void));
392
393/*
394 * Translate '\x' to its control character, except "\n", which is Magic.
395 */
396 static int
397backslash_trans(c)
398 int c;
399{
400 switch (c)
401 {
402 case 'r': return CAR;
403 case 't': return TAB;
404 case 'e': return ESC;
405 case 'b': return BS;
406 }
407 return c;
408}
409
410/*
411 * Check for a character class name. "pp" points to the '['.
412 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
413 * recognized. Otherwise "pp" is advanced to after the item.
414 */
415 static int
416skip_class_name(pp)
417 char_u **pp;
418{
419 static const char *(class_names[]) =
420 {
421 "alnum:]",
422#define CLASS_ALNUM 0
423 "alpha:]",
424#define CLASS_ALPHA 1
425 "blank:]",
426#define CLASS_BLANK 2
427 "cntrl:]",
428#define CLASS_CNTRL 3
429 "digit:]",
430#define CLASS_DIGIT 4
431 "graph:]",
432#define CLASS_GRAPH 5
433 "lower:]",
434#define CLASS_LOWER 6
435 "print:]",
436#define CLASS_PRINT 7
437 "punct:]",
438#define CLASS_PUNCT 8
439 "space:]",
440#define CLASS_SPACE 9
441 "upper:]",
442#define CLASS_UPPER 10
443 "xdigit:]",
444#define CLASS_XDIGIT 11
445 "tab:]",
446#define CLASS_TAB 12
447 "return:]",
448#define CLASS_RETURN 13
449 "backspace:]",
450#define CLASS_BACKSPACE 14
451 "escape:]",
452#define CLASS_ESCAPE 15
453 };
454#define CLASS_NONE 99
455 int i;
456
457 if ((*pp)[1] == ':')
458 {
459 for (i = 0; i < sizeof(class_names) / sizeof(*class_names); ++i)
460 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0)
461 {
462 *pp += STRLEN(class_names[i]) + 2;
463 return i;
464 }
465 }
466 return CLASS_NONE;
467}
468
469/*
470 * Skip over a "[]" range.
471 * "p" must point to the character after the '['.
472 * The returned pointer is on the matching ']', or the terminating NUL.
473 */
474 static char_u *
475skip_anyof(p)
476 char_u *p;
477{
478 int cpo_lit; /* 'cpoptions' contains 'l' flag */
479#ifdef FEAT_MBYTE
480 int l;
481#endif
482
483 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
484
485 if (*p == '^') /* Complement of range. */
486 ++p;
487 if (*p == ']' || *p == '-')
488 ++p;
489 while (*p != NUL && *p != ']')
490 {
491#ifdef FEAT_MBYTE
492 if (has_mbyte && (l = (*mb_ptr2len_check)(p)) > 1)
493 p += l;
494 else
495#endif
496 if (*p == '-')
497 {
498 ++p;
499 if (*p != ']' && *p != NUL)
Bram Moolenaar1cd871b2004-12-19 22:46:22 +0000500 mb_ptr_adv(p);
Bram Moolenaar071d4272004-06-13 20:20:40 +0000501 }
502 else if (*p == '\\'
503 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
504 || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
505 p += 2;
506 else if (*p == '[')
507 {
508 if (skip_class_name(&p) == CLASS_NONE)
509 ++p; /* It was not a class name */
510 }
511 else
512 ++p;
513 }
514
515 return p;
516}
517
518/*
519 * Specific version of character class functions.
520 * Using a table to keep this fast.
521 */
522static short class_tab[256];
523
524#define RI_DIGIT 0x01
525#define RI_HEX 0x02
526#define RI_OCTAL 0x04
527#define RI_WORD 0x08
528#define RI_HEAD 0x10
529#define RI_ALPHA 0x20
530#define RI_LOWER 0x40
531#define RI_UPPER 0x80
532#define RI_WHITE 0x100
533
534 static void
535init_class_tab()
536{
537 int i;
538 static int done = FALSE;
539
540 if (done)
541 return;
542
543 for (i = 0; i < 256; ++i)
544 {
545 if (i >= '0' && i <= '7')
546 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
547 else if (i >= '8' && i <= '9')
548 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
549 else if (i >= 'a' && i <= 'f')
550 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
551#ifdef EBCDIC
552 else if ((i >= 'g' && i <= 'i') || (i >= 'j' && i <= 'r')
553 || (i >= 's' && i <= 'z'))
554#else
555 else if (i >= 'g' && i <= 'z')
556#endif
557 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
558 else if (i >= 'A' && i <= 'F')
559 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
560#ifdef EBCDIC
561 else if ((i >= 'G' && i <= 'I') || ( i >= 'J' && i <= 'R')
562 || (i >= 'S' && i <= 'Z'))
563#else
564 else if (i >= 'G' && i <= 'Z')
565#endif
566 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
567 else if (i == '_')
568 class_tab[i] = RI_WORD + RI_HEAD;
569 else
570 class_tab[i] = 0;
571 }
572 class_tab[' '] |= RI_WHITE;
573 class_tab['\t'] |= RI_WHITE;
574 done = TRUE;
575}
576
/*
 * Character class tests using class_tab[].  Each one evaluates to non-zero
 * when the RI_ flag is set for character "c".
 * With FEAT_MBYTE only values below 0x100 are looked up in the table, other
 * values never match.  "c" is parenthesized in the comparison so that an
 * expression can be passed; note that "c" is evaluated twice there.
 */
#ifdef FEAT_MBYTE
# define ri_digit(c)    ((c) < 0x100 && (class_tab[c] & RI_DIGIT))
# define ri_hex(c)      ((c) < 0x100 && (class_tab[c] & RI_HEX))
# define ri_octal(c)    ((c) < 0x100 && (class_tab[c] & RI_OCTAL))
# define ri_word(c)     ((c) < 0x100 && (class_tab[c] & RI_WORD))
# define ri_head(c)     ((c) < 0x100 && (class_tab[c] & RI_HEAD))
# define ri_alpha(c)    ((c) < 0x100 && (class_tab[c] & RI_ALPHA))
# define ri_lower(c)    ((c) < 0x100 && (class_tab[c] & RI_LOWER))
# define ri_upper(c)    ((c) < 0x100 && (class_tab[c] & RI_UPPER))
# define ri_white(c)    ((c) < 0x100 && (class_tab[c] & RI_WHITE))
#else
# define ri_digit(c)    (class_tab[c] & RI_DIGIT)
# define ri_hex(c)      (class_tab[c] & RI_HEX)
# define ri_octal(c)    (class_tab[c] & RI_OCTAL)
# define ri_word(c)     (class_tab[c] & RI_WORD)
# define ri_head(c)     (class_tab[c] & RI_HEAD)
# define ri_alpha(c)    (class_tab[c] & RI_ALPHA)
# define ri_lower(c)    (class_tab[c] & RI_LOWER)
# define ri_upper(c)    (class_tab[c] & RI_UPPER)
# define ri_white(c)    (class_tab[c] & RI_WHITE)
#endif
598
599/* flags for regflags */
600#define RF_ICASE 1 /* ignore case */
601#define RF_NOICASE 2 /* don't ignore case */
602#define RF_HASNL 4 /* can match a NL */
603#define RF_ICOMBINE 8 /* ignore combining characters */
604#define RF_LOOKBH 16 /* uses "\@<=" or "\@<!" */
605
606/*
607 * Global work variables for vim_regcomp().
608 */
609
610static char_u *regparse; /* Input-scan pointer. */
611static int prevchr_len; /* byte length of previous char */
612static int num_complex_braces; /* Complex \{...} count */
613static int regnpar; /* () count. */
614#ifdef FEAT_SYN_HL
615static int regnzpar; /* \z() count. */
616static int re_has_z; /* \z item detected */
617#endif
618static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
619static long regsize; /* Code size. */
620static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */
621static unsigned regflags; /* RF_ flags for prog */
622static long brace_min[10]; /* Minimums for complex brace repeats */
623static long brace_max[10]; /* Maximums for complex brace repeats */
624static int brace_count[10]; /* Current counts for complex brace repeats */
625#if defined(FEAT_SYN_HL) || defined(PROTO)
626static int had_eol; /* TRUE when EOL found by vim_regcomp() */
627#endif
628static int one_exactly = FALSE; /* only do one char for EXACTLY */
629
630static int reg_magic; /* magicness of the pattern: */
631#define MAGIC_NONE 1 /* "\V" very unmagic */
632#define MAGIC_OFF 2 /* "\M" or 'magic' off */
633#define MAGIC_ON 3 /* "\m" or 'magic' */
634#define MAGIC_ALL 4 /* "\v" very magic */
635
636static int reg_string; /* matching with a string instead of a buffer
637 line */
638
639/*
640 * META contains all characters that may be magic, except '^' and '$'.
641 */
642
643#ifdef EBCDIC
644static char_u META[] = "%&()*+.123456789<=>?@ACDFHIKLMOPSUVWX[_acdfhiklmnopsuvwxz{|~";
645#else
646/* META[] is used often enough to justify turning it into a table. */
647static char_u META_flags[] = {
648 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
649 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
650/* % & ( ) * + . */
651 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
652/* 1 2 3 4 5 6 7 8 9 < = > ? */
653 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
654/* @ A C D F H I K L M O */
655 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
656/* P S U V W X Z [ _ */
657 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
658/* a c d f h i k l m n o */
659 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
660/* p s u v w x z { | ~ */
661 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
662};
663#endif
664
665static int curchr;
666
667/* arguments for reg() */
668#define REG_NOPAREN 0 /* toplevel reg() */
669#define REG_PAREN 1 /* \(\) */
670#define REG_ZPAREN 2 /* \z(\) */
671#define REG_NPAREN 3 /* \%(\) */
672
673/*
674 * Forward declarations for vim_regcomp()'s friends.
675 */
676static void initchr __ARGS((char_u *));
677static int getchr __ARGS((void));
678static void skipchr_keepstart __ARGS((void));
679static int peekchr __ARGS((void));
680static void skipchr __ARGS((void));
681static void ungetchr __ARGS((void));
Bram Moolenaarc0197e22004-09-13 20:26:32 +0000682static int gethexchrs __ARGS((int maxinputlen));
683static int getoctchrs __ARGS((void));
684static int getdecchrs __ARGS((void));
685static int coll_get_char __ARGS((void));
Bram Moolenaar071d4272004-06-13 20:20:40 +0000686static void regcomp_start __ARGS((char_u *expr, int flags));
687static char_u *reg __ARGS((int, int *));
688static char_u *regbranch __ARGS((int *flagp));
689static char_u *regconcat __ARGS((int *flagp));
690static char_u *regpiece __ARGS((int *));
691static char_u *regatom __ARGS((int *));
692static char_u *regnode __ARGS((int));
693static int prog_magic_wrong __ARGS((void));
694static char_u *regnext __ARGS((char_u *));
695static void regc __ARGS((int b));
696#ifdef FEAT_MBYTE
697static void regmbc __ARGS((int c));
698#endif
699static void reginsert __ARGS((int, char_u *));
700static void reginsert_limits __ARGS((int, long, long, char_u *));
701static char_u *re_put_long __ARGS((char_u *pr, long_u val));
702static int read_limits __ARGS((long *, long *));
703static void regtail __ARGS((char_u *, char_u *));
704static void regoptail __ARGS((char_u *, char_u *));
705
706/*
707 * Return TRUE if compiled regular expression "prog" can match a line break.
708 */
709 int
710re_multiline(prog)
711 regprog_T *prog;
712{
713 return (prog->regflags & RF_HASNL);
714}
715
716/*
717 * Return TRUE if compiled regular expression "prog" looks before the start
718 * position (pattern contains "\@<=" or "\@<!").
719 */
720 int
721re_lookbehind(prog)
722 regprog_T *prog;
723{
724 return (prog->regflags & RF_LOOKBH);
725}
726
727/*
728 * Skip past regular expression.
Bram Moolenaar86b68352004-12-27 21:59:20 +0000729 * Stop at end of 'p' or where 'dirc' is found ('/', '?', etc).
Bram Moolenaar071d4272004-06-13 20:20:40 +0000730 * Take care of characters with a backslash in front of it.
731 * Skip strings inside [ and ].
732 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
733 * expression and change "\?" to "?". If "*newp" is not NULL the expression
734 * is changed in-place.
735 */
736 char_u *
737skip_regexp(startp, dirc, magic, newp)
738 char_u *startp;
739 int dirc;
740 int magic;
741 char_u **newp;
742{
743 int mymagic;
744 char_u *p = startp;
745
746 if (magic)
747 mymagic = MAGIC_ON;
748 else
749 mymagic = MAGIC_OFF;
750
Bram Moolenaar1cd871b2004-12-19 22:46:22 +0000751 for (; p[0] != NUL; mb_ptr_adv(p))
Bram Moolenaar071d4272004-06-13 20:20:40 +0000752 {
753 if (p[0] == dirc) /* found end of regexp */
754 break;
755 if ((p[0] == '[' && mymagic >= MAGIC_ON)
756 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF))
757 {
758 p = skip_anyof(p + 1);
759 if (p[0] == NUL)
760 break;
761 }
762 else if (p[0] == '\\' && p[1] != NUL)
763 {
764 if (dirc == '?' && newp != NULL && p[1] == '?')
765 {
766 /* change "\?" to "?", make a copy first. */
767 if (*newp == NULL)
768 {
769 *newp = vim_strsave(startp);
770 if (*newp != NULL)
771 p = *newp + (p - startp);
772 }
773 if (*newp != NULL)
774 mch_memmove(p, p + 1, STRLEN(p));
775 else
776 ++p;
777 }
778 else
779 ++p; /* skip next character */
780 if (*p == 'v')
781 mymagic = MAGIC_ALL;
782 else if (*p == 'V')
783 mymagic = MAGIC_NONE;
784 }
Bram Moolenaar071d4272004-06-13 20:20:40 +0000785 }
786 return p;
787}
788
789/*
Bram Moolenaar86b68352004-12-27 21:59:20 +0000790 * vim_regcomp() - compile a regular expression into internal code
791 * Returns the program in allocated space. Returns NULL for an error.
Bram Moolenaar071d4272004-06-13 20:20:40 +0000792 *
793 * We can't allocate space until we know how big the compiled form will be,
794 * but we can't compile it (and thus know how big it is) until we've got a
795 * place to put the code. So we cheat: we compile it twice, once with code
796 * generation turned off and size counting turned on, and once "for real".
797 * This also means that we don't allocate space until we are sure that the
798 * thing really will compile successfully, and we never have to move the
799 * code and thus invalidate pointers into it. (Note that it has to be in
800 * one piece because vim_free() must be able to free it all.)
801 *
802 * Whether upper/lower case is to be ignored is decided when executing the
803 * program, it does not matter here.
804 *
805 * Beware that the optimization-preparation code in here knows about some
806 * of the structure of the compiled regexp.
807 * "re_flags": RE_MAGIC and/or RE_STRING.
808 */
809 regprog_T *
810vim_regcomp(expr, re_flags)
811 char_u *expr;
812 int re_flags;
813{
814 regprog_T *r;
815 char_u *scan;
816 char_u *longest;
817 int len;
818 int flags;
819
820 if (expr == NULL)
821 EMSG_RET_NULL(_(e_null));
822
823 init_class_tab();
824
825 /*
826 * First pass: determine size, legality.
827 */
828 regcomp_start(expr, re_flags);
829 regcode = JUST_CALC_SIZE;
830 regc(REGMAGIC);
831 if (reg(REG_NOPAREN, &flags) == NULL)
832 return NULL;
833
834 /* Small enough for pointer-storage convention? */
835#ifdef SMALL_MALLOC /* 16 bit storage allocation */
836 if (regsize >= 65536L - 256L)
837 EMSG_RET_NULL(_("E339: Pattern too long"));
838#endif
839
840 /* Allocate space. */
841 r = (regprog_T *)lalloc(sizeof(regprog_T) + regsize, TRUE);
842 if (r == NULL)
843 return NULL;
844
845 /*
846 * Second pass: emit code.
847 */
848 regcomp_start(expr, re_flags);
849 regcode = r->program;
850 regc(REGMAGIC);
851 if (reg(REG_NOPAREN, &flags) == NULL)
852 {
853 vim_free(r);
854 return NULL;
855 }
856
857 /* Dig out information for optimizations. */
858 r->regstart = NUL; /* Worst-case defaults. */
859 r->reganch = 0;
860 r->regmust = NULL;
861 r->regmlen = 0;
862 r->regflags = regflags;
863 if (flags & HASNL)
864 r->regflags |= RF_HASNL;
865 if (flags & HASLOOKBH)
866 r->regflags |= RF_LOOKBH;
867#ifdef FEAT_SYN_HL
868 /* Remember whether this pattern has any \z specials in it. */
869 r->reghasz = re_has_z;
870#endif
871 scan = r->program + 1; /* First BRANCH. */
872 if (OP(regnext(scan)) == END) /* Only one top-level choice. */
873 {
874 scan = OPERAND(scan);
875
876 /* Starting-point info. */
877 if (OP(scan) == BOL || OP(scan) == RE_BOF)
878 {
879 r->reganch++;
880 scan = regnext(scan);
881 }
882
883 if (OP(scan) == EXACTLY)
884 {
885#ifdef FEAT_MBYTE
886 if (has_mbyte)
887 r->regstart = (*mb_ptr2char)(OPERAND(scan));
888 else
889#endif
890 r->regstart = *OPERAND(scan);
891 }
892 else if ((OP(scan) == BOW
893 || OP(scan) == EOW
894 || OP(scan) == NOTHING
895 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
896 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE)
897 && OP(regnext(scan)) == EXACTLY)
898 {
899#ifdef FEAT_MBYTE
900 if (has_mbyte)
901 r->regstart = (*mb_ptr2char)(OPERAND(regnext(scan)));
902 else
903#endif
904 r->regstart = *OPERAND(regnext(scan));
905 }
906
907 /*
908 * If there's something expensive in the r.e., find the longest
909 * literal string that must appear and make it the regmust. Resolve
910 * ties in favor of later strings, since the regstart check works
911 * with the beginning of the r.e. and avoiding duplication
912 * strengthens checking. Not a strong reason, but sufficient in the
913 * absence of others.
914 */
915 /*
916 * When the r.e. starts with BOW, it is faster to look for a regmust
917 * first. Used a lot for "#" and "*" commands. (Added by mool).
918 */
919 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
920 && !(flags & HASNL))
921 {
922 longest = NULL;
923 len = 0;
924 for (; scan != NULL; scan = regnext(scan))
925 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
926 {
927 longest = OPERAND(scan);
928 len = (int)STRLEN(OPERAND(scan));
929 }
930 r->regmust = longest;
931 r->regmlen = len;
932 }
933 }
934#ifdef DEBUG
935 regdump(expr, r);
936#endif
937 return r;
938}
939
940/*
941 * Setup to parse the regexp. Used once to get the length and once to do it.
942 */
943 static void
944regcomp_start(expr, re_flags)
945 char_u *expr;
946 int re_flags; /* see vim_regcomp() */
947{
948 initchr(expr);
949 if (re_flags & RE_MAGIC)
950 reg_magic = MAGIC_ON;
951 else
952 reg_magic = MAGIC_OFF;
953 reg_string = (re_flags & RE_STRING);
954
955 num_complex_braces = 0;
956 regnpar = 1;
957 vim_memset(had_endbrace, 0, sizeof(had_endbrace));
958#ifdef FEAT_SYN_HL
959 regnzpar = 1;
960 re_has_z = 0;
961#endif
962 regsize = 0L;
963 regflags = 0;
964#if defined(FEAT_SYN_HL) || defined(PROTO)
965 had_eol = FALSE;
966#endif
967}
968
#if defined(FEAT_SYN_HL) || defined(PROTO)
/*
 * Return the "had_eol" flag.  regcomp_start() clears it and regatom() sets
 * it when it emits an EOL node for "$" or "\_$", so the value describes the
 * pattern that was compiled last.  Passing this through a global is not
 * pretty, but it does the job.
 */
    int
vim_regcomp_had_eol()
{
    return had_eol;
}
#endif
980
981/*
982 * reg - regular expression, i.e. main body or parenthesized thing
983 *
984 * Caller must absorb opening parenthesis.
985 *
986 * Combining parenthesis handling with the base level of regular expression
987 * is a trifle forced, but the need to tie the tails of the branches to what
988 * follows makes it hard to avoid.
989 */
990 static char_u *
991reg(paren, flagp)
992 int paren; /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
993 int *flagp;
994{
995 char_u *ret;
996 char_u *br;
997 char_u *ender;
998 int parno = 0;
999 int flags;
1000
1001 *flagp = HASWIDTH; /* Tentatively. */
1002
1003#ifdef FEAT_SYN_HL
1004 if (paren == REG_ZPAREN)
1005 {
1006 /* Make a ZOPEN node. */
1007 if (regnzpar >= NSUBEXP)
1008 EMSG_RET_NULL(_("E50: Too many \\z("));
1009 parno = regnzpar;
1010 regnzpar++;
1011 ret = regnode(ZOPEN + parno);
1012 }
1013 else
1014#endif
1015 if (paren == REG_PAREN)
1016 {
1017 /* Make a MOPEN node. */
1018 if (regnpar >= NSUBEXP)
1019 EMSG_M_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1020 parno = regnpar;
1021 ++regnpar;
1022 ret = regnode(MOPEN + parno);
1023 }
1024 else if (paren == REG_NPAREN)
1025 {
1026 /* Make a NOPEN node. */
1027 ret = regnode(NOPEN);
1028 }
1029 else
1030 ret = NULL;
1031
1032 /* Pick up the branches, linking them together. */
1033 br = regbranch(&flags);
1034 if (br == NULL)
1035 return NULL;
1036 if (ret != NULL)
1037 regtail(ret, br); /* [MZ]OPEN -> first. */
1038 else
1039 ret = br;
1040 /* If one of the branches can be zero-width, the whole thing can.
1041 * If one of the branches has * at start or matches a line-break, the
1042 * whole thing can. */
1043 if (!(flags & HASWIDTH))
1044 *flagp &= ~HASWIDTH;
1045 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1046 while (peekchr() == Magic('|'))
1047 {
1048 skipchr();
1049 br = regbranch(&flags);
1050 if (br == NULL)
1051 return NULL;
1052 regtail(ret, br); /* BRANCH -> BRANCH. */
1053 if (!(flags & HASWIDTH))
1054 *flagp &= ~HASWIDTH;
1055 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1056 }
1057
1058 /* Make a closing node, and hook it on the end. */
1059 ender = regnode(
1060#ifdef FEAT_SYN_HL
1061 paren == REG_ZPAREN ? ZCLOSE + parno :
1062#endif
1063 paren == REG_PAREN ? MCLOSE + parno :
1064 paren == REG_NPAREN ? NCLOSE : END);
1065 regtail(ret, ender);
1066
1067 /* Hook the tails of the branches to the closing node. */
1068 for (br = ret; br != NULL; br = regnext(br))
1069 regoptail(br, ender);
1070
1071 /* Check for proper termination. */
1072 if (paren != REG_NOPAREN && getchr() != Magic(')'))
1073 {
1074#ifdef FEAT_SYN_HL
1075 if (paren == REG_ZPAREN)
1076 EMSG_RET_NULL(_("E52: Unmatched \\z("))
1077 else
1078#endif
1079 if (paren == REG_NPAREN)
1080 EMSG_M_RET_NULL(_("E53: Unmatched %s%%("), reg_magic == MAGIC_ALL)
1081 else
1082 EMSG_M_RET_NULL(_("E54: Unmatched %s("), reg_magic == MAGIC_ALL)
1083 }
1084 else if (paren == REG_NOPAREN && peekchr() != NUL)
1085 {
1086 if (curchr == Magic(')'))
1087 EMSG_M_RET_NULL(_("E55: Unmatched %s)"), reg_magic == MAGIC_ALL)
1088 else
1089 EMSG_RET_NULL(_(e_trailing)) /* "Can't happen". */
1090 /* NOTREACHED */
1091 }
1092 /*
1093 * Here we set the flag allowing back references to this set of
1094 * parentheses.
1095 */
1096 if (paren == REG_PAREN)
1097 had_endbrace[parno] = TRUE; /* have seen the close paren */
1098 return ret;
1099}
1100
1101/*
1102 * regbranch - one alternative of an | operator
1103 *
1104 * Implements the & operator.
1105 */
1106 static char_u *
1107regbranch(flagp)
1108 int *flagp;
1109{
1110 char_u *ret;
1111 char_u *chain = NULL;
1112 char_u *latest;
1113 int flags;
1114
1115 *flagp = WORST | HASNL; /* Tentatively. */
1116
1117 ret = regnode(BRANCH);
1118 for (;;)
1119 {
1120 latest = regconcat(&flags);
1121 if (latest == NULL)
1122 return NULL;
1123 /* If one of the branches has width, the whole thing has. If one of
1124 * the branches anchors at start-of-line, the whole thing does.
1125 * If one of the branches uses look-behind, the whole thing does. */
1126 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1127 /* If one of the branches doesn't match a line-break, the whole thing
1128 * doesn't. */
1129 *flagp &= ~HASNL | (flags & HASNL);
1130 if (chain != NULL)
1131 regtail(chain, latest);
1132 if (peekchr() != Magic('&'))
1133 break;
1134 skipchr();
1135 regtail(latest, regnode(END)); /* operand ends */
1136 reginsert(MATCH, latest);
1137 chain = latest;
1138 }
1139
1140 return ret;
1141}
1142
1143/*
1144 * regbranch - one alternative of an | or & operator
1145 *
1146 * Implements the concatenation operator.
1147 */
1148 static char_u *
1149regconcat(flagp)
1150 int *flagp;
1151{
1152 char_u *first = NULL;
1153 char_u *chain = NULL;
1154 char_u *latest;
1155 int flags;
1156 int cont = TRUE;
1157
1158 *flagp = WORST; /* Tentatively. */
1159
1160 while (cont)
1161 {
1162 switch (peekchr())
1163 {
1164 case NUL:
1165 case Magic('|'):
1166 case Magic('&'):
1167 case Magic(')'):
1168 cont = FALSE;
1169 break;
1170 case Magic('Z'):
1171#ifdef FEAT_MBYTE
1172 regflags |= RF_ICOMBINE;
1173#endif
1174 skipchr_keepstart();
1175 break;
1176 case Magic('c'):
1177 regflags |= RF_ICASE;
1178 skipchr_keepstart();
1179 break;
1180 case Magic('C'):
1181 regflags |= RF_NOICASE;
1182 skipchr_keepstart();
1183 break;
1184 case Magic('v'):
1185 reg_magic = MAGIC_ALL;
1186 skipchr_keepstart();
1187 curchr = -1;
1188 break;
1189 case Magic('m'):
1190 reg_magic = MAGIC_ON;
1191 skipchr_keepstart();
1192 curchr = -1;
1193 break;
1194 case Magic('M'):
1195 reg_magic = MAGIC_OFF;
1196 skipchr_keepstart();
1197 curchr = -1;
1198 break;
1199 case Magic('V'):
1200 reg_magic = MAGIC_NONE;
1201 skipchr_keepstart();
1202 curchr = -1;
1203 break;
1204 default:
1205 latest = regpiece(&flags);
1206 if (latest == NULL)
1207 return NULL;
1208 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1209 if (chain == NULL) /* First piece. */
1210 *flagp |= flags & SPSTART;
1211 else
1212 regtail(chain, latest);
1213 chain = latest;
1214 if (first == NULL)
1215 first = latest;
1216 break;
1217 }
1218 }
1219 if (first == NULL) /* Loop ran zero times. */
1220 first = regnode(NOTHING);
1221 return first;
1222}
1223
1224/*
1225 * regpiece - something followed by possible [*+=]
1226 *
1227 * Note that the branching code sequences used for = and the general cases
1228 * of * and + are somewhat optimized: they use the same NOTHING node as
1229 * both the endmarker for their branch list and the body of the last branch.
1230 * It might seem that this node could be dispensed with entirely, but the
1231 * endmarker role is not redundant.
1232 */
1233 static char_u *
1234regpiece(flagp)
1235 int *flagp;
1236{
1237 char_u *ret;
1238 int op;
1239 char_u *next;
1240 int flags;
1241 long minval;
1242 long maxval;
1243
1244 ret = regatom(&flags);
1245 if (ret == NULL)
1246 return NULL;
1247
1248 op = peekchr();
1249 if (re_multi_type(op) == NOT_MULTI)
1250 {
1251 *flagp = flags;
1252 return ret;
1253 }
1254 if (!(flags & HASWIDTH) && re_multi_type(op) == MULTI_MULT)
1255 {
1256 if (op == Magic('*'))
1257 EMSG_M_RET_NULL(_("E56: %s* operand could be empty"),
1258 reg_magic >= MAGIC_ON);
1259 if (op == Magic('+'))
1260 EMSG_M_RET_NULL(_("E57: %s+ operand could be empty"),
1261 reg_magic == MAGIC_ALL);
1262 /* "\{}" is checked below, it's allowed when there is an upper limit */
1263 }
1264 /* default flags */
1265 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1266
1267 skipchr();
1268 switch (op)
1269 {
1270 case Magic('*'):
1271 if (flags & SIMPLE)
1272 reginsert(STAR, ret);
1273 else
1274 {
1275 /* Emit x* as (x&|), where & means "self". */
1276 reginsert(BRANCH, ret); /* Either x */
1277 regoptail(ret, regnode(BACK)); /* and loop */
1278 regoptail(ret, ret); /* back */
1279 regtail(ret, regnode(BRANCH)); /* or */
1280 regtail(ret, regnode(NOTHING)); /* null. */
1281 }
1282 break;
1283
1284 case Magic('+'):
1285 if (flags & SIMPLE)
1286 reginsert(PLUS, ret);
1287 else
1288 {
1289 /* Emit x+ as x(&|), where & means "self". */
1290 next = regnode(BRANCH); /* Either */
1291 regtail(ret, next);
1292 regtail(regnode(BACK), ret); /* loop back */
1293 regtail(next, regnode(BRANCH)); /* or */
1294 regtail(ret, regnode(NOTHING)); /* null. */
1295 }
1296 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1297 break;
1298
1299 case Magic('@'):
1300 {
1301 int lop = END;
1302
1303 switch (no_Magic(getchr()))
1304 {
1305 case '=': lop = MATCH; break; /* \@= */
1306 case '!': lop = NOMATCH; break; /* \@! */
1307 case '>': lop = SUBPAT; break; /* \@> */
1308 case '<': switch (no_Magic(getchr()))
1309 {
1310 case '=': lop = BEHIND; break; /* \@<= */
1311 case '!': lop = NOBEHIND; break; /* \@<! */
1312 }
1313 }
1314 if (lop == END)
1315 EMSG_M_RET_NULL(_("E59: invalid character after %s@"),
1316 reg_magic == MAGIC_ALL);
1317 /* Look behind must match with behind_pos. */
1318 if (lop == BEHIND || lop == NOBEHIND)
1319 {
1320 regtail(ret, regnode(BHPOS));
1321 *flagp |= HASLOOKBH;
1322 }
1323 regtail(ret, regnode(END)); /* operand ends */
1324 reginsert(lop, ret);
1325 break;
1326 }
1327
1328 case Magic('?'):
1329 case Magic('='):
1330 /* Emit x= as (x|) */
1331 reginsert(BRANCH, ret); /* Either x */
1332 regtail(ret, regnode(BRANCH)); /* or */
1333 next = regnode(NOTHING); /* null. */
1334 regtail(ret, next);
1335 regoptail(ret, next);
1336 break;
1337
1338 case Magic('{'):
1339 if (!read_limits(&minval, &maxval))
1340 return NULL;
1341 if (!(flags & HASWIDTH) && (maxval > minval
1342 ? maxval >= MAX_LIMIT : minval >= MAX_LIMIT))
1343 EMSG_M_RET_NULL(_("E58: %s{ operand could be empty"),
1344 reg_magic == MAGIC_ALL);
1345 if (flags & SIMPLE)
1346 {
1347 reginsert(BRACE_SIMPLE, ret);
1348 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1349 }
1350 else
1351 {
1352 if (num_complex_braces >= 10)
1353 EMSG_M_RET_NULL(_("E60: Too many complex %s{...}s"),
1354 reg_magic == MAGIC_ALL);
1355 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1356 regoptail(ret, regnode(BACK));
1357 regoptail(ret, ret);
1358 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1359 ++num_complex_braces;
1360 }
1361 if (minval > 0 && maxval > 0)
1362 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1363 break;
1364 }
1365 if (re_multi_type(peekchr()) != NOT_MULTI)
1366 {
1367 /* Can't have a multi follow a multi. */
1368 if (peekchr() == Magic('*'))
1369 sprintf((char *)IObuff, _("E61: Nested %s*"),
1370 reg_magic >= MAGIC_ON ? "" : "\\");
1371 else
1372 sprintf((char *)IObuff, _("E62: Nested %s%c"),
1373 reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1374 EMSG_RET_NULL(IObuff);
1375 }
1376
1377 return ret;
1378}
1379
1380/*
1381 * regatom - the lowest level
1382 *
1383 * Optimization: gobbles an entire sequence of ordinary characters so that
1384 * it can turn them into a single node, which is smaller to store and
1385 * faster to run. Don't do this when one_exactly is set.
1386 */
1387 static char_u *
1388regatom(flagp)
1389 int *flagp;
1390{
1391 char_u *ret;
1392 int flags;
1393 int cpo_lit; /* 'cpoptions' contains 'l' flag */
1394 int c;
1395 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1396 static int classcodes[] = {ANY, IDENT, SIDENT, KWORD, SKWORD,
1397 FNAME, SFNAME, PRINT, SPRINT,
1398 WHITE, NWHITE, DIGIT, NDIGIT,
1399 HEX, NHEX, OCTAL, NOCTAL,
1400 WORD, NWORD, HEAD, NHEAD,
1401 ALPHA, NALPHA, LOWER, NLOWER,
1402 UPPER, NUPPER
1403 };
1404 char_u *p;
1405 int extra = 0;
1406
1407 *flagp = WORST; /* Tentatively. */
1408 cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
1409
1410 c = getchr();
1411 switch (c)
1412 {
1413 case Magic('^'):
1414 ret = regnode(BOL);
1415 break;
1416
1417 case Magic('$'):
1418 ret = regnode(EOL);
1419#if defined(FEAT_SYN_HL) || defined(PROTO)
1420 had_eol = TRUE;
1421#endif
1422 break;
1423
1424 case Magic('<'):
1425 ret = regnode(BOW);
1426 break;
1427
1428 case Magic('>'):
1429 ret = regnode(EOW);
1430 break;
1431
1432 case Magic('_'):
1433 c = no_Magic(getchr());
1434 if (c == '^') /* "\_^" is start-of-line */
1435 {
1436 ret = regnode(BOL);
1437 break;
1438 }
1439 if (c == '$') /* "\_$" is end-of-line */
1440 {
1441 ret = regnode(EOL);
1442#if defined(FEAT_SYN_HL) || defined(PROTO)
1443 had_eol = TRUE;
1444#endif
1445 break;
1446 }
1447
1448 extra = ADD_NL;
1449 *flagp |= HASNL;
1450
1451 /* "\_[" is character range plus newline */
1452 if (c == '[')
1453 goto collection;
1454
1455 /* "\_x" is character class plus newline */
1456 /*FALLTHROUGH*/
1457
1458 /*
1459 * Character classes.
1460 */
1461 case Magic('.'):
1462 case Magic('i'):
1463 case Magic('I'):
1464 case Magic('k'):
1465 case Magic('K'):
1466 case Magic('f'):
1467 case Magic('F'):
1468 case Magic('p'):
1469 case Magic('P'):
1470 case Magic('s'):
1471 case Magic('S'):
1472 case Magic('d'):
1473 case Magic('D'):
1474 case Magic('x'):
1475 case Magic('X'):
1476 case Magic('o'):
1477 case Magic('O'):
1478 case Magic('w'):
1479 case Magic('W'):
1480 case Magic('h'):
1481 case Magic('H'):
1482 case Magic('a'):
1483 case Magic('A'):
1484 case Magic('l'):
1485 case Magic('L'):
1486 case Magic('u'):
1487 case Magic('U'):
1488 p = vim_strchr(classchars, no_Magic(c));
1489 if (p == NULL)
1490 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1491 ret = regnode(classcodes[p - classchars] + extra);
1492 *flagp |= HASWIDTH | SIMPLE;
1493 break;
1494
1495 case Magic('n'):
1496 if (reg_string)
1497 {
1498 /* In a string "\n" matches a newline character. */
1499 ret = regnode(EXACTLY);
1500 regc(NL);
1501 regc(NUL);
1502 *flagp |= HASWIDTH | SIMPLE;
1503 }
1504 else
1505 {
1506 /* In buffer text "\n" matches the end of a line. */
1507 ret = regnode(NEWL);
1508 *flagp |= HASWIDTH | HASNL;
1509 }
1510 break;
1511
1512 case Magic('('):
1513 if (one_exactly)
1514 EMSG_ONE_RET_NULL;
1515 ret = reg(REG_PAREN, &flags);
1516 if (ret == NULL)
1517 return NULL;
1518 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1519 break;
1520
1521 case NUL:
1522 case Magic('|'):
1523 case Magic('&'):
1524 case Magic(')'):
1525 EMSG_RET_NULL(_(e_internal)); /* Supposed to be caught earlier. */
1526 /* NOTREACHED */
1527
1528 case Magic('='):
1529 case Magic('?'):
1530 case Magic('+'):
1531 case Magic('@'):
1532 case Magic('{'):
1533 case Magic('*'):
1534 c = no_Magic(c);
1535 sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1536 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1537 ? "" : "\\", c);
1538 EMSG_RET_NULL(IObuff);
1539 /* NOTREACHED */
1540
1541 case Magic('~'): /* previous substitute pattern */
1542 if (reg_prev_sub)
1543 {
1544 char_u *lp;
1545
1546 ret = regnode(EXACTLY);
1547 lp = reg_prev_sub;
1548 while (*lp != NUL)
1549 regc(*lp++);
1550 regc(NUL);
1551 if (*reg_prev_sub != NUL)
1552 {
1553 *flagp |= HASWIDTH;
1554 if ((lp - reg_prev_sub) == 1)
1555 *flagp |= SIMPLE;
1556 }
1557 }
1558 else
1559 EMSG_RET_NULL(_(e_nopresub));
1560 break;
1561
1562 case Magic('1'):
1563 case Magic('2'):
1564 case Magic('3'):
1565 case Magic('4'):
1566 case Magic('5'):
1567 case Magic('6'):
1568 case Magic('7'):
1569 case Magic('8'):
1570 case Magic('9'):
1571 {
1572 int refnum;
1573
1574 refnum = c - Magic('0');
1575 /*
1576 * Check if the back reference is legal. We must have seen the
1577 * close brace.
1578 * TODO: Should also check that we don't refer to something
1579 * that is repeated (+*=): what instance of the repetition
1580 * should we match?
1581 */
1582 if (!had_endbrace[refnum])
1583 {
1584 /* Trick: check if "@<=" or "@<!" follows, in which case
1585 * the \1 can appear before the referenced match. */
1586 for (p = regparse; *p != NUL; ++p)
1587 if (p[0] == '@' && p[1] == '<'
1588 && (p[2] == '!' || p[2] == '='))
1589 break;
1590 if (*p == NUL)
1591 EMSG_RET_NULL(_("E65: Illegal back reference"));
1592 }
1593 ret = regnode(BACKREF + refnum);
1594 }
1595 break;
1596
1597#ifdef FEAT_SYN_HL
1598 case Magic('z'):
1599 {
1600 c = no_Magic(getchr());
1601 switch (c)
1602 {
1603 case '(': if (reg_do_extmatch != REX_SET)
1604 EMSG_RET_NULL(_("E66: \\z( not allowed here"));
1605 if (one_exactly)
1606 EMSG_ONE_RET_NULL;
1607 ret = reg(REG_ZPAREN, &flags);
1608 if (ret == NULL)
1609 return NULL;
1610 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1611 re_has_z = REX_SET;
1612 break;
1613
1614 case '1':
1615 case '2':
1616 case '3':
1617 case '4':
1618 case '5':
1619 case '6':
1620 case '7':
1621 case '8':
1622 case '9': if (reg_do_extmatch != REX_USE)
1623 EMSG_RET_NULL(_("E67: \\z1 et al. not allowed here"));
1624 ret = regnode(ZREF + c - '0');
1625 re_has_z = REX_USE;
1626 break;
1627
1628 case 's': ret = regnode(MOPEN + 0);
1629 break;
1630
1631 case 'e': ret = regnode(MCLOSE + 0);
1632 break;
1633
1634 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
1635 }
1636 }
1637 break;
1638#endif
1639
1640 case Magic('%'):
1641 {
1642 c = no_Magic(getchr());
1643 switch (c)
1644 {
1645 /* () without a back reference */
1646 case '(':
1647 if (one_exactly)
1648 EMSG_ONE_RET_NULL;
1649 ret = reg(REG_NPAREN, &flags);
1650 if (ret == NULL)
1651 return NULL;
1652 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1653 break;
1654
1655 /* Catch \%^ and \%$ regardless of where they appear in the
1656 * pattern -- regardless of whether or not it makes sense. */
1657 case '^':
1658 ret = regnode(RE_BOF);
1659 break;
1660
1661 case '$':
1662 ret = regnode(RE_EOF);
1663 break;
1664
1665 case '#':
1666 ret = regnode(CURSOR);
1667 break;
1668
1669 /* \%[abc]: Emit as a list of branches, all ending at the last
1670 * branch which matches nothing. */
1671 case '[':
1672 if (one_exactly) /* doesn't nest */
1673 EMSG_ONE_RET_NULL;
1674 {
1675 char_u *lastbranch;
1676 char_u *lastnode = NULL;
1677 char_u *br;
1678
1679 ret = NULL;
1680 while ((c = getchr()) != ']')
1681 {
1682 if (c == NUL)
1683 EMSG_M_RET_NULL(_("E69: Missing ] after %s%%["),
1684 reg_magic == MAGIC_ALL);
1685 br = regnode(BRANCH);
1686 if (ret == NULL)
1687 ret = br;
1688 else
1689 regtail(lastnode, br);
1690
1691 ungetchr();
1692 one_exactly = TRUE;
1693 lastnode = regatom(flagp);
1694 one_exactly = FALSE;
1695 if (lastnode == NULL)
1696 return NULL;
1697 }
1698 if (ret == NULL)
1699 EMSG_M_RET_NULL(_("E70: Empty %s%%[]"),
1700 reg_magic == MAGIC_ALL);
1701 lastbranch = regnode(BRANCH);
1702 br = regnode(NOTHING);
1703 if (ret != JUST_CALC_SIZE)
1704 {
1705 regtail(lastnode, br);
1706 regtail(lastbranch, br);
1707 /* connect all branches to the NOTHING
1708 * branch at the end */
1709 for (br = ret; br != lastnode; )
1710 {
1711 if (OP(br) == BRANCH)
1712 {
1713 regtail(br, lastbranch);
1714 br = OPERAND(br);
1715 }
1716 else
1717 br = regnext(br);
1718 }
1719 }
1720 *flagp &= ~HASWIDTH;
1721 break;
1722 }
1723
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001724 case 'd': /* %d123 decimal */
1725 case 'o': /* %o123 octal */
1726 case 'x': /* %xab hex 2 */
1727 case 'u': /* %uabcd hex 4 */
1728 case 'U': /* %U1234abcd hex 8 */
1729 {
1730 int i;
1731
1732 switch (c)
1733 {
1734 case 'd': i = getdecchrs(); break;
1735 case 'o': i = getoctchrs(); break;
1736 case 'x': i = gethexchrs(2); break;
1737 case 'u': i = gethexchrs(4); break;
1738 case 'U': i = gethexchrs(8); break;
1739 default: i = -1; break;
1740 }
1741
1742 if (i < 0)
1743 EMSG_M_RET_NULL(
1744 _("E678: Invalid character after %s%%[dxouU]"),
1745 reg_magic == MAGIC_ALL);
1746 ret = regnode(EXACTLY);
1747 if (i == 0)
1748 regc(0x0a);
1749 else
1750#ifdef FEAT_MBYTE
1751 regmbc(i);
1752#else
1753 regc(i);
1754#endif
1755 regc(NUL);
1756 *flagp |= HASWIDTH;
1757 break;
1758 }
1759
Bram Moolenaar071d4272004-06-13 20:20:40 +00001760 default:
1761 if (VIM_ISDIGIT(c) || c == '<' || c == '>')
1762 {
1763 long_u n = 0;
1764 int cmp;
1765
1766 cmp = c;
1767 if (cmp == '<' || cmp == '>')
1768 c = getchr();
1769 while (VIM_ISDIGIT(c))
1770 {
1771 n = n * 10 + (c - '0');
1772 c = getchr();
1773 }
1774 if (c == 'l' || c == 'c' || c == 'v')
1775 {
1776 if (c == 'l')
1777 ret = regnode(RE_LNUM);
1778 else if (c == 'c')
1779 ret = regnode(RE_COL);
1780 else
1781 ret = regnode(RE_VCOL);
1782 if (ret == JUST_CALC_SIZE)
1783 regsize += 5;
1784 else
1785 {
1786 /* put the number and the optional
1787 * comparator after the opcode */
1788 regcode = re_put_long(regcode, n);
1789 *regcode++ = cmp;
1790 }
1791 break;
1792 }
1793 }
1794
1795 EMSG_M_RET_NULL(_("E71: Invalid character after %s%%"),
1796 reg_magic == MAGIC_ALL);
1797 }
1798 }
1799 break;
1800
1801 case Magic('['):
1802collection:
1803 {
1804 char_u *lp;
1805
1806 /*
1807 * If there is no matching ']', we assume the '[' is a normal
1808 * character. This makes 'incsearch' and ":help [" work.
1809 */
1810 lp = skip_anyof(regparse);
1811 if (*lp == ']') /* there is a matching ']' */
1812 {
1813 int startc = -1; /* > 0 when next '-' is a range */
1814 int endc;
1815
1816 /*
1817 * In a character class, different parsing rules apply.
1818 * Not even \ is special anymore, nothing is.
1819 */
1820 if (*regparse == '^') /* Complement of range. */
1821 {
1822 ret = regnode(ANYBUT + extra);
1823 regparse++;
1824 }
1825 else
1826 ret = regnode(ANYOF + extra);
1827
1828 /* At the start ']' and '-' mean the literal character. */
1829 if (*regparse == ']' || *regparse == '-')
1830 regc(*regparse++);
1831
1832 while (*regparse != NUL && *regparse != ']')
1833 {
1834 if (*regparse == '-')
1835 {
1836 ++regparse;
1837 /* The '-' is not used for a range at the end and
1838 * after or before a '\n'. */
1839 if (*regparse == ']' || *regparse == NUL
1840 || startc == -1
1841 || (regparse[0] == '\\' && regparse[1] == 'n'))
1842 {
1843 regc('-');
1844 startc = '-'; /* [--x] is a range */
1845 }
1846 else
1847 {
1848#ifdef FEAT_MBYTE
1849 if (has_mbyte)
1850 endc = mb_ptr2char_adv(&regparse);
1851 else
1852#endif
1853 endc = *regparse++;
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001854
1855 /* Handle \o40, \x20 and \u20AC style sequences */
1856 if (endc == '\\' && !cpo_lit)
1857 endc = coll_get_char();
1858
Bram Moolenaar071d4272004-06-13 20:20:40 +00001859 if (startc > endc)
1860 EMSG_RET_NULL(_(e_invrange));
1861#ifdef FEAT_MBYTE
1862 if (has_mbyte && ((*mb_char2len)(startc) > 1
1863 || (*mb_char2len)(endc) > 1))
1864 {
1865 /* Limit to a range of 256 chars */
1866 if (endc > startc + 256)
1867 EMSG_RET_NULL(_(e_invrange));
1868 while (++startc <= endc)
1869 regmbc(startc);
1870 }
1871 else
1872#endif
1873 {
1874#ifdef EBCDIC
1875 int alpha_only = FALSE;
1876
1877 /* for alphabetical range skip the gaps
1878 * 'i'-'j', 'r'-'s', 'I'-'J' and 'R'-'S'. */
1879 if (isalpha(startc) && isalpha(endc))
1880 alpha_only = TRUE;
1881#endif
1882 while (++startc <= endc)
1883#ifdef EBCDIC
1884 if (!alpha_only || isalpha(startc))
1885#endif
1886 regc(startc);
1887 }
1888 startc = -1;
1889 }
1890 }
1891 /*
1892 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1893 * accepts "\t", "\e", etc., but only when the 'l' flag in
1894 * 'cpoptions' is not included.
1895 */
1896 else if (*regparse == '\\'
1897 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1898 || (!cpo_lit
1899 && vim_strchr(REGEXP_ABBR,
1900 regparse[1]) != NULL)))
1901 {
1902 regparse++;
1903 if (*regparse == 'n')
1904 {
1905 /* '\n' in range: also match NL */
1906 if (ret != JUST_CALC_SIZE)
1907 {
1908 if (*ret == ANYBUT)
1909 *ret = ANYBUT + ADD_NL;
1910 else if (*ret == ANYOF)
1911 *ret = ANYOF + ADD_NL;
1912 /* else: must have had a \n already */
1913 }
1914 *flagp |= HASNL;
1915 regparse++;
1916 startc = -1;
1917 }
Bram Moolenaarc0197e22004-09-13 20:26:32 +00001918 else if (*regparse == 'd'
1919 || *regparse == 'o'
1920 || *regparse == 'x'
1921 || *regparse == 'u'
1922 || *regparse == 'U')
1923 {
1924 startc = coll_get_char();
1925 if (startc == 0)
1926 regc(0x0a);
1927 else
1928#ifdef FEAT_MBYTE
1929 regmbc(startc);
1930#else
1931 regc(startc);
1932#endif
1933 }
Bram Moolenaar071d4272004-06-13 20:20:40 +00001934 else
1935 {
1936 startc = backslash_trans(*regparse++);
1937 regc(startc);
1938 }
1939 }
1940 else if (*regparse == '[')
1941 {
1942 int c_class;
1943 int cu;
1944
1945 c_class = skip_class_name(&regparse);
1946 startc = -1;
1947 /* Characters assumed to be 8 bits! */
1948 switch (c_class)
1949 {
1950 case CLASS_NONE:
1951 /* literal '[', allow [[-x] as a range */
1952 startc = *regparse++;
1953 regc(startc);
1954 break;
1955 case CLASS_ALNUM:
1956 for (cu = 1; cu <= 255; cu++)
1957 if (isalnum(cu))
1958 regc(cu);
1959 break;
1960 case CLASS_ALPHA:
1961 for (cu = 1; cu <= 255; cu++)
1962 if (isalpha(cu))
1963 regc(cu);
1964 break;
1965 case CLASS_BLANK:
1966 regc(' ');
1967 regc('\t');
1968 break;
1969 case CLASS_CNTRL:
1970 for (cu = 1; cu <= 255; cu++)
1971 if (iscntrl(cu))
1972 regc(cu);
1973 break;
1974 case CLASS_DIGIT:
1975 for (cu = 1; cu <= 255; cu++)
1976 if (VIM_ISDIGIT(cu))
1977 regc(cu);
1978 break;
1979 case CLASS_GRAPH:
1980 for (cu = 1; cu <= 255; cu++)
1981 if (isgraph(cu))
1982 regc(cu);
1983 break;
1984 case CLASS_LOWER:
1985 for (cu = 1; cu <= 255; cu++)
1986 if (islower(cu))
1987 regc(cu);
1988 break;
1989 case CLASS_PRINT:
1990 for (cu = 1; cu <= 255; cu++)
1991 if (vim_isprintc(cu))
1992 regc(cu);
1993 break;
1994 case CLASS_PUNCT:
1995 for (cu = 1; cu <= 255; cu++)
1996 if (ispunct(cu))
1997 regc(cu);
1998 break;
1999 case CLASS_SPACE:
2000 for (cu = 9; cu <= 13; cu++)
2001 regc(cu);
2002 regc(' ');
2003 break;
2004 case CLASS_UPPER:
2005 for (cu = 1; cu <= 255; cu++)
2006 if (isupper(cu))
2007 regc(cu);
2008 break;
2009 case CLASS_XDIGIT:
2010 for (cu = 1; cu <= 255; cu++)
2011 if (vim_isxdigit(cu))
2012 regc(cu);
2013 break;
2014 case CLASS_TAB:
2015 regc('\t');
2016 break;
2017 case CLASS_RETURN:
2018 regc('\r');
2019 break;
2020 case CLASS_BACKSPACE:
2021 regc('\b');
2022 break;
2023 case CLASS_ESCAPE:
2024 regc('\033');
2025 break;
2026 }
2027 }
2028 else
2029 {
2030#ifdef FEAT_MBYTE
2031 if (has_mbyte)
2032 {
2033 int len;
2034
2035 /* produce a multibyte character, including any
2036 * following composing characters */
2037 startc = mb_ptr2char(regparse);
2038 len = (*mb_ptr2len_check)(regparse);
2039 if (enc_utf8 && utf_char2len(startc) != len)
2040 startc = -1; /* composing chars */
2041 while (--len >= 0)
2042 regc(*regparse++);
2043 }
2044 else
2045#endif
2046 {
2047 startc = *regparse++;
2048 regc(startc);
2049 }
2050 }
2051 }
2052 regc(NUL);
2053 prevchr_len = 1; /* last char was the ']' */
2054 if (*regparse != ']')
2055 EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */
2056 skipchr(); /* let's be friends with the lexer again */
2057 *flagp |= HASWIDTH | SIMPLE;
2058 break;
2059 }
2060 }
2061 /* FALLTHROUGH */
2062
2063 default:
2064 {
2065 int len;
2066
2067#ifdef FEAT_MBYTE
2068 /* A multi-byte character is handled as a separate atom if it's
2069 * before a multi. */
2070 if (has_mbyte && (*mb_char2len)(c) > 1
2071 && re_multi_type(peekchr()) != NOT_MULTI)
2072 {
2073 ret = regnode(MULTIBYTECODE);
2074 regmbc(c);
2075 *flagp |= HASWIDTH | SIMPLE;
2076 break;
2077 }
2078#endif
2079
2080 ret = regnode(EXACTLY);
2081
2082 /*
2083 * Append characters as long as:
2084 * - there is no following multi, we then need the character in
2085 * front of it as a single character operand
2086 * - not running into a Magic character
2087 * - "one_exactly" is not set
2088 * But always emit at least one character. Might be a Multi,
2089 * e.g., a "[" without matching "]".
2090 */
2091 for (len = 0; c != NUL && (len == 0
2092 || (re_multi_type(peekchr()) == NOT_MULTI
2093 && !one_exactly
2094 && !is_Magic(c))); ++len)
2095 {
2096 c = no_Magic(c);
2097#ifdef FEAT_MBYTE
2098 if (has_mbyte)
2099 {
2100 regmbc(c);
2101 if (enc_utf8)
2102 {
2103 int off;
2104 int l;
2105
2106 /* Need to get composing character too, directly
2107 * access regparse for that, because skipchr() skips
2108 * over composing chars. */
2109 ungetchr();
2110 if (*regparse == '\\' && regparse[1] != NUL)
2111 off = 1;
2112 else
2113 off = 0;
2114 for (;;)
2115 {
2116 l = utf_ptr2len_check(regparse + off);
2117 if (!UTF_COMPOSINGLIKE(regparse + off,
2118 regparse + off + l))
2119 break;
2120 off += l;
2121 regmbc(utf_ptr2char(regparse + off));
2122 }
2123 skipchr();
2124 }
2125 }
2126 else
2127#endif
2128 regc(c);
2129 c = getchr();
2130 }
2131 ungetchr();
2132
2133 regc(NUL);
2134 *flagp |= HASWIDTH;
2135 if (len == 1)
2136 *flagp |= SIMPLE;
2137 }
2138 break;
2139 }
2140
2141 return ret;
2142}
2143
2144/*
2145 * emit a node
2146 * Return pointer to generated code.
2147 */
2148 static char_u *
2149regnode(op)
2150 int op;
2151{
2152 char_u *ret;
2153
2154 ret = regcode;
2155 if (ret == JUST_CALC_SIZE)
2156 regsize += 3;
2157 else
2158 {
2159 *regcode++ = op;
2160 *regcode++ = NUL; /* Null "next" pointer. */
2161 *regcode++ = NUL;
2162 }
2163 return ret;
2164}
2165
2166/*
2167 * Emit (if appropriate) a byte of code
2168 */
2169 static void
2170regc(b)
2171 int b;
2172{
2173 if (regcode == JUST_CALC_SIZE)
2174 regsize++;
2175 else
2176 *regcode++ = b;
2177}
2178
#ifdef FEAT_MBYTE
/*
 * Emit character "c", which may take more than one byte, or only add its
 * length to "regsize" when computing the size.
 */
    static void
regmbc(c)
    int		c;
{
    if (regcode != JUST_CALC_SIZE)
    {
	/* mb_char2bytes() returns the number of bytes it stored. */
	regcode += (*mb_char2bytes)(c, regcode);
	return;
    }
    regsize += (*mb_char2len)(c);
}
#endif
2193
2194/*
2195 * reginsert - insert an operator in front of already-emitted operand
2196 *
2197 * Means relocating the operand.
2198 */
2199 static void
2200reginsert(op, opnd)
2201 int op;
2202 char_u *opnd;
2203{
2204 char_u *src;
2205 char_u *dst;
2206 char_u *place;
2207
2208 if (regcode == JUST_CALC_SIZE)
2209 {
2210 regsize += 3;
2211 return;
2212 }
2213 src = regcode;
2214 regcode += 3;
2215 dst = regcode;
2216 while (src > opnd)
2217 *--dst = *--src;
2218
2219 place = opnd; /* Op node, where operand used to be. */
2220 *place++ = op;
2221 *place++ = NUL;
2222 *place = NUL;
2223}
2224
2225/*
2226 * reginsert_limits - insert an operator in front of already-emitted operand.
2227 * The operator has the given limit values as operands. Also set next pointer.
2228 *
2229 * Means relocating the operand.
2230 */
2231 static void
2232reginsert_limits(op, minval, maxval, opnd)
2233 int op;
2234 long minval;
2235 long maxval;
2236 char_u *opnd;
2237{
2238 char_u *src;
2239 char_u *dst;
2240 char_u *place;
2241
2242 if (regcode == JUST_CALC_SIZE)
2243 {
2244 regsize += 11;
2245 return;
2246 }
2247 src = regcode;
2248 regcode += 11;
2249 dst = regcode;
2250 while (src > opnd)
2251 *--dst = *--src;
2252
2253 place = opnd; /* Op node, where operand used to be. */
2254 *place++ = op;
2255 *place++ = NUL;
2256 *place++ = NUL;
2257 place = re_put_long(place, (long_u)minval);
2258 place = re_put_long(place, (long_u)maxval);
2259 regtail(opnd, place);
2260}
2261
2262/*
2263 * Write a long as four bytes at "p" and return pointer to the next char.
2264 */
2265 static char_u *
2266re_put_long(p, val)
2267 char_u *p;
2268 long_u val;
2269{
2270 *p++ = (char_u) ((val >> 24) & 0377);
2271 *p++ = (char_u) ((val >> 16) & 0377);
2272 *p++ = (char_u) ((val >> 8) & 0377);
2273 *p++ = (char_u) (val & 0377);
2274 return p;
2275}
2276
2277/*
2278 * regtail - set the next-pointer at the end of a node chain
2279 */
2280 static void
2281regtail(p, val)
2282 char_u *p;
2283 char_u *val;
2284{
2285 char_u *scan;
2286 char_u *temp;
2287 int offset;
2288
2289 if (p == JUST_CALC_SIZE)
2290 return;
2291
2292 /* Find last node. */
2293 scan = p;
2294 for (;;)
2295 {
2296 temp = regnext(scan);
2297 if (temp == NULL)
2298 break;
2299 scan = temp;
2300 }
2301
2302 if (OP(scan) == BACK)
2303 offset = (int)(scan - val);
2304 else
2305 offset = (int)(val - scan);
2306 *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377);
2307 *(scan + 2) = (char_u) (offset & 0377);
2308}
2309
2310/*
2311 * regoptail - regtail on item after a BRANCH; nop if none
2312 */
2313 static void
2314regoptail(p, val)
2315 char_u *p;
2316 char_u *val;
2317{
2318 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2319 if (p == NULL || p == JUST_CALC_SIZE
2320 || (OP(p) != BRANCH
2321 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2322 return;
2323 regtail(OPERAND(p), val);
2324}
2325
2326/*
2327 * getchr() - get the next character from the pattern. We know about
2328 * magic and such, so therefore we need a lexical analyzer.
2329 */
2330
2331/* static int curchr; */
2332static int prevprevchr;
2333static int prevchr;
2334static int nextchr; /* used for ungetchr() */
2335/*
2336 * Note: prevchr is sometimes -1 when we are not at the start,
2337 * eg in /[ ^I]^ the pattern was never found even if it existed, because ^ was
2338 * taken to be magic -- webb
2339 */
2340static int at_start; /* True when on the first character */
2341static int prev_at_start; /* True when on the second character */
2342
2343 static void
2344initchr(str)
2345 char_u *str;
2346{
2347 regparse = str;
2348 prevchr_len = 0;
2349 curchr = prevprevchr = prevchr = nextchr = -1;
2350 at_start = TRUE;
2351 prev_at_start = FALSE;
2352}
2353
2354 static int
2355peekchr()
2356{
2357 if (curchr == -1)
2358 {
2359 switch (curchr = regparse[0])
2360 {
2361 case '.':
2362 case '[':
2363 case '~':
2364 /* magic when 'magic' is on */
2365 if (reg_magic >= MAGIC_ON)
2366 curchr = Magic(curchr);
2367 break;
2368 case '(':
2369 case ')':
2370 case '{':
2371 case '%':
2372 case '+':
2373 case '=':
2374 case '?':
2375 case '@':
2376 case '!':
2377 case '&':
2378 case '|':
2379 case '<':
2380 case '>':
2381 case '#': /* future ext. */
2382 case '"': /* future ext. */
2383 case '\'': /* future ext. */
2384 case ',': /* future ext. */
2385 case '-': /* future ext. */
2386 case ':': /* future ext. */
2387 case ';': /* future ext. */
2388 case '`': /* future ext. */
2389 case '/': /* Can't be used in / command */
2390 /* magic only after "\v" */
2391 if (reg_magic == MAGIC_ALL)
2392 curchr = Magic(curchr);
2393 break;
2394 case '*':
2395 /* * is not magic as the very first character, eg "?*ptr" and when
2396 * after '^', eg "/^*ptr" */
2397 if (reg_magic >= MAGIC_ON && !at_start
2398 && !(prev_at_start && prevchr == Magic('^')))
2399 curchr = Magic('*');
2400 break;
2401 case '^':
2402 /* '^' is only magic as the very first character and if it's after
2403 * "\(", "\|", "\&' or "\n" */
2404 if (reg_magic >= MAGIC_OFF
2405 && (at_start
2406 || reg_magic == MAGIC_ALL
2407 || prevchr == Magic('(')
2408 || prevchr == Magic('|')
2409 || prevchr == Magic('&')
2410 || prevchr == Magic('n')
2411 || (no_Magic(prevchr) == '('
2412 && prevprevchr == Magic('%'))))
2413 {
2414 curchr = Magic('^');
2415 at_start = TRUE;
2416 prev_at_start = FALSE;
2417 }
2418 break;
2419 case '$':
2420 /* '$' is only magic as the very last char and if it's in front of
2421 * either "\|", "\)", "\&", or "\n" */
2422 if (reg_magic >= MAGIC_OFF)
2423 {
2424 char_u *p = regparse + 1;
2425
2426 /* ignore \c \C \m and \M after '$' */
2427 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
2428 || p[1] == 'm' || p[1] == 'M' || p[1] == 'Z'))
2429 p += 2;
2430 if (p[0] == NUL
2431 || (p[0] == '\\'
2432 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
2433 || p[1] == 'n'))
2434 || reg_magic == MAGIC_ALL)
2435 curchr = Magic('$');
2436 }
2437 break;
2438 case '\\':
2439 {
2440 int c = regparse[1];
2441
2442 if (c == NUL)
2443 curchr = '\\'; /* trailing '\' */
2444 else if (
2445#ifdef EBCDIC
2446 vim_strchr(META, c)
2447#else
2448 c <= '~' && META_flags[c]
2449#endif
2450 )
2451 {
2452 /*
2453 * META contains everything that may be magic sometimes,
2454 * except ^ and $ ("\^" and "\$" are only magic after
2455 * "\v"). We now fetch the next character and toggle its
2456 * magicness. Therefore, \ is so meta-magic that it is
2457 * not in META.
2458 */
2459 curchr = -1;
2460 prev_at_start = at_start;
2461 at_start = FALSE; /* be able to say "/\*ptr" */
2462 ++regparse;
2463 peekchr();
2464 --regparse;
2465 curchr = toggle_Magic(curchr);
2466 }
2467 else if (vim_strchr(REGEXP_ABBR, c))
2468 {
2469 /*
2470 * Handle abbreviations, like "\t" for TAB -- webb
2471 */
2472 curchr = backslash_trans(c);
2473 }
2474 else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
2475 curchr = toggle_Magic(c);
2476 else
2477 {
2478 /*
2479 * Next character can never be (made) magic?
2480 * Then backslashing it won't do anything.
2481 */
2482#ifdef FEAT_MBYTE
2483 if (has_mbyte)
2484 curchr = (*mb_ptr2char)(regparse + 1);
2485 else
2486#endif
2487 curchr = c;
2488 }
2489 break;
2490 }
2491
2492#ifdef FEAT_MBYTE
2493 default:
2494 if (has_mbyte)
2495 curchr = (*mb_ptr2char)(regparse);
2496#endif
2497 }
2498 }
2499
2500 return curchr;
2501}
2502
2503/*
2504 * Eat one lexed character. Do this in a way that we can undo it.
2505 */
2506 static void
2507skipchr()
2508{
2509 /* peekchr() eats a backslash, do the same here */
2510 if (*regparse == '\\')
2511 prevchr_len = 1;
2512 else
2513 prevchr_len = 0;
2514 if (regparse[prevchr_len] != NUL)
2515 {
2516#ifdef FEAT_MBYTE
2517 if (has_mbyte)
2518 prevchr_len += (*mb_ptr2len_check)(regparse + prevchr_len);
2519 else
2520#endif
2521 ++prevchr_len;
2522 }
2523 regparse += prevchr_len;
2524 prev_at_start = at_start;
2525 at_start = FALSE;
2526 prevprevchr = prevchr;
2527 prevchr = curchr;
2528 curchr = nextchr; /* use previously unget char, or -1 */
2529 nextchr = -1;
2530}
2531
2532/*
2533 * Skip a character while keeping the value of prev_at_start for at_start.
2534 * prevchr and prevprevchr are also kept.
2535 */
2536 static void
2537skipchr_keepstart()
2538{
2539 int as = prev_at_start;
2540 int pr = prevchr;
2541 int prpr = prevprevchr;
2542
2543 skipchr();
2544 at_start = as;
2545 prevchr = pr;
2546 prevprevchr = prpr;
2547}
2548
/*
 * Get the next lexed character from the pattern and consume it: the result
 * of peekchr() is returned after skipchr() moved past it.
 */
    static int
getchr()
{
    int         lexed;

    lexed = peekchr();
    skipchr();
    return lexed;
}
2557
2558/*
2559 * put character back. Works only once!
2560 */
2561 static void
2562ungetchr()
2563{
2564 nextchr = curchr;
2565 curchr = prevchr;
2566 prevchr = prevprevchr;
2567 at_start = prev_at_start;
2568 prev_at_start = FALSE;
2569
2570 /* Backup regparse, so that it's at the same position as before the
2571 * getchr(). */
2572 regparse -= prevchr_len;
2573}
2574
2575/*
Bram Moolenaar7b0294c2004-10-11 10:16:09 +00002576 * Get and return the value of the hex string at the current position.
2577 * Return -1 if there is no valid hex number.
2578 * The position is updated:
Bram Moolenaarc0197e22004-09-13 20:26:32 +00002579 * blahblah\%x20asdf
2580 * before-^ ^-after
2581 * The parameter controls the maximum number of input characters. This will be
2582 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
2583 */
2584 static int
2585gethexchrs(maxinputlen)
2586 int maxinputlen;
2587{
2588 int nr = 0;
2589 int c;
2590 int i;
2591
2592 for (i = 0; i < maxinputlen; ++i)
2593 {
2594 c = regparse[0];
2595 if (!vim_isxdigit(c))
2596 break;
2597 nr <<= 4;
2598 nr |= hex2nr(c);
2599 ++regparse;
2600 }
2601
2602 if (i == 0)
2603 return -1;
2604 return nr;
2605}
2606
2607/*
2608 * get and return the value of the decimal string immediately after the
2609 * current position. Return -1 for invalid. Consumes all digits.
2610 */
2611 static int
2612getdecchrs()
2613{
2614 int nr = 0;
2615 int c;
2616 int i;
2617
2618 for (i = 0; ; ++i)
2619 {
2620 c = regparse[0];
2621 if (c < '0' || c > '9')
2622 break;
2623 nr *= 10;
2624 nr += c - '0';
2625 ++regparse;
2626 }
2627
2628 if (i == 0)
2629 return -1;
2630 return nr;
2631}
2632
2633/*
2634 * get and return the value of the octal string immediately after the current
2635 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
2636 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
2637 * treat 8 or 9 as recognised characters. Position is updated:
2638 * blahblah\%o210asdf
2639 * before-^ ^-after
2640 */
2641 static int
2642getoctchrs()
2643{
2644 int nr = 0;
2645 int c;
2646 int i;
2647
2648 for (i = 0; i < 3 && nr < 040; ++i)
2649 {
2650 c = regparse[0];
2651 if (c < '0' || c > '7')
2652 break;
2653 nr <<= 3;
2654 nr |= hex2nr(c);
2655 ++regparse;
2656 }
2657
2658 if (i == 0)
2659 return -1;
2660 return nr;
2661}
2662
2663/*
2664 * Get a number after a backslash that is inside [].
2665 * When nothing is recognized return a backslash.
2666 */
2667 static int
2668coll_get_char()
2669{
2670 int nr = -1;
2671
2672 switch (*regparse++)
2673 {
2674 case 'd': nr = getdecchrs(); break;
2675 case 'o': nr = getoctchrs(); break;
2676 case 'x': nr = gethexchrs(2); break;
2677 case 'u': nr = gethexchrs(4); break;
2678 case 'U': nr = gethexchrs(8); break;
2679 }
2680 if (nr < 0)
2681 {
2682 /* If getting the number fails be backwards compatible: the character
2683 * is a backslash. */
2684 --regparse;
2685 nr = '\\';
2686 }
2687 return nr;
2688}
2689
2690/*
Bram Moolenaar071d4272004-06-13 20:20:40 +00002691 * read_limits - Read two integers to be taken as a minimum and maximum.
2692 * If the first character is '-', then the range is reversed.
2693 * Should end with 'end'. If minval is missing, zero is default, if maxval is
2694 * missing, a very big number is the default.
2695 */
2696 static int
2697read_limits(minval, maxval)
2698 long *minval;
2699 long *maxval;
2700{
2701 int reverse = FALSE;
2702 char_u *first_char;
2703 long tmp;
2704
2705 if (*regparse == '-')
2706 {
2707 /* Starts with '-', so reverse the range later */
2708 regparse++;
2709 reverse = TRUE;
2710 }
2711 first_char = regparse;
2712 *minval = getdigits(&regparse);
2713 if (*regparse == ',') /* There is a comma */
2714 {
2715 if (vim_isdigit(*++regparse))
2716 *maxval = getdigits(&regparse);
2717 else
2718 *maxval = MAX_LIMIT;
2719 }
2720 else if (VIM_ISDIGIT(*first_char))
2721 *maxval = *minval; /* It was \{n} or \{-n} */
2722 else
2723 *maxval = MAX_LIMIT; /* It was \{} or \{-} */
2724 if (*regparse == '\\')
2725 regparse++; /* Allow either \{...} or \{...\} */
2726 if (*regparse != '}' || (*maxval == 0 && *minval == 0))
2727 {
2728 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
2729 reg_magic == MAGIC_ALL ? "" : "\\");
2730 EMSG_RET_FAIL(IObuff);
2731 }
2732
2733 /*
2734 * Reverse the range if there was a '-', or make sure it is in the right
2735 * order otherwise.
2736 */
2737 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval))
2738 {
2739 tmp = *minval;
2740 *minval = *maxval;
2741 *maxval = tmp;
2742 }
2743 skipchr(); /* let's be friends with the lexer again */
2744 return OK;
2745}
2746
2747/*
2748 * vim_regexec and friends
2749 */
2750
2751/*
2752 * Global work variables for vim_regexec().
2753 */
2754
2755/* The current match-position is remembered with these variables: */
2756static linenr_T reglnum; /* line number, relative to first line */
2757static char_u *regline; /* start of current line */
2758static char_u *reginput; /* current input, points into "regline" */
2759
2760static int need_clear_subexpr; /* subexpressions still need to be
2761 * cleared */
2762#ifdef FEAT_SYN_HL
2763static int need_clear_zsubexpr = FALSE; /* extmatch subexpressions
2764 * still need to be cleared */
2765#endif
2766
2767static int out_of_stack; /* TRUE when ran out of stack space */
2768
2769/*
2770 * Structure used to save the current input state, when it needs to be
2771 * restored after trying a match. Used by reg_save() and reg_restore().
2772 */
2773typedef struct
2774{
2775 union
2776 {
2777 char_u *ptr; /* reginput pointer, for single-line regexp */
2778 lpos_T pos; /* reginput pos, for multi-line regexp */
2779 } rs_u;
2780} regsave_T;
2781
2782/* struct to save start/end pointer/position in for \(\) */
2783typedef struct
2784{
2785 union
2786 {
2787 char_u *ptr;
2788 lpos_T pos;
2789 } se_u;
2790} save_se_T;
2791
2792static char_u *reg_getline __ARGS((linenr_T lnum));
2793static long vim_regexec_both __ARGS((char_u *line, colnr_T col));
2794static long regtry __ARGS((regprog_T *prog, colnr_T col));
2795static void cleanup_subexpr __ARGS((void));
2796#ifdef FEAT_SYN_HL
2797static void cleanup_zsubexpr __ARGS((void));
2798#endif
2799static void reg_nextline __ARGS((void));
2800static void reg_save __ARGS((regsave_T *save));
2801static void reg_restore __ARGS((regsave_T *save));
2802static int reg_save_equal __ARGS((regsave_T *save));
2803static void save_se_multi __ARGS((save_se_T *savep, lpos_T *posp));
2804static void save_se_one __ARGS((save_se_T *savep, char_u **pp));
2805
2806/* Save the sub-expressions before attempting a match. */
2807#define save_se(savep, posp, pp) \
2808 REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))
2809
2810/* After a failed match restore the sub-expressions. */
2811#define restore_se(savep, posp, pp) { \
2812 if (REG_MULTI) \
2813 *(posp) = (savep)->se_u.pos; \
2814 else \
2815 *(pp) = (savep)->se_u.ptr; }
2816
2817static int re_num_cmp __ARGS((long_u val, char_u *scan));
2818static int regmatch __ARGS((char_u *prog));
2819static int regrepeat __ARGS((char_u *p, long maxcount));
2820
2821#ifdef DEBUG
2822int regnarrate = 0;
2823#endif
2824
2825/*
2826 * Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
2827 * Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
2828 * contains '\c' or '\C' the value is overruled.
2829 */
2830static int ireg_ic;
2831
2832#ifdef FEAT_MBYTE
2833/*
2834 * Similar to ireg_ic, but only for 'combining' characters. Set with \Z flag
2835 * in the regexp. Defaults to false, always.
2836 */
2837static int ireg_icombine;
2838#endif
2839
2840/*
2841 * Sometimes need to save a copy of a line. Since alloc()/free() is very
2842 * slow, we keep one allocated piece of memory and only re-allocate it when
2843 * it's too small. It's freed in vim_regexec_both() when finished.
2844 */
2845static char_u *reg_tofree;
2846static unsigned reg_tofreelen;
2847
2848/*
2849 * These variables are set when executing a regexp to speed up the execution.
2850 * Which ones are set depends on whether a single-line or multi-line match is
2851 * done:
2852 * single-line multi-line
2853 * reg_match &regmatch_T NULL
2854 * reg_mmatch NULL &regmmatch_T
2855 * reg_startp reg_match->startp <invalid>
2856 * reg_endp reg_match->endp <invalid>
2857 * reg_startpos <invalid> reg_mmatch->startpos
2858 * reg_endpos <invalid> reg_mmatch->endpos
2859 * reg_win NULL window in which to search
2860 * reg_buf <invalid> buffer in which to search
2861 * reg_firstlnum <invalid> first line in which to search
2862 * reg_maxline 0 last line nr
2863 * reg_line_lbr FALSE or TRUE FALSE
2864 */
2865static regmatch_T *reg_match;
2866static regmmatch_T *reg_mmatch;
2867static char_u **reg_startp = NULL;
2868static char_u **reg_endp = NULL;
2869static lpos_T *reg_startpos = NULL;
2870static lpos_T *reg_endpos = NULL;
2871static win_T *reg_win;
2872static buf_T *reg_buf;
2873static linenr_T reg_firstlnum;
2874static linenr_T reg_maxline;
2875static int reg_line_lbr; /* "\n" in string is line break */
2876
2877/*
2878 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
2879 */
2880 static char_u *
2881reg_getline(lnum)
2882 linenr_T lnum;
2883{
2884 /* when looking behind for a match/no-match lnum is negative. But we
2885 * can't go before line 1 */
2886 if (reg_firstlnum + lnum < 1)
2887 return NULL;
2888 return ml_get_buf(reg_buf, reg_firstlnum + lnum, FALSE);
2889}
2890
2891static regsave_T behind_pos;
2892
2893#ifdef FEAT_SYN_HL
2894static char_u *reg_startzp[NSUBEXP]; /* Workspace to mark beginning */
2895static char_u *reg_endzp[NSUBEXP]; /* and end of \z(...\) matches */
2896static lpos_T reg_startzpos[NSUBEXP]; /* idem, beginning pos */
2897static lpos_T reg_endzpos[NSUBEXP]; /* idem, end pos */
2898#endif
2899
2900/* TRUE if using multi-line regexp. */
2901#define REG_MULTI (reg_match == NULL)
2902
2903/*
2904 * Match a regexp against a string.
2905 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2906 * Uses curbuf for line count and 'iskeyword'.
2907 *
2908 * Return TRUE if there is a match, FALSE if not.
2909 */
2910 int
2911vim_regexec(rmp, line, col)
2912 regmatch_T *rmp;
2913 char_u *line; /* string to match against */
2914 colnr_T col; /* column to start looking for match */
2915{
2916 reg_match = rmp;
2917 reg_mmatch = NULL;
2918 reg_maxline = 0;
2919 reg_line_lbr = FALSE;
2920 reg_win = NULL;
2921 ireg_ic = rmp->rm_ic;
2922#ifdef FEAT_MBYTE
2923 ireg_icombine = FALSE;
2924#endif
2925 return (vim_regexec_both(line, col) != 0);
2926}
2927
#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) \
        || defined(FIND_REPLACE_DIALOG) || defined(PROTO)
/*
 * Like vim_regexec(), but consider a "\n" in "line" to be a line break.
 *
 * Return TRUE if there is a match, FALSE if not.
 */
    int
vim_regexec_nl(rmp, line, col)
    regmatch_T  *rmp;
    char_u      *line;  /* string to match against */
    colnr_T     col;    /* column to start looking for match */
{
    long        nlines;

    /* single-line matching: no regmmatch_T and no window */
    reg_match = rmp;
    reg_mmatch = NULL;
    reg_win = NULL;
    reg_maxline = 0;
    reg_line_lbr = TRUE;        /* the only difference with vim_regexec() */

    ireg_ic = rmp->rm_ic;
#ifdef FEAT_MBYTE
    ireg_icombine = FALSE;
#endif

    nlines = vim_regexec_both(line, col);
    return (nlines != 0);
}
#endif
2951
2952/*
2953 * Match a regexp against multiple lines.
2954 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
2955 * Uses curbuf for line count and 'iskeyword'.
2956 *
2957 * Return zero if there is no match. Return number of lines contained in the
2958 * match otherwise.
2959 */
2960 long
2961vim_regexec_multi(rmp, win, buf, lnum, col)
2962 regmmatch_T *rmp;
2963 win_T *win; /* window in which to search or NULL */
2964 buf_T *buf; /* buffer in which to search */
2965 linenr_T lnum; /* nr of line to start looking for match */
2966 colnr_T col; /* column to start looking for match */
2967{
2968 long r;
2969 buf_T *save_curbuf = curbuf;
2970
2971 reg_match = NULL;
2972 reg_mmatch = rmp;
2973 reg_buf = buf;
2974 reg_win = win;
2975 reg_firstlnum = lnum;
2976 reg_maxline = reg_buf->b_ml.ml_line_count - lnum;
2977 reg_line_lbr = FALSE;
2978 ireg_ic = rmp->rmm_ic;
2979#ifdef FEAT_MBYTE
2980 ireg_icombine = FALSE;
2981#endif
2982
2983 /* Need to switch to buffer "buf" to make vim_iswordc() work. */
2984 curbuf = buf;
2985 r = vim_regexec_both(NULL, col);
2986 curbuf = save_curbuf;
2987
2988 return r;
2989}
2990
2991/*
2992 * Match a regexp against a string ("line" points to the string) or multiple
2993 * lines ("line" is NULL, use reg_getline()).
2994 */
2995#ifdef HAVE_SETJMP_H
2996 static long
2997vim_regexec_both(line_arg, col_arg)
2998 char_u *line_arg;
2999 colnr_T col_arg; /* column to start looking for match */
3000#else
3001 static long
3002vim_regexec_both(line, col)
3003 char_u *line;
3004 colnr_T col; /* column to start looking for match */
3005#endif
3006{
3007 regprog_T *prog;
3008 char_u *s;
3009 long retval;
3010#ifdef HAVE_SETJMP_H
3011 char_u *line;
3012 colnr_T col;
3013#endif
3014
3015 reg_tofree = NULL;
3016
3017#ifdef HAVE_TRY_EXCEPT
3018 __try
3019 {
3020#endif
3021
3022#ifdef HAVE_SETJMP_H
3023 /*
3024 * Matching with a regexp may cause a very deep recursive call of
3025 * regmatch(). Vim will crash when running out of stack space. Catch
3026 * this here if the system supports it.
3027 */
3028 mch_startjmp();
3029 if (SETJMP(lc_jump_env) != 0)
3030 {
3031 mch_didjmp();
3032# ifdef SIGHASARG
3033 if (lc_signal != SIGINT)
3034# endif
3035 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3036 retval = 0L;
3037 goto theend;
3038 }
3039
3040 /* Trick to avoid "might be clobbered by `longjmp'" warning from gcc. */
3041 line = line_arg;
3042 col = col_arg;
3043#endif
3044 retval = 0L;
3045
3046 if (REG_MULTI)
3047 {
3048 prog = reg_mmatch->regprog;
3049 line = reg_getline((linenr_T)0);
3050 reg_startpos = reg_mmatch->startpos;
3051 reg_endpos = reg_mmatch->endpos;
3052 }
3053 else
3054 {
3055 prog = reg_match->regprog;
3056 reg_startp = reg_match->startp;
3057 reg_endp = reg_match->endp;
3058 }
3059
3060 /* Be paranoid... */
3061 if (prog == NULL || line == NULL)
3062 {
3063 EMSG(_(e_null));
3064 goto theend;
3065 }
3066
3067 /* Check validity of program. */
3068 if (prog_magic_wrong())
3069 goto theend;
3070
3071 /* If pattern contains "\c" or "\C": overrule value of ireg_ic */
3072 if (prog->regflags & RF_ICASE)
3073 ireg_ic = TRUE;
3074 else if (prog->regflags & RF_NOICASE)
3075 ireg_ic = FALSE;
3076
3077#ifdef FEAT_MBYTE
3078 /* If pattern contains "\Z" overrule value of ireg_icombine */
3079 if (prog->regflags & RF_ICOMBINE)
3080 ireg_icombine = TRUE;
3081#endif
3082
3083 /* If there is a "must appear" string, look for it. */
3084 if (prog->regmust != NULL)
3085 {
3086 int c;
3087
3088#ifdef FEAT_MBYTE
3089 if (has_mbyte)
3090 c = (*mb_ptr2char)(prog->regmust);
3091 else
3092#endif
3093 c = *prog->regmust;
3094 s = line + col;
3095 while ((s = cstrchr(s, c)) != NULL)
3096 {
3097 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0)
3098 break; /* Found it. */
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00003099 mb_ptr_adv(s);
Bram Moolenaar071d4272004-06-13 20:20:40 +00003100 }
3101 if (s == NULL) /* Not present. */
3102 goto theend;
3103 }
3104
3105 regline = line;
3106 reglnum = 0;
3107 out_of_stack = FALSE;
3108
3109 /* Simplest case: Anchored match need be tried only once. */
3110 if (prog->reganch)
3111 {
3112 int c;
3113
3114#ifdef FEAT_MBYTE
3115 if (has_mbyte)
3116 c = (*mb_ptr2char)(regline + col);
3117 else
3118#endif
3119 c = regline[col];
3120 if (prog->regstart == NUL
3121 || prog->regstart == c
3122 || (ireg_ic && ((
3123#ifdef FEAT_MBYTE
3124 (enc_utf8 && utf_fold(prog->regstart) == utf_fold(c)))
3125 || (c < 255 && prog->regstart < 255 &&
3126#endif
3127 TOLOWER_LOC(prog->regstart) == TOLOWER_LOC(c)))))
3128 retval = regtry(prog, col);
3129 else
3130 retval = 0;
3131 }
3132 else
3133 {
3134 /* Messy cases: unanchored match. */
3135 while (!got_int && !out_of_stack)
3136 {
3137 if (prog->regstart != NUL)
3138 {
3139 /* Skip until the char we know it must start with. */
3140 s = cstrchr(regline + col, prog->regstart);
3141 if (s == NULL)
3142 {
3143 retval = 0;
3144 break;
3145 }
3146 col = (int)(s - regline);
3147 }
3148
3149 retval = regtry(prog, col);
3150 if (retval > 0)
3151 break;
3152
3153 /* if not currently on the first line, get it again */
3154 if (reglnum != 0)
3155 {
3156 regline = reg_getline((linenr_T)0);
3157 reglnum = 0;
3158 }
3159 if (regline[col] == NUL)
3160 break;
3161#ifdef FEAT_MBYTE
3162 if (has_mbyte)
3163 col += (*mb_ptr2len_check)(regline + col);
3164 else
3165#endif
3166 ++col;
3167 }
3168 }
3169
3170 if (out_of_stack)
3171 EMSG(_("E363: pattern caused out-of-stack error"));
3172
3173#ifdef HAVE_TRY_EXCEPT
3174 }
3175 __except(EXCEPTION_EXECUTE_HANDLER)
3176 {
3177 if (GetExceptionCode() == EXCEPTION_STACK_OVERFLOW)
3178 {
3179 RESETSTKOFLW();
3180 EMSG(_("E363: pattern caused out-of-stack error"));
3181 }
3182 else
3183 EMSG(_("E361: Crash intercepted; regexp too complex?"));
3184 retval = 0L;
3185 }
3186#endif
3187
3188theend:
3189 /* Didn't find a match. */
3190 vim_free(reg_tofree);
3191#ifdef HAVE_SETJMP_H
3192 mch_endjmp();
3193#endif
3194 return retval;
3195}
3196
3197#ifdef FEAT_SYN_HL
3198static reg_extmatch_T *make_extmatch __ARGS((void));
3199
3200/*
3201 * Create a new extmatch and mark it as referenced once.
3202 */
3203 static reg_extmatch_T *
3204make_extmatch()
3205{
3206 reg_extmatch_T *em;
3207
3208 em = (reg_extmatch_T *)alloc_clear((unsigned)sizeof(reg_extmatch_T));
3209 if (em != NULL)
3210 em->refcnt = 1;
3211 return em;
3212}
3213
3214/*
3215 * Add a reference to an extmatch.
3216 */
3217 reg_extmatch_T *
3218ref_extmatch(em)
3219 reg_extmatch_T *em;
3220{
3221 if (em != NULL)
3222 em->refcnt++;
3223 return em;
3224}
3225
3226/*
3227 * Remove a reference to an extmatch. If there are no references left, free
3228 * the info.
3229 */
3230 void
3231unref_extmatch(em)
3232 reg_extmatch_T *em;
3233{
3234 int i;
3235
3236 if (em != NULL && --em->refcnt <= 0)
3237 {
3238 for (i = 0; i < NSUBEXP; ++i)
3239 vim_free(em->matches[i]);
3240 vim_free(em);
3241 }
3242}
3243#endif
3244
3245/*
3246 * regtry - try match of "prog" with at regline["col"].
3247 * Returns 0 for failure, number of lines contained in the match otherwise.
3248 */
3249 static long
3250regtry(prog, col)
3251 regprog_T *prog;
3252 colnr_T col;
3253{
3254 reginput = regline + col;
3255 need_clear_subexpr = TRUE;
3256#ifdef FEAT_SYN_HL
3257 /* Clear the external match subpointers if necessary. */
3258 if (prog->reghasz == REX_SET)
3259 need_clear_zsubexpr = TRUE;
3260#endif
3261
3262 if (regmatch(prog->program + 1))
3263 {
3264 cleanup_subexpr();
3265 if (REG_MULTI)
3266 {
3267 if (reg_startpos[0].lnum < 0)
3268 {
3269 reg_startpos[0].lnum = 0;
3270 reg_startpos[0].col = col;
3271 }
3272 if (reg_endpos[0].lnum < 0)
3273 {
3274 reg_endpos[0].lnum = reglnum;
3275 reg_endpos[0].col = (int)(reginput - regline);
3276 }
3277 else
3278 /* Use line number of "\ze". */
3279 reglnum = reg_endpos[0].lnum;
3280 }
3281 else
3282 {
3283 if (reg_startp[0] == NULL)
3284 reg_startp[0] = regline + col;
3285 if (reg_endp[0] == NULL)
3286 reg_endp[0] = reginput;
3287 }
3288#ifdef FEAT_SYN_HL
3289 /* Package any found \z(...\) matches for export. Default is none. */
3290 unref_extmatch(re_extmatch_out);
3291 re_extmatch_out = NULL;
3292
3293 if (prog->reghasz == REX_SET)
3294 {
3295 int i;
3296
3297 cleanup_zsubexpr();
3298 re_extmatch_out = make_extmatch();
3299 for (i = 0; i < NSUBEXP; i++)
3300 {
3301 if (REG_MULTI)
3302 {
3303 /* Only accept single line matches. */
3304 if (reg_startzpos[i].lnum >= 0
3305 && reg_endzpos[i].lnum == reg_startzpos[i].lnum)
3306 re_extmatch_out->matches[i] =
3307 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3308 + reg_startzpos[i].col,
3309 reg_endzpos[i].col - reg_startzpos[i].col);
3310 }
3311 else
3312 {
3313 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3314 re_extmatch_out->matches[i] =
3315 vim_strnsave(reg_startzp[i],
3316 (int)(reg_endzp[i] - reg_startzp[i]));
3317 }
3318 }
3319 }
3320#endif
3321 return 1 + reglnum;
3322 }
3323 return 0;
3324}
3325
3326#ifdef FEAT_MBYTE
Bram Moolenaar071d4272004-06-13 20:20:40 +00003327static int reg_prev_class __ARGS((void));
3328
Bram Moolenaar071d4272004-06-13 20:20:40 +00003329/*
3330 * Get class of previous character.
3331 */
3332 static int
3333reg_prev_class()
3334{
3335 if (reginput > regline)
3336 return mb_get_class(reginput - 1
3337 - (*mb_head_off)(regline, reginput - 1));
3338 return -1;
3339}
3340
Bram Moolenaar071d4272004-06-13 20:20:40 +00003341#endif
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00003342#define ADVANCE_REGINPUT() mb_ptr_adv(reginput)
Bram Moolenaar071d4272004-06-13 20:20:40 +00003343
3344/*
3345 * The arguments from BRACE_LIMITS are stored here. They are actually local
3346 * to regmatch(), but they are here to reduce the amount of stack space used
3347 * (it can be called recursively many times).
3348 */
3349static long bl_minval;
3350static long bl_maxval;
3351
3352/*
3353 * regmatch - main matching routine
3354 *
3355 * Conceptually the strategy is simple: Check to see whether the current
3356 * node matches, call self recursively to see whether the rest matches,
3357 * and then act accordingly. In practice we make some effort to avoid
3358 * recursion, in particular by going through "ordinary" nodes (that don't
3359 * need to know whether the rest of the match failed) by a loop instead of
3360 * by recursion.
3361 *
3362 * Returns TRUE when there is a match. Leaves reginput and reglnum just after
3363 * the last matched character.
3364 * Returns FALSE when there is no match. Leaves reginput and reglnum in an
3365 * undefined state!
3366 */
3367 static int
3368regmatch(scan)
3369 char_u *scan; /* Current node. */
3370{
3371 char_u *next; /* Next node. */
3372 int op;
3373 int c;
3374
3375#ifdef HAVE_GETRLIMIT
3376 /* Check if we are running out of stack space. Could be caused by
3377 * recursively calling ourselves. */
3378 if (out_of_stack || mch_stackcheck((char *)&op) == FAIL)
3379 {
3380 out_of_stack = TRUE;
3381 return FALSE;
3382 }
3383#endif
3384
3385 /* Some patterns my cause a long time to match, even though they are not
3386 * illegal. E.g., "\([a-z]\+\)\+Q". Allow breaking them with CTRL-C. */
3387 fast_breakcheck();
3388
3389#ifdef DEBUG
3390 if (scan != NULL && regnarrate)
3391 {
3392 mch_errmsg(regprop(scan));
3393 mch_errmsg("(\n");
3394 }
3395#endif
3396 while (scan != NULL)
3397 {
3398 if (got_int || out_of_stack)
3399 return FALSE;
3400#ifdef DEBUG
3401 if (regnarrate)
3402 {
3403 mch_errmsg(regprop(scan));
3404 mch_errmsg("...\n");
3405# ifdef FEAT_SYN_HL
3406 if (re_extmatch_in != NULL)
3407 {
3408 int i;
3409
3410 mch_errmsg(_("External submatches:\n"));
3411 for (i = 0; i < NSUBEXP; i++)
3412 {
3413 mch_errmsg(" \"");
3414 if (re_extmatch_in->matches[i] != NULL)
3415 mch_errmsg(re_extmatch_in->matches[i]);
3416 mch_errmsg("\"\n");
3417 }
3418 }
3419# endif
3420 }
3421#endif
3422 next = regnext(scan);
3423
3424 op = OP(scan);
3425 /* Check for character class with NL added. */
3426 if (WITH_NL(op) && *reginput == NUL && reglnum < reg_maxline)
3427 {
3428 reg_nextline();
3429 }
3430 else if (reg_line_lbr && WITH_NL(op) && *reginput == '\n')
3431 {
3432 ADVANCE_REGINPUT();
3433 }
3434 else
3435 {
3436 if (WITH_NL(op))
3437 op -= ADD_NL;
3438#ifdef FEAT_MBYTE
3439 if (has_mbyte)
3440 c = (*mb_ptr2char)(reginput);
3441 else
3442#endif
3443 c = *reginput;
3444 switch (op)
3445 {
3446 case BOL:
3447 if (reginput != regline)
3448 return FALSE;
3449 break;
3450
3451 case EOL:
3452 if (c != NUL)
3453 return FALSE;
3454 break;
3455
3456 case RE_BOF:
3457 /* Passing -1 to the getline() function provided for the search
3458 * should always return NULL if the current line is the first
3459 * line of the file. */
3460 if (reglnum != 0 || reginput != regline
3461 || (REG_MULTI && reg_getline((linenr_T)-1) != NULL))
3462 return FALSE;
3463 break;
3464
3465 case RE_EOF:
3466 if (reglnum != reg_maxline || c != NUL)
3467 return FALSE;
3468 break;
3469
3470 case CURSOR:
3471 /* Check if the buffer is in a window and compare the
3472 * reg_win->w_cursor position to the match position. */
3473 if (reg_win == NULL
3474 || (reglnum + reg_firstlnum != reg_win->w_cursor.lnum)
3475 || ((colnr_T)(reginput - regline) != reg_win->w_cursor.col))
3476 return FALSE;
3477 break;
3478
3479 case RE_LNUM:
3480 if (!REG_MULTI || !re_num_cmp((long_u)(reglnum + reg_firstlnum),
3481 scan))
3482 return FALSE;
3483 break;
3484
3485 case RE_COL:
3486 if (!re_num_cmp((long_u)(reginput - regline) + 1, scan))
3487 return FALSE;
3488 break;
3489
3490 case RE_VCOL:
3491 if (!re_num_cmp((long_u)win_linetabsize(
3492 reg_win == NULL ? curwin : reg_win,
3493 regline, (colnr_T)(reginput - regline)) + 1, scan))
3494 return FALSE;
3495 break;
3496
3497 case BOW: /* \<word; reginput points to w */
3498 if (c == NUL) /* Can't match at end of line */
3499 return FALSE;
3500#ifdef FEAT_MBYTE
3501 if (has_mbyte)
3502 {
3503 int this_class;
3504
3505 /* Get class of current and previous char (if it exists). */
3506 this_class = mb_get_class(reginput);
3507 if (this_class <= 1)
3508 return FALSE; /* not on a word at all */
3509 if (reg_prev_class() == this_class)
3510 return FALSE; /* previous char is in same word */
3511 }
3512#endif
3513 else
3514 {
3515 if (!vim_iswordc(c)
3516 || (reginput > regline && vim_iswordc(reginput[-1])))
3517 return FALSE;
3518 }
3519 break;
3520
3521 case EOW: /* word\>; reginput points after d */
3522 if (reginput == regline) /* Can't match at start of line */
3523 return FALSE;
3524#ifdef FEAT_MBYTE
3525 if (has_mbyte)
3526 {
3527 int this_class, prev_class;
3528
3529 /* Get class of current and previous char (if it exists). */
3530 this_class = mb_get_class(reginput);
3531 prev_class = reg_prev_class();
3532 if (this_class == prev_class)
3533 return FALSE;
3534 if (prev_class == 0 || prev_class == 1)
3535 return FALSE;
3536 }
3537 else
3538#endif
3539 {
3540 if (!vim_iswordc(reginput[-1]))
3541 return FALSE;
3542 if (reginput[0] != NUL && vim_iswordc(c))
3543 return FALSE;
3544 }
3545 break; /* Matched with EOW */
3546
3547 case ANY:
3548 if (c == NUL)
3549 return FALSE;
3550 ADVANCE_REGINPUT();
3551 break;
3552
3553 case IDENT:
3554 if (!vim_isIDc(c))
3555 return FALSE;
3556 ADVANCE_REGINPUT();
3557 break;
3558
3559 case SIDENT:
3560 if (VIM_ISDIGIT(*reginput) || !vim_isIDc(c))
3561 return FALSE;
3562 ADVANCE_REGINPUT();
3563 break;
3564
3565 case KWORD:
3566 if (!vim_iswordp(reginput))
3567 return FALSE;
3568 ADVANCE_REGINPUT();
3569 break;
3570
3571 case SKWORD:
3572 if (VIM_ISDIGIT(*reginput) || !vim_iswordp(reginput))
3573 return FALSE;
3574 ADVANCE_REGINPUT();
3575 break;
3576
3577 case FNAME:
3578 if (!vim_isfilec(c))
3579 return FALSE;
3580 ADVANCE_REGINPUT();
3581 break;
3582
3583 case SFNAME:
3584 if (VIM_ISDIGIT(*reginput) || !vim_isfilec(c))
3585 return FALSE;
3586 ADVANCE_REGINPUT();
3587 break;
3588
3589 case PRINT:
3590 if (ptr2cells(reginput) != 1)
3591 return FALSE;
3592 ADVANCE_REGINPUT();
3593 break;
3594
3595 case SPRINT:
3596 if (VIM_ISDIGIT(*reginput) || ptr2cells(reginput) != 1)
3597 return FALSE;
3598 ADVANCE_REGINPUT();
3599 break;
3600
3601 case WHITE:
3602 if (!vim_iswhite(c))
3603 return FALSE;
3604 ADVANCE_REGINPUT();
3605 break;
3606
3607 case NWHITE:
3608 if (c == NUL || vim_iswhite(c))
3609 return FALSE;
3610 ADVANCE_REGINPUT();
3611 break;
3612
3613 case DIGIT:
3614 if (!ri_digit(c))
3615 return FALSE;
3616 ADVANCE_REGINPUT();
3617 break;
3618
3619 case NDIGIT:
3620 if (c == NUL || ri_digit(c))
3621 return FALSE;
3622 ADVANCE_REGINPUT();
3623 break;
3624
3625 case HEX:
3626 if (!ri_hex(c))
3627 return FALSE;
3628 ADVANCE_REGINPUT();
3629 break;
3630
3631 case NHEX:
3632 if (c == NUL || ri_hex(c))
3633 return FALSE;
3634 ADVANCE_REGINPUT();
3635 break;
3636
3637 case OCTAL:
3638 if (!ri_octal(c))
3639 return FALSE;
3640 ADVANCE_REGINPUT();
3641 break;
3642
3643 case NOCTAL:
3644 if (c == NUL || ri_octal(c))
3645 return FALSE;
3646 ADVANCE_REGINPUT();
3647 break;
3648
3649 case WORD:
3650 if (!ri_word(c))
3651 return FALSE;
3652 ADVANCE_REGINPUT();
3653 break;
3654
3655 case NWORD:
3656 if (c == NUL || ri_word(c))
3657 return FALSE;
3658 ADVANCE_REGINPUT();
3659 break;
3660
3661 case HEAD:
3662 if (!ri_head(c))
3663 return FALSE;
3664 ADVANCE_REGINPUT();
3665 break;
3666
3667 case NHEAD:
3668 if (c == NUL || ri_head(c))
3669 return FALSE;
3670 ADVANCE_REGINPUT();
3671 break;
3672
3673 case ALPHA:
3674 if (!ri_alpha(c))
3675 return FALSE;
3676 ADVANCE_REGINPUT();
3677 break;
3678
3679 case NALPHA:
3680 if (c == NUL || ri_alpha(c))
3681 return FALSE;
3682 ADVANCE_REGINPUT();
3683 break;
3684
3685 case LOWER:
3686 if (!ri_lower(c))
3687 return FALSE;
3688 ADVANCE_REGINPUT();
3689 break;
3690
3691 case NLOWER:
3692 if (c == NUL || ri_lower(c))
3693 return FALSE;
3694 ADVANCE_REGINPUT();
3695 break;
3696
3697 case UPPER:
3698 if (!ri_upper(c))
3699 return FALSE;
3700 ADVANCE_REGINPUT();
3701 break;
3702
3703 case NUPPER:
3704 if (c == NUL || ri_upper(c))
3705 return FALSE;
3706 ADVANCE_REGINPUT();
3707 break;
3708
3709 case EXACTLY:
3710 {
3711 int len;
3712 char_u *opnd;
3713
3714 opnd = OPERAND(scan);
3715 /* Inline the first byte, for speed. */
3716 if (*opnd != *reginput
3717 && (!ireg_ic || (
3718#ifdef FEAT_MBYTE
3719 !enc_utf8 &&
3720#endif
3721 TOLOWER_LOC(*opnd) != TOLOWER_LOC(*reginput))))
3722 return FALSE;
3723 if (*opnd == NUL)
3724 {
3725 /* match empty string always works; happens when "~" is
3726 * empty. */
3727 }
3728 else if (opnd[1] == NUL
3729#ifdef FEAT_MBYTE
3730 && !(enc_utf8 && ireg_ic)
3731#endif
3732 )
3733 ++reginput; /* matched a single char */
3734 else
3735 {
3736 len = (int)STRLEN(opnd);
3737 /* Need to match first byte again for multi-byte. */
3738 if (cstrncmp(opnd, reginput, &len) != 0)
3739 return FALSE;
3740#ifdef FEAT_MBYTE
3741 /* Check for following composing character. */
3742 if (enc_utf8 && UTF_COMPOSINGLIKE(reginput, reginput + len))
3743 {
3744 /* raaron: This code makes a composing character get
3745 * ignored, which is the correct behavior (sometimes)
3746 * for voweled Hebrew texts. */
3747 if (!ireg_icombine)
3748 return FALSE;
3749 }
3750 else
3751#endif
3752 reginput += len;
3753 }
3754 }
3755 break;
3756
3757 case ANYOF:
3758 case ANYBUT:
3759 if (c == NUL)
3760 return FALSE;
3761 if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
3762 return FALSE;
3763 ADVANCE_REGINPUT();
3764 break;
3765
3766#ifdef FEAT_MBYTE
3767 case MULTIBYTECODE:
3768 if (has_mbyte)
3769 {
3770 int i, len;
3771 char_u *opnd;
3772
3773 opnd = OPERAND(scan);
3774 /* Safety check (just in case 'encoding' was changed since
3775 * compiling the program). */
3776 if ((len = (*mb_ptr2len_check)(opnd)) < 2)
3777 return FALSE;
3778 for (i = 0; i < len; ++i)
3779 if (opnd[i] != reginput[i])
3780 return FALSE;
3781 reginput += len;
3782 }
3783 else
3784 return FALSE;
3785 break;
3786#endif
3787
3788 case NOTHING:
3789 break;
3790
3791 case BACK:
3792 break;
3793
3794 case MOPEN + 0: /* Match start: \zs */
3795 case MOPEN + 1: /* \( */
3796 case MOPEN + 2:
3797 case MOPEN + 3:
3798 case MOPEN + 4:
3799 case MOPEN + 5:
3800 case MOPEN + 6:
3801 case MOPEN + 7:
3802 case MOPEN + 8:
3803 case MOPEN + 9:
3804 {
3805 int no;
3806 save_se_T save;
3807
3808 no = op - MOPEN;
3809 cleanup_subexpr();
3810 save_se(&save, &reg_startpos[no], &reg_startp[no]);
3811
3812 if (regmatch(next))
3813 return TRUE;
3814
3815 restore_se(&save, &reg_startpos[no], &reg_startp[no]);
3816 return FALSE;
3817 }
3818 /* break; Not Reached */
3819
3820 case NOPEN: /* \%( */
3821 case NCLOSE: /* \) after \%( */
3822 if (regmatch(next))
3823 return TRUE;
3824 return FALSE;
3825 /* break; Not Reached */
3826
3827#ifdef FEAT_SYN_HL
3828 case ZOPEN + 1:
3829 case ZOPEN + 2:
3830 case ZOPEN + 3:
3831 case ZOPEN + 4:
3832 case ZOPEN + 5:
3833 case ZOPEN + 6:
3834 case ZOPEN + 7:
3835 case ZOPEN + 8:
3836 case ZOPEN + 9:
3837 {
3838 int no;
3839 save_se_T save;
3840
3841 no = op - ZOPEN;
3842 cleanup_zsubexpr();
3843 save_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3844
3845 if (regmatch(next))
3846 return TRUE;
3847
3848 restore_se(&save, &reg_startzpos[no], &reg_startzp[no]);
3849 return FALSE;
3850 }
3851 /* break; Not Reached */
3852#endif
3853
3854 case MCLOSE + 0: /* Match end: \ze */
3855 case MCLOSE + 1: /* \) */
3856 case MCLOSE + 2:
3857 case MCLOSE + 3:
3858 case MCLOSE + 4:
3859 case MCLOSE + 5:
3860 case MCLOSE + 6:
3861 case MCLOSE + 7:
3862 case MCLOSE + 8:
3863 case MCLOSE + 9:
3864 {
3865 int no;
3866 save_se_T save;
3867
3868 no = op - MCLOSE;
3869 cleanup_subexpr();
3870 save_se(&save, &reg_endpos[no], &reg_endp[no]);
3871
3872 if (regmatch(next))
3873 return TRUE;
3874
3875 restore_se(&save, &reg_endpos[no], &reg_endp[no]);
3876 return FALSE;
3877 }
3878 /* break; Not Reached */
3879
3880#ifdef FEAT_SYN_HL
3881 case ZCLOSE + 1: /* \) after \z( */
3882 case ZCLOSE + 2:
3883 case ZCLOSE + 3:
3884 case ZCLOSE + 4:
3885 case ZCLOSE + 5:
3886 case ZCLOSE + 6:
3887 case ZCLOSE + 7:
3888 case ZCLOSE + 8:
3889 case ZCLOSE + 9:
3890 {
3891 int no;
3892 save_se_T save;
3893
3894 no = op - ZCLOSE;
3895 cleanup_zsubexpr();
3896 save_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3897
3898 if (regmatch(next))
3899 return TRUE;
3900
3901 restore_se(&save, &reg_endzpos[no], &reg_endzp[no]);
3902 return FALSE;
3903 }
3904 /* break; Not Reached */
3905#endif
3906
3907 case BACKREF + 1:
3908 case BACKREF + 2:
3909 case BACKREF + 3:
3910 case BACKREF + 4:
3911 case BACKREF + 5:
3912 case BACKREF + 6:
3913 case BACKREF + 7:
3914 case BACKREF + 8:
3915 case BACKREF + 9:
3916 {
3917 int no;
3918 int len;
3919 linenr_T clnum;
3920 colnr_T ccol;
3921 char_u *p;
3922
3923 no = op - BACKREF;
3924 cleanup_subexpr();
3925 if (!REG_MULTI) /* Single-line regexp */
3926 {
3927 if (reg_endp[no] == NULL)
3928 {
3929 /* Backref was not set: Match an empty string. */
3930 len = 0;
3931 }
3932 else
3933 {
3934 /* Compare current input with back-ref in the same
3935 * line. */
3936 len = (int)(reg_endp[no] - reg_startp[no]);
3937 if (cstrncmp(reg_startp[no], reginput, &len) != 0)
3938 return FALSE;
3939 }
3940 }
3941 else /* Multi-line regexp */
3942 {
3943 if (reg_endpos[no].lnum < 0)
3944 {
3945 /* Backref was not set: Match an empty string. */
3946 len = 0;
3947 }
3948 else
3949 {
3950 if (reg_startpos[no].lnum == reglnum
3951 && reg_endpos[no].lnum == reglnum)
3952 {
3953 /* Compare back-ref within the current line. */
3954 len = reg_endpos[no].col - reg_startpos[no].col;
3955 if (cstrncmp(regline + reg_startpos[no].col,
3956 reginput, &len) != 0)
3957 return FALSE;
3958 }
3959 else
3960 {
3961 /* Messy situation: Need to compare between two
3962 * lines. */
3963 ccol = reg_startpos[no].col;
3964 clnum = reg_startpos[no].lnum;
3965 for (;;)
3966 {
3967 /* Since getting one line may invalidate
3968 * the other, need to make copy. Slow! */
3969 if (regline != reg_tofree)
3970 {
3971 len = (int)STRLEN(regline);
3972 if (reg_tofree == NULL
3973 || len >= (int)reg_tofreelen)
3974 {
3975 len += 50; /* get some extra */
3976 vim_free(reg_tofree);
3977 reg_tofree = alloc(len);
3978 if (reg_tofree == NULL)
3979 return FALSE; /* out of memory! */
3980 reg_tofreelen = len;
3981 }
3982 STRCPY(reg_tofree, regline);
3983 reginput = reg_tofree
3984 + (reginput - regline);
3985 regline = reg_tofree;
3986 }
3987
3988 /* Get the line to compare with. */
3989 p = reg_getline(clnum);
3990 if (clnum == reg_endpos[no].lnum)
3991 len = reg_endpos[no].col - ccol;
3992 else
3993 len = (int)STRLEN(p + ccol);
3994
3995 if (cstrncmp(p + ccol, reginput, &len) != 0)
3996 return FALSE; /* doesn't match */
3997 if (clnum == reg_endpos[no].lnum)
3998 break; /* match and at end! */
3999 if (reglnum == reg_maxline)
4000 return FALSE; /* text too short */
4001
4002 /* Advance to next line. */
4003 reg_nextline();
4004 ++clnum;
4005 ccol = 0;
4006 if (got_int || out_of_stack)
4007 return FALSE;
4008 }
4009
4010 /* found a match! Note that regline may now point
4011 * to a copy of the line, that should not matter. */
4012 }
4013 }
4014 }
4015
4016 /* Matched the backref, skip over it. */
4017 reginput += len;
4018 }
4019 break;
4020
4021#ifdef FEAT_SYN_HL
4022 case ZREF + 1:
4023 case ZREF + 2:
4024 case ZREF + 3:
4025 case ZREF + 4:
4026 case ZREF + 5:
4027 case ZREF + 6:
4028 case ZREF + 7:
4029 case ZREF + 8:
4030 case ZREF + 9:
4031 {
4032 int no;
4033 int len;
4034
4035 cleanup_zsubexpr();
4036 no = op - ZREF;
4037 if (re_extmatch_in != NULL
4038 && re_extmatch_in->matches[no] != NULL)
4039 {
4040 len = (int)STRLEN(re_extmatch_in->matches[no]);
4041 if (cstrncmp(re_extmatch_in->matches[no],
4042 reginput, &len) != 0)
4043 return FALSE;
4044 reginput += len;
4045 }
4046 else
4047 {
4048 /* Backref was not set: Match an empty string. */
4049 }
4050 }
4051 break;
4052#endif
4053
4054 case BRANCH:
4055 {
4056 if (OP(next) != BRANCH) /* No choice. */
4057 next = OPERAND(scan); /* Avoid recursion. */
4058 else
4059 {
4060 regsave_T save;
4061
4062 do
4063 {
4064 reg_save(&save);
4065 if (regmatch(OPERAND(scan)))
4066 return TRUE;
4067 reg_restore(&save);
4068 scan = regnext(scan);
4069 } while (scan != NULL && OP(scan) == BRANCH);
4070 return FALSE;
4071 /* NOTREACHED */
4072 }
4073 }
4074 break;
4075
4076 case BRACE_LIMITS:
4077 {
4078 int no;
4079
4080 if (OP(next) == BRACE_SIMPLE)
4081 {
4082 bl_minval = OPERAND_MIN(scan);
4083 bl_maxval = OPERAND_MAX(scan);
4084 }
4085 else if (OP(next) >= BRACE_COMPLEX
4086 && OP(next) < BRACE_COMPLEX + 10)
4087 {
4088 no = OP(next) - BRACE_COMPLEX;
4089 brace_min[no] = OPERAND_MIN(scan);
4090 brace_max[no] = OPERAND_MAX(scan);
4091 brace_count[no] = 0;
4092 }
4093 else
4094 {
4095 EMSG(_(e_internal)); /* Shouldn't happen */
4096 return FALSE;
4097 }
4098 }
4099 break;
4100
4101 case BRACE_COMPLEX + 0:
4102 case BRACE_COMPLEX + 1:
4103 case BRACE_COMPLEX + 2:
4104 case BRACE_COMPLEX + 3:
4105 case BRACE_COMPLEX + 4:
4106 case BRACE_COMPLEX + 5:
4107 case BRACE_COMPLEX + 6:
4108 case BRACE_COMPLEX + 7:
4109 case BRACE_COMPLEX + 8:
4110 case BRACE_COMPLEX + 9:
4111 {
4112 int no;
4113 regsave_T save;
4114
4115 no = op - BRACE_COMPLEX;
4116 ++brace_count[no];
4117
4118 /* If not matched enough times yet, try one more */
4119 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4120 ? brace_min[no] : brace_max[no]))
4121 {
4122 reg_save(&save);
4123 if (regmatch(OPERAND(scan)))
4124 return TRUE;
4125 reg_restore(&save);
4126 --brace_count[no]; /* failed, decrement match count */
4127 return FALSE;
4128 }
4129
4130 /* If matched enough times, may try matching some more */
4131 if (brace_min[no] <= brace_max[no])
4132 {
4133 /* Range is the normal way around, use longest match */
4134 if (brace_count[no] <= brace_max[no])
4135 {
4136 reg_save(&save);
4137 if (regmatch(OPERAND(scan)))
4138 return TRUE; /* matched some more times */
4139 reg_restore(&save);
4140 --brace_count[no]; /* matched just enough times */
4141 /* continue with the items after \{} */
4142 }
4143 }
4144 else
4145 {
4146 /* Range is backwards, use shortest match first */
4147 if (brace_count[no] <= brace_min[no])
4148 {
4149 reg_save(&save);
4150 if (regmatch(next))
4151 return TRUE;
4152 reg_restore(&save);
4153 next = OPERAND(scan);
4154 /* must try to match one more item */
4155 }
4156 }
4157 }
4158 break;
4159
4160 case BRACE_SIMPLE:
4161 case STAR:
4162 case PLUS:
4163 {
4164 int nextb; /* next byte */
4165 int nextb_ic; /* next byte reverse case */
4166 long count;
4167 regsave_T save;
4168 long minval;
4169 long maxval;
4170
4171 /*
4172 * Lookahead to avoid useless match attempts when we know
4173 * what character comes next.
4174 */
4175 if (OP(next) == EXACTLY)
4176 {
4177 nextb = *OPERAND(next);
4178 if (ireg_ic)
4179 {
4180 if (isupper(nextb))
4181 nextb_ic = TOLOWER_LOC(nextb);
4182 else
4183 nextb_ic = TOUPPER_LOC(nextb);
4184 }
4185 else
4186 nextb_ic = nextb;
4187 }
4188 else
4189 {
4190 nextb = NUL;
4191 nextb_ic = NUL;
4192 }
4193 if (op != BRACE_SIMPLE)
4194 {
4195 minval = (op == STAR) ? 0 : 1;
4196 maxval = MAX_LIMIT;
4197 }
4198 else
4199 {
4200 minval = bl_minval;
4201 maxval = bl_maxval;
4202 }
4203
4204 /*
4205 * When maxval > minval, try matching as much as possible, up
4206 * to maxval. When maxval < minval, try matching at least the
4207 * minimal number (since the range is backwards, that's also
4208 * maxval!).
4209 */
4210 count = regrepeat(OPERAND(scan), maxval);
4211 if (got_int)
4212 return FALSE;
4213 if (minval <= maxval)
4214 {
4215 /* Range is the normal way around, use longest match */
4216 while (count >= minval)
4217 {
4218 /* If it could match, try it. */
4219 if (nextb == NUL || *reginput == nextb
4220 || *reginput == nextb_ic)
4221 {
4222 reg_save(&save);
4223 if (regmatch(next))
4224 return TRUE;
4225 reg_restore(&save);
4226 }
4227 /* Couldn't or didn't match -- back up one char. */
4228 if (--count < minval)
4229 break;
4230 if (reginput == regline)
4231 {
4232 /* backup to last char of previous line */
4233 --reglnum;
4234 regline = reg_getline(reglnum);
4235 /* Just in case regrepeat() didn't count right. */
4236 if (regline == NULL)
4237 return FALSE;
4238 reginput = regline + STRLEN(regline);
4239 fast_breakcheck();
4240 if (got_int || out_of_stack)
4241 return FALSE;
4242 }
4243 else
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004244 mb_ptr_back(regline, reginput);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004245 }
4246 }
4247 else
4248 {
4249 /* Range is backwards, use shortest match first.
4250 * Careful: maxval and minval are exchanged! */
4251 if (count < maxval)
4252 return FALSE;
4253 for (;;)
4254 {
4255 /* If it could work, try it. */
4256 if (nextb == NUL || *reginput == nextb
4257 || *reginput == nextb_ic)
4258 {
4259 reg_save(&save);
4260 if (regmatch(next))
4261 return TRUE;
4262 reg_restore(&save);
4263 }
4264 /* Couldn't or didn't match: try advancing one char. */
4265 if (count == minval
4266 || regrepeat(OPERAND(scan), 1L) == 0)
4267 break;
4268 ++count;
4269 if (got_int || out_of_stack)
4270 return FALSE;
4271 }
4272 }
4273 return FALSE;
4274 }
4275 /* break; Not Reached */
4276
4277 case NOMATCH:
4278 {
4279 regsave_T save;
4280
4281 /* If the operand matches, we fail. Otherwise backup and
4282 * continue with the next item. */
4283 reg_save(&save);
4284 if (regmatch(OPERAND(scan)))
4285 return FALSE;
4286 reg_restore(&save);
4287 }
4288 break;
4289
4290 case MATCH:
4291 case SUBPAT:
4292 {
4293 regsave_T save;
4294
4295 /* If the operand doesn't match, we fail. Otherwise backup
4296 * and continue with the next item. */
4297 reg_save(&save);
4298 if (!regmatch(OPERAND(scan)))
4299 return FALSE;
4300 if (op == MATCH) /* zero-width */
4301 reg_restore(&save);
4302 }
4303 break;
4304
4305 case BEHIND:
4306 case NOBEHIND:
4307 {
4308 regsave_T save_after, save_start;
4309 regsave_T save_behind_pos;
4310 int needmatch = (op == BEHIND);
4311
4312 /*
4313 * Look back in the input of the operand matches or not. This
4314 * must be done at every position in the input and checking if
4315 * the match ends at the current position.
4316 * First check if the next item matches, that's probably
4317 * faster.
4318 */
4319 reg_save(&save_start);
4320 if (regmatch(next))
4321 {
4322 /* save the position after the found match for next */
4323 reg_save(&save_after);
4324
4325 /* start looking for a match with operand at the current
4326 * postion. Go back one character until we find the
4327 * result, hitting the start of the line or the previous
4328 * line (for multi-line matching).
4329 * Set behind_pos to where the match should end, BHPOS
4330 * will match it. */
4331 save_behind_pos = behind_pos;
4332 behind_pos = save_start;
4333 for (;;)
4334 {
4335 reg_restore(&save_start);
4336 if (regmatch(OPERAND(scan))
4337 && reg_save_equal(&behind_pos))
4338 {
4339 behind_pos = save_behind_pos;
4340 /* found a match that ends where "next" started */
4341 if (needmatch)
4342 {
4343 reg_restore(&save_after);
4344 return TRUE;
4345 }
4346 return FALSE;
4347 }
4348 /*
4349 * No match: Go back one character. May go to
4350 * previous line once.
4351 */
4352 if (REG_MULTI)
4353 {
4354 if (save_start.rs_u.pos.col == 0)
4355 {
4356 if (save_start.rs_u.pos.lnum
4357 < behind_pos.rs_u.pos.lnum
4358 || reg_getline(
4359 --save_start.rs_u.pos.lnum) == NULL)
4360 break;
4361 reg_restore(&save_start);
4362 save_start.rs_u.pos.col =
4363 (colnr_T)STRLEN(regline);
4364 }
4365 else
4366 --save_start.rs_u.pos.col;
4367 }
4368 else
4369 {
4370 if (save_start.rs_u.ptr == regline)
4371 break;
4372 --save_start.rs_u.ptr;
4373 }
4374 }
4375
4376 /* NOBEHIND succeeds when no match was found */
4377 behind_pos = save_behind_pos;
4378 if (!needmatch)
4379 {
4380 reg_restore(&save_after);
4381 return TRUE;
4382 }
4383 }
4384 return FALSE;
4385 }
4386
4387 case BHPOS:
4388 if (REG_MULTI)
4389 {
4390 if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline)
4391 || behind_pos.rs_u.pos.lnum != reglnum)
4392 return FALSE;
4393 }
4394 else if (behind_pos.rs_u.ptr != reginput)
4395 return FALSE;
4396 break;
4397
4398 case NEWL:
4399 if ((c != NUL || reglnum == reg_maxline)
4400 && (c != '\n' || !reg_line_lbr))
4401 return FALSE;
4402 if (reg_line_lbr)
4403 ADVANCE_REGINPUT();
4404 else
4405 reg_nextline();
4406 break;
4407
4408 case END:
4409 return TRUE; /* Success! */
4410
4411 default:
4412 EMSG(_(e_re_corr));
4413#ifdef DEBUG
4414 printf("Illegal op code %d\n", op);
4415#endif
4416 return FALSE;
4417 }
4418 }
4419
4420 scan = next;
4421 }
4422
4423 /*
4424 * We get here only if there's trouble -- normally "case END" is the
4425 * terminating point.
4426 */
4427 EMSG(_(e_re_corr));
4428#ifdef DEBUG
4429 printf("Premature EOL\n");
4430#endif
4431 return FALSE;
4432}
4433
Bram Moolenaar071d4272004-06-13 20:20:40 +00004434/*
4435 * regrepeat - repeatedly match something simple, return how many.
4436 * Advances reginput (and reglnum) to just after the matched chars.
4437 */
4438 static int
4439regrepeat(p, maxcount)
4440 char_u *p;
4441 long maxcount; /* maximum number of matches allowed */
4442{
4443 long count = 0;
4444 char_u *scan;
4445 char_u *opnd;
4446 int mask;
4447 int testval = 0;
4448
4449 scan = reginput; /* Make local copy of reginput for speed. */
4450 opnd = OPERAND(p);
4451 switch (OP(p))
4452 {
4453 case ANY:
4454 case ANY + ADD_NL:
4455 while (count < maxcount)
4456 {
4457 /* Matching anything means we continue until end-of-line (or
4458 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
4459 while (*scan != NUL && count < maxcount)
4460 {
4461 ++count;
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004462 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004463 }
4464 if (!WITH_NL(OP(p)) || reglnum == reg_maxline || count == maxcount)
4465 break;
4466 ++count; /* count the line-break */
4467 reg_nextline();
4468 scan = reginput;
4469 if (got_int)
4470 break;
4471 }
4472 break;
4473
4474 case IDENT:
4475 case IDENT + ADD_NL:
4476 testval = TRUE;
4477 /*FALLTHROUGH*/
4478 case SIDENT:
4479 case SIDENT + ADD_NL:
4480 while (count < maxcount)
4481 {
4482 if (vim_isIDc(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4483 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004484 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004485 }
4486 else if (*scan == NUL)
4487 {
4488 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4489 break;
4490 reg_nextline();
4491 scan = reginput;
4492 if (got_int)
4493 break;
4494 }
4495 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4496 ++scan;
4497 else
4498 break;
4499 ++count;
4500 }
4501 break;
4502
4503 case KWORD:
4504 case KWORD + ADD_NL:
4505 testval = TRUE;
4506 /*FALLTHROUGH*/
4507 case SKWORD:
4508 case SKWORD + ADD_NL:
4509 while (count < maxcount)
4510 {
4511 if (vim_iswordp(scan) && (testval || !VIM_ISDIGIT(*scan)))
4512 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004513 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004514 }
4515 else if (*scan == NUL)
4516 {
4517 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4518 break;
4519 reg_nextline();
4520 scan = reginput;
4521 if (got_int)
4522 break;
4523 }
4524 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4525 ++scan;
4526 else
4527 break;
4528 ++count;
4529 }
4530 break;
4531
4532 case FNAME:
4533 case FNAME + ADD_NL:
4534 testval = TRUE;
4535 /*FALLTHROUGH*/
4536 case SFNAME:
4537 case SFNAME + ADD_NL:
4538 while (count < maxcount)
4539 {
4540 if (vim_isfilec(*scan) && (testval || !VIM_ISDIGIT(*scan)))
4541 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004542 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004543 }
4544 else if (*scan == NUL)
4545 {
4546 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4547 break;
4548 reg_nextline();
4549 scan = reginput;
4550 if (got_int)
4551 break;
4552 }
4553 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4554 ++scan;
4555 else
4556 break;
4557 ++count;
4558 }
4559 break;
4560
4561 case PRINT:
4562 case PRINT + ADD_NL:
4563 testval = TRUE;
4564 /*FALLTHROUGH*/
4565 case SPRINT:
4566 case SPRINT + ADD_NL:
4567 while (count < maxcount)
4568 {
4569 if (*scan == NUL)
4570 {
4571 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4572 break;
4573 reg_nextline();
4574 scan = reginput;
4575 if (got_int)
4576 break;
4577 }
4578 else if (ptr2cells(scan) == 1 && (testval || !VIM_ISDIGIT(*scan)))
4579 {
Bram Moolenaar1cd871b2004-12-19 22:46:22 +00004580 mb_ptr_adv(scan);
Bram Moolenaar071d4272004-06-13 20:20:40 +00004581 }
4582 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4583 ++scan;
4584 else
4585 break;
4586 ++count;
4587 }
4588 break;
4589
4590 case WHITE:
4591 case WHITE + ADD_NL:
4592 testval = mask = RI_WHITE;
4593do_class:
4594 while (count < maxcount)
4595 {
4596#ifdef FEAT_MBYTE
4597 int l;
4598#endif
4599 if (*scan == NUL)
4600 {
4601 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4602 break;
4603 reg_nextline();
4604 scan = reginput;
4605 if (got_int)
4606 break;
4607 }
4608#ifdef FEAT_MBYTE
4609 else if (has_mbyte && (l = (*mb_ptr2len_check)(scan)) > 1)
4610 {
4611 if (testval != 0)
4612 break;
4613 scan += l;
4614 }
4615#endif
4616 else if ((class_tab[*scan] & mask) == testval)
4617 ++scan;
4618 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4619 ++scan;
4620 else
4621 break;
4622 ++count;
4623 }
4624 break;
4625
4626 case NWHITE:
4627 case NWHITE + ADD_NL:
4628 mask = RI_WHITE;
4629 goto do_class;
4630 case DIGIT:
4631 case DIGIT + ADD_NL:
4632 testval = mask = RI_DIGIT;
4633 goto do_class;
4634 case NDIGIT:
4635 case NDIGIT + ADD_NL:
4636 mask = RI_DIGIT;
4637 goto do_class;
4638 case HEX:
4639 case HEX + ADD_NL:
4640 testval = mask = RI_HEX;
4641 goto do_class;
4642 case NHEX:
4643 case NHEX + ADD_NL:
4644 mask = RI_HEX;
4645 goto do_class;
4646 case OCTAL:
4647 case OCTAL + ADD_NL:
4648 testval = mask = RI_OCTAL;
4649 goto do_class;
4650 case NOCTAL:
4651 case NOCTAL + ADD_NL:
4652 mask = RI_OCTAL;
4653 goto do_class;
4654 case WORD:
4655 case WORD + ADD_NL:
4656 testval = mask = RI_WORD;
4657 goto do_class;
4658 case NWORD:
4659 case NWORD + ADD_NL:
4660 mask = RI_WORD;
4661 goto do_class;
4662 case HEAD:
4663 case HEAD + ADD_NL:
4664 testval = mask = RI_HEAD;
4665 goto do_class;
4666 case NHEAD:
4667 case NHEAD + ADD_NL:
4668 mask = RI_HEAD;
4669 goto do_class;
4670 case ALPHA:
4671 case ALPHA + ADD_NL:
4672 testval = mask = RI_ALPHA;
4673 goto do_class;
4674 case NALPHA:
4675 case NALPHA + ADD_NL:
4676 mask = RI_ALPHA;
4677 goto do_class;
4678 case LOWER:
4679 case LOWER + ADD_NL:
4680 testval = mask = RI_LOWER;
4681 goto do_class;
4682 case NLOWER:
4683 case NLOWER + ADD_NL:
4684 mask = RI_LOWER;
4685 goto do_class;
4686 case UPPER:
4687 case UPPER + ADD_NL:
4688 testval = mask = RI_UPPER;
4689 goto do_class;
4690 case NUPPER:
4691 case NUPPER + ADD_NL:
4692 mask = RI_UPPER;
4693 goto do_class;
4694
4695 case EXACTLY:
4696 {
4697 int cu, cl;
4698
4699 /* This doesn't do a multi-byte character, because a MULTIBYTECODE
4700 * would have been used for it. */
4701 if (ireg_ic)
4702 {
4703 cu = TOUPPER_LOC(*opnd);
4704 cl = TOLOWER_LOC(*opnd);
4705 while (count < maxcount && (*scan == cu || *scan == cl))
4706 {
4707 count++;
4708 scan++;
4709 }
4710 }
4711 else
4712 {
4713 cu = *opnd;
4714 while (count < maxcount && *scan == cu)
4715 {
4716 count++;
4717 scan++;
4718 }
4719 }
4720 break;
4721 }
4722
4723#ifdef FEAT_MBYTE
4724 case MULTIBYTECODE:
4725 {
4726 int i, len, cf = 0;
4727
4728 /* Safety check (just in case 'encoding' was changed since
4729 * compiling the program). */
4730 if ((len = (*mb_ptr2len_check)(opnd)) > 1)
4731 {
4732 if (ireg_ic && enc_utf8)
4733 cf = utf_fold(utf_ptr2char(opnd));
4734 while (count < maxcount)
4735 {
4736 for (i = 0; i < len; ++i)
4737 if (opnd[i] != scan[i])
4738 break;
4739 if (i < len && (!ireg_ic || !enc_utf8
4740 || utf_fold(utf_ptr2char(scan)) != cf))
4741 break;
4742 scan += len;
4743 ++count;
4744 }
4745 }
4746 }
4747 break;
4748#endif
4749
4750 case ANYOF:
4751 case ANYOF + ADD_NL:
4752 testval = TRUE;
4753 /*FALLTHROUGH*/
4754
4755 case ANYBUT:
4756 case ANYBUT + ADD_NL:
4757 while (count < maxcount)
4758 {
4759#ifdef FEAT_MBYTE
4760 int len;
4761#endif
4762 if (*scan == NUL)
4763 {
4764 if (!WITH_NL(OP(p)) || reglnum == reg_maxline)
4765 break;
4766 reg_nextline();
4767 scan = reginput;
4768 if (got_int)
4769 break;
4770 }
4771 else if (reg_line_lbr && *scan == '\n' && WITH_NL(OP(p)))
4772 ++scan;
4773#ifdef FEAT_MBYTE
4774 else if (has_mbyte && (len = (*mb_ptr2len_check)(scan)) > 1)
4775 {
4776 if ((cstrchr(opnd, (*mb_ptr2char)(scan)) == NULL) == testval)
4777 break;
4778 scan += len;
4779 }
4780#endif
4781 else
4782 {
4783 if ((cstrchr(opnd, *scan) == NULL) == testval)
4784 break;
4785 ++scan;
4786 }
4787 ++count;
4788 }
4789 break;
4790
4791 case NEWL:
4792 while (count < maxcount
4793 && ((*scan == NUL && reglnum < reg_maxline)
4794 || (*scan == '\n' && reg_line_lbr)))
4795 {
4796 count++;
4797 if (reg_line_lbr)
4798 ADVANCE_REGINPUT();
4799 else
4800 reg_nextline();
4801 scan = reginput;
4802 if (got_int)
4803 break;
4804 }
4805 break;
4806
4807 default: /* Oh dear. Called inappropriately. */
4808 EMSG(_(e_re_corr));
4809#ifdef DEBUG
4810 printf("Called regrepeat with op code %d\n", OP(p));
4811#endif
4812 break;
4813 }
4814
4815 reginput = scan;
4816
4817 return (int)count;
4818}
4819
4820/*
4821 * regnext - dig the "next" pointer out of a node
4822 */
4823 static char_u *
4824regnext(p)
4825 char_u *p;
4826{
4827 int offset;
4828
4829 if (p == JUST_CALC_SIZE)
4830 return NULL;
4831
4832 offset = NEXT(p);
4833 if (offset == 0)
4834 return NULL;
4835
4836 if (OP(p) == BACK)
4837 return p - offset;
4838 else
4839 return p + offset;
4840}
4841
4842/*
4843 * Check the regexp program for its magic number.
4844 * Return TRUE if it's wrong.
4845 */
4846 static int
4847prog_magic_wrong()
4848{
4849 if (UCHARAT(REG_MULTI
4850 ? reg_mmatch->regprog->program
4851 : reg_match->regprog->program) != REGMAGIC)
4852 {
4853 EMSG(_(e_re_corr));
4854 return TRUE;
4855 }
4856 return FALSE;
4857}
4858
4859/*
4860 * Cleanup the subexpressions, if this wasn't done yet.
4861 * This construction is used to clear the subexpressions only when they are
4862 * used (to increase speed).
4863 */
4864 static void
4865cleanup_subexpr()
4866{
4867 if (need_clear_subexpr)
4868 {
4869 if (REG_MULTI)
4870 {
4871 /* Use 0xff to set lnum to -1 */
4872 vim_memset(reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4873 vim_memset(reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4874 }
4875 else
4876 {
4877 vim_memset(reg_startp, 0, sizeof(char_u *) * NSUBEXP);
4878 vim_memset(reg_endp, 0, sizeof(char_u *) * NSUBEXP);
4879 }
4880 need_clear_subexpr = FALSE;
4881 }
4882}
4883
4884#ifdef FEAT_SYN_HL
4885 static void
4886cleanup_zsubexpr()
4887{
4888 if (need_clear_zsubexpr)
4889 {
4890 if (REG_MULTI)
4891 {
4892 /* Use 0xff to set lnum to -1 */
4893 vim_memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4894 vim_memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
4895 }
4896 else
4897 {
4898 vim_memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
4899 vim_memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
4900 }
4901 need_clear_zsubexpr = FALSE;
4902 }
4903}
4904#endif
4905
4906/*
4907 * Advance reglnum, regline and reginput to the next line.
4908 */
4909 static void
4910reg_nextline()
4911{
4912 regline = reg_getline(++reglnum);
4913 reginput = regline;
4914 fast_breakcheck();
4915}
4916
4917/*
4918 * Save the input line and position in a regsave_T.
4919 */
4920 static void
4921reg_save(save)
4922 regsave_T *save;
4923{
4924 if (REG_MULTI)
4925 {
4926 save->rs_u.pos.col = (colnr_T)(reginput - regline);
4927 save->rs_u.pos.lnum = reglnum;
4928 }
4929 else
4930 save->rs_u.ptr = reginput;
4931}
4932
4933/*
4934 * Restore the input line and position from a regsave_T.
4935 */
4936 static void
4937reg_restore(save)
4938 regsave_T *save;
4939{
4940 if (REG_MULTI)
4941 {
4942 if (reglnum != save->rs_u.pos.lnum)
4943 {
4944 /* only call reg_getline() when the line number changed to save
4945 * a bit of time */
4946 reglnum = save->rs_u.pos.lnum;
4947 regline = reg_getline(reglnum);
4948 }
4949 reginput = regline + save->rs_u.pos.col;
4950 }
4951 else
4952 reginput = save->rs_u.ptr;
4953}
4954
4955/*
4956 * Return TRUE if current position is equal to saved position.
4957 */
4958 static int
4959reg_save_equal(save)
4960 regsave_T *save;
4961{
4962 if (REG_MULTI)
4963 return reglnum == save->rs_u.pos.lnum
4964 && reginput == regline + save->rs_u.pos.col;
4965 return reginput == save->rs_u.ptr;
4966}
4967
4968/*
4969 * Tentatively set the sub-expression start to the current position (after
4970 * calling regmatch() they will have changed). Need to save the existing
4971 * values for when there is no match.
4972 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
4973 * depending on REG_MULTI.
4974 */
4975 static void
4976save_se_multi(savep, posp)
4977 save_se_T *savep;
4978 lpos_T *posp;
4979{
4980 savep->se_u.pos = *posp;
4981 posp->lnum = reglnum;
4982 posp->col = (colnr_T)(reginput - regline);
4983}
4984
4985 static void
4986save_se_one(savep, pp)
4987 save_se_T *savep;
4988 char_u **pp;
4989{
4990 savep->se_u.ptr = *pp;
4991 *pp = reginput;
4992}
4993
4994/*
4995 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
4996 */
4997 static int
4998re_num_cmp(val, scan)
4999 long_u val;
5000 char_u *scan;
5001{
5002 long_u n = OPERAND_MIN(scan);
5003
5004 if (OPERAND_CMP(scan) == '>')
5005 return val > n;
5006 if (OPERAND_CMP(scan) == '<')
5007 return val < n;
5008 return val == n;
5009}
5010
5011
5012#ifdef DEBUG
5013
5014/*
5015 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5016 */
5017 static void
5018regdump(pattern, r)
5019 char_u *pattern;
5020 regprog_T *r;
5021{
5022 char_u *s;
5023 int op = EXACTLY; /* Arbitrary non-END op. */
5024 char_u *next;
5025 char_u *end = NULL;
5026
5027 printf("\r\nregcomp(%s):\r\n", pattern);
5028
5029 s = r->program + 1;
5030 /*
5031 * Loop until we find the END that isn't before a referred next (an END
5032 * can also appear in a NOMATCH operand).
5033 */
5034 while (op != END || s <= end)
5035 {
5036 op = OP(s);
5037 printf("%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
5038 next = regnext(s);
5039 if (next == NULL) /* Next ptr. */
5040 printf("(0)");
5041 else
5042 printf("(%d)", (int)((s - r->program) + (next - s)));
5043 if (end < next)
5044 end = next;
5045 if (op == BRACE_LIMITS)
5046 {
5047 /* Two short ints */
5048 printf(" minval %ld, maxval %ld", OPERAND_MIN(s), OPERAND_MAX(s));
5049 s += 8;
5050 }
5051 s += 3;
5052 if (op == ANYOF || op == ANYOF + ADD_NL
5053 || op == ANYBUT || op == ANYBUT + ADD_NL
5054 || op == EXACTLY)
5055 {
5056 /* Literal string, where present. */
5057 while (*s != NUL)
5058 printf("%c", *s++);
5059 s++;
5060 }
5061 printf("\r\n");
5062 }
5063
5064 /* Header fields of interest. */
5065 if (r->regstart != NUL)
5066 printf("start `%s' 0x%x; ", r->regstart < 256
5067 ? (char *)transchar(r->regstart)
5068 : "multibyte", r->regstart);
5069 if (r->reganch)
5070 printf("anchored; ");
5071 if (r->regmust != NULL)
5072 printf("must have \"%s\"", r->regmust);
5073 printf("\r\n");
5074}
5075
5076/*
5077 * regprop - printable representation of opcode
5078 */
5079 static char_u *
5080regprop(op)
5081 char_u *op;
5082{
5083 char_u *p;
5084 static char_u buf[50];
5085
5086 (void) strcpy(buf, ":");
5087
5088 switch (OP(op))
5089 {
5090 case BOL:
5091 p = "BOL";
5092 break;
5093 case EOL:
5094 p = "EOL";
5095 break;
5096 case RE_BOF:
5097 p = "BOF";
5098 break;
5099 case RE_EOF:
5100 p = "EOF";
5101 break;
5102 case CURSOR:
5103 p = "CURSOR";
5104 break;
5105 case RE_LNUM:
5106 p = "RE_LNUM";
5107 break;
5108 case RE_COL:
5109 p = "RE_COL";
5110 break;
5111 case RE_VCOL:
5112 p = "RE_VCOL";
5113 break;
5114 case BOW:
5115 p = "BOW";
5116 break;
5117 case EOW:
5118 p = "EOW";
5119 break;
5120 case ANY:
5121 p = "ANY";
5122 break;
5123 case ANY + ADD_NL:
5124 p = "ANY+NL";
5125 break;
5126 case ANYOF:
5127 p = "ANYOF";
5128 break;
5129 case ANYOF + ADD_NL:
5130 p = "ANYOF+NL";
5131 break;
5132 case ANYBUT:
5133 p = "ANYBUT";
5134 break;
5135 case ANYBUT + ADD_NL:
5136 p = "ANYBUT+NL";
5137 break;
5138 case IDENT:
5139 p = "IDENT";
5140 break;
5141 case IDENT + ADD_NL:
5142 p = "IDENT+NL";
5143 break;
5144 case SIDENT:
5145 p = "SIDENT";
5146 break;
5147 case SIDENT + ADD_NL:
5148 p = "SIDENT+NL";
5149 break;
5150 case KWORD:
5151 p = "KWORD";
5152 break;
5153 case KWORD + ADD_NL:
5154 p = "KWORD+NL";
5155 break;
5156 case SKWORD:
5157 p = "SKWORD";
5158 break;
5159 case SKWORD + ADD_NL:
5160 p = "SKWORD+NL";
5161 break;
5162 case FNAME:
5163 p = "FNAME";
5164 break;
5165 case FNAME + ADD_NL:
5166 p = "FNAME+NL";
5167 break;
5168 case SFNAME:
5169 p = "SFNAME";
5170 break;
5171 case SFNAME + ADD_NL:
5172 p = "SFNAME+NL";
5173 break;
5174 case PRINT:
5175 p = "PRINT";
5176 break;
5177 case PRINT + ADD_NL:
5178 p = "PRINT+NL";
5179 break;
5180 case SPRINT:
5181 p = "SPRINT";
5182 break;
5183 case SPRINT + ADD_NL:
5184 p = "SPRINT+NL";
5185 break;
5186 case WHITE:
5187 p = "WHITE";
5188 break;
5189 case WHITE + ADD_NL:
5190 p = "WHITE+NL";
5191 break;
5192 case NWHITE:
5193 p = "NWHITE";
5194 break;
5195 case NWHITE + ADD_NL:
5196 p = "NWHITE+NL";
5197 break;
5198 case DIGIT:
5199 p = "DIGIT";
5200 break;
5201 case DIGIT + ADD_NL:
5202 p = "DIGIT+NL";
5203 break;
5204 case NDIGIT:
5205 p = "NDIGIT";
5206 break;
5207 case NDIGIT + ADD_NL:
5208 p = "NDIGIT+NL";
5209 break;
5210 case HEX:
5211 p = "HEX";
5212 break;
5213 case HEX + ADD_NL:
5214 p = "HEX+NL";
5215 break;
5216 case NHEX:
5217 p = "NHEX";
5218 break;
5219 case NHEX + ADD_NL:
5220 p = "NHEX+NL";
5221 break;
5222 case OCTAL:
5223 p = "OCTAL";
5224 break;
5225 case OCTAL + ADD_NL:
5226 p = "OCTAL+NL";
5227 break;
5228 case NOCTAL:
5229 p = "NOCTAL";
5230 break;
5231 case NOCTAL + ADD_NL:
5232 p = "NOCTAL+NL";
5233 break;
5234 case WORD:
5235 p = "WORD";
5236 break;
5237 case WORD + ADD_NL:
5238 p = "WORD+NL";
5239 break;
5240 case NWORD:
5241 p = "NWORD";
5242 break;
5243 case NWORD + ADD_NL:
5244 p = "NWORD+NL";
5245 break;
5246 case HEAD:
5247 p = "HEAD";
5248 break;
5249 case HEAD + ADD_NL:
5250 p = "HEAD+NL";
5251 break;
5252 case NHEAD:
5253 p = "NHEAD";
5254 break;
5255 case NHEAD + ADD_NL:
5256 p = "NHEAD+NL";
5257 break;
5258 case ALPHA:
5259 p = "ALPHA";
5260 break;
5261 case ALPHA + ADD_NL:
5262 p = "ALPHA+NL";
5263 break;
5264 case NALPHA:
5265 p = "NALPHA";
5266 break;
5267 case NALPHA + ADD_NL:
5268 p = "NALPHA+NL";
5269 break;
5270 case LOWER:
5271 p = "LOWER";
5272 break;
5273 case LOWER + ADD_NL:
5274 p = "LOWER+NL";
5275 break;
5276 case NLOWER:
5277 p = "NLOWER";
5278 break;
5279 case NLOWER + ADD_NL:
5280 p = "NLOWER+NL";
5281 break;
5282 case UPPER:
5283 p = "UPPER";
5284 break;
5285 case UPPER + ADD_NL:
5286 p = "UPPER+NL";
5287 break;
5288 case NUPPER:
5289 p = "NUPPER";
5290 break;
5291 case NUPPER + ADD_NL:
5292 p = "NUPPER+NL";
5293 break;
5294 case BRANCH:
5295 p = "BRANCH";
5296 break;
5297 case EXACTLY:
5298 p = "EXACTLY";
5299 break;
5300 case NOTHING:
5301 p = "NOTHING";
5302 break;
5303 case BACK:
5304 p = "BACK";
5305 break;
5306 case END:
5307 p = "END";
5308 break;
5309 case MOPEN + 0:
5310 p = "MATCH START";
5311 break;
5312 case MOPEN + 1:
5313 case MOPEN + 2:
5314 case MOPEN + 3:
5315 case MOPEN + 4:
5316 case MOPEN + 5:
5317 case MOPEN + 6:
5318 case MOPEN + 7:
5319 case MOPEN + 8:
5320 case MOPEN + 9:
5321 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
5322 p = NULL;
5323 break;
5324 case MCLOSE + 0:
5325 p = "MATCH END";
5326 break;
5327 case MCLOSE + 1:
5328 case MCLOSE + 2:
5329 case MCLOSE + 3:
5330 case MCLOSE + 4:
5331 case MCLOSE + 5:
5332 case MCLOSE + 6:
5333 case MCLOSE + 7:
5334 case MCLOSE + 8:
5335 case MCLOSE + 9:
5336 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
5337 p = NULL;
5338 break;
5339 case BACKREF + 1:
5340 case BACKREF + 2:
5341 case BACKREF + 3:
5342 case BACKREF + 4:
5343 case BACKREF + 5:
5344 case BACKREF + 6:
5345 case BACKREF + 7:
5346 case BACKREF + 8:
5347 case BACKREF + 9:
5348 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
5349 p = NULL;
5350 break;
5351 case NOPEN:
5352 p = "NOPEN";
5353 break;
5354 case NCLOSE:
5355 p = "NCLOSE";
5356 break;
5357#ifdef FEAT_SYN_HL
5358 case ZOPEN + 1:
5359 case ZOPEN + 2:
5360 case ZOPEN + 3:
5361 case ZOPEN + 4:
5362 case ZOPEN + 5:
5363 case ZOPEN + 6:
5364 case ZOPEN + 7:
5365 case ZOPEN + 8:
5366 case ZOPEN + 9:
5367 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
5368 p = NULL;
5369 break;
5370 case ZCLOSE + 1:
5371 case ZCLOSE + 2:
5372 case ZCLOSE + 3:
5373 case ZCLOSE + 4:
5374 case ZCLOSE + 5:
5375 case ZCLOSE + 6:
5376 case ZCLOSE + 7:
5377 case ZCLOSE + 8:
5378 case ZCLOSE + 9:
5379 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
5380 p = NULL;
5381 break;
5382 case ZREF + 1:
5383 case ZREF + 2:
5384 case ZREF + 3:
5385 case ZREF + 4:
5386 case ZREF + 5:
5387 case ZREF + 6:
5388 case ZREF + 7:
5389 case ZREF + 8:
5390 case ZREF + 9:
5391 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
5392 p = NULL;
5393 break;
5394#endif
5395 case STAR:
5396 p = "STAR";
5397 break;
5398 case PLUS:
5399 p = "PLUS";
5400 break;
5401 case NOMATCH:
5402 p = "NOMATCH";
5403 break;
5404 case MATCH:
5405 p = "MATCH";
5406 break;
5407 case BEHIND:
5408 p = "BEHIND";
5409 break;
5410 case NOBEHIND:
5411 p = "NOBEHIND";
5412 break;
5413 case SUBPAT:
5414 p = "SUBPAT";
5415 break;
5416 case BRACE_LIMITS:
5417 p = "BRACE_LIMITS";
5418 break;
5419 case BRACE_SIMPLE:
5420 p = "BRACE_SIMPLE";
5421 break;
5422 case BRACE_COMPLEX + 0:
5423 case BRACE_COMPLEX + 1:
5424 case BRACE_COMPLEX + 2:
5425 case BRACE_COMPLEX + 3:
5426 case BRACE_COMPLEX + 4:
5427 case BRACE_COMPLEX + 5:
5428 case BRACE_COMPLEX + 6:
5429 case BRACE_COMPLEX + 7:
5430 case BRACE_COMPLEX + 8:
5431 case BRACE_COMPLEX + 9:
5432 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
5433 p = NULL;
5434 break;
5435#ifdef FEAT_MBYTE
5436 case MULTIBYTECODE:
5437 p = "MULTIBYTECODE";
5438 break;
5439#endif
5440 case NEWL:
5441 p = "NEWL";
5442 break;
5443 default:
5444 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
5445 p = NULL;
5446 break;
5447 }
5448 if (p != NULL)
5449 (void) strcat(buf, p);
5450 return buf;
5451}
5452#endif
5453
5454#ifdef FEAT_MBYTE
5455static void mb_decompose __ARGS((int c, int *c1, int *c2, int *c3));
5456
/* Decomposition of one character: "a" followed by up to two more characters
 * "b" and "c", which are zero when not used. */
typedef struct
{
    int a, b, c;
} decomp_T;


/*
 * Decompositions for the characters 0xfb20 - 0xfb4f, indexed with
 * (character - 0xfb20).  Unused code points map to themselves.
 */
decomp_T decomp_table[0xfb4f-0xfb20+1] =
{
    {0x5e2,0,0},                /* 0xfb20       alt ayin */
    {0x5d0,0,0},                /* 0xfb21       alt alef */
    {0x5d3,0,0},                /* 0xfb22       alt dalet */
    {0x5d4,0,0},                /* 0xfb23       alt he */
    {0x5db,0,0},                /* 0xfb24       alt kaf */
    {0x5dc,0,0},                /* 0xfb25       alt lamed */
    {0x5dd,0,0},                /* 0xfb26       alt mem-sofit */
    {0x5e8,0,0},                /* 0xfb27       alt resh */
    {0x5ea,0,0},                /* 0xfb28       alt tav */
    {'+', 0, 0},                /* 0xfb29       alt plus */
    {0x5e9, 0x5c1, 0},          /* 0xfb2a       shin+shin-dot */
    {0x5e9, 0x5c2, 0},          /* 0xfb2b       shin+sin-dot */
    {0x5e9, 0x5c1, 0x5bc},      /* 0xfb2c       shin+shin-dot+dagesh */
    {0x5e9, 0x5c2, 0x5bc},      /* 0xfb2d       shin+sin-dot+dagesh */
    {0x5d0, 0x5b7, 0},          /* 0xfb2e       alef+patah */
    {0x5d0, 0x5b8, 0},          /* 0xfb2f       alef+qamats */
    {0x5d0, 0x5b4, 0},          /* 0xfb30       alef+hiriq */
    {0x5d1, 0x5bc, 0},          /* 0xfb31       bet+dagesh */
    {0x5d2, 0x5bc, 0},          /* 0xfb32       gimel+dagesh */
    {0x5d3, 0x5bc, 0},          /* 0xfb33       dalet+dagesh */
    {0x5d4, 0x5bc, 0},          /* 0xfb34       he+dagesh */
    {0x5d5, 0x5bc, 0},          /* 0xfb35       vav+dagesh */
    {0x5d6, 0x5bc, 0},          /* 0xfb36       zayin+dagesh */
    {0xfb37, 0, 0},             /* 0xfb37 -- UNUSED */
    {0x5d8, 0x5bc, 0},          /* 0xfb38       tet+dagesh */
    {0x5d9, 0x5bc, 0},          /* 0xfb39       yud+dagesh */
    {0x5da, 0x5bc, 0},          /* 0xfb3a       kaf sofit+dagesh */
    {0x5db, 0x5bc, 0},          /* 0xfb3b       kaf+dagesh */
    {0x5dc, 0x5bc, 0},          /* 0xfb3c       lamed+dagesh */
    {0xfb3d, 0, 0},             /* 0xfb3d -- UNUSED */
    {0x5de, 0x5bc, 0},          /* 0xfb3e       mem+dagesh */
    {0xfb3f, 0, 0},             /* 0xfb3f -- UNUSED */
    {0x5e0, 0x5bc, 0},          /* 0xfb40       nun+dagesh */
    {0x5e1, 0x5bc, 0},          /* 0xfb41       samech+dagesh */
    {0xfb42, 0, 0},             /* 0xfb42 -- UNUSED */
    {0x5e3, 0x5bc, 0},          /* 0xfb43       pe sofit+dagesh */
    {0x5e4, 0x5bc,0},           /* 0xfb44       pe+dagesh */
    {0xfb45, 0, 0},             /* 0xfb45 -- UNUSED */
    {0x5e6, 0x5bc, 0},          /* 0xfb46       tsadi+dagesh */
    {0x5e7, 0x5bc, 0},          /* 0xfb47       qof+dagesh */
    {0x5e8, 0x5bc, 0},          /* 0xfb48       resh+dagesh */
    {0x5e9, 0x5bc, 0},          /* 0xfb49       shin+dagesh */
    {0x5ea, 0x5bc, 0},          /* 0xfb4a       tav+dagesh */
    {0x5d5, 0x5b9, 0},          /* 0xfb4b       vav+holam */
    {0x5d1, 0x5bf, 0},          /* 0xfb4c       bet+rafe */
    {0x5db, 0x5bf, 0},          /* 0xfb4d       kaf+rafe */
    {0x5e4, 0x5bf, 0},          /* 0xfb4e       pe+rafe */
    {0x5d0, 0x5dc, 0}           /* 0xfb4f       alef-lamed */
};

/*
 * Decompose character "c" into "*c1", "*c2" and "*c3".
 * Characters in the range 0xfb20 - 0xfb4f are looked up in decomp_table.
 * Any other character is returned unchanged in "*c1", with "*c2" and "*c3"
 * set to zero.
 */
    static void
mb_decompose(c, c1, c2, c3)
    int c, *c1, *c2, *c3;
{
    decomp_T d;

    /* Both bounds must match the table exactly: a lower bound below 0xfb20
     * would give a negative index and read outside of decomp_table. */
    if (c >= 0xfb20 && c <= 0xfb4f)
    {
        d = decomp_table[c - 0xfb20];
        *c1 = d.a;
        *c2 = d.b;
        *c3 = d.c;
    }
    else
    {
        *c1 = c;
        *c2 = *c3 = 0;
    }
}
5535#endif
5536
5537/*
5538 * Compare two strings, ignore case if ireg_ic set.
5539 * Return 0 if strings match, non-zero otherwise.
5540 * Correct the length "*n" when composing characters are ignored.
5541 */
5542 static int
5543cstrncmp(s1, s2, n)
5544 char_u *s1, *s2;
5545 int *n;
5546{
5547 int result;
5548
5549 if (!ireg_ic)
5550 result = STRNCMP(s1, s2, *n);
5551 else
5552 result = MB_STRNICMP(s1, s2, *n);
5553
5554#ifdef FEAT_MBYTE
5555 /* if it failed and it's utf8 and we want to combineignore: */
5556 if (result != 0 && enc_utf8 && ireg_icombine)
5557 {
5558 char_u *str1, *str2;
5559 int c1, c2, c11, c12;
5560 int ix;
5561 int junk;
5562
5563 /* we have to handle the strcmp ourselves, since it is necessary to
5564 * deal with the composing characters by ignoring them: */
5565 str1 = s1;
5566 str2 = s2;
5567 c1 = c2 = 0;
5568 for (ix = 0; ix < *n; )
5569 {
5570 c1 = mb_ptr2char_adv(&str1);
5571 c2 = mb_ptr2char_adv(&str2);
5572 ix += utf_char2len(c1);
5573
5574 /* decompose the character if necessary, into 'base' characters
5575 * because I don't care about Arabic, I will hard-code the Hebrew
5576 * which I *do* care about! So sue me... */
5577 if (c1 != c2 && (!ireg_ic || utf_fold(c1) != utf_fold(c2)))
5578 {
5579 /* decomposition necessary? */
5580 mb_decompose(c1, &c11, &junk, &junk);
5581 mb_decompose(c2, &c12, &junk, &junk);
5582 c1 = c11;
5583 c2 = c12;
5584 if (c11 != c12 && (!ireg_ic || utf_fold(c11) != utf_fold(c12)))
5585 break;
5586 }
5587 }
5588 result = c2 - c1;
5589 if (result == 0)
5590 *n = (int)(str2 - s2);
5591 }
5592#endif
5593
5594 return result;
5595}
5596
5597/*
5598 * cstrchr: This function is used a lot for simple searches, keep it fast!
5599 */
5600 static char_u *
5601cstrchr(s, c)
5602 char_u *s;
5603 int c;
5604{
5605 char_u *p;
5606 int cc;
5607
5608 if (!ireg_ic
5609#ifdef FEAT_MBYTE
5610 || (!enc_utf8 && mb_char2len(c) > 1)
5611#endif
5612 )
5613 return vim_strchr(s, c);
5614
5615 /* tolower() and toupper() can be slow, comparing twice should be a lot
5616 * faster (esp. when using MS Visual C++!).
5617 * For UTF-8 need to use folded case. */
5618#ifdef FEAT_MBYTE
5619 if (enc_utf8 && c > 0x80)
5620 cc = utf_fold(c);
5621 else
5622#endif
5623 if (isupper(c))
5624 cc = TOLOWER_LOC(c);
5625 else if (islower(c))
5626 cc = TOUPPER_LOC(c);
5627 else
5628 return vim_strchr(s, c);
5629
5630#ifdef FEAT_MBYTE
5631 if (has_mbyte)
5632 {
5633 for (p = s; *p != NUL; p += (*mb_ptr2len_check)(p))
5634 {
5635 if (enc_utf8 && c > 0x80)
5636 {
5637 if (utf_fold(utf_ptr2char(p)) == cc)
5638 return p;
5639 }
5640 else if (*p == c || *p == cc)
5641 return p;
5642 }
5643 }
5644 else
5645#endif
5646 /* Faster version for when there are no multi-byte characters. */
5647 for (p = s; *p != NUL; ++p)
5648 if (*p == c || *p == cc)
5649 return p;
5650
5651 return NULL;
5652}
5653
5654/***************************************************************
5655 * regsub stuff *
5656 ***************************************************************/
5657
5658/* This stuff below really confuses cc on an SGI -- webb */
5659#ifdef __sgi
5660# undef __ARGS
5661# define __ARGS(x) ()
5662#endif
5663
/*
 * We should define fptr as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * pointer to a function returning void. This should work for all compilers.
 */
5670typedef void (*(*fptr) __ARGS((char_u *, int)))();
5671
5672static fptr do_upper __ARGS((char_u *, int));
5673static fptr do_Upper __ARGS((char_u *, int));
5674static fptr do_lower __ARGS((char_u *, int));
5675static fptr do_Lower __ARGS((char_u *, int));
5676
5677static int vim_regsub_both __ARGS((char_u *source, char_u *dest, int copy, int magic, int backslash));
5678
5679 static fptr
5680do_upper(d, c)
5681 char_u *d;
5682 int c;
5683{
5684 *d = TOUPPER_LOC(c);
5685
5686 return (fptr)NULL;
5687}
5688
5689 static fptr
5690do_Upper(d, c)
5691 char_u *d;
5692 int c;
5693{
5694 *d = TOUPPER_LOC(c);
5695
5696 return (fptr)do_Upper;
5697}
5698
5699 static fptr
5700do_lower(d, c)
5701 char_u *d;
5702 int c;
5703{
5704 *d = TOLOWER_LOC(c);
5705
5706 return (fptr)NULL;
5707}
5708
5709 static fptr
5710do_Lower(d, c)
5711 char_u *d;
5712 int c;
5713{
5714 *d = TOLOWER_LOC(c);
5715
5716 return (fptr)do_Lower;
5717}
5718
5719/*
5720 * regtilde(): Replace tildes in the pattern by the old pattern.
5721 *
5722 * Short explanation of the tilde: It stands for the previous replacement
5723 * pattern. If that previous pattern also contains a ~ we should go back a
5724 * step further... But we insert the previous pattern into the current one
5725 * and remember that.
5726 * This still does not handle the case where "magic" changes. TODO?
5727 *
5728 * The tildes are parsed once before the first call to vim_regsub().
5729 */
5730 char_u *
5731regtilde(source, magic)
5732 char_u *source;
5733 int magic;
5734{
5735 char_u *newsub = source;
5736 char_u *tmpsub;
5737 char_u *p;
5738 int len;
5739 int prevlen;
5740
5741 for (p = newsub; *p; ++p)
5742 {
5743 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic))
5744 {
5745 if (reg_prev_sub != NULL)
5746 {
5747 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
5748 prevlen = (int)STRLEN(reg_prev_sub);
5749 tmpsub = alloc((unsigned)(STRLEN(newsub) + prevlen));
5750 if (tmpsub != NULL)
5751 {
5752 /* copy prefix */
5753 len = (int)(p - newsub); /* not including ~ */
5754 mch_memmove(tmpsub, newsub, (size_t)len);
5755 /* interpretate tilde */
5756 mch_memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
5757 /* copy postfix */
5758 if (!magic)
5759 ++p; /* back off \ */
5760 STRCPY(tmpsub + len + prevlen, p + 1);
5761
5762 if (newsub != source) /* already allocated newsub */
5763 vim_free(newsub);
5764 newsub = tmpsub;
5765 p = newsub + len + prevlen;
5766 }
5767 }
5768 else if (magic)
5769 STRCPY(p, p + 1); /* remove '~' */
5770 else
5771 STRCPY(p, p + 2); /* remove '\~' */
5772 --p;
5773 }
5774 else
5775 {
5776 if (*p == '\\' && p[1]) /* skip escaped characters */
5777 ++p;
5778#ifdef FEAT_MBYTE
5779 if (has_mbyte)
5780 p += (*mb_ptr2len_check)(p) - 1;
5781#endif
5782 }
5783 }
5784
5785 vim_free(reg_prev_sub);
5786 if (newsub != source) /* newsub was allocated, just keep it */
5787 reg_prev_sub = newsub;
5788 else /* no ~ found, need to save newsub */
5789 reg_prev_sub = vim_strsave(newsub);
5790 return newsub;
5791}
5792
5793#ifdef FEAT_EVAL
5794static int can_f_submatch = FALSE; /* TRUE when submatch() can be used */
5795
5796/* These pointers are used instead of reg_match and reg_mmatch for
5797 * reg_submatch(). Needed for when the substitution string is an expression
5798 * that contains a call to substitute() and submatch(). */
5799static regmatch_T *submatch_match;
5800static regmmatch_T *submatch_mmatch;
5801#endif
5802
5803#if defined(FEAT_MODIFY_FNAME) || defined(FEAT_EVAL) || defined(PROTO)
5804/*
5805 * vim_regsub() - perform substitutions after a vim_regexec() or
5806 * vim_regexec_multi() match.
5807 *
5808 * If "copy" is TRUE really copy into "dest".
5809 * If "copy" is FALSE nothing is copied, this is just to find out the length
5810 * of the result.
5811 *
5812 * If "backslash" is TRUE, a backslash will be removed later, need to double
5813 * them to keep them, and insert a backslash before a CR to avoid it being
5814 * replaced with a line break later.
5815 *
5816 * Note: The matched text must not change between the call of
5817 * vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
5818 * references invalid!
5819 *
5820 * Returns the size of the replacement, including terminating NUL.
5821 */
5822 int
5823vim_regsub(rmp, source, dest, copy, magic, backslash)
5824 regmatch_T *rmp;
5825 char_u *source;
5826 char_u *dest;
5827 int copy;
5828 int magic;
5829 int backslash;
5830{
5831 reg_match = rmp;
5832 reg_mmatch = NULL;
5833 reg_maxline = 0;
5834 return vim_regsub_both(source, dest, copy, magic, backslash);
5835}
5836#endif
5837
5838 int
5839vim_regsub_multi(rmp, lnum, source, dest, copy, magic, backslash)
5840 regmmatch_T *rmp;
5841 linenr_T lnum;
5842 char_u *source;
5843 char_u *dest;
5844 int copy;
5845 int magic;
5846 int backslash;
5847{
5848 reg_match = NULL;
5849 reg_mmatch = rmp;
5850 reg_buf = curbuf; /* always works on the current buffer! */
5851 reg_firstlnum = lnum;
5852 reg_maxline = curbuf->b_ml.ml_line_count - lnum;
5853 return vim_regsub_both(source, dest, copy, magic, backslash);
5854}
5855
/*
 * Common part of vim_regsub() and vim_regsub_multi(): expand the substitute
 * string "source", using the match in reg_match or reg_mmatch.
 * When "copy" is TRUE the result is written to "dest", otherwise only the
 * size is computed.  See vim_regsub() for "magic" and "backslash".
 * Returns the size of the result including the terminating NUL, or zero when
 * "source" or "dest" is NULL or prog_magic_wrong() returns non-zero.
 */
    static int
vim_regsub_both(source, dest, copy, magic, backslash)
    char_u      *source;
    char_u      *dest;
    int         copy;
    int         magic;
    int         backslash;
{
    char_u      *src;
    char_u      *dst;
    char_u      *s;
    int         c;
    int         no = -1;            /* sub-match to insert, -1 for none */
    fptr        func = (fptr)NULL;  /* pending case conversion, if any */
    linenr_T    clnum = 0;          /* init for GCC */
    int         len = 0;            /* init for GCC */
#ifdef FEAT_EVAL
    static char_u *eval_result = NULL;
#endif
#ifdef FEAT_MBYTE
    int         l;
#endif


    /* Be paranoid... */
    if (source == NULL || dest == NULL)
    {
        EMSG(_(e_null));
        return 0;
    }
    if (prog_magic_wrong())
        return 0;
    src = source;
    dst = dest;

    /*
     * When the substitute part starts with "\=" evaluate it as an expression.
     */
    if (source[0] == '\\' && source[1] == '='
#ifdef FEAT_EVAL
            && !can_f_submatch      /* can't do this recursively */
#endif
            )
    {
#ifdef FEAT_EVAL
        /* To make sure that the length doesn't change between checking the
         * length and copying the string, and to speed up things, the
         * resulting string is saved from the call with "copy" == FALSE to the
         * call with "copy" == TRUE. */
        if (copy)
        {
            if (eval_result != NULL)
            {
                STRCPY(dest, eval_result);
                dst += STRLEN(eval_result);
                vim_free(eval_result);
                eval_result = NULL;
            }
        }
        else
        {
            linenr_T    save_reg_maxline;
            win_T       *save_reg_win;
            int         save_ireg_ic;

            vim_free(eval_result);

            /* The expression may contain substitute(), which calls us
             * recursively.  Make sure submatch() gets the text from the first
             * level.  Don't need to save "reg_buf", because
             * vim_regexec_multi() can't be called recursively. */
            submatch_match = reg_match;
            submatch_mmatch = reg_mmatch;
            save_reg_maxline = reg_maxline;
            save_reg_win = reg_win;
            save_ireg_ic = ireg_ic;
            can_f_submatch = TRUE;

            eval_result = eval_to_string(source + 2, NULL);
            if (eval_result != NULL)
            {
                for (s = eval_result; *s != NUL; mb_ptr_adv(s))
                {
                    /* Change NL to CR, so that it becomes a line break.
                     * Skip over a backslashed character. */
                    if (*s == NL)
                        *s = CAR;
                    else if (*s == '\\' && s[1] != NUL)
                        ++s;
                }

                dst += STRLEN(eval_result);
            }

            /* Put back what was saved before evaluating the expression. */
            reg_match = submatch_match;
            reg_mmatch = submatch_mmatch;
            reg_maxline = save_reg_maxline;
            reg_win = save_reg_win;
            ireg_ic = save_ireg_ic;
            can_f_submatch = FALSE;
        }
#endif
    }
    else
      while ((c = *src++) != NUL)
      {
        /* First find out if this is a reference to a sub-match ("&" or "\&"
         * for the whole match, "\0" - "\9") or a case conversion item. */
        if (c == '&' && magic)
            no = 0;
        else if (c == '\\' && *src != NUL)
        {
            if (*src == '&' && !magic)
            {
                ++src;
                no = 0;
            }
            else if ('0' <= *src && *src <= '9')
            {
                no = *src++ - '0';
            }
            else if (vim_strchr((char_u *)"uUlLeE", *src))
            {
                /* Only sets the conversion function used for the characters
                 * that follow, nothing is output for the item itself. */
                switch (*src++)
                {
                case 'u':   func = (fptr)do_upper;
                            continue;
                case 'U':   func = (fptr)do_Upper;
                            continue;
                case 'l':   func = (fptr)do_lower;
                            continue;
                case 'L':   func = (fptr)do_Lower;
                            continue;
                case 'e':
                case 'E':   func = (fptr)NULL;
                            continue;
                }
            }
        }
        if (no < 0)           /* Ordinary character. */
        {
            if (c == '\\' && *src != NUL)
            {
                /* Check for abbreviations -- webb */
                switch (*src)
                {
                    case 'r':   c = CAR;        ++src;  break;
                    case 'n':   c = NL;         ++src;  break;
                    case 't':   c = TAB;        ++src;  break;
                 /* Oh no!  \e already has meaning in subst pat :-( */
                 /* case 'e':   c = ESC;        ++src;  break; */
                    case 'b':   c = Ctrl_H;     ++src;  break;

                    /* If "backslash" is TRUE the backslash will be removed
                     * later.  Used to insert a literal CR. */
                    default:    if (backslash)
                                {
                                    if (copy)
                                        *dst = '\\';
                                    ++dst;
                                }
                                c = *src++;
                }
            }

            /* Write to buffer, if copy is set. */
#ifdef FEAT_MBYTE
            if (has_mbyte && (l = (*mb_ptr2len_check)(src - 1)) > 1)
            {
                /* TODO: should use "func" here. */
                if (copy)
                    mch_memmove(dst, src - 1, l);
                /* One byte less than "l", the "dst++" below adds the last
                 * one and the first byte was already read from "src". */
                dst += l - 1;
                src += l - 1;
            }
            else
            {
#endif
                if (copy)
                {
                    if (func == (fptr)NULL)     /* just copy */
                        *dst = c;
                    else                        /* change case */
                        func = (fptr)(func(dst, c));
                            /* Turbo C complains without the typecast */
                }
#ifdef FEAT_MBYTE
            }
#endif
            dst++;
        }
        else
        {
            /* Insert sub-match "no": find its start "s" and the number of
             * bytes "len" to take from the first line.  "s" is NULL when the
             * sub-match is not set. */
            if (REG_MULTI)
            {
                clnum = reg_mmatch->startpos[no].lnum;
                if (clnum < 0 || reg_mmatch->endpos[no].lnum < 0)
                    s = NULL;
                else
                {
                    s = reg_getline(clnum) + reg_mmatch->startpos[no].col;
                    if (reg_mmatch->endpos[no].lnum == clnum)
                        len = reg_mmatch->endpos[no].col
                                               - reg_mmatch->startpos[no].col;
                    else
                        len = (int)STRLEN(s);
                }
            }
            else
            {
                s = reg_match->startp[no];
                if (reg_match->endp[no] == NULL)
                    s = NULL;
                else
                    len = (int)(reg_match->endp[no] - s);
            }
            if (s != NULL)
            {
                for (;;)
                {
                    if (len == 0)
                    {
                        /* Done with this line: either finished or, for a
                         * multi-line match, add a CR and go to the next
                         * line. */
                        if (REG_MULTI)
                        {
                            if (reg_mmatch->endpos[no].lnum == clnum)
                                break;
                            if (copy)
                                *dst = CAR;
                            ++dst;
                            s = reg_getline(++clnum);
                            if (reg_mmatch->endpos[no].lnum == clnum)
                                len = reg_mmatch->endpos[no].col;
                            else
                                len = (int)STRLEN(s);
                        }
                        else
                            break;
                    }
                    else if (*s == NUL) /* we hit NUL. */
                    {
                        if (copy)
                            EMSG(_(e_re_damg));
                        goto exit;
                    }
                    else
                    {
                        if (backslash && (*s == CAR || *s == '\\'))
                        {
                            /*
                             * Insert a backslash in front of a CR, otherwise
                             * it will be replaced by a line break.
                             * Number of backslashes will be halved later,
                             * double them here.
                             */
                            if (copy)
                            {
                                dst[0] = '\\';
                                dst[1] = *s;
                            }
                            dst += 2;
                        }
#ifdef FEAT_MBYTE
                        else if (has_mbyte && (l = (*mb_ptr2len_check)(s)) > 1)
                        {
                            /* TODO: should use "func" here. */
                            if (copy)
                                mch_memmove(dst, s, l);
                            dst += l;
                            /* One byte less than "l", the "++s" and "--len"
                             * below take care of the last one. */
                            s += l - 1;
                            len -= l - 1;
                        }
#endif
                        else
                        {
                            if (copy)
                            {
                                if (func == (fptr)NULL) /* just copy */
                                    *dst = *s;
                                else                    /* change case */
                                    func = (fptr)(func(dst, *s));
                                    /* Turbo C complains without the typecast */
                            }
                            ++dst;
                        }
                        ++s;
                        --len;
                    }
                }
            }
            no = -1;
        }
      }
    if (copy)
        *dst = NUL;

exit:
    /* Note that after "goto exit" no NUL was stored in "dest". */
    return (int)((dst - dest) + 1);
}
6152
6153#ifdef FEAT_EVAL
6154/*
6155 * Used for the submatch() function: get the string from tne n'th submatch in
6156 * allocated memory.
6157 * Returns NULL when not in a ":s" command and for a non-existing submatch.
6158 */
6159 char_u *
6160reg_submatch(no)
6161 int no;
6162{
6163 char_u *retval = NULL;
6164 char_u *s;
6165 int len;
6166 int round;
6167 linenr_T lnum;
6168
6169 if (!can_f_submatch)
6170 return NULL;
6171
6172 if (submatch_match == NULL)
6173 {
6174 /*
6175 * First round: compute the length and allocate memory.
6176 * Second round: copy the text.
6177 */
6178 for (round = 1; round <= 2; ++round)
6179 {
6180 lnum = submatch_mmatch->startpos[no].lnum;
6181 if (lnum < 0 || submatch_mmatch->endpos[no].lnum < 0)
6182 return NULL;
6183
6184 s = reg_getline(lnum) + submatch_mmatch->startpos[no].col;
6185 if (s == NULL) /* anti-crash check, cannot happen? */
6186 break;
6187 if (submatch_mmatch->endpos[no].lnum == lnum)
6188 {
6189 /* Within one line: take form start to end col. */
6190 len = submatch_mmatch->endpos[no].col
6191 - submatch_mmatch->startpos[no].col;
6192 if (round == 2)
6193 {
6194 STRNCPY(retval, s, len);
6195 retval[len] = NUL;
6196 }
6197 ++len;
6198 }
6199 else
6200 {
6201 /* Multiple lines: take start line from start col, middle
6202 * lines completely and end line up to end col. */
6203 len = (int)STRLEN(s);
6204 if (round == 2)
6205 {
6206 STRCPY(retval, s);
6207 retval[len] = '\n';
6208 }
6209 ++len;
6210 ++lnum;
6211 while (lnum < submatch_mmatch->endpos[no].lnum)
6212 {
6213 s = reg_getline(lnum++);
6214 if (round == 2)
6215 STRCPY(retval + len, s);
6216 len += (int)STRLEN(s);
6217 if (round == 2)
6218 retval[len] = '\n';
6219 ++len;
6220 }
6221 if (round == 2)
6222 STRNCPY(retval + len, reg_getline(lnum),
6223 submatch_mmatch->endpos[no].col);
6224 len += submatch_mmatch->endpos[no].col;
6225 if (round == 2)
6226 retval[len] = NUL;
6227 ++len;
6228 }
6229
6230 if (round == 1)
6231 {
6232 retval = lalloc((long_u)len, TRUE);
6233 if (s == NULL)
6234 return NULL;
6235 }
6236 }
6237 }
6238 else
6239 {
6240 if (submatch_match->endp[no] == NULL)
6241 retval = NULL;
6242 else
6243 {
6244 s = submatch_match->startp[no];
6245 retval = vim_strnsave(s, (int)(submatch_match->endp[no] - s));
6246 }
6247 }
6248
6249 return retval;
6250}
6251#endif